Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 6
-rw-r--r--  fs/aio.c | 94
-rw-r--r--  fs/block_dev.c | 7
-rw-r--r--  fs/btrfs/async-thread.c | 55
-rw-r--r--  fs/btrfs/async-thread.h | 29
-rw-r--r--  fs/btrfs/backref.c | 123
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 40
-rw-r--r--  fs/btrfs/check-integrity.c | 18
-rw-r--r--  fs/btrfs/compression.c | 21
-rw-r--r--  fs/btrfs/ctree.c | 106
-rw-r--r--  fs/btrfs/ctree.h | 93
-rw-r--r--  fs/btrfs/delayed-inode.c | 12
-rw-r--r--  fs/btrfs/dev-replace.c | 82
-rw-r--r--  fs/btrfs/dir-item.c | 12
-rw-r--r--  fs/btrfs/disk-io.c | 344
-rw-r--r--  fs/btrfs/disk-io.h | 16
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 290
-rw-r--r--  fs/btrfs/extent_io.c | 488
-rw-r--r--  fs/btrfs/extent_io.h | 60
-rw-r--r--  fs/btrfs/file-item.c | 30
-rw-r--r--  fs/btrfs/file.c | 170
-rw-r--r--  fs/btrfs/free-space-cache.c | 157
-rw-r--r--  fs/btrfs/hash.c | 4
-rw-r--r--  fs/btrfs/inode-item.c | 12
-rw-r--r--  fs/btrfs/inode-map.c | 68
-rw-r--r--  fs/btrfs/inode.c | 942
-rw-r--r--  fs/btrfs/ioctl.c | 128
-rw-r--r--  fs/btrfs/lzo.c | 3
-rw-r--r--  fs/btrfs/ordered-data.c | 1
-rw-r--r--  fs/btrfs/orphan.c | 4
-rw-r--r--  fs/btrfs/print-tree.c | 3
-rw-r--r--  fs/btrfs/qgroup.c | 33
-rw-r--r--  fs/btrfs/raid56.c | 17
-rw-r--r--  fs/btrfs/reada.c | 5
-rw-r--r--  fs/btrfs/relocation.c | 142
-rw-r--r--  fs/btrfs/scrub.c | 92
-rw-r--r--  fs/btrfs/send.c | 47
-rw-r--r--  fs/btrfs/super.c | 137
-rw-r--r--  fs/btrfs/sysfs.c | 43
-rw-r--r--  fs/btrfs/sysfs.h | 16
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 516
-rw-r--r--  fs/btrfs/transaction.c | 52
-rw-r--r--  fs/btrfs/transaction.h | 2
-rw-r--r--  fs/btrfs/tree-log.c | 334
-rw-r--r--  fs/btrfs/tree-log.h | 4
-rw-r--r--  fs/btrfs/uuid-tree.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 722
-rw-r--r--  fs/btrfs/volumes.h | 166
-rw-r--r--  fs/btrfs/xattr.c | 4
-rw-r--r--  fs/btrfs/zlib.c | 141
-rw-r--r--  fs/buffer.c | 34
-rw-r--r--  fs/cachefiles/bind.c | 8
-rw-r--r--  fs/cachefiles/daemon.c | 30
-rw-r--r--  fs/cachefiles/internal.h | 2
-rw-r--r--  fs/cachefiles/main.c | 2
-rw-r--r--  fs/cachefiles/namei.c | 17
-rw-r--r--  fs/cachefiles/rdwr.c | 6
-rw-r--r--  fs/cachefiles/xattr.c | 10
-rw-r--r--  fs/cifs/Kconfig | 35
-rw-r--r--  fs/cifs/cifsfs.c | 24
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 10
-rw-r--r--  fs/cifs/cifspdu.h | 23
-rw-r--r--  fs/cifs/connect.c | 2
-rw-r--r--  fs/cifs/dir.c | 8
-rw-r--r--  fs/cifs/file.c | 18
-rw-r--r--  fs/cifs/inode.c | 11
-rw-r--r--  fs/cifs/link.c | 12
-rw-r--r--  fs/cifs/misc.c | 7
-rw-r--r--  fs/cifs/netmisc.c | 20
-rw-r--r--  fs/cifs/readdir.c | 6
-rw-r--r--  fs/cifs/sess.c | 24
-rw-r--r--  fs/cifs/smb1ops.c | 9
-rw-r--r--  fs/cifs/smb2file.c | 2
-rw-r--r--  fs/cifs/smb2inode.c | 2
-rw-r--r--  fs/cifs/smb2maperror.c | 6
-rw-r--r--  fs/cifs/smb2misc.c | 17
-rw-r--r--  fs/cifs/smb2ops.c | 172
-rw-r--r--  fs/cifs/smb2pdu.c | 23
-rw-r--r--  fs/cifs/smb2pdu.h | 6
-rw-r--r--  fs/cifs/smbfsctl.h | 2
-rw-r--r--  fs/dcache.c | 111
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/ecryptfs/file.c | 4
-rw-r--r--  fs/ecryptfs/inode.c | 25
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/ext2/super.c | 6
-rw-r--r--  fs/ext3/ext3.h | 12
-rw-r--r--  fs/ext3/super.c | 25
-rw-r--r--  fs/ext4/ext4.h | 18
-rw-r--r--  fs/ext4/extents.c | 88
-rw-r--r--  fs/ext4/inode.c | 44
-rw-r--r--  fs/ext4/mballoc.c | 5
-rw-r--r--  fs/ext4/namei.c | 58
-rw-r--r--  fs/ext4/resize.c | 2
-rw-r--r--  fs/ext4/super.c | 19
-rw-r--r--  fs/f2fs/Kconfig | 4
-rw-r--r--  fs/f2fs/checkpoint.c | 175
-rw-r--r--  fs/f2fs/data.c | 86
-rw-r--r--  fs/f2fs/debug.c | 24
-rw-r--r--  fs/f2fs/dir.c | 25
-rw-r--r--  fs/f2fs/f2fs.h | 187
-rw-r--r--  fs/f2fs/file.c | 315
-rw-r--r--  fs/f2fs/gc.c | 34
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/hash.c | 7
-rw-r--r--  fs/f2fs/inline.c | 58
-rw-r--r--  fs/f2fs/inode.c | 37
-rw-r--r--  fs/f2fs/namei.c | 66
-rw-r--r--  fs/f2fs/node.c | 536
-rw-r--r--  fs/f2fs/node.h | 60
-rw-r--r--  fs/f2fs/recovery.c | 219
-rw-r--r--  fs/f2fs/segment.c | 573
-rw-r--r--  fs/f2fs/segment.h | 162
-rw-r--r--  fs/f2fs/super.c | 77
-rw-r--r--  fs/f2fs/xattr.c | 10
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/fscache/object.c | 1
-rw-r--r--  fs/fscache/page.c | 25
-rw-r--r--  fs/fuse/file.c | 1
-rw-r--r--  fs/gfs2/bmap.c | 9
-rw-r--r--  fs/gfs2/dir.c | 9
-rw-r--r--  fs/gfs2/dir.h | 1
-rw-r--r--  fs/gfs2/file.c | 15
-rw-r--r--  fs/gfs2/glock.c | 4
-rw-r--r--  fs/gfs2/glops.c | 2
-rw-r--r--  fs/gfs2/incore.h | 7
-rw-r--r--  fs/gfs2/inode.c | 16
-rw-r--r--  fs/gfs2/rgrp.c | 30
-rw-r--r--  fs/gfs2/rgrp.h | 1
-rw-r--r--  fs/gfs2/super.c | 20
-rw-r--r--  fs/gfs2/trans.c | 2
-rw-r--r--  fs/internal.h | 5
-rw-r--r--  fs/isofs/inode.c | 15
-rw-r--r--  fs/isofs/isofs.h | 23
-rw-r--r--  fs/isofs/rock.c | 39
-rw-r--r--  fs/jbd2/commit.c | 21
-rw-r--r--  fs/jbd2/journal.c | 56
-rw-r--r--  fs/jbd2/recovery.c | 33
-rw-r--r--  fs/jbd2/revoke.c | 6
-rw-r--r--  fs/lockd/Makefile | 3
-rw-r--r--  fs/lockd/mon.c | 6
-rw-r--r--  fs/lockd/netns.h | 1
-rw-r--r--  fs/lockd/procfs.c | 92
-rw-r--r--  fs/lockd/procfs.h | 28
-rw-r--r--  fs/lockd/svc.c | 15
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 96
-rw-r--r--  fs/namespace.c | 10
-rw-r--r--  fs/nfs/blocklayout/Makefile | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 1386
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 213
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 384
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 108
-rw-r--r--  fs/nfs/blocklayout/dev.c | 363
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c | 602
-rw-r--r--  fs/nfs/blocklayout/extents.c | 908
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c | 285
-rw-r--r--  fs/nfs/callback_proc.c | 23
-rw-r--r--  fs/nfs/client.c | 18
-rw-r--r--  fs/nfs/direct.c | 14
-rw-r--r--  fs/nfs/file.c | 52
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 39
-rw-r--r--  fs/nfs/filelayout/filelayout.h | 7
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 108
-rw-r--r--  fs/nfs/fscache-index.c | 3
-rw-r--r--  fs/nfs/inode.c | 4
-rw-r--r--  fs/nfs/internal.h | 7
-rw-r--r--  fs/nfs/nfs3_fs.h | 34
-rw-r--r--  fs/nfs/nfs3acl.c | 6
-rw-r--r--  fs/nfs/nfs3client.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs3super.c | 1
-rw-r--r--  fs/nfs/nfs4_fs.h | 13
-rw-r--r--  fs/nfs/nfs4client.c | 38
-rw-r--r--  fs/nfs/nfs4proc.c | 188
-rw-r--r--  fs/nfs/nfs4renewd.c | 12
-rw-r--r--  fs/nfs/nfs4state.c | 42
-rw-r--r--  fs/nfs/nfs4xdr.c | 179
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 113
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 70
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 5
-rw-r--r--  fs/nfs/pagelist.c | 92
-rw-r--r--  fs/nfs/pnfs.c | 105
-rw-r--r--  fs/nfs/pnfs.h | 50
-rw-r--r--  fs/nfs/pnfs_dev.c | 150
-rw-r--r--  fs/nfs/super.c | 11
-rw-r--r--  fs/nfs/write.c | 171
-rw-r--r--  fs/nfs_common/Makefile | 3
-rw-r--r--  fs/nfs_common/grace.c (renamed from fs/lockd/grace.c) | 68
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 144
-rw-r--r--  fs/nfsd/nfs4idmap.c | 20
-rw-r--r--  fs/nfsd/nfs4proc.c | 49
-rw-r--r--  fs/nfsd/nfs4recover.c | 205
-rw-r--r--  fs/nfsd/nfs4state.c | 83
-rw-r--r--  fs/nfsd/nfs4xdr.c | 77
-rw-r--r--  fs/nfsd/nfsctl.c | 45
-rw-r--r--  fs/nfsd/nfsfh.c | 6
-rw-r--r--  fs/nfsd/state.h | 31
-rw-r--r--  fs/nfsd/vfs.c | 37
-rw-r--r--  fs/nfsd/xdr4.h | 14
-rw-r--r--  fs/nilfs2/inode.c | 7
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 2
-rw-r--r--  fs/notify/fdinfo.c | 4
-rw-r--r--  fs/notify/fsnotify.h | 3
-rw-r--r--  fs/notify/group.c | 2
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 6
-rw-r--r--  fs/ntfs/debug.c | 2
-rw-r--r--  fs/ntfs/file.c | 5
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 15
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 19
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 1
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 78
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 13
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 88
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 39
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 44
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 25
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r--  fs/ocfs2/dlmglue.c | 23
-rw-r--r--  fs/ocfs2/file.c | 49
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 129
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/quota.h | 5
-rw-r--r--  fs/ocfs2/quota_global.c | 4
-rw-r--r--  fs/ocfs2/quota_local.c | 33
-rw-r--r--  fs/ocfs2/stack_user.c | 2
-rw-r--r--  fs/ocfs2/super.c | 31
-rw-r--r--  fs/pnode.c | 1
-rw-r--r--  fs/proc/base.c | 75
-rw-r--r--  fs/proc/internal.h | 5
-rw-r--r--  fs/proc/kcore.c | 4
-rw-r--r--  fs/proc/page.c | 3
-rw-r--r--  fs/proc/task_mmu.c | 361
-rw-r--r--  fs/proc/task_nommu.c | 88
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 5
-rw-r--r--  fs/reiserfs/super.c | 16
-rw-r--r--  fs/stack.c | 2
-rw-r--r--  fs/super.c | 3
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/timerfd.c | 3
-rw-r--r--  fs/udf/file.c | 9
-rw-r--r--  fs/udf/ialloc.c | 28
-rw-r--r--  fs/udf/inode.c | 165
-rw-r--r--  fs/udf/namei.c | 156
-rw-r--r--  fs/udf/super.c | 71
-rw-r--r--  fs/udf/udfdecl.h | 16
-rw-r--r--  fs/udf/udftime.c | 2
-rw-r--r--  fs/ufs/ialloc.c | 6
-rw-r--r--  fs/ufs/inode.c | 7
-rw-r--r--  fs/ufs/namei.c | 18
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 18
-rw-r--r--  fs/xfs/xfs_aops.c | 61
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 20
-rw-r--r--  fs/xfs/xfs_file.c | 27
264 files changed, 10943 insertions(+), 7595 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
 source "fs/nfs/Kconfig"
 source "fs/nfsd/Kconfig"
 
+config GRACE_PERIOD
+	tristate
+
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select GRACE_PERIOD
 
 config LOCKD_V4
 	bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
 
 config NFS_COMMON
 	bool
-	depends on NFSD || NFS_FS
+	depends on NFSD || NFS_FS || LOCKD
 	default y
 
 source "net/sunrpc/Kconfig"
diff --git a/fs/aio.c b/fs/aio.c
index ae635872affb..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -660,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;
 
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
@@ -792,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -803,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -857,6 +863,68 @@ out:
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -865,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -925,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long	flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -987,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1042,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
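The refill_reqs_available() arithmetic above is self-contained enough to check in isolation. Below is a minimal user-space sketch of the same modular head/tail accounting (plain C; the names mirror the patch, but the kioctx, per-cpu counters and completion_lock are deliberately left out, so this demonstrates only the ring-occupancy math):

/*
 * Stand-alone sketch of the completion-ring accounting from the
 * fs/aio.c hunks above.  "reclaimable" is how many completed slots
 * userland has already consumed and can be returned to reqs_available.
 */
#include <stdio.h>

static unsigned reclaimable(unsigned nr_events, unsigned completed_events,
			    unsigned head, unsigned tail)
{
	unsigned events_in_ring;

	head %= nr_events;	/* clamp: userland can scribble on head */
	if (head <= tail)
		events_in_ring = tail - head;
	else
		events_in_ring = nr_events - (head - tail);

	/* events still sitting in the ring keep their slots reserved */
	return completed_events > events_in_ring ?
	       completed_events - events_in_ring : 0;
}

int main(void)
{
	/* 128-slot ring, 16 completions, 10 events still unconsumed */
	printf("%u\n", reclaimable(128, 16, 6, 16));	/* prints 6 */
	return 0;
}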
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..e2f3ad0879ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..4dabeb893b7c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/workqueue.h>
 #include "async-thread.h"
 #include "ctree.h"
 
@@ -55,13 +54,45 @@ struct btrfs_workqueue {
 	struct __btrfs_workqueue *high;
 };
 
-static inline struct __btrfs_workqueue
-*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name)					\
+void btrfs_##name(struct work_struct *arg)				\
+{									\
+	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
+					       normal_work);		\
+	normal_work_helper(work);					\
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
 			 int thresh)
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->max_active = max_active;
@@ -85,7 +116,7 @@ static inline struct __btrfs_workqueue
 	ret->normal_wq = alloc_workqueue("%s-%s", flags,
 					 ret->max_active, "btrfs",
 					 name);
-	if (unlikely(!ret->normal_wq)) {
+	if (!ret->normal_wq) {
 		kfree(ret);
 		return NULL;
 	}
@@ -107,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 {
 	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
 					      max_active, thresh);
-	if (unlikely(!ret->normal)) {
+	if (!ret->normal) {
 		kfree(ret);
 		return NULL;
 	}
@@ -120,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 	if (flags & WQ_HIGHPRI) {
 		ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
 						    thresh);
-		if (unlikely(!ret->high)) {
+		if (!ret->high) {
 			__btrfs_destroy_workqueue(ret->normal);
 			kfree(ret);
 			return NULL;
@@ -232,13 +263,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 	spin_unlock_irqrestore(lock, flags);
 }
 
-static void normal_work_helper(struct work_struct *arg)
+static void normal_work_helper(struct btrfs_work *work)
 {
-	struct btrfs_work *work;
 	struct __btrfs_workqueue *wq;
 	int need_order = 0;
 
-	work = container_of(arg, struct btrfs_work, normal_work);
 	/*
 	 * We should not touch things inside work in the following cases:
 	 * 1) after work->func() if it has no ordered_free
@@ -262,7 +291,7 @@ static void normal_work_helper(struct work_struct *arg)
 	trace_btrfs_all_work_done(work);
 }
 
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
 		     btrfs_func_t func,
 		     btrfs_func_t ordered_func,
 		     btrfs_func_t ordered_free)
@@ -270,7 +299,7 @@ void btrfs_init_work(struct btrfs_work *work,
 	work->func = func;
 	work->ordered_func = ordered_func;
 	work->ordered_free = ordered_free;
-	INIT_WORK(&work->normal_work, normal_work_helper);
+	INIT_WORK(&work->normal_work, uniq_func);
 	INIT_LIST_HEAD(&work->ordered_list);
 	work->flags = 0;
 }
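The BTRFS_WORK_HELPER() change above gives every workqueue user a distinct work-function address while still sharing one body, which matters for anything that keys off the function pointer (tracing, lockdep). A stand-alone sketch of the same container_of trampoline pattern (user-space C; the struct layout, names and the local container_of are simplified stand-ins, not the kernel's):

/*
 * Each WORK_HELPER() expansion is a distinct function whose address
 * identifies its user; the body just recovers the containing struct
 * and calls the shared helper, exactly as in the patch above.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };

struct btrfs_work {
	const char *name;
	struct work_struct normal_work;
};

static void normal_work_helper(struct btrfs_work *work)
{
	printf("running %s\n", work->name);
}

#define WORK_HELPER(helper)					\
static void helper(struct work_struct *arg)			\
{								\
	struct btrfs_work *work = container_of(arg,		\
			struct btrfs_work, normal_work);	\
	normal_work_helper(work);				\
}

WORK_HELPER(endio_helper)
WORK_HELPER(submit_helper)

int main(void)
{
	struct btrfs_work w = { .name = "endio" };
	void (*fn)(struct work_struct *) = endio_helper;

	fn(&w.normal_work);	/* fn's address now identifies the user */
	return 0;
}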
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e386c29ef1f6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
 
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
 
 struct btrfs_workqueue;
 /* Internal use only */
 struct __btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
 
 struct btrfs_work {
 	btrfs_func_t func;
@@ -38,11 +40,36 @@ struct btrfs_work {
 	unsigned long flags;
 };
 
+#define BTRFS_WORK_HELPER_PROTO(name)					\
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 					      int flags,
 					      int max_active,
 					      int thresh);
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
 		     btrfs_func_t func,
 		     btrfs_func_t ordered_func,
 		     btrfs_func_t ordered_free);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 54a201dac7f9..2d3e32ebfd15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
 struct extent_inode_elem {
 	u64 inum;
 	u64 offset;
@@ -377,7 +380,8 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos, u64 total_refs)
+				   const u64 *extent_item_pos, u64 total_refs,
+				   u64 root_objectid)
 {
 	int err;
 	int ret = 0;
@@ -402,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
+		if (root_objectid && ref->root_id != root_objectid) {
+			ret = BACKREF_FOUND_SHARED;
+			goto out;
+		}
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
 					     parents, extent_item_pos,
 					     total_refs);
@@ -482,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 			continue;
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
-				     fs_info->tree_root->leafsize, 0);
+				     0);
 		if (!eb || !extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
@@ -561,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs, u64 *total_refs)
+			      struct list_head *prefs, u64 *total_refs,
+			      u64 inum)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -625,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			key.objectid = ref->objectid;
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = ref->offset;
+
+			/*
+			 * Found a inum that doesn't match our known inum, we
+			 * know it's shared.
+			 */
+			if (inum && ref->objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
@@ -659,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
 			     int *info_level, struct list_head *prefs,
-			     u64 *total_refs)
+			     u64 *total_refs, u64 inum)
 {
 	int ret = 0;
 	int slot;
@@ -744,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 						      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -765,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    int info_level, struct list_head *prefs)
+			    int info_level, struct list_head *prefs, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -827,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 						      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -854,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
-			     struct ulist *roots, const u64 *extent_item_pos)
+			     struct ulist *roots, const u64 *extent_item_pos,
+			     u64 root_objectid, u64 inum)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -929,7 +961,8 @@ again:
 		}
 		spin_unlock(&delayed_refs->lock);
 		ret = __add_delayed_refs(head, time_seq,
-					 &prefs_delayed, &total_refs);
+					 &prefs_delayed, &total_refs,
+					 inum);
 		mutex_unlock(&head->mutex);
 		if (ret)
 			goto out;
@@ -951,11 +984,11 @@ again:
 	    key.type == BTRFS_METADATA_ITEM_KEY)) {
 		ret = __add_inline_refs(fs_info, path, bytenr,
 					&info_level, &prefs,
-					&total_refs);
+					&total_refs, inum);
 		if (ret)
 			goto out;
 		ret = __add_keyed_refs(fs_info, path, bytenr,
-				       info_level, &prefs);
+				       info_level, &prefs, inum);
 		if (ret)
 			goto out;
 	}
@@ -971,7 +1004,8 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos, total_refs);
+				      extent_item_pos, total_refs,
+				      root_objectid);
 	if (ret)
 		goto out;
 
@@ -981,6 +1015,11 @@ again:
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
+			if (root_objectid && ref->root_id != root_objectid) {
+				ret = BACKREF_FOUND_SHARED;
+				goto out;
+			}
+
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -989,12 +1028,10 @@ again:
 		if (ref->count && ref->parent) {
 			if (extent_item_pos && !ref->inode_list &&
 			    ref->level == 0) {
-				u32 bsz;
 				struct extent_buffer *eb;
-				bsz = btrfs_level_size(fs_info->extent_root,
-						       ref->level);
+
 				eb = read_tree_block(fs_info->extent_root,
-						     ref->parent, bsz, 0);
+						     ref->parent, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
@@ -1087,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, NULL, extent_item_pos);
+				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1130,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr,
-					time_seq, tmp, *roots, NULL);
+					time_seq, tmp, *roots, NULL, 0, 0);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1161,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr)
+{
+	struct ulist *tmp = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_iterator uiter;
+	struct ulist_node *node;
+	struct seq_list elem = {};
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	roots = ulist_alloc(GFP_NOFS);
+	if (!tmp || !roots) {
+		ulist_free(tmp);
+		ulist_free(roots);
+		return -ENOMEM;
+	}
+
+	if (trans)
+		btrfs_get_tree_mod_seq(fs_info, &elem);
+	else
+		down_read(&fs_info->commit_root_sem);
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+					roots, NULL, root_objectid, inum);
+		if (ret == BACKREF_FOUND_SHARED) {
+			ret = 1;
+			break;
+		}
+		if (ret < 0 && ret != -ENOENT)
+			break;
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+		cond_resched();
+	}
+	if (trans)
+		btrfs_put_tree_mod_seq(fs_info, &elem);
+	else
+		up_read(&fs_info->commit_root_sem);
+	ulist_free(tmp);
+	ulist_free(roots);
+	return ret;
+}
+
 /*
  * this makes the path point to (inum INODE_ITEM ioff)
  */
@@ -1193,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	unsigned long ptr;
 
 	key.objectid = inode_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = start_off;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1233,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 		ret = -ENOENT;
 		if (found_key.objectid != inode_objectid)
 			break;
-		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
 			break;
 
 		ret = 0;
@@ -1366,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	}
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
-		size = fs_info->extent_root->leafsize;
+		size = fs_info->extent_root->nodesize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
 		size = found_key->offset;
 
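btrfs_check_shared() above threads root_objectid/inum down the backref walk so the walk can stop at the first reference owned by anyone else, instead of collecting the full reference set first. A reduced user-space sketch of that early-exit shape (the struct and data here are invented purely for illustration; the real walk is over btree items, not an array):

/*
 * Early-exit "is it shared?" check: bail the moment a reference is
 * seen that does not belong to (root, inum), mirroring the
 * BACKREF_FOUND_SHARED short-circuit in the patch above.
 */
#include <stdio.h>

struct ref { unsigned long root, inum; };

static int check_shared(const struct ref *refs, int nr,
			unsigned long root, unsigned long inum)
{
	for (int i = 0; i < nr; i++)
		if (refs[i].root != root || refs[i].inum != inum)
			return 1;	/* analogous to BACKREF_FOUND_SHARED */
	return 0;
}

int main(void)
{
	struct ref refs[] = { { 5, 257 }, { 5, 258 } };

	printf("%d\n", check_shared(refs, 2, 5, 257));	/* prints 1 */
	return 0;
}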
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20fec282..2a1ac6bfc724 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
 void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd78825..4aadadcfab20 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS			11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR			12
+#define BTRFS_INODE_BTREE_LOG1_ERR		13
+#define BTRFS_INODE_BTREE_LOG2_ERR		14
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -121,6 +132,12 @@ struct btrfs_inode {
 	u64 delalloc_bytes;
 
 	/*
+	 * total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW.
+	 */
+	u64 defrag_bytes;
+
+	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
 	 * because not all the blocks are written yet.
@@ -234,13 +251,25 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
 	    BTRFS_I(inode)->last_sub_trans <=
-	    BTRFS_I(inode)->root->last_log_commit)
-		return 1;
+	    BTRFS_I(inode)->root->last_log_commit) {
+		/*
+		 * After a ranged fsync we might have left some extent maps
+		 * (that fall outside the fsync's range). So return false
+		 * here if the list isn't empty, to make sure btrfs_log_inode()
+		 * will be called and process those extent maps.
+		 */
+		smp_mb();
+		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+			return 1;
+	}
 	return 0;
 }
 
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
+
 struct btrfs_dio_private {
 	struct inode *inode;
+	unsigned long flags;
 	u64 logical_offset;
 	u64 disk_bytenr;
 	u64 bytes;
@@ -257,7 +286,12 @@ struct btrfs_dio_private {
 
 	/* dio_bio came from fs/direct-io.c */
 	struct bio *dio_bio;
-	u8 csum[0];
+
+	/*
+	 * The original bio may be splited to several sub-bios, this is
+	 * done during endio of sub-bios
+	 */
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae30250f..cb7f3fe9c9f6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
 	/* super block bytenr is always the unmapped device bytenr */
 	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
 		return -1;
 	bh = __bread(superblock_bdev, dev_bytenr / 4096,
 		     BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
 	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
-	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
 	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
 		brelse(bh);
 		return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
 
 	while (len > 0) {
 		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
-		BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
-		       PAGE_CACHE_SHIFT);
+		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
 		kaddr = block_ctx->datav[i];
 		memcpy(dst, kaddr + offset_in_page, cur);
 
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
 
-	if (root->nodesize != root->leafsize) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
-		       root->nodesize, root->leafsize);
-		return -1;
-	}
 	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
 		       root->nodesize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->leafsize, PAGE_CACHE_SIZE);
-		return -1;
-	}
 	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b47187..d3220d31d3cb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	return sizeof(struct compressed_bio) +
-		((disk_size + root->sectorsize - 1) / root->sectorsize) *
-		csum_size;
+		(DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
 }
 
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	 * freed before we're done setting it up
 	 */
 	atomic_inc(&cb->pending_bios);
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	}
 	bio_get(bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
-	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
-		   PAGE_CACHE_SIZE;
+	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
 	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		    PAGE_CACHE_SIZE) {
 			bio_get(comp_bio);
 
-			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
 
 			/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 							comp_bio, sums);
 				BUG_ON(ret); /* -ENOMEM */
 			}
-			sums += (comp_bio->bi_iter.bi_size +
-				 root->sectorsize - 1) / root->sectorsize;
+			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+					     root->sectorsize);
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	bio_get(comp_bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
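The compression.c hunks above (and the BUG_ON change in check-integrity.c) replace open-coded round-up division with the kernel's DIV_ROUND_UP(). The identity is easy to verify stand-alone; a minimal check, with the macro body copied from include/linux/kernel.h and the sectorsize value chosen only as an example:

/*
 * DIV_ROUND_UP(n, d) is exactly the open-coded (n + d - 1) / d for
 * positive integers, which is what each conversion above relies on.
 */
#include <assert.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long sectorsize = 4096;	/* example value */

	assert(DIV_ROUND_UP(1UL, sectorsize) == 1);
	assert(DIV_ROUND_UP(4096UL, sectorsize) == 1);
	assert(DIV_ROUND_UP(4097UL, sectorsize) == 2);
	/* matches the form the patch removed */
	assert((8191UL + sectorsize - 1) / sectorsize ==
	       DIV_ROUND_UP(8191UL, sectorsize));
	return 0;
}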
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 44ee5d2e52a4..19bc6162fb8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
258 else 258 else
259 btrfs_node_key(buf, &disk_key, 0); 259 btrfs_node_key(buf, &disk_key, 0);
260 260
261 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 261 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
262 new_root_objectid, &disk_key, level, 262 &disk_key, level, buf->start, 0);
263 buf->start, 0);
264 if (IS_ERR(cow)) 263 if (IS_ERR(cow))
265 return PTR_ERR(cow); 264 return PTR_ERR(cow);
266 265
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1133 } else 1132 } else
1134 parent_start = 0; 1133 parent_start = 0;
1135 1134
1136 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 1135 cow = btrfs_alloc_tree_block(trans, root, parent_start,
1137 root->root_key.objectid, &disk_key, 1136 root->root_key.objectid, &disk_key, level,
1138 level, search_start, empty_size); 1137 search_start, empty_size);
1139 if (IS_ERR(cow)) 1138 if (IS_ERR(cow))
1140 return PTR_ERR(cow); 1139 return PTR_ERR(cow);
1141 1140
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1425 struct tree_mod_root *old_root = NULL; 1424 struct tree_mod_root *old_root = NULL;
1426 u64 old_generation = 0; 1425 u64 old_generation = 0;
1427 u64 logical; 1426 u64 logical;
1428 u32 blocksize;
1429 1427
1430 eb_root = btrfs_read_lock_root_node(root); 1428 eb_root = btrfs_read_lock_root_node(root);
1431 tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); 1429 tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1444 if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1442 if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1445 btrfs_tree_read_unlock(eb_root); 1443 btrfs_tree_read_unlock(eb_root);
1446 free_extent_buffer(eb_root); 1444 free_extent_buffer(eb_root);
1447 blocksize = btrfs_level_size(root, old_root->level); 1445 old = read_tree_block(root, logical, 0);
1448 old = read_tree_block(root, logical, blocksize, 0);
1449 if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1446 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1450 free_extent_buffer(old); 1447 free_extent_buffer(old);
1451 btrfs_warn(root->fs_info, 1448 btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
1506 struct btrfs_root *root, 1503 struct btrfs_root *root,
1507 struct extent_buffer *buf) 1504 struct extent_buffer *buf)
1508{ 1505{
1509#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1506 if (btrfs_test_is_dummy_root(root))
1510 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1511 return 0; 1507 return 0;
1512#endif 1508
1513 /* ensure we can see the force_cow */ 1509 /* ensure we can see the force_cow */
1514 smp_rmb(); 1510 smp_rmb();
1515 1511
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1651 WARN_ON(trans->transid != root->fs_info->generation); 1647 WARN_ON(trans->transid != root->fs_info->generation);
1652 1648
1653 parent_nritems = btrfs_header_nritems(parent); 1649 parent_nritems = btrfs_header_nritems(parent);
1654 blocksize = btrfs_level_size(root, parent_level - 1); 1650 blocksize = root->nodesize;
1655 end_slot = parent_nritems; 1651 end_slot = parent_nritems;
1656 1652
1657 if (parent_nritems == 1) 1653 if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1685 continue; 1681 continue;
1686 } 1682 }
1687 1683
1688 cur = btrfs_find_tree_block(root, blocknr, blocksize); 1684 cur = btrfs_find_tree_block(root, blocknr);
1689 if (cur) 1685 if (cur)
1690 uptodate = btrfs_buffer_uptodate(cur, gen, 0); 1686 uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1691 else 1687 else
1692 uptodate = 0; 1688 uptodate = 0;
1693 if (!cur || !uptodate) { 1689 if (!cur || !uptodate) {
1694 if (!cur) { 1690 if (!cur) {
1695 cur = read_tree_block(root, blocknr, 1691 cur = read_tree_block(root, blocknr, gen);
1696 blocksize, gen);
1697 if (!cur || !extent_buffer_uptodate(cur)) { 1692 if (!cur || !extent_buffer_uptodate(cur)) {
1698 free_extent_buffer(cur); 1693 free_extent_buffer(cur);
1699 return -EIO; 1694 return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1872 BUG_ON(level == 0); 1867 BUG_ON(level == 0);
1873 1868
1874 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), 1869 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1875 btrfs_level_size(root, level - 1),
1876 btrfs_node_ptr_generation(parent, slot)); 1870 btrfs_node_ptr_generation(parent, slot));
1877 if (eb && !extent_buffer_uptodate(eb)) { 1871 if (eb && !extent_buffer_uptodate(eb)) {
1878 free_extent_buffer(eb); 1872 free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
2267 node = path->nodes[level]; 2261 node = path->nodes[level];
2268 2262
2269 search = btrfs_node_blockptr(node, slot); 2263 search = btrfs_node_blockptr(node, slot);
2270 blocksize = btrfs_level_size(root, level - 1); 2264 blocksize = root->nodesize;
2271 eb = btrfs_find_tree_block(root, search, blocksize); 2265 eb = btrfs_find_tree_block(root, search);
2272 if (eb) { 2266 if (eb) {
2273 free_extent_buffer(eb); 2267 free_extent_buffer(eb);
2274 return; 2268 return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
2298 if ((search <= target && target - search <= 65536) || 2292 if ((search <= target && target - search <= 65536) ||
2299 (search > target && search - target <= 65536)) { 2293 (search > target && search - target <= 65536)) {
2300 gen = btrfs_node_ptr_generation(node, nr); 2294 gen = btrfs_node_ptr_generation(node, nr);
2301 readahead_tree_block(root, search, blocksize, gen); 2295 readahead_tree_block(root, search, blocksize);
2302 nread += blocksize; 2296 nread += blocksize;
2303 } 2297 }
2304 nscan++; 2298 nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2325 2319
2326 nritems = btrfs_header_nritems(parent); 2320 nritems = btrfs_header_nritems(parent);
2327 slot = path->slots[level + 1]; 2321 slot = path->slots[level + 1];
2328 blocksize = btrfs_level_size(root, level); 2322 blocksize = root->nodesize;
2329 2323
2330 if (slot > 0) { 2324 if (slot > 0) {
2331 block1 = btrfs_node_blockptr(parent, slot - 1); 2325 block1 = btrfs_node_blockptr(parent, slot - 1);
2332 gen = btrfs_node_ptr_generation(parent, slot - 1); 2326 gen = btrfs_node_ptr_generation(parent, slot - 1);
2333 eb = btrfs_find_tree_block(root, block1, blocksize); 2327 eb = btrfs_find_tree_block(root, block1);
2334 /* 2328 /*
 2335 * if we get -EAGAIN from btrfs_buffer_uptodate, we 2329 * if we get -EAGAIN from btrfs_buffer_uptodate, we
 2336 * don't want to return EAGAIN here. That will loop 2330 * don't want to return EAGAIN here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2343 if (slot + 1 < nritems) { 2337 if (slot + 1 < nritems) {
2344 block2 = btrfs_node_blockptr(parent, slot + 1); 2338 block2 = btrfs_node_blockptr(parent, slot + 1);
2345 gen = btrfs_node_ptr_generation(parent, slot + 1); 2339 gen = btrfs_node_ptr_generation(parent, slot + 1);
2346 eb = btrfs_find_tree_block(root, block2, blocksize); 2340 eb = btrfs_find_tree_block(root, block2);
2347 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2341 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2348 block2 = 0; 2342 block2 = 0;
2349 free_extent_buffer(eb); 2343 free_extent_buffer(eb);
2350 } 2344 }
2351 2345
2352 if (block1) 2346 if (block1)
2353 readahead_tree_block(root, block1, blocksize, 0); 2347 readahead_tree_block(root, block1, blocksize);
2354 if (block2) 2348 if (block2)
2355 readahead_tree_block(root, block2, blocksize, 0); 2349 readahead_tree_block(root, block2, blocksize);
2356} 2350}
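
The two readahead calls above prefetch the left and right siblings of the slot being rebalanced, when they exist. A reduced model of just the slot arithmetic (stand-in arrays, made-up block numbers):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* pretend node: block pointers of the children, as in a btrfs node */
        uint64_t blockptr[] = { 4096, 8192, 12288, 16384 };
        int nritems = 4;
        int slot = 2;                   /* child being balanced */
        uint64_t block1 = 0, block2 = 0;

        if (slot > 0)                   /* left sibling exists */
                block1 = blockptr[slot - 1];
        if (slot + 1 < nritems)         /* right sibling exists */
                block2 = blockptr[slot + 1];

        if (block1)
                printf("readahead left sibling at %llu\n",
                       (unsigned long long)block1);
        if (block2)
                printf("readahead right sibling at %llu\n",
                       (unsigned long long)block2);
        return 0;
}
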
2357 2351
2358 2352
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2454{ 2448{
2455 u64 blocknr; 2449 u64 blocknr;
2456 u64 gen; 2450 u64 gen;
2457 u32 blocksize;
2458 struct extent_buffer *b = *eb_ret; 2451 struct extent_buffer *b = *eb_ret;
2459 struct extent_buffer *tmp; 2452 struct extent_buffer *tmp;
2460 int ret; 2453 int ret;
2461 2454
2462 blocknr = btrfs_node_blockptr(b, slot); 2455 blocknr = btrfs_node_blockptr(b, slot);
2463 gen = btrfs_node_ptr_generation(b, slot); 2456 gen = btrfs_node_ptr_generation(b, slot);
2464 blocksize = btrfs_level_size(root, level - 1);
2465 2457
2466 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2458 tmp = btrfs_find_tree_block(root, blocknr);
2467 if (tmp) { 2459 if (tmp) {
2468 /* first we do an atomic uptodate check */ 2460 /* first we do an atomic uptodate check */
2469 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2461 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2507 btrfs_release_path(p); 2499 btrfs_release_path(p);
2508 2500
2509 ret = -EAGAIN; 2501 ret = -EAGAIN;
2510 tmp = read_tree_block(root, blocknr, blocksize, 0); 2502 tmp = read_tree_block(root, blocknr, 0);
2511 if (tmp) { 2503 if (tmp) {
2512 /* 2504 /*
2513 * If the read above didn't mark this buffer up to date, 2505 * If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
2792 if (!should_cow_block(trans, root, b)) 2784 if (!should_cow_block(trans, root, b))
2793 goto cow_done; 2785 goto cow_done;
2794 2786
2795 btrfs_set_path_blocking(p);
2796
2797 /* 2787 /*
2798 * must have write locks on this node and the 2788 * must have write locks on this node and the
2799 * parent 2789 * parent
@@ -2807,6 +2797,7 @@ again:
2807 goto again; 2797 goto again;
2808 } 2798 }
2809 2799
2800 btrfs_set_path_blocking(p);
2810 err = btrfs_cow_block(trans, root, b, 2801 err = btrfs_cow_block(trans, root, b,
2811 p->nodes[level + 1], 2802 p->nodes[level + 1],
2812 p->slots[level + 1], &b); 2803 p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3362 else 3353 else
3363 btrfs_node_key(lower, &lower_key, 0); 3354 btrfs_node_key(lower, &lower_key, 0);
3364 3355
3365 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3356 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3366 root->root_key.objectid, &lower_key, 3357 &lower_key, level, root->node->start, 0);
3367 level, root->node->start, 0);
3368 if (IS_ERR(c)) 3358 if (IS_ERR(c))
3369 return PTR_ERR(c); 3359 return PTR_ERR(c);
3370 3360
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3502 mid = (c_nritems + 1) / 2; 3492 mid = (c_nritems + 1) / 2;
3503 btrfs_node_key(c, &disk_key, mid); 3493 btrfs_node_key(c, &disk_key, mid);
3504 3494
3505 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3495 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3506 root->root_key.objectid, 3496 &disk_key, level, c->start, 0);
3507 &disk_key, level, c->start, 0);
3508 if (IS_ERR(split)) 3497 if (IS_ERR(split))
3509 return PTR_ERR(split); 3498 return PTR_ERR(split);
3510 3499
@@ -4282,13 +4271,12 @@ again:
4282 else 4271 else
4283 btrfs_item_key(l, &disk_key, mid); 4272 btrfs_item_key(l, &disk_key, mid);
4284 4273
4285 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 4274 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4286 root->root_key.objectid, 4275 &disk_key, 0, l->start, 0);
4287 &disk_key, 0, l->start, 0);
4288 if (IS_ERR(right)) 4276 if (IS_ERR(right))
4289 return PTR_ERR(right); 4277 return PTR_ERR(right);
4290 4278
4291 root_add_used(root, root->leafsize); 4279 root_add_used(root, root->nodesize);
4292 4280
4293 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 4281 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4294 btrfs_set_header_bytenr(right, right->start); 4282 btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4626 ptr = btrfs_item_ptr_offset(leaf, slot); 4614 ptr = btrfs_item_ptr_offset(leaf, slot);
4627 memmove_extent_buffer(leaf, ptr, 4615 memmove_extent_buffer(leaf, ptr,
4628 (unsigned long)fi, 4616 (unsigned long)fi,
4629 offsetof(struct btrfs_file_extent_item, 4617 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4630 disk_bytenr));
4631 } 4618 }
4632 } 4619 }
4633 4620
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4738 int slot; 4725 int slot;
4739 struct btrfs_map_token token; 4726 struct btrfs_map_token token;
4740 4727
4728 if (path->slots[0] == 0) {
4729 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4730 fixup_low_keys(root, path, &disk_key, 1);
4731 }
4732 btrfs_unlock_up_safe(path, 1);
4733
4741 btrfs_init_map_token(&token); 4734 btrfs_init_map_token(&token);
4742 4735
4743 leaf = path->nodes[0]; 4736 leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4798 } 4791 }
4799 4792
4800 btrfs_set_header_nritems(leaf, nritems + nr); 4793 btrfs_set_header_nritems(leaf, nritems + nr);
4801
4802 if (slot == 0) {
4803 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4804 fixup_low_keys(root, path, &disk_key, 1);
4805 }
4806 btrfs_unlock_up_safe(path, 1);
4807 btrfs_mark_buffer_dirty(leaf); 4794 btrfs_mark_buffer_dirty(leaf);
4808 4795
4809 if (btrfs_leaf_free_space(root, leaf) < 0) { 4796 if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5145 u32 nritems; 5132 u32 nritems;
5146 int level; 5133 int level;
5147 int ret = 1; 5134 int ret = 1;
5135 int keep_locks = path->keep_locks;
5148 5136
5149 WARN_ON(!path->keep_locks); 5137 path->keep_locks = 1;
5150again: 5138again:
5151 cur = btrfs_read_lock_root_node(root); 5139 cur = btrfs_read_lock_root_node(root);
5152 level = btrfs_header_level(cur); 5140 level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
5210 path->slots[level] = slot; 5198 path->slots[level] = slot;
5211 if (level == path->lowest_level) { 5199 if (level == path->lowest_level) {
5212 ret = 0; 5200 ret = 0;
5213 unlock_up(path, level, 1, 0, NULL);
5214 goto out; 5201 goto out;
5215 } 5202 }
5216 btrfs_set_path_blocking(path); 5203 btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
5225 btrfs_clear_path_blocking(path, NULL, 0); 5212 btrfs_clear_path_blocking(path, NULL, 0);
5226 } 5213 }
5227out: 5214out:
5228 if (ret == 0) 5215 path->keep_locks = keep_locks;
5216 if (ret == 0) {
5217 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5218 btrfs_set_path_blocking(path);
5229 memcpy(min_key, &found_key, sizeof(found_key)); 5219 memcpy(min_key, &found_key, sizeof(found_key));
5230 btrfs_set_path_blocking(path); 5220 }
5231 return ret; 5221 return ret;
5232} 5222}
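
Instead of warning when the caller forgot path->keep_locks, btrfs_search_forward() now saves the flag, forces it on for the duration of the search, and restores it on exit. A cut-down model of that save/set/restore flow, with the path reduced to the one field involved:

#include <stdio.h>

struct btrfs_path { int keep_locks; };

static int btrfs_search_forward(struct btrfs_path *path)
{
        int keep_locks = path->keep_locks;      /* save the caller's setting */
        int ret;

        path->keep_locks = 1;                   /* enforce what we need */
        ret = 0;                                /* ... the actual search ... */
        path->keep_locks = keep_locks;          /* restore on every exit */
        return ret;
}

int main(void)
{
        struct btrfs_path path = { .keep_locks = 0 };

        btrfs_search_forward(&path);
        printf("keep_locks after the call: %d\n", path.keep_locks);
        return 0;
}
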
5233 5223
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5375 goto out; 5365 goto out;
5376 } 5366 }
5377 5367
5378 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); 5368 tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5379 if (!tmp_buf) { 5369 if (!tmp_buf) {
5380 ret = -ENOMEM; 5370 ret = -ENOMEM;
5381 goto out; 5371 goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5520 goto out; 5510 goto out;
5521 advance_right = ADVANCE; 5511 advance_right = ADVANCE;
5522 } else { 5512 } else {
5523 enum btrfs_compare_tree_result cmp; 5513 enum btrfs_compare_tree_result result;
5524 5514
5525 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5515 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5526 ret = tree_compare_item(left_root, left_path, 5516 ret = tree_compare_item(left_root, left_path,
5527 right_path, tmp_buf); 5517 right_path, tmp_buf);
5528 if (ret) 5518 if (ret)
5529 cmp = BTRFS_COMPARE_TREE_CHANGED; 5519 result = BTRFS_COMPARE_TREE_CHANGED;
5530 else 5520 else
5531 cmp = BTRFS_COMPARE_TREE_SAME; 5521 result = BTRFS_COMPARE_TREE_SAME;
5532 ret = changed_cb(left_root, right_root, 5522 ret = changed_cb(left_root, right_root,
5533 left_path, right_path, 5523 left_path, right_path,
5534 &left_key, cmp, ctx); 5524 &left_key, result, ctx);
5535 if (ret < 0) 5525 if (ret < 0)
5536 goto out; 5526 goto out;
5537 advance_left = ADVANCE; 5527 advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e29b614fe93..d557264ee974 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34#include <linux/pagemap.h> 34#include <linux/pagemap.h>
35#include <linux/btrfs.h> 35#include <linux/btrfs.h>
36#include <linux/workqueue.h> 36#include <linux/workqueue.h>
37#include <linux/security.h>
37#include "extent_io.h" 38#include "extent_io.h"
38#include "extent_map.h" 39#include "extent_map.h"
39#include "async-thread.h" 40#include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
62 63
63#define BTRFS_COMPAT_EXTENT_TREE_V0 64#define BTRFS_COMPAT_EXTENT_TREE_V0
64 65
65/*
66 * files bigger than this get some pre-flushing when they are added
67 * to the ordered operations list. That way we limit the total
68 * work done by the commit
69 */
70#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
71
72/* holds pointers to all of the tree roots */ 66/* holds pointers to all of the tree roots */
73#define BTRFS_ROOT_TREE_OBJECTID 1ULL 67#define BTRFS_ROOT_TREE_OBJECTID 1ULL
74 68
@@ -391,10 +385,12 @@ struct btrfs_header {
391 sizeof(struct btrfs_header)) / \ 385 sizeof(struct btrfs_header)) / \
392 sizeof(struct btrfs_key_ptr)) 386 sizeof(struct btrfs_key_ptr))
393#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) 387#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
394#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) 388#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
389#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
390 (offsetof(struct btrfs_file_extent_item, disk_bytenr))
395#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 391#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
396 sizeof(struct btrfs_item) - \ 392 sizeof(struct btrfs_item) - \
397 sizeof(struct btrfs_file_extent_item)) 393 BTRFS_FILE_EXTENT_INLINE_DATA_START)
398#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 394#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
399 sizeof(struct btrfs_item) -\ 395 sizeof(struct btrfs_item) -\
400 sizeof(struct btrfs_dir_item)) 396 sizeof(struct btrfs_dir_item))
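
BTRFS_FILE_EXTENT_INLINE_DATA_START names the convention that inline file data begins at the disk_bytenr member, replacing repeated offsetof() expressions. A compile-checked userspace sketch; the struct below is trimmed to the leading fields, so treat the exact layout as illustrative:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct btrfs_file_extent_item {
        uint64_t generation;
        uint64_t ram_bytes;
        uint8_t  compression;
        uint8_t  encryption;
        uint16_t other_encoding;
        uint8_t  type;
        /* for inline extents, file data starts here */
        uint64_t disk_bytenr;
        uint64_t disk_num_bytes;
} __attribute__((packed));

#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
        (offsetof(struct btrfs_file_extent_item, disk_bytenr))

/* mirrors btrfs_file_extent_inline_item_len() after the cleanup */
static uint32_t inline_item_len(uint32_t item_size)
{
        return item_size - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}

int main(void)
{
        printf("inline data starts %zu bytes into the item\n",
               BTRFS_FILE_EXTENT_INLINE_DATA_START);
        printf("a 100-byte item holds %u bytes of file data\n",
               inline_item_len(100));
        return 0;
}
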
@@ -474,7 +470,7 @@ struct btrfs_super_block {
474 __le64 num_devices; 470 __le64 num_devices;
475 __le32 sectorsize; 471 __le32 sectorsize;
476 __le32 nodesize; 472 __le32 nodesize;
477 __le32 leafsize; 473 __le32 __unused_leafsize;
478 __le32 stripesize; 474 __le32 stripesize;
479 __le32 sys_chunk_array_size; 475 __le32 sys_chunk_array_size;
480 __le64 chunk_root_generation; 476 __le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
903 /* 899 /*
904 * disk space consumed by the extent, checksum blocks are included 900 * disk space consumed by the extent, checksum blocks are included
905 * in these numbers 901 * in these numbers
902 *
 903 * At this offset in the structure, the inline extent data starts.
906 */ 904 */
907 __le64 disk_bytenr; 905 __le64 disk_bytenr;
908 __le64 disk_num_bytes; 906 __le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
1305 */ 1303 */
1306 struct list_head cluster_list; 1304 struct list_head cluster_list;
1307 1305
1308 /* For delayed block group creation */ 1306 /* For delayed block group creation or deletion of empty block groups */
1309 struct list_head new_bg_list; 1307 struct list_head bg_list;
1310}; 1308};
1311 1309
1312/* delayed seq elem */ 1310/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
1545 struct btrfs_workqueue *endio_workers; 1543 struct btrfs_workqueue *endio_workers;
1546 struct btrfs_workqueue *endio_meta_workers; 1544 struct btrfs_workqueue *endio_meta_workers;
1547 struct btrfs_workqueue *endio_raid56_workers; 1545 struct btrfs_workqueue *endio_raid56_workers;
1546 struct btrfs_workqueue *endio_repair_workers;
1548 struct btrfs_workqueue *rmw_workers; 1547 struct btrfs_workqueue *rmw_workers;
1549 struct btrfs_workqueue *endio_meta_write_workers; 1548 struct btrfs_workqueue *endio_meta_write_workers;
1550 struct btrfs_workqueue *endio_write_workers; 1549 struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
1574 int do_barriers; 1573 int do_barriers;
1575 int closing; 1574 int closing;
1576 int log_root_recovering; 1575 int log_root_recovering;
1576 int open;
1577 1577
1578 u64 total_pinned; 1578 u64 total_pinned;
1579 1579
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
1723 1723
1724 /* Used to reclaim the metadata space in the background. */ 1724 /* Used to reclaim the metadata space in the background. */
1725 struct work_struct async_reclaim_work; 1725 struct work_struct async_reclaim_work;
1726
1727 spinlock_t unused_bgs_lock;
1728 struct list_head unused_bgs;
1729
1730 /* For btrfs to record security options */
1731 struct security_mnt_opts security_opts;
1726}; 1732};
1727 1733
1728struct btrfs_subvolume_writers { 1734struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
1776 1782
1777 /* free ino cache stuff */ 1783 /* free ino cache stuff */
1778 struct btrfs_free_space_ctl *free_ino_ctl; 1784 struct btrfs_free_space_ctl *free_ino_ctl;
1779 enum btrfs_caching_type cached; 1785 enum btrfs_caching_type ino_cache_state;
1780 spinlock_t cache_lock; 1786 spinlock_t ino_cache_lock;
1781 wait_queue_head_t cache_wait; 1787 wait_queue_head_t ino_cache_wait;
1782 struct btrfs_free_space_ctl *free_ino_pinned; 1788 struct btrfs_free_space_ctl *free_ino_pinned;
1783 u64 cache_progress; 1789 u64 ino_cache_progress;
1784 struct inode *cache_inode; 1790 struct inode *ino_cache_inode;
1785 1791
1786 struct mutex log_mutex; 1792 struct mutex log_mutex;
1787 wait_queue_head_t log_writer_wait; 1793 wait_queue_head_t log_writer_wait;
@@ -1806,18 +1812,14 @@ struct btrfs_root {
1806 /* node allocations are done in nodesize units */ 1812 /* node allocations are done in nodesize units */
1807 u32 nodesize; 1813 u32 nodesize;
1808 1814
1809 /* leaf allocations are done in leafsize units */
1810 u32 leafsize;
1811
1812 u32 stripesize; 1815 u32 stripesize;
1813 1816
1814 u32 type; 1817 u32 type;
1815 1818
1816 u64 highest_objectid; 1819 u64 highest_objectid;
1817 1820
1818#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1821 /* only used when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
1819 u64 alloc_bytenr; 1822 u64 alloc_bytenr;
1820#endif
1821 1823
1822 u64 defrag_trans_start; 1824 u64 defrag_trans_start;
1823 struct btrfs_key defrag_progress; 1825 struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
2094#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) 2096#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2095 2097
2096#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2098#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2099#define BTRFS_DEFAULT_MAX_INLINE (8192)
2097 2100
2098#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2101#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
2099#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 2102#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
2995 sectorsize, 32); 2998 sectorsize, 32);
2996BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, 2999BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
2997 nodesize, 32); 3000 nodesize, 32);
2998BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
2999 leafsize, 32);
3000BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, 3001BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
3001 stripesize, 32); 3002 stripesize, 32);
3002BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, 3003BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
3049static inline unsigned long 3050static inline unsigned long
3050btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) 3051btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
3051{ 3052{
3052 unsigned long offset = (unsigned long)e; 3053 return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
3053 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
3054 return offset;
3055} 3054}
3056 3055
3057static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) 3056static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
3058{ 3057{
3059 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; 3058 return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
3060} 3059}
3061 3060
3062BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, 3061BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
3086static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, 3085static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
3087 struct btrfs_item *e) 3086 struct btrfs_item *e)
3088{ 3087{
3089 unsigned long offset; 3088 return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
3090 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
3091 return btrfs_item_size(eb, e) - offset;
3092} 3089}
3093 3090
3094/* this returns the number of file bytes represented by the inline item. 3091/* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
3232 return sb->s_fs_info; 3229 return sb->s_fs_info;
3233} 3230}
3234 3231
3235static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
3236{
3237 if (level == 0)
3238 return root->leafsize;
3239 return root->nodesize;
3240}
3241
3242/* helper function to cast into the data area of the leaf. */ 3232/* helper function to cast into the data area of the leaf. */
3243#define btrfs_item_ptr(leaf, slot, type) \ 3233#define btrfs_item_ptr(leaf, slot, type) \
3244 ((type *)(btrfs_leaf_data(leaf) + \ 3234 ((type *)(btrfs_leaf_data(leaf) + \
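
The deleted btrfs_level_size() only distinguished level 0 (leafsize) from everything else (nodesize), and mkfs has long written the two values equal, so every caller can read root->nodesize directly. A tiny check of that invariant, with the removed helper reproduced for comparison and the root cut down to one field:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct btrfs_root { uint32_t nodesize; };

/* the removed helper, kept here for comparison only; leafsize is passed
 * explicitly because the stub root no longer carries the field */
static uint32_t btrfs_level_size(const struct btrfs_root *root,
                                 uint32_t leafsize, int level)
{
        if (level == 0)
                return leafsize;
        return root->nodesize;
}

int main(void)
{
        struct btrfs_root root = { .nodesize = 16384 };
        uint32_t leafsize = root.nodesize;  /* the invariant the patch relies on */

        for (int level = 0; level < 8; level++)
                assert(btrfs_level_size(&root, leafsize, level) == root.nodesize);
        printf("every level is %u bytes\n", root.nodesize);
        return 0;
}
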
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
3263static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3253static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3264 unsigned num_items) 3254 unsigned num_items)
3265{ 3255{
3266 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3256 return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3267 2 * num_items; 3257 2 * num_items;
3268} 3258}
3269 3259
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3274static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, 3264static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3275 unsigned num_items) 3265 unsigned num_items)
3276{ 3266{
3277 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3267 return root->nodesize * BTRFS_MAX_LEVEL * num_items;
3278 num_items;
3279} 3268}
3280 3269
3281int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 3270int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
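
With leafsize == nodesize, the old (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) term is simply nodesize * BTRFS_MAX_LEVEL, which is what the truncate variant now spells out. A quick numeric check using the common 16K node size (values illustrative only):

#include <stdio.h>
#include <stdint.h>

#define BTRFS_MAX_LEVEL 8

/* one fully COW'd path per item, doubled for splits; algebraically equal
 * to the (nodesize + nodesize * (MAX_LEVEL - 1)) * 2 form kept in the tree */
static uint64_t calc_trans_metadata_size(uint32_t nodesize, unsigned num_items)
{
        return (uint64_t)nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

/* truncate only needs the single path */
static uint64_t calc_trunc_metadata_size(uint32_t nodesize, unsigned num_items)
{
        return (uint64_t)nodesize * BTRFS_MAX_LEVEL * num_items;
}

int main(void)
{
        uint32_t nodesize = 16384;

        printf("insert 4 items: reserve %llu bytes\n",
               (unsigned long long)calc_trans_metadata_size(nodesize, 4));
        printf("truncate 4 items: reserve %llu bytes\n",
               (unsigned long long)calc_trunc_metadata_size(nodesize, 4));
        return 0;
}
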
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
3305 u64 bytenr); 3294 u64 bytenr);
3306void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3295void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3307int get_block_group_index(struct btrfs_block_group_cache *cache); 3296int get_block_group_index(struct btrfs_block_group_cache *cache);
3308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3297struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root, u32 blocksize, 3298 struct btrfs_root *root, u64 parent,
3310 u64 parent, u64 root_objectid, 3299 u64 root_objectid,
3311 struct btrfs_disk_key *key, int level, 3300 struct btrfs_disk_key *key, int level,
3312 u64 hint, u64 empty_size); 3301 u64 hint, u64 empty_size);
3313void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3302void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
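
Caller-side view of the btrfs_alloc_free_block() to btrfs_alloc_tree_block() rename: the u32 blocksize argument disappears and everything else keeps its meaning. The stub allocator below only echoes the implied size; all types here are stand-ins for the real ones:

#include <stdio.h>
#include <stdint.h>

struct btrfs_trans_handle { int unused; };
struct btrfs_root { uint32_t nodesize; };
struct btrfs_disk_key { uint64_t objectid; };
struct extent_buffer { uint32_t len; };

static struct extent_buffer *btrfs_alloc_tree_block(
                struct btrfs_trans_handle *trans, struct btrfs_root *root,
                uint64_t parent, uint64_t root_objectid,
                struct btrfs_disk_key *key, int level,
                uint64_t hint, uint64_t empty_size)
{
        static struct extent_buffer eb;

        (void)trans; (void)parent; (void)root_objectid;
        (void)key; (void)level; (void)hint; (void)empty_size;
        eb.len = root->nodesize;  /* the size is implied, no longer passed */
        return &eb;
}

int main(void)
{
        struct btrfs_trans_handle trans = { 0 };
        struct btrfs_root root = { .nodesize = 16384 };
        struct extent_buffer *eb =
                btrfs_alloc_tree_block(&trans, &root, 0, 1, NULL, 0, 0, 0);

        printf("allocated a %u byte tree block\n", eb->len);
        return 0;
}
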
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3363 u64 size); 3352 u64 size);
3364int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3353int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3365 struct btrfs_root *root, u64 group_start); 3354 struct btrfs_root *root, u64 group_start);
3355void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3366void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3356void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3367 struct btrfs_root *root); 3357 struct btrfs_root *root);
3368u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3358u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3604 kfree(fs_info->uuid_root); 3594 kfree(fs_info->uuid_root);
3605 kfree(fs_info->super_copy); 3595 kfree(fs_info->super_copy);
3606 kfree(fs_info->super_for_commit); 3596 kfree(fs_info->super_for_commit);
3597 security_free_mnt_opts(&fs_info->security_opts);
3607 kfree(fs_info); 3598 kfree(fs_info);
3608} 3599}
3609 3600
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
3739int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 3730int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
3740 struct bio *bio, u32 *dst); 3731 struct bio *bio, u32 *dst);
3741int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 3732int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
3742 struct btrfs_dio_private *dip, struct bio *bio, 3733 struct bio *bio, u64 logical_offset);
3743 u64 logical_offset);
3744int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 3734int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root, 3735 struct btrfs_root *root,
3746 u64 objectid, u64 pos, 3736 u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
4141/* Sanity test specific functions */ 4131/* Sanity test specific functions */
4142#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4132#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4143void btrfs_test_destroy_inode(struct inode *inode); 4133void btrfs_test_destroy_inode(struct inode *inode);
4144int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
4145 u64 rfer, u64 excl);
4146#endif 4134#endif
4147 4135
4136static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
4137{
4138#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4139 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
4140 return 1;
4141#endif
4142 return 0;
4143}
4144
4148#endif 4145#endif
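
btrfs_test_is_dummy_root() hoists the sanity-test #ifdef out of every call site: with the option disabled it is a constant 0 and the compiler drops the test-only branch. A stand-alone analogue; the bit number and state encoding are placeholders, not the kernel's:

#include <stdio.h>

/* #define CONFIG_BTRFS_FS_RUN_SANITY_TESTS */

struct btrfs_root { unsigned long state; };
#define BTRFS_ROOT_DUMMY_ROOT 0UL       /* placeholder bit position */

static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        if (root->state & (1UL << BTRFS_ROOT_DUMMY_ROOT))
                return 1;
#endif
        (void)root;
        return 0;
}

int main(void)
{
        struct btrfs_root root = { .state = 1UL };

        if (btrfs_test_is_dummy_root(&root))
                printf("dummy root: take the test shortcut\n");
        else
                printf("real root: normal code path\n");
        return 0;
}
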
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..054577bddaf2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1042 int ret; 1042 int ret;
1043 1043
1044 key.objectid = node->inode_id; 1044 key.objectid = node->inode_id;
1045 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1045 key.type = BTRFS_INODE_ITEM_KEY;
1046 key.offset = 0; 1046 key.offset = 0;
1047 1047
1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
1099search: 1099search:
1100 btrfs_release_path(path); 1100 btrfs_release_path(path);
1101 1101
1102 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 1102 key.type = BTRFS_INODE_EXTREF_KEY;
1103 key.offset = -1; 1103 key.offset = -1;
1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1105 if (ret < 0) 1105 if (ret < 0)
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1395 return -ENOMEM; 1395 return -ENOMEM;
1396 1396
1397 async_work->delayed_root = delayed_root; 1397 async_work->delayed_root = delayed_root;
1398 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, 1398 btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
1399 NULL, NULL); 1399 btrfs_async_run_delayed_root, NULL, NULL);
1400 async_work->nr = nr; 1400 async_work->nr = nr;
1401 1401
1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); 1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1473 } 1473 }
1474 1474
1475 delayed_item->key.objectid = btrfs_ino(dir); 1475 delayed_item->key.objectid = btrfs_ino(dir);
1476 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1476 delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
1477 delayed_item->key.offset = index; 1477 delayed_item->key.offset = index;
1478 1478
1479 dir_item = (struct btrfs_dir_item *)delayed_item->data; 1479 dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1542 return PTR_ERR(node); 1542 return PTR_ERR(node);
1543 1543
1544 item_key.objectid = btrfs_ino(dir); 1544 item_key.objectid = btrfs_ino(dir);
1545 btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); 1545 item_key.type = BTRFS_DIR_INDEX_KEY;
1546 item_key.offset = index; 1546 item_key.offset = index;
1547 1547
1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); 1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
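
The btrfs_set_key_type() wrapper bought nothing: the type field is a plain u8 in both the cpu and the on-disk key, so no endian conversion is needed and direct assignment is equivalent. A sketch with the key trimmed to its three logical fields (the real cpu key is declared differently):

#include <stdio.h>
#include <stdint.h>

#define BTRFS_DIR_INDEX_KEY 96  /* on-disk type value */

struct btrfs_key {
        uint64_t objectid;
        uint8_t  type;          /* u8 on disk and in memory alike */
        uint64_t offset;
};

int main(void)
{
        struct btrfs_key key;

        key.objectid = 256;                     /* some inode number */
        key.type = BTRFS_DIR_INDEX_KEY;         /* was btrfs_set_key_type() */
        key.offset = 2;                         /* directory index */

        printf("key (%llu %u %llu)\n", (unsigned long long)key.objectid,
               key.type, (unsigned long long)key.offset);
        return 0;
}
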
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..6f662b34ba0e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
168 dev_replace->srcdev->total_bytes; 168 dev_replace->srcdev->total_bytes;
169 dev_replace->tgtdev->disk_total_bytes = 169 dev_replace->tgtdev->disk_total_bytes =
170 dev_replace->srcdev->disk_total_bytes; 170 dev_replace->srcdev->disk_total_bytes;
171 dev_replace->tgtdev->commit_total_bytes =
172 dev_replace->srcdev->commit_total_bytes;
171 dev_replace->tgtdev->bytes_used = 173 dev_replace->tgtdev->bytes_used =
172 dev_replace->srcdev->bytes_used; 174 dev_replace->srcdev->bytes_used;
175 dev_replace->tgtdev->commit_bytes_used =
176 dev_replace->srcdev->commit_bytes_used;
173 } 177 }
174 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; 178 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
175 btrfs_init_dev_replace_tgtdev_for_resume(fs_info, 179 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
329 args->start.tgtdev_name[0] == '\0') 333 args->start.tgtdev_name[0] == '\0')
330 return -EINVAL; 334 return -EINVAL;
331 335
332 mutex_lock(&fs_info->volume_mutex); 336 /*
333 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 337 * Here we commit the transaction to make sure commit_total_bytes
334 &tgt_device); 338 * of all the devices are updated.
335 if (ret) { 339 */
336 btrfs_err(fs_info, "target device %s is invalid!", 340 trans = btrfs_attach_transaction(root);
337 args->start.tgtdev_name); 341 if (!IS_ERR(trans)) {
338 mutex_unlock(&fs_info->volume_mutex); 342 ret = btrfs_commit_transaction(trans, root);
339 return -EINVAL; 343 if (ret)
344 return ret;
345 } else if (PTR_ERR(trans) != -ENOENT) {
346 return PTR_ERR(trans);
340 } 347 }
341 348
349 /* the disk copy procedure reuses the scrub code */
350 mutex_lock(&fs_info->volume_mutex);
342 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 351 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
343 args->start.srcdev_name, 352 args->start.srcdev_name,
344 &src_device); 353 &src_device);
345 mutex_unlock(&fs_info->volume_mutex);
346 if (ret) { 354 if (ret) {
347 ret = -EINVAL; 355 mutex_unlock(&fs_info->volume_mutex);
348 goto leave_no_lock; 356 return ret;
349 } 357 }
350 358
351 if (tgt_device->total_bytes < src_device->total_bytes) { 359 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
352 btrfs_err(fs_info, "target device is smaller than source device!"); 360 src_device, &tgt_device);
353 ret = -EINVAL; 361 mutex_unlock(&fs_info->volume_mutex);
354 goto leave_no_lock; 362 if (ret)
355 } 363 return ret;
356 364
357 btrfs_dev_replace_lock(dev_replace); 365 btrfs_dev_replace_lock(dev_replace);
358 switch (dev_replace->replace_state) { 366 switch (dev_replace->replace_state) {
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
380 src_device->devid, 388 src_device->devid,
381 rcu_str_deref(tgt_device->name)); 389 rcu_str_deref(tgt_device->name));
382 390
383 tgt_device->total_bytes = src_device->total_bytes;
384 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
385 tgt_device->bytes_used = src_device->bytes_used;
386
387 /* 391 /*
388 * from now on, the writes to the srcdev are all duplicated to 392 * from now on, the writes to the srcdev are all duplicated to
389 * go to the tgtdev as well (refer to btrfs_map_block()). 393 * go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
414 418
415 /* the disk copy procedure reuses the scrub code */ 419 /* the disk copy procedure reuses the scrub code */
416 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 420 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
417 src_device->total_bytes, 421 btrfs_device_get_total_bytes(src_device),
418 &dev_replace->scrub_progress, 0, 1); 422 &dev_replace->scrub_progress, 0, 1);
419 423
420 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 424 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
426 dev_replace->srcdev = NULL; 430 dev_replace->srcdev = NULL;
427 dev_replace->tgtdev = NULL; 431 dev_replace->tgtdev = NULL;
428 btrfs_dev_replace_unlock(dev_replace); 432 btrfs_dev_replace_unlock(dev_replace);
429leave_no_lock: 433 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
430 if (tgt_device)
431 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
432 return ret; 434 return ret;
433} 435}
434 436
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 ret = btrfs_commit_transaction(trans, root); 509 ret = btrfs_commit_transaction(trans, root);
508 WARN_ON(ret); 510 WARN_ON(ret);
509 511
512 mutex_lock(&uuid_mutex);
510 /* keep away write_all_supers() during the finishing procedure */ 513 /* keep away write_all_supers() during the finishing procedure */
511 mutex_lock(&root->fs_info->chunk_mutex);
512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 514 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
515 mutex_lock(&root->fs_info->chunk_mutex);
513 btrfs_dev_replace_lock(dev_replace); 516 btrfs_dev_replace_lock(dev_replace);
514 dev_replace->replace_state = 517 dev_replace->replace_state =
515 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 518 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 src_device->devid, 535 src_device->devid,
533 rcu_str_deref(tgt_device->name), scrub_ret); 536 rcu_str_deref(tgt_device->name), scrub_ret);
534 btrfs_dev_replace_unlock(dev_replace); 537 btrfs_dev_replace_unlock(dev_replace);
535 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
536 mutex_unlock(&root->fs_info->chunk_mutex); 538 mutex_unlock(&root->fs_info->chunk_mutex);
539 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
540 mutex_unlock(&uuid_mutex);
537 if (tgt_device) 541 if (tgt_device)
538 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 542 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
539 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 543 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
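
The reordered lock calls make the finishing path take device_list_mutex before chunk_mutex, with uuid_mutex outermost, and release them in reverse; acquiring the same locks in different orders on different paths is the classic ABBA deadlock. A userspace illustration of the discipline using pthreads:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t uuid_mutex        = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t chunk_mutex       = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
        /* acquire outermost to innermost, always in this order */
        pthread_mutex_lock(&uuid_mutex);
        pthread_mutex_lock(&device_list_mutex);
        pthread_mutex_lock(&chunk_mutex);

        printf("swap src/tgt device state under all three locks\n");

        /* release innermost to outermost */
        pthread_mutex_unlock(&chunk_mutex);
        pthread_mutex_unlock(&device_list_mutex);
        pthread_mutex_unlock(&uuid_mutex);
        return 0;
}
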
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
542 } 546 }
543 547
544 printk_in_rcu(KERN_INFO 548 printk_in_rcu(KERN_INFO
545 "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", 549 "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
546 src_device->missing ? "<missing disk>" : 550 src_device->missing ? "<missing disk>" :
547 rcu_str_deref(src_device->name), 551 rcu_str_deref(src_device->name),
548 src_device->devid, 552 src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
550 tgt_device->is_tgtdev_for_dev_replace = 0; 554 tgt_device->is_tgtdev_for_dev_replace = 0;
551 tgt_device->devid = src_device->devid; 555 tgt_device->devid = src_device->devid;
552 src_device->devid = BTRFS_DEV_REPLACE_DEVID; 556 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
553 tgt_device->bytes_used = src_device->bytes_used;
554 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 557 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
555 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 558 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
556 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 559 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
557 tgt_device->total_bytes = src_device->total_bytes; 560 btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
558 tgt_device->disk_total_bytes = src_device->disk_total_bytes; 561 btrfs_device_set_disk_total_bytes(tgt_device,
559 tgt_device->bytes_used = src_device->bytes_used; 562 src_device->disk_total_bytes);
563 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
564 ASSERT(list_empty(&src_device->resized_list));
565 tgt_device->commit_total_bytes = src_device->commit_total_bytes;
566 tgt_device->commit_bytes_used = src_device->bytes_used;
560 if (fs_info->sb->s_bdev == src_device->bdev) 567 if (fs_info->sb->s_bdev == src_device->bdev)
561 fs_info->sb->s_bdev = tgt_device->bdev; 568 fs_info->sb->s_bdev = tgt_device->bdev;
562 if (fs_info->fs_devices->latest_bdev == src_device->bdev) 569 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
563 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 570 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
564 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 571 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
572 fs_info->fs_devices->rw_devices++;
565 573
566 /* replace the sysfs entry */ 574 /* replace the sysfs entry */
567 btrfs_kobj_rm_device(fs_info, src_device); 575 btrfs_kobj_rm_device(fs_info, src_device);
568 btrfs_kobj_add_device(fs_info, tgt_device); 576 btrfs_kobj_add_device(fs_info, tgt_device);
569 577
578 btrfs_dev_replace_unlock(dev_replace);
579
570 btrfs_rm_dev_replace_blocked(fs_info); 580 btrfs_rm_dev_replace_blocked(fs_info);
571 581
572 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 582 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
580 * superblock is scratched out so that it is no longer marked to 590 * superblock is scratched out so that it is no longer marked to
581 * belong to this filesystem. 591 * belong to this filesystem.
582 */ 592 */
583 btrfs_dev_replace_unlock(dev_replace);
584 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
585 mutex_unlock(&root->fs_info->chunk_mutex); 593 mutex_unlock(&root->fs_info->chunk_mutex);
594 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
595 mutex_unlock(&uuid_mutex);
586 596
587 /* write back the superblocks */ 597 /* write back the superblocks */
588 trans = btrfs_start_transaction(root, 0); 598 trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
643 struct btrfs_ioctl_dev_replace_args *args) 653 struct btrfs_ioctl_dev_replace_args *args)
644{ 654{
645 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 655 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
656 struct btrfs_device *srcdev;
646 657
647 btrfs_dev_replace_lock(dev_replace); 658 btrfs_dev_replace_lock(dev_replace);
648 /* even if !dev_replace_is_valid, the values are good enough for 659 /* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
665 break; 676 break;
666 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 677 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
667 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 678 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
679 srcdev = dev_replace->srcdev;
668 args->status.progress_1000 = div64_u64(dev_replace->cursor_left, 680 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
669 div64_u64(dev_replace->srcdev->total_bytes, 1000)); 681 div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
670 break; 682 break;
671 } 683 }
672 btrfs_dev_replace_unlock(dev_replace); 684 btrfs_dev_replace_unlock(dev_replace);
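
The status ioctl reports replace progress in tenths of a percent: cursor_left divided by a thousandth of the source device's size, with the size now read through the btrfs_device_get_total_bytes() accessor. A numeric check with invented device sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t progress_1000(uint64_t cursor_left, uint64_t total_bytes)
{
        return cursor_left / (total_bytes / 1000);
}

int main(void)
{
        uint64_t total = 500ULL << 30;  /* 500 GiB source device */
        uint64_t done  = 125ULL << 30;  /* copy cursor at 125 GiB */

        printf("replace progress: %llu/1000\n",
               (unsigned long long)progress_1000(done, total));
        return 0;
}
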
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
825 837
826 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 838 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
827 dev_replace->committed_cursor_left, 839 dev_replace->committed_cursor_left,
828 dev_replace->srcdev->total_bytes, 840 btrfs_device_get_total_bytes(dev_replace->srcdev),
829 &dev_replace->scrub_progress, 0, 1); 841 &dev_replace->scrub_progress, 0, 1);
830 ret = btrfs_dev_replace_finishing(fs_info, ret); 842 ret = btrfs_dev_replace_finishing(fs_info, ret);
831 WARN_ON(ret); 843 WARN_ON(ret);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df5dcea..fc8df866e919 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); 86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
87 87
88 key.objectid = objectid; 88 key.objectid = objectid;
89 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 89 key.type = BTRFS_XATTR_ITEM_KEY;
90 key.offset = btrfs_name_hash(name, name_len); 90 key.offset = btrfs_name_hash(name, name_len);
91 91
92 data_size = sizeof(*dir_item) + name_len + data_len; 92 data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
137 u32 data_size; 137 u32 data_size;
138 138
139 key.objectid = btrfs_ino(dir); 139 key.objectid = btrfs_ino(dir);
140 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 140 key.type = BTRFS_DIR_ITEM_KEY;
141 key.offset = btrfs_name_hash(name, name_len); 141 key.offset = btrfs_name_hash(name, name_len);
142 142
143 path = btrfs_alloc_path(); 143 path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
204 int cow = mod != 0; 204 int cow = mod != 0;
205 205
206 key.objectid = dir; 206 key.objectid = dir;
207 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 207 key.type = BTRFS_DIR_ITEM_KEY;
208 208
209 key.offset = btrfs_name_hash(name, name_len); 209 key.offset = btrfs_name_hash(name, name_len);
210 210
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
234 return -ENOMEM; 234 return -ENOMEM;
235 235
236 key.objectid = dir; 236 key.objectid = dir;
237 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 237 key.type = BTRFS_DIR_ITEM_KEY;
238 key.offset = btrfs_name_hash(name, name_len); 238 key.offset = btrfs_name_hash(name, name_len);
239 239
240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
297 int cow = mod != 0; 297 int cow = mod != 0;
298 298
299 key.objectid = dir; 299 key.objectid = dir;
300 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 300 key.type = BTRFS_DIR_INDEX_KEY;
301 key.offset = objectid; 301 key.offset = objectid;
302 302
303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
367 int cow = mod != 0; 367 int cow = mod != 0;
368 368
369 key.objectid = dir; 369 key.objectid = dir;
370 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 370 key.type = BTRFS_XATTR_ITEM_KEY;
371 key.offset = btrfs_name_hash(name, name_len); 371 key.offset = btrfs_name_hash(name, name_len);
372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
373 if (ret < 0) 373 if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d0ed9e664f7d..fa45e3cae40d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
39#include "btrfs_inode.h" 39#include "btrfs_inode.h"
40#include "volumes.h" 40#include "volumes.h"
41#include "print-tree.h" 41#include "print-tree.h"
42#include "async-thread.h"
43#include "locking.h" 42#include "locking.h"
44#include "tree-log.h" 43#include "tree-log.h"
45#include "free-space-cache.h" 44#include "free-space-cache.h"
@@ -73,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
73static void btrfs_error_commit_super(struct btrfs_root *root); 72static void btrfs_error_commit_super(struct btrfs_root *root);
74 73
75/* 74/*
76 * end_io_wq structs are used to do processing in task context when an IO is 75 * btrfs_end_io_wq structs are used to do processing in task context when an IO
77 * complete. This is used during reads to verify checksums, and it is used 76 * is complete. This is used during reads to verify checksums, and it is used
78 * by writes to insert metadata for new file extents after IO is complete. 77 * by writes to insert metadata for new file extents after IO is complete.
79 */ 78 */
80struct end_io_wq { 79struct btrfs_end_io_wq {
81 struct bio *bio; 80 struct bio *bio;
82 bio_end_io_t *end_io; 81 bio_end_io_t *end_io;
83 void *private; 82 void *private;
84 struct btrfs_fs_info *info; 83 struct btrfs_fs_info *info;
85 int error; 84 int error;
86 int metadata; 85 enum btrfs_wq_endio_type metadata;
87 struct list_head list; 86 struct list_head list;
88 struct btrfs_work work; 87 struct btrfs_work work;
89}; 88};
90 89
90static struct kmem_cache *btrfs_end_io_wq_cache;
91
92int __init btrfs_end_io_wq_init(void)
93{
94 btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
95 sizeof(struct btrfs_end_io_wq),
96 0,
97 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
98 NULL);
99 if (!btrfs_end_io_wq_cache)
100 return -ENOMEM;
101 return 0;
102}
103
104void btrfs_end_io_wq_exit(void)
105{
106 if (btrfs_end_io_wq_cache)
107 kmem_cache_destroy(btrfs_end_io_wq_cache);
108}
109
91/* 110/*
92 * async submit bios are used to offload expensive checksumming 111 * async submit bios are used to offload expensive checksumming
93 * onto the worker threads. They checksum file and metadata bios 112 * onto the worker threads. They checksum file and metadata bios
@@ -328,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
328{ 347{
329 struct extent_state *cached_state = NULL; 348 struct extent_state *cached_state = NULL;
330 int ret; 349 int ret;
331 bool need_lock = (current->journal_info == 350 bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
332 (void *)BTRFS_SEND_TRANS_STUB);
333 351
334 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 352 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
335 return 0; 353 return 0;
@@ -349,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
349 ret = 0; 367 ret = 0;
350 goto out; 368 goto out;
351 } 369 }
352 printk_ratelimited("parent transid verify failed on %llu wanted %llu " 370 printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
353 "found %llu\n", 371 eb->fs_info->sb->s_id, eb->start,
354 eb->start, parent_transid, btrfs_header_generation(eb)); 372 parent_transid, btrfs_header_generation(eb));
355 ret = 1; 373 ret = 1;
356 374
357 /* 375 /*
@@ -608,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
608 goto err; 626 goto err;
609 627
610 eb->read_mirror = mirror; 628 eb->read_mirror = mirror;
611 if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 629 if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
612 ret = -EIO; 630 ret = -EIO;
613 goto err; 631 goto err;
614 } 632 }
615 633
616 found_start = btrfs_header_bytenr(eb); 634 found_start = btrfs_header_bytenr(eb);
617 if (found_start != eb->start) { 635 if (found_start != eb->start) {
618 printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " 636 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
619 "%llu %llu\n", 637 "%llu %llu\n",
620 found_start, eb->start); 638 eb->fs_info->sb->s_id, found_start, eb->start);
621 ret = -EIO; 639 ret = -EIO;
622 goto err; 640 goto err;
623 } 641 }
624 if (check_tree_block_fsid(root, eb)) { 642 if (check_tree_block_fsid(root, eb)) {
625 printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", 643 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
626 eb->start); 644 eb->fs_info->sb->s_id, eb->start);
627 ret = -EIO; 645 ret = -EIO;
628 goto err; 646 goto err;
629 } 647 }
@@ -681,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
681 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 699 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
682 700
683 eb = (struct extent_buffer *)page->private; 701 eb = (struct extent_buffer *)page->private;
684 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 702 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
685 eb->read_mirror = failed_mirror; 703 eb->read_mirror = failed_mirror;
686 atomic_dec(&eb->io_pages); 704 atomic_dec(&eb->io_pages);
687 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 705 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -691,52 +709,55 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
691 709
692static void end_workqueue_bio(struct bio *bio, int err) 710static void end_workqueue_bio(struct bio *bio, int err)
693{ 711{
694 struct end_io_wq *end_io_wq = bio->bi_private; 712 struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
695 struct btrfs_fs_info *fs_info; 713 struct btrfs_fs_info *fs_info;
714 struct btrfs_workqueue *wq;
715 btrfs_work_func_t func;
696 716
697 fs_info = end_io_wq->info; 717 fs_info = end_io_wq->info;
698 end_io_wq->error = err; 718 end_io_wq->error = err;
699 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
700 719
701 if (bio->bi_rw & REQ_WRITE) { 720 if (bio->bi_rw & REQ_WRITE) {
702 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 721 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
703 btrfs_queue_work(fs_info->endio_meta_write_workers, 722 wq = fs_info->endio_meta_write_workers;
704 &end_io_wq->work); 723 func = btrfs_endio_meta_write_helper;
705 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 724 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
706 btrfs_queue_work(fs_info->endio_freespace_worker, 725 wq = fs_info->endio_freespace_worker;
707 &end_io_wq->work); 726 func = btrfs_freespace_write_helper;
708 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 727 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
709 btrfs_queue_work(fs_info->endio_raid56_workers, 728 wq = fs_info->endio_raid56_workers;
710 &end_io_wq->work); 729 func = btrfs_endio_raid56_helper;
711 else 730 } else {
712 btrfs_queue_work(fs_info->endio_write_workers, 731 wq = fs_info->endio_write_workers;
713 &end_io_wq->work); 732 func = btrfs_endio_write_helper;
733 }
714 } else { 734 } else {
715 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 735 if (unlikely(end_io_wq->metadata ==
716 btrfs_queue_work(fs_info->endio_raid56_workers, 736 BTRFS_WQ_ENDIO_DIO_REPAIR)) {
717 &end_io_wq->work); 737 wq = fs_info->endio_repair_workers;
718 else if (end_io_wq->metadata) 738 func = btrfs_endio_repair_helper;
719 btrfs_queue_work(fs_info->endio_meta_workers, 739 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
720 &end_io_wq->work); 740 wq = fs_info->endio_raid56_workers;
721 else 741 func = btrfs_endio_raid56_helper;
722 btrfs_queue_work(fs_info->endio_workers, 742 } else if (end_io_wq->metadata) {
723 &end_io_wq->work); 743 wq = fs_info->endio_meta_workers;
744 func = btrfs_endio_meta_helper;
745 } else {
746 wq = fs_info->endio_workers;
747 func = btrfs_endio_helper;
748 }
724 } 749 }
750
751 btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
752 btrfs_queue_work(wq, &end_io_wq->work);
725} 753}
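
The rewritten end_workqueue_bio() separates choosing from doing: each branch only selects a (workqueue, helper) pair, and a single init-and-queue tail replaces the duplicated btrfs_queue_work() calls. A userspace model of that shape, with strings standing in for the queues and trivial functions for the helpers:

#include <stdio.h>
#include <stdbool.h>

typedef void (*endio_func_t)(const char *wq);

static void endio_meta(const char *wq)  { printf("meta endio on %s\n", wq); }
static void endio_data(const char *wq)  { printf("data endio on %s\n", wq); }
static void endio_write(const char *wq) { printf("write endio on %s\n", wq); }

static void end_workqueue_bio(bool is_write, bool metadata)
{
        const char *wq;
        endio_func_t func;

        if (is_write) {
                wq = metadata ? "endio_meta_write_workers"
                              : "endio_write_workers";
                func = endio_write;
        } else if (metadata) {
                wq = "endio_meta_workers";
                func = endio_meta;
        } else {
                wq = "endio_workers";
                func = endio_data;
        }

        /* single tail: init the work item once and queue it once */
        func(wq);
}

int main(void)
{
        end_workqueue_bio(true, true);
        end_workqueue_bio(false, false);
        return 0;
}
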
726 754
727/*
728 * For the metadata arg you want
729 *
730 * 0 - if data
 731 * 1 - if normal metadata
732 * 2 - if writing to the free space cache area
733 * 3 - raid parity work
734 */
735int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 755int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
736 int metadata) 756 enum btrfs_wq_endio_type metadata)
737{ 757{
738 struct end_io_wq *end_io_wq; 758 struct btrfs_end_io_wq *end_io_wq;
739 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); 759
760 end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
740 if (!end_io_wq) 761 if (!end_io_wq)
741 return -ENOMEM; 762 return -ENOMEM;
742 763
@@ -828,7 +849,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
828 async->submit_bio_start = submit_bio_start; 849 async->submit_bio_start = submit_bio_start;
829 async->submit_bio_done = submit_bio_done; 850 async->submit_bio_done = submit_bio_done;
830 851
831 btrfs_init_work(&async->work, run_one_async_start, 852 btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
832 run_one_async_done, run_one_async_free); 853 run_one_async_done, run_one_async_free);
833 854
834 async->bio_flags = bio_flags; 855 async->bio_flags = bio_flags;
@@ -920,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
920 * can happen in the async kernel threads 941 * can happen in the async kernel threads
921 */ 942 */
922 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 943 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
923 bio, 1); 944 bio, BTRFS_WQ_ENDIO_METADATA);
924 if (ret) 945 if (ret)
925 goto out_w_error; 946 goto out_w_error;
926 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 947 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1052,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
1052 .set_page_dirty = btree_set_page_dirty, 1073 .set_page_dirty = btree_set_page_dirty,
1053}; 1074};
1054 1075
1055int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1076void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
1056 u64 parent_transid)
1057{ 1077{
1058 struct extent_buffer *buf = NULL; 1078 struct extent_buffer *buf = NULL;
1059 struct inode *btree_inode = root->fs_info->btree_inode; 1079 struct inode *btree_inode = root->fs_info->btree_inode;
1060 int ret = 0;
1061 1080
1062 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1081 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1063 if (!buf) 1082 if (!buf)
1064 return 0; 1083 return;
1065 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1084 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1066 buf, 0, WAIT_NONE, btree_get_extent, 0); 1085 buf, 0, WAIT_NONE, btree_get_extent, 0);
1067 free_extent_buffer(buf); 1086 free_extent_buffer(buf);
1068 return ret;
1069} 1087}
1070 1088
1071int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1089int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1101,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1101} 1119}
1102 1120
1103struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1121struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1104 u64 bytenr, u32 blocksize) 1122 u64 bytenr)
1105{ 1123{
1106 return find_extent_buffer(root->fs_info, bytenr); 1124 return find_extent_buffer(root->fs_info, bytenr);
1107} 1125}
@@ -1109,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1109struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1127struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1110 u64 bytenr, u32 blocksize) 1128 u64 bytenr, u32 blocksize)
1111{ 1129{
1112#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1130 if (btrfs_test_is_dummy_root(root))
1113 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1114 return alloc_test_extent_buffer(root->fs_info, bytenr, 1131 return alloc_test_extent_buffer(root->fs_info, bytenr,
1115 blocksize); 1132 blocksize);
1116#endif
1117 return alloc_extent_buffer(root->fs_info, bytenr, blocksize); 1133 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1118} 1134}
1119 1135
@@ -1131,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize, u64 parent_transid)
+				      u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
 	int ret;
 
-	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
 	if (!buf)
 		return NULL;
 
@@ -1178,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
 	if (!writers)
 		return ERR_PTR(-ENOMEM);
 
-	ret = percpu_counter_init(&writers->counter, 0);
+	ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
 	if (ret < 0) {
 		kfree(writers);
 		return ERR_PTR(ret);
@@ -1195,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
 	kfree(writers);
 }
 
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
-			 u32 stripesize, struct btrfs_root *root,
-			 struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+			 struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 			 u64 objectid)
 {
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
-	root->leafsize = leafsize;
 	root->stripesize = stripesize;
 	root->state = 0;
 	root->orphan_cleanup_state = 0;
@@ -1290,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
 	root = btrfs_alloc_root(NULL);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
-	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+	__setup_root(4096, 4096, 4096, root, NULL, 1);
 	set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
 	root->alloc_bytenr = 0;
 
@@ -1313,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, objectid);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info, objectid);
 	root->root_key.objectid = objectid;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = 0;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      0, objectid, NULL, 0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		leaf = NULL;
@@ -1391,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info,
+		     BTRFS_TREE_LOG_OBJECTID);
 
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1408,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 * updated (along with back refs to the log tree).
 	 */
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL,
-				      0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+				      NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
@@ -1460,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_inode_generation(inode_item, 1);
 	btrfs_set_stack_inode_size(inode_item, 3);
 	btrfs_set_stack_inode_nlink(inode_item, 1);
-	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
 	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 
 	btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1480,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	u64 generation;
-	u32 blocksize;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1493,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 		goto alloc_fail;
 	}
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, key->objectid);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info, key->objectid);
 
 	ret = btrfs_find_root(tree_root, key, path,
 			      &root->root_item, &root->root_key);
@@ -1506,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	}
 
 	generation = btrfs_root_generation(&root->root_item);
-	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, generation);
+				     generation);
 	if (!root->node) {
 		ret = -ENOMEM;
 		goto find_fail;
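The blocksize locals and btrfs_level_size() calls dropped throughout this series follow from the leafsize/nodesize unification: once leaves and nodes are the same size, the per-level size computation degenerates to a constant. A sketch of the simplification, assuming the historical helper selected leafsize for level 0 and nodesize otherwise:

/* Before (sketch): the block size depended on the level being read. */
static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
{
	return level ? root->nodesize : root->leafsize;
}

/* After: leafsize == nodesize always held in practice, so callers
 * read root->nodesize directly and the helper disappears. */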
@@ -1568,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 	root->subv_writers = writers;
 
 	btrfs_init_free_ino_ctl(root);
-	spin_lock_init(&root->cache_lock);
-	init_waitqueue_head(&root->cache_wait);
+	spin_lock_init(&root->ino_cache_lock);
+	init_waitqueue_head(&root->ino_cache_wait);
 
 	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
@@ -1703,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	return ret;
 }
 
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
@@ -1729,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 static void end_workqueue_fn(struct btrfs_work *work)
 {
 	struct bio *bio;
-	struct end_io_wq *end_io_wq;
+	struct btrfs_end_io_wq *end_io_wq;
 	int error;
 
-	end_io_wq = container_of(work, struct end_io_wq, work);
+	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
 	bio = end_io_wq->bio;
 
 	error = end_io_wq->error;
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
-	kfree(end_io_wq);
+	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
 	bio_endio_nodec(bio, error);
 }
 
@@ -1767,6 +1771,7 @@ static int cleaner_kthread(void *arg)
 		}
 
 		btrfs_run_delayed_iputs(root);
+		btrfs_delete_unused_bgs(root->fs_info);
 		again = btrfs_clean_one_deleted_snapshot(root);
 		mutex_unlock(&root->fs_info->cleaner_mutex);
 
@@ -2058,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->endio_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
 	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
 	btrfs_destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2138,8 +2144,6 @@ int open_ctree(struct super_block *sb,
 {
 	u32 sectorsize;
 	u32 nodesize;
-	u32 leafsize;
-	u32 blocksize;
 	u32 stripesize;
 	u64 generation;
 	u64 features;
@@ -2183,7 +2187,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_srcu;
 	}
 
-	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_bdi;
@@ -2191,13 +2195,13 @@ int open_ctree(struct super_block *sb,
 	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
-	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_dirty_metadata_bytes;
 	}
 
-	ret = percpu_counter_init(&fs_info->bio_counter, 0);
+	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_delalloc_bytes;
@@ -2228,6 +2232,7 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->buffer_lock);
+	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
@@ -2237,6 +2242,7 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+	INIT_LIST_HEAD(&fs_info->unused_bgs);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
@@ -2255,7 +2261,7 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->qgroup_op_seq, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
-	fs_info->max_inline = 8192 * 1024;
+	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->free_chunk_space = 0;
@@ -2384,7 +2390,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	__setup_root(4096, 4096, 4096, 4096, tree_root,
+	__setup_root(4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	invalidate_bdev(fs_devices->latest_bdev);
@@ -2464,19 +2470,22 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	if (btrfs_super_leafsize(disk_super) !=
+	/*
+	 * Leafsize and nodesize were always equal, this is only a sanity check.
+	 */
+	if (le32_to_cpu(disk_super->__unused_leafsize) !=
 	    btrfs_super_nodesize(disk_super)) {
 		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
 		       "blocksizes don't match. node %d leaf %d\n",
 		       btrfs_super_nodesize(disk_super),
-		       btrfs_super_leafsize(disk_super));
+		       le32_to_cpu(disk_super->__unused_leafsize));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
-	if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+	if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
 		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
 		       "blocksize (%d) was too large\n",
-		       btrfs_super_leafsize(disk_super));
+		       btrfs_super_nodesize(disk_super));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
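The btrfs_super_leafsize() accessor is gone, but the on-disk slot survives as __unused_leafsize because the superblock layout is fixed; it is read raw with le32_to_cpu() purely for this compatibility check. A sketch of the relevant fragment of struct btrfs_super_block, with member order assumed per the on-disk format and surrounding fields omitted:

struct btrfs_super_block {
	/* ... preceding on-disk fields ... */
	__le32 sectorsize;
	__le32 nodesize;
	__le32 __unused_leafsize;	/* was leafsize; must equal nodesize */
	__le32 stripesize;
	/* ... remaining on-disk fields ... */
} __attribute__ ((__packed__));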
@@ -2493,17 +2502,16 @@ int open_ctree(struct super_block *sb,
 	 * flag our filesystem as having big metadata blocks if
 	 * they are bigger than the page size
 	 */
-	if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+	if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
 		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
 			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
 		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
 	}
 
 	nodesize = btrfs_super_nodesize(disk_super);
-	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
 	stripesize = btrfs_super_stripesize(disk_super);
-	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
 	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	/*
@@ -2511,7 +2519,7 @@ int open_ctree(struct super_block *sb,
 	 * extent buffers for the same range. It leads to corruptions
 	 */
 	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
-	    (sectorsize != leafsize)) {
+	    (sectorsize != nodesize)) {
 		printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
 		       "are not allowed for mixed block groups on %s\n",
 		       sb->s_id);
@@ -2574,6 +2582,8 @@ int open_ctree(struct super_block *sb,
 		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
 	fs_info->endio_raid56_workers =
 		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->endio_repair_workers =
+		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
 	fs_info->rmw_workers =
 		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
 	fs_info->endio_write_workers =
@@ -2595,11 +2605,12 @@ int open_ctree(struct super_block *sb,
 	      fs_info->submit_workers && fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
 	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_repair_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->readahead_workers &&
 	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->fixup_workers && fs_info->extent_workers &&
+	      fs_info->extent_workers &&
 	      fs_info->qgroup_rescan_workers)) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
@@ -2610,7 +2621,6 @@ int open_ctree(struct super_block *sb,
 					    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	tree_root->nodesize = nodesize;
-	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
 	tree_root->stripesize = stripesize;
 
@@ -2637,16 +2647,14 @@ int open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	blocksize = btrfs_level_size(tree_root,
-				     btrfs_super_chunk_root_level(disk_super));
 	generation = btrfs_super_chunk_root_generation(disk_super);
 
-	__setup_root(nodesize, leafsize, sectorsize, stripesize,
-		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+	__setup_root(nodesize, sectorsize, stripesize, chunk_root,
 		     fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
-					   blocksize, generation);
+					   generation);
 	if (!chunk_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2679,13 +2687,11 @@ int open_ctree(struct super_block *sb,
 	}
 
 retry_root_backup:
-	blocksize = btrfs_level_size(tree_root,
-				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
 
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
-					  blocksize, generation);
+					  generation);
 	if (!tree_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2854,9 +2860,6 @@ retry_root_backup:
 			err = -EIO;
 			goto fail_qgroup;
 		}
-		blocksize =
-		     btrfs_level_size(tree_root,
-				      btrfs_super_log_root_level(disk_super));
 
 		log_tree_root = btrfs_alloc_root(fs_info);
 		if (!log_tree_root) {
@@ -2864,11 +2867,10 @@ retry_root_backup:
 			goto fail_qgroup;
 		}
 
-		__setup_root(nodesize, leafsize, sectorsize, stripesize,
+		__setup_root(nodesize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
-						      blocksize,
 						      generation + 1);
 		if (!log_tree_root->node ||
 		    !extent_buffer_uptodate(log_tree_root->node)) {
@@ -2975,6 +2977,8 @@ retry_root_backup:
 		fs_info->update_uuid_tree_gen = 1;
 	}
 
+	fs_info->open = 1;
+
 	return 0;
 
fail_qgroup:
@@ -3134,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
 
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
-		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+		    device->commit_total_bytes)
 			break;
 
 		if (wait) {
@@ -3450,8 +3455,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		btrfs_set_stack_device_generation(dev_item, 0);
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
-		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
-		btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+		btrfs_set_stack_device_total_bytes(dev_item,
+						   dev->commit_total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item,
						   dev->commit_bytes_used);
 		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3526,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 
 static void free_fs_root(struct btrfs_root *root)
 {
-	iput(root->cache_inode);
+	iput(root->ino_cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
@@ -3617,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	return btrfs_commit_transaction(trans, root);
 }
 
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -3683,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	btrfs_stop_all_workers(fs_info);
 
+	fs_info->open = 0;
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);
@@ -3705,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
-
-	return 0;
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3808,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	int ret = 0;
+
+	if (sb->root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+				sb->root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+				sb->chunk_root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+				sb->log_root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+
 	/*
-	 * Placeholder for checks
+	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
+	 * items yet, they'll be verified later. Issue just a warning.
 	 */
-	return 0;
+	if (!IS_ALIGNED(sb->root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->root);
+	if (!IS_ALIGNED(sb->chunk_root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->chunk_root);
+	if (!IS_ALIGNED(sb->log_root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->log_root);
+
+	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+				fs_info->fsid, sb->dev_item.fsid);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+	 * done later
+	 */
+	if (sb->num_devices > (1UL << 31))
+		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+				sb->num_devices);
+
+	if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+				sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * The generation is a global counter, we'll trust it more than the others
+	 * but it's still possible that it's the one that's wrong.
+	 */
+	if (sb->generation < sb->chunk_root_generation)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+			sb->generation, sb->chunk_root_generation);
+	if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+			sb->generation, sb->cache_generation);
+
+	return ret;
 }
 
 static void btrfs_error_commit_super(struct btrfs_root *root)
@@ -4003,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
-			eb = btrfs_find_tree_block(root, start,
-						   root->leafsize);
-			start += root->leafsize;
+			eb = btrfs_find_tree_block(root, start);
+			start += root->nodesize;
			if (!eb)
				continue;
			wait_on_extent_buffer_writeback(eb);
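btrfs_check_super_valid() above follows a deliberate pattern: hard violations set ret = -EINVAL but checking continues, so a single mount attempt reports every problem, while merely suspicious values (possible bitflips) only warn because stricter checks run later once more state is available. Condensed into a standalone sketch, with the two example checks lifted from the real function:

/* Minimal sketch of the accumulate-and-warn validation style. */
static int check_super_sketch(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *sb = fs_info->super_copy;
	int ret = 0;

	if (sb->root_level > BTRFS_MAX_LEVEL)
		ret = -EINVAL;		/* fatal, but keep checking */
	if (!IS_ALIGNED(sb->root, 4096))
		printk(KERN_WARNING "BTRFS: tree_root block unaligned\n");
	return ret;			/* every fatal problem was logged */
}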
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ceba0a9..414651821fb3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
-enum {
+enum btrfs_wq_endio_type {
 	BTRFS_WQ_ENDIO_DATA = 0,
 	BTRFS_WQ_ENDIO_METADATA = 1,
 	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
 	BTRFS_WQ_ENDIO_RAID56 = 3,
+	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
@@ -44,9 +45,8 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize, u64 parent_transid);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
-			 u64 parent_transid);
+				      u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
 int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options);
-int close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 				      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			int metadata);
+			enum btrfs_wq_endio_type metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
 		    void (*flush_fn)(void *));
 int btrfs_calc_num_tolerated_disk_barrier_failures(
 	struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
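The btrfs_end_io_wq_init()/btrfs_end_io_wq_exit() pair declared here matches the kmem_cache_free() seen earlier in end_workqueue_fn(): end-io work items move from kmalloc to a dedicated slab cache created once at module init. A plausible implementation sketch; the cache name and flags are assumptions, only the two symbol names come from this header:

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	/* one object per in-flight bio, so a private cache pays off */
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
			sizeof(struct btrfs_end_io_wq), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

void btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}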
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3de8ed..37d164540c3a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 		return ERR_PTR(-ESTALE);
 
 	key.objectid = root_objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	}
 
 	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
 	inode = btrfs_iget(sb, &key, root, NULL);
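btrfs_set_key_type() was presumably a thin setter for the type field, so the direct assignment is a mechanical, equivalent substitution. The on-stack lookup idiom being simplified, in isolation:

struct btrfs_key key;

key.objectid = objectid;		/* which object */
key.type = BTRFS_INODE_ITEM_KEY;	/* which kind of item */
key.offset = 0;				/* type-specific meaning */
/* key is now ready for btrfs_iget()/btrfs_search_slot() and friends */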
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 102ed3143976..d56589571012 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
 					  key.objectid);
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				last = key.objectid +
-					fs_info->tree_root->leafsize;
+					fs_info->tree_root->nodesize;
 			else
 				last = key.objectid + key.offset;
 
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->key.objectid;
 	atomic_set(&caching_ctl->count, 1);
-	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+			caching_thread, NULL, NULL);
 
 	spin_lock(&cache->lock);
 	/*
@@ -764,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 	 * different
 	 */
 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
-		offset = root->leafsize;
+		offset = root->nodesize;
 		metadata = 0;
 	}
 
@@ -798,13 +799,13 @@ again:
 					      path->slots[0]);
 			if (key.objectid == bytenr &&
 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
-			    key.offset == root->leafsize)
+			    key.offset == root->nodesize)
 				ret = 0;
 		}
 		if (ret) {
 			key.objectid = bytenr;
 			key.type = BTRFS_EXTENT_ITEM_KEY;
-			key.offset = root->leafsize;
+			key.offset = root->nodesize;
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -2650,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	num_heads = heads_to_leaves(root, num_heads);
 	if (num_heads > 1)
-		num_bytes += (num_heads - 1) * root->leafsize;
+		num_bytes += (num_heads - 1) * root->nodesize;
 	num_bytes <<= 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
@@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 	async->sync = 0;
 	init_completion(&async->wait);
 
-	btrfs_init_work(&async->work, delayed_ref_async_start,
-			NULL, NULL);
+	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
			delayed_ref_async_start, NULL, NULL);
 
 	btrfs_queue_work(root->fs_info->extent_workers, &async->work);
 
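btrfs_init_work() grows a leading helper argument (btrfs_cache_helper, btrfs_extent_refs_helper, ...) so each work source dispatches through its own function. A plausible shape for one such helper, assuming the common normal_work_helper() dispatcher from async-thread.c; the exact definition is not part of this diff and the body here is hypothetical:

/* Hypothetical expansion: a per-source trampoline whose only job is
 * to give lockdep and stack traces a distinct symbol per work type. */
static void btrfs_cache_helper(struct work_struct *arg)
{
	struct btrfs_work *work;

	work = container_of(arg, struct btrfs_work, normal_work);
	normal_work_helper(work);	/* assumed common dispatch */
}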
@@ -3072,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, int);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+	if (btrfs_test_is_dummy_root(root))
 		return 0;
-#endif
+
 	ref_root = btrfs_header_owner(buf);
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
@@ -3096,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	for (i = 0; i < nritems; i++) {
 		if (level == 0) {
 			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			if (key.type != BTRFS_EXTENT_DATA_KEY)
 				continue;
 			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
@@ -3116,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			num_bytes = btrfs_level_size(root, level - 1);
+			num_bytes = root->nodesize;
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, level - 1, 0,
 					   1);
@@ -3493,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
 	if (ret) {
 		kfree(found);
 		return ret;
@@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 */
 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	/*
-	 * we add in the count of missing devices because we want
-	 * to make sure that any RAID levels on a degraded FS
-	 * continue to be honored.
-	 */
-	u64 num_devices = root->fs_info->fs_devices->rw_devices +
-		root->fs_info->fs_devices->missing_devices;
+	u64 num_devices = root->fs_info->fs_devices->rw_devices;
 	u64 target;
 	u64 tmp;
 
@@ -4348,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 }
 
 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
-				       struct btrfs_fs_info *fs_info)
+				       struct btrfs_fs_info *fs_info,
+				       int flush_state)
 {
 	u64 used;
 
 	spin_lock(&space_info->lock);
+	/*
+	 * We run out of space and have not got any free space via flush_space,
+	 * so don't bother doing async reclaim.
+	 */
+	if (flush_state > COMMIT_TRANS && space_info->full) {
+		spin_unlock(&space_info->lock);
+		return 0;
+	}
+
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly +
 	       space_info->bytes_may_use;
@@ -4385,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		flush_space(fs_info->fs_root, space_info, to_reclaim,
 			    to_reclaim, flush_state);
 		flush_state++;
-		if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+						 flush_state))
 			return;
 	} while (flush_state <= COMMIT_TRANS);
 
-	if (btrfs_need_do_async_reclaim(space_info, fs_info))
+	if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
 		queue_work(system_unbound_wq, work);
 }
 
@@ -4507,7 +4513,13 @@ again:
 		space_info->flush = 1;
 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		used += orig_bytes;
-		if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+		/*
+		 * We will do the space reservation dance during log replay,
+		 * which means we won't have fs_info->fs_root set, so don't do
+		 * the async reclaim as we will panic.
+		 */
+		if (!root->fs_info->log_root_recovering &&
+		    need_do_async_reclaim(space_info, root->fs_info, used) &&
 		    !work_busy(&root->fs_info->async_reclaim_work))
 			queue_work(system_unbound_wq,
 				   &root->fs_info->async_reclaim_work);
@@ -4844,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	if (num_bytes * 3 > meta_used)
 		num_bytes = div64_u64(meta_used, 3);
 
-	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+	return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4993,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 
 	if (root->fs_info->quota_enabled) {
 		/* One for parent inode, two for dir entries */
-		num_bytes = 3 * root->leafsize;
+		num_bytes = 3 * root->nodesize;
 		ret = btrfs_qgroup_reserve(root, num_bytes);
 		if (ret)
 			return ret;
@@ -5181,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 	if (root->fs_info->quota_enabled) {
 		ret = btrfs_qgroup_reserve(root, num_bytes +
-					   nr_extents * root->leafsize);
+					   nr_extents * root->nodesize);
 		if (ret)
 			goto out_fail;
 	}
@@ -5190,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	if (unlikely(ret)) {
 		if (root->fs_info->quota_enabled)
 			btrfs_qgroup_free(root, num_bytes +
-					  nr_extents * root->leafsize);
+					  nr_extents * root->nodesize);
 		goto out_fail;
 	}
 
@@ -5306,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 					      btrfs_ino(inode), to_free, 0);
 	if (root->fs_info->quota_enabled) {
 		btrfs_qgroup_free(root, num_bytes +
-					dropped * root->leafsize);
+					dropped * root->nodesize);
 	}
 
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5427,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root,
 			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
+
+			/*
+			 * No longer have used bytes in this block group, queue
+			 * it for deletion.
+			 */
+			if (old_val == 0) {
+				spin_lock(&info->unused_bgs_lock);
+				if (list_empty(&cache->bg_list)) {
+					btrfs_get_block_group(cache);
+					list_add_tail(&cache->bg_list,
+						      &info->unused_bgs);
+				}
+				spin_unlock(&info->unused_bgs_lock);
+			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->pinned += num_bytes;
 			cache->space_info->bytes_pinned += num_bytes;
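This is the producer half of the new unused-block-group reclaim: the extra reference pins the group, and list_empty() on its bg_list node doubles as the already-queued test. The consumer, btrfs_delete_unused_bgs() called from the cleaner thread, is not shown in this diff; a hypothetical drain loop with the matching locking would look like:

/* Sketch only: drop the lock while operating on each group, retake it
 * to pop the next one; the reference taken by the producer is dropped
 * here once the group has been dealt with. */
static void drain_unused_bgs_sketch(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *bg;

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		bg = list_first_entry(&info->unused_bgs,
				      struct btrfs_block_group_cache,
				      bg_list);
		list_del_init(&bg->bg_list);
		spin_unlock(&info->unused_bgs_lock);

		/* delete the now-empty block group, or requeue on failure */

		btrfs_put_block_group(bg);
		spin_lock(&info->unused_bgs_lock);
	}
	spin_unlock(&info->unused_bgs_lock);
}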
@@ -6238,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6238 int ret; 6264 int ret;
6239 struct btrfs_fs_info *fs_info = root->fs_info; 6265 struct btrfs_fs_info *fs_info = root->fs_info;
6240 6266
6241#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6267 if (btrfs_test_is_dummy_root(root))
6242 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
6243 return 0; 6268 return 0;
6244#endif 6269
6245 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6270 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6246 6271
6247 /* 6272 /*
@@ -6268,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6268 return ret; 6293 return ret;
6269} 6294}
6270 6295
6271static u64 stripe_align(struct btrfs_root *root,
6272 struct btrfs_block_group_cache *cache,
6273 u64 val, u64 num_bytes)
6274{
6275 u64 ret = ALIGN(val, root->stripesize);
6276 return ret;
6277}
6278
6279/* 6296/*
6280 * when we wait for progress in the block group caching, its because 6297 * when we wait for progress in the block group caching, its because
6281 * our allocation attempt failed at least once. So, we must sleep 6298 * our allocation attempt failed at least once. So, we must sleep
@@ -6469,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6469 bool have_caching_bg = false; 6486 bool have_caching_bg = false;
6470 6487
6471 WARN_ON(num_bytes < root->sectorsize); 6488 WARN_ON(num_bytes < root->sectorsize);
6472 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6489 ins->type = BTRFS_EXTENT_ITEM_KEY;
6473 ins->objectid = 0; 6490 ins->objectid = 0;
6474 ins->offset = 0; 6491 ins->offset = 0;
6475 6492
@@ -6756,8 +6773,7 @@ unclustered_alloc:
6756 goto loop; 6773 goto loop;
6757 } 6774 }
6758checks: 6775checks:
6759 search_start = stripe_align(root, block_group, 6776 search_start = ALIGN(offset, root->stripesize);
6760 offset, num_bytes);
6761 6777
6762 /* move on to the next group */ 6778 /* move on to the next group */
6763 if (search_start + num_bytes > 6779 if (search_start + num_bytes >
@@ -7082,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7082 path = btrfs_alloc_path(); 7098 path = btrfs_alloc_path();
7083 if (!path) { 7099 if (!path) {
7084 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7100 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7085 root->leafsize); 7101 root->nodesize);
7086 return -ENOMEM; 7102 return -ENOMEM;
7087 } 7103 }
7088 7104
@@ -7091,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7091 ins, size); 7107 ins, size);
7092 if (ret) { 7108 if (ret) {
7093 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7109 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7094 root->leafsize); 7110 root->nodesize);
7095 btrfs_free_path(path); 7111 btrfs_free_path(path);
7096 return ret; 7112 return ret;
7097 } 7113 }
@@ -7106,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7106 7122
7107 if (skinny_metadata) { 7123 if (skinny_metadata) {
7108 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7124 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7109 num_bytes = root->leafsize; 7125 num_bytes = root->nodesize;
7110 } else { 7126 } else {
7111 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7127 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7112 btrfs_set_tree_block_key(leaf, block_info, key); 7128 btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7136,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7136 return ret; 7152 return ret;
7137 } 7153 }
7138 7154
7139 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7155 ret = update_block_group(root, ins->objectid, root->nodesize, 1);
7140 if (ret) { /* -ENOENT, logic error */ 7156 if (ret) { /* -ENOENT, logic error */
7141 btrfs_err(fs_info, "update block group failed for %llu %llu", 7157 btrfs_err(fs_info, "update block group failed for %llu %llu",
7142 ins->objectid, ins->offset); 7158 ins->objectid, ins->offset);
7143 BUG(); 7159 BUG();
7144 } 7160 }
7145 7161
7146 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 7162 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7147 return ret; 7163 return ret;
7148} 7164}
7149 7165
@@ -7218,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7218 btrfs_set_buffer_uptodate(buf); 7234 btrfs_set_buffer_uptodate(buf);
7219 7235
7220 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 7236 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7237 buf->log_index = root->log_transid % 2;
7221 /* 7238 /*
7222 * we allow two log transactions at a time, use different 7239 * we allow two log transactions at a time, use different
7223 * EXENT bit to differentiate dirty pages. 7240 * EXENT bit to differentiate dirty pages.
7224 */ 7241 */
7225 if (root->log_transid % 2 == 0) 7242 if (buf->log_index == 0)
7226 set_extent_dirty(&root->dirty_log_pages, buf->start, 7243 set_extent_dirty(&root->dirty_log_pages, buf->start,
7227 buf->start + buf->len - 1, GFP_NOFS); 7244 buf->start + buf->len - 1, GFP_NOFS);
7228 else 7245 else
7229 set_extent_new(&root->dirty_log_pages, buf->start, 7246 set_extent_new(&root->dirty_log_pages, buf->start,
7230 buf->start + buf->len - 1, GFP_NOFS); 7247 buf->start + buf->len - 1, GFP_NOFS);
7231 } else { 7248 } else {
7249 buf->log_index = -1;
7232 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7250 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7233 buf->start + buf->len - 1, GFP_NOFS); 7251 buf->start + buf->len - 1, GFP_NOFS);
7234 } 7252 }
@@ -7305,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7305 * 7323 *
7306 * returns the tree buffer or NULL. 7324 * returns the tree buffer or NULL.
7307 */ 7325 */
7308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7326struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7309 struct btrfs_root *root, u32 blocksize, 7327 struct btrfs_root *root,
7310 u64 parent, u64 root_objectid, 7328 u64 parent, u64 root_objectid,
7311 struct btrfs_disk_key *key, int level, 7329 struct btrfs_disk_key *key, int level,
7312 u64 hint, u64 empty_size) 7330 u64 hint, u64 empty_size)
@@ -7316,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
7316 struct extent_buffer *buf; 7334 struct extent_buffer *buf;
7317 u64 flags = 0; 7335 u64 flags = 0;
7318 int ret; 7336 int ret;
7337 u32 blocksize = root->nodesize;
7319 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7338 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7320 SKINNY_METADATA); 7339 SKINNY_METADATA);
7321 7340
7322#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7341 if (btrfs_test_is_dummy_root(root)) {
7323 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
7324 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7342 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7325 blocksize, level); 7343 blocksize, level);
7326 if (!IS_ERR(buf)) 7344 if (!IS_ERR(buf))
7327 root->alloc_bytenr += blocksize; 7345 root->alloc_bytenr += blocksize;
7328 return buf; 7346 return buf;
7329 } 7347 }
7330#endif 7348
7331 block_rsv = use_block_rsv(trans, root, blocksize); 7349 block_rsv = use_block_rsv(trans, root, blocksize);
7332 if (IS_ERR(block_rsv)) 7350 if (IS_ERR(block_rsv))
7333 return ERR_CAST(block_rsv); 7351 return ERR_CAST(block_rsv);
@@ -7422,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7422 7440
7423 eb = path->nodes[wc->level]; 7441 eb = path->nodes[wc->level];
7424 nritems = btrfs_header_nritems(eb); 7442 nritems = btrfs_header_nritems(eb);
7425 blocksize = btrfs_level_size(root, wc->level - 1); 7443 blocksize = root->nodesize;
7426 7444
7427 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7445 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7428 if (nread >= wc->reada_count) 7446 if (nread >= wc->reada_count)
@@ -7469,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7469 continue; 7487 continue;
7470 } 7488 }
7471reada: 7489reada:
7472 ret = readahead_tree_block(root, bytenr, blocksize, 7490 readahead_tree_block(root, bytenr, blocksize);
7473 generation);
7474 if (ret)
7475 break;
7476 nread++; 7491 nread++;
7477 } 7492 }
7478 wc->reada_slot = slot; 7493 wc->reada_slot = slot;
@@ -7631,7 +7646,6 @@ walk_down:
7631 level = root_level; 7646 level = root_level;
7632 while (level >= 0) { 7647 while (level >= 0) {
7633 if (path->nodes[level] == NULL) { 7648 if (path->nodes[level] == NULL) {
7634 int child_bsize = root->nodesize;
7635 int parent_slot; 7649 int parent_slot;
7636 u64 child_gen; 7650 u64 child_gen;
7637 u64 child_bytenr; 7651 u64 child_bytenr;
@@ -7643,8 +7657,7 @@ walk_down:
7643 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 7657 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7644 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7658 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7645 7659
7646 eb = read_tree_block(root, child_bytenr, child_bsize, 7660 eb = read_tree_block(root, child_bytenr, child_gen);
7647 child_gen);
7648 if (!eb || !extent_buffer_uptodate(eb)) { 7661 if (!eb || !extent_buffer_uptodate(eb)) {
7649 ret = -EIO; 7662 ret = -EIO;
7650 goto out; 7663 goto out;
@@ -7660,7 +7673,7 @@ walk_down:
 			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
 						      root->objectid,
 						      child_bytenr,
-						      child_bsize,
+						      root->nodesize,
 						      BTRFS_QGROUP_OPER_SUB_SUBTREE,
 						      0);
 			if (ret)
@@ -7811,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	}

 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
-	blocksize = btrfs_level_size(root, level - 1);
+	blocksize = root->nodesize;

-	next = btrfs_find_tree_block(root, bytenr, blocksize);
+	next = btrfs_find_tree_block(root, bytenr);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 		if (!next)
@@ -7875,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	if (!next) {
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
-		next = read_tree_block(root, bytenr, blocksize, generation);
+		next = read_tree_block(root, bytenr, generation);
 		if (!next || !extent_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			return -EIO;
@@ -8440,13 +8453,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	if (stripped)
 		return extended_to_chunk(stripped);

-	/*
-	 * we add in the count of missing devices because we want
-	 * to make sure that any RAID levels on a degraded FS
-	 * continue to be honored.
-	 */
-	num_devices = root->fs_info->fs_devices->rw_devices +
-		root->fs_info->fs_devices->missing_devices;
+	num_devices = root->fs_info->fs_devices->rw_devices;

 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
@@ -8864,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	}
 	up_write(&info->commit_root_sem);

+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->unused_bgs)) {
+		block_group = list_first_entry(&info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8998,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->new_bg_list);
+	INIT_LIST_HEAD(&cache->bg_list);
 	btrfs_init_free_space_ctl(cache);

 	return cache;
@@ -9020,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -9139,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		__link_block_group(space_info, cache);

 		set_avail_alloc_bits(root->fs_info, cache->flags);
-		if (btrfs_chunk_readonly(root, cache->key.objectid))
+		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
 			set_block_group_ro(cache, 1);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			spin_lock(&info->unused_bgs_lock);
+			/* Should always be true but just in case. */
+			if (list_empty(&cache->bg_list)) {
+				btrfs_get_block_group(cache);
+				list_add_tail(&cache->bg_list,
+					      &info->unused_bgs);
+			}
+			spin_unlock(&info->unused_bgs_lock);
+		}
 	}

 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9181,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret = 0;

-	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
-				 new_bg_list) {
-		list_del_init(&block_group->new_bg_list);
-
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+		list_del_init(&block_group->bg_list);
 		if (ret)
 			continue;

@@ -9270,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,

 	__link_block_group(cache->space_info, cache);

-	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+	list_add_tail(&cache->bg_list, &trans->new_bgs);

 	set_avail_alloc_bits(extent_root->fs_info, type);

@@ -9424,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,

 	memcpy(&key, &block_group->key, sizeof(key));

-	btrfs_clear_space_info_full(root->fs_info);
-
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);

@@ -9441,6 +9464,101 @@ out:
 	return ret;
 }

+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (!fs_info->open)
+		return;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 start, end;
+
+		block_group = list_first_entry(&fs_info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		space_info = block_group->space_info;
+		list_del_init(&block_group->bg_list);
+		if (ret || btrfs_mixed_space_info(space_info)) {
+			btrfs_put_block_group(block_group);
+			continue;
+		}
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/* Don't want to race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+		spin_lock(&block_group->lock);
+		if (block_group->reserved ||
+		    btrfs_block_group_used(&block_group->item) ||
+		    block_group->ro) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group.  We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			spin_unlock(&block_group->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&block_group->lock);
+
+		/* We don't want to force the issue, only flip if it's ok. */
+		ret = set_block_group_ro(block_group, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0) {
+			ret = 0;
+			goto next;
+		}
+
+		/*
+		 * Want to do this before we do anything else so we can recover
+		 * properly if we fail to join the transaction.
+		 */
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			btrfs_set_block_group_rw(root, block_group);
+			ret = PTR_ERR(trans);
+			goto next;
+		}
+
+		/*
+		 * We could have pending pinned extents for this block group,
+		 * just delete them, we don't care about them anymore.
+		 */
+		start = block_group->key.objectid;
+		end = start + block_group->key.offset - 1;
+		clear_extent_bits(&fs_info->freed_extents[0], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+		clear_extent_bits(&fs_info->freed_extents[1], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+
+		/* Reset pinned so btrfs_put_block_group doesn't complain */
+		block_group->pinned = 0;
+
+		/*
+		 * Btrfs_remove_chunk will abort the transaction if things go
+		 * horribly wrong.
+		 */
+		ret = btrfs_remove_chunk(trans, root,
+					 block_group->key.objectid);
+		btrfs_end_transaction(trans, root);
+next:
+		btrfs_put_block_group(block_group);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *space_info;
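btrfs_delete_unused_bgs() above walks a list that other contexts append to concurrently, so it claims one entry under unused_bgs_lock, drops the lock for the blocking work (the groups_sem rwsem, transaction join, chunk removal), and re-takes it before looking at the next entry. The reference taken when a group is queued in btrfs_read_block_groups() is what keeps each entry alive across the unlocked window. Reduced to the bare pattern, with names reused from the hunk and do_blocking_work() as a placeholder:

    spin_lock(&fs_info->unused_bgs_lock);
    while (!list_empty(&fs_info->unused_bgs)) {
    	block_group = list_first_entry(&fs_info->unused_bgs,
    				       struct btrfs_block_group_cache,
    				       bg_list);
    	list_del_init(&block_group->bg_list);	/* claim the entry */
    	spin_unlock(&fs_info->unused_bgs_lock);

    	do_blocking_work(block_group);		/* may sleep */

    	btrfs_put_block_group(block_group);	/* drop the queueing ref */
    	spin_lock(&fs_info->unused_bgs_lock);
    }
    spin_unlock(&fs_info->unused_bgs_lock);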
@@ -9572,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)

 int btrfs_start_nocow_write(struct btrfs_root *root)
 {
-	if (unlikely(atomic_read(&root->will_be_snapshoted)))
+	if (atomic_read(&root->will_be_snapshoted))
 		return 0;

 	percpu_counter_inc(&root->subv_writers->counter);
@@ -9580,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
 	 * Make sure counter is updated before we check for snapshot creation.
 	 */
 	smp_mb();
-	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+	if (atomic_read(&root->will_be_snapshoted)) {
 		btrfs_end_nocow_write(root);
 		return 0;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e11aab9f391..bf3f424e0013 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 static struct bio_set *btrfs_bioset;

+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+	return !RB_EMPTY_NODE(&state->rb_node);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
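The run of hunks that follows removes the extent_state->tree back-pointer entirely: membership is now derived from the rb_node itself, with RB_CLEAR_NODE() marking a node as unlinked and RB_EMPTY_NODE() testing that mark. The invariant every erase site below maintains, in two lines:

    /* whenever a state leaves the tree, re-mark its node as "empty"
     * so that extent_state_in_tree() stays accurate */
    rb_erase(&state->rb_node, &tree->state);
    RB_CLEAR_NODE(&state->rb_node);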
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)

 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
-		       "state %lu in tree %p refs %d\n",
-		       state->start, state->end, state->state, state->tree,
+		pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+		       state->start, state->end, state->state,
+		       extent_state_in_tree(state),
 		       atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 		return state;
 	state->state = 0;
 	state->private = 0;
-	state->tree = NULL;
+	RB_CLEAR_NODE(&state->rb_node);
 	btrfs_leak_debug_add(&state->leak_list, &states);
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-		WARN_ON(state->tree);
+		WARN_ON(extent_state_in_tree(state));
 		btrfs_leak_debug_del(&state->leak_list);
 		trace_free_extent_state(state, _RET_IP_);
 		kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			merge_cb(tree, state, other);
 			state->start = other->start;
-			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
 			free_extent_state(other);
 		}
 	}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			merge_cb(tree, state, other);
 			state->end = other->end;
-			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
 			free_extent_state(other);
 		}
 	}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
 			  found->start, found->end, start, end);
 		return -EEXIST;
 	}
-	state->tree = tree;
 	merge_state(tree, state);
 	return 0;
 }
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
-	prealloc->tree = tree;
 	return 0;
 }

@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 		wake_up(&state->wq);
 	if (state->state == 0) {
 		next = next_state(state);
-		if (state->tree) {
+		if (extent_state_in_tree(state)) {
 			rb_erase(&state->rb_node, &tree->state);
-			state->tree = NULL;
+			RB_CLEAR_NODE(&state->rb_node);
 			free_extent_state(state);
 		} else {
 			WARN_ON(1);
@@ -606,8 +609,8 @@ again:
 		cached_state = NULL;
 	}

-	if (cached && cached->tree && cached->start <= start &&
-	    cached->end > start) {
+	if (cached && extent_state_in_tree(cached) &&
+	    cached->start <= start && cached->end > start) {
 		if (clear)
 			atomic_dec(&cached->refs);
 		state = cached;
@@ -843,7 +846,7 @@ again:
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->start <= start && state->end > start &&
-		    state->tree) {
+		    extent_state_in_tree(state)) {
 			node = &state->rb_node;
 			goto hit_next;
 		}
@@ -1069,7 +1072,7 @@ again:
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->start <= start && state->end > start &&
-		    state->tree) {
+		    extent_state_in_tree(state)) {
 			node = &state->rb_node;
 			goto hit_next;
 		}
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	spin_lock(&tree->lock);
 	if (cached_state && *cached_state) {
 		state = *cached_state;
-		if (state->end == start - 1 && state->tree) {
+		if (state->end == start - 1 && extent_state_in_tree(state)) {
 			n = rb_next(&state->rb_node);
 			while (n) {
 				state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int bitset = 0;

 	spin_lock(&tree->lock);
-	if (cached && cached->tree && cached->start <= start &&
+	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
 	    cached->end > start)
 		node = &cached->rb_node;
 	else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 	SetPageUptodate(page);
 }

-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int this_mirror;
-	int failed_mirror;
-	int in_validation;
-};
-
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
-			   int did_repair)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
 	int ret;
 	int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
-		      u64 length, u64 logical, struct page *page,
-		      int mirror_num)
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+		      struct page *page, unsigned int pg_offset, int mirror_num)
 {
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct bio *bio;
 	struct btrfs_device *dev;
 	u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio_add_page(bio, page, length, start - page_offset(page));
+	bio_add_page(bio, page, length, pg_offset);

 	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
 		/* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	}

 	printk_ratelimited_in_rcu(KERN_INFO
-				  "BTRFS: read error corrected: ino %lu off %llu "
-				  "(dev %s sector %llu)\n", page->mapping->host->i_ino,
-				  start, rcu_str_deref(dev->name), sector);
-
+				  "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+				  btrfs_ino(inode), start,
+				  rcu_str_deref(dev->name), sector);
 	bio_put(bio);
 	return 0;
 }
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 		return -EROFS;

 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
-		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
-					start, p, mirror_num);
+		struct page *p = eb->pages[i];
+
+		ret = repair_io_failure(root->fs_info->btree_inode, start,
+					PAGE_CACHE_SIZE, start, p,
+					start - page_offset(p), mirror_num);
 		if (ret)
 			break;
 		start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-static int clean_io_failure(u64 start, struct page *page)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset)
 {
 	u64 private;
 	u64 private_failure;
 	struct io_failure_record *failrec;
-	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct extent_state *state;
 	int num_copies;
-	int did_repair = 0;
 	int ret;

 	private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
 		/* there was no real error, just free the record */
 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
 			 failrec->start);
-		did_repair = 1;
 		goto out;
 	}
 	if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
 	num_copies = btrfs_num_copies(fs_info, failrec->logical,
 				      failrec->len);
 	if (num_copies > 1) {
-		ret = repair_io_failure(fs_info, start, failrec->len,
-					failrec->logical, page,
-					failrec->failed_mirror);
-		did_repair = !ret;
+		repair_io_failure(inode, start, failrec->len,
+				  failrec->logical, page,
+				  pg_offset, failrec->failed_mirror);
 	}
-	ret = 0;
 }

 out:
-	if (!ret)
-		ret = free_io_failure(inode, failrec, did_repair);
+	free_io_failure(inode, failrec);

-	return ret;
+	return 0;
 }

 /*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
  */
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct io_failure_record *failrec;
+	struct extent_state *state, *next;

-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
-			      struct page *page, u64 start, u64 end,
-			      int failed_mirror)
+	if (RB_EMPTY_ROOT(&failure_tree->state))
+		return;
+
+	spin_lock(&failure_tree->lock);
+	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+	while (state) {
+		if (state->start > end)
+			break;
+
+		ASSERT(state->end <= end);
+
+		next = next_state(state);
+
+		failrec = (struct io_failure_record *)state->private;
+		free_extent_state(state);
+		kfree(failrec);
+
+		state = next;
+	}
+	spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+				struct io_failure_record **failrec_ret)
 {
-	struct io_failure_record *failrec = NULL;
+	struct io_failure_record *failrec;
 	u64 private;
 	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	struct btrfs_io_bio *btrfs_failed_bio;
-	struct btrfs_io_bio *btrfs_bio;
-	int num_copies;
 	int ret;
-	int read_mode;
 	u64 logical;

-	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
-
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
 		if (!failrec)
 			return -ENOMEM;
+
 		failrec->start = start;
 		failrec->len = end - start + 1;
 		failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			em = NULL;
 		}
 		read_unlock(&em_tree->lock);
-
 		if (!em) {
 			kfree(failrec);
 			return -EIO;
 		}
+
 		logical = start - em->start;
 		logical = em->block_start + logical;
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			extent_set_compress_type(&failrec->bio_flags,
 						 em->compress_type);
 		}
-		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
-			 "len=%llu\n", logical, start, failrec->len);
+
+		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+			 logical, start, failrec->len);
+
 		failrec->logical = logical;
 		free_extent_map(em);

@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		}
 	} else {
 		failrec = (struct io_failure_record *)(unsigned long)private;
-		pr_debug("bio_readpage_error: (found) logical=%llu, "
-			 "start=%llu, len=%llu, validation=%d\n",
+		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
 			 failrec->logical, failrec->start, failrec->len,
 			 failrec->in_validation);
 		/*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		 * clean_io_failure() clean all those errors at once.
 		 */
 	}
+
+	*failrec_ret = failrec;
+
+	return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+			   struct io_failure_record *failrec, int failed_mirror)
+{
+	int num_copies;
+
 	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
 				      failrec->logical, failrec->len);
 	if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		 * all the retry and error correction code that follows. no
 		 * matter what the error is, it is very likely to persist.
 		 */
-		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
+		return 0;
 	}

 	/*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		BUG_ON(failrec->in_validation);
 		failrec->in_validation = 1;
 		failrec->this_mirror = failed_mirror;
-		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
 	} else {
 		/*
 		 * we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		failrec->this_mirror++;
 		if (failrec->this_mirror == failed_mirror)
 			failrec->this_mirror++;
-		read_mode = READ_SYNC;
 	}

 	if (failrec->this_mirror > num_copies) {
-		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
+		return 0;
 	}

+	return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+				    struct io_failure_record *failrec,
+				    struct page *page, int pg_offset, int icsum,
+				    bio_end_io_t *endio_func, void *data)
+{
+	struct bio *bio;
+	struct btrfs_io_bio *btrfs_failed_bio;
+	struct btrfs_io_bio *btrfs_bio;
+
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-	if (!bio) {
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
-	}
-	bio->bi_end_io = failed_bio->bi_end_io;
+	if (!bio)
+		return NULL;
+
+	bio->bi_end_io = endio_func;
 	bio->bi_iter.bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_iter.bi_size = 0;
+	bio->bi_private = data;

 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
 	if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,

 		btrfs_bio = btrfs_io_bio(bio);
 		btrfs_bio->csum = btrfs_bio->csum_inline;
-		phy_offset >>= inode->i_sb->s_blocksize_bits;
-		phy_offset *= csum_size;
-		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+		icsum *= csum_size;
+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
 		       csum_size);
 	}

-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+	bio_add_page(bio, page, failrec->len, pg_offset);
+
+	return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+			      struct page *page, u64 start, u64 end,
+			      int failed_mirror)
+{
+	struct io_failure_record *failrec;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct bio *bio;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      start - page_offset(page),
+				      (int)phy_offset, failed_bio->bi_end_io,
+				      NULL);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}

-	pr_debug("bio_readpage_error: submitting new read[%#x] to "
-		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
-		 failrec->this_mirror, num_copies, failrec->in_validation);
+	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+		 read_mode, failrec->this_mirror, failrec->in_validation);

 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
 					 failrec->this_mirror,
 					 failrec->bio_flags, 0);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
 	return ret;
 }

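With this split the read-repair path becomes three reusable steps that the rewritten bio_readpage_error() composes; a caller with its own completion routine (the direct-IO path, presumably, given the helpers are no longer static) can follow the same shape. Condensed flow, with my_end_io and my_private as hypothetical caller-supplied values:

    /* 1) find or create the failure record for this byte range */
    ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
    if (ret)
    	return ret;

    /* 2) returns 1 if another mirror is worth trying, 0 otherwise */
    if (!btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror)) {
    	free_io_failure(inode, failrec);
    	return -EIO;
    }

    /* 3) build the retry bio aimed at failrec->this_mirror, then submit */
    bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
    				  pg_offset, icsum, my_end_io, my_private);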
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct inode *inode = page->mapping->host;

 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
 			 io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;

@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			if (ret)
 				uptodate = 0;
 			else
-				clean_io_failure(start, page);
+				clean_io_failure(inode, start, page, 0);
 		}

 		if (likely(uptodate))
@@ -2532,6 +2602,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
+				offset += len;
 				continue;
 			}
 		}
@@ -2539,12 +2610,12 @@ readpage_ok:
 		if (likely(uptodate)) {
 			loff_t i_size = i_size_read(inode);
 			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-			unsigned offset;
+			unsigned off;

 			/* Zero out the end if this page straddles i_size */
-			offset = i_size & (PAGE_CACHE_SIZE-1);
-			if (page->index == end_index && offset)
-				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+			off = i_size & (PAGE_CACHE_SIZE-1);
+			if (page->index == end_index && off)
+				zero_user_segment(page, off, PAGE_CACHE_SIZE);
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
@@ -2617,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,

 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
-	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-}
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *new;

+	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+	if (new) {
+		btrfs_bio = btrfs_io_bio(new);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return new;
+}

 /* this also allocates from the btrfs_bioset */
 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
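The clone fix matters because bio_clone_bioset() only initializes the generic struct bio; the btrfs-private wrapper in front of it comes from the bioset's front_pad and is never zeroed, so csum, csum_allocated and end_io would otherwise hold stale memory that the completion path might free or call. Simplified wrapper layout, assumed from the btrfs_io_bio() accessor used above (field list abridged, not the exact upstream definition):

    struct btrfs_io_bio {
    	unsigned int mirror_num;
    	u8 *csum;		/* uninitialized after a bare clone */
    	u8 *csum_allocated;	/* ditto: must be reset explicitly */
    	btrfs_io_bio_end_io_t *end_io;
    	struct bio bio;		/* embedded last; bio_alloc returns &bio */
    };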
@@ -3500,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];

 		if (!trylock_page(p)) {
 			if (!flush) {
@@ -3521,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
 }

+static void set_btree_ioerr(struct page *page)
+{
+	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+	SetPageError(page);
+	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+		return;
+
+	/*
+	 * If writeback for a btree extent that doesn't belong to a log tree
+	 * failed, increment the counter transaction->eb_write_errors.
+	 * We do this because while the transaction is running and before it's
+	 * committing (when we call filemap_fdata[write|wait]_range against
+	 * the btree inode), we might have
+	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+	 * returns an error or an error happens during writeback, when we're
+	 * committing the transaction we wouldn't know about it, since the pages
+	 * can be no longer dirty nor marked anymore for writeback (if a
+	 * subsequent modification to the extent buffer didn't happen before the
+	 * transaction commit), which makes filemap_fdata[write|wait]_range not
+	 * able to find the pages tagged with SetPageError at transaction
+	 * commit time. So if this happens we must abort the transaction,
+	 * otherwise we commit a super block with btree roots that point to
+	 * btree nodes/leafs whose content on disk is invalid - either garbage
+	 * or the content of some node/leaf from a past generation that got
+	 * cowed or deleted and is no longer valid.
+	 *
+	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+	 * not be enough - we need to distinguish between log tree extents vs
+	 * non-log tree extents, and the next filemap_fdatawait_range() call
+	 * will catch and clear such errors in the mapping - and that call might
+	 * be from a log sync and not from a transaction commit. Also, checking
+	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+	 * not done and would not be reliable - the eb might have been released
+	 * from memory and reading it back again means that flag would not be
+	 * set (since it's a runtime flag, not persisted on disk).
+	 *
+	 * Using the flags below in the btree inode also makes us achieve the
+	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+	 * writeback for all dirty pages and before filemap_fdatawait_range()
+	 * is called, the writeback for all dirty pages had already finished
+	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
+	 * filemap_fdatawait_range() would return success, as it could not know
+	 * that writeback errors happened (the pages were no longer tagged for
+	 * writeback).
+	 */
+	switch (eb->log_index) {
+	case -1:
+		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+		break;
+	case 0:
+		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+		break;
+	case 1:
+		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+		break;
+	default:
+		BUG(); /* unexpected, logic error */
+	}
+}
+
 static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
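set_btree_ioerr() records failures in the btree inode's runtime_flags precisely because those bits, unlike page flags, survive writeback completion and page reclaim. The consumer side is not part of this hunk; presumably the transaction commit (or log sync) performs an atomic test-and-clear along these lines and aborts rather than writing a superblock that points at bad nodes:

    /* sketch of the assumed commit-time check, not shown in this diff */
    if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags))
    	ret = -EIO;	/* abort the commit; on-disk btree data is suspect */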
@@ -3534,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 		BUG_ON(!eb);
 		done = atomic_dec_and_test(&eb->io_pages);

-		if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
-			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+		if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
 			ClearPageUptodate(page);
-			SetPageError(page);
+			set_btree_ioerr(page);
 		}

 		end_page_writeback(page);
@@ -3564,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
 	int ret = 0;

-	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
 	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
 		bio_flags = EXTENT_BIO_TREE_LOG;

 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];

 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
@@ -3581,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 			       0, epd->bio_flags, bio_flags);
 		epd->bio_flags = bio_flags;
 		if (ret) {
-			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-			SetPageError(p);
+			set_btree_ioerr(p);
+			end_page_writeback(p);
 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
 				end_extent_buffer_writeback(eb);
 			ret = -EIO;
3595 3736
3596 if (unlikely(ret)) { 3737 if (unlikely(ret)) {
3597 for (; i < num_pages; i++) { 3738 for (; i < num_pages; i++) {
3598 struct page *p = extent_buffer_page(eb, i); 3739 struct page *p = eb->pages[i];
3740 clear_page_dirty_for_io(p);
3599 unlock_page(p); 3741 unlock_page(p);
3600 } 3742 }
3601 } 3743 }
@@ -4165,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	return NULL;
 }

-static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
-{
-	unsigned long cnt = *((unsigned long *)ctx);
-
-	cnt++;
-	*((unsigned long *)ctx) = cnt;
-
-	/* Now we're sure that the extent is shared. */
-	if (cnt > 1)
-		return 1;
-	return 0;
-}
-
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		   __u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4194,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int end = 0;
 	u64 em_start = 0;
 	u64 em_len = 0;
@@ -4207,15 +4337,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		return -ENOMEM;
 	path->leave_spinning = 1;

-	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
-	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+	start = round_down(start, BTRFS_I(inode)->root->sectorsize);
+	len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;

 	/*
 	 * lookup the last file extent. We're not using i_size here
 	 * because there might be preallocation past i_size
 	 */
-	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
-				       path, btrfs_ino(inode), -1, 0);
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+				       0);
 	if (ret < 0) {
 		btrfs_free_path(path);
 		return ret;
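Rounding the start down and the end up keeps the mapped range a superset of the caller's [start, start + len); the old code aligned both values upward, which could shift the window past unaligned bytes at the front. (The max value fed to round_up() is set up outside the visible context.) For a power-of-two sectorsize the two kernel macros reduce to bit masks; a runnable userspace illustration:

    #include <stdio.h>
    #include <stdint.h>

    /* same arithmetic as the kernel's round_down()/round_up()
     * for a power-of-two alignment such as a 4096-byte sector */
    #define RD(x, a) ((x) & ~((uint64_t)(a) - 1))
    #define RU(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
    	uint64_t start = 5000, len = 100, sectorsize = 4096;

    	/* new scheme covers bytes 5000..5099: prints [4096, 8192) */
    	printf("[%llu, %llu)\n",
    	       (unsigned long long)RD(start, sectorsize),
    	       (unsigned long long)RU(start + len, sectorsize));
    	/* old ALIGN(start) would have started at 8192, past the data */
    	return 0;
    }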
@@ -4223,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	WARN_ON(!ret);
 	path->slots[0]--;
 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
-	found_type = btrfs_key_type(&found_key);
+	found_type = found_key.type;

 	/* No extents, but there might be delalloc bits */
 	if (found_key.objectid != btrfs_ino(inode) ||
@@ -4308,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
-		} else {
-			unsigned long ref_cnt = 0;
+		} else if (fieinfo->fi_extents_max) {
+			u64 bytenr = em->block_start -
+				(em->start - em->orig_start);

 			disko = em->block_start + offset_in_extent;

 			/*
 			 * As btrfs supports shared space, this information
 			 * can be exported to userspace tools via
-			 * flag FIEMAP_EXTENT_SHARED.
+			 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
+			 * then we're just getting a count and we can skip the
+			 * lookup stuff.
 			 */
-			ret = iterate_inodes_from_logical(
-					em->block_start,
-					BTRFS_I(inode)->root->fs_info,
-					path, count_ext_ref, &ref_cnt);
-			if (ret < 0 && ret != -ENOENT)
+			ret = btrfs_check_shared(NULL, root->fs_info,
+						 root->objectid,
+						 btrfs_ino(inode), bytenr);
+			if (ret < 0)
 				goto out_free;
-
-			if (ref_cnt > 1)
+			if (ret)
 				flags |= FIEMAP_EXTENT_SHARED;
+			ret = 0;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
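fi_extents_max is zero on the first half of the standard two-pass FIEMAP idiom, where userspace only asks how many extents exist before allocating the result array; skipping the shared-extent lookup there avoids paying for backref walks on a pure counting call. A runnable sketch of that idiom against the regular ioctl interface:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
    	struct fiemap probe, *fm;
    	int fd;

    	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
    		return 1;

    	/* pass 1: fm_extent_count == 0 maps to fi_extents_max == 0
    	 * in the kernel, so only the extent count comes back */
    	memset(&probe, 0, sizeof(probe));
    	probe.fm_length = ~0ULL;
    	if (ioctl(fd, FS_IOC_FIEMAP, &probe) < 0)
    		return 1;

    	/* pass 2: fetch the extents, FIEMAP_EXTENT_SHARED included */
    	fm = calloc(1, sizeof(*fm) +
    		    probe.fm_mapped_extents * sizeof(struct fiemap_extent));
    	if (!fm)
    		return 1;
    	fm->fm_length = ~0ULL;
    	fm->fm_extent_count = probe.fm_mapped_extents;
    	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
    		return 1;
    	printf("%u extents\n", fm->fm_mapped_extents);
    	return 0;
    }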
@@ -4380,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
 /*
  * Helper for releasing extent buffer page.
  */
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
-					     unsigned long start_idx)
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 {
 	unsigned long index;
-	unsigned long num_pages;
 	struct page *page;
 	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

 	BUG_ON(extent_buffer_under_io(eb));

-	num_pages = num_extent_pages(eb->start, eb->len);
-	index = start_idx + num_pages;
-	if (start_idx >= index)
+	index = num_extent_pages(eb->start, eb->len);
+	if (index == 0)
 		return;

 	do {
 		index--;
-		page = extent_buffer_page(eb, index);
+		page = eb->pages[index];
 		if (page && mapped) {
 			spin_lock(&page->mapping->private_lock);
 			/*
@@ -4428,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
 			/* One for when we alloced the page */
 			page_cache_release(page);
 		}
-	} while (index != start_idx);
+	} while (index != 0);
 }

 /*
@@ -4436,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
  */
 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 {
-	btrfs_release_extent_buffer_page(eb, 0);
+	btrfs_release_extent_buffer_page(eb);
 	__free_extent_buffer(eb);
 }

@@ -4579,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];
+
 		if (p != accessed)
 			mark_page_accessed(p);
 	}
@@ -4748,7 +4878,7 @@ again:
 	 */
 	SetPageChecked(eb->pages[0]);
 	for (i = 1; i < num_pages; i++) {
-		p = extent_buffer_page(eb, i);
+		p = eb->pages[i];
 		ClearPageChecked(p);
 		unlock_page(p);
 	}
@@ -4793,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 	}

 	/* Should be safe to release our pages at this point */
-	btrfs_release_extent_buffer_page(eb, 0);
+	btrfs_release_extent_buffer_page(eb);
 	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 	return 1;
 }
@@ -4859,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
 	num_pages = num_extent_pages(eb->start, eb->len);

 	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (!PageDirty(page))
 			continue;

@@ -4895,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

 	for (i = 0; i < num_pages; i++)
-		set_page_dirty(extent_buffer_page(eb, i));
+		set_page_dirty(eb->pages[i]);
 	return was_dirty;
 }

@@ -4908,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (page)
 			ClearPageUptodate(page);
 	}
@@ -4924,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		SetPageUptodate(page);
 	}
 	return 0;
@@ -4964,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (wait == WAIT_NONE) {
 			if (!trylock_page(page))
 				goto unlock_exit;
@@ -4983,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			goto unlock_exit;
 		}

-	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
 	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (!PageUptodate(page)) {
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
@@ -5012,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5012 return ret; 5142 return ret;
5013 5143
5014 for (i = start_i; i < num_pages; i++) { 5144 for (i = start_i; i < num_pages; i++) {
5015 page = extent_buffer_page(eb, i); 5145 page = eb->pages[i];
5016 wait_on_page_locked(page); 5146 wait_on_page_locked(page);
5017 if (!PageUptodate(page)) 5147 if (!PageUptodate(page))
5018 ret = -EIO; 5148 ret = -EIO;
@@ -5023,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5023unlock_exit: 5153unlock_exit:
5024 i = start_i; 5154 i = start_i;
5025 while (locked_pages > 0) { 5155 while (locked_pages > 0) {
5026 page = extent_buffer_page(eb, i); 5156 page = eb->pages[i];
5027 i++; 5157 i++;
5028 unlock_page(page); 5158 unlock_page(page);
5029 locked_pages--; 5159 locked_pages--;
@@ -5049,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5049 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5179 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5050 5180
5051 while (len > 0) { 5181 while (len > 0) {
5052 page = extent_buffer_page(eb, i); 5182 page = eb->pages[i];
5053 5183
5054 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5184 cur = min(len, (PAGE_CACHE_SIZE - offset));
5055 kaddr = page_address(page); 5185 kaddr = page_address(page);
@@ -5081,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5081 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5211 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5082 5212
5083 while (len > 0) { 5213 while (len > 0) {
5084 page = extent_buffer_page(eb, i); 5214 page = eb->pages[i];
5085 5215
5086 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5216 cur = min(len, (PAGE_CACHE_SIZE - offset));
5087 kaddr = page_address(page); 5217 kaddr = page_address(page);
@@ -5130,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5130 return -EINVAL; 5260 return -EINVAL;
5131 } 5261 }
5132 5262
5133 p = extent_buffer_page(eb, i); 5263 p = eb->pages[i];
5134 kaddr = page_address(p); 5264 kaddr = page_address(p);
5135 *map = kaddr + offset; 5265 *map = kaddr + offset;
5136 *map_len = PAGE_CACHE_SIZE - offset; 5266 *map_len = PAGE_CACHE_SIZE - offset;
@@ -5156,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
5156 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5286 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5157 5287
5158 while (len > 0) { 5288 while (len > 0) {
5159 page = extent_buffer_page(eb, i); 5289 page = eb->pages[i];
5160 5290
5161 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5291 cur = min(len, (PAGE_CACHE_SIZE - offset));
5162 5292
@@ -5190,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5190 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5320 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5191 5321
5192 while (len > 0) { 5322 while (len > 0) {
5193 page = extent_buffer_page(eb, i); 5323 page = eb->pages[i];
5194 WARN_ON(!PageUptodate(page)); 5324 WARN_ON(!PageUptodate(page));
5195 5325
5196 cur = min(len, PAGE_CACHE_SIZE - offset); 5326 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5220,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
5220 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5350 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5221 5351
5222 while (len > 0) { 5352 while (len > 0) {
5223 page = extent_buffer_page(eb, i); 5353 page = eb->pages[i];
5224 WARN_ON(!PageUptodate(page)); 5354 WARN_ON(!PageUptodate(page));
5225 5355
5226 cur = min(len, PAGE_CACHE_SIZE - offset); 5356 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5251,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5251 (PAGE_CACHE_SIZE - 1); 5381 (PAGE_CACHE_SIZE - 1);
5252 5382
5253 while (len > 0) { 5383 while (len > 0) {
5254 page = extent_buffer_page(dst, i); 5384 page = dst->pages[i];
5255 WARN_ON(!PageUptodate(page)); 5385 WARN_ON(!PageUptodate(page));
5256 5386
5257 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 5387 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5329,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5329 cur = min_t(unsigned long, cur, 5459 cur = min_t(unsigned long, cur,
5330 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 5460 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
5331 5461
5332 copy_pages(extent_buffer_page(dst, dst_i), 5462 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5333 extent_buffer_page(dst, src_i),
5334 dst_off_in_page, src_off_in_page, cur); 5463 dst_off_in_page, src_off_in_page, cur);
5335 5464
5336 src_offset += cur; 5465 src_offset += cur;
@@ -5376,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5376 5505
5377 cur = min_t(unsigned long, len, src_off_in_page + 1); 5506 cur = min_t(unsigned long, len, src_off_in_page + 1);
5378 cur = min(cur, dst_off_in_page + 1); 5507 cur = min(cur, dst_off_in_page + 1);
5379 copy_pages(extent_buffer_page(dst, dst_i), 5508 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5380 extent_buffer_page(dst, src_i),
5381 dst_off_in_page - cur + 1, 5509 dst_off_in_page - cur + 1,
5382 src_off_in_page - cur + 1, cur); 5510 src_off_in_page - cur + 1, cur);
5383 5511
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e7bde1..6d4b938be986 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
11#define EXTENT_NEW (1 << 4) 11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5) 12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 14#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 15#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 16#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
34 32
35/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
36#define EXTENT_BUFFER_UPTODATE 0 34#define EXTENT_BUFFER_UPTODATE 0
37#define EXTENT_BUFFER_BLOCKING 1
38#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
39#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
40#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ 37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
41#define EXTENT_BUFFER_TREE_REF 5 38#define EXTENT_BUFFER_TREE_REF 5
42#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
43#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
44#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
45#define EXTENT_BUFFER_DUMMY 9 42#define EXTENT_BUFFER_DUMMY 9
46#define EXTENT_BUFFER_IN_TREE 10 43#define EXTENT_BUFFER_IN_TREE 10
44#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
47 45
48/* these are flags for extent_clear_unlock_delalloc */ 46/* these are flags for extent_clear_unlock_delalloc */
49#define PAGE_UNLOCK (1 << 0) 47#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
57 * map has page->private set to one. 55 * map has page->private set to one.
58 */ 56 */
59#define EXTENT_PAGE_PRIVATE 1 57#define EXTENT_PAGE_PRIVATE 1
60#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
61 58
62struct extent_state; 59struct extent_state;
63struct btrfs_root; 60struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
108 struct rb_node rb_node; 105 struct rb_node rb_node;
109 106
110 /* ADD NEW ELEMENTS AFTER THIS */ 107 /* ADD NEW ELEMENTS AFTER THIS */
111 struct extent_io_tree *tree;
112 wait_queue_head_t wq; 108 wait_queue_head_t wq;
113 atomic_t refs; 109 atomic_t refs;
114 unsigned long state; 110 unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
126struct extent_buffer { 122struct extent_buffer {
127 u64 start; 123 u64 start;
128 unsigned long len; 124 unsigned long len;
129 unsigned long map_start;
130 unsigned long map_len;
131 unsigned long bflags; 125 unsigned long bflags;
132 struct btrfs_fs_info *fs_info; 126 struct btrfs_fs_info *fs_info;
133 spinlock_t refs_lock; 127 spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
144 atomic_t blocking_readers; 138 atomic_t blocking_readers;
145 atomic_t spinning_readers; 139 atomic_t spinning_readers;
146 atomic_t spinning_writers; 140 atomic_t spinning_writers;
147 int lock_nested; 141 short lock_nested;
142 /* >= 0 if eb belongs to a log tree, -1 otherwise */
143 short log_index;
148 144
149 /* protects write locks */ 145 /* protects write locks */
150 rwlock_t lock; 146 rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
286 (start >> PAGE_CACHE_SHIFT); 282 (start >> PAGE_CACHE_SHIFT);
287} 283}
288 284
289static inline struct page *extent_buffer_page(struct extent_buffer *eb,
290 unsigned long i)
291{
292 return eb->pages[i];
293}
294
295static inline void extent_buffer_get(struct extent_buffer *eb) 285static inline void extent_buffer_get(struct extent_buffer *eb)
296{ 286{
297 atomic_inc(&eb->refs); 287 atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
341 331
342struct btrfs_fs_info; 332struct btrfs_fs_info;
343 333
344int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 334int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
345 u64 length, u64 logical, struct page *page, 335 struct page *page, unsigned int pg_offset,
346 int mirror_num); 336 int mirror_num);
337int clean_io_failure(struct inode *inode, u64 start, struct page *page,
338 unsigned int pg_offset);
347int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 339int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
348int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 340int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
349 int mirror_num); 341 int mirror_num);
342
343/*
344 * When IO fails, either with EIO or a csum verification failure, we
345 * try other mirrors that might have a good copy of the data. This
346 * io_failure_record is used to record state as we go through all the
347 * mirrors. If another mirror has good data, the page is set up to date
348 * and things continue. If a good mirror can't be found, the original
349 * bio end_io callback is called to indicate things have failed.
350 */
351struct io_failure_record {
352 struct page *page;
353 u64 start;
354 u64 len;
355 u64 logical;
356 unsigned long bio_flags;
357 int this_mirror;
358 int failed_mirror;
359 int in_validation;
360};
361
362void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
363int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
364 struct io_failure_record **failrec_ret);
365int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
366 struct io_failure_record *failrec, int fail_mirror);
367struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
368 struct io_failure_record *failrec,
369 struct page *page, int pg_offset, int icsum,
370 bio_end_io_t *endio_func, void *data);
371int free_io_failure(struct inode *inode, struct io_failure_record *rec);
350#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 372#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
351noinline u64 find_lock_delalloc_range(struct inode *inode, 373noinline u64 find_lock_delalloc_range(struct inode *inode,
352 struct extent_io_tree *tree, 374 struct extent_io_tree *tree,
353 struct page *locked_page, u64 *start, 375 struct page *locked_page, u64 *start,
354 u64 *end, u64 max_bytes); 376 u64 *end, u64 max_bytes);
377#endif
355struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 378struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
356 u64 start, unsigned long len); 379 u64 start, unsigned long len);
357#endif 380#endif
358#endif
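The io failure helpers exported in this hunk encode the retry flow the comment above describes. A minimal sketch of a caller, hypothetical code rather than anything in this patch, with error handling trimmed:

	/* Sketch only: retry a failed read on another mirror. */
	static int try_repair_read(struct inode *inode, struct bio *failed_bio,
				   struct page *page, u64 start, u64 end,
				   int failed_mirror, bio_end_io_t *endio)
	{
		struct io_failure_record *failrec;
		struct bio *bio;
		int ret;

		ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
		if (ret)
			return ret;

		if (!btrfs_check_repairable(inode, failed_bio, failrec,
					    failed_mirror)) {
			free_io_failure(inode, failrec);
			return -EIO;
		}

		bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
					      0 /* pg_offset */, 0 /* icsum */,
					      endio, NULL);
		if (!bio) {
			free_io_failure(inode, failrec);
			return -EIO;
		}
		/* submit bio against failrec->this_mirror and return */
		return 0;
	}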
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54c84daec9b5..783a94355efd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
55 return -ENOMEM; 55 return -ENOMEM;
56 file_key.objectid = objectid; 56 file_key.objectid = objectid;
57 file_key.offset = pos; 57 file_key.offset = pos;
58 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 58 file_key.type = BTRFS_EXTENT_DATA_KEY;
59 59
60 path->leave_spinning = 1; 60 path->leave_spinning = 1;
61 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 61 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
100 100
101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
102 file_key.offset = bytenr; 102 file_key.offset = bytenr;
103 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 103 file_key.type = BTRFS_EXTENT_CSUM_KEY;
104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); 104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
105 if (ret < 0) 105 if (ret < 0)
106 goto fail; 106 goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
111 goto fail; 111 goto fail;
112 path->slots[0]--; 112 path->slots[0]--;
113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
114 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) 114 if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
115 goto fail; 115 goto fail;
116 116
117 csum_offset = (bytenr - found_key.offset) >> 117 csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148 148
149 file_key.objectid = objectid; 149 file_key.objectid = objectid;
150 file_key.offset = offset; 150 file_key.offset = offset;
151 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 151 file_key.type = BTRFS_EXTENT_DATA_KEY;
152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); 152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
153 return ret; 153 return ret;
154} 154}
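The btrfs_set_key_type()/btrfs_key_type() calls removed in this file (and in file.c, inode-item.c and inode.c below) were trivial wrappers around the CPU-order key's type field; from memory they looked roughly like this, so open-coding the assignment loses nothing:

	static inline void btrfs_set_key_type(struct btrfs_key *key, u32 val)
	{
		key->type = (u8)val;
	}

	static inline u32 btrfs_key_type(struct btrfs_key *key)
	{
		return key->type;
	}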
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
299} 299}
300 300
301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
302 struct btrfs_dio_private *dip, struct bio *bio, 302 struct bio *bio, u64 offset)
303 u64 offset)
304{ 303{
305 int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; 304 return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
306 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
307 int ret;
308
309 len >>= inode->i_sb->s_blocksize_bits;
310 len *= csum_size;
311
312 ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
313 (u32 *)(dip->csum + len), 1);
314 return ret;
315} 305}
316 306
317int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 307int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
329 u64 csum_end; 319 u64 csum_end;
330 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 320 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
331 321
332 ASSERT(start == ALIGN(start, root->sectorsize) && 322 ASSERT(IS_ALIGNED(start, root->sectorsize) &&
333 (end + 1) == ALIGN(end + 1, root->sectorsize)); 323 IS_ALIGNED(end + 1, root->sectorsize));
334 324
335 path = btrfs_alloc_path(); 325 path = btrfs_alloc_path();
336 if (!path) 326 if (!path)
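IS_ALIGNED() is the generic macro from include/linux/kernel.h and, for a power-of-two sectorsize, is equivalent to the ALIGN() comparison it replaces:

	#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)

	/* start == ALIGN(start, sectorsize)  <=>  IS_ALIGNED(start, sectorsize) */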
@@ -720,7 +710,7 @@ again:
720 bytenr = sums->bytenr + total_bytes; 710 bytenr = sums->bytenr + total_bytes;
721 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 711 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
722 file_key.offset = bytenr; 712 file_key.offset = bytenr;
723 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 713 file_key.type = BTRFS_EXTENT_CSUM_KEY;
724 714
725 item = btrfs_lookup_csum(trans, root, path, bytenr, 1); 715 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
726 if (!IS_ERR(item)) { 716 if (!IS_ERR(item)) {
@@ -790,7 +780,7 @@ again:
790 csum_offset = (bytenr - found_key.offset) >> 780 csum_offset = (bytenr - found_key.offset) >>
791 root->fs_info->sb->s_blocksize_bits; 781 root->fs_info->sb->s_blocksize_bits;
792 782
793 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || 783 if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
794 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 784 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
795 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { 785 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
796 goto insert; 786 goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d3afac292d67..a18ceabd99a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
299 299
300 /* get the inode */ 300 /* get the inode */
301 key.objectid = defrag->root; 301 key.objectid = defrag->root;
302 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 302 key.type = BTRFS_ROOT_ITEM_KEY;
303 key.offset = (u64)-1; 303 key.offset = (u64)-1;
304 304
305 index = srcu_read_lock(&fs_info->subvol_srcu); 305 index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
311 } 311 }
312 312
313 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 key.type = BTRFS_INODE_ITEM_KEY;
315 key.offset = 0; 315 key.offset = 0;
316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
317 if (IS_ERR(inode)) { 317 if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
452 if (unlikely(copied == 0)) 452 if (unlikely(copied == 0))
453 break; 453 break;
454 454
455 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 455 if (copied < PAGE_CACHE_SIZE - offset) {
456 offset += copied; 456 offset += copied;
457 } else { 457 } else {
458 pg++; 458 pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1481 bool force_page_uptodate = false; 1481 bool force_page_uptodate = false;
1482 bool need_unlock; 1482 bool need_unlock;
1483 1483
1484 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1484 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
1485 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1485 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1486 (sizeof(struct page *)));
1487 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1486 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1488 nrptrs = max(nrptrs, 8); 1487 nrptrs = max(nrptrs, 8);
1489 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1488 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1497 size_t write_bytes = min(iov_iter_count(i), 1496 size_t write_bytes = min(iov_iter_count(i),
1498 nrptrs * (size_t)PAGE_CACHE_SIZE - 1497 nrptrs * (size_t)PAGE_CACHE_SIZE -
1499 offset); 1498 offset);
1500 size_t num_pages = (write_bytes + offset + 1499 size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1501 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1500 PAGE_CACHE_SIZE);
1502 size_t reserve_bytes; 1501 size_t reserve_bytes;
1503 size_t dirty_pages; 1502 size_t dirty_pages;
1504 size_t copied; 1503 size_t copied;
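The DIV_ROUND_UP() conversions here are mechanical. The macro comes from include/linux/kernel.h:

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	/*
	 * The old open-coded form
	 *	(write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT
	 * equals
	 *	DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE)
	 * because PAGE_CACHE_SIZE == 1UL << PAGE_CACHE_SHIFT.
	 */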
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1526 * our prealloc extent may be smaller than 1525 * our prealloc extent may be smaller than
1527 * write_bytes, so scale down. 1526 * write_bytes, so scale down.
1528 */ 1527 */
1529 num_pages = (write_bytes + offset + 1528 num_pages = DIV_ROUND_UP(write_bytes + offset,
1530 PAGE_CACHE_SIZE - 1) >> 1529 PAGE_CACHE_SIZE);
1531 PAGE_CACHE_SHIFT;
1532 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1530 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1533 ret = 0; 1531 ret = 0;
1534 } else { 1532 } else {
@@ -1590,9 +1588,8 @@ again:
1590 dirty_pages = 0; 1588 dirty_pages = 0;
1591 } else { 1589 } else {
1592 force_page_uptodate = false; 1590 force_page_uptodate = false;
1593 dirty_pages = (copied + offset + 1591 dirty_pages = DIV_ROUND_UP(copied + offset,
1594 PAGE_CACHE_SIZE - 1) >> 1592 PAGE_CACHE_SIZE);
1595 PAGE_CACHE_SHIFT;
1596 } 1593 }
1597 1594
1598 /* 1595 /*
@@ -1653,7 +1650,7 @@ again:
1653 cond_resched(); 1650 cond_resched();
1654 1651
1655 balance_dirty_pages_ratelimited(inode->i_mapping); 1652 balance_dirty_pages_ratelimited(inode->i_mapping);
1656 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1653 if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
1657 btrfs_btree_balance_dirty(root); 1654 btrfs_btree_balance_dirty(root);
1658 1655
1659 pos += copied; 1656 pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1795 if (sync) 1792 if (sync)
1796 atomic_inc(&BTRFS_I(inode)->sync_writers); 1793 atomic_inc(&BTRFS_I(inode)->sync_writers);
1797 1794
1798 if (unlikely(file->f_flags & O_DIRECT)) { 1795 if (file->f_flags & O_DIRECT) {
1799 num_written = __btrfs_direct_write(iocb, from, pos); 1796 num_written = __btrfs_direct_write(iocb, from, pos);
1800 } else { 1797 } else {
1801 num_written = __btrfs_buffered_write(file, from, pos); 1798 num_written = __btrfs_buffered_write(file, from, pos);
@@ -1840,10 +1837,32 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1840{ 1837{
1841 if (filp->private_data) 1838 if (filp->private_data)
1842 btrfs_ioctl_trans_end(filp); 1839 btrfs_ioctl_trans_end(filp);
1843 filemap_flush(inode->i_mapping); 1840 /*
1841 * ordered_data_close is set by setattr when we are about to truncate
1842 * a file from a non-zero size to a zero size. This tries to
1843 * flush down new bytes that may have been written if the
1844 * application were using truncate to replace a file in place.
1845 */
1846 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1847 &BTRFS_I(inode)->runtime_flags))
1848 filemap_flush(inode->i_mapping);
1844 return 0; 1849 return 0;
1845} 1850}
1846 1851
1852static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1853{
1854 int ret;
1855
1856 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862
1863 return ret;
1864}
1865
1847/* 1866/*
1848 * fsync call for both files and directories. This logs the inode into 1867 * fsync call for both files and directories. This logs the inode into
1849 * the tree log instead of forcing full commits whenever possible. 1868 * the tree log instead of forcing full commits whenever possible.
@@ -1873,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1873 * multiple tasks and improve the performance. See 1892 * multiple tasks and improve the performance. See
1874 * btrfs_wait_ordered_range for an explanation of the ASYNC check. 1893 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1875 */ 1894 */
1876 atomic_inc(&BTRFS_I(inode)->sync_writers); 1895 ret = start_ordered_ops(inode, start, end);
1877 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1878 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1879 &BTRFS_I(inode)->runtime_flags))
1880 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1881 atomic_dec(&BTRFS_I(inode)->sync_writers);
1882 if (ret) 1896 if (ret)
1883 return ret; 1897 return ret;
1884 1898
1885 mutex_lock(&inode->i_mutex); 1899 mutex_lock(&inode->i_mutex);
1886
1887 /*
1888 * We flush the dirty pages again to avoid some dirty pages in the
1889 * range being left.
1890 */
1891 atomic_inc(&root->log_batch); 1900 atomic_inc(&root->log_batch);
1892 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1901 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1893 &BTRFS_I(inode)->runtime_flags); 1902 &BTRFS_I(inode)->runtime_flags);
1903 /*
1904 * We might have had more pages made dirty after calling
1905 * start_ordered_ops and before acquiring the inode's i_mutex.
1906 */
1894 if (full_sync) { 1907 if (full_sync) {
1908 /*
1909 * For a full sync, we need to make sure any ordered operations
1910 * start and finish before we start logging the inode, so that
1911 * all extents are persisted and the respective file extent
1912 * items are in the fs/subvol btree.
1913 */
1895 ret = btrfs_wait_ordered_range(inode, start, end - start + 1); 1914 ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1896 if (ret) { 1915 } else {
1897 mutex_unlock(&inode->i_mutex); 1916 /*
1898 goto out; 1917 * Start any new ordered operations before starting to log the
1899 } 1918 * inode. We will wait for them to finish in btrfs_sync_log().
1919 *
1920 * Right before acquiring the inode's mutex, we might have new
1921 * writes dirtying pages, which won't immediately start the
1922 * respective ordered operations - that is done through the
1923 * fill_delalloc callbacks invoked from the writepage and
1924 * writepages address space operations. So make sure we start
1925 * all ordered operations before starting to log our inode. Not
1926 * doing this means that while logging the inode, writeback
1927 * could start and invoke writepage/writepages, which would call
1928 * the fill_delalloc callbacks (cow_file_range,
1929 * submit_compressed_extents). These callbacks first add an
1930 * extent map to the modified list of extents and then create
1931 * the respective ordered operation, which means in
1932 * tree-log.c:btrfs_log_inode() we might capture all existing
1933 * ordered operations (with btrfs_get_logged_extents()) before
1934 * the fill_delalloc callback adds its ordered operation, and by
1935 * the time we visit the modified list of extent maps (with
1936 * btrfs_log_changed_extents()), we see and process the extent
1937 * map they created. We then use the extent map to construct a
1938 * file extent item for logging without waiting for the
1939 * respective ordered operation to finish - this file extent
1940 * item points to a disk location that might not have yet been
1941 * written to, containing random data - so after a crash a log
1942 * replay will make our inode have file extent items that point
1943 * to disk locations containing invalid data, as we returned
1944 * success to userspace without waiting for the respective
1945 * ordered operation to finish, because it wasn't captured by
1946 * btrfs_get_logged_extents().
1947 */
1948 ret = start_ordered_ops(inode, start, end);
1949 }
1950 if (ret) {
1951 mutex_unlock(&inode->i_mutex);
1952 goto out;
1900 } 1953 }
1901 atomic_inc(&root->log_batch); 1954 atomic_inc(&root->log_batch);
1902 1955
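Condensed, the ordering this hunk establishes: flush delalloc, take i_mutex, then either wait for ordered extents to complete (full sync) or only make sure they have started (fast sync) before logging. An illustrative reduction of the control flow, not the literal function body:

	ret = start_ordered_ops(inode, start, end);	/* kick off delalloc */
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);
	atomic_inc(&root->log_batch);
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags);
	if (full_sync)
		/* extents and file extent items must reach the subvol tree */
		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
	else
		/* started is enough; btrfs_sync_log() waits for them later */
		ret = start_ordered_ops(inode, start, end);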
@@ -1958,7 +2011,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1958 2011
1959 btrfs_init_log_ctx(&ctx); 2012 btrfs_init_log_ctx(&ctx);
1960 2013
1961 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); 2014 ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
1962 if (ret < 0) { 2015 if (ret < 0) {
1963 /* Fallthrough and commit/free transaction. */ 2016 /* Fallthrough and commit/free transaction. */
1964 ret = 1; 2017 ret = 1;
@@ -1976,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1976 */ 2029 */
1977 mutex_unlock(&inode->i_mutex); 2030 mutex_unlock(&inode->i_mutex);
1978 2031
2032 /*
2033 * If any of the ordered extents had an error, just return it to user
2034 * space, so that the application knows some writes didn't succeed and
2035 * can take proper action (e.g. retry). Blindly committing the
2036 * transaction in this case would fool userspace into thinking everything was
2037 * successful. And we also want to make sure our log doesn't contain
2038 * file extent items pointing to extents that weren't fully written to -
2039 * just like in the non fast fsync path, where we check for the ordered
2040 * operation's error flag before writing to the log tree and return -EIO
2041 * if any of them had this flag set (btrfs_wait_ordered_range) -
2042 * therefore we need to check for errors in the ordered operations,
2043 * which are indicated by ctx.io_err.
2044 */
2045 if (ctx.io_err) {
2046 btrfs_end_transaction(trans, root);
2047 ret = ctx.io_err;
2048 goto out;
2049 }
2050
1979 if (ret != BTRFS_NO_LOG_SYNC) { 2051 if (ret != BTRFS_NO_LOG_SYNC) {
1980 if (!ret) { 2052 if (!ret) {
1981 ret = btrfs_sync_log(trans, root, &ctx); 2053 ret = btrfs_sync_log(trans, root, &ctx);
@@ -2088,10 +2160,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2088 goto out; 2160 goto out;
2089 } 2161 }
2090 2162
2091 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 2163 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2092 u64 num_bytes; 2164 u64 num_bytes;
2093 2165
2094 path->slots[0]++;
2095 key.offset = offset; 2166 key.offset = offset;
2096 btrfs_set_item_key_safe(root, path, &key); 2167 btrfs_set_item_key_safe(root, path, &key);
2097 fi = btrfs_item_ptr(leaf, path->slots[0], 2168 fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2216,7 +2287,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2216 goto out_only_mutex; 2287 goto out_only_mutex;
2217 } 2288 }
2218 2289
2219 lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); 2290 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2220 lockend = round_down(offset + len, 2291 lockend = round_down(offset + len,
2221 BTRFS_I(inode)->root->sectorsize) - 1; 2292 BTRFS_I(inode)->root->sectorsize) - 1;
2222 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2293 same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2277,7 +2348,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 tail_start + tail_len, 0, 1); 2348 tail_start + tail_len, 0, 1);
2278 if (ret) 2349 if (ret)
2279 goto out_only_mutex; 2350 goto out_only_mutex;
2280 } 2351 }
2281 } 2352 }
2282 } 2353 }
2283 2354
@@ -2614,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2614 struct btrfs_root *root = BTRFS_I(inode)->root; 2685 struct btrfs_root *root = BTRFS_I(inode)->root;
2615 struct extent_map *em = NULL; 2686 struct extent_map *em = NULL;
2616 struct extent_state *cached_state = NULL; 2687 struct extent_state *cached_state = NULL;
2617 u64 lockstart = *offset; 2688 u64 lockstart;
2618 u64 lockend = i_size_read(inode); 2689 u64 lockend;
2619 u64 start = *offset; 2690 u64 start;
2620 u64 len = i_size_read(inode); 2691 u64 len;
2621 int ret = 0; 2692 int ret = 0;
2622 2693
2623 lockend = max_t(u64, root->sectorsize, lockend); 2694 if (inode->i_size == 0)
2695 return -ENXIO;
2696
2697 /*
2698 * *offset can be negative; in this case we start finding DATA/HOLE from
2699 * the very start of the file.
2700 */
2701 start = max_t(loff_t, 0, *offset);
2702
2703 lockstart = round_down(start, root->sectorsize);
2704 lockend = round_up(i_size_read(inode), root->sectorsize);
2624 if (lockend <= lockstart) 2705 if (lockend <= lockstart)
2625 lockend = lockstart + root->sectorsize; 2706 lockend = lockstart + root->sectorsize;
2626
2627 lockend--; 2707 lockend--;
2628 len = lockend - lockstart + 1; 2708 len = lockend - lockstart + 1;
2629 2709
2630 len = max_t(u64, len, root->sectorsize);
2631 if (inode->i_size == 0)
2632 return -ENXIO;
2633
2634 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2710 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2635 &cached_state); 2711 &cached_state);
2636 2712
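round_down()/round_up() are the standard power-of-two helpers from include/linux/kernel.h. A worked example of the new lock-range computation, assuming sectorsize == 4096, *offset == 5000 and i_size == 10000:

	#define __round_mask(x, y)	((__typeof__(x))((y) - 1))
	#define round_up(x, y)		((((x) - 1) | __round_mask(x, y)) + 1)
	#define round_down(x, y)	((x) & ~__round_mask(x, y))

	/*
	 * start     = max_t(loff_t, 0, 5000)  = 5000
	 * lockstart = round_down(5000, 4096)  = 4096
	 * lockend   = round_up(10000, 4096)   = 12288, then lockend-- = 12287
	 * so [4096, 12287] locks whole sectors around the search range.
	 */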
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627cb5f9..33848196550e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
279 int num_pages; 279 int num_pages;
280 int check_crcs = 0; 280 int check_crcs = 0;
281 281
282 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 282 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
283 PAGE_CACHE_SHIFT;
284 283
285 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
286 check_crcs = 1; 285 check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1998 return merged; 1997 return merged;
1999} 1998}
2000 1999
2000static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2001 struct btrfs_free_space *info,
2002 bool update_stat)
2003{
2004 struct btrfs_free_space *bitmap;
2005 unsigned long i;
2006 unsigned long j;
2007 const u64 end = info->offset + info->bytes;
2008 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2009 u64 bytes;
2010
2011 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2012 if (!bitmap)
2013 return false;
2014
2015 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2016 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2017 if (j == i)
2018 return false;
2019 bytes = (j - i) * ctl->unit;
2020 info->bytes += bytes;
2021
2022 if (update_stat)
2023 bitmap_clear_bits(ctl, bitmap, end, bytes);
2024 else
2025 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2026
2027 if (!bitmap->bytes)
2028 free_bitmap(ctl, bitmap);
2029
2030 return true;
2031}
2032
2033static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2034 struct btrfs_free_space *info,
2035 bool update_stat)
2036{
2037 struct btrfs_free_space *bitmap;
2038 u64 bitmap_offset;
2039 unsigned long i;
2040 unsigned long j;
2041 unsigned long prev_j;
2042 u64 bytes;
2043
2044 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2045 /* If we're on a boundary, try the previous logical bitmap. */
2046 if (bitmap_offset == info->offset) {
2047 if (info->offset == 0)
2048 return false;
2049 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2050 }
2051
2052 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2053 if (!bitmap)
2054 return false;
2055
2056 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2057 j = 0;
2058 prev_j = (unsigned long)-1;
2059 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2060 if (j > i)
2061 break;
2062 prev_j = j;
2063 }
2064 if (prev_j == i)
2065 return false;
2066
2067 if (prev_j == (unsigned long)-1)
2068 bytes = (i + 1) * ctl->unit;
2069 else
2070 bytes = (i - prev_j) * ctl->unit;
2071
2072 info->offset -= bytes;
2073 info->bytes += bytes;
2074
2075 if (update_stat)
2076 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2077 else
2078 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2079
2080 if (!bitmap->bytes)
2081 free_bitmap(ctl, bitmap);
2082
2083 return true;
2084}
2085
2086/*
2087 * We always prefer to allocate from extent entries, both for clustered and
2088 * non-clustered allocation requests. So when attempting to add a new extent
2089 * entry, try to see if there's adjacent free space in bitmap entries, and if
2090 * there is, migrate that space from the bitmaps to the extent.
2091 * This way we get better chances of satisfying space allocation requests
2092 * because we attempt to satisfy them based on a single cache entry, and never
2093 * on 2 or more entries - even if the entries represent a contiguous free space
2094 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2095 * ends).
2096 */
2097static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2098 struct btrfs_free_space *info,
2099 bool update_stat)
2100{
2101 /*
2102 * Only work with disconnected entries, as we can change their offset,
2103 * and they must be extent entries, not bitmaps.
2104 */
2105 ASSERT(!info->bitmap);
2106 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2107
2108 if (ctl->total_bitmaps > 0) {
2109 bool stole_end;
2110 bool stole_front = false;
2111
2112 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2113 if (ctl->total_bitmaps > 0)
2114 stole_front = steal_from_bitmap_to_front(ctl, info,
2115 update_stat);
2116
2117 if (stole_end || stole_front)
2118 try_merge_free_space(ctl, info, update_stat);
2119 }
2120}
2121
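A concrete trace of steal_from_bitmap_to_end() above, assuming ctl->unit == 4096 and a bitmap entry starting right at the extent's end:

	/*
	 * extent entry: offset = 0, bytes = 8192       -> end = 8192
	 * bitmap at offset 8192 with bits 0 and 1 set  -> [8192, 16384) free
	 *
	 *   i = offset_to_bit(8192, 4096, 8192) = 0
	 *   j = find_next_zero_bit(...) = 2
	 *   bytes = (2 - 0) * 4096 = 8192
	 *
	 * The extent entry grows to [0, 16384), the two bits are cleared, and
	 * a later 16K allocation is served from a single cache entry.
	 */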
2001int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 2122int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2002 u64 offset, u64 bytes) 2123 u64 offset, u64 bytes)
2003{ 2124{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2010 2131
2011 info->offset = offset; 2132 info->offset = offset;
2012 info->bytes = bytes; 2133 info->bytes = bytes;
2134 RB_CLEAR_NODE(&info->offset_index);
2013 2135
2014 spin_lock(&ctl->tree_lock); 2136 spin_lock(&ctl->tree_lock);
2015 2137
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2029 goto out; 2151 goto out;
2030 } 2152 }
2031link: 2153link:
2154 /*
2155 * Only steal free space from adjacent bitmaps if we're sure we're not
2156 * going to add the new free space to existing bitmap entries - because
2157 * that would mean unnecessary work that would be reverted. Therefore
2158 * attempt to steal space from bitmaps if we're adding an extent entry.
2159 */
2160 steal_from_bitmap(ctl, info, true);
2161
2032 ret = link_free_space(ctl, info); 2162 ret = link_free_space(ctl, info);
2033 if (ret) 2163 if (ret)
2034 kmem_cache_free(btrfs_free_space_cachep, info); 2164 kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
2205 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2335 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2206 node = rb_next(&entry->offset_index); 2336 node = rb_next(&entry->offset_index);
2207 rb_erase(&entry->offset_index, &cluster->root); 2337 rb_erase(&entry->offset_index, &cluster->root);
2338 RB_CLEAR_NODE(&entry->offset_index);
2208 2339
2209 bitmap = (entry->bitmap != NULL); 2340 bitmap = (entry->bitmap != NULL);
2210 if (!bitmap) 2341 if (!bitmap) {
2211 try_merge_free_space(ctl, entry, false); 2342 try_merge_free_space(ctl, entry, false);
2343 steal_from_bitmap(ctl, entry, false);
2344 }
2212 tree_insert_offset(&ctl->free_space_offset, 2345 tree_insert_offset(&ctl->free_space_offset,
2213 entry->offset, &entry->offset_index, bitmap); 2346 entry->offset, &entry->offset_index, bitmap);
2214 } 2347 }
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3033{ 3166{
3034 struct inode *inode = NULL; 3167 struct inode *inode = NULL;
3035 3168
3036 spin_lock(&root->cache_lock); 3169 spin_lock(&root->ino_cache_lock);
3037 if (root->cache_inode) 3170 if (root->ino_cache_inode)
3038 inode = igrab(root->cache_inode); 3171 inode = igrab(root->ino_cache_inode);
3039 spin_unlock(&root->cache_lock); 3172 spin_unlock(&root->ino_cache_lock);
3040 if (inode) 3173 if (inode)
3041 return inode; 3174 return inode;
3042 3175
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3044 if (IS_ERR(inode)) 3177 if (IS_ERR(inode))
3045 return inode; 3178 return inode;
3046 3179
3047 spin_lock(&root->cache_lock); 3180 spin_lock(&root->ino_cache_lock);
3048 if (!btrfs_fs_closing(root->fs_info)) 3181 if (!btrfs_fs_closing(root->fs_info))
3049 root->cache_inode = igrab(inode); 3182 root->ino_cache_inode = igrab(inode);
3050 spin_unlock(&root->cache_lock); 3183 spin_unlock(&root->ino_cache_lock);
3051 3184
3052 return inode; 3185 return inode;
3053} 3186}
@@ -3176,6 +3309,7 @@ again:
3176 map = NULL; 3309 map = NULL;
3177 add_new_bitmap(ctl, info, offset); 3310 add_new_bitmap(ctl, info, offset);
3178 bitmap_info = info; 3311 bitmap_info = info;
3312 info = NULL;
3179 } 3313 }
3180 3314
3181 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3315 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
3186 if (bytes) 3320 if (bytes)
3187 goto again; 3321 goto again;
3188 3322
3323 if (info)
3324 kmem_cache_free(btrfs_free_space_cachep, info);
3189 if (map) 3325 if (map)
3190 kfree(map); 3326 kfree(map);
3191 return 0; 3327 return 0;
@@ -3260,6 +3396,7 @@ have_info:
3260 goto have_info; 3396 goto have_info;
3261 } 3397 }
3262 3398
3399 ret = 0;
3263 goto out; 3400 goto out;
3264 } 3401 }
3265 3402
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa82c62..64f15bb30a81 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
20int __init btrfs_hash_init(void) 20int __init btrfs_hash_init(void)
21{ 21{
22 tfm = crypto_alloc_shash("crc32c", 0, 0); 22 tfm = crypto_alloc_shash("crc32c", 0, 0);
23 if (IS_ERR(tfm))
24 return PTR_ERR(tfm);
25 23
26 return 0; 24 return PTR_ERR_OR_ZERO(tfm);
27} 25}
28 26
29void btrfs_hash_exit(void) 27void btrfs_hash_exit(void)
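PTR_ERR_OR_ZERO() is from include/linux/err.h and simply folds the removed IS_ERR() branch into the return value:

	static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
	{
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);
		else
			return 0;
	}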
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df703c9..8ffa4783cbf4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
135 u32 item_size; 135 u32 item_size;
136 136
137 key.objectid = inode_objectid; 137 key.objectid = inode_objectid;
138 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 138 key.type = BTRFS_INODE_EXTREF_KEY;
139 key.offset = btrfs_extref_hash(ref_objectid, name, name_len); 139 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
140 140
141 path = btrfs_alloc_path(); 141 path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
209 209
210 key.objectid = inode_objectid; 210 key.objectid = inode_objectid;
211 key.offset = ref_objectid; 211 key.offset = ref_objectid;
212 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 212 key.type = BTRFS_INODE_REF_KEY;
213 213
214 path = btrfs_alloc_path(); 214 path = btrfs_alloc_path();
215 if (!path) 215 if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
337 337
338 key.objectid = inode_objectid; 338 key.objectid = inode_objectid;
339 key.offset = ref_objectid; 339 key.offset = ref_objectid;
340 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 340 key.type = BTRFS_INODE_REF_KEY;
341 341
342 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
343 if (!path) 343 if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
400 struct btrfs_key key; 400 struct btrfs_key key;
401 int ret; 401 int ret;
402 key.objectid = objectid; 402 key.objectid = objectid;
403 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 403 key.type = BTRFS_INODE_ITEM_KEY;
404 key.offset = 0; 404 key.offset = 0;
405 405
406 ret = btrfs_insert_empty_item(trans, root, path, &key, 406 ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
420 struct btrfs_key found_key; 420 struct btrfs_key found_key;
421 421
422 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); 422 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
423 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && 423 if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
424 location->offset == (u64)-1 && path->slots[0] != 0) { 424 location->offset == (u64)-1 && path->slots[0] != 0) {
425 slot = path->slots[0] - 1; 425 slot = path->slots[0] - 1;
426 leaf = path->nodes[0]; 426 leaf = path->nodes[0];
427 btrfs_item_key_to_cpu(leaf, &found_key, slot); 427 btrfs_item_key_to_cpu(leaf, &found_key, slot);
428 if (found_key.objectid == location->objectid && 428 if (found_key.objectid == location->objectid &&
429 btrfs_key_type(&found_key) == btrfs_key_type(location)) { 429 found_key.type == location->type) {
430 path->slots[0]--; 430 path->slots[0]--;
431 return 0; 431 return 0;
432 } 432 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe19079f..83d646bd2e4b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
87 */ 87 */
88 btrfs_item_key_to_cpu(leaf, &key, 0); 88 btrfs_item_key_to_cpu(leaf, &key, 0);
89 btrfs_release_path(path); 89 btrfs_release_path(path);
90 root->cache_progress = last; 90 root->ino_cache_progress = last;
91 up_read(&fs_info->commit_root_sem); 91 up_read(&fs_info->commit_root_sem);
92 schedule_timeout(1); 92 schedule_timeout(1);
93 goto again; 93 goto again;
@@ -106,7 +106,7 @@ again:
106 if (last != (u64)-1 && last + 1 != key.objectid) { 106 if (last != (u64)-1 && last + 1 != key.objectid) {
107 __btrfs_add_free_space(ctl, last + 1, 107 __btrfs_add_free_space(ctl, last + 1,
108 key.objectid - last - 1); 108 key.objectid - last - 1);
109 wake_up(&root->cache_wait); 109 wake_up(&root->ino_cache_wait);
110 } 110 }
111 111
112 last = key.objectid; 112 last = key.objectid;
@@ -119,14 +119,14 @@ next:
119 root->highest_objectid - last - 1); 119 root->highest_objectid - last - 1);
120 } 120 }
121 121
122 spin_lock(&root->cache_lock); 122 spin_lock(&root->ino_cache_lock);
123 root->cached = BTRFS_CACHE_FINISHED; 123 root->ino_cache_state = BTRFS_CACHE_FINISHED;
124 spin_unlock(&root->cache_lock); 124 spin_unlock(&root->ino_cache_lock);
125 125
126 root->cache_progress = (u64)-1; 126 root->ino_cache_progress = (u64)-1;
127 btrfs_unpin_free_ino(root); 127 btrfs_unpin_free_ino(root);
128out: 128out:
129 wake_up(&root->cache_wait); 129 wake_up(&root->ino_cache_wait);
130 up_read(&fs_info->commit_root_sem); 130 up_read(&fs_info->commit_root_sem);
131 131
132 btrfs_free_path(path); 132 btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
144 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 144 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
145 return; 145 return;
146 146
147 spin_lock(&root->cache_lock); 147 spin_lock(&root->ino_cache_lock);
148 if (root->cached != BTRFS_CACHE_NO) { 148 if (root->ino_cache_state != BTRFS_CACHE_NO) {
149 spin_unlock(&root->cache_lock); 149 spin_unlock(&root->ino_cache_lock);
150 return; 150 return;
151 } 151 }
152 152
153 root->cached = BTRFS_CACHE_STARTED; 153 root->ino_cache_state = BTRFS_CACHE_STARTED;
154 spin_unlock(&root->cache_lock); 154 spin_unlock(&root->ino_cache_lock);
155 155
156 ret = load_free_ino_cache(root->fs_info, root); 156 ret = load_free_ino_cache(root->fs_info, root);
157 if (ret == 1) { 157 if (ret == 1) {
158 spin_lock(&root->cache_lock); 158 spin_lock(&root->ino_cache_lock);
159 root->cached = BTRFS_CACHE_FINISHED; 159 root->ino_cache_state = BTRFS_CACHE_FINISHED;
160 spin_unlock(&root->cache_lock); 160 spin_unlock(&root->ino_cache_lock);
161 return; 161 return;
162 } 162 }
163 163
@@ -196,11 +196,11 @@ again:
196 196
197 start_caching(root); 197 start_caching(root);
198 198
199 wait_event(root->cache_wait, 199 wait_event(root->ino_cache_wait,
200 root->cached == BTRFS_CACHE_FINISHED || 200 root->ino_cache_state == BTRFS_CACHE_FINISHED ||
201 root->free_ino_ctl->free_space > 0); 201 root->free_ino_ctl->free_space > 0);
202 202
203 if (root->cached == BTRFS_CACHE_FINISHED && 203 if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
204 root->free_ino_ctl->free_space == 0) 204 root->free_ino_ctl->free_space == 0)
205 return -ENOSPC; 205 return -ENOSPC;
206 else 206 else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
214 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 214 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
215 return; 215 return;
216again: 216again:
217 if (root->cached == BTRFS_CACHE_FINISHED) { 217 if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
218 __btrfs_add_free_space(pinned, objectid, 1); 218 __btrfs_add_free_space(pinned, objectid, 1);
219 } else { 219 } else {
220 down_write(&root->fs_info->commit_root_sem); 220 down_write(&root->fs_info->commit_root_sem);
221 spin_lock(&root->cache_lock); 221 spin_lock(&root->ino_cache_lock);
222 if (root->cached == BTRFS_CACHE_FINISHED) { 222 if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
223 spin_unlock(&root->cache_lock); 223 spin_unlock(&root->ino_cache_lock);
224 up_write(&root->fs_info->commit_root_sem); 224 up_write(&root->fs_info->commit_root_sem);
225 goto again; 225 goto again;
226 } 226 }
227 spin_unlock(&root->cache_lock); 227 spin_unlock(&root->ino_cache_lock);
228 228
229 start_caching(root); 229 start_caching(root);
230 230
@@ -235,10 +235,10 @@ again:
235} 235}
236 236
237/* 237/*
238 * When a transaction is committed, we'll move those inode numbers which 238 * When a transaction is committed, we'll move those inode numbers which are
239 * are smaller than root->cache_progress from pinned tree to free_ino tree, 239 * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
240 * and others will just be dropped, because the commit root we were 240 * others will just be dropped, because the commit root we were searching has
241 * searching has changed. 241 * changed.
242 * 242 *
243 * Must be called with root->fs_info->commit_root_sem held 243 * Must be called with root->fs_info->commit_root_sem held
244 */ 244 */
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
261 info = rb_entry(n, struct btrfs_free_space, offset_index); 261 info = rb_entry(n, struct btrfs_free_space, offset_index);
262 BUG_ON(info->bitmap); /* Logic error */ 262 BUG_ON(info->bitmap); /* Logic error */
263 263
264 if (info->offset > root->cache_progress) 264 if (info->offset > root->ino_cache_progress)
265 goto free; 265 goto free;
266 else if (info->offset + info->bytes > root->cache_progress) 266 else if (info->offset + info->bytes > root->ino_cache_progress)
267 count = root->cache_progress - info->offset + 1; 267 count = root->ino_cache_progress - info->offset + 1;
268 else 268 else
269 count = info->bytes; 269 count = info->bytes;
270 270
@@ -462,13 +462,13 @@ again:
462 } 462 }
463 } 463 }
464 464
465 spin_lock(&root->cache_lock); 465 spin_lock(&root->ino_cache_lock);
466 if (root->cached != BTRFS_CACHE_FINISHED) { 466 if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
467 ret = -1; 467 ret = -1;
468 spin_unlock(&root->cache_lock); 468 spin_unlock(&root->ino_cache_lock);
469 goto out_put; 469 goto out_put;
470 } 470 }
471 spin_unlock(&root->cache_lock); 471 spin_unlock(&root->ino_cache_lock);
472 472
473 spin_lock(&ctl->tree_lock); 473 spin_lock(&ctl->tree_lock);
474 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; 474 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 03708ef3deef..fc9c0439caa3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
153 153
154 key.objectid = btrfs_ino(inode); 154 key.objectid = btrfs_ino(inode);
155 key.offset = start; 155 key.offset = start;
156 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 156 key.type = BTRFS_EXTENT_DATA_KEY;
157 157
158 datasize = btrfs_file_extent_calc_inline_size(cur_size); 158 datasize = btrfs_file_extent_calc_inline_size(cur_size);
159 path->leave_spinning = 1; 159 path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
249 data_len = compressed_size; 249 data_len = compressed_size;
250 250
251 if (start > 0 || 251 if (start > 0 ||
252 actual_end >= PAGE_CACHE_SIZE || 252 actual_end > PAGE_CACHE_SIZE ||
253 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 253 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
254 (!compressed_size && 254 (!compressed_size &&
255 (actual_end & (root->sectorsize - 1)) == 0) || 255 (actual_end & (root->sectorsize - 1)) == 0) ||
256 end + 1 < isize || 256 end + 1 < isize ||
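The >= to > changes above are off-by-one fixes: data of exactly PAGE_CACHE_SIZE, or exactly BTRFS_MAX_INLINE_DATA_SIZE(root), was previously refused an inline extent. With 4K pages:

	/*
	 * actual_end == 4096:
	 *   old: actual_end >= PAGE_CACHE_SIZE -> true  -> no inline extent
	 *   new: actual_end >  PAGE_CACHE_SIZE -> false -> inlining allowed
	 * (the same applies to data_len vs BTRFS_MAX_INLINE_DATA_SIZE(root))
	 */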
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
348 return 0; 348 return 0;
349} 349}
350 350
351static inline int inode_need_compress(struct inode *inode)
352{
353 struct btrfs_root *root = BTRFS_I(inode)->root;
354
355 /* force compress */
356 if (btrfs_test_opt(root, FORCE_COMPRESS))
357 return 1;
358 /* bad compression ratios */
359 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
360 return 0;
361 if (btrfs_test_opt(root, COMPRESS) ||
362 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
363 BTRFS_I(inode)->force_compress)
364 return 1;
365 return 0;
366}
367
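Reading inode_need_compress() top to bottom gives the precedence of the compression knobs (force_compress is set by defragment when compression is requested):

	/*
	 * 1. compress-force mount option  -> compress, even over NOCOMPRESS
	 * 2. BTRFS_INODE_NOCOMPRESS       -> skip, bad ratios seen earlier
	 * 3. compress mount option, per-inode BTRFS_INODE_COMPRESS,
	 *    or force_compress            -> compress
	 * 4. otherwise                    -> don't
	 */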
351/* 368/*
352 * we create compressed extents in two phases. The first 369 * we create compressed extents in two phases. The first
353 * phase compresses a range of pages that have already been 370 * phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
444 * inode has not been flagged as nocompress. This flag can 461 * inode has not been flagged as nocompress. This flag can
445 * change at any time if we discover bad compression ratios. 462 * change at any time if we discover bad compression ratios.
446 */ 463 */
447 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 464 if (inode_need_compress(inode)) {
448 (btrfs_test_opt(root, COMPRESS) ||
449 (BTRFS_I(inode)->force_compress) ||
450 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
451 WARN_ON(pages); 465 WARN_ON(pages);
452 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 466 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
453 if (!pages) { 467 if (!pages) {
@@ -778,8 +792,12 @@ retry:
778 ins.offset, 792 ins.offset,
779 BTRFS_ORDERED_COMPRESSED, 793 BTRFS_ORDERED_COMPRESSED,
780 async_extent->compress_type); 794 async_extent->compress_type);
781 if (ret) 795 if (ret) {
796 btrfs_drop_extent_cache(inode, async_extent->start,
797 async_extent->start +
798 async_extent->ram_size - 1, 0);
782 goto out_free_reserve; 799 goto out_free_reserve;
800 }
783 801
784 /* 802 /*
785 * clear dirty, set writeback and unlock the pages. 803 * clear dirty, set writeback and unlock the pages.
@@ -971,14 +989,14 @@ static noinline int cow_file_range(struct inode *inode,
971 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 989 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
972 ram_size, cur_alloc_size, 0); 990 ram_size, cur_alloc_size, 0);
973 if (ret) 991 if (ret)
974 goto out_reserve; 992 goto out_drop_extent_cache;
975 993
976 if (root->root_key.objectid == 994 if (root->root_key.objectid ==
977 BTRFS_DATA_RELOC_TREE_OBJECTID) { 995 BTRFS_DATA_RELOC_TREE_OBJECTID) {
978 ret = btrfs_reloc_clone_csums(inode, start, 996 ret = btrfs_reloc_clone_csums(inode, start,
979 cur_alloc_size); 997 cur_alloc_size);
980 if (ret) 998 if (ret)
981 goto out_reserve; 999 goto out_drop_extent_cache;
982 } 1000 }
983 1001
984 if (disk_num_bytes < cur_alloc_size) 1002 if (disk_num_bytes < cur_alloc_size)
@@ -1006,6 +1024,8 @@ static noinline int cow_file_range(struct inode *inode,
1006out: 1024out:
1007 return ret; 1025 return ret;
1008 1026
1027out_drop_extent_cache:
1028 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1009out_reserve: 1029out_reserve:
1010 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1030 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1011out_unlock: 1031out_unlock:
@@ -1088,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1088 async_cow->locked_page = locked_page; 1108 async_cow->locked_page = locked_page;
1089 async_cow->start = start; 1109 async_cow->start = start;
1090 1110
1091 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1111 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1112 !btrfs_test_opt(root, FORCE_COMPRESS))
1092 cur_end = end; 1113 cur_end = end;
1093 else 1114 else
1094 cur_end = min(end, start + 512 * 1024 - 1); 1115 cur_end = min(end, start + 512 * 1024 - 1);
@@ -1096,8 +1117,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1096 async_cow->end = cur_end; 1117 async_cow->end = cur_end;
1097 INIT_LIST_HEAD(&async_cow->extents); 1118 INIT_LIST_HEAD(&async_cow->extents);
1098 1119
1099 btrfs_init_work(&async_cow->work, async_cow_start, 1120 btrfs_init_work(&async_cow->work,
1100 async_cow_submit, async_cow_free); 1121 btrfs_delalloc_helper,
1122 async_cow_start, async_cow_submit,
1123 async_cow_free);
1101 1124
1102 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1125 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1103 PAGE_CACHE_SHIFT; 1126 PAGE_CACHE_SHIFT;
@@ -1437,6 +1460,26 @@ error:
1437 return ret; 1460 return ret;
1438} 1461}
1439 1462
1463static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1464{
1465
1466 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1467 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1468 return 0;
1469
1470 /*
1471 * @defrag_bytes is a hint value, no spinlock held here,
1472 * if is not zero, it means the file is defragging.
1473 * Force cow if given extent needs to be defragged.
1474 */
1475 if (BTRFS_I(inode)->defrag_bytes &&
1476 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1477 EXTENT_DEFRAG, 0, NULL))
1478 return 1;
1479
1480 return 0;
1481}
1482
1440/* 1483/*
1441 * extent_io.c call back to do delayed allocation processing 1484 * extent_io.c call back to do delayed allocation processing
1442 */ 1485 */
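
Note: need_force_cow() is the new gate that lets defragmentation work on nodatacow and preallocated files. Those files normally skip COW, but an extent flagged EXTENT_DEFRAG must be rewritten. defrag_bytes is read without a lock, so it only serves as a cheap filter before the authoritative test_range_bit() check. A toy userspace model of that two-stage test (all names and the range check below are illustrative stand-ins, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_inode {
        unsigned int flags;
        uint64_t defrag_bytes;               /* lockless hint, like BTRFS_I(inode)->defrag_bytes */
        uint64_t defrag_start, defrag_end;   /* stand-in for the EXTENT_DEFRAG bits */
};

#define TOY_NODATACOW 0x1
#define TOY_PREALLOC  0x2

static bool range_marked_defrag(const struct toy_inode *inode, uint64_t start, uint64_t end)
{
        return start <= inode->defrag_end && end >= inode->defrag_start;
}

static int toy_need_force_cow(const struct toy_inode *inode, uint64_t start, uint64_t end)
{
        /* only nodatacow/prealloc writes can skip COW at all */
        if (!(inode->flags & (TOY_NODATACOW | TOY_PREALLOC)))
                return 0;
        /* cheap counter first, then the authoritative per-range check */
        if (inode->defrag_bytes && range_marked_defrag(inode, start, end))
                return 1;
        return 0;
}

int main(void)
{
        struct toy_inode inode = {
                .flags = TOY_NODATACOW,
                .defrag_bytes = 8192,
                .defrag_start = 0, .defrag_end = 8191,
        };
        printf("overlapping write forces cow: %d\n",
               toy_need_force_cow(&inode, 4096, 12287));
        printf("disjoint write keeps nocow:  %d\n",
               toy_need_force_cow(&inode, 65536, 69631));
        return 0;
}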
@@ -1445,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                              unsigned long *nr_written)
 {
         int ret;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int force_cow = need_force_cow(inode, start, end);
 
-        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 1, nr_written);
-        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 0, nr_written);
-        } else if (!btrfs_test_opt(root, COMPRESS) &&
-                   !(BTRFS_I(inode)->force_compress) &&
-                   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+        } else if (!inode_need_compress(inode)) {
                 ret = cow_file_range(inode, locked_page, start, end,
                                      page_started, nr_written, 1);
         } else {
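
Note: with the helper above, run_delalloc_range() becomes a four-way dispatch, and the order matters: forced COW overrides both nocow branches, and compression is only consulted last. A compact restatement of just that ordering, with invented names:

#include <stdio.h>

enum delalloc_path { NOCOW_ALWAYS, NOCOW_PREALLOC, PLAIN_COW, ASYNC_COMPRESS };

/* restatement of the dispatch order in run_delalloc_range() after this
 * hunk; names are made up, only the ordering mirrors the kernel code */
static enum delalloc_path pick_path(int nodatacow, int prealloc,
                                    int force_cow, int need_compress)
{
        if (nodatacow && !force_cow)
                return NOCOW_ALWAYS;
        if (prealloc && !force_cow)
                return NOCOW_PREALLOC;
        if (!need_compress)
                return PLAIN_COW;
        return ASYNC_COMPRESS;
}

int main(void)
{
        /* a nodatacow inode under defrag is forced through the COW path */
        printf("%d\n", pick_path(1, 0, 1, 0) == PLAIN_COW);
        return 0;
}

The design point is that need_force_cow() is computed once up front, so both nocow branches see the same answer for the whole range.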
@@ -1547,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1547 struct extent_state *state, unsigned long *bits) 1588 struct extent_state *state, unsigned long *bits)
1548{ 1589{
1549 1590
1591 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1592 WARN_ON(1);
1550 /* 1593 /*
1551 * set_bit and clear bit hooks normally require _irqsave/restore 1594 * set_bit and clear bit hooks normally require _irqsave/restore
1552 * but in this case, we are only testing for the DELALLOC 1595 * but in this case, we are only testing for the DELALLOC
@@ -1569,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1569 root->fs_info->delalloc_batch); 1612 root->fs_info->delalloc_batch);
1570 spin_lock(&BTRFS_I(inode)->lock); 1613 spin_lock(&BTRFS_I(inode)->lock);
1571 BTRFS_I(inode)->delalloc_bytes += len; 1614 BTRFS_I(inode)->delalloc_bytes += len;
1615 if (*bits & EXTENT_DEFRAG)
1616 BTRFS_I(inode)->defrag_bytes += len;
1572 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1617 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1573 &BTRFS_I(inode)->runtime_flags)) 1618 &BTRFS_I(inode)->runtime_flags))
1574 btrfs_add_delalloc_inodes(root, inode); 1619 btrfs_add_delalloc_inodes(root, inode);
@@ -1583,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1583 struct extent_state *state, 1628 struct extent_state *state,
1584 unsigned long *bits) 1629 unsigned long *bits)
1585{ 1630{
1631 u64 len = state->end + 1 - state->start;
1632
1633 spin_lock(&BTRFS_I(inode)->lock);
1634 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1635 BTRFS_I(inode)->defrag_bytes -= len;
1636 spin_unlock(&BTRFS_I(inode)->lock);
1637
1586 /* 1638 /*
1587 * set_bit and clear bit hooks normally require _irqsave/restore 1639 * set_bit and clear bit hooks normally require _irqsave/restore
1588 * but in this case, we are only testing for the DELALLOC 1640 * but in this case, we are only testing for the DELALLOC
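
Note: the set/clear bit hooks now keep BTRFS_I(inode)->defrag_bytes balanced: the set hook adds the state length when EXTENT_DEFRAG arrives with DELALLOC, and the clear hook subtracts it only when the state being cleared actually carried the bit. The WARN_ON(BTRFS_I(inode)->defrag_bytes) added to btrfs_destroy_inode() later in this diff depends on that symmetry. A minimal model of the invariant:

#include <assert.h>
#include <stdint.h>

/* toy model of the defrag_bytes balance kept by the set/clear bit hooks */
int main(void)
{
        uint64_t defrag_bytes = 0;
        uint64_t start = 4096, end = 12287;       /* an 8 KiB extent state */
        uint64_t len = end + 1 - start;           /* same formula as the hunk */

        defrag_bytes += len;                      /* set hook: EXTENT_DEFRAG set */
        defrag_bytes -= len;                      /* clear hook: bit actually cleared */
        assert(defrag_bytes == 0);                /* invariant checked at inode destroy */
        return 0;
}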
@@ -1590,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1590 */ 1642 */
1591 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1643 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1592 struct btrfs_root *root = BTRFS_I(inode)->root; 1644 struct btrfs_root *root = BTRFS_I(inode)->root;
1593 u64 len = state->end + 1 - state->start;
1594 bool do_list = !btrfs_is_free_space_inode(inode); 1645 bool do_list = !btrfs_is_free_space_inode(inode);
1595 1646
1596 if (*bits & EXTENT_FIRST_DELALLOC) { 1647 if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -1881,7 +1932,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 
         SetPageChecked(page);
         page_cache_get(page);
-        btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+                        btrfs_writepage_fixup_worker, NULL, NULL);
         fixup->page = page;
         btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
         return -EBUSY;
@@ -2651,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2651 goto out; 2703 goto out;
2652 } 2704 }
2653 2705
2706 btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2707 ordered_extent->file_offset +
2708 ordered_extent->len - 1);
2709
2654 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2710 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2655 truncated = true; 2711 truncated = true;
2656 logical_len = ordered_extent->truncated_len; 2712 logical_len = ordered_extent->truncated_len;
@@ -2822,7 +2878,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
         struct inode *inode = page->mapping->host;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_ordered_extent *ordered_extent = NULL;
-        struct btrfs_workqueue *workers;
+        struct btrfs_workqueue *wq;
+        btrfs_work_func_t func;
 
         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
@@ -2831,17 +2888,55 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                             end - start + 1, uptodate))
                 return 0;
 
-        btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+        if (btrfs_is_free_space_inode(inode)) {
+                wq = root->fs_info->endio_freespace_worker;
+                func = btrfs_freespace_write_helper;
+        } else {
+                wq = root->fs_info->endio_write_workers;
+                func = btrfs_endio_write_helper;
+        }
 
-        if (btrfs_is_free_space_inode(inode))
-                workers = root->fs_info->endio_freespace_worker;
-        else
-                workers = root->fs_info->endio_write_workers;
-        btrfs_queue_work(workers, &ordered_extent->work);
+        btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+                        NULL);
+        btrfs_queue_work(wq, &ordered_extent->work);
 
         return 0;
 }
 
+static int __readpage_endio_check(struct inode *inode,
+                                  struct btrfs_io_bio *io_bio,
+                                  int icsum, struct page *page,
+                                  int pgoff, u64 start, size_t len)
+{
+        char *kaddr;
+        u32 csum_expected;
+        u32 csum = ~(u32)0;
+        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                      DEFAULT_RATELIMIT_BURST);
+
+        csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+        kaddr = kmap_atomic(page);
+        csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+        btrfs_csum_final(csum, (char *)&csum);
+        if (csum != csum_expected)
+                goto zeroit;
+
+        kunmap_atomic(kaddr);
+        return 0;
+zeroit:
+        if (__ratelimit(&_rs))
+                btrfs_info(BTRFS_I(inode)->root->fs_info,
+                           "csum failed ino %llu off %llu csum %u expected csum %u",
+                           btrfs_ino(inode), start, csum, csum_expected);
+        memset(kaddr + pgoff, 1, len);
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr);
+        if (csum_expected == 0)
+                return 0;
+        return -EIO;
+}
+
 /*
  * when reads are done, we need to check csums to verify the data is correct
  * if there's a match, we allow the bio to finish. If not, the code in
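
Note: __readpage_endio_check() factors the per-block verification out of btrfs_readpage_end_io_hook() so the new direct-IO retry code later in this diff can reuse it: compute the checksum over the page range, compare against the stored value, and poison the buffer (memset to 1) on mismatch so stale data is never returned silently. A rough userspace analogue; zlib's plain CRC32 stands in here for the btrfs_csum_data()/btrfs_csum_final() pair, which is an illustrative substitution (btrfs actually uses CRC32C):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <zlib.h>

/* verify one block against its expected checksum; poison it on mismatch,
 * mirroring the memset(kaddr + pgoff, 1, len) in __readpage_endio_check() */
static int check_block(unsigned char *buf, size_t len, uint32_t expected)
{
        uint32_t csum = crc32(0L, buf, len);
        if (csum == expected)
                return 0;
        fprintf(stderr, "csum failed: got %u expected %u\n", csum, expected);
        memset(buf, 1, len);
        return -1;
}

int main(void)
{
        unsigned char block[4096] = "hello";
        uint32_t good = crc32(0L, block, sizeof(block));

        printf("intact: %d\n", check_block(block, sizeof(block), good));
        block[0] ^= 0xff;                        /* simulate corruption */
        printf("corrupt: %d\n", check_block(block, sizeof(block), good));
        return 0;
}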
@@ -2854,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         size_t offset = start - page_offset(page);
         struct inode *inode = page->mapping->host;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-        char *kaddr;
         struct btrfs_root *root = BTRFS_I(inode)->root;
-        u32 csum_expected;
-        u32 csum = ~(u32)0;
-        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                      DEFAULT_RATELIMIT_BURST);
 
         if (PageChecked(page)) {
                 ClearPageChecked(page);
-                goto good;
+                return 0;
         }
 
         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-                goto good;
+                return 0;
 
         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2877,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         }
 
         phy_offset >>= inode->i_sb->s_blocksize_bits;
-        csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
-        kaddr = kmap_atomic(page);
-        csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
-        btrfs_csum_final(csum, (char *)&csum);
-        if (csum != csum_expected)
-                goto zeroit;
-
-        kunmap_atomic(kaddr);
-good:
-        return 0;
-
-zeroit:
-        if (__ratelimit(&_rs))
-                btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                           btrfs_ino(page->mapping->host), start, csum, csum_expected);
-        memset(kaddr + offset, 1, end - start + 1);
-        flush_dcache_page(page);
-        kunmap_atomic(kaddr);
-        if (csum_expected == 0)
-                return 0;
-        return -EIO;
+        return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+                                      start, (size_t)(end - start + 1));
 }
 
 struct delayed_iput {
@@ -3145,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
         path->reada = -1;
 
         key.objectid = BTRFS_ORPHAN_OBJECTID;
-        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+        key.type = BTRFS_ORPHAN_ITEM_KEY;
         key.offset = (u64)-1;
 
         while (1) {
@@ -3172,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 /* make sure the item matches what we want */
                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                         break;
-                if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+                if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                         break;
 
                 /* release the path since we're done with it */
@@ -3648,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
          * without delay
          */
         if (!btrfs_is_free_space_inode(inode)
-            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+            && !root->fs_info->log_root_recovering) {
                 btrfs_update_root_times(trans, root);
 
                 ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4071,7 +4142,7 @@ search_again:
                 fi = NULL;
                 leaf = path->nodes[0];
                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                found_type = btrfs_key_type(&found_key);
+                found_type = found_key.type;
 
                 if (found_key.objectid != ino)
                         break;
@@ -4234,7 +4305,8 @@ out:
                 btrfs_abort_transaction(trans, root, ret);
         }
 error:
-        if (last_size != (u64)-1)
+        if (last_size != (u64)-1 &&
+            root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                 btrfs_ordered_update_i_size(inode, last_size, NULL);
         btrfs_free_path(path);
         return err;
@@ -4674,6 +4746,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
4674 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 4746 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4675 remove_extent_mapping(map_tree, em); 4747 remove_extent_mapping(map_tree, em);
4676 free_extent_map(em); 4748 free_extent_map(em);
4749 if (need_resched()) {
4750 write_unlock(&map_tree->lock);
4751 cond_resched();
4752 write_lock(&map_tree->lock);
4753 }
4677 } 4754 }
4678 write_unlock(&map_tree->lock); 4755 write_unlock(&map_tree->lock);
4679 4756
@@ -4696,6 +4773,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4696 &cached_state, GFP_NOFS); 4773 &cached_state, GFP_NOFS);
4697 free_extent_state(state); 4774 free_extent_state(state);
4698 4775
4776 cond_resched();
4699 spin_lock(&io_tree->lock); 4777 spin_lock(&io_tree->lock);
4700 } 4778 }
4701 spin_unlock(&io_tree->lock); 4779 spin_unlock(&io_tree->lock);
@@ -4726,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
4726 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 4804 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4727 btrfs_wait_ordered_range(inode, 0, (u64)-1); 4805 btrfs_wait_ordered_range(inode, 0, (u64)-1);
4728 4806
4807 btrfs_free_io_failure_record(inode, 0, (u64)-1);
4808
4729 if (root->fs_info->log_root_recovering) { 4809 if (root->fs_info->log_root_recovering) {
4730 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 4810 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4731 &BTRFS_I(inode)->runtime_flags)); 4811 &BTRFS_I(inode)->runtime_flags));
@@ -5181,6 +5261,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5181 iput(inode); 5261 iput(inode);
5182 inode = ERR_PTR(ret); 5262 inode = ERR_PTR(ret);
5183 } 5263 }
5264 /*
5265 * If orphan cleanup did remove any orphans, it means the tree
5266 * was modified and therefore the commit root is not the same as
5267 * the current root anymore. This is a problem, because send
5268 * uses the commit root and therefore can see inode items that
5269 * don't exist in the current root anymore, and for example make
5270 * calls to btrfs_iget, which will do tree lookups based on the
5271 * current root and not on the commit root. Those lookups will
5272 * fail, returning a -ESTALE error, and making send fail with
5273 * that error. So make sure a send does not see any orphans we
5274 * have just removed, and that it will see the same inodes
5275 * regardless of whether a transaction commit happened before
5276 * it started (meaning that the commit root will be the same as
5277 * the current root) or not.
5278 */
5279 if (sub_root->node != sub_root->commit_root) {
5280 u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
5281
5282 if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
5283 struct extent_buffer *eb;
5284
5285 /*
5286 * Assert we can't have races between dentry
5287 * lookup called through the snapshot creation
5288 * ioctl and the VFS.
5289 */
5290 ASSERT(mutex_is_locked(&dir->i_mutex));
5291
5292 down_write(&root->fs_info->commit_root_sem);
5293 eb = sub_root->commit_root;
5294 sub_root->commit_root =
5295 btrfs_root_node(sub_root);
5296 up_write(&root->fs_info->commit_root_sem);
5297 free_extent_buffer(eb);
5298 }
5299 }
5184 } 5300 }
5185 5301
5186 return inode; 5302 return inode;
@@ -5274,7 +5390,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                 btrfs_get_delayed_items(inode, &ins_list, &del_list);
         }
 
-        btrfs_set_key_type(&key, key_type);
+        key.type = key_type;
         key.offset = ctx->pos;
         key.objectid = btrfs_ino(inode);
 
@@ -5299,7 +5415,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
                 if (found_key.objectid != key.objectid)
                         break;
-                if (btrfs_key_type(&found_key) != key_type)
+                if (found_key.type != key_type)
                         break;
                 if (found_key.offset < ctx->pos)
                         goto next;
@@ -5511,7 +5627,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
         int ret;
 
         key.objectid = btrfs_ino(inode);
-        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+        key.type = BTRFS_DIR_INDEX_KEY;
         key.offset = (u64)-1;
 
         path = btrfs_alloc_path();
@@ -5543,7 +5659,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
         if (found_key.objectid != btrfs_ino(inode) ||
-            btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+            found_key.type != BTRFS_DIR_INDEX_KEY) {
                 BTRFS_I(inode)->index_cnt = 2;
                 goto out;
         }
@@ -5577,6 +5693,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
5577 return ret; 5693 return ret;
5578} 5694}
5579 5695
5696static int btrfs_insert_inode_locked(struct inode *inode)
5697{
5698 struct btrfs_iget_args args;
5699 args.location = &BTRFS_I(inode)->location;
5700 args.root = BTRFS_I(inode)->root;
5701
5702 return insert_inode_locked4(inode,
5703 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5704 btrfs_find_actor, &args);
5705}
5706
5580static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 5707static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5581 struct btrfs_root *root, 5708 struct btrfs_root *root,
5582 struct inode *dir, 5709 struct inode *dir,
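
Note: btrfs_insert_inode_locked() wraps insert_inode_locked4() with the same hash (inode number plus root) and comparison callback that btrfs lookups use, so a new inode becomes visible in the inode hash, locked and marked I_NEW, before any of its items exist on disk; concurrent lookups then wait in iget until unlock_new_inode(). A heavily stubbed sketch of the ordering the reworked creation paths follow (the stub bodies below are placeholders, not real VFS behaviour):

#include <stdio.h>

/* stubs standing in for the kernel pieces (illustrative only) */
static int toy_insert_inode_locked(void *inode) { return 0; }   /* hash + lock, I_NEW */
static void toy_unlock_new_inode(void *inode) { puts("I_NEW cleared, waiters woken"); }
static void toy_d_instantiate(void *dentry, void *inode) { puts("dentry wired up"); }

int main(void)
{
        int dummy_inode, dummy_dentry;

        /* order used by the reworked btrfs_new_inode()/creation paths:
         * publish the locked inode first, fill it in, then unlock */
        if (toy_insert_inode_locked(&dummy_inode) < 0)
                return 1;                /* the new fail_unlock path unlocks here */
        /* ... insert inode item, init security, add the directory entry ... */
        toy_unlock_new_inode(&dummy_inode);
        toy_d_instantiate(&dummy_dentry, &dummy_inode);
        return 0;
}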
@@ -5606,6 +5733,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5606 } 5733 }
5607 5734
5608 /* 5735 /*
5736 * O_TMPFILE, set link count to 0, so that after this point,
5737 * we fill in an inode item with the correct link count.
5738 */
5739 if (!name)
5740 set_nlink(inode, 0);
5741
5742 /*
5609 * we have to initialize this early, so we can reclaim the inode 5743 * we have to initialize this early, so we can reclaim the inode
5610 * number if we fail afterwards in this function. 5744 * number if we fail afterwards in this function.
5611 */ 5745 */
@@ -5643,7 +5777,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
         key[0].objectid = objectid;
-        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+        key[0].type = BTRFS_INODE_ITEM_KEY;
         key[0].offset = 0;
 
         sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5656,16 +5790,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
          * add more hard links than can fit in the ref item.
          */
         key[1].objectid = objectid;
-        btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+        key[1].type = BTRFS_INODE_REF_KEY;
         key[1].offset = ref_objectid;
 
         sizes[1] = name_len + sizeof(*ref);
         }
 
+        location = &BTRFS_I(inode)->location;
+        location->objectid = objectid;
+        location->offset = 0;
+        location->type = BTRFS_INODE_ITEM_KEY;
+
+        ret = btrfs_insert_inode_locked(inode);
+        if (ret < 0)
+                goto fail;
+
         path->leave_spinning = 1;
         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
         if (ret != 0)
-                goto fail;
+                goto fail_unlock;
 
         inode_init_owner(inode, dir, mode);
         inode_set_bytes(inode, 0);
@@ -5688,11 +5831,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5688 btrfs_mark_buffer_dirty(path->nodes[0]); 5831 btrfs_mark_buffer_dirty(path->nodes[0]);
5689 btrfs_free_path(path); 5832 btrfs_free_path(path);
5690 5833
5691 location = &BTRFS_I(inode)->location;
5692 location->objectid = objectid;
5693 location->offset = 0;
5694 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5695
5696 btrfs_inherit_iflags(inode, dir); 5834 btrfs_inherit_iflags(inode, dir);
5697 5835
5698 if (S_ISREG(mode)) { 5836 if (S_ISREG(mode)) {
@@ -5703,7 +5841,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5703 BTRFS_INODE_NODATASUM; 5841 BTRFS_INODE_NODATASUM;
5704 } 5842 }
5705 5843
5706 btrfs_insert_inode_hash(inode);
5707 inode_tree_add(inode); 5844 inode_tree_add(inode);
5708 5845
5709 trace_btrfs_inode_new(inode); 5846 trace_btrfs_inode_new(inode);
@@ -5718,6 +5855,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5718 btrfs_ino(inode), root->root_key.objectid, ret); 5855 btrfs_ino(inode), root->root_key.objectid, ret);
5719 5856
5720 return inode; 5857 return inode;
5858
5859fail_unlock:
5860 unlock_new_inode(inode);
5721fail: 5861fail:
5722 if (dir && name) 5862 if (dir && name)
5723 BTRFS_I(dir)->index_cnt--; 5863 BTRFS_I(dir)->index_cnt--;
@@ -5751,7 +5891,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
         } else {
                 key.objectid = ino;
-                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+                key.type = BTRFS_INODE_ITEM_KEY;
                 key.offset = 0;
         }
 
@@ -5852,28 +5992,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                 goto out_unlock;
         }
 
-        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-        if (err) {
-                drop_inode = 1;
-                goto out_unlock;
-        }
-
         /*
          * If the active LSM wants to access the inode during
          * d_instantiate it needs these. Smack checks to see
          * if the filesystem supports xattrs by looking at the
          * ops vector.
          */
-
         inode->i_op = &btrfs_special_inode_operations;
-        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+        init_special_inode(inode, inode->i_mode, rdev);
+
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
         if (err)
-                drop_inode = 1;
-        else {
-                init_special_inode(inode, inode->i_mode, rdev);
+                goto out_unlock_inode;
+
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+        if (err) {
+                goto out_unlock_inode;
+        } else {
                 btrfs_update_inode(trans, root, inode);
+                unlock_new_inode(inode);
                 d_instantiate(dentry, inode);
         }
+
 out_unlock:
         btrfs_end_transaction(trans, root);
         btrfs_balance_delayed_items(root);
@@ -5883,6 +6023,12 @@ out_unlock:
                 iput(inode);
         }
         return err;
+
+out_unlock_inode:
+        drop_inode = 1;
+        unlock_new_inode(inode);
+        goto out_unlock;
+
 }
 
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5917,15 +6063,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5917 goto out_unlock; 6063 goto out_unlock;
5918 } 6064 }
5919 drop_inode_on_err = 1; 6065 drop_inode_on_err = 1;
5920
5921 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5922 if (err)
5923 goto out_unlock;
5924
5925 err = btrfs_update_inode(trans, root, inode);
5926 if (err)
5927 goto out_unlock;
5928
5929 /* 6066 /*
5930 * If the active LSM wants to access the inode during 6067 * If the active LSM wants to access the inode during
5931 * d_instantiate it needs these. Smack checks to see 6068 * d_instantiate it needs these. Smack checks to see
@@ -5934,14 +6071,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
          */
         inode->i_fop = &btrfs_file_operations;
         inode->i_op = &btrfs_file_inode_operations;
+        inode->i_mapping->a_ops = &btrfs_aops;
+        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+        if (err)
+                goto out_unlock_inode;
+
+        err = btrfs_update_inode(trans, root, inode);
+        if (err)
+                goto out_unlock_inode;
 
         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
         if (err)
-                goto out_unlock;
+                goto out_unlock_inode;
 
-        inode->i_mapping->a_ops = &btrfs_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        unlock_new_inode(inode);
         d_instantiate(dentry, inode);
 
 out_unlock:
@@ -5953,6 +6099,11 @@ out_unlock:
5953 btrfs_balance_delayed_items(root); 6099 btrfs_balance_delayed_items(root);
5954 btrfs_btree_balance_dirty(root); 6100 btrfs_btree_balance_dirty(root);
5955 return err; 6101 return err;
6102
6103out_unlock_inode:
6104 unlock_new_inode(inode);
6105 goto out_unlock;
6106
5956} 6107}
5957 6108
5958static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6109static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6060,25 +6211,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         }
 
         drop_on_err = 1;
+        /* these must be set before we unlock the inode */
+        inode->i_op = &btrfs_dir_inode_operations;
+        inode->i_fop = &btrfs_dir_file_operations;
 
         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
         if (err)
-                goto out_fail;
-
-        inode->i_op = &btrfs_dir_inode_operations;
-        inode->i_fop = &btrfs_dir_file_operations;
+                goto out_fail_inode;
 
         btrfs_i_size_write(inode, 0);
         err = btrfs_update_inode(trans, root, inode);
         if (err)
-                goto out_fail;
+                goto out_fail_inode;
 
         err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
                              dentry->d_name.len, 0, index);
         if (err)
-                goto out_fail;
+                goto out_fail_inode;
 
         d_instantiate(dentry, inode);
+        /*
+         * mkdir is special. We're unlocking after we call d_instantiate
+         * to avoid a race with nfsd calling d_instantiate.
+         */
+        unlock_new_inode(inode);
         drop_on_err = 0;
 
 out_fail:
@@ -6088,23 +6244,66 @@ out_fail:
         btrfs_balance_delayed_items(root);
         btrfs_btree_balance_dirty(root);
         return err;
+
+out_fail_inode:
+        unlock_new_inode(inode);
+        goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+        struct rb_node *next;
+
+        next = rb_next(&em->rb_node);
+        if (!next)
+                return NULL;
+        return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+        struct rb_node *prev;
+
+        prev = rb_prev(&em->rb_node);
+        if (!prev)
+                return NULL;
+        return container_of(prev, struct extent_map, rb_node);
 }
 
 /* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
  * and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
  */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                 struct extent_map *existing,
                                 struct extent_map *em,
-                                u64 map_start, u64 map_len)
+                                u64 map_start)
 {
+        struct extent_map *prev;
+        struct extent_map *next;
+        u64 start;
+        u64 end;
         u64 start_diff;
 
         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-        start_diff = map_start - em->start;
-        em->start = map_start;
-        em->len = map_len;
+
+        if (existing->start > map_start) {
+                next = existing;
+                prev = prev_extent_map(next);
+        } else {
+                prev = existing;
+                next = next_extent_map(prev);
+        }
+
+        start = prev ? extent_map_end(prev) : em->start;
+        start = max_t(u64, start, em->start);
+        end = next ? next->start : extent_map_end(em);
+        end = min_t(u64, end, extent_map_end(em));
+        start_diff = start - em->start;
+        em->start = start;
+        em->len = end - start;
         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                 em->block_start += start_diff;
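
Note: merge_extent_mapping() no longer blindly trims the new extent map to [map_start, map_start + map_len); it now clamps it to the hole between the rb-tree neighbours of the nearest existing map, which is what makes the "best fitted" insertion safe against overlaps on both sides. A worked example of the clamping arithmetic, with assumed offsets:

#include <stdio.h>
#include <stdint.h>

#define MAX_T(a, b) ((a) > (b) ? (a) : (b))
#define MIN_T(a, b) ((a) < (b) ? (a) : (b))

/* model of the clamping math in the reworked merge_extent_mapping():
 * the new em is trimmed to the hole between prev and next */
int main(void)
{
        /* assumed layout: prev covers [0, 8K), next starts at 20K,
         * and the em we want to insert covers [4K, 24K) */
        uint64_t prev_end = 8192, next_start = 20480;
        uint64_t em_start = 4096, em_end = 24576;      /* extent_map_end(em) */

        uint64_t start = MAX_T(prev_end, em_start);
        uint64_t end = MIN_T(next_start, em_end);
        uint64_t start_diff = start - em_start;

        /* em is clamped to [8K, 20K); block_start advances by start_diff */
        printf("em trimmed to [%llu, %llu), start_diff=%llu\n",
               (unsigned long long)start, (unsigned long long)end,
               (unsigned long long)start_diff);
        return 0;
}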
@@ -6232,7 +6431,7 @@ again:
                                     struct btrfs_file_extent_item);
                 /* are we inside the extent that was found? */
                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                found_type = btrfs_key_type(&found_key);
+                found_type = found_key.type;
                 if (found_key.objectid != objectid ||
                     found_type != BTRFS_EXTENT_DATA_KEY) {
                         /*
@@ -6275,6 +6474,8 @@ next:
6275 goto not_found; 6474 goto not_found;
6276 if (start + len <= found_key.offset) 6475 if (start + len <= found_key.offset)
6277 goto not_found; 6476 goto not_found;
6477 if (start > found_key.offset)
6478 goto next;
6278 em->start = start; 6479 em->start = start;
6279 em->orig_start = start; 6480 em->orig_start = start;
6280 em->len = found_key.offset - start; 6481 em->len = found_key.offset - start;
@@ -6379,26 +6580,21 @@ insert:
 
         ret = 0;
 
-        existing = lookup_extent_mapping(em_tree, start, len);
-        if (existing && (existing->start > start ||
-            existing->start + existing->len <= start)) {
+        existing = search_extent_mapping(em_tree, start, len);
+        /*
+         * existing will always be non-NULL, since there must be
+         * extent causing the -EEXIST.
+         */
+        if (start >= extent_map_end(existing) ||
+            start <= existing->start) {
+                /*
+                 * The existing extent map is the one nearest to
+                 * the [start, start + len) range which overlaps
+                 */
+                err = merge_extent_mapping(em_tree, existing,
+                                           em, start);
                 free_extent_map(existing);
-                existing = NULL;
-        }
-        if (!existing) {
-                existing = lookup_extent_mapping(em_tree, em->start,
-                                                 em->len);
-                if (existing) {
-                        err = merge_extent_mapping(em_tree, existing,
-                                                   em, start,
-                                                   root->sectorsize);
-                        free_extent_map(existing);
-                        if (err) {
-                                free_extent_map(em);
-                                em = NULL;
-                        }
-                } else {
-                        err = -EIO;
+                if (err) {
                         free_extent_map(em);
                         em = NULL;
                 }
@@ -7010,8 +7206,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                        block_start, len,
                                        orig_block_len,
                                        ram_bytes, type);
-                if (IS_ERR(em))
+                if (IS_ERR(em)) {
+                        ret = PTR_ERR(em);
                         goto unlock_err;
+                }
         }
 
         ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7086,45 +7284,277 @@ unlock_err:
         return ret;
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
-{
-        struct btrfs_dio_private *dip = bio->bi_private;
-        struct bio_vec *bvec;
-        struct inode *inode = dip->inode;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct bio *dio_bio;
-        u32 *csums = (u32 *)dip->csum;
-        u64 start;
-        int i;
-
-        start = dip->logical_offset;
-        bio_for_each_segment_all(bvec, bio, i) {
-                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                        struct page *page = bvec->bv_page;
-                        char *kaddr;
-                        u32 csum = ~(u32)0;
-                        unsigned long flags;
-
-                        local_irq_save(flags);
-                        kaddr = kmap_atomic(page);
-                        csum = btrfs_csum_data(kaddr + bvec->bv_offset,
-                                               csum, bvec->bv_len);
-                        btrfs_csum_final(csum, (char *)&csum);
-                        kunmap_atomic(kaddr);
-                        local_irq_restore(flags);
-
-                        flush_dcache_page(bvec->bv_page);
-                        if (csum != csums[i]) {
-                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                                          btrfs_ino(inode), start, csum,
-                                          csums[i]);
-                                err = -EIO;
-                        }
-                }
-
-                start += bvec->bv_len;
-        }
-
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                        int rw, int mirror_num)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+
+        BUG_ON(rw & REQ_WRITE);
+
+        bio_get(bio);
+
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                  BTRFS_WQ_ENDIO_DIO_REPAIR);
+        if (ret)
+                goto err;
+
+        ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+        bio_put(bio);
+        return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                      struct bio *failed_bio,
+                                      struct io_failure_record *failrec,
+                                      int failed_mirror)
+{
+        int num_copies;
+
+        num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                      failrec->logical, failrec->len);
+        if (num_copies == 1) {
+                /*
+                 * we only have a single copy of the data, so don't bother with
+                 * all the retry and error correction code that follows. no
+                 * matter what the error is, it is very likely to persist.
+                 */
+                pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                         num_copies, failrec->this_mirror, failed_mirror);
+                return 0;
+        }
+
+        failrec->failed_mirror = failed_mirror;
+        failrec->this_mirror++;
+        if (failrec->this_mirror == failed_mirror)
+                failrec->this_mirror++;
+
+        if (failrec->this_mirror > num_copies) {
+                pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                         num_copies, failrec->this_mirror, failed_mirror);
+                return 0;
+        }
+
+        return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                          struct page *page, u64 start, u64 end,
+                          int failed_mirror, bio_end_io_t *repair_endio,
+                          void *repair_arg)
+{
+        struct io_failure_record *failrec;
+        struct bio *bio;
+        int isector;
+        int read_mode;
+        int ret;
+
+        BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+        if (ret)
+                return ret;
+
+        ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                         failed_mirror);
+        if (!ret) {
+                free_io_failure(inode, failrec);
+                return -EIO;
+        }
+
+        if (failed_bio->bi_vcnt > 1)
+                read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+        else
+                read_mode = READ_SYNC;
+
+        isector = start - btrfs_io_bio(failed_bio)->logical;
+        isector >>= inode->i_sb->s_blocksize_bits;
+        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                      0, isector, repair_endio, repair_arg);
+        if (!bio) {
+                free_io_failure(inode, failrec);
+                return -EIO;
+        }
+
+        btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+                    read_mode, failrec->this_mirror, failrec->in_validation);
+
+        ret = submit_dio_repair_bio(inode, bio, read_mode,
+                                    failrec->this_mirror);
+        if (ret) {
+                free_io_failure(inode, failrec);
+                bio_put(bio);
+        }
+
+        return ret;
+}
+
+struct btrfs_retry_complete {
+        struct completion done;
+        struct inode *inode;
+        u64 start;
+        int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+        struct btrfs_retry_complete *done = bio->bi_private;
+        struct bio_vec *bvec;
+        int i;
+
+        if (err)
+                goto end;
+
+        done->uptodate = 1;
+        bio_for_each_segment_all(bvec, bio, i)
+                clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+        complete(&done->done);
+        bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                       struct btrfs_io_bio *io_bio)
+{
+        struct bio_vec *bvec;
+        struct btrfs_retry_complete done;
+        u64 start;
+        int i;
+        int ret;
+
+        start = io_bio->logical;
+        done.inode = inode;
+
+        bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+                done.uptodate = 0;
+                done.start = start;
+                init_completion(&done.done);
+
+                ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                     start + bvec->bv_len - 1,
+                                     io_bio->mirror_num,
+                                     btrfs_retry_endio_nocsum, &done);
+                if (ret)
+                        return ret;
+
+                wait_for_completion(&done.done);
+
+                if (!done.uptodate) {
+                        /* We might have another mirror, so try again */
+                        goto try_again;
+                }
+
+                start += bvec->bv_len;
+        }
+
+        return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+        struct btrfs_retry_complete *done = bio->bi_private;
+        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+        struct bio_vec *bvec;
+        int uptodate;
+        int ret;
+        int i;
+
+        if (err)
+                goto end;
+
+        uptodate = 1;
+        bio_for_each_segment_all(bvec, bio, i) {
+                ret = __readpage_endio_check(done->inode, io_bio, i,
+                                             bvec->bv_page, 0,
+                                             done->start, bvec->bv_len);
+                if (!ret)
+                        clean_io_failure(done->inode, done->start,
+                                         bvec->bv_page, 0);
+                else
+                        uptodate = 0;
+        }
+
+        done->uptodate = uptodate;
+end:
+        complete(&done->done);
+        bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                    struct btrfs_io_bio *io_bio, int err)
+{
+        struct bio_vec *bvec;
+        struct btrfs_retry_complete done;
+        u64 start;
+        u64 offset = 0;
+        int i;
+        int ret;
+
+        err = 0;
+        start = io_bio->logical;
+        done.inode = inode;
+
+        bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+                                             0, start, bvec->bv_len);
+                if (likely(!ret))
+                        goto next;
+try_again:
+                done.uptodate = 0;
+                done.start = start;
+                init_completion(&done.done);
+
+                ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                     start + bvec->bv_len - 1,
+                                     io_bio->mirror_num,
+                                     btrfs_retry_endio, &done);
+                if (ret) {
+                        err = ret;
+                        goto next;
+                }
+
+                wait_for_completion(&done.done);
+
+                if (!done.uptodate) {
+                        /* We might have another mirror, so try again */
+                        goto try_again;
+                }
next:
+                offset += bvec->bv_len;
+                start += bvec->bv_len;
+        }
+
+        return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+                                  struct btrfs_io_bio *io_bio, int err)
+{
+        bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+        if (skip_csum) {
+                if (unlikely(err))
+                        return __btrfs_correct_data_nocsum(inode, io_bio);
+                else
+                        return 0;
+        } else {
+                return __btrfs_subio_endio_read(inode, io_bio, err);
+        }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+        struct btrfs_dio_private *dip = bio->bi_private;
+        struct inode *inode = dip->inode;
+        struct bio *dio_bio;
+        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+        if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+                err = btrfs_subio_endio_read(inode, io_bio, err);
+
         unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                       dip->logical_offset + dip->bytes - 1);
         dio_bio = dip->dio_bio;
@@ -7135,6 +7565,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
7135 if (err) 7565 if (err)
7136 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 7566 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7137 dio_end_io(dio_bio, err); 7567 dio_end_io(dio_bio, err);
7568
7569 if (io_bio->end_io)
7570 io_bio->end_io(io_bio, err);
7138 bio_put(bio); 7571 bio_put(bio);
7139} 7572}
7140 7573
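
Note: the large block above adds read repair for direct IO, mirroring the buffered read path: on a failed or checksum-bad block, dio_read_error() builds a one-page repair bio and resubmits it against another copy, and btrfs_check_dio_repairable() decides which mirror to try next. A userspace rendering of the mirror-stepping rule, assuming this_mirror starts at 0 for a fresh io_failure_record (that initial value is an assumption; it is set outside this hunk):

#include <stdio.h>

/* userspace model of the stepping in btrfs_check_dio_repairable():
 * skip the mirror that failed, give up once every copy has been tried */
static int next_mirror(int *this_mirror, int failed_mirror, int num_copies)
{
        if (num_copies == 1)
                return 0;                /* nothing else to read from */
        (*this_mirror)++;
        if (*this_mirror == failed_mirror)
                (*this_mirror)++;
        return *this_mirror <= num_copies;
}

int main(void)
{
        int this_mirror = 0, failed_mirror = 1, num_copies = 2;

        /* RAID1: mirror 1 failed, so the retry goes straight to mirror 2 */
        while (next_mirror(&this_mirror, failed_mirror, num_copies))
                printf("retrying from mirror %d\n", this_mirror);
        return 0;
}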
@@ -7158,7 +7591,8 @@ again:
         if (!ret)
                 goto out_test;
 
-        btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+        btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+                        finish_ordered_fn, NULL, NULL);
         btrfs_queue_work(root->fs_info->endio_write_workers,
                          &ordered->work);
 out_test:
@@ -7199,12 +7633,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
         struct btrfs_dio_private *dip = bio->bi_private;
 
+        if (err)
+                btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                           "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+                           btrfs_ino(dip->inode), bio->bi_rw,
+                           (unsigned long long)bio->bi_iter.bi_sector,
+                           bio->bi_iter.bi_size, err);
+
+        if (dip->subio_endio)
+                err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
         if (err) {
-                btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-                          btrfs_ino(dip->inode), bio->bi_rw,
-                          (unsigned long long)bio->bi_iter.bi_sector,
-                          bio->bi_iter.bi_size, err);
                 dip->errors = 1;
 
                 /*
@@ -7235,6 +7674,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7235 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 7674 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7236} 7675}
7237 7676
7677static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7678 struct inode *inode,
7679 struct btrfs_dio_private *dip,
7680 struct bio *bio,
7681 u64 file_offset)
7682{
7683 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7684 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7685 int ret;
7686
7687 /*
7688 * We load all the csum data we need when we submit
7689 * the first bio to reduce the csum tree search and
7690 * contention.
7691 */
7692 if (dip->logical_offset == file_offset) {
7693 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7694 file_offset);
7695 if (ret)
7696 return ret;
7697 }
7698
7699 if (bio == dip->orig_bio)
7700 return 0;
7701
7702 file_offset -= dip->logical_offset;
7703 file_offset >>= inode->i_sb->s_blocksize_bits;
7704 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7705
7706 return 0;
7707}
7708
7238static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 7709static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7239 int rw, u64 file_offset, int skip_sum, 7710 int rw, u64 file_offset, int skip_sum,
7240 int async_submit) 7711 int async_submit)
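
Note: btrfs_lookup_and_bind_dio_csum() loads the checksums for the whole original bio once, then points each split bio at its slice of the parent's csum array; the index is just the byte distance from the start of the DIO, converted to blocks. A worked example of that pointer arithmetic, assuming 4 KiB blocks and 4-byte crc32c items:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* assumptions for illustration: 4 KiB blocks, 4-byte csum items */
        unsigned int blocksize_bits = 12;
        uint32_t csums[8] = {0};                 /* parent bio's csum array */

        uint64_t dip_logical_offset = 1048576;   /* dio starts at 1 MiB */
        uint64_t file_offset = 1048576 + 16384;  /* split bio starts 16 KiB in */

        /* same arithmetic as btrfs_lookup_and_bind_dio_csum() */
        uint64_t idx = (file_offset - dip_logical_offset) >> blocksize_bits;
        uint32_t *slice = csums + idx;

        /* the split bio's csums start at entry 4 of the parent array */
        printf("csum slice starts at index %llu (%p)\n",
               (unsigned long long)idx, (void *)slice);
        return 0;
}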
@@ -7250,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
         bio_get(bio);
 
         if (!write) {
-                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+                ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                          BTRFS_WQ_ENDIO_DATA);
                 if (ret)
                         goto err;
         }
@@ -7273,13 +7745,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
                 if (ret)
                         goto err;
-        } else if (!skip_sum) {
-                ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
+        } else {
+                ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
                                                 file_offset);
                 if (ret)
                         goto err;
         }
-
 map:
         ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
 err:
@@ -7300,19 +7771,18 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
         u64 submit_len = 0;
         u64 map_length;
         int nr_pages = 0;
-        int ret = 0;
+        int ret;
         int async_submit = 0;
 
         map_length = orig_bio->bi_iter.bi_size;
         ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                               &map_length, NULL, 0);
-        if (ret) {
-                bio_put(orig_bio);
+        if (ret)
                 return -EIO;
-        }
 
         if (map_length >= orig_bio->bi_iter.bi_size) {
                 bio = orig_bio;
+                dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
                 goto submit;
         }
 
@@ -7326,14 +7796,16 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
         if (!bio)
                 return -ENOMEM;
+
         bio->bi_private = dip;
         bio->bi_end_io = btrfs_end_dio_bio;
+        btrfs_io_bio(bio)->logical = file_offset;
         atomic_inc(&dip->pending_bios);
 
         while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
-                if (unlikely(map_length < submit_len + bvec->bv_len ||
+                if (map_length < submit_len + bvec->bv_len ||
                     bio_add_page(bio, bvec->bv_page, bvec->bv_len,
-                                 bvec->bv_offset) < bvec->bv_len)) {
+                                 bvec->bv_offset) < bvec->bv_len) {
                         /*
                          * inc the count before we submit the bio so
                          * we know the end IO handler won't happen before
@@ -7362,6 +7834,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                                 goto out_err;
                         bio->bi_private = dip;
                         bio->bi_end_io = btrfs_end_dio_bio;
+                        btrfs_io_bio(bio)->logical = file_offset;
 
                         map_length = orig_bio->bi_iter.bi_size;
                         ret = btrfs_map_block(root->fs_info, rw,
@@ -7405,11 +7878,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_dio_private *dip;
         struct bio *io_bio;
+        struct btrfs_io_bio *btrfs_bio;
         int skip_sum;
-        int sum_len;
         int write = rw & REQ_WRITE;
         int ret = 0;
-        u16 csum_size;
 
         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -7419,16 +7891,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                 goto free_ordered;
         }
 
-        if (!skip_sum && !write) {
-                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-                sum_len = dio_bio->bi_iter.bi_size >>
-                        inode->i_sb->s_blocksize_bits;
-                sum_len *= csum_size;
-        } else {
-                sum_len = 0;
-        }
-
-        dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+        dip = kzalloc(sizeof(*dip), GFP_NOFS);
         if (!dip) {
                 ret = -ENOMEM;
                 goto free_io_bio;
@@ -7440,20 +7903,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
         dip->bytes = dio_bio->bi_iter.bi_size;
         dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
         io_bio->bi_private = dip;
-        dip->errors = 0;
         dip->orig_bio = io_bio;
         dip->dio_bio = dio_bio;
         atomic_set(&dip->pending_bios, 0);
+        btrfs_bio = btrfs_io_bio(io_bio);
+        btrfs_bio->logical = file_offset;
 
-        if (write)
+        if (write) {
                 io_bio->bi_end_io = btrfs_endio_direct_write;
-        else
+        } else {
                 io_bio->bi_end_io = btrfs_endio_direct_read;
+                dip->subio_endio = btrfs_subio_endio_read;
+        }
 
         ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
         if (!ret)
                 return;
 
+        if (btrfs_bio->end_io)
+                btrfs_bio->end_io(btrfs_bio, ret);
 free_io_bio:
         bio_put(io_bio);
 
@@ -7534,7 +8002,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
         count = iov_iter_count(iter);
         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                      &BTRFS_I(inode)->runtime_flags))
-                filemap_fdatawrite_range(inode->i_mapping, offset, count);
+                filemap_fdatawrite_range(inode->i_mapping, offset,
+                                         offset + count - 1);
 
         if (rw & WRITE) {
                 /*
@@ -7549,8 +8018,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                 ret = btrfs_delalloc_reserve_space(inode, count);
                 if (ret)
                         goto out;
-        } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
-                                     &BTRFS_I(inode)->runtime_flags))) {
+        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                            &BTRFS_I(inode)->runtime_flags)) {
                 inode_dio_done(inode);
                 flags = DIO_LOCKING | DIO_SKIP_HOLES;
                 wakeup = false;
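
Note: filemap_fdatawrite_range() takes an inclusive last-byte offset, not a length, so the old call flushed the range [offset, count] and did essentially nothing for writes at offsets beyond count; the fix passes offset + count - 1. A trivial illustration with assumed numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* a 1 MiB direct write at offset 64 MiB */
        uint64_t offset = 64ULL << 20, count = 1ULL << 20;

        /* old call: lend = count = 1 MiB, i.e. the byte range [64M, 1M],
         * which is empty, so nothing was flushed ahead of the DIO */
        printf("old: [%llu, %llu]\n", (unsigned long long)offset,
               (unsigned long long)count);

        /* fixed call: lend is the last byte of the write, inclusive */
        printf("new: [%llu, %llu]\n", (unsigned long long)offset,
               (unsigned long long)(offset + count - 1));
        return 0;
}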
@@ -8041,6 +8510,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8041 8510
8042 set_nlink(inode, 1); 8511 set_nlink(inode, 1);
8043 btrfs_i_size_write(inode, 0); 8512 btrfs_i_size_write(inode, 0);
8513 unlock_new_inode(inode);
8044 8514
8045 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8515 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8046 if (err) 8516 if (err)
@@ -8069,6 +8539,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8069 ei->last_sub_trans = 0; 8539 ei->last_sub_trans = 0;
8070 ei->logged_trans = 0; 8540 ei->logged_trans = 0;
8071 ei->delalloc_bytes = 0; 8541 ei->delalloc_bytes = 0;
8542 ei->defrag_bytes = 0;
8072 ei->disk_i_size = 0; 8543 ei->disk_i_size = 0;
8073 ei->flags = 0; 8544 ei->flags = 0;
8074 ei->csum_bytes = 0; 8545 ei->csum_bytes = 0;
@@ -8127,6 +8598,7 @@ void btrfs_destroy_inode(struct inode *inode)
8127 WARN_ON(BTRFS_I(inode)->reserved_extents); 8598 WARN_ON(BTRFS_I(inode)->reserved_extents);
8128 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 8599 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8129 WARN_ON(BTRFS_I(inode)->csum_bytes); 8600 WARN_ON(BTRFS_I(inode)->csum_bytes);
8601 WARN_ON(BTRFS_I(inode)->defrag_bytes);
8130 8602
8131 /* 8603 /*
8132 * This can happen where we create an inode, but somebody else also 8604 * This can happen where we create an inode, but somebody else also
@@ -8495,7 +8967,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
         work->inode = inode;
         work->wait = wait;
         work->delay_iput = delay_iput;
-        btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+        WARN_ON_ONCE(!inode);
+        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+                        btrfs_run_delalloc_work, NULL, NULL);
 
         return work;
 }
@@ -8540,7 +9014,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
                 spin_unlock(&root->delalloc_lock);
 
                 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-                if (unlikely(!work)) {
+                if (!work) {
                         if (delay_iput)
                                 btrfs_add_delayed_iput(inode);
                         else
@@ -8699,12 +9173,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8699 goto out_unlock; 9173 goto out_unlock;
8700 } 9174 }
8701 9175
8702 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8703 if (err) {
8704 drop_inode = 1;
8705 goto out_unlock;
8706 }
8707
8708 /* 9176 /*
8709 * If the active LSM wants to access the inode during 9177 * If the active LSM wants to access the inode during
8710 * d_instantiate it needs these. Smack checks to see 9178 * d_instantiate it needs these. Smack checks to see
@@ -8713,34 +9181,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8713 */ 9181 */
8714 inode->i_fop = &btrfs_file_operations; 9182 inode->i_fop = &btrfs_file_operations;
8715 inode->i_op = &btrfs_file_inode_operations; 9183 inode->i_op = &btrfs_file_inode_operations;
9184 inode->i_mapping->a_ops = &btrfs_aops;
9185 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9186 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9187
9188 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
9189 if (err)
9190 goto out_unlock_inode;
8716 9191
8717 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 9192 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8718 if (err) 9193 if (err)
8719 drop_inode = 1; 9194 goto out_unlock_inode;
8720 else {
8721 inode->i_mapping->a_ops = &btrfs_aops;
8722 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8723 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8724 }
8725 if (drop_inode)
8726 goto out_unlock;
8727 9195
8728 path = btrfs_alloc_path(); 9196 path = btrfs_alloc_path();
8729 if (!path) { 9197 if (!path) {
8730 err = -ENOMEM; 9198 err = -ENOMEM;
8731 drop_inode = 1; 9199 goto out_unlock_inode;
8732 goto out_unlock;
8733 } 9200 }
8734 key.objectid = btrfs_ino(inode); 9201 key.objectid = btrfs_ino(inode);
8735 key.offset = 0; 9202 key.offset = 0;
8736 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 9203 key.type = BTRFS_EXTENT_DATA_KEY;
8737 datasize = btrfs_file_extent_calc_inline_size(name_len); 9204 datasize = btrfs_file_extent_calc_inline_size(name_len);
8738 err = btrfs_insert_empty_item(trans, root, path, &key, 9205 err = btrfs_insert_empty_item(trans, root, path, &key,
8739 datasize); 9206 datasize);
8740 if (err) { 9207 if (err) {
8741 drop_inode = 1;
8742 btrfs_free_path(path); 9208 btrfs_free_path(path);
8743 goto out_unlock; 9209 goto out_unlock_inode;
8744 } 9210 }
8745 leaf = path->nodes[0]; 9211 leaf = path->nodes[0];
8746 ei = btrfs_item_ptr(leaf, path->slots[0], 9212 ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8764,12 +9230,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8764 inode_set_bytes(inode, name_len); 9230 inode_set_bytes(inode, name_len);
8765 btrfs_i_size_write(inode, name_len); 9231 btrfs_i_size_write(inode, name_len);
8766 err = btrfs_update_inode(trans, root, inode); 9232 err = btrfs_update_inode(trans, root, inode);
8767 if (err) 9233 if (err) {
8768 drop_inode = 1; 9234 drop_inode = 1;
9235 goto out_unlock_inode;
9236 }
9237
9238 unlock_new_inode(inode);
9239 d_instantiate(dentry, inode);
8769 9240
8770out_unlock: 9241out_unlock:
8771 if (!err)
8772 d_instantiate(dentry, inode);
8773 btrfs_end_transaction(trans, root); 9242 btrfs_end_transaction(trans, root);
8774 if (drop_inode) { 9243 if (drop_inode) {
8775 inode_dec_link_count(inode); 9244 inode_dec_link_count(inode);
@@ -8777,6 +9246,11 @@ out_unlock:
8777 } 9246 }
8778 btrfs_btree_balance_dirty(root); 9247 btrfs_btree_balance_dirty(root);
8779 return err; 9248 return err;
9249
9250out_unlock_inode:
9251 drop_inode = 1;
9252 unlock_new_inode(inode);
9253 goto out_unlock;
8780} 9254}
8781 9255
8782static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9256static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
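The rework above replaces the drop_inode flag juggling with a single out_unlock_inode label: the inode stays locked (I_NEW) while the security attributes, directory entry and inline extent item are set up, and every failure funnels through one exit that marks the inode for dropping and unlocks it exactly once. A compressed userspace sketch of that unwind shape (labels and step contents are hypothetical):

#include <errno.h>
#include <stdio.h>

static int step(int fail) { return fail ? -ENOMEM : 0; }

/* Mirrors the btrfs_symlink() flow: do all setup while the inode is
 * "locked", and funnel every failure through one unlock-and-drop exit. */
static int create_symlink(int fail_at)
{
    int err, drop = 0;

    err = step(fail_at == 1);            /* init security xattrs   */
    if (err)
        goto out_unlock_inode;
    err = step(fail_at == 2);            /* add directory entry    */
    if (err)
        goto out_unlock_inode;
    err = step(fail_at == 3);            /* insert inline extent   */
    if (err)
        goto out_unlock_inode;

    printf("unlock_new_inode + d_instantiate\n");
out:
    if (drop)
        printf("inode_dec_link_count + iput\n");
    return err;

out_unlock_inode:
    drop = 1;
    printf("unlock_new_inode (error path)\n");
    goto out;
}

int main(void)
{
    return create_symlink(2) == -ENOMEM ? 0 : 1;
}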
@@ -8960,14 +9434,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8960 goto out; 9434 goto out;
8961 } 9435 }
8962 9436
8963 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8964 if (ret)
8965 goto out;
8966
8967 ret = btrfs_update_inode(trans, root, inode);
8968 if (ret)
8969 goto out;
8970
8971 inode->i_fop = &btrfs_file_operations; 9437 inode->i_fop = &btrfs_file_operations;
8972 inode->i_op = &btrfs_file_inode_operations; 9438 inode->i_op = &btrfs_file_inode_operations;
8973 9439
@@ -8975,10 +9441,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8975 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9441 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8976 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9442 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8977 9443
9444 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9445 if (ret)
9446 goto out_inode;
9447
9448 ret = btrfs_update_inode(trans, root, inode);
9449 if (ret)
9450 goto out_inode;
8978 ret = btrfs_orphan_add(trans, inode); 9451 ret = btrfs_orphan_add(trans, inode);
8979 if (ret) 9452 if (ret)
8980 goto out; 9453 goto out_inode;
8981 9454
9455 /*
 9456 * We set the number of links to 0 in btrfs_new_inode(), and here we
 9457 * set it to 1 because d_tmpfile() decrements the link count and would
 9458 * warn on underflow, via:
9459 *
9460 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9461 */
9462 set_nlink(inode, 1);
9463 unlock_new_inode(inode);
8982 d_tmpfile(dentry, inode); 9464 d_tmpfile(dentry, inode);
8983 mark_inode_dirty(inode); 9465 mark_inode_dirty(inode);
8984 9466
@@ -8988,8 +9470,12 @@ out:
8988 iput(inode); 9470 iput(inode);
8989 btrfs_balance_delayed_items(root); 9471 btrfs_balance_delayed_items(root);
8990 btrfs_btree_balance_dirty(root); 9472 btrfs_btree_balance_dirty(root);
8991
8992 return ret; 9473 return ret;
9474
9475out_inode:
9476 unlock_new_inode(inode);
9477 goto out;
9478
8993} 9479}
8994 9480
8995static const struct inode_operations btrfs_dir_inode_operations = { 9481static const struct inode_operations btrfs_dir_inode_operations = {
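btrfs_new_inode() hands back an inode with i_nlink == 0 (tmpfiles are born orphaned), but d_tmpfile() immediately decrements the link count and warns on underflow, so the count is bumped to 1 first and only then is the still-new inode unlocked and attached. A toy model of that underflow guard (names hypothetical):

#include <assert.h>
#include <stdio.h>

struct inode { unsigned int nlink; };

/* drop_nlink()-style helper: warns instead of wrapping to UINT_MAX. */
static void dec_link_count(struct inode *i)
{
    if (i->nlink == 0) {
        fprintf(stderr, "WARNING: nlink underflow\n");
        return;
    }
    i->nlink--;
}

int main(void)
{
    struct inode tmp = { .nlink = 0 }; /* as btrfs_new_inode() leaves it */

    tmp.nlink = 1;          /* set_nlink(inode, 1) before d_tmpfile()   */
    dec_link_count(&tmp);   /* d_tmpfile() -> inode_dec_link_count()    */
    assert(tmp.nlink == 0); /* back to orphan, with no warning fired    */
    return 0;
}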
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..e732274f1afd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
332 goto out_drop; 332 goto out_drop;
333 333
334 } else { 334 } else {
335 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
336 if (ret && ret != -ENODATA)
337 goto out_drop;
335 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 338 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
336 } 339 }
337 340
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
477 if (ret) 480 if (ret)
478 goto fail; 481 goto fail;
479 482
480 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 483 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
481 0, objectid, NULL, 0, 0, 0);
482 if (IS_ERR(leaf)) { 484 if (IS_ERR(leaf)) {
483 ret = PTR_ERR(leaf); 485 ret = PTR_ERR(leaf);
484 goto fail; 486 goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
503 btrfs_set_stack_inode_generation(inode_item, 1); 505 btrfs_set_stack_inode_generation(inode_item, 1);
504 btrfs_set_stack_inode_size(inode_item, 3); 506 btrfs_set_stack_inode_size(inode_item, 3);
505 btrfs_set_stack_inode_nlink(inode_item, 1); 507 btrfs_set_stack_inode_nlink(inode_item, 1);
506 btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); 508 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
507 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 509 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
508 510
509 btrfs_set_root_flags(&root_item, 0); 511 btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
535 537
536 key.objectid = objectid; 538 key.objectid = objectid;
537 key.offset = 0; 539 key.offset = 0;
538 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 540 key.type = BTRFS_ROOT_ITEM_KEY;
539 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 541 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
540 &root_item); 542 &root_item);
541 if (ret) 543 if (ret)
@@ -711,39 +713,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
711 if (ret) 713 if (ret)
712 goto fail; 714 goto fail;
713 715
714 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
715 if (ret)
716 goto fail;
717
718 /*
719 * If orphan cleanup did remove any orphans, it means the tree was
720 * modified and therefore the commit root is not the same as the
721 * current root anymore. This is a problem, because send uses the
722 * commit root and therefore can see inode items that don't exist
723 * in the current root anymore, and for example make calls to
724 * btrfs_iget, which will do tree lookups based on the current root
725 * and not on the commit root. Those lookups will fail, returning a
726 * -ESTALE error, and making send fail with that error. So make sure
727 * a send does not see any orphans we have just removed, and that it
728 * will see the same inodes regardless of whether a transaction
729 * commit happened before it started (meaning that the commit root
730 * will be the same as the current root) or not.
731 */
732 if (readonly && pending_snapshot->snap->node !=
733 pending_snapshot->snap->commit_root) {
734 trans = btrfs_join_transaction(pending_snapshot->snap);
735 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
736 ret = PTR_ERR(trans);
737 goto fail;
738 }
739 if (!IS_ERR(trans)) {
740 ret = btrfs_commit_transaction(trans,
741 pending_snapshot->snap);
742 if (ret)
743 goto fail;
744 }
745 }
746
747 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 716 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
748 if (IS_ERR(inode)) { 717 if (IS_ERR(inode)) {
749 ret = PTR_ERR(inode); 718 ret = PTR_ERR(inode);
@@ -915,7 +884,7 @@ out_unlock:
915 * file you want to defrag, we return 0 to let you know to skip this 884 * file you want to defrag, we return 0 to let you know to skip this
916 * part of the file 885 * part of the file
917 */ 886 */
918static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) 887static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
919{ 888{
920 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 889 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
921 struct extent_map *em = NULL; 890 struct extent_map *em = NULL;
@@ -950,7 +919,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
950 */ 919 */
951static int find_new_extents(struct btrfs_root *root, 920static int find_new_extents(struct btrfs_root *root,
952 struct inode *inode, u64 newer_than, 921 struct inode *inode, u64 newer_than,
953 u64 *off, int thresh) 922 u64 *off, u32 thresh)
954{ 923{
955 struct btrfs_path *path; 924 struct btrfs_path *path;
956 struct btrfs_key min_key; 925 struct btrfs_key min_key;
@@ -969,12 +938,9 @@ static int find_new_extents(struct btrfs_root *root,
969 min_key.offset = *off; 938 min_key.offset = *off;
970 939
971 while (1) { 940 while (1) {
972 path->keep_locks = 1;
973 ret = btrfs_search_forward(root, &min_key, path, newer_than); 941 ret = btrfs_search_forward(root, &min_key, path, newer_than);
974 if (ret != 0) 942 if (ret != 0)
975 goto none; 943 goto none;
976 path->keep_locks = 0;
977 btrfs_unlock_up_safe(path, 1);
978process_slot: 944process_slot:
979 if (min_key.objectid != ino) 945 if (min_key.objectid != ino)
980 goto none; 946 goto none;
@@ -1052,15 +1018,17 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
1052 return false; 1018 return false;
1053 1019
1054 next = defrag_lookup_extent(inode, em->start + em->len); 1020 next = defrag_lookup_extent(inode, em->start + em->len);
1055 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || 1021 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
1056 (em->block_start + em->block_len == next->block_start)) 1022 ret = false;
1023 else if ((em->block_start + em->block_len == next->block_start) &&
1024 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
1057 ret = false; 1025 ret = false;
1058 1026
1059 free_extent_map(next); 1027 free_extent_map(next);
1060 return ret; 1028 return ret;
1061} 1029}
1062 1030
1063static int should_defrag_range(struct inode *inode, u64 start, int thresh, 1031static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
1064 u64 *last_len, u64 *skip, u64 *defrag_end, 1032 u64 *last_len, u64 *skip, u64 *defrag_end,
1065 int compress) 1033 int compress)
1066{ 1034{
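The new test in defrag_check_next_extent() only declares the next extent not worth touching when it is physically contiguous with the current one and both pieces already exceed 128 KiB; two small contiguous extents are still worth rewriting into one. A standalone predicate with the same logic (the 128 KiB threshold is taken from the hunk, everything else is hypothetical):

#include <stdbool.h>
#include <stdio.h>

#define DEFRAG_MIN_LEN (128 * 1024ULL)

struct extent { unsigned long long block_start, block_len; };

/* true when defrag should still consider the pair worth merging */
static bool next_extent_mergeable(const struct extent *cur,
                                  const struct extent *next)
{
    bool contiguous = cur->block_start + cur->block_len ==
                      next->block_start;
    bool both_large = cur->block_len > DEFRAG_MIN_LEN &&
                      next->block_len > DEFRAG_MIN_LEN;

    /* contiguous and already large: leave them alone */
    return !(contiguous && both_large);
}

int main(void)
{
    struct extent a = { 0, 64 * 1024 }, b = { 64 * 1024, 64 * 1024 };
    struct extent c = { 0, 256 * 1024 }, d = { 256 * 1024, 256 * 1024 };

    printf("small contiguous pair mergeable: %d\n",
           next_extent_mergeable(&a, &b));   /* 1: still defrag them */
    printf("large contiguous pair mergeable: %d\n",
           next_extent_mergeable(&c, &d));   /* 0: skip              */
    return 0;
}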
@@ -1088,7 +1056,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
1088 } 1056 }
1089 1057
1090 next_mergeable = defrag_check_next_extent(inode, em); 1058 next_mergeable = defrag_check_next_extent(inode, em);
1091
1092 /* 1059 /*
1093 * we hit a real extent, if it is big or the next extent is not a 1060 * we hit a real extent, if it is big or the next extent is not a
1094 * real extent, don't bother defragging it 1061 * real extent, don't bother defragging it
@@ -1291,7 +1258,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1291 int ret; 1258 int ret;
1292 int defrag_count = 0; 1259 int defrag_count = 0;
1293 int compress_type = BTRFS_COMPRESS_ZLIB; 1260 int compress_type = BTRFS_COMPRESS_ZLIB;
1294 int extent_thresh = range->extent_thresh; 1261 u32 extent_thresh = range->extent_thresh;
1295 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 1262 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1296 unsigned long cluster = max_cluster; 1263 unsigned long cluster = max_cluster;
1297 u64 new_align = ~((u64)128 * 1024 - 1); 1264 u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1367,8 +1334,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1367 inode->i_mapping->writeback_index = i; 1334 inode->i_mapping->writeback_index = i;
1368 1335
1369 while (i <= last_index && defrag_count < max_to_defrag && 1336 while (i <= last_index && defrag_count < max_to_defrag &&
1370 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 1337 (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
1371 PAGE_CACHE_SHIFT)) {
1372 /* 1338 /*
1373 * make sure we stop running if someone unmounts 1339 * make sure we stop running if someone unmounts
1374 * the FS 1340 * the FS
@@ -1391,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1391 * the should_defrag function tells us how much to skip 1357 * the should_defrag function tells us how much to skip
1392 * bump our counter by the suggested amount 1358 * bump our counter by the suggested amount
1393 */ 1359 */
1394 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1360 next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
1395 i = max(i + 1, next); 1361 i = max(i + 1, next);
1396 continue; 1362 continue;
1397 } 1363 }
@@ -1586,7 +1552,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1586 goto out_free; 1552 goto out_free;
1587 } 1553 }
1588 1554
1589 old_size = device->total_bytes; 1555 old_size = btrfs_device_get_total_bytes(device);
1590 1556
1591 if (mod < 0) { 1557 if (mod < 0) {
1592 if (new_size > old_size) { 1558 if (new_size > old_size) {
@@ -1735,7 +1701,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1735 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | 1701 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1736 BTRFS_SUBVOL_QGROUP_INHERIT)) { 1702 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1737 ret = -EOPNOTSUPP; 1703 ret = -EOPNOTSUPP;
1738 goto out; 1704 goto free_args;
1739 } 1705 }
1740 1706
1741 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1707 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1711,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1745 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { 1711 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1746 if (vol_args->size > PAGE_CACHE_SIZE) { 1712 if (vol_args->size > PAGE_CACHE_SIZE) {
1747 ret = -EINVAL; 1713 ret = -EINVAL;
1748 goto out; 1714 goto free_args;
1749 } 1715 }
1750 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); 1716 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1751 if (IS_ERR(inherit)) { 1717 if (IS_ERR(inherit)) {
1752 ret = PTR_ERR(inherit); 1718 ret = PTR_ERR(inherit);
1753 goto out; 1719 goto free_args;
1754 } 1720 }
1755 } 1721 }
1756 1722
1757 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1723 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1758 vol_args->fd, subvol, ptr, 1724 vol_args->fd, subvol, ptr,
1759 readonly, inherit); 1725 readonly, inherit);
1726 if (ret)
1727 goto free_inherit;
1760 1728
1761 if (ret == 0 && ptr && 1729 if (ptr && copy_to_user(arg +
1762 copy_to_user(arg + 1730 offsetof(struct btrfs_ioctl_vol_args_v2,
1763 offsetof(struct btrfs_ioctl_vol_args_v2, 1731 transid),
1764 transid), ptr, sizeof(*ptr))) 1732 ptr, sizeof(*ptr)))
1765 ret = -EFAULT; 1733 ret = -EFAULT;
1766out: 1734
1767 kfree(vol_args); 1735free_inherit:
1768 kfree(inherit); 1736 kfree(inherit);
1737free_args:
1738 kfree(vol_args);
1769 return ret; 1739 return ret;
1770} 1740}
1771 1741
@@ -2117,8 +2087,6 @@ static noinline int search_ioctl(struct inode *inode,
2117 key.type = sk->min_type; 2087 key.type = sk->min_type;
2118 key.offset = sk->min_offset; 2088 key.offset = sk->min_offset;
2119 2089
2120 path->keep_locks = 1;
2121
2122 while (1) { 2090 while (1) {
2123 ret = btrfs_search_forward(root, &key, path, sk->min_transid); 2091 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
2124 if (ret != 0) { 2092 if (ret != 0) {
@@ -2554,9 +2522,9 @@ out_unlock:
2554 ASSERT(dest->send_in_progress == 0); 2522 ASSERT(dest->send_in_progress == 0);
2555 2523
2556 /* the last ref */ 2524 /* the last ref */
2557 if (dest->cache_inode) { 2525 if (dest->ino_cache_inode) {
2558 iput(dest->cache_inode); 2526 iput(dest->ino_cache_inode);
2559 dest->cache_inode = NULL; 2527 dest->ino_cache_inode = NULL;
2560 } 2528 }
2561 } 2529 }
2562out_dput: 2530out_dput:
@@ -2662,6 +2630,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2662 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2630 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2663 ret = btrfs_init_new_device(root, vol_args->name); 2631 ret = btrfs_init_new_device(root, vol_args->name);
2664 2632
2633 if (!ret)
2634 btrfs_info(root->fs_info, "disk added %s",vol_args->name);
2635
2665 kfree(vol_args); 2636 kfree(vol_args);
2666out: 2637out:
2667 mutex_unlock(&root->fs_info->volume_mutex); 2638 mutex_unlock(&root->fs_info->volume_mutex);
@@ -2685,7 +2656,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2685 vol_args = memdup_user(arg, sizeof(*vol_args)); 2656 vol_args = memdup_user(arg, sizeof(*vol_args));
2686 if (IS_ERR(vol_args)) { 2657 if (IS_ERR(vol_args)) {
2687 ret = PTR_ERR(vol_args); 2658 ret = PTR_ERR(vol_args);
2688 goto out; 2659 goto err_drop;
2689 } 2660 }
2690 2661
2691 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2662 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2701,8 +2672,12 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2701 mutex_unlock(&root->fs_info->volume_mutex); 2672 mutex_unlock(&root->fs_info->volume_mutex);
2702 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2673 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2703 2674
2675 if (!ret)
 2676 btrfs_info(root->fs_info, "disk deleted %s", vol_args->name);
2677
2704out: 2678out:
2705 kfree(vol_args); 2679 kfree(vol_args);
2680err_drop:
2706 mnt_drop_write_file(file); 2681 mnt_drop_write_file(file);
2707 return ret; 2682 return ret;
2708} 2683}
@@ -2764,8 +2739,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2764 } 2739 }
2765 2740
2766 di_args->devid = dev->devid; 2741 di_args->devid = dev->devid;
2767 di_args->bytes_used = dev->bytes_used; 2742 di_args->bytes_used = btrfs_device_get_bytes_used(dev);
2768 di_args->total_bytes = dev->total_bytes; 2743 di_args->total_bytes = btrfs_device_get_total_bytes(dev);
2769 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2744 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2770 if (dev->name) { 2745 if (dev->name) {
2771 struct rcu_string *name; 2746 struct rcu_string *name;
@@ -3191,7 +3166,7 @@ static void clone_update_extent_map(struct inode *inode,
3191 em->start + em->len - 1, 0); 3166 em->start + em->len - 1, 0);
3192 } 3167 }
3193 3168
3194 if (unlikely(ret)) 3169 if (ret)
3195 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3170 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3196 &BTRFS_I(inode)->runtime_flags); 3171 &BTRFS_I(inode)->runtime_flags);
3197} 3172}
@@ -3226,7 +3201,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3226 u64 last_dest_end = destoff; 3201 u64 last_dest_end = destoff;
3227 3202
3228 ret = -ENOMEM; 3203 ret = -ENOMEM;
3229 buf = vmalloc(btrfs_level_size(root, 0)); 3204 buf = vmalloc(root->nodesize);
3230 if (!buf) 3205 if (!buf)
3231 return ret; 3206 return ret;
3232 3207
@@ -3279,11 +3254,11 @@ process_slot:
3279 slot = path->slots[0]; 3254 slot = path->slots[0];
3280 3255
3281 btrfs_item_key_to_cpu(leaf, &key, slot); 3256 btrfs_item_key_to_cpu(leaf, &key, slot);
3282 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 3257 if (key.type > BTRFS_EXTENT_DATA_KEY ||
3283 key.objectid != btrfs_ino(src)) 3258 key.objectid != btrfs_ino(src))
3284 break; 3259 break;
3285 3260
3286 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 3261 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3287 struct btrfs_file_extent_item *extent; 3262 struct btrfs_file_extent_item *extent;
3288 int type; 3263 int type;
3289 u32 size; 3264 u32 size;
@@ -3527,7 +3502,8 @@ process_slot:
3527 btrfs_mark_buffer_dirty(leaf); 3502 btrfs_mark_buffer_dirty(leaf);
3528 btrfs_release_path(path); 3503 btrfs_release_path(path);
3529 3504
3530 last_dest_end = new_key.offset + datal; 3505 last_dest_end = ALIGN(new_key.offset + datal,
3506 root->sectorsize);
3531 ret = clone_finish_inode_update(trans, inode, 3507 ret = clone_finish_inode_update(trans, inode,
3532 last_dest_end, 3508 last_dest_end,
3533 destoff, olen); 3509 destoff, olen);
@@ -5309,6 +5285,12 @@ long btrfs_ioctl(struct file *file, unsigned int
5309 if (ret) 5285 if (ret)
5310 return ret; 5286 return ret;
5311 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 5287 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
5288 /*
5289 * The transaction thread may want to do more work,
 5290 * namely it pokes the cleaner kthread that will start
5291 * processing uncleaned subvols.
5292 */
5293 wake_up_process(root->fs_info->transaction_kthread);
5312 return ret; 5294 return ret;
5313 } 5295 }
5314 case BTRFS_IOC_START_SYNC: 5296 case BTRFS_IOC_START_SYNC:
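The BTRFS_IOC_SYNC change wakes the transaction kthread after the sync returns, because committing can leave deleted subvolumes for the cleaner thread and nothing else would prod it promptly. A small pthread sketch of the poke-a-sleeping-worker pattern (all names hypothetical; build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool work_pending;

/* cleaner-kthread analogue: sleeps until poked, then drains work */
static void *cleaner(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!work_pending)
        pthread_cond_wait(&cond, &lock);
    work_pending = false;
    pthread_mutex_unlock(&lock);
    printf("cleaner: processing uncleaned subvols\n");
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, cleaner, NULL);

    /* sync path: finish the commit, then wake the worker so queued
     * cleanup starts now instead of at the next periodic wakeup */
    printf("sync: commit done\n");
    pthread_mutex_lock(&lock);
    work_pending = true;
    pthread_cond_signal(&cond);    /* wake_up_process() analogue */
    pthread_mutex_unlock(&lock);

    pthread_join(t, NULL);
    return 0;
}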
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad8514f0da..78285f30909e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
266 char *data_in; 266 char *data_in;
267 unsigned long page_in_index = 0; 267 unsigned long page_in_index = 0;
268 unsigned long page_out_index = 0; 268 unsigned long page_out_index = 0;
269 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 269 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
270 PAGE_CACHE_SIZE;
271 unsigned long buf_start; 270 unsigned long buf_start;
272 unsigned long buf_offset = 0; 271 unsigned long buf_offset = 0;
273 unsigned long bytes; 272 unsigned long bytes;
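Several hunks in this series replace open-coded `(n + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE` (or the equivalent shift) with DIV_ROUND_UP(), which computes the same ceiling division. A quick check that the forms agree (the 4096-byte page size here is illustrative):

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    const unsigned long page = 4096, shift = 12;

    for (unsigned long n = 0; n < 5 * page; n += 511) {
        unsigned long a = DIV_ROUND_UP(n, page);
        unsigned long b = (n + page - 1) >> shift; /* old open-coded form */
        assert(a == b);
    }
    printf("DIV_ROUND_UP(%lu, %lu) = %lu\n", 3 * page + 1, page,
           DIV_ROUND_UP(3 * page + 1, page));      /* prints 4 */
    return 0;
}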
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 963895c1f801..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
615 spin_unlock(&root->ordered_extent_lock); 615 spin_unlock(&root->ordered_extent_lock);
616 616
617 btrfs_init_work(&ordered->flush_work, 617 btrfs_init_work(&ordered->flush_work,
618 btrfs_flush_delalloc_helper,
618 btrfs_run_ordered_extent_work, NULL, NULL); 619 btrfs_run_ordered_extent_work, NULL, NULL);
619 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
620 btrfs_queue_work(root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793edb38ca..47767d5b8f0b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
27 int ret = 0; 27 int ret = 0;
28 28
29 key.objectid = BTRFS_ORPHAN_OBJECTID; 29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 30 key.type = BTRFS_ORPHAN_ITEM_KEY;
31 key.offset = offset; 31 key.offset = offset;
32 32
33 path = btrfs_alloc_path(); 33 path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
48 int ret = 0; 48 int ret = 0;
49 49
50 key.objectid = BTRFS_ORPHAN_OBJECTID; 50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 51 key.type = BTRFS_ORPHAN_ITEM_KEY;
52 key.offset = offset; 52 key.offset = offset;
53 53
54 path = btrfs_alloc_path(); 54 path = btrfs_alloc_path();
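The btrfs_set_key_type()/btrfs_key_type() wrappers are dropped in favour of plain field access throughout: struct btrfs_key is the CPU-order form of the key, so the setter added nothing over `key.type = ...`. In miniature (the struct layout is illustrative, not the on-disk format):

#include <stdio.h>

/* CPU-order key: plain integers, safe to read and assign directly. */
struct cpu_key {
    unsigned long long objectid;
    unsigned char type;
    unsigned long long offset;
};

#define ORPHAN_ITEM_KEY 48  /* assumed to match BTRFS_ORPHAN_ITEM_KEY */

int main(void)
{
    struct cpu_key key;

    key.objectid = (unsigned long long)-5; /* BTRFS_ORPHAN_OBJECTID */
    key.type = ORPHAN_ITEM_KEY;            /* was btrfs_set_key_type() */
    key.offset = 257;

    printf("key (%llu %u %llu)\n", key.objectid,
           (unsigned)key.type, key.offset);
    return 0;
}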
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4ad3b9a..647ab12fdf5d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
195 for (i = 0 ; i < nr ; i++) { 195 for (i = 0 ; i < nr ; i++) {
196 item = btrfs_item_nr(i); 196 item = btrfs_item_nr(i);
197 btrfs_item_key_to_cpu(l, &key, i); 197 btrfs_item_key_to_cpu(l, &key, i);
198 type = btrfs_key_type(&key); 198 type = key.type;
199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " 199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
200 "itemsize %d\n", 200 "itemsize %d\n",
201 i, key.objectid, type, key.offset, 201 i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
336 for (i = 0; i < nr; i++) { 336 for (i = 0; i < nr; i++) {
337 struct extent_buffer *next = read_tree_block(root, 337 struct extent_buffer *next = read_tree_block(root,
338 btrfs_node_blockptr(c, i), 338 btrfs_node_blockptr(c, i),
339 btrfs_level_size(root, level - 1),
340 btrfs_node_ptr_generation(c, i)); 339 btrfs_node_ptr_generation(c, i));
341 if (btrfs_is_leaf(next) && 340 if (btrfs_is_leaf(next) &&
342 level != 1) 341 level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b497498484be..48b60dbf807f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
539 struct extent_buffer *leaf; 539 struct extent_buffer *leaf;
540 struct btrfs_key key; 540 struct btrfs_key key;
541 541
542#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 542 if (btrfs_test_is_dummy_root(quota_root))
543 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
544 return 0; 543 return 0;
545#endif 544
546 path = btrfs_alloc_path(); 545 path = btrfs_alloc_path();
547 if (!path) 546 if (!path)
548 return -ENOMEM; 547 return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
551 key.type = BTRFS_QGROUP_INFO_KEY; 550 key.type = BTRFS_QGROUP_INFO_KEY;
552 key.offset = qgroupid; 551 key.offset = qgroupid;
553 552
553 /*
554 * Avoid a transaction abort by catching -EEXIST here. In that
555 * case, we proceed by re-initializing the existing structure
556 * on disk.
557 */
558
554 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 559 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
555 sizeof(*qgroup_info)); 560 sizeof(*qgroup_info));
556 if (ret) 561 if (ret && ret != -EEXIST)
557 goto out; 562 goto out;
558 563
559 leaf = path->nodes[0]; 564 leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
572 key.type = BTRFS_QGROUP_LIMIT_KEY; 577 key.type = BTRFS_QGROUP_LIMIT_KEY;
573 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 578 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
574 sizeof(*qgroup_limit)); 579 sizeof(*qgroup_limit));
575 if (ret) 580 if (ret && ret != -EEXIST)
576 goto out; 581 goto out;
577 582
578 leaf = path->nodes[0]; 583 leaf = path->nodes[0];
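add_qgroup_item() now treats -EEXIST from btrfs_insert_empty_item() as acceptable: instead of aborting the transaction over a leftover item, it rewrites the existing one in place. The pattern in miniature (storage is a toy array; names hypothetical):

#include <errno.h>
#include <stdio.h>

#define NKEYS 8
static int present[NKEYS];
static int value[NKEYS];

static int insert_item(int key)
{
    if (present[key])
        return -EEXIST;
    present[key] = 1;
    return 0;
}

/* Insert-or-reinitialize: -EEXIST is not fatal, any other error is. */
static int add_item(int key, int val)
{
    int ret = insert_item(key);

    if (ret && ret != -EEXIST)
        return ret;
    value[key] = val;   /* (re)write the payload either way */
    return 0;
}

int main(void)
{
    add_item(3, 10);
    int ret = add_item(3, 42);  /* stale item: re-initialized, not fatal */

    printf("ret=%d value=%d\n", ret, value[3]); /* ret=0 value=42 */
    return 0;
}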
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
692 int ret; 697 int ret;
693 int slot; 698 int slot;
694 699
695#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 700 if (btrfs_test_is_dummy_root(root))
696 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
697 return 0; 701 return 0;
698#endif 702
699 key.objectid = 0; 703 key.objectid = 0;
700 key.type = BTRFS_QGROUP_INFO_KEY; 704 key.type = BTRFS_QGROUP_INFO_KEY;
701 key.offset = qgroup->qgroupid; 705 key.offset = qgroup->qgroupid;
@@ -1335,6 +1339,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1335 INIT_LIST_HEAD(&oper->elem.list); 1339 INIT_LIST_HEAD(&oper->elem.list);
1336 oper->elem.seq = 0; 1340 oper->elem.seq = 0;
1337 1341
1342 trace_btrfs_qgroup_record_ref(oper);
1343
1338 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { 1344 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1339 /* 1345 /*
1340 * If any operation for this bytenr/ref_root combo 1346 * If any operation for this bytenr/ref_root combo
@@ -1973,7 +1979,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1973 elem.seq, &roots); 1979 elem.seq, &roots);
1974 btrfs_put_tree_mod_seq(fs_info, &elem); 1980 btrfs_put_tree_mod_seq(fs_info, &elem);
1975 if (ret < 0) 1981 if (ret < 0)
1976 return ret; 1982 goto out;
1977 1983
1978 if (roots->nnodes != 1) 1984 if (roots->nnodes != 1)
1979 goto out; 1985 goto out;
@@ -2077,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
2077 2083
2078 ASSERT(is_fstree(oper->ref_root)); 2084 ASSERT(is_fstree(oper->ref_root));
2079 2085
2086 trace_btrfs_qgroup_account(oper);
2087
2080 switch (oper->type) { 2088 switch (oper->type) {
2081 case BTRFS_QGROUP_OPER_ADD_EXCL: 2089 case BTRFS_QGROUP_OPER_ADD_EXCL:
2082 case BTRFS_QGROUP_OPER_SUB_EXCL: 2090 case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -2237,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2237 if (srcid) { 2245 if (srcid) {
2238 struct btrfs_root *srcroot; 2246 struct btrfs_root *srcroot;
2239 struct btrfs_key srckey; 2247 struct btrfs_key srckey;
2240 int srcroot_level;
2241 2248
2242 srckey.objectid = srcid; 2249 srckey.objectid = srcid;
2243 srckey.type = BTRFS_ROOT_ITEM_KEY; 2250 srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2249,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2249 } 2256 }
2250 2257
2251 rcu_read_lock(); 2258 rcu_read_lock();
2252 srcroot_level = btrfs_header_level(srcroot->node); 2259 level_size = srcroot->nodesize;
2253 level_size = btrfs_level_size(srcroot, srcroot_level);
2254 rcu_read_unlock(); 2260 rcu_read_unlock();
2255 } 2261 }
2256 2262
@@ -2566,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2566 found.type != BTRFS_METADATA_ITEM_KEY) 2572 found.type != BTRFS_METADATA_ITEM_KEY)
2567 continue; 2573 continue;
2568 if (found.type == BTRFS_METADATA_ITEM_KEY) 2574 if (found.type == BTRFS_METADATA_ITEM_KEY)
2569 num_bytes = fs_info->extent_root->leafsize; 2575 num_bytes = fs_info->extent_root->nodesize;
2570 else 2576 else
2571 num_bytes = found.offset; 2577 num_bytes = found.offset;
2572 2578
@@ -2720,6 +2726,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2720 memset(&fs_info->qgroup_rescan_work, 0, 2726 memset(&fs_info->qgroup_rescan_work, 0,
2721 sizeof(fs_info->qgroup_rescan_work)); 2727 sizeof(fs_info->qgroup_rescan_work));
2722 btrfs_init_work(&fs_info->qgroup_rescan_work, 2728 btrfs_init_work(&fs_info->qgroup_rescan_work,
2729 btrfs_qgroup_rescan_helper,
2723 btrfs_qgroup_rescan_worker, NULL, NULL); 2730 btrfs_qgroup_rescan_worker, NULL, NULL);
2724 2731
2725 if (ret) { 2732 if (ret) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..6a41631cb959 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913{ 913{
914 unsigned long nr = stripe_len * nr_stripes; 914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 915 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
916} 916}
917 917
918/* 918/*
@@ -1416,7 +1416,8 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); 1419 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1420 rmw_work, NULL, NULL);
1420 1421
1421 btrfs_queue_work(rbio->fs_info->rmw_workers, 1422 btrfs_queue_work(rbio->fs_info->rmw_workers,
1422 &rbio->work); 1423 &rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1424 1425
1425static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1426{ 1427{
1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); 1428 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1429 read_rebuild_work, NULL, NULL);
1428 1430
1429 btrfs_queue_work(rbio->fs_info->rmw_workers, 1431 btrfs_queue_work(rbio->fs_info->rmw_workers,
1430 &rbio->work); 1432 &rbio->work);
@@ -1440,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440 struct btrfs_bio *bbio = rbio->bbio; 1442 struct btrfs_bio *bbio = rbio->bbio;
1441 struct bio_list bio_list; 1443 struct bio_list bio_list;
1442 int ret; 1444 int ret;
1443 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1444 int pagenr; 1446 int pagenr;
1445 int stripe; 1447 int stripe;
1446 struct bio *bio; 1448 struct bio *bio;
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1665 plug = container_of(cb, struct btrfs_plug_cb, cb); 1667 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666 1668
1667 if (from_schedule) { 1669 if (from_schedule) {
1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1670 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1671 unplug_work, NULL, NULL);
1669 btrfs_queue_work(plug->info->rmw_workers, 1672 btrfs_queue_work(plug->info->rmw_workers,
1670 &plug->work); 1673 &plug->work);
1671 return; 1674 return;
@@ -1722,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1722 int pagenr, stripe; 1725 int pagenr, stripe;
1723 void **pointers; 1726 void **pointers;
1724 int faila = -1, failb = -1; 1727 int faila = -1, failb = -1;
1725 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1728 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1726 struct page *page; 1729 struct page *page;
1727 int err; 1730 int err;
1728 int i; 1731 int i;
@@ -1937,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1937 struct btrfs_bio *bbio = rbio->bbio; 1940 struct btrfs_bio *bbio = rbio->bbio;
1938 struct bio_list bio_list; 1941 struct bio_list bio_list;
1939 int ret; 1942 int ret;
1940 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1941 int pagenr; 1944 int pagenr;
1942 int stripe; 1945 int stripe;
1943 struct bio *bio; 1946 struct bio *bio;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..b63ae20618fb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
347 if (!re) 347 if (!re)
348 return NULL; 348 return NULL;
349 349
350 blocksize = btrfs_level_size(root, level); 350 blocksize = root->nodesize;
351 re->logical = logical; 351 re->logical = logical;
352 re->blocksize = blocksize; 352 re->blocksize = blocksize;
353 re->top = *top; 353 re->top = *top;
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
798 /* FIXME we cannot handle this properly right now */ 798 /* FIXME we cannot handle this properly right now */
799 BUG(); 799 BUG();
800 } 800 }
801 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); 801 btrfs_init_work(&rmw->work, btrfs_readahead_helper,
802 reada_start_machine_worker, NULL, NULL);
802 rmw->fs_info = fs_info; 803 rmw->fs_info = fs_info;
803 804
804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 805 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..74257d6436ad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
736 err = ret; 736 err = ret;
737 goto out; 737 goto out;
738 } 738 }
739 BUG_ON(!ret || !path1->slots[0]); 739 ASSERT(ret);
740 ASSERT(path1->slots[0]);
740 741
741 path1->slots[0]--; 742 path1->slots[0]--;
742 743
@@ -746,10 +747,10 @@ again:
746 * the backref was added previously when processing 747 * the backref was added previously when processing
747 * backref of type BTRFS_TREE_BLOCK_REF_KEY 748 * backref of type BTRFS_TREE_BLOCK_REF_KEY
748 */ 749 */
749 BUG_ON(!list_is_singular(&cur->upper)); 750 ASSERT(list_is_singular(&cur->upper));
750 edge = list_entry(cur->upper.next, struct backref_edge, 751 edge = list_entry(cur->upper.next, struct backref_edge,
751 list[LOWER]); 752 list[LOWER]);
752 BUG_ON(!list_empty(&edge->list[UPPER])); 753 ASSERT(list_empty(&edge->list[UPPER]));
753 exist = edge->node[UPPER]; 754 exist = edge->node[UPPER];
754 /* 755 /*
755 * add the upper level block to pending list if we need 756 * add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
831 cur->cowonly = 1; 832 cur->cowonly = 1;
832 } 833 }
833#else 834#else
834 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 835 ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
835 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { 836 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
836#endif 837#endif
837 if (key.objectid == key.offset) { 838 if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
840 * backref of this type. 841 * backref of this type.
841 */ 842 */
842 root = find_reloc_root(rc, cur->bytenr); 843 root = find_reloc_root(rc, cur->bytenr);
843 BUG_ON(!root); 844 ASSERT(root);
844 cur->root = root; 845 cur->root = root;
845 break; 846 break;
846 } 847 }
@@ -868,7 +869,7 @@ again:
868 } else { 869 } else {
869 upper = rb_entry(rb_node, struct backref_node, 870 upper = rb_entry(rb_node, struct backref_node,
870 rb_node); 871 rb_node);
871 BUG_ON(!upper->checked); 872 ASSERT(upper->checked);
872 INIT_LIST_HEAD(&edge->list[UPPER]); 873 INIT_LIST_HEAD(&edge->list[UPPER]);
873 } 874 }
874 list_add_tail(&edge->list[LOWER], &cur->upper); 875 list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
892 893
893 if (btrfs_root_level(&root->root_item) == cur->level) { 894 if (btrfs_root_level(&root->root_item) == cur->level) {
894 /* tree root */ 895 /* tree root */
895 BUG_ON(btrfs_root_bytenr(&root->root_item) != 896 ASSERT(btrfs_root_bytenr(&root->root_item) ==
896 cur->bytenr); 897 cur->bytenr);
897 if (should_ignore_root(root)) 898 if (should_ignore_root(root))
898 list_add(&cur->list, &useless); 899 list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
927 need_check = true; 928 need_check = true;
928 for (; level < BTRFS_MAX_LEVEL; level++) { 929 for (; level < BTRFS_MAX_LEVEL; level++) {
929 if (!path2->nodes[level]) { 930 if (!path2->nodes[level]) {
930 BUG_ON(btrfs_root_bytenr(&root->root_item) != 931 ASSERT(btrfs_root_bytenr(&root->root_item) ==
931 lower->bytenr); 932 lower->bytenr);
932 if (should_ignore_root(root)) 933 if (should_ignore_root(root))
933 list_add(&lower->list, &useless); 934 list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
977 need_check = false; 978 need_check = false;
978 list_add_tail(&edge->list[UPPER], 979 list_add_tail(&edge->list[UPPER],
979 &list); 980 &list);
980 } else 981 } else {
982 if (upper->checked)
983 need_check = true;
981 INIT_LIST_HEAD(&edge->list[UPPER]); 984 INIT_LIST_HEAD(&edge->list[UPPER]);
985 }
982 } else { 986 } else {
983 upper = rb_entry(rb_node, struct backref_node, 987 upper = rb_entry(rb_node, struct backref_node,
984 rb_node); 988 rb_node);
985 BUG_ON(!upper->checked); 989 ASSERT(upper->checked);
986 INIT_LIST_HEAD(&edge->list[UPPER]); 990 INIT_LIST_HEAD(&edge->list[UPPER]);
987 if (!upper->owner) 991 if (!upper->owner)
988 upper->owner = btrfs_header_owner(eb); 992 upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
1026 * everything goes well, connect backref nodes and insert backref nodes 1030 * everything goes well, connect backref nodes and insert backref nodes
1027 * into the cache. 1031 * into the cache.
1028 */ 1032 */
1029 BUG_ON(!node->checked); 1033 ASSERT(node->checked);
1030 cowonly = node->cowonly; 1034 cowonly = node->cowonly;
1031 if (!cowonly) { 1035 if (!cowonly) {
1032 rb_node = tree_insert(&cache->rb_root, node->bytenr, 1036 rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
1062 continue; 1066 continue;
1063 } 1067 }
1064 1068
1065 BUG_ON(!upper->checked); 1069 if (!upper->checked) {
1066 BUG_ON(cowonly != upper->cowonly); 1070 /*
1071 * Still want to blow up for developers since this is a
1072 * logic bug.
1073 */
1074 ASSERT(0);
1075 err = -EINVAL;
1076 goto out;
1077 }
1078 if (cowonly != upper->cowonly) {
1079 ASSERT(0);
1080 err = -EINVAL;
1081 goto out;
1082 }
1083
1067 if (!cowonly) { 1084 if (!cowonly) {
1068 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1085 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1069 &upper->rb_node); 1086 &upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
1086 while (!list_empty(&useless)) { 1103 while (!list_empty(&useless)) {
1087 upper = list_entry(useless.next, struct backref_node, list); 1104 upper = list_entry(useless.next, struct backref_node, list);
1088 list_del_init(&upper->list); 1105 list_del_init(&upper->list);
1089 BUG_ON(!list_empty(&upper->upper)); 1106 ASSERT(list_empty(&upper->upper));
1090 if (upper == node) 1107 if (upper == node)
1091 node = NULL; 1108 node = NULL;
1092 if (upper->lowest) { 1109 if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
1119 if (err) { 1136 if (err) {
1120 while (!list_empty(&useless)) { 1137 while (!list_empty(&useless)) {
1121 lower = list_entry(useless.next, 1138 lower = list_entry(useless.next,
1122 struct backref_node, upper); 1139 struct backref_node, list);
1123 list_del_init(&lower->upper); 1140 list_del_init(&lower->list);
1124 } 1141 }
1125 upper = node; 1142 while (!list_empty(&list)) {
1126 INIT_LIST_HEAD(&list); 1143 edge = list_first_entry(&list, struct backref_edge,
1127 while (upper) { 1144 list[UPPER]);
1128 if (RB_EMPTY_NODE(&upper->rb_node)) { 1145 list_del(&edge->list[UPPER]);
1129 list_splice_tail(&upper->upper, &list);
1130 free_backref_node(cache, upper);
1131 }
1132
1133 if (list_empty(&list))
1134 break;
1135
1136 edge = list_entry(list.next, struct backref_edge,
1137 list[LOWER]);
1138 list_del(&edge->list[LOWER]); 1146 list_del(&edge->list[LOWER]);
1147 lower = edge->node[LOWER];
1139 upper = edge->node[UPPER]; 1148 upper = edge->node[UPPER];
1140 free_backref_edge(cache, edge); 1149 free_backref_edge(cache, edge);
1150
1151 /*
1152 * Lower is no longer linked to any upper backref nodes
1153 * and isn't in the cache, we can free it ourselves.
1154 */
1155 if (list_empty(&lower->upper) &&
1156 RB_EMPTY_NODE(&lower->rb_node))
1157 list_add(&lower->list, &useless);
1158
1159 if (!RB_EMPTY_NODE(&upper->rb_node))
1160 continue;
1161
 1162 /* Add this guy's upper edges to the list to process */
1163 list_for_each_entry(edge, &upper->upper, list[LOWER])
1164 list_add_tail(&edge->list[UPPER], &list);
1165 if (list_empty(&upper->upper))
1166 list_add(&upper->list, &useless);
1167 }
1168
1169 while (!list_empty(&useless)) {
1170 lower = list_entry(useless.next,
1171 struct backref_node, list);
1172 list_del_init(&lower->list);
1173 free_backref_node(cache, lower);
1141 } 1174 }
1142 return ERR_PTR(err); 1175 return ERR_PTR(err);
1143 } 1176 }
1144 BUG_ON(node && node->detached); 1177 ASSERT(!node || !node->detached);
1145 return node; 1178 return node;
1146} 1179}
1147 1180
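The relocation changes convert BUG_ON() calls into ASSERT() plus a real error return: debug builds still stop on the logic bug, while production builds unwind with -EINVAL instead of crashing the box. The shape of that conversion (ASSERT here is modeled on the btrfs macro, compiled out unless DEBUG is defined):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef DEBUG
#define ASSERT(expr) \
    do { if (!(expr)) { fprintf(stderr, "assertion failed: %s\n", \
         #expr); abort(); } } while (0)
#else
#define ASSERT(expr) ((void)0)
#endif

static int build_backref_tree(int checked)
{
    /* old style: BUG_ON(!checked), an unconditional crash */
    if (!checked) {
        ASSERT(0);        /* still blow up for developers */
        return -EINVAL;   /* but fail gracefully in production */
    }
    return 0;
}

int main(void)
{
    printf("ok=%d bad=%d\n", build_backref_tree(1),
           build_backref_tree(0)); /* ok=0 bad=-22 without DEBUG */
    return 0;
}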
@@ -1787,7 +1820,7 @@ again:
1787 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1820 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1788 1821
1789 old_bytenr = btrfs_node_blockptr(parent, slot); 1822 old_bytenr = btrfs_node_blockptr(parent, slot);
1790 blocksize = btrfs_level_size(dest, level - 1); 1823 blocksize = dest->nodesize;
1791 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1824 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1792 1825
1793 if (level <= max_level) { 1826 if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
1813 break; 1846 break;
1814 } 1847 }
1815 1848
1816 eb = read_tree_block(dest, old_bytenr, blocksize, 1849 eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1817 old_ptr_gen);
1818 if (!eb || !extent_buffer_uptodate(eb)) { 1850 if (!eb || !extent_buffer_uptodate(eb)) {
1819 ret = (!eb) ? -ENOMEM : -EIO; 1851 ret = (!eb) ? -ENOMEM : -EIO;
1820 free_extent_buffer(eb); 1852 free_extent_buffer(eb);
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1944 u64 bytenr; 1976 u64 bytenr;
1945 u64 ptr_gen = 0; 1977 u64 ptr_gen = 0;
1946 u64 last_snapshot; 1978 u64 last_snapshot;
1947 u32 blocksize;
1948 u32 nritems; 1979 u32 nritems;
1949 1980
1950 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1981 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1970 } 2001 }
1971 2002
1972 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2003 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1973 blocksize = btrfs_level_size(root, i - 1); 2004 eb = read_tree_block(root, bytenr, ptr_gen);
1974 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1975 if (!eb || !extent_buffer_uptodate(eb)) { 2005 if (!eb || !extent_buffer_uptodate(eb)) {
1976 free_extent_buffer(eb); 2006 free_extent_buffer(eb);
1977 return -EIO; 2007 return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
2316} 2346}
2317 2347
2318static noinline_for_stack 2348static noinline_for_stack
2319int merge_reloc_roots(struct reloc_control *rc) 2349void merge_reloc_roots(struct reloc_control *rc)
2320{ 2350{
2321 struct btrfs_root *root; 2351 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2352 struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
2397 } 2427 }
2398 2428
2399 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2429 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2400 return ret;
2401} 2430}
2402 2431
2403static void free_block_list(struct rb_root *blocks) 2432static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
2544 if (next->processed && (reserve || next != node)) 2573 if (next->processed && (reserve || next != node))
2545 break; 2574 break;
2546 2575
2547 num_bytes += btrfs_level_size(rc->extent_root, 2576 num_bytes += rc->extent_root->nodesize;
2548 next->level);
2549 2577
2550 if (list_empty(&next->upper)) 2578 if (list_empty(&next->upper))
2551 break; 2579 break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2679 goto next; 2707 goto next;
2680 } 2708 }
2681 2709
2682 blocksize = btrfs_level_size(root, node->level); 2710 blocksize = root->nodesize;
2683 generation = btrfs_node_ptr_generation(upper->eb, slot); 2711 generation = btrfs_node_ptr_generation(upper->eb, slot);
2684 eb = read_tree_block(root, bytenr, blocksize, generation); 2712 eb = read_tree_block(root, bytenr, generation);
2685 if (!eb || !extent_buffer_uptodate(eb)) { 2713 if (!eb || !extent_buffer_uptodate(eb)) {
2686 free_extent_buffer(eb); 2714 free_extent_buffer(eb);
2687 err = -EIO; 2715 err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
2789 u32 blocksize; 2817 u32 blocksize;
2790 if (node->level == 0 || 2818 if (node->level == 0 ||
2791 in_block_group(node->bytenr, rc->block_group)) { 2819 in_block_group(node->bytenr, rc->block_group)) {
2792 blocksize = btrfs_level_size(rc->extent_root, node->level); 2820 blocksize = rc->extent_root->nodesize;
2793 mark_block_processed(rc, node->bytenr, blocksize); 2821 mark_block_processed(rc, node->bytenr, blocksize);
2794 } 2822 }
2795 node->processed = 1; 2823 node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2843 2871
2844 BUG_ON(block->key_ready); 2872 BUG_ON(block->key_ready);
2845 eb = read_tree_block(rc->extent_root, block->bytenr, 2873 eb = read_tree_block(rc->extent_root, block->bytenr,
2846 block->key.objectid, block->key.offset); 2874 block->key.offset);
2847 if (!eb || !extent_buffer_uptodate(eb)) { 2875 if (!eb || !extent_buffer_uptodate(eb)) {
2848 free_extent_buffer(eb); 2876 free_extent_buffer(eb);
2849 return -EIO; 2877 return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
2858 return 0; 2886 return 0;
2859} 2887}
2860 2888
2861static int reada_tree_block(struct reloc_control *rc,
2862 struct tree_block *block)
2863{
2864 BUG_ON(block->key_ready);
2865 if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2866 readahead_tree_block(rc->extent_root, block->bytenr,
2867 block->key.objectid,
2868 rc->extent_root->leafsize);
2869 else
2870 readahead_tree_block(rc->extent_root, block->bytenr,
2871 block->key.objectid, block->key.offset);
2872 return 0;
2873}
2874
2875/* 2889/*
2876 * helper function to relocate a tree block 2890 * helper function to relocate a tree block
2877 */ 2891 */
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2951 while (rb_node) { 2965 while (rb_node) {
2952 block = rb_entry(rb_node, struct tree_block, rb_node); 2966 block = rb_entry(rb_node, struct tree_block, rb_node);
2953 if (!block->key_ready) 2967 if (!block->key_ready)
2954 reada_tree_block(rc, block); 2968 readahead_tree_block(rc->extent_root, block->bytenr,
2969 block->key.objectid);
2955 rb_node = rb_next(rb_node); 2970 rb_node = rb_next(rb_node);
2956 } 2971 }
2957 2972
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
3313 return -ENOMEM; 3328 return -ENOMEM;
3314 3329
3315 block->bytenr = extent_key->objectid; 3330 block->bytenr = extent_key->objectid;
3316 block->key.objectid = rc->extent_root->leafsize; 3331 block->key.objectid = rc->extent_root->nodesize;
3317 block->key.offset = generation; 3332 block->key.offset = generation;
3318 block->level = level; 3333 block->level = level;
3319 block->key_ready = 0; 3334 block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
3640 struct btrfs_extent_inline_ref *iref; 3655 struct btrfs_extent_inline_ref *iref;
3641 unsigned long ptr; 3656 unsigned long ptr;
3642 unsigned long end; 3657 unsigned long end;
3643 u32 blocksize = btrfs_level_size(rc->extent_root, 0); 3658 u32 blocksize = rc->extent_root->nodesize;
3644 int ret = 0; 3659 int ret = 0;
3645 int err = 0; 3660 int err = 0;
3646 3661
@@ -3783,7 +3798,7 @@ next:
3783 } 3798 }
3784 3799
3785 if (key.type == BTRFS_METADATA_ITEM_KEY && 3800 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3786 key.objectid + rc->extent_root->leafsize <= 3801 key.objectid + rc->extent_root->nodesize <=
3787 rc->search_start) { 3802 rc->search_start) {
3788 path->slots[0]++; 3803 path->slots[0]++;
3789 goto next; 3804 goto next;
@@ -3801,7 +3816,7 @@ next:
3801 rc->search_start = key.objectid + key.offset; 3816 rc->search_start = key.objectid + key.offset;
3802 else 3817 else
3803 rc->search_start = key.objectid + 3818 rc->search_start = key.objectid +
3804 rc->extent_root->leafsize; 3819 rc->extent_root->nodesize;
3805 memcpy(extent_key, &key, sizeof(key)); 3820 memcpy(extent_key, &key, sizeof(key));
3806 return 0; 3821 return 0;
3807 } 3822 }
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4096 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4111 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4097 BTRFS_INODE_PREALLOC); 4112 BTRFS_INODE_PREALLOC);
4098 btrfs_mark_buffer_dirty(leaf); 4113 btrfs_mark_buffer_dirty(leaf);
4099 btrfs_release_path(path);
4100out: 4114out:
4101 btrfs_free_path(path); 4115 btrfs_free_path(path);
4102 return ret; 4116 return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..efa083113827 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
137 int pages_per_rd_bio; 137 int pages_per_rd_bio;
138 u32 sectorsize; 138 u32 sectorsize;
139 u32 nodesize; 139 u32 nodesize;
140 u32 leafsize;
141 140
142 int is_dev_replace; 141 int is_dev_replace;
143 struct scrub_wr_ctx wr_ctx; 142 struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
178struct scrub_warning { 177struct scrub_warning {
179 struct btrfs_path *path; 178 struct btrfs_path *path;
180 u64 extent_item_size; 179 u64 extent_item_size;
181 char *scratch_buf;
182 char *msg_buf;
183 const char *errstr; 180 const char *errstr;
184 sector_t sector; 181 sector_t sector;
185 u64 logical; 182 u64 logical;
186 struct btrfs_device *dev; 183 struct btrfs_device *dev;
187 int msg_bufsize;
188 int scratch_bufsize;
189}; 184};
190 185
191
192static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 186static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
193static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 187static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
194static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 188static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -428,8 +422,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
428 sbio->index = i; 422 sbio->index = i;
429 sbio->sctx = sctx; 423 sbio->sctx = sctx;
430 sbio->page_count = 0; 424 sbio->page_count = 0;
431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, 425 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
432 NULL, NULL); 426 scrub_bio_end_io_worker, NULL, NULL);
433 427
434 if (i != SCRUB_BIOS_PER_SCTX - 1) 428 if (i != SCRUB_BIOS_PER_SCTX - 1)
435 sctx->bios[i]->next_free = i + 1; 429 sctx->bios[i]->next_free = i + 1;
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
438 } 432 }
439 sctx->first_free = 0; 433 sctx->first_free = 0;
440 sctx->nodesize = dev->dev_root->nodesize; 434 sctx->nodesize = dev->dev_root->nodesize;
441 sctx->leafsize = dev->dev_root->leafsize;
442 sctx->sectorsize = dev->dev_root->sectorsize; 435 sctx->sectorsize = dev->dev_root->sectorsize;
443 atomic_set(&sctx->bios_in_flight, 0); 436 atomic_set(&sctx->bios_in_flight, 0);
444 atomic_set(&sctx->workers_pending, 0); 437 atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
553 u64 ref_root; 546 u64 ref_root;
554 u32 item_size; 547 u32 item_size;
555 u8 ref_level; 548 u8 ref_level;
556 const int bufsize = 4096;
557 int ret; 549 int ret;
558 550
559 WARN_ON(sblock->page_count < 1); 551 WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
561 fs_info = sblock->sctx->dev_root->fs_info; 553 fs_info = sblock->sctx->dev_root->fs_info;
562 554
563 path = btrfs_alloc_path(); 555 path = btrfs_alloc_path();
556 if (!path)
557 return;
564 558
565 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
566 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
567 swarn.sector = (sblock->pagev[0]->physical) >> 9; 559 swarn.sector = (sblock->pagev[0]->physical) >> 9;
568 swarn.logical = sblock->pagev[0]->logical; 560 swarn.logical = sblock->pagev[0]->logical;
569 swarn.errstr = errstr; 561 swarn.errstr = errstr;
570 swarn.dev = NULL; 562 swarn.dev = NULL;
571 swarn.msg_bufsize = bufsize;
572 swarn.scratch_bufsize = bufsize;
573
574 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
575 goto out;
576 563
577 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 564 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
578 &flags); 565 &flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
613 600
614out: 601out:
615 btrfs_free_path(path); 602 btrfs_free_path(path);
616 kfree(swarn.scratch_buf);
617 kfree(swarn.msg_buf);
618} 603}
619 604
620static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 605static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
681 ret = -EIO; 666 ret = -EIO;
682 goto out; 667 goto out;
683 } 668 }
684 fs_info = BTRFS_I(inode)->root->fs_info; 669 ret = repair_io_failure(inode, offset, PAGE_SIZE,
685 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
686 fixup->logical, page, 670 fixup->logical, page,
671 offset - page_offset(page),
687 fixup->mirror_num); 672 fixup->mirror_num);
688 unlock_page(page); 673 unlock_page(page);
689 corrected = !ret; 674 corrected = !ret;
@@ -999,8 +984,8 @@ nodatasum_case:
999 fixup_nodatasum->root = fs_info->extent_root; 984 fixup_nodatasum->root = fs_info->extent_root;
1000 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 985 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1001 scrub_pending_trans_workers_inc(sctx); 986 scrub_pending_trans_workers_inc(sctx);
1002 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, 987 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1003 NULL, NULL); 988 scrub_fixup_nodatasum, NULL, NULL);
1004 btrfs_queue_work(fs_info->scrub_workers, 989 btrfs_queue_work(fs_info->scrub_workers,
1005 &fixup_nodatasum->work); 990 &fixup_nodatasum->work);
1006 goto out; 991 goto out;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1361 return; 1346 return;
1362} 1347}
1363 1348
1349static inline int scrub_check_fsid(u8 fsid[],
1350 struct scrub_page *spage)
1351{
1352 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1353 int ret;
1354
1355 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1356 return !ret;
1357}
1358
1364static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1359static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1365 struct scrub_block *sblock, 1360 struct scrub_block *sblock,
1366 int is_metadata, int have_csum, 1361 int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1380 h = (struct btrfs_header *)mapped_buffer; 1375 h = (struct btrfs_header *)mapped_buffer;
1381 1376
1382 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || 1377 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1383 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1378 !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1384 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1379 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1385 BTRFS_UUID_SIZE)) { 1380 BTRFS_UUID_SIZE)) {
1386 sblock->header_error = 1; 1381 sblock->header_error = 1;
@@ -1616,7 +1611,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1616 sbio->err = err; 1611 sbio->err = err;
1617 sbio->bio = bio; 1612 sbio->bio = bio;
1618 1613
1619 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); 1614 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1615 scrub_wr_bio_end_io_worker, NULL, NULL);
1620 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1616 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1621} 1617}
1622 1618
@@ -1750,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1750 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) 1746 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1751 ++fail; 1747 ++fail;
1752 1748
1753 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1749 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1754 ++fail; 1750 ++fail;
1755 1751
1756 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1752 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1757 BTRFS_UUID_SIZE)) 1753 BTRFS_UUID_SIZE))
1758 ++fail; 1754 ++fail;
1759 1755
1760 WARN_ON(sctx->nodesize != sctx->leafsize);
1761 len = sctx->nodesize - BTRFS_CSUM_SIZE; 1756 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1762 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1757 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1763 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1758 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1790,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1790{ 1785{
1791 struct btrfs_super_block *s; 1786 struct btrfs_super_block *s;
1792 struct scrub_ctx *sctx = sblock->sctx; 1787 struct scrub_ctx *sctx = sblock->sctx;
1793 struct btrfs_root *root = sctx->dev_root;
1794 struct btrfs_fs_info *fs_info = root->fs_info;
1795 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1788 u8 calculated_csum[BTRFS_CSUM_SIZE];
1796 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1789 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1797 struct page *page; 1790 struct page *page;
@@ -1816,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1816 if (sblock->pagev[0]->generation != btrfs_super_generation(s)) 1809 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1817 ++fail_gen; 1810 ++fail_gen;
1818 1811
1819 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1812 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1820 ++fail_cor; 1813 ++fail_cor;
1821 1814
1822 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1815 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2195,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2195 sctx->stat.data_bytes_scrubbed += len; 2188 sctx->stat.data_bytes_scrubbed += len;
2196 spin_unlock(&sctx->stat_lock); 2189 spin_unlock(&sctx->stat_lock);
2197 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2190 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2198 WARN_ON(sctx->nodesize != sctx->leafsize);
2199 blocksize = sctx->nodesize; 2191 blocksize = sctx->nodesize;
2200 spin_lock(&sctx->stat_lock); 2192 spin_lock(&sctx->stat_lock);
2201 sctx->stat.tree_extents_scrubbed++; 2193 sctx->stat.tree_extents_scrubbed++;
@@ -2486,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2486 btrfs_item_key_to_cpu(l, &key, slot); 2478 btrfs_item_key_to_cpu(l, &key, slot);
2487 2479
2488 if (key.type == BTRFS_METADATA_ITEM_KEY) 2480 if (key.type == BTRFS_METADATA_ITEM_KEY)
2489 bytes = root->leafsize; 2481 bytes = root->nodesize;
2490 else 2482 else
2491 bytes = key.offset; 2483 bytes = key.offset;
2492 2484
@@ -2713,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2713 if (found_key.objectid != scrub_dev->devid) 2705 if (found_key.objectid != scrub_dev->devid)
2714 break; 2706 break;
2715 2707
2716 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2708 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2717 break; 2709 break;
2718 2710
2719 if (found_key.offset >= end) 2711 if (found_key.offset >= end)
@@ -2827,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2827 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 2819 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2828 return -EIO; 2820 return -EIO;
2829 2821
2830	gen = root->fs_info->last_trans_committed;	2822	/* Seed devices of a new filesystem have their own generation. */
2823 if (scrub_dev->fs_devices != root->fs_info->fs_devices)
2824 gen = scrub_dev->generation;
2825 else
2826 gen = root->fs_info->last_trans_committed;
2831 2827
2832 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2828 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2833 bytenr = btrfs_sb_offset(i); 2829 bytenr = btrfs_sb_offset(i);
2834 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2830 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2831 scrub_dev->commit_total_bytes)
2835 break; 2832 break;
2836 2833
2837 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2834 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2904,21 +2901,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2904 struct scrub_ctx *sctx; 2901 struct scrub_ctx *sctx;
2905 int ret; 2902 int ret;
2906 struct btrfs_device *dev; 2903 struct btrfs_device *dev;
2904 struct rcu_string *name;
2907 2905
2908 if (btrfs_fs_closing(fs_info)) 2906 if (btrfs_fs_closing(fs_info))
2909 return -EINVAL; 2907 return -EINVAL;
2910 2908
2911 /*
2912 * check some assumptions
2913 */
2914 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2915 btrfs_err(fs_info,
2916 "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2917 fs_info->chunk_root->nodesize,
2918 fs_info->chunk_root->leafsize);
2919 return -EINVAL;
2920 }
2921
2922 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2909 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2923 /* 2910 /*
2924 * in this case scrub is unable to calculate the checksum 2911 * in this case scrub is unable to calculate the checksum
@@ -2965,6 +2952,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2965 return -ENODEV; 2952 return -ENODEV;
2966 } 2953 }
2967 2954
2955 if (!is_dev_replace && !readonly && !dev->writeable) {
2956 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2957 rcu_read_lock();
2958 name = rcu_dereference(dev->name);
2959 btrfs_err(fs_info, "scrub: device %s is not writable",
2960 name->str);
2961 rcu_read_unlock();
2962 return -EROFS;
2963 }
2964
2968 mutex_lock(&fs_info->scrub_lock); 2965 mutex_lock(&fs_info->scrub_lock);
2969 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2966 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2970 mutex_unlock(&fs_info->scrub_lock); 2967 mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3200,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3203 nocow_ctx->len = len; 3200 nocow_ctx->len = len;
3204 nocow_ctx->mirror_num = mirror_num; 3201 nocow_ctx->mirror_num = mirror_num;
3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3202 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); 3203 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3204 copy_nocow_pages_worker, NULL, NULL);
3207 INIT_LIST_HEAD(&nocow_ctx->inodes); 3205 INIT_LIST_HEAD(&nocow_ctx->inodes);
3208 btrfs_queue_work(fs_info->scrub_nocow_workers, 3206 btrfs_queue_work(fs_info->scrub_nocow_workers,
3209 &nocow_ctx->work); 3207 &nocow_ctx->work);
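
The scrub_supers() hunk above hinges on one observation: a seed device that was sprouted into a new filesystem keeps the generation of its original filesystem, so its superblocks must not be validated against the new filesystem's last committed transaction. A minimal sketch of that selection logic, using scrub_super_generation() as a hypothetical helper name (the patch open-codes it):

static u64 scrub_super_generation(struct btrfs_device *scrub_dev,
				  struct btrfs_fs_info *fs_info)
{
	/* A device from a foreign fs_devices list is a seed device. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		return scrub_dev->generation;
	return fs_info->last_trans_committed;
}
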
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa662181..874828dd0a86 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
515 set_fs(KERNEL_DS); 515 set_fs(KERNEL_DS);
516 516
517 while (pos < len) { 517 while (pos < len) {
518 ret = vfs_write(filp, (char *)buf + pos, len - pos, off); 518 ret = vfs_write(filp, (__force const char __user *)buf + pos,
519 len - pos, off);
519 /* TODO handle that correctly */ 520 /* TODO handle that correctly */
520 /*if (ret == -ERESTARTSYS) { 521 /*if (ret == -ERESTARTSYS) {
521 continue; 522 continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
985 int num; 986 int num;
986 u8 type; 987 u8 type;
987 988
988 if (found_key->type == BTRFS_XATTR_ITEM_KEY) 989 /*
989 buf_len = BTRFS_MAX_XATTR_SIZE(root); 990 * Start with a small buffer (1 page). If later we end up needing more
990 else 991 * space, which can happen for xattrs on a fs with a leaf size greater
991	buf_len = PATH_MAX;	992	 * than the page size, attempt to increase the buffer. Typically xattr
992 993 * values are small.
994 */
995 buf_len = PATH_MAX;
993 buf = kmalloc(buf_len, GFP_NOFS); 996 buf = kmalloc(buf_len, GFP_NOFS);
994 if (!buf) { 997 if (!buf) {
995 ret = -ENOMEM; 998 ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1016 ret = -ENAMETOOLONG; 1019 ret = -ENAMETOOLONG;
1017 goto out; 1020 goto out;
1018 } 1021 }
1019 if (name_len + data_len > buf_len) { 1022 if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
1020 ret = -E2BIG; 1023 ret = -E2BIG;
1021 goto out; 1024 goto out;
1022 } 1025 }
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1024 /* 1027 /*
1025 * Path too long 1028 * Path too long
1026 */ 1029 */
1027 if (name_len + data_len > buf_len) { 1030 if (name_len + data_len > PATH_MAX) {
1028 ret = -ENAMETOOLONG; 1031 ret = -ENAMETOOLONG;
1029 goto out; 1032 goto out;
1030 } 1033 }
1031 } 1034 }
1032 1035
1036 if (name_len + data_len > buf_len) {
1037 buf_len = name_len + data_len;
1038 if (is_vmalloc_addr(buf)) {
1039 vfree(buf);
1040 buf = NULL;
1041 } else {
1042 char *tmp = krealloc(buf, buf_len,
1043 GFP_NOFS | __GFP_NOWARN);
1044
1045 if (!tmp)
1046 kfree(buf);
1047 buf = tmp;
1048 }
1049 if (!buf) {
1050 buf = vmalloc(buf_len);
1051 if (!buf) {
1052 ret = -ENOMEM;
1053 goto out;
1054 }
1055 }
1056 }
1057
1033 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1058 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1034 name_len + data_len); 1059 name_len + data_len);
1035 1060
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1050 } 1075 }
1051 1076
1052out: 1077out:
1053 kfree(buf); 1078 kvfree(buf);
1054 return ret; 1079 return ret;
1055} 1080}
1056 1081
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3302 if (ret < 0 && ret != -ENOENT) { 3327 if (ret < 0 && ret != -ENOENT) {
3303 goto out; 3328 goto out;
3304 } else if (ret == -ENOENT) { 3329 } else if (ret == -ENOENT) {
3305 ret = 1; 3330 ret = 0;
3306 break; 3331 break;
3307 } 3332 }
3308 3333
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5703 NULL); 5728 NULL);
5704 sort_clone_roots = 1; 5729 sort_clone_roots = 1;
5705 5730
5706 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; 5731 current->journal_info = BTRFS_SEND_TRANS_STUB;
5707 ret = send_subvol(sctx); 5732 ret = send_subvol(sctx);
5708 current->journal_info = NULL; 5733 current->journal_info = NULL;
5709 if (ret < 0) 5734 if (ret < 0)
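
The iterate_dir_item() changes above follow a common kernel allocation pattern: grow a kmalloc'ed buffer in place with krealloc(), fall back to vmalloc() when the slab allocator cannot satisfy the request, and free with kvfree() either way. A condensed sketch of just that pattern, assuming a hypothetical grow_buf() helper (not a function in fs/btrfs/send.c):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *grow_buf(void *buf, size_t new_len)
{
	if (is_vmalloc_addr(buf)) {
		/* krealloc() cannot resize a vmalloc'ed buffer. */
		vfree(buf);
		buf = NULL;
	} else {
		/* __GFP_NOWARN: an allocation failure here is handled. */
		void *tmp = krealloc(buf, new_len, GFP_NOFS | __GFP_NOWARN);

		if (!tmp)
			kfree(buf);
		buf = tmp;
	}
	if (!buf)
		buf = vmalloc(new_len);
	return buf;	/* the caller frees the result with kvfree() */
}
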
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c4124de4435b..a2b97ef10317 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
60#include "backref.h" 60#include "backref.h"
61#include "tests/btrfs-tests.h" 61#include "tests/btrfs-tests.h"
62 62
63#include "qgroup.h"
63#define CREATE_TRACE_POINTS 64#define CREATE_TRACE_POINTS
64#include <trace/events/btrfs.h> 65#include <trace/events/btrfs.h>
65 66
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
307 308
308static void btrfs_put_super(struct super_block *sb) 309static void btrfs_put_super(struct super_block *sb)
309{ 310{
310 (void)close_ctree(btrfs_sb(sb)->tree_root); 311 close_ctree(btrfs_sb(sb)->tree_root);
311 /* FIXME: need to fix VFS to return error? */
312 /* AV: return it _where_? ->put_super() can be triggered by any number
313 * of async events, up to and including delivery of SIGKILL to the
314 * last process that kept it busy. Or segfault in the aforementioned
315 * process... Whom would you report that to?
316 */
317} 312}
318 313
319enum { 314enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
400 int ret = 0; 395 int ret = 0;
401 char *compress_type; 396 char *compress_type;
402 bool compress_force = false; 397 bool compress_force = false;
403 bool compress = false;
404 398
405 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 399 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
406 if (cache_gen) 400 if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
478 /* Fallthrough */ 472 /* Fallthrough */
479 case Opt_compress: 473 case Opt_compress:
480 case Opt_compress_type: 474 case Opt_compress_type:
481 compress = true;
482 if (token == Opt_compress || 475 if (token == Opt_compress ||
483 token == Opt_compress_force || 476 token == Opt_compress_force ||
484 strcmp(args[0].from, "zlib") == 0) { 477 strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
508 btrfs_set_and_info(root, FORCE_COMPRESS, 501 btrfs_set_and_info(root, FORCE_COMPRESS,
509 "force %s compression", 502 "force %s compression",
510 compress_type); 503 compress_type);
511 } else if (compress) { 504 } else {
512 if (!btrfs_test_opt(root, COMPRESS)) 505 if (!btrfs_test_opt(root, COMPRESS))
513 btrfs_info(root->fs_info, 506 btrfs_info(root->fs_info,
514 "btrfs: use %s compression", 507 "btrfs: use %s compression",
515 compress_type); 508 compress_type);
509 /*
510 * If we remount from compress-force=xxx to
511	 * compress=xxx, we need to clear the FORCE_COMPRESS
512	 * flag; otherwise there is no way for users
513 * to disable forcible compression separately.
514 */
515 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
516 } 516 }
517 break; 517 break;
518 case Opt_ssd: 518 case Opt_ssd:
@@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1014 seq_puts(seq, ",nodatacow"); 1014 seq_puts(seq, ",nodatacow");
1015 if (btrfs_test_opt(root, NOBARRIER)) 1015 if (btrfs_test_opt(root, NOBARRIER))
1016 seq_puts(seq, ",nobarrier"); 1016 seq_puts(seq, ",nobarrier");
1017 if (info->max_inline != 8192 * 1024) 1017 if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
1018 seq_printf(seq, ",max_inline=%llu", info->max_inline); 1018 seq_printf(seq, ",max_inline=%llu", info->max_inline);
1019 if (info->alloc_start != 0) 1019 if (info->alloc_start != 0)
1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start); 1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1215,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1215 return root; 1215 return root;
1216} 1216}
1217 1217
1218static int parse_security_options(char *orig_opts,
1219 struct security_mnt_opts *sec_opts)
1220{
1221 char *secdata = NULL;
1222 int ret = 0;
1223
1224 secdata = alloc_secdata();
1225 if (!secdata)
1226 return -ENOMEM;
1227 ret = security_sb_copy_data(orig_opts, secdata);
1228 if (ret) {
1229 free_secdata(secdata);
1230 return ret;
1231 }
1232 ret = security_sb_parse_opts_str(secdata, sec_opts);
1233 free_secdata(secdata);
1234 return ret;
1235}
1236
1237static int setup_security_options(struct btrfs_fs_info *fs_info,
1238 struct super_block *sb,
1239 struct security_mnt_opts *sec_opts)
1240{
1241 int ret = 0;
1242
1243 /*
1244	 * Call security_sb_set_mnt_opts() to check whether the new sec_opts
1245	 * are valid.
1246 */
1247 ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
1248 if (ret)
1249 return ret;
1250
1251#ifdef CONFIG_SECURITY
1252 if (!fs_info->security_opts.num_mnt_opts) {
1253 /* first time security setup, copy sec_opts to fs_info */
1254 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
1255 } else {
1256 /*
1257	 * Since SELinux (the only LSM that supports security_mnt_opts)
1258	 * does NOT support changing the context during a remount/mount
1259	 * of the same sb, these must be the same or a subset of the
1260	 * existing security options, so just free them.
1261 */
1262 security_free_mnt_opts(sec_opts);
1263 }
1264#endif
1265 return ret;
1266}
1267
1218/* 1268/*
1219 * Find a superblock for the given device / mount point. 1269 * Find a superblock for the given device / mount point.
1220 * 1270 *
@@ -1229,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1229 struct dentry *root; 1279 struct dentry *root;
1230 struct btrfs_fs_devices *fs_devices = NULL; 1280 struct btrfs_fs_devices *fs_devices = NULL;
1231 struct btrfs_fs_info *fs_info = NULL; 1281 struct btrfs_fs_info *fs_info = NULL;
1282 struct security_mnt_opts new_sec_opts;
1232 fmode_t mode = FMODE_READ; 1283 fmode_t mode = FMODE_READ;
1233 char *subvol_name = NULL; 1284 char *subvol_name = NULL;
1234 u64 subvol_objectid = 0; 1285 u64 subvol_objectid = 0;
@@ -1251,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1251 return root; 1302 return root;
1252 } 1303 }
1253 1304
1305 security_init_mnt_opts(&new_sec_opts);
1306 if (data) {
1307 error = parse_security_options(data, &new_sec_opts);
1308 if (error)
1309 return ERR_PTR(error);
1310 }
1311
1254 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 1312 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
1255 if (error) 1313 if (error)
1256 return ERR_PTR(error); 1314 goto error_sec_opts;
1257 1315
1258 /* 1316 /*
1259 * Setup a dummy root and fs_info for test/set super. This is because 1317 * Setup a dummy root and fs_info for test/set super. This is because
@@ -1262,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1262 * then open_ctree will properly initialize everything later. 1320 * then open_ctree will properly initialize everything later.
1263 */ 1321 */
1264 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 1322 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
1265 if (!fs_info) 1323 if (!fs_info) {
1266 return ERR_PTR(-ENOMEM); 1324 error = -ENOMEM;
1325 goto error_sec_opts;
1326 }
1267 1327
1268 fs_info->fs_devices = fs_devices; 1328 fs_info->fs_devices = fs_devices;
1269 1329
1270 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1330 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1271 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1331 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1332 security_init_mnt_opts(&fs_info->security_opts);
1272 if (!fs_info->super_copy || !fs_info->super_for_commit) { 1333 if (!fs_info->super_copy || !fs_info->super_for_commit) {
1273 error = -ENOMEM; 1334 error = -ENOMEM;
1274 goto error_fs_info; 1335 goto error_fs_info;
@@ -1306,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1306 } 1367 }
1307 1368
1308 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); 1369 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
1309 if (IS_ERR(root)) 1370 if (IS_ERR(root)) {
1371 deactivate_locked_super(s);
1372 error = PTR_ERR(root);
1373 goto error_sec_opts;
1374 }
1375
1376 fs_info = btrfs_sb(s);
1377 error = setup_security_options(fs_info, s, &new_sec_opts);
1378 if (error) {
1379 dput(root);
1310 deactivate_locked_super(s); 1380 deactivate_locked_super(s);
1381 goto error_sec_opts;
1382 }
1311 1383
1312 return root; 1384 return root;
1313 1385
@@ -1315,6 +1387,8 @@ error_close_devices:
1315 btrfs_close_devices(fs_devices); 1387 btrfs_close_devices(fs_devices);
1316error_fs_info: 1388error_fs_info:
1317 free_fs_info(fs_info); 1389 free_fs_info(fs_info);
1390error_sec_opts:
1391 security_free_mnt_opts(&new_sec_opts);
1318 return ERR_PTR(error); 1392 return ERR_PTR(error);
1319} 1393}
1320 1394
@@ -1396,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1396 sync_filesystem(sb); 1470 sync_filesystem(sb);
1397 btrfs_remount_prepare(fs_info); 1471 btrfs_remount_prepare(fs_info);
1398 1472
1473 if (data) {
1474 struct security_mnt_opts new_sec_opts;
1475
1476 security_init_mnt_opts(&new_sec_opts);
1477 ret = parse_security_options(data, &new_sec_opts);
1478 if (ret)
1479 goto restore;
1480 ret = setup_security_options(fs_info, sb,
1481 &new_sec_opts);
1482 if (ret) {
1483 security_free_mnt_opts(&new_sec_opts);
1484 goto restore;
1485 }
1486 }
1487
1399 ret = btrfs_parse_options(root, data); 1488 ret = btrfs_parse_options(root, data);
1400 if (ret) { 1489 if (ret) {
1401 ret = -EINVAL; 1490 ret = -EINVAL;
@@ -1694,7 +1783,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1694 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 1783 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1695 int ret; 1784 int ret;
1696 1785
1697	/* holding chunk_mutex to avoid allocating new chunks */	1786	/*
	1787	 * holding chunk_mutex to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed
1789 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1698 mutex_lock(&fs_info->chunk_mutex); 1791 mutex_lock(&fs_info->chunk_mutex);
1699 rcu_read_lock(); 1792 rcu_read_lock();
1700 list_for_each_entry_rcu(found, head, list) { 1793 list_for_each_entry_rcu(found, head, list) {
@@ -1735,11 +1828,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1735 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1736 if (ret) { 1829 if (ret) {
1737 mutex_unlock(&fs_info->chunk_mutex); 1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1738 return ret; 1832 return ret;
1739 } 1833 }
1740 buf->f_bavail += div_u64(total_free_data, factor); 1834 buf->f_bavail += div_u64(total_free_data, factor);
1741 buf->f_bavail = buf->f_bavail >> bits; 1835 buf->f_bavail = buf->f_bavail >> bits;
1742 mutex_unlock(&fs_info->chunk_mutex); 1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1743 1838
1744 buf->f_type = BTRFS_SUPER_MAGIC; 1839 buf->f_type = BTRFS_SUPER_MAGIC;
1745 buf->f_bsize = dentry->d_sb->s_blocksize; 1840 buf->f_bsize = dentry->d_sb->s_blocksize;
@@ -1769,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
1769 .name = "btrfs", 1864 .name = "btrfs",
1770 .mount = btrfs_mount, 1865 .mount = btrfs_mount,
1771 .kill_sb = btrfs_kill_super, 1866 .kill_sb = btrfs_kill_super,
1772 .fs_flags = FS_REQUIRES_DEV, 1867 .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
1773}; 1868};
1774MODULE_ALIAS_FS("btrfs"); 1869MODULE_ALIAS_FS("btrfs");
1775 1870
@@ -1993,11 +2088,15 @@ static int __init init_btrfs_fs(void)
1993 2088
1994 err = btrfs_prelim_ref_init(); 2089 err = btrfs_prelim_ref_init();
1995 if (err) 2090 if (err)
2091 goto free_delayed_ref;
2092
2093 err = btrfs_end_io_wq_init();
2094 if (err)
1996 goto free_prelim_ref; 2095 goto free_prelim_ref;
1997 2096
1998 err = btrfs_interface_init(); 2097 err = btrfs_interface_init();
1999 if (err) 2098 if (err)
2000 goto free_delayed_ref; 2099 goto free_end_io_wq;
2001 2100
2002 btrfs_init_lockdep(); 2101 btrfs_init_lockdep();
2003 2102
@@ -2015,6 +2114,8 @@ static int __init init_btrfs_fs(void)
2015 2114
2016unregister_ioctl: 2115unregister_ioctl:
2017 btrfs_interface_exit(); 2116 btrfs_interface_exit();
2117free_end_io_wq:
2118 btrfs_end_io_wq_exit();
2018free_prelim_ref: 2119free_prelim_ref:
2019 btrfs_prelim_ref_exit(); 2120 btrfs_prelim_ref_exit();
2020free_delayed_ref: 2121free_delayed_ref:
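
The init_btrfs_fs() hunk above slots btrfs_end_io_wq_init() into the module's goto-based unwind chain: each init step that fails jumps to the label that tears down everything initialized before it, in reverse order, so adding a step means adding exactly one call site and one label. A schematic illustration with hypothetical step_*() stand-ins (not the real btrfs init functions):

static int step_a_init(void) { return 0; }
static void step_a_exit(void) { }
static int step_b_init(void) { return 0; }
static void step_b_exit(void) { }
static int step_c_init(void) { return 0; }

static int example_init(void)
{
	int err;

	err = step_a_init();
	if (err)
		return err;
	err = step_b_init();
	if (err)
		goto free_a;
	err = step_c_init();	/* the newly inserted step */
	if (err)
		goto free_b;
	return 0;

free_b:
	step_b_exit();
free_a:
	step_a_exit();
	return err;
}
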
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..b2e7bb4393f6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); 243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
244} 244}
245BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); 245BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
246 246
247static ssize_t global_rsv_reserved_show(struct kobject *kobj, 247static ssize_t global_rsv_reserved_show(struct kobject *kobj,
248 struct kobj_attribute *a, char *buf) 248 struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); 252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
253} 253}
254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); 254BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
255 255
256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) 257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
306 struct btrfs_space_info *sinfo = to_space_info(kobj); \ 306 struct btrfs_space_info *sinfo = to_space_info(kobj); \
307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ 307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
308} \ 308} \
309BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) 309BTRFS_ATTR(field, btrfs_space_info_show_##field)
310 310
311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
312 struct kobj_attribute *a, 312 struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
325SPACE_INFO_ATTR(bytes_may_use); 325SPACE_INFO_ATTR(bytes_may_use);
326SPACE_INFO_ATTR(disk_used); 326SPACE_INFO_ATTR(disk_used);
327SPACE_INFO_ATTR(disk_total); 327SPACE_INFO_ATTR(disk_total);
328BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); 328BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
329 329
330static struct attribute *space_info_attrs[] = { 330static struct attribute *space_info_attrs[] = {
331 BTRFS_ATTR_PTR(flags), 331 BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
363 struct kobj_attribute *a, char *buf) 363 struct kobj_attribute *a, char *buf)
364{ 364{
365 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 365 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
366 return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); 366 char *label = fs_info->super_copy->label;
367 return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
367} 368}
368 369
369static ssize_t btrfs_label_store(struct kobject *kobj, 370static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_trans_handle *trans; 375 struct btrfs_trans_handle *trans;
375 struct btrfs_root *root = fs_info->fs_root; 376 struct btrfs_root *root = fs_info->fs_root;
376 int ret; 377 int ret;
378 size_t p_len;
377 379
378 if (len >= BTRFS_LABEL_SIZE) 380 if (fs_info->sb->s_flags & MS_RDONLY)
381 return -EROFS;
382
383 /*
384	 * p_len is the length until the first occurrence of either
385 * '\n' or '\0'
386 */
387 p_len = strcspn(buf, "\n");
388
389 if (p_len >= BTRFS_LABEL_SIZE)
379 return -EINVAL; 390 return -EINVAL;
380 391
381 trans = btrfs_start_transaction(root, 0); 392 trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
383 return PTR_ERR(trans); 394 return PTR_ERR(trans);
384 395
385 spin_lock(&root->fs_info->super_lock); 396 spin_lock(&root->fs_info->super_lock);
386 strcpy(fs_info->super_copy->label, buf); 397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len);
387 spin_unlock(&root->fs_info->super_lock); 399 spin_unlock(&root->fs_info->super_lock);
388 ret = btrfs_commit_transaction(trans, root); 400 ret = btrfs_commit_transaction(trans, root);
389 401
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
392 404
393 return ret; 405 return ret;
394} 406}
395BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); 407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
396
397static ssize_t btrfs_no_store(struct kobject *kobj,
398 struct kobj_attribute *a,
399 const char *buf, size_t len)
400{
401 return -EPERM;
402}
403 408
404static ssize_t btrfs_nodesize_show(struct kobject *kobj, 409static ssize_t btrfs_nodesize_show(struct kobject *kobj,
405 struct kobj_attribute *a, char *buf) 410 struct kobj_attribute *a, char *buf)
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
409 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); 414 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
410} 415}
411 416
412BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); 417BTRFS_ATTR(nodesize, btrfs_nodesize_show);
413 418
414static ssize_t btrfs_sectorsize_show(struct kobject *kobj, 419static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
415 struct kobj_attribute *a, char *buf) 420 struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
419 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 424 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
420} 425}
421 426
422BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); 427BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
423 428
424static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, 429static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
425 struct kobj_attribute *a, char *buf) 430 struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
429 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 434 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
430} 435}
431 436
432BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); 437BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
433 438
434static struct attribute *btrfs_attrs[] = { 439static struct attribute *btrfs_attrs[] = {
435 BTRFS_ATTR_PTR(label), 440 BTRFS_ATTR_PTR(label),
@@ -614,7 +619,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
614 if (!fs_info->device_dir_kobj) 619 if (!fs_info->device_dir_kobj)
615 return -EINVAL; 620 return -EINVAL;
616 621
617 if (one_device) { 622 if (one_device && one_device->bdev) {
618 disk = one_device->bdev->bd_part; 623 disk = one_device->bdev->bd_part;
619 disk_kobj = &part_to_dev(disk)->kobj; 624 disk_kobj = &part_to_dev(disk)->kobj;
620 625
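
Two details carry the btrfs_label_store() rework above: strcspn(buf, "\n") makes a trailing newline from `echo` count as the end of the label, and the memset() before the memcpy() ensures no bytes of a previous, longer label survive in super_copy->label. The same logic reduced to standalone C (LABEL_SIZE and set_label() are illustrative stand-ins, not kernel symbols):

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256			/* stand-in for BTRFS_LABEL_SIZE */

static int set_label(char label[LABEL_SIZE], const char *buf)
{
	size_t p_len = strcspn(buf, "\n");	/* stops at '\n' or '\0' */

	if (p_len >= LABEL_SIZE)
		return -1;			/* -EINVAL in the kernel */
	memset(label, 0, LABEL_SIZE);		/* wipe any older, longer label */
	memcpy(label, buf, p_len);
	return 0;
}

int main(void)
{
	char label[LABEL_SIZE];

	set_label(label, "mylabel\n");	/* as written by: echo mylabel > .../label */
	printf("%s\n", label);		/* prints: mylabel */
	return 0;
}
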
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df37504c..f7dd298b3cf6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
20 .store = _store, \ 20 .store = _store, \
21} 21}
22 22
23#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ 23#define BTRFS_ATTR_RW(_name, _show, _store) \
24static struct kobj_attribute btrfs_attr_##_name = \ 24 static struct kobj_attribute btrfs_attr_##_name = \
25 __INIT_KOBJ_ATTR(_name, _mode, _show, _store) 25 __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
26#define BTRFS_ATTR(_name, _mode, _show) \ 26
27 BTRFS_ATTR_RW(_name, _mode, _show, NULL) 27#define BTRFS_ATTR(_name, _show) \
28 static struct kobj_attribute btrfs_attr_##_name = \
29 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
30
28#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) 31#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
29 32
30#define BTRFS_RAID_ATTR(_name, _show) \ 33#define BTRFS_RAID_ATTR(_name, _show) \
31static struct kobj_attribute btrfs_raid_attr_##_name = \ 34 static struct kobj_attribute btrfs_raid_attr_##_name = \
32 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 35 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
36
33#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) 37#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
34 38
35 39
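
With the reworked macros, the mode is no longer a caller-supplied argument: BTRFS_ATTR() always produces a 0444 attribute with a NULL ->store, and BTRFS_ATTR_RW() always produces 0644, which is what let sysfs.c drop the btrfs_no_store() stub. A hedged expansion example (example_show() is hypothetical):

static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *a, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", 42);
}
BTRFS_ATTR(example, example_show);
/*
 * The macro above expands to roughly:
 * static struct kobj_attribute btrfs_attr_example =
 *	__INIT_KOBJ_ATTR(example, 0444, example_show, NULL);
 */
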
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf84c69..2299bfde39ee 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
40 cache->key.offset = 1024 * 1024 * 1024; 40 cache->key.offset = 1024 * 1024 * 1024;
41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
42 cache->sectorsize = 4096; 42 cache->sectorsize = 4096;
43 cache->full_stripe_len = 4096;
43 44
44 spin_lock_init(&cache->lock); 45 spin_lock_init(&cache->lock);
45 INIT_LIST_HEAD(&cache->list); 46 INIT_LIST_HEAD(&cache->list);
46 INIT_LIST_HEAD(&cache->cluster_list); 47 INIT_LIST_HEAD(&cache->cluster_list);
47 INIT_LIST_HEAD(&cache->new_bg_list); 48 INIT_LIST_HEAD(&cache->bg_list);
48 49
49 btrfs_init_free_space_ctl(cache); 50 btrfs_init_free_space_ctl(cache);
50 51
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
364 return 0; 365 return 0;
365} 366}
366 367
368/* Used by test_steal_space_from_bitmap_to_extent(). */
369static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
370 struct btrfs_free_space *info)
371{
372 return ctl->free_extents > 0;
373}
374
375/* Used by test_steal_space_from_bitmap_to_extent(). */
376static int
377check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
378 const int num_extents,
379 const int num_bitmaps)
380{
381 if (cache->free_space_ctl->free_extents != num_extents) {
382 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
383 cache->free_space_ctl->free_extents, num_extents);
384 return -EINVAL;
385 }
386 if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
387 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
388 cache->free_space_ctl->total_bitmaps, num_bitmaps);
389 return -EINVAL;
390 }
391 return 0;
392}
393
394/* Used by test_steal_space_from_bitmap_to_extent(). */
395static int check_cache_empty(struct btrfs_block_group_cache *cache)
396{
397 u64 offset;
398 u64 max_extent_size;
399
400 /*
401	 * Now let's confirm that there's absolutely no free space left to
402 * allocate.
403 */
404 if (cache->free_space_ctl->free_space != 0) {
405 test_msg("Cache free space is not 0\n");
406 return -EINVAL;
407 }
408
409 /* And any allocation request, no matter how small, should fail now. */
410 offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
411 &max_extent_size);
412 if (offset != 0) {
413 test_msg("Space allocation did not fail, returned offset: %llu",
414 offset);
415 return -EINVAL;
416 }
417
418	/* And no extent or bitmap entries in the cache anymore. */
419 return check_num_extents_and_bitmaps(cache, 0, 0);
420}
421
422/*
423 * Before we were able to steal free space from a bitmap entry to an extent
424 * entry, we could end up with 2 entries representing a contiguous free space.
425 * One would be an extent entry and the other a bitmap entry. Since in order
426 * to allocate space to a caller we use only 1 entry, we couldn't return that
427 * whole range to the caller if it was requested. This forced the caller to
428 * either assume ENOSPC or perform several smaller space allocations, which
429 * wasn't optimal as they could be spread all over the block group while under
430 * concurrency (extra overhead and fragmentation).
431 *
432	 * This stealing approach is beneficial, since we always prefer to allocate from
433 * extent entries, both for clustered and non-clustered allocation requests.
434 */
435static int
436test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
437{
438 int ret;
439 u64 offset;
440 u64 max_extent_size;
441
442 bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
443 struct btrfs_free_space *);
444
445 test_msg("Running space stealing from bitmap to extent\n");
446
447 /*
448 * For this test, we want to ensure we end up with an extent entry
449 * immediately adjacent to a bitmap entry, where the bitmap starts
450 * at an offset where the extent entry ends. We keep adding and
451	 * removing free space to reach this state, but to get there
452 * we need to reach a point where marking new free space doesn't
453 * result in adding new extent entries or merging the new space
454 * with existing extent entries - the space ends up being marked
455 * in an existing bitmap that covers the new free space range.
456 *
457	 * To get there, we need to reach the threshold set at
458	 * cache->free_space_ctl->extents_thresh, which currently is
459	 * 256 extents on an x86_64 system at least, and a few other
460 * conditions (check free_space_cache.c). Instead of making the
461 * test much longer and complicated, use a "use_bitmap" operation
462 * that forces use of bitmaps as soon as we have at least 1
463 * extent entry.
464 */
465 use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
466 cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
467
468 /*
469 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
470 */
471 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
472 128 * 1024, 0);
473 if (ret) {
474 test_msg("Couldn't add extent entry %d\n", ret);
475 return ret;
476 }
477
478 /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
479 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
480 128 * 1024 * 1024 - 512 * 1024, 1);
481 if (ret) {
482 test_msg("Couldn't add bitmap entry %d\n", ret);
483 return ret;
484 }
485
486 ret = check_num_extents_and_bitmaps(cache, 2, 1);
487 if (ret)
488 return ret;
489
490 /*
491 * Now make only the first 256Kb of the bitmap marked as free, so that
492 * we end up with only the following ranges marked as free space:
493 *
494 * [128Mb - 256Kb, 128Mb - 128Kb[
495 * [128Mb + 512Kb, 128Mb + 768Kb[
496 */
497 ret = btrfs_remove_free_space(cache,
498 128 * 1024 * 1024 + 768 * 1024,
499 128 * 1024 * 1024 - 768 * 1024);
500 if (ret) {
501 test_msg("Failed to free part of bitmap space %d\n", ret);
502 return ret;
503 }
504
505 /* Confirm that only those 2 ranges are marked as free. */
506 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
507 128 * 1024)) {
508 test_msg("Free space range missing\n");
509 return -ENOENT;
510 }
511 if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
512 256 * 1024)) {
513 test_msg("Free space range missing\n");
514 return -ENOENT;
515 }
516
517 /*
518 * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
519 * as free anymore.
520 */
521 if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
522 128 * 1024 * 1024 - 768 * 1024)) {
523 test_msg("Bitmap region not removed from space cache\n");
524 return -EINVAL;
525 }
526
527 /*
528 * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
529 * covered by the bitmap, isn't marked as free.
530 */
531 if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
532 256 * 1024)) {
533 test_msg("Invalid bitmap region marked as free\n");
534 return -EINVAL;
535 }
536
537 /*
538 * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
539 * by the bitmap too, isn't marked as free either.
540 */
541 if (test_check_exists(cache, 128 * 1024 * 1024,
542 256 * 1024)) {
543 test_msg("Invalid bitmap region marked as free\n");
544 return -EINVAL;
545 }
546
547 /*
548 * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
549 * lets make sure the free space cache marks it as free in the bitmap,
550 * and doesn't insert a new extent entry to represent this region.
551 */
552 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
553 if (ret) {
554 test_msg("Error adding free space: %d\n", ret);
555 return ret;
556 }
557 /* Confirm the region is marked as free. */
558 if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
559 test_msg("Bitmap region not marked as free\n");
560 return -ENOENT;
561 }
562
563 /*
564 * Confirm that no new extent entries or bitmap entries were added to
565 * the cache after adding that free space region.
566 */
567 ret = check_num_extents_and_bitmaps(cache, 2, 1);
568 if (ret)
569 return ret;
570
571 /*
572 * Now lets add a small free space region to the right of the previous
573 * one, which is not contiguous with it and is part of the bitmap too.
574 * The goal is to test that the bitmap entry space stealing doesn't
575 * steal this space region.
576 */
577 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
578 4096);
579 if (ret) {
580 test_msg("Error adding free space: %d\n", ret);
581 return ret;
582 }
583
584 /*
585 * Confirm that no new extent entries or bitmap entries were added to
586 * the cache after adding that free space region.
587 */
588 ret = check_num_extents_and_bitmaps(cache, 2, 1);
589 if (ret)
590 return ret;
591
592 /*
593 * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
594 * expand the range covered by the existing extent entry that represents
595 * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
596 */
597 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
598 128 * 1024);
599 if (ret) {
600 test_msg("Error adding free space: %d\n", ret);
601 return ret;
602 }
603 /* Confirm the region is marked as free. */
604 if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
605 128 * 1024)) {
606 test_msg("Extent region not marked as free\n");
607 return -ENOENT;
608 }
609
610 /*
611	 * Confirm that our extent entry didn't steal all free space from the
612 * bitmap, because of the small 4Kb free space region.
613 */
614 ret = check_num_extents_and_bitmaps(cache, 2, 1);
615 if (ret)
616 return ret;
617
618 /*
619 * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
620 * space. Without stealing bitmap free space into extent entry space,
621 * we would have all this free space represented by 2 entries in the
622 * cache:
623 *
624 * extent entry covering range: [128Mb - 256Kb, 128Mb[
625 * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
626 *
627 * Attempting to allocate the whole free space (1Mb) would fail, because
628 * we can't allocate from multiple entries.
629 * With the bitmap free space stealing, we get a single extent entry
630 * that represents the 1Mb free space, and therefore we're able to
631 * allocate the whole free space at once.
632 */
633 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
634 1 * 1024 * 1024)) {
635 test_msg("Expected region not marked as free\n");
636 return -ENOENT;
637 }
638
639 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
640 test_msg("Cache free space is not 1Mb + 4Kb\n");
641 return -EINVAL;
642 }
643
644 offset = btrfs_find_space_for_alloc(cache,
645 0, 1 * 1024 * 1024, 0,
646 &max_extent_size);
647 if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
648 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
649 offset);
650 return -EINVAL;
651 }
652
653 /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
654 ret = check_num_extents_and_bitmaps(cache, 1, 1);
655 if (ret)
656 return ret;
657
658 if (cache->free_space_ctl->free_space != 4096) {
659 test_msg("Cache free space is not 4Kb\n");
660 return -EINVAL;
661 }
662
663 offset = btrfs_find_space_for_alloc(cache,
664 0, 4096, 0,
665 &max_extent_size);
666 if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
667 test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
668 offset);
669 return -EINVAL;
670 }
671
672 ret = check_cache_empty(cache);
673 if (ret)
674 return ret;
675
676 __btrfs_remove_free_space_cache(cache->free_space_ctl);
677
678 /*
679 * Now test a similar scenario, but where our extent entry is located
680 * to the right of the bitmap entry, so that we can check that stealing
681 * space from a bitmap to the front of an extent entry works.
682 */
683
684 /*
685 * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
686 */
687 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
688 128 * 1024, 0);
689 if (ret) {
690 test_msg("Couldn't add extent entry %d\n", ret);
691 return ret;
692 }
693
694 /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
695 ret = test_add_free_space_entry(cache, 0,
696 128 * 1024 * 1024 - 512 * 1024, 1);
697 if (ret) {
698 test_msg("Couldn't add bitmap entry %d\n", ret);
699 return ret;
700 }
701
702 ret = check_num_extents_and_bitmaps(cache, 2, 1);
703 if (ret)
704 return ret;
705
706 /*
707 * Now make only the last 256Kb of the bitmap marked as free, so that
708 * we end up with only the following ranges marked as free space:
709 *
710	 * [128Mb + 128Kb, 128Mb + 256Kb[
711 * [128Mb - 768Kb, 128Mb - 512Kb[
712 */
713 ret = btrfs_remove_free_space(cache,
714 0,
715 128 * 1024 * 1024 - 768 * 1024);
716 if (ret) {
717 test_msg("Failed to free part of bitmap space %d\n", ret);
718 return ret;
719 }
720
721 /* Confirm that only those 2 ranges are marked as free. */
722 if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
723 128 * 1024)) {
724 test_msg("Free space range missing\n");
725 return -ENOENT;
726 }
727 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
728 256 * 1024)) {
729 test_msg("Free space range missing\n");
730 return -ENOENT;
731 }
732
733 /*
734 * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
735 * as free anymore.
736 */
737 if (test_check_exists(cache, 0,
738 128 * 1024 * 1024 - 768 * 1024)) {
739 test_msg("Bitmap region not removed from space cache\n");
740 return -EINVAL;
741 }
742
743 /*
744 * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
745 * covered by the bitmap, isn't marked as free.
746 */
747 if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
748 512 * 1024)) {
749 test_msg("Invalid bitmap region marked as free\n");
750 return -EINVAL;
751 }
752
753 /*
754 * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
755 * lets make sure the free space cache marks it as free in the bitmap,
756 * and doesn't insert a new extent entry to represent this region.
757 */
758 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
759 512 * 1024);
760 if (ret) {
761 test_msg("Error adding free space: %d\n", ret);
762 return ret;
763 }
764 /* Confirm the region is marked as free. */
765 if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
766 512 * 1024)) {
767 test_msg("Bitmap region not marked as free\n");
768 return -ENOENT;
769 }
770
771 /*
772 * Confirm that no new extent entries or bitmap entries were added to
773 * the cache after adding that free space region.
774 */
775 ret = check_num_extents_and_bitmaps(cache, 2, 1);
776 if (ret)
777 return ret;
778
779 /*
780 * Now lets add a small free space region to the left of the previous
781 * one, which is not contiguous with it and is part of the bitmap too.
782 * The goal is to test that the bitmap entry space stealing doesn't
783 * steal this space region.
784 */
785 ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
786 if (ret) {
787 test_msg("Error adding free space: %d\n", ret);
788 return ret;
789 }
790
791 /*
792 * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
793 * expand the range covered by the existing extent entry that represents
794 * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
795 */
796 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
797 if (ret) {
798 test_msg("Error adding free space: %d\n", ret);
799 return ret;
800 }
801 /* Confirm the region is marked as free. */
802 if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
803 test_msg("Extent region not marked as free\n");
804 return -ENOENT;
805 }
806
807 /*
808	 * Confirm that our extent entry didn't steal all free space from the
809 * bitmap, because of the small 8Kb free space region.
810 */
811 ret = check_num_extents_and_bitmaps(cache, 2, 1);
812 if (ret)
813 return ret;
814
815 /*
816 * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
817 * space. Without stealing bitmap free space into extent entry space,
818 * we would have all this free space represented by 2 entries in the
819 * cache:
820 *
821 * extent entry covering range: [128Mb, 128Mb + 256Kb[
822 * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
823 *
824 * Attempting to allocate the whole free space (1Mb) would fail, because
825 * we can't allocate from multiple entries.
826 * With the bitmap free space stealing, we get a single extent entry
827 * that represents the 1Mb free space, and therefore we're able to
828 * allocate the whole free space at once.
829 */
830 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
831 1 * 1024 * 1024)) {
832 test_msg("Expected region not marked as free\n");
833 return -ENOENT;
834 }
835
836 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
837 test_msg("Cache free space is not 1Mb + 8Kb\n");
838 return -EINVAL;
839 }
840
841 offset = btrfs_find_space_for_alloc(cache,
842 0, 1 * 1024 * 1024, 0,
843 &max_extent_size);
844 if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
845 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
846 offset);
847 return -EINVAL;
848 }
849
850	/* All that remains is an 8Kb free space region in a bitmap. Confirm. */
851 ret = check_num_extents_and_bitmaps(cache, 1, 1);
852 if (ret)
853 return ret;
854
855 if (cache->free_space_ctl->free_space != 8192) {
856 test_msg("Cache free space is not 8Kb\n");
857 return -EINVAL;
858 }
859
860 offset = btrfs_find_space_for_alloc(cache,
861 0, 8192, 0,
862 &max_extent_size);
863 if (offset != (32 * 1024 * 1024)) {
864 test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
865 offset);
866 return -EINVAL;
867 }
868
869 ret = check_cache_empty(cache);
870 if (ret)
871 return ret;
872
873 cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
874 __btrfs_remove_free_space_cache(cache->free_space_ctl);
875
876 return 0;
877}
878
367int btrfs_test_free_space_cache(void) 879int btrfs_test_free_space_cache(void)
368{ 880{
369 struct btrfs_block_group_cache *cache; 881 struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
386 ret = test_bitmaps_and_extents(cache); 898 ret = test_bitmaps_and_extents(cache);
387 if (ret) 899 if (ret)
388 goto out; 900 goto out;
901
902 ret = test_steal_space_from_bitmap_to_extent(cache);
389out: 903out:
390 __btrfs_remove_free_space_cache(cache->free_space_ctl); 904 __btrfs_remove_free_space_cache(cache->free_space_ctl);
391 kfree(cache->free_space_ctl); 905 kfree(cache->free_space_ctl);
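
The new test gets bitmaps to kick in early by swapping the ctl's use_bitmap operation for the duration of the run and restoring it before returning, so later tests see the default policy again. The same save-override-restore seam reduced to standalone C (struct ctl_ops and the thresholds are simplified stand-ins, not the real btrfs_free_space_op):

#include <stdbool.h>
#include <stdio.h>

struct ctl_ops {
	bool (*use_bitmap)(int free_extents);
};

static bool default_use_bitmap(int free_extents)
{
	return free_extents > 256;	/* stand-in for extents_thresh */
}

static bool test_use_bitmap(int free_extents)
{
	return free_extents > 0;	/* force bitmaps almost immediately */
}

int main(void)
{
	struct ctl_ops ops = { .use_bitmap = default_use_bitmap };
	bool (*saved)(int) = ops.use_bitmap;

	ops.use_bitmap = test_use_bitmap;
	printf("under override: %d\n", ops.use_bitmap(1));	/* prints 1 */
	ops.use_bitmap = saved;					/* restore */
	printf("default: %d\n", ops.use_bitmap(1));		/* prints 0 */
	return 0;
}
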
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d89c6d3542ca..dcaae3616728 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	int ret;

 	/* Send isn't supposed to start transactions. */
-	ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);

 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
@@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	if (num_items > 0 && root != root->fs_info->chunk_root) {
 		if (root->fs_info->quota_enabled &&
 		    is_fstree(root->root_key.objectid)) {
-			qgroup_reserved = num_items * root->leafsize;
+			qgroup_reserved = num_items * root->nodesize;
 			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
 			if (ret)
 				return ERR_PTR(ret);
@@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	/*
 	 * Do the reservation for the relocation root creation
 	 */
-	if (unlikely(need_reserve_reloc_root(root))) {
+	if (need_reserve_reloc_root(root)) {
 		num_bytes += root->nodesize;
 		reloc_reserved = true;
 	}
@@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	if (transid <= root->fs_info->last_trans_committed)
 		goto out;

-	ret = -EINVAL;
 	/* find specified transaction */
 	spin_lock(&root->fs_info->trans_lock);
 	list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 			}
 		}
 		spin_unlock(&root->fs_info->trans_lock);
-		/* The specified transaction doesn't exist */
-		if (!cur_trans)
+
+		/*
+		 * The specified transaction doesn't exist, or we
+		 * raced with btrfs_commit_transaction
+		 */
+		if (!cur_trans) {
+			if (transid > root->fs_info->last_trans_committed)
+				ret = -EINVAL;
 			goto out;
+		}
 	} else {
 		/* find newest transaction that is committing | committed */
 		spin_lock(&root->fs_info->trans_lock);
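Aside: the btrfs_wait_for_commit() hunk above turns a hard -EINVAL into a race check - a transid missing from the running list is only an error when it is newer than the last committed transaction. A minimal runnable model of that decision (plain C, illustrative names only, not kernel code):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* lc_before/lc_after are snapshots of last_trans_committed taken at
     * the early-exit check and after the (unsuccessful) list search. */
    static int wait_for_commit(unsigned long long transid,
                               unsigned long long lc_before,
                               unsigned long long lc_after,
                               bool found_running)
    {
        if (transid <= lc_before)
            return 0;                                 /* already committed */
        if (!found_running)
            return transid > lc_after ? -EINVAL : 0;  /* race vs. bogus id */
        return 0;                                     /* kernel blocks here */
    }

    int main(void)
    {
        /* transid 10 committed between the check and the search: success */
        printf("%d\n", wait_for_commit(10, 9, 10, false));
        /* a transid from the future stays an error */
        printf("%d\n", wait_for_commit(99, 9, 10, false));
        return 0;
    }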
@@ -851,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
+	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+	bool errors = false;

 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
 				      EXTENT_NEED_WAIT, &cached_state)) {
@@ -864,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	}
 	if (err)
 		werr = err;
+
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		if ((mark & EXTENT_DIRTY) &&
+		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+
+		if ((mark & EXTENT_NEW) &&
+		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+	} else {
+		if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+	}
+
+	if (errors && !werr)
+		werr = -EIO;
+
 	return werr;
 }

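Aside: the new tail of btrfs_wait_marked_extents() folds write errors recorded on the btree inode back into the return value - log trees consult a per-log-half bit chosen by the extent mark, everything else a single bit. A small runnable model of that mapping (flag values are arbitrary placeholders, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    enum { EXTENT_DIRTY = 1 << 0, EXTENT_NEW = 1 << 1 };
    enum { BTREE_ERR = 1 << 0, BTREE_LOG1_ERR = 1 << 1, BTREE_LOG2_ERR = 1 << 2 };
    #define EIO 5

    static bool test_and_clear(unsigned *flags, unsigned bit)
    {
        bool was_set = *flags & bit;
        *flags &= ~bit;
        return was_set;
    }

    static int fold_errors(bool log_tree, unsigned mark, unsigned *flags, int werr)
    {
        bool errors = false;

        if (log_tree) {
            if ((mark & EXTENT_DIRTY) && test_and_clear(flags, BTREE_LOG1_ERR))
                errors = true;
            if ((mark & EXTENT_NEW) && test_and_clear(flags, BTREE_LOG2_ERR))
                errors = true;
        } else if (test_and_clear(flags, BTREE_ERR)) {
            errors = true;
        }
        return (errors && !werr) ? -EIO : werr;
    }

    int main(void)
    {
        unsigned flags = BTREE_LOG1_ERR;
        /* log tree, first log half dirty: the error bit is consumed */
        printf("%d\n", fold_errors(true, EXTENT_DIRTY, &flags, 0));  /* -5 */
        printf("%d\n", fold_errors(true, EXTENT_DIRTY, &flags, 0));  /* 0 */
        return 0;
    }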
@@ -1629,6 +1657,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_transaction *prev_trans = NULL;
+	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
 	int ret;

 	/* Stop the commit early if ->aborted is set */
@@ -1868,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
 	       sizeof(*root->fs_info->super_copy));

+	btrfs_update_commit_device_size(root->fs_info);
+	btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+	clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+	clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	root->fs_info->running_transaction = NULL;
@@ -1981,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
 	else
 		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
-	/*
-	 * If we encounter a transaction abort during snapshot cleaning, we
-	 * don't want to crash here
-	 */
+
 	return (ret < 0) ? 0 : 1;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 579be51b27e5..d8f40e1a5d2d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,7 +79,7 @@ struct btrfs_transaction {
 #define TRANS_EXTWRITERS	(__TRANS_USERSPACE | __TRANS_START | \
				 __TRANS_ATTACH)

-#define BTRFS_SEND_TRANS_STUB	1
+#define BTRFS_SEND_TRANS_STUB	((void *)1)

 struct btrfs_trans_handle {
 	u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9e1f2cd5e67a..1475979e5718 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,11 @@
 #define LOG_WALK_REPLAY_ALL	3

 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct inode *inode,
-			    int inode_only);
+			    int inode_only,
+			    const loff_t start,
+			    const loff_t end,
+			    struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
@@ -1496,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 		return -EIO;

 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
-	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = objectid;

 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1635,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	    found_key.type == log_key.type &&
 	    found_key.offset == log_key.offset &&
 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		update_size = false;
 		goto out;
 	}

@@ -2155,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,

 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
-		blocksize = btrfs_level_size(root, *level - 1);
+		blocksize = root->nodesize;

 		parent = path->nodes[*level];
 		root_owner = btrfs_header_owner(parent);
@@ -2981,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	min_key.type = key_type;
 	min_key.offset = min_offset;

-	path->keep_locks = 1;
-
 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);

 	/*
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	struct list_head ordered_sums;
 	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	bool has_extents = false;
-	bool need_find_last_extent = (*last_extent == 0);
+	bool need_find_last_extent = true;
 	bool done = false;

 	INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 */
 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
 			has_extents = true;
-			if (need_find_last_extent &&
-			    first_key.objectid == (u64)-1)
+			if (first_key.objectid == (u64)-1)
 				first_key = ins_keys[i];
 		} else {
 			need_find_last_extent = false;
@@ -3363,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 * or deletes of this inode don't have to relog the inode
 		 * again
 		 */
-		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
 		    !skip_csum) {
 			int found_type;
 			extent = btrfs_item_ptr(src, start_slot + i,
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	if (!has_extents)
 		return ret;

+	if (need_find_last_extent && *last_extent == first_key.offset) {
+		/*
+		 * We don't have any leafs between our current one and the one
+		 * we processed before that can have file extent items for our
+		 * inode (and have a generation number smaller than our current
+		 * transaction id).
+		 */
+		need_find_last_extent = false;
+	}
+
 	/*
	 * Because we use btrfs_search_forward we could skip leaves that were
	 * not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
					       0, 0);
 			if (ret)
 				break;
-			*last_extent = offset + len;
+			*last_extent = extent_end;
 		}
 		/*
		 * Need to let the callers know we dropped the path so they should
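Aside: the `*last_extent = extent_end` fix above matters when filling holes between logged extent items - the cursor must advance past the real extent that follows the hole, not just past the hole itself, or the next iteration would compute a bogus overlapping hole. A runnable toy version of that loop (offsets invented for illustration):

    #include <stdio.h>

    struct ext { unsigned long long off, len; };

    int main(void)
    {
        /* three file extents with holes between them */
        struct ext exts[] = { { 0, 4096 }, { 8192, 4096 }, { 20480, 4096 } };
        unsigned long long last_extent = 0;

        for (int i = 0; i < 3; i++) {
            if (exts[i].off > last_extent)
                printf("log hole [%llu, %llu)\n", last_extent, exts[i].off);
            /* the fix: advance to the end of the real extent */
            last_extent = exts[i].off + exts[i].len;
        }
        return 0;
    }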
@@ -3562,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 	return 0;
 }

-static int log_one_extent(struct btrfs_trans_handle *trans,
-			  struct inode *inode, struct btrfs_root *root,
-			  struct extent_map *em, struct btrfs_path *path,
-			  struct list_head *logged_list)
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+				struct inode *inode,
+				struct btrfs_root *root,
+				const struct extent_map *em,
+				const struct list_head *logged_list,
+				bool *ordered_io_error)
 {
-	struct btrfs_root *log = root->log_root;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *leaf;
 	struct btrfs_ordered_extent *ordered;
-	struct list_head ordered_sums;
-	struct btrfs_map_token token;
-	struct btrfs_key key;
+	struct btrfs_root *log = root->log_root;
 	u64 mod_start = em->mod_start;
 	u64 mod_len = em->mod_len;
+	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	u64 csum_offset;
 	u64 csum_len;
-	u64 extent_offset = em->start - em->orig_start;
-	u64 block_len;
-	int ret;
-	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-	int extent_inserted = 0;
-
-	INIT_LIST_HEAD(&ordered_sums);
-	btrfs_init_map_token(&token);
-
-	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
-				   em->start + em->len, NULL, 0, 1,
-				   sizeof(*fi), &extent_inserted);
-	if (ret)
-		return ret;
-
-	if (!extent_inserted) {
-		key.objectid = btrfs_ino(inode);
-		key.type = BTRFS_EXTENT_DATA_KEY;
-		key.offset = em->start;
-
-		ret = btrfs_insert_empty_item(trans, log, path, &key,
-					      sizeof(*fi));
-		if (ret)
-			return ret;
-	}
-	leaf = path->nodes[0];
-	fi = btrfs_item_ptr(leaf, path->slots[0],
-			    struct btrfs_file_extent_item);
-
-	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
-					       &token);
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
-		skip_csum = true;
-		btrfs_set_token_file_extent_type(leaf, fi,
-						 BTRFS_FILE_EXTENT_PREALLOC,
-						 &token);
-	} else {
-		btrfs_set_token_file_extent_type(leaf, fi,
-						 BTRFS_FILE_EXTENT_REG,
-						 &token);
-		if (em->block_start == EXTENT_MAP_HOLE)
-			skip_csum = true;
-	}
-
-	block_len = max(em->block_len, em->orig_block_len);
-	if (em->compress_type != BTRFS_COMPRESS_NONE) {
-		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
-							em->block_start,
-							&token);
-		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
-							   &token);
-	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
-		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
-							em->block_start -
-							extent_offset, &token);
-		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
-							   &token);
-	} else {
-		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
-		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
-							   &token);
-	}
+	LIST_HEAD(ordered_sums);
+	int ret = 0;

-	btrfs_set_token_file_extent_offset(leaf, fi,
-					   em->start - em->orig_start,
-					   &token);
-	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
-	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
-	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
-						&token);
-	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
-	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
-	btrfs_mark_buffer_dirty(leaf);
+	*ordered_io_error = false;

-	btrfs_release_path(path);
-	if (ret) {
-		return ret;
-	}
-
-	if (skip_csum)
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    em->block_start == EXTENT_MAP_HOLE)
 		return 0;

 	/*
-	 * First check and see if our csums are on our outstanding ordered
-	 * extents.
+	 * Wait for any ordered extent that covers our extent map. If it
+	 * finishes without an error, first check and see if our csums are on
+	 * our outstanding ordered extents.
	 */
 	list_for_each_entry(ordered, logged_list, log_list) {
 		struct btrfs_ordered_sum *sum;
@@ -3674,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		    mod_start + mod_len <= ordered->file_offset)
 			continue;

+		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+			const u64 start = ordered->file_offset;
+			const u64 end = ordered->file_offset + ordered->len - 1;
+
+			WARN_ON(ordered->inode != inode);
+			filemap_fdatawrite_range(inode->i_mapping, start, end);
+		}
+
+		wait_event(ordered->wait,
+			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+			*ordered_io_error = true;
+			break;
+		}
 		/*
		 * We are going to copy all the csums on this ordered extent, so
		 * go ahead and adjust mod_start and mod_len in case this
@@ -3705,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 			}
 		}

+		if (skip_csum)
+			continue;
+
 		/*
		 * To keep us from looping for the above case of an ordered
		 * extent that falls inside of the logged extent.
@@ -3722,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
 			if (ret)
-				goto unlocked;
+				break;
 		}
-
 	}
-unlocked:

-	if (!mod_len || ret)
+	if (*ordered_io_error || !mod_len || ret || skip_csum)
 		return ret;

 	if (em->compress_type) {
 		csum_offset = 0;
-		csum_len = block_len;
+		csum_len = max(em->block_len, em->orig_block_len);
 	} else {
 		csum_offset = mod_start - em->start;
 		csum_len = mod_len;
@@ -3760,11 +3716,106 @@ unlocked:
 	return ret;
 }

+static int log_one_extent(struct btrfs_trans_handle *trans,
+			  struct inode *inode, struct btrfs_root *root,
+			  const struct extent_map *em,
+			  struct btrfs_path *path,
+			  const struct list_head *logged_list,
+			  struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	struct btrfs_map_token token;
+	struct btrfs_key key;
+	u64 extent_offset = em->start - em->orig_start;
+	u64 block_len;
+	int ret;
+	int extent_inserted = 0;
+	bool ordered_io_err = false;
+
+	ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+				   &ordered_io_err);
+	if (ret)
+		return ret;
+
+	if (ordered_io_err) {
+		ctx->io_err = -EIO;
+		return 0;
+	}
+
+	btrfs_init_map_token(&token);
+
+	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+				   em->start + em->len, NULL, 0, 1,
+				   sizeof(*fi), &extent_inserted);
+	if (ret)
+		return ret;
+
+	if (!extent_inserted) {
+		key.objectid = btrfs_ino(inode);
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = em->start;
+
+		ret = btrfs_insert_empty_item(trans, log, path, &key,
+					      sizeof(*fi));
+		if (ret)
+			return ret;
+	}
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+					       &token);
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_PREALLOC,
+						 &token);
+	else
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_REG,
+						 &token);
+
+	block_len = max(em->block_len, em->orig_block_len);
+	if (em->compress_type != BTRFS_COMPRESS_NONE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start,
+							&token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start -
+							extent_offset, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+							   &token);
+	}
+
+	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+						&token);
+	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *inode,
				     struct btrfs_path *path,
-				     struct list_head *logged_list)
+				     struct list_head *logged_list,
+				     struct btrfs_log_ctx *ctx)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
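Aside: the new wait_ordered_extents() helper makes sure every ordered extent overlapping the logged range has finished its IO before the file extent item is logged, kicking writeback first when nothing has started it; an IO error is reported through *ordered_io_error rather than the return value. A compact runnable model of the per-extent decision (flag bits invented for the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    enum { ORD_IO_DONE = 1, ORD_IOERR = 2, ORD_DIRECT = 4 };

    /* Returns true when the caller would have to kick writeback before
     * sleeping on the ordered extent's wait queue. */
    static bool process_ordered(unsigned flags, bool *ordered_io_error)
    {
        bool must_kick = !(flags & (ORD_IO_DONE | ORD_IOERR | ORD_DIRECT));

        /* the kernel sleeps here until ORD_IO_DONE or ORD_IOERR is set */
        if (flags & ORD_IOERR)
            *ordered_io_error = true;
        return must_kick;
    }

    int main(void)
    {
        bool err = false;

        printf("kick=%d\n", process_ordered(0, &err));                    /* 1 */
        printf("kick=%d err=%d\n", process_ordered(ORD_IOERR, &err), err); /* 0 1 */
        return 0;
    }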
@@ -3822,7 +3873,8 @@ process:

 		write_unlock(&tree->lock);

-		ret = log_one_extent(trans, inode, root, em, path, logged_list);
+		ret = log_one_extent(trans, inode, root, em, path, logged_list,
+				     ctx);
 		write_lock(&tree->lock);
 		clear_em_logging(tree, em);
 		free_extent_map(em);
@@ -3849,8 +3901,11 @@ process:
 * This handles both files and directories.
 */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct inode *inode,
-			    int inode_only)
+			    int inode_only,
+			    const loff_t start,
+			    const loff_t end,
+			    struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_path *path;
 	struct btrfs_path *dst_path;
@@ -3867,6 +3922,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	int ins_nr;
 	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;

 	path = btrfs_alloc_path();
 	if (!path)
@@ -3950,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		err = ret;
 		goto out_unlock;
 	}
-	path->keep_locks = 1;

 	while (1) {
 		ins_nr = 0;
@@ -3980,7 +4035,8 @@ again:
 		if (ret < 0) {
 			err = ret;
 			goto out_unlock;
-		} if (ret) {
+		}
+		if (ret) {
 			ins_nr = 0;
 			btrfs_release_path(path);
 			continue;
@@ -4034,19 +4090,41 @@ log_extents:
 	btrfs_release_path(dst_path);
 	if (fast_search) {
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-						&logged_list);
+						&logged_list, ctx);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
 		}
 	} else if (inode_only == LOG_INODE_ALL) {
-		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
 		struct extent_map *em, *n;

-		write_lock(&tree->lock);
-		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
-			list_del_init(&em->list);
-		write_unlock(&tree->lock);
+		write_lock(&em_tree->lock);
+		/*
+		 * We can't just remove every em if we're called for a ranged
+		 * fsync - that is, one that doesn't cover the whole possible
+		 * file range (0 to LLONG_MAX). This is because we can have
+		 * em's that fall outside the range we're logging and therefore
+		 * their ordered operations haven't completed yet
+		 * (btrfs_finish_ordered_io() not invoked yet). This means we
+		 * didn't get their respective file extent item in the fs/subvol
+		 * tree yet, and need to let the next fast fsync (one which
+		 * consults the list of modified extent maps) find the em so
+		 * that it logs a matching file extent item and waits for the
+		 * respective ordered operation to complete (if it's still
+		 * running).
+		 *
+		 * Removing every em outside the range we're logging would make
+		 * the next fast fsync not log their matching file extent items,
+		 * therefore making us lose data after a log replay.
+		 */
+		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+					 list) {
+			const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+			if (em->mod_start >= start && mod_end <= end)
+				list_del_init(&em->list);
+		}
+		write_unlock(&em_tree->lock);
 	}

 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4056,6 +4134,7 @@ log_extents:
 			goto out_unlock;
 		}
 	}
+
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
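Aside: the long comment in the hunk above is the heart of the ranged-fsync fix - extent maps outside the synced range must stay on the modified list so a later fast fsync still logs them. A runnable toy version of the pruning rule (sizes invented):

    #include <stdio.h>

    struct em { unsigned long long mod_start, mod_len; int listed; };

    static void prune(struct em *ems, int n,
                      unsigned long long start, unsigned long long end)
    {
        for (int i = 0; i < n; i++) {
            unsigned long long mod_end = ems[i].mod_start + ems[i].mod_len - 1;

            /* only drop an em whose whole range this fsync covered */
            if (ems[i].mod_start >= start && mod_end <= end)
                ems[i].listed = 0;   /* list_del_init() in the kernel */
        }
    }

    int main(void)
    {
        struct em ems[] = { { 0, 4096, 1 }, { 1 << 20, 4096, 1 } };

        prune(ems, 2, 0, 65535);     /* ranged fsync of the first 64Kb */
        printf("%d %d\n", ems[0].listed, ems[1].listed);  /* 0 1 */
        return 0;
    }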
@@ -4152,7 +4231,10 @@ out:
 */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, struct inode *inode,
-				  struct dentry *parent, int exists_only,
+				  struct dentry *parent,
+				  const loff_t start,
+				  const loff_t end,
+				  int exists_only,
				  struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4198,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;

-	ret = btrfs_log_inode(trans, root, inode, inode_only);
+	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
 	if (ret)
 		goto end_trans;

@@ -4226,7 +4308,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,

 		if (BTRFS_I(inode)->generation >
 		    root->fs_info->last_trans_committed) {
-			ret = btrfs_log_inode(trans, root, inode, inode_only);
+			ret = btrfs_log_inode(trans, root, inode, inode_only,
+					      0, LLONG_MAX, ctx);
 			if (ret)
 				goto end_trans;
 		}
@@ -4260,13 +4343,15 @@ end_no_trans:
 */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;

 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
-				     0, ctx);
+				     start, end, 0, ctx);
 	dput(parent);

 	return ret;
@@ -4316,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	key.offset = (u64)-1;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.type = BTRFS_ROOT_ITEM_KEY;

 	while (1) {
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
@@ -4503,6 +4588,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		     root->fs_info->last_trans_committed))
 		return 0;

-	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+				      LLONG_MAX, 1, NULL);
 }

diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
 struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
+	int io_err;
 	struct list_head list;
 };

@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 {
 	ctx->log_ret = 0;
 	ctx->log_transid = 0;
+	ctx->io_err = 0;
 	INIT_LIST_HEAD(&ctx->list);
 }

@@ -59,6 +61,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
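Aside: the io_err field added to btrfs_log_ctx gives the logging code a side channel for ordered-extent write errors - as seen in log_one_extent() above, -EIO is parked there while the call still returns success, so the fsync caller can surface the error after the log sync finishes. A tiny runnable model of that flow (struct and names local to the sketch):

    #include <stdio.h>

    #define EIO 5

    struct log_ctx { int log_ret; int log_transid; int io_err; };

    /* Models log_one_extent(): an ordered-extent IO error does not fail
     * the logging call itself, it is recorded for the caller. */
    static int log_extent(struct log_ctx *ctx, int ordered_io_error)
    {
        if (ordered_io_error) {
            ctx->io_err = -EIO;
            return 0;
        }
        return 0;   /* the real function writes the extent item here */
    }

    int main(void)
    {
        struct log_ctx ctx = { 0, 0, 0 };
        int ret = log_extent(&ctx, 1);

        printf("ret=%d io_err=%d\n", ret, ctx.io_err);  /* ret=0 io_err=-5 */
        return 0;
    }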
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03ee7d8..778282944530 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 	key.offset = 0;

again_search_slot:
-	path->keep_locks = 1;
 	ret = btrfs_search_forward(root, &key, path, 0);
 	if (ret) {
 		if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..d47289c715c8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

-static DEFINE_MUTEX(uuid_mutex);
+DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);

 static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
 	mutex_init(&fs_devs->device_list_mutex);

 	INIT_LIST_HEAD(&fs_devs->devices);
+	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
 	INIT_LIST_HEAD(&fs_devs->list);

@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)

 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
+	INIT_LIST_HEAD(&dev->resized_list);

 	spin_lock_init(&dev->io_lock);

 	spin_lock_init(&dev->reada_lock);
 	atomic_set(&dev->reada_in_flight, 0);
+	atomic_set(&dev->dev_stats_ccnt, 0);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);

@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
			return PTR_ERR(fs_devices);

 		list_add(&fs_devices->list, &fs_uuids);
-		fs_devices->latest_devid = devid;
-		fs_devices->latest_trans = found_transid;

 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
 	}
+
 	if (!device) {
 		if (fs_devices->opened)
 			return -EBUSY;
@@ -508,6 +510,43 @@ static noinline int device_list_add(const char *path,
 		ret = 1;
 		device->fs_devices = fs_devices;
 	} else if (!device->name || strcmp(device->name->str, path)) {
+		/*
+		 * When FS is already mounted.
+		 * 1. If you are here and if the device->name is NULL that
+		 *    means this device was missing at time of FS mount.
+		 * 2. If you are here and if the device->name is different
+		 *    from 'path' that means either
+		 *	a. The same device disappeared and reappeared with
+		 *	   different name. or
+		 *	b. The missing-disk-which-was-replaced, has
+		 *	   reappeared now.
+		 *
+		 * We must allow 1 and 2a above. But 2b would be spurious
+		 * and unintentional.
+		 *
+		 * Further in case of 1 and 2a above, the disk at 'path'
+		 * would have missed some transaction when it was away and
+		 * in case of 2a the stale bdev has to be updated as well.
+		 * 2b must not be allowed at any time.
+		 */
+
+		/*
+		 * For now, we do allow update to btrfs_fs_device through the
+		 * btrfs dev scan cli after FS has been mounted. We're still
+		 * tracking a problem where systems fail mount by subvolume id
+		 * when we reject replacement on a mounted FS.
+		 */
+		if (!fs_devices->opened && found_transid < device->generation) {
+			/*
+			 * That is if the FS is _not_ mounted and if you
+			 * are here, that means there is more than one
+			 * disk with the same uuid and devid. We keep the one
+			 * with the larger generation number or the last-in if
+			 * generations are equal.
+			 */
+			return -EEXIST;
+		}
+
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
@@ -519,10 +558,15 @@ static noinline int device_list_add(const char *path,
 		}
 	}

-	if (found_transid > fs_devices->latest_trans) {
-		fs_devices->latest_devid = devid;
-		fs_devices->latest_trans = found_transid;
-	}
+	/*
+	 * Unmount does not free the btrfs_device struct but would zero
+	 * generation along with most of the other members. So just update
+	 * it back. We need it to pick the disk with largest generation
+	 * (as above).
+	 */
+	if (!fs_devices->opened)
+		device->generation = found_transid;
+
 	*fs_devices_ret = fs_devices;

 	return ret;
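Aside: the duplicate-devid logic above reduces to "on an unmounted filesystem, the copy with the newer superblock generation wins (last-in on a tie), and the stale copy is rejected with -EEXIST". A runnable model of that decision (plain C, names local to the sketch):

    #include <stdio.h>

    #define EEXIST 17

    static int scan_duplicate(int fs_opened,
                              unsigned long long found_transid,
                              unsigned long long *device_generation)
    {
        if (!fs_opened && found_transid < *device_generation)
            return -EEXIST;                          /* stale copy, reject */
        if (!fs_opened)
            *device_generation = found_transid;      /* newer or equal wins */
        return 0;
    }

    int main(void)
    {
        unsigned long long gen = 100;

        printf("%d\n", scan_duplicate(0, 90, &gen));   /* -17: stale disk */
        printf("%d\n", scan_duplicate(0, 120, &gen));  /* 0: replaces */
        printf("%llu\n", gen);                         /* 120 */
        return 0;
    }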
@@ -538,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	if (IS_ERR(fs_devices))
 		return fs_devices;

-	fs_devices->latest_devid = orig->latest_devid;
-	fs_devices->latest_trans = orig->latest_trans;
+	mutex_lock(&orig->device_list_mutex);
 	fs_devices->total_devices = orig->total_devices;

 	/* We have held the volume lock, it is safe to get the devices. */
@@ -568,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
+	mutex_unlock(&orig->device_list_mutex);
 	return fs_devices;
error:
+	mutex_unlock(&orig->device_list_mutex);
 	free_fs_devices(fs_devices);
 	return ERR_PTR(-ENOMEM);
 }
@@ -578,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *fs_devices, int step)
 {
 	struct btrfs_device *device, *next;
-
-	struct block_device *latest_bdev = NULL;
-	u64 latest_devid = 0;
-	u64 latest_transid = 0;
+	struct btrfs_device *latest_dev = NULL;

 	mutex_lock(&uuid_mutex);
again:
@@ -589,11 +631,9 @@ again:
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata) {
 			if (!device->is_tgtdev_for_dev_replace &&
-			    (!latest_transid ||
-			     device->generation > latest_transid)) {
-				latest_devid = device->devid;
-				latest_transid = device->generation;
-				latest_bdev = device->bdev;
+			    (!latest_dev ||
+			     device->generation > latest_dev->generation)) {
+				latest_dev = device;
 			}
 			continue;
 		}
@@ -635,9 +675,7 @@ again:
 		goto again;
 	}

-	fs_devices->latest_bdev = latest_bdev;
-	fs_devices->latest_devid = latest_devid;
-	fs_devices->latest_trans = latest_transid;
+	fs_devices->latest_bdev = latest_dev->bdev;

 	mutex_unlock(&uuid_mutex);
 }
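Aside: these hunks collapse the latest_bdev/latest_devid/latest_trans triple into a single latest_dev pointer, so picking the newest device becomes a one-pass max-by-generation scan. A runnable model (plain structs, not the kernel types):

    #include <stdio.h>

    struct dev { unsigned long long devid, generation; };

    static struct dev *pick_latest(struct dev *devs, int n)
    {
        struct dev *latest = NULL;

        for (int i = 0; i < n; i++)
            if (!latest || devs[i].generation > latest->generation)
                latest = &devs[i];
        return latest;
    }

    int main(void)
    {
        struct dev devs[] = { { 1, 40 }, { 2, 42 }, { 3, 41 } };

        printf("latest devid=%llu\n", pick_latest(devs, 3)->devid);  /* 2 */
        return 0;
    }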
@@ -686,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			fs_devices->rw_devices--;
 		}

-		if (device->can_discard)
-			fs_devices->num_can_discard--;
 		if (device->missing)
 			fs_devices->missing_devices--;

@@ -752,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
 	struct btrfs_device *device;
-	struct block_device *latest_bdev = NULL;
+	struct btrfs_device *latest_dev = NULL;
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
-	u64 latest_devid = 0;
-	u64 latest_transid = 0;
 	u64 devid;
 	int seeding = 1;
 	int ret = 0;
@@ -784,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
			goto error_brelse;

 		device->generation = btrfs_super_generation(disk_super);
-		if (!latest_transid || device->generation > latest_transid) {
-			latest_devid = devid;
-			latest_transid = device->generation;
-			latest_bdev = bdev;
-		}
+		if (!latest_dev ||
+		    device->generation > latest_dev->generation)
+			latest_dev = device;

 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 			device->writeable = 0;
@@ -798,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		}

 		q = bdev_get_queue(bdev);
-		if (blk_queue_discard(q)) {
+		if (blk_queue_discard(q))
 			device->can_discard = 1;
-			fs_devices->num_can_discard++;
-		}

 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
@@ -831,9 +861,7 @@ error_brelse:
 	}
 	fs_devices->seeding = seeding;
 	fs_devices->opened = 1;
-	fs_devices->latest_bdev = latest_bdev;
-	fs_devices->latest_devid = latest_devid;
-	fs_devices->latest_trans = latest_transid;
+	fs_devices->latest_bdev = latest_dev->bdev;
 	fs_devices->total_rw_bytes = 0;
out:
 	return ret;
@@ -1007,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 		if (key.objectid > device->devid)
 			break;

-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+		if (key.type != BTRFS_DEV_EXTENT_KEY)
 			goto next;

 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1159,7 +1187,7 @@ again:
 		if (key.objectid > device->devid)
 			break;

-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+		if (key.type != BTRFS_DEV_EXTENT_KEY)
 			goto next;

 		if (key.offset > search_start) {
@@ -1238,7 +1266,7 @@ out:

 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
-				 u64 start)
+				 u64 start, u64 *dev_extent_len)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1280,13 +1308,8 @@ again:
 		goto out;
 	}

-	if (device->bytes_used > 0) {
-		u64 len = btrfs_dev_extent_length(leaf, extent);
-		device->bytes_used -= len;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space += len;
-		spin_unlock(&root->fs_info->free_chunk_lock);
-	}
+	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
 	ret = btrfs_del_item(trans, root, path);
 	if (ret) {
 		btrfs_error(root->fs_info, ret,
@@ -1436,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
-	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_set_device_total_bytes(leaf, dev_item,
+				     btrfs_device_get_disk_total_bytes(device));
+	btrfs_set_device_bytes_used(leaf, dev_item,
+				    btrfs_device_get_bytes_used(device));
 	btrfs_set_device_group(leaf, dev_item, 0);
 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1493,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
-	lock_chunks(root);

 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
@@ -1509,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 		goto out;
out:
 	btrfs_free_path(path);
-	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 	return ret;
 }
@@ -1625,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->writeable) {
 		lock_chunks(root);
 		list_del_init(&device->dev_alloc_list);
+		device->fs_devices->rw_devices--;
 		unlock_chunks(root);
-		root->fs_info->fs_devices->rw_devices--;
 		clear_super = true;
 	}

@@ -1645,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_undo;

-	spin_lock(&root->fs_info->free_chunk_lock);
-	root->fs_info->free_chunk_space = device->total_bytes -
-		device->bytes_used;
-	spin_unlock(&root->fs_info->free_chunk_lock);
-
 	device->in_fs_metadata = 0;
 	btrfs_scrub_cancel_dev(root->fs_info, device);

@@ -1671,7 +1689,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	device->fs_devices->total_devices--;

 	if (device->missing)
-		root->fs_info->fs_devices->missing_devices--;
+		device->fs_devices->missing_devices--;

 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
@@ -1703,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
			fs_devices = fs_devices->seed;
 		}
 		cur_devices->seed = NULL;
-		lock_chunks(root);
 		__btrfs_close_devices(cur_devices);
-		unlock_chunks(root);
 		free_fs_devices(cur_devices);
 	}

@@ -1778,8 +1794,8 @@ error_undo:
 		lock_chunks(root);
 		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
+		device->fs_devices->rw_devices++;
 		unlock_chunks(root);
-		root->fs_info->fs_devices->rw_devices++;
 	}
 	goto error_brelse;
 }
@@ -1787,25 +1803,57 @@ error_undo:
 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
				 struct btrfs_device *srcdev)
 {
+	struct btrfs_fs_devices *fs_devices;
+
 	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));

+	/*
+	 * in case of fs with no seed, srcdev->fs_devices will point
+	 * to fs_devices of fs_info. However when the dev being replaced is
+	 * a seed dev it will point to the seed's local fs_devices. In short
+	 * srcdev will have its correct fs_devices in both the cases.
+	 */
+	fs_devices = srcdev->fs_devices;
+
 	list_del_rcu(&srcdev->dev_list);
 	list_del_rcu(&srcdev->dev_alloc_list);
-	fs_info->fs_devices->num_devices--;
-	if (srcdev->missing) {
-		fs_info->fs_devices->missing_devices--;
-		fs_info->fs_devices->rw_devices++;
-	}
-	if (srcdev->can_discard)
-		fs_info->fs_devices->num_can_discard--;
-	if (srcdev->bdev) {
-		fs_info->fs_devices->open_devices--;
+	fs_devices->num_devices--;
+	if (srcdev->missing)
+		fs_devices->missing_devices--;

-		/* zero out the old super */
+	if (srcdev->writeable) {
+		fs_devices->rw_devices--;
+		/* zero out the old super if it is writable */
 		btrfs_scratch_superblock(srcdev);
 	}

+	if (srcdev->bdev)
+		fs_devices->open_devices--;
+
 	call_rcu(&srcdev->rcu, free_device);
+
+	/*
+	 * unless fs_devices is seed fs, num_devices shouldn't go
+	 * zero
+	 */
+	BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+	/* if there are no devices left we rather delete the fs_devices */
+	if (!fs_devices->num_devices) {
+		struct btrfs_fs_devices *tmp_fs_devices;
+
+		tmp_fs_devices = fs_info->fs_devices;
+		while (tmp_fs_devices) {
+			if (tmp_fs_devices->seed == fs_devices) {
+				tmp_fs_devices->seed = fs_devices->seed;
+				break;
+			}
+			tmp_fs_devices = tmp_fs_devices->seed;
+		}
+		fs_devices->seed = NULL;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
 }

 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
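Aside: when the replaced source device was the last member of a seed fs_devices, the new code splices that empty fs_devices out of the ->seed chain before closing and freeing it. A small runnable model of the unlink step (plain structs, not the kernel types):

    #include <stdio.h>

    struct fs_devices { int num_devices; struct fs_devices *seed; };

    static void drop_empty_seed(struct fs_devices *mounted,
                                struct fs_devices *victim)
    {
        if (victim->num_devices)
            return;                        /* still has members, keep it */
        for (struct fs_devices *cur = mounted; cur; cur = cur->seed) {
            if (cur->seed == victim) {
                cur->seed = victim->seed;  /* splice out of the chain */
                break;
            }
        }
        victim->seed = NULL;
        /* the kernel then calls __btrfs_close_devices() and
         * free_fs_devices() on the victim */
    }

    int main(void)
    {
        struct fs_devices seed2 = { 1, NULL };
        struct fs_devices seed1 = { 0, &seed2 };
        struct fs_devices mounted = { 2, &seed1 };

        drop_empty_seed(&mounted, &seed1);
        printf("%s\n", mounted.seed == &seed2 ? "spliced" : "kept");
        return 0;
    }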
@@ -1813,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1813{ 1861{
1814 struct btrfs_device *next_device; 1862 struct btrfs_device *next_device;
1815 1863
1864 mutex_lock(&uuid_mutex);
1816 WARN_ON(!tgtdev); 1865 WARN_ON(!tgtdev);
1817 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1866 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1818 if (tgtdev->bdev) { 1867 if (tgtdev->bdev) {
@@ -1820,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1820 fs_info->fs_devices->open_devices--; 1869 fs_info->fs_devices->open_devices--;
1821 } 1870 }
1822 fs_info->fs_devices->num_devices--; 1871 fs_info->fs_devices->num_devices--;
1823 if (tgtdev->can_discard)
1824 fs_info->fs_devices->num_can_discard++;
1825 1872
1826 next_device = list_entry(fs_info->fs_devices->devices.next, 1873 next_device = list_entry(fs_info->fs_devices->devices.next,
1827 struct btrfs_device, dev_list); 1874 struct btrfs_device, dev_list);
@@ -1834,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1834 call_rcu(&tgtdev->rcu, free_device); 1881 call_rcu(&tgtdev->rcu, free_device);
1835 1882
1836 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1883 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1884 mutex_unlock(&uuid_mutex);
1837} 1885}
1838 1886
1839static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1887static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1932,15 +1980,18 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1932 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1980 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1933 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1981 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1934 synchronize_rcu); 1982 synchronize_rcu);
1983 list_for_each_entry(device, &seed_devices->devices, dev_list)
1984 device->fs_devices = seed_devices;
1935 1985
1986 lock_chunks(root);
1936 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1987 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1937 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1988 unlock_chunks(root);
1938 device->fs_devices = seed_devices;
1939 }
1940 1989
1941 fs_devices->seeding = 0; 1990 fs_devices->seeding = 0;
1942 fs_devices->num_devices = 0; 1991 fs_devices->num_devices = 0;
1943 fs_devices->open_devices = 0; 1992 fs_devices->open_devices = 0;
1993 fs_devices->missing_devices = 0;
1994 fs_devices->rotating = 0;
1944 fs_devices->seed = seed_devices; 1995 fs_devices->seed = seed_devices;
1945 1996
1946 generate_random_uuid(fs_devices->fsid); 1997 generate_random_uuid(fs_devices->fsid);
@@ -2039,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2039 struct list_head *devices; 2090 struct list_head *devices;
2040 struct super_block *sb = root->fs_info->sb; 2091 struct super_block *sb = root->fs_info->sb;
2041 struct rcu_string *name; 2092 struct rcu_string *name;
2042 u64 total_bytes; 2093 u64 tmp;
2043 int seeding_dev = 0; 2094 int seeding_dev = 0;
2044 int ret = 0; 2095 int ret = 0;
2045 2096
@@ -2095,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2095 goto error; 2146 goto error;
2096 } 2147 }
2097 2148
2098 lock_chunks(root);
2099
2100 q = bdev_get_queue(bdev); 2149 q = bdev_get_queue(bdev);
2101 if (blk_queue_discard(q)) 2150 if (blk_queue_discard(q))
2102 device->can_discard = 1; 2151 device->can_discard = 1;
@@ -2107,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2107 device->sector_size = root->sectorsize; 2156 device->sector_size = root->sectorsize;
2108 device->total_bytes = i_size_read(bdev->bd_inode); 2157 device->total_bytes = i_size_read(bdev->bd_inode);
2109 device->disk_total_bytes = device->total_bytes; 2158 device->disk_total_bytes = device->total_bytes;
2159 device->commit_total_bytes = device->total_bytes;
2110 device->dev_root = root->fs_info->dev_root; 2160 device->dev_root = root->fs_info->dev_root;
2111 device->bdev = bdev; 2161 device->bdev = bdev;
2112 device->in_fs_metadata = 1; 2162 device->in_fs_metadata = 1;
@@ -2124,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2124 device->fs_devices = root->fs_info->fs_devices; 2174 device->fs_devices = root->fs_info->fs_devices;
2125 2175
2126 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2176 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2177 lock_chunks(root);
2127 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2178 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2128 list_add(&device->dev_alloc_list, 2179 list_add(&device->dev_alloc_list,
2129 &root->fs_info->fs_devices->alloc_list); 2180 &root->fs_info->fs_devices->alloc_list);
@@ -2131,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2131 root->fs_info->fs_devices->open_devices++; 2182 root->fs_info->fs_devices->open_devices++;
2132 root->fs_info->fs_devices->rw_devices++; 2183 root->fs_info->fs_devices->rw_devices++;
2133 root->fs_info->fs_devices->total_devices++; 2184 root->fs_info->fs_devices->total_devices++;
2134 if (device->can_discard)
2135 root->fs_info->fs_devices->num_can_discard++;
2136 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2185 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2137 2186
2138 spin_lock(&root->fs_info->free_chunk_lock); 2187 spin_lock(&root->fs_info->free_chunk_lock);
@@ -2142,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2142 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2191 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2143 root->fs_info->fs_devices->rotating = 1; 2192 root->fs_info->fs_devices->rotating = 1;
2144 2193
2145 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2194 tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2146 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2195 btrfs_set_super_total_bytes(root->fs_info->super_copy,
2147 total_bytes + device->total_bytes); 2196 tmp + device->total_bytes);
2148 2197
2149 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2198 tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2150 btrfs_set_super_num_devices(root->fs_info->super_copy, 2199 btrfs_set_super_num_devices(root->fs_info->super_copy,
2151 total_bytes + 1); 2200 tmp + 1);
2152 2201
2153 /* add sysfs device entry */ 2202 /* add sysfs device entry */
2154 btrfs_kobj_add_device(root->fs_info, device); 2203 btrfs_kobj_add_device(root->fs_info, device);
2155 2204
2205 /*
2206 * we've got more storage, clear any full flags on the space
2207 * infos
2208 */
2209 btrfs_clear_space_info_full(root->fs_info);
2210
2211 unlock_chunks(root);
2156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2212 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2157 2213
2158 if (seeding_dev) { 2214 if (seeding_dev) {
2159 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2215 lock_chunks(root);
2160 ret = init_first_rw_device(trans, root, device); 2216 ret = init_first_rw_device(trans, root, device);
2217 unlock_chunks(root);
2161 if (ret) { 2218 if (ret) {
2162 btrfs_abort_transaction(trans, root, ret); 2219 btrfs_abort_transaction(trans, root, ret);
2163 goto error_trans; 2220 goto error_trans;
2164 } 2221 }
2222 }
2223
2224 ret = btrfs_add_device(trans, root, device);
2225 if (ret) {
2226 btrfs_abort_transaction(trans, root, ret);
2227 goto error_trans;
2228 }
2229
2230 if (seeding_dev) {
2231 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2232
2165 ret = btrfs_finish_sprout(trans, root); 2233 ret = btrfs_finish_sprout(trans, root);
2166 if (ret) { 2234 if (ret) {
2167 btrfs_abort_transaction(trans, root, ret); 2235 btrfs_abort_transaction(trans, root, ret);
@@ -2175,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2175 root->fs_info->fsid); 2243 root->fs_info->fsid);
2176 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2244 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2177 goto error_trans; 2245 goto error_trans;
2178 } else {
2179 ret = btrfs_add_device(trans, root, device);
2180 if (ret) {
2181 btrfs_abort_transaction(trans, root, ret);
2182 goto error_trans;
2183 }
2184 } 2246 }
2185 2247
2186 /*
2187 * we've got more storage, clear any full flags on the space
2188 * infos
2189 */
2190 btrfs_clear_space_info_full(root->fs_info);
2191
2192 unlock_chunks(root);
2193 root->fs_info->num_tolerated_disk_barrier_failures = 2248 root->fs_info->num_tolerated_disk_barrier_failures =
2194 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2249 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2195 ret = btrfs_commit_transaction(trans, root); 2250 ret = btrfs_commit_transaction(trans, root);
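The hunks above change two things at once: the chunk mutex no longer covers the whole of btrfs_init_new_device(), and btrfs_add_device() is now called for seeding and non-seeding filesystems alike, after the in-memory bookkeeping. The resulting lock nesting, sketched for orientation (illustrative, not a verbatim excerpt of the function):

    mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
    lock_chunks(root);      /* nests inside device_list_mutex */
    /* list_add_rcu(), super total_bytes/num_devices updates,
     * btrfs_clear_space_info_full() */
    unlock_chunks(root);
    mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

    if (seeding_dev) {      /* chunk mutex retaken only where needed */
            lock_chunks(root);
            ret = init_first_rw_device(trans, root, device);
            unlock_chunks(root);
    }
    ret = btrfs_add_device(trans, root, device);

This is also why the error_trans label just below drops its unlock_chunks() call: the chunk mutex is never held on entry to the error path any more.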
@@ -2221,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2221 return ret; 2276 return ret;
2222 2277
2223error_trans: 2278error_trans:
2224 unlock_chunks(root);
2225 btrfs_end_transaction(trans, root); 2279 btrfs_end_transaction(trans, root);
2226 rcu_string_free(device->name); 2280 rcu_string_free(device->name);
2227 btrfs_kobj_rm_device(root->fs_info, device); 2281 btrfs_kobj_rm_device(root->fs_info, device);
@@ -2236,6 +2290,7 @@ error:
2236} 2290}
2237 2291
2238int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2292int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2293 struct btrfs_device *srcdev,
2239 struct btrfs_device **device_out) 2294 struct btrfs_device **device_out)
2240{ 2295{
2241 struct request_queue *q; 2296 struct request_queue *q;
@@ -2248,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2248 int ret = 0; 2303 int ret = 0;
2249 2304
2250 *device_out = NULL; 2305 *device_out = NULL;
2251 if (fs_info->fs_devices->seeding) 2306 if (fs_info->fs_devices->seeding) {
2307 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2252 return -EINVAL; 2308 return -EINVAL;
2309 }
2253 2310
2254 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2311 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2255 fs_info->bdev_holder); 2312 fs_info->bdev_holder);
2256 if (IS_ERR(bdev)) 2313 if (IS_ERR(bdev)) {
2314 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2257 return PTR_ERR(bdev); 2315 return PTR_ERR(bdev);
2316 }
2258 2317
2259 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2318 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2260 2319
2261 devices = &fs_info->fs_devices->devices; 2320 devices = &fs_info->fs_devices->devices;
2262 list_for_each_entry(device, devices, dev_list) { 2321 list_for_each_entry(device, devices, dev_list) {
2263 if (device->bdev == bdev) { 2322 if (device->bdev == bdev) {
2323 btrfs_err(fs_info, "target device is in the filesystem!");
2264 ret = -EEXIST; 2324 ret = -EEXIST;
2265 goto error; 2325 goto error;
2266 } 2326 }
2267 } 2327 }
2268 2328
2329
2330 if (i_size_read(bdev->bd_inode) <
2331 btrfs_device_get_total_bytes(srcdev)) {
2332 btrfs_err(fs_info, "target device is smaller than source device!");
2333 ret = -EINVAL;
2334 goto error;
2335 }
2336
2337
2269 device = btrfs_alloc_device(NULL, &devid, NULL); 2338 device = btrfs_alloc_device(NULL, &devid, NULL);
2270 if (IS_ERR(device)) { 2339 if (IS_ERR(device)) {
2271 ret = PTR_ERR(device); 2340 ret = PTR_ERR(device);
@@ -2289,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2289 device->io_width = root->sectorsize; 2358 device->io_width = root->sectorsize;
2290 device->io_align = root->sectorsize; 2359 device->io_align = root->sectorsize;
2291 device->sector_size = root->sectorsize; 2360 device->sector_size = root->sectorsize;
2292 device->total_bytes = i_size_read(bdev->bd_inode); 2361 device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2293 device->disk_total_bytes = device->total_bytes; 2362 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2363 device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2364 ASSERT(list_empty(&srcdev->resized_list));
2365 device->commit_total_bytes = srcdev->commit_total_bytes;
2366 device->commit_bytes_used = device->bytes_used;
2294 device->dev_root = fs_info->dev_root; 2367 device->dev_root = fs_info->dev_root;
2295 device->bdev = bdev; 2368 device->bdev = bdev;
2296 device->in_fs_metadata = 1; 2369 device->in_fs_metadata = 1;
@@ -2302,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2302 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2375 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2303 fs_info->fs_devices->num_devices++; 2376 fs_info->fs_devices->num_devices++;
2304 fs_info->fs_devices->open_devices++; 2377 fs_info->fs_devices->open_devices++;
2305 if (device->can_discard)
2306 fs_info->fs_devices->num_can_discard++;
2307 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2378 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2308 2379
2309 *device_out = device; 2380 *device_out = device;
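btrfs_init_dev_replace_tgtdev() now receives the source device and uses it both to validate the target (a replacement disk smaller than the source is rejected up front, with a clear error message, rather than failing somewhere mid-copy) and to size the new device. A condensed view of the sizing, using the getters defined later in volumes.h (sketch):

    if (i_size_read(bdev->bd_inode) < btrfs_device_get_total_bytes(srcdev))
            return -EINVAL;                 /* target smaller than source */

    device->total_bytes      = btrfs_device_get_total_bytes(srcdev);
    device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
    device->bytes_used       = btrfs_device_get_bytes_used(srcdev);

Copying the source's sizes rather than the target's own i_size keeps the replacement byte-for-byte compatible with the device it mirrors.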
@@ -2362,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2362 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2433 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2363 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2434 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2364 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2435 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2365 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2436 btrfs_set_device_total_bytes(leaf, dev_item,
2366 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2437 btrfs_device_get_disk_total_bytes(device));
2438 btrfs_set_device_bytes_used(leaf, dev_item,
2439 btrfs_device_get_bytes_used(device));
2367 btrfs_mark_buffer_dirty(leaf); 2440 btrfs_mark_buffer_dirty(leaf);
2368 2441
2369out: 2442out:
@@ -2371,40 +2444,44 @@ out:
2371 return ret; 2444 return ret;
2372} 2445}
2373 2446
2374static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2447int btrfs_grow_device(struct btrfs_trans_handle *trans,
2375 struct btrfs_device *device, u64 new_size) 2448 struct btrfs_device *device, u64 new_size)
2376{ 2449{
2377 struct btrfs_super_block *super_copy = 2450 struct btrfs_super_block *super_copy =
2378 device->dev_root->fs_info->super_copy; 2451 device->dev_root->fs_info->super_copy;
2379 u64 old_total = btrfs_super_total_bytes(super_copy); 2452 struct btrfs_fs_devices *fs_devices;
2380 u64 diff = new_size - device->total_bytes; 2453 u64 old_total;
2454 u64 diff;
2381 2455
2382 if (!device->writeable) 2456 if (!device->writeable)
2383 return -EACCES; 2457 return -EACCES;
2458
2459 lock_chunks(device->dev_root);
2460 old_total = btrfs_super_total_bytes(super_copy);
2461 diff = new_size - device->total_bytes;
2462
2384 if (new_size <= device->total_bytes || 2463 if (new_size <= device->total_bytes ||
2385 device->is_tgtdev_for_dev_replace) 2464 device->is_tgtdev_for_dev_replace) {
2465 unlock_chunks(device->dev_root);
2386 return -EINVAL; 2466 return -EINVAL;
2467 }
2468
2469 fs_devices = device->dev_root->fs_info->fs_devices;
2387 2470
2388 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2471 btrfs_set_super_total_bytes(super_copy, old_total + diff);
2389 device->fs_devices->total_rw_bytes += diff; 2472 device->fs_devices->total_rw_bytes += diff;
2390 2473
2391 device->total_bytes = new_size; 2474 btrfs_device_set_total_bytes(device, new_size);
2392 device->disk_total_bytes = new_size; 2475 btrfs_device_set_disk_total_bytes(device, new_size);
2393 btrfs_clear_space_info_full(device->dev_root->fs_info); 2476 btrfs_clear_space_info_full(device->dev_root->fs_info);
2477 if (list_empty(&device->resized_list))
2478 list_add_tail(&device->resized_list,
2479 &fs_devices->resized_devices);
2480 unlock_chunks(device->dev_root);
2394 2481
2395 return btrfs_update_device(trans, device); 2482 return btrfs_update_device(trans, device);
2396} 2483}
2397 2484
2398int btrfs_grow_device(struct btrfs_trans_handle *trans,
2399 struct btrfs_device *device, u64 new_size)
2400{
2401 int ret;
2402 lock_chunks(device->dev_root);
2403 ret = __btrfs_grow_device(trans, device, new_size);
2404 unlock_chunks(device->dev_root);
2405 return ret;
2406}
2407
2408static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2485static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root, 2486 struct btrfs_root *root,
2410 u64 chunk_tree, u64 chunk_objectid, 2487 u64 chunk_tree, u64 chunk_objectid,
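The __btrfs_grow_device()/btrfs_grow_device() pair above collapses into a single function that takes the chunk mutex itself, and growth is no longer written straight through: the new size lands in total_bytes/disk_total_bytes via the set helpers, and the device is queued on resized_list so that commit_total_bytes only catches up at transaction commit (see btrfs_update_commit_device_size() near the end of this file). The deferred-commit pattern, reduced to its core (sketch):

    lock_chunks(device->dev_root);
    btrfs_device_set_total_bytes(device, new_size);
    btrfs_device_set_disk_total_bytes(device, new_size);
    if (list_empty(&device->resized_list))
            list_add_tail(&device->resized_list,
                          &fs_devices->resized_devices);
    unlock_chunks(device->dev_root);
    /* commit_total_bytes is refreshed in the commit path, so a super
     * block written mid-resize never sees a half-applied size. */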
@@ -2456,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2456 u32 cur; 2533 u32 cur;
2457 struct btrfs_key key; 2534 struct btrfs_key key;
2458 2535
2536 lock_chunks(root);
2459 array_size = btrfs_super_sys_array_size(super_copy); 2537 array_size = btrfs_super_sys_array_size(super_copy);
2460 2538
2461 ptr = super_copy->sys_chunk_array; 2539 ptr = super_copy->sys_chunk_array;
@@ -2485,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2485 cur += len; 2563 cur += len;
2486 } 2564 }
2487 } 2565 }
2566 unlock_chunks(root);
2488 return ret; 2567 return ret;
2489} 2568}
2490 2569
2491static int btrfs_relocate_chunk(struct btrfs_root *root, 2570int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2492 u64 chunk_tree, u64 chunk_objectid, 2571 struct btrfs_root *root, u64 chunk_offset)
2493 u64 chunk_offset)
2494{ 2572{
2495 struct extent_map_tree *em_tree; 2573 struct extent_map_tree *em_tree;
2496 struct btrfs_root *extent_root;
2497 struct btrfs_trans_handle *trans;
2498 struct extent_map *em; 2574 struct extent_map *em;
2575 struct btrfs_root *extent_root = root->fs_info->extent_root;
2499 struct map_lookup *map; 2576 struct map_lookup *map;
2500 int ret; 2577 u64 dev_extent_len = 0;
2501 int i; 2578 u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2579 u64 chunk_tree = root->fs_info->chunk_root->objectid;
2580 int i, ret = 0;
2502 2581
2582 /* Just in case */
2503 root = root->fs_info->chunk_root; 2583 root = root->fs_info->chunk_root;
2504 extent_root = root->fs_info->extent_root;
2505 em_tree = &root->fs_info->mapping_tree.map_tree; 2584 em_tree = &root->fs_info->mapping_tree.map_tree;
2506 2585
2507 ret = btrfs_can_relocate(extent_root, chunk_offset);
2508 if (ret)
2509 return -ENOSPC;
2510
2511 /* step one, relocate all the extents inside this chunk */
2512 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2513 if (ret)
2514 return ret;
2515
2516 trans = btrfs_start_transaction(root, 0);
2517 if (IS_ERR(trans)) {
2518 ret = PTR_ERR(trans);
2519 btrfs_std_error(root->fs_info, ret);
2520 return ret;
2521 }
2522
2523 lock_chunks(root);
2524
2525 /*
2526 * step two, delete the device extents and the
2527 * chunk tree entries
2528 */
2529 read_lock(&em_tree->lock); 2586 read_lock(&em_tree->lock);
2530 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2587 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2531 read_unlock(&em_tree->lock); 2588 read_unlock(&em_tree->lock);
2532 2589
2533 BUG_ON(!em || em->start > chunk_offset || 2590 if (!em || em->start > chunk_offset ||
2534 em->start + em->len < chunk_offset); 2591 em->start + em->len < chunk_offset) {
2592 /*
2593 * This is a logic error, but we don't want to just rely on the
 2594 * user having built with ASSERT enabled, so if ASSERT doesn't
2595 * do anything we still error out.
2596 */
2597 ASSERT(0);
2598 if (em)
2599 free_extent_map(em);
2600 return -EINVAL;
2601 }
2535 map = (struct map_lookup *)em->bdev; 2602 map = (struct map_lookup *)em->bdev;
2536 2603
2537 for (i = 0; i < map->num_stripes; i++) { 2604 for (i = 0; i < map->num_stripes; i++) {
2538 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2605 struct btrfs_device *device = map->stripes[i].dev;
2539 map->stripes[i].physical); 2606 ret = btrfs_free_dev_extent(trans, device,
2540 BUG_ON(ret); 2607 map->stripes[i].physical,
2608 &dev_extent_len);
2609 if (ret) {
2610 btrfs_abort_transaction(trans, root, ret);
2611 goto out;
2612 }
2613
2614 if (device->bytes_used > 0) {
2615 lock_chunks(root);
2616 btrfs_device_set_bytes_used(device,
2617 device->bytes_used - dev_extent_len);
2618 spin_lock(&root->fs_info->free_chunk_lock);
2619 root->fs_info->free_chunk_space += dev_extent_len;
2620 spin_unlock(&root->fs_info->free_chunk_lock);
2621 btrfs_clear_space_info_full(root->fs_info);
2622 unlock_chunks(root);
2623 }
2541 2624
2542 if (map->stripes[i].dev) { 2625 if (map->stripes[i].dev) {
2543 ret = btrfs_update_device(trans, map->stripes[i].dev); 2626 ret = btrfs_update_device(trans, map->stripes[i].dev);
2544 BUG_ON(ret); 2627 if (ret) {
2628 btrfs_abort_transaction(trans, root, ret);
2629 goto out;
2630 }
2545 } 2631 }
2546 } 2632 }
2547 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2633 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2548 chunk_offset); 2634 chunk_offset);
2549 2635 if (ret) {
2550 BUG_ON(ret); 2636 btrfs_abort_transaction(trans, root, ret);
2637 goto out;
2638 }
2551 2639
2552 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2640 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2553 2641
2554 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2642 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2555 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2643 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2556 BUG_ON(ret); 2644 if (ret) {
2645 btrfs_abort_transaction(trans, root, ret);
2646 goto out;
2647 }
2557 } 2648 }
2558 2649
2559 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2560 BUG_ON(ret); 2651 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out;
2654 }
2561 2655
2562 write_lock(&em_tree->lock); 2656 write_lock(&em_tree->lock);
2563 remove_extent_mapping(em_tree, em); 2657 remove_extent_mapping(em_tree, em);
@@ -2565,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
2565 2659
2566 /* once for the tree */ 2660 /* once for the tree */
2567 free_extent_map(em); 2661 free_extent_map(em);
2662out:
2568 /* once for us */ 2663 /* once for us */
2569 free_extent_map(em); 2664 free_extent_map(em);
2665 return ret;
2666}
2570 2667
2571 unlock_chunks(root); 2668static int btrfs_relocate_chunk(struct btrfs_root *root,
2669 u64 chunk_tree, u64 chunk_objectid,
2670 u64 chunk_offset)
2671{
2672 struct btrfs_root *extent_root;
2673 struct btrfs_trans_handle *trans;
2674 int ret;
2675
2676 root = root->fs_info->chunk_root;
2677 extent_root = root->fs_info->extent_root;
2678
2679 ret = btrfs_can_relocate(extent_root, chunk_offset);
2680 if (ret)
2681 return -ENOSPC;
2682
2683 /* step one, relocate all the extents inside this chunk */
2684 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2685 if (ret)
2686 return ret;
2687
2688 trans = btrfs_start_transaction(root, 0);
2689 if (IS_ERR(trans)) {
2690 ret = PTR_ERR(trans);
2691 btrfs_std_error(root->fs_info, ret);
2692 return ret;
2693 }
2694
2695 /*
2696 * step two, delete the device extents and the
2697 * chunk tree entries
2698 */
2699 ret = btrfs_remove_chunk(trans, root, chunk_offset);
2572 btrfs_end_transaction(trans, root); 2700 btrfs_end_transaction(trans, root);
2573 return 0; 2701 return ret;
2574} 2702}
2575 2703
2576static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2704static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
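Chunk removal is now its own exported helper: btrfs_relocate_chunk() keeps step one (btrfs_can_relocate() plus btrfs_relocate_block_group()) and delegates step two to btrfs_remove_chunk(), which runs entirely inside the caller's transaction and aborts it on failure instead of the old BUG_ON() calls. Exporting it lets other transactional contexts, such as the device replace code, delete a chunk directly. A hypothetical external caller (illustrative; the real call site is not part of this diff):

    trans = btrfs_start_transaction(root, 0);
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    ret = btrfs_remove_chunk(trans, root, chunk_offset);
    /* on error the transaction is already aborted inside the helper */
    btrfs_end_transaction(trans, root);

Note also that per-device bytes_used is given back under lock_chunks() as each device extent is freed, so allocator state stays consistent while the chunk is dismantled.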
@@ -2623,8 +2751,8 @@ again:
2623 found_key.offset); 2751 found_key.offset);
2624 if (ret == -ENOSPC) 2752 if (ret == -ENOSPC)
2625 failed++; 2753 failed++;
2626 else if (ret) 2754 else
2627 BUG(); 2755 BUG_ON(ret);
2628 } 2756 }
2629 2757
2630 if (found_key.offset == 0) 2758 if (found_key.offset == 0)
@@ -3031,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3031 /* step one make some room on all the devices */ 3159 /* step one make some room on all the devices */
3032 devices = &fs_info->fs_devices->devices; 3160 devices = &fs_info->fs_devices->devices;
3033 list_for_each_entry(device, devices, dev_list) { 3161 list_for_each_entry(device, devices, dev_list) {
3034 old_size = device->total_bytes; 3162 old_size = btrfs_device_get_total_bytes(device);
3035 size_to_free = div_factor(old_size, 1); 3163 size_to_free = div_factor(old_size, 1);
3036 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3164 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3037 if (!device->writeable || 3165 if (!device->writeable ||
3038 device->total_bytes - device->bytes_used > size_to_free || 3166 btrfs_device_get_total_bytes(device) -
3167 btrfs_device_get_bytes_used(device) > size_to_free ||
3039 device->is_tgtdev_for_dev_replace) 3168 device->is_tgtdev_for_dev_replace)
3040 continue; 3169 continue;
3041 3170
@@ -3590,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
3590 max_key.type = BTRFS_ROOT_ITEM_KEY; 3719 max_key.type = BTRFS_ROOT_ITEM_KEY;
3591 max_key.offset = (u64)-1; 3720 max_key.offset = (u64)-1;
3592 3721
3593 path->keep_locks = 1;
3594
3595 while (1) { 3722 while (1) {
3596 ret = btrfs_search_forward(root, &key, path, 0); 3723 ret = btrfs_search_forward(root, &key, path, 0);
3597 if (ret) { 3724 if (ret) {
@@ -3843,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3843 struct btrfs_key key; 3970 struct btrfs_key key;
3844 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3971 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3845 u64 old_total = btrfs_super_total_bytes(super_copy); 3972 u64 old_total = btrfs_super_total_bytes(super_copy);
3846 u64 old_size = device->total_bytes; 3973 u64 old_size = btrfs_device_get_total_bytes(device);
3847 u64 diff = device->total_bytes - new_size; 3974 u64 diff = old_size - new_size;
3848 3975
3849 if (device->is_tgtdev_for_dev_replace) 3976 if (device->is_tgtdev_for_dev_replace)
3850 return -EINVAL; 3977 return -EINVAL;
@@ -3857,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3857 3984
3858 lock_chunks(root); 3985 lock_chunks(root);
3859 3986
3860 device->total_bytes = new_size; 3987 btrfs_device_set_total_bytes(device, new_size);
3861 if (device->writeable) { 3988 if (device->writeable) {
3862 device->fs_devices->total_rw_bytes -= diff; 3989 device->fs_devices->total_rw_bytes -= diff;
3863 spin_lock(&root->fs_info->free_chunk_lock); 3990 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3923,7 +4050,7 @@ again:
3923 ret = -ENOSPC; 4050 ret = -ENOSPC;
3924 lock_chunks(root); 4051 lock_chunks(root);
3925 4052
3926 device->total_bytes = old_size; 4053 btrfs_device_set_total_bytes(device, old_size);
3927 if (device->writeable) 4054 if (device->writeable)
3928 device->fs_devices->total_rw_bytes += diff; 4055 device->fs_devices->total_rw_bytes += diff;
3929 spin_lock(&root->fs_info->free_chunk_lock); 4056 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3941,18 +4068,17 @@ again:
3941 } 4068 }
3942 4069
3943 lock_chunks(root); 4070 lock_chunks(root);
4071 btrfs_device_set_disk_total_bytes(device, new_size);
4072 if (list_empty(&device->resized_list))
4073 list_add_tail(&device->resized_list,
4074 &root->fs_info->fs_devices->resized_devices);
3944 4075
3945 device->disk_total_bytes = new_size;
3946 /* Now btrfs_update_device() will change the on-disk size. */
3947 ret = btrfs_update_device(trans, device);
3948 if (ret) {
3949 unlock_chunks(root);
3950 btrfs_end_transaction(trans, root);
3951 goto done;
3952 }
3953 WARN_ON(diff > old_total); 4076 WARN_ON(diff > old_total);
3954 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4077 btrfs_set_super_total_bytes(super_copy, old_total - diff);
3955 unlock_chunks(root); 4078 unlock_chunks(root);
4079
4080 /* Now btrfs_update_device() will change the on-disk size. */
4081 ret = btrfs_update_device(trans, device);
3956 btrfs_end_transaction(trans, root); 4082 btrfs_end_transaction(trans, root);
3957done: 4083done:
3958 btrfs_free_path(path); 4084 btrfs_free_path(path);
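btrfs_shrink_device() now follows the same scheme as the grow path: the shrunken disk_total_bytes is set through the helper, the device is queued on resized_list, and the super block total is adjusted, all under the chunk mutex; btrfs_update_device() moves after unlock_chunks(), presumably because the item update takes tree locks of its own and no longer needs chunk-mutex coverage. The ordering this preserves (sketch):

    lock_chunks(root);
    btrfs_device_set_disk_total_bytes(device, new_size);
    if (list_empty(&device->resized_list))
            list_add_tail(&device->resized_list,
                          &root->fs_info->fs_devices->resized_devices);
    btrfs_set_super_total_bytes(super_copy, old_total - diff);
    unlock_chunks(root);
    ret = btrfs_update_device(trans, device);  /* outside the chunk mutex */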
@@ -3968,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3968 u32 array_size; 4094 u32 array_size;
3969 u8 *ptr; 4095 u8 *ptr;
3970 4096
4097 lock_chunks(root);
3971 array_size = btrfs_super_sys_array_size(super_copy); 4098 array_size = btrfs_super_sys_array_size(super_copy);
3972 if (array_size + item_size + sizeof(disk_key) 4099 if (array_size + item_size + sizeof(disk_key)
3973 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4100 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4101 unlock_chunks(root);
3974 return -EFBIG; 4102 return -EFBIG;
4103 }
3975 4104
3976 ptr = super_copy->sys_chunk_array + array_size; 4105 ptr = super_copy->sys_chunk_array + array_size;
3977 btrfs_cpu_key_to_disk(&disk_key, key); 4106 btrfs_cpu_key_to_disk(&disk_key, key);
@@ -3980,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3980 memcpy(ptr, chunk, item_size); 4109 memcpy(ptr, chunk, item_size);
3981 item_size += sizeof(disk_key); 4110 item_size += sizeof(disk_key);
3982 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4111 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4112 unlock_chunks(root);
4113
3983 return 0; 4114 return 0;
3984} 4115}
3985 4116
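Both manipulators of the in-memory system chunk array, btrfs_del_sys_chunk() earlier and btrfs_add_system_chunk() here, now take and release the chunk mutex themselves rather than relying on every caller holding it. Their critical sections bracket all reads and writes of sys_chunk_array and its size field (sketch of the shape):

    lock_chunks(root);
    array_size = btrfs_super_sys_array_size(super_copy);
    if (array_size + item_size + sizeof(disk_key)
                    > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
            unlock_chunks(root);
            return -EFBIG;          /* every early return must unlock */
    }
    /* memmove()/memcpy() within sys_chunk_array ... */
    btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
    unlock_chunks(root);

Self-locking makes the helpers safe for the new callers, such as btrfs_remove_chunk(), that no longer run with the chunk mutex held throughout.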
@@ -4349,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4349 if (ret) 4480 if (ret)
4350 goto error_del_extent; 4481 goto error_del_extent;
4351 4482
4483 for (i = 0; i < map->num_stripes; i++) {
4484 num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4485 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4486 }
4487
4488 spin_lock(&extent_root->fs_info->free_chunk_lock);
4489 extent_root->fs_info->free_chunk_space -= (stripe_size *
4490 map->num_stripes);
4491 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4492
4352 free_extent_map(em); 4493 free_extent_map(em);
4353 check_raid56_incompat_flag(extent_root->fs_info, type); 4494 check_raid56_incompat_flag(extent_root->fs_info, type);
4354 4495
@@ -4420,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4420 device = map->stripes[i].dev; 4561 device = map->stripes[i].dev;
4421 dev_offset = map->stripes[i].physical; 4562 dev_offset = map->stripes[i].physical;
4422 4563
4423 device->bytes_used += stripe_size;
4424 ret = btrfs_update_device(trans, device); 4564 ret = btrfs_update_device(trans, device);
4425 if (ret) 4565 if (ret)
4426 goto out; 4566 goto out;
@@ -4433,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4433 goto out; 4573 goto out;
4434 } 4574 }
4435 4575
4436 spin_lock(&extent_root->fs_info->free_chunk_lock);
4437 extent_root->fs_info->free_chunk_space -= (stripe_size *
4438 map->num_stripes);
4439 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4440
4441 stripe = &chunk->stripe; 4576 stripe = &chunk->stripe;
4442 for (i = 0; i < map->num_stripes; i++) { 4577 for (i = 0; i < map->num_stripes; i++) {
4443 device = map->stripes[i].dev; 4578 device = map->stripes[i].dev;
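Per-device accounting moves from btrfs_finish_chunk_alloc() back into __btrfs_alloc_chunk(): bytes_used is bumped and free_chunk_space reduced at the moment the mapping is created, not when the chunk item is eventually written out. Since finish_chunk_alloc can run noticeably later (pending chunks are flushed toward transaction commit), the old placement left a window where the allocator's view of free space lagged behind reality. The relocated accounting (sketch):

    for (i = 0; i < map->num_stripes; i++) {
            num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
            btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
    }
    spin_lock(&extent_root->fs_info->free_chunk_lock);
    extent_root->fs_info->free_chunk_space -= stripe_size * map->num_stripes;
    spin_unlock(&extent_root->fs_info->free_chunk_lock);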
@@ -4517,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4517 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4652 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4518 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4653 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4519 alloc_profile); 4654 alloc_profile);
4520 if (ret) { 4655 return ret;
4521 btrfs_abort_transaction(trans, root, ret); 4656}
4522 goto out; 4657
4658static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4659{
4660 int max_errors;
4661
4662 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4663 BTRFS_BLOCK_GROUP_RAID10 |
4664 BTRFS_BLOCK_GROUP_RAID5 |
4665 BTRFS_BLOCK_GROUP_DUP)) {
4666 max_errors = 1;
4667 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4668 max_errors = 2;
4669 } else {
4670 max_errors = 0;
4523 } 4671 }
4524 4672
4525 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4673 return max_errors;
4526 if (ret)
4527 btrfs_abort_transaction(trans, root, ret);
4528out:
4529 return ret;
4530} 4674}
4531 4675
4532int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4676int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -4535,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4535 struct map_lookup *map; 4679 struct map_lookup *map;
4536 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4680 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4537 int readonly = 0; 4681 int readonly = 0;
4682 int miss_ndevs = 0;
4538 int i; 4683 int i;
4539 4684
4540 read_lock(&map_tree->map_tree.lock); 4685 read_lock(&map_tree->map_tree.lock);
@@ -4543,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4543 if (!em) 4688 if (!em)
4544 return 1; 4689 return 1;
4545 4690
4546 if (btrfs_test_opt(root, DEGRADED)) {
4547 free_extent_map(em);
4548 return 0;
4549 }
4550
4551 map = (struct map_lookup *)em->bdev; 4691 map = (struct map_lookup *)em->bdev;
4552 for (i = 0; i < map->num_stripes; i++) { 4692 for (i = 0; i < map->num_stripes; i++) {
4693 if (map->stripes[i].dev->missing) {
4694 miss_ndevs++;
4695 continue;
4696 }
4697
4553 if (!map->stripes[i].dev->writeable) { 4698 if (!map->stripes[i].dev->writeable) {
4554 readonly = 1; 4699 readonly = 1;
4555 break; 4700 goto end;
4556 } 4701 }
4557 } 4702 }
4703
4704 /*
4705 * If the number of missing devices is larger than max errors,
 4706 * we cannot write the data into that chunk successfully, so
4707 * set it readonly.
4708 */
4709 if (miss_ndevs > btrfs_chunk_max_errors(map))
4710 readonly = 1;
4711end:
4558 free_extent_map(em); 4712 free_extent_map(em);
4559 return readonly; 4713 return readonly;
4560} 4714}
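btrfs_chunk_readonly() previously treated a DEGRADED mount as globally writable; it now counts the missing devices in each chunk and compares against the profile's tolerance via the new btrfs_chunk_max_errors() helper (1 for RAID1/RAID10/RAID5/DUP, 2 for RAID6, 0 otherwise). A chunk goes read-only only when more devices are missing than the profile can absorb; the decision, simplified (the real function also drops its extent_map reference):

    for (i = 0; i < map->num_stripes; i++) {
            if (map->stripes[i].dev->missing) {
                    miss_ndevs++;
                    continue;       /* missing != unwritable per se */
            }
            if (!map->stripes[i].dev->writeable)
                    return 1;       /* any read-only member wins */
    }
    return miss_ndevs > btrfs_chunk_max_errors(map);

So a degraded RAID1 chunk stays writable while a degraded RAID0 chunk correctly goes read-only.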
@@ -4955,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4955 num_stripes = min_t(u64, map->num_stripes, 5109 num_stripes = min_t(u64, map->num_stripes,
4956 stripe_nr_end - stripe_nr_orig); 5110 stripe_nr_end - stripe_nr_orig);
4957 stripe_index = do_div(stripe_nr, map->num_stripes); 5111 stripe_index = do_div(stripe_nr, map->num_stripes);
5112 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5113 mirror_num = 1;
4958 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5114 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
4959 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 5115 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
4960 num_stripes = map->num_stripes; 5116 num_stripes = map->num_stripes;
@@ -5058,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5058 /* We distribute the parity blocks across stripes */ 5214 /* We distribute the parity blocks across stripes */
5059 tmp = stripe_nr + stripe_index; 5215 tmp = stripe_nr + stripe_index;
5060 stripe_index = do_div(tmp, map->num_stripes); 5216 stripe_index = do_div(tmp, map->num_stripes);
5217 if (!(rw & (REQ_WRITE | REQ_DISCARD |
5218 REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5219 mirror_num = 1;
5061 } 5220 }
5062 } else { 5221 } else {
5063 /* 5222 /*
@@ -5165,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5165 } 5324 }
5166 } 5325 }
5167 5326
5168 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5169 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5328 max_errors = btrfs_chunk_max_errors(map);
5170 BTRFS_BLOCK_GROUP_RAID10 |
5171 BTRFS_BLOCK_GROUP_RAID5 |
5172 BTRFS_BLOCK_GROUP_DUP)) {
5173 max_errors = 1;
5174 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5175 max_errors = 2;
5176 }
5177 }
5178 5329
5179 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5180 dev_replace->tgtdev != NULL) { 5331 dev_replace->tgtdev != NULL) {
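Two of the hunks above make __btrfs_map_block() report a definite mirror_num for plain reads on striped profiles: RAID0, and the RAID5/6 case where parity is distributed, now return mirror_num = 1 when none of REQ_WRITE, REQ_DISCARD or REQ_GET_READ_MIRRORS is set, instead of leaving it zero. Callers that key error handling off the mirror number, such as read-repair, then always have a valid copy index:

    if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
            mirror_num = 1;   /* striped profiles have exactly one copy */

The third hunk simply reuses btrfs_chunk_max_errors() for the max_errors computation that used to be open-coded here.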
@@ -5557,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5557 name = rcu_dereference(dev->name); 5708 name = rcu_dereference(dev->name);
5558 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5709 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5559 "(%s id %llu), size=%u\n", rw, 5710 "(%s id %llu), size=%u\n", rw,
5560 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5711 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5561 name->str, dev->devid, bio->bi_size); 5712 name->str, dev->devid, bio->bi_iter.bi_size);
5562 rcu_read_unlock(); 5713 rcu_read_unlock();
5563 } 5714 }
5564#endif 5715#endif
@@ -5736,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5736} 5887}
5737 5888
5738static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5889static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5890 struct btrfs_fs_devices *fs_devices,
5739 u64 devid, u8 *dev_uuid) 5891 u64 devid, u8 *dev_uuid)
5740{ 5892{
5741 struct btrfs_device *device; 5893 struct btrfs_device *device;
5742 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5743 5894
5744 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5895 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5745 if (IS_ERR(device)) 5896 if (IS_ERR(device))
@@ -5800,7 +5951,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5800 else 5951 else
5801 generate_random_uuid(dev->uuid); 5952 generate_random_uuid(dev->uuid);
5802 5953
5803 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); 5954 btrfs_init_work(&dev->work, btrfs_submit_helper,
5955 pending_bios_fn, NULL, NULL);
5804 5956
5805 return dev; 5957 return dev;
5806} 5958}
@@ -5875,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5875 } 6027 }
5876 if (!map->stripes[i].dev) { 6028 if (!map->stripes[i].dev) {
5877 map->stripes[i].dev = 6029 map->stripes[i].dev =
5878 add_missing_dev(root, devid, uuid); 6030 add_missing_dev(root, root->fs_info->fs_devices,
6031 devid, uuid);
5879 if (!map->stripes[i].dev) { 6032 if (!map->stripes[i].dev) {
5880 free_extent_map(em); 6033 free_extent_map(em);
5881 return -EIO; 6034 return -EIO;
@@ -5902,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5902 device->devid = btrfs_device_id(leaf, dev_item); 6055 device->devid = btrfs_device_id(leaf, dev_item);
5903 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6056 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5904 device->total_bytes = device->disk_total_bytes; 6057 device->total_bytes = device->disk_total_bytes;
6058 device->commit_total_bytes = device->disk_total_bytes;
5905 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6059 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6060 device->commit_bytes_used = device->bytes_used;
5906 device->type = btrfs_device_type(leaf, dev_item); 6061 device->type = btrfs_device_type(leaf, dev_item);
5907 device->io_align = btrfs_device_io_align(leaf, dev_item); 6062 device->io_align = btrfs_device_io_align(leaf, dev_item);
5908 device->io_width = btrfs_device_io_width(leaf, dev_item); 6063 device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5914,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5914 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6069 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5915} 6070}
5916 6071
5917static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 6072static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6073 u8 *fsid)
5918{ 6074{
5919 struct btrfs_fs_devices *fs_devices; 6075 struct btrfs_fs_devices *fs_devices;
5920 int ret; 6076 int ret;
@@ -5923,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5923 6079
5924 fs_devices = root->fs_info->fs_devices->seed; 6080 fs_devices = root->fs_info->fs_devices->seed;
5925 while (fs_devices) { 6081 while (fs_devices) {
5926 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 6082 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
5927 ret = 0; 6083 return fs_devices;
5928 goto out; 6084
5929 }
5930 fs_devices = fs_devices->seed; 6085 fs_devices = fs_devices->seed;
5931 } 6086 }
5932 6087
5933 fs_devices = find_fsid(fsid); 6088 fs_devices = find_fsid(fsid);
5934 if (!fs_devices) { 6089 if (!fs_devices) {
5935 ret = -ENOENT; 6090 if (!btrfs_test_opt(root, DEGRADED))
5936 goto out; 6091 return ERR_PTR(-ENOENT);
6092
6093 fs_devices = alloc_fs_devices(fsid);
6094 if (IS_ERR(fs_devices))
6095 return fs_devices;
6096
6097 fs_devices->seeding = 1;
6098 fs_devices->opened = 1;
6099 return fs_devices;
5937 } 6100 }
5938 6101
5939 fs_devices = clone_fs_devices(fs_devices); 6102 fs_devices = clone_fs_devices(fs_devices);
5940 if (IS_ERR(fs_devices)) { 6103 if (IS_ERR(fs_devices))
5941 ret = PTR_ERR(fs_devices); 6104 return fs_devices;
5942 goto out;
5943 }
5944 6105
5945 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6106 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
5946 root->fs_info->bdev_holder); 6107 root->fs_info->bdev_holder);
5947 if (ret) { 6108 if (ret) {
5948 free_fs_devices(fs_devices); 6109 free_fs_devices(fs_devices);
6110 fs_devices = ERR_PTR(ret);
5949 goto out; 6111 goto out;
5950 } 6112 }
5951 6113
5952 if (!fs_devices->seeding) { 6114 if (!fs_devices->seeding) {
5953 __btrfs_close_devices(fs_devices); 6115 __btrfs_close_devices(fs_devices);
5954 free_fs_devices(fs_devices); 6116 free_fs_devices(fs_devices);
5955 ret = -EINVAL; 6117 fs_devices = ERR_PTR(-EINVAL);
5956 goto out; 6118 goto out;
5957 } 6119 }
5958 6120
5959 fs_devices->seed = root->fs_info->fs_devices->seed; 6121 fs_devices->seed = root->fs_info->fs_devices->seed;
5960 root->fs_info->fs_devices->seed = fs_devices; 6122 root->fs_info->fs_devices->seed = fs_devices;
5961out: 6123out:
5962 return ret; 6124 return fs_devices;
5963} 6125}
5964 6126
5965static int read_one_dev(struct btrfs_root *root, 6127static int read_one_dev(struct btrfs_root *root,
5966 struct extent_buffer *leaf, 6128 struct extent_buffer *leaf,
5967 struct btrfs_dev_item *dev_item) 6129 struct btrfs_dev_item *dev_item)
5968{ 6130{
6131 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5969 struct btrfs_device *device; 6132 struct btrfs_device *device;
5970 u64 devid; 6133 u64 devid;
5971 int ret; 6134 int ret;
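open_seed_devices() changes calling convention from an int return to returning the struct btrfs_fs_devices pointer itself (ERR_PTR on failure), which read_one_dev() needs in order to attach devices to the correct seed filesystem. On a DEGRADED mount, a seed fsid with no discovered devices no longer fails with -ENOENT; instead a placeholder is allocated so missing seed devices can be stubbed in later (sketch, names from this patch):

    fs_devices = find_fsid(fsid);
    if (!fs_devices) {
            if (!btrfs_test_opt(root, DEGRADED))
                    return ERR_PTR(-ENOENT);
            fs_devices = alloc_fs_devices(fsid);    /* placeholder */
            if (IS_ERR(fs_devices))
                    return fs_devices;
            fs_devices->seeding = 1;
            fs_devices->opened = 1;
    }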
@@ -5979,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
5979 BTRFS_UUID_SIZE); 6142 BTRFS_UUID_SIZE);
5980 6143
5981 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 6144 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
5982 ret = open_seed_devices(root, fs_uuid); 6145 fs_devices = open_seed_devices(root, fs_uuid);
5983 if (ret && !btrfs_test_opt(root, DEGRADED)) 6146 if (IS_ERR(fs_devices))
5984 return ret; 6147 return PTR_ERR(fs_devices);
5985 } 6148 }
5986 6149
5987 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 6150 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
5988 if (!device || !device->bdev) { 6151 if (!device) {
5989 if (!btrfs_test_opt(root, DEGRADED)) 6152 if (!btrfs_test_opt(root, DEGRADED))
5990 return -EIO; 6153 return -EIO;
5991 6154
5992 if (!device) { 6155 btrfs_warn(root->fs_info, "devid %llu missing", devid);
5993 btrfs_warn(root->fs_info, "devid %llu missing", devid); 6156 device = add_missing_dev(root, fs_devices, devid, dev_uuid);
5994 device = add_missing_dev(root, devid, dev_uuid); 6157 if (!device)
5995 if (!device) 6158 return -ENOMEM;
5996 return -ENOMEM; 6159 } else {
5997 } else if (!device->missing) { 6160 if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6161 return -EIO;
6162
 6163 if (!device->bdev && !device->missing) {
5998 /* 6164 /*
5999 * this happens when a device that was properly setup 6165 * this happens when a device that was properly setup
6000 * in the device info lists suddenly goes bad. 6166 * in the device info lists suddenly goes bad.
6001 * device->bdev is NULL, and so we have to set 6167 * device->bdev is NULL, and so we have to set
6002 * device->missing to one here 6168 * device->missing to one here
6003 */ 6169 */
6004 root->fs_info->fs_devices->missing_devices++; 6170 device->fs_devices->missing_devices++;
6005 device->missing = 1; 6171 device->missing = 1;
6006 } 6172 }
6173
6174 /* Move the device to its own fs_devices */
6175 if (device->fs_devices != fs_devices) {
6176 ASSERT(device->missing);
6177
6178 list_move(&device->dev_list, &fs_devices->devices);
6179 device->fs_devices->num_devices--;
6180 fs_devices->num_devices++;
6181
6182 device->fs_devices->missing_devices--;
6183 fs_devices->missing_devices++;
6184
6185 device->fs_devices = fs_devices;
6186 }
6007 } 6187 }
6008 6188
6009 if (device->fs_devices != root->fs_info->fs_devices) { 6189 if (device->fs_devices != root->fs_info->fs_devices) {
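read_one_dev() is restructured around that pointer: a device that exists but has no bdev is only tolerated under DEGRADED, the missing_devices counter is bumped on the device's own fs_devices rather than always on the mounted one, and a missing device found hanging off the wrong fs_devices is migrated, with both num_devices and missing_devices moved along with it (sketch of the migration):

    if (device->fs_devices != fs_devices) {
            ASSERT(device->missing);
            list_move(&device->dev_list, &fs_devices->devices);
            device->fs_devices->num_devices--;
            fs_devices->num_devices++;
            device->fs_devices->missing_devices--;
            fs_devices->missing_devices++;
            device->fs_devices = fs_devices;
    }

Keeping the counters per-fs_devices is what lets a sprouted filesystem account for missing seed devices separately from its own.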
@@ -6319,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6319 struct btrfs_root *dev_root = fs_info->dev_root; 6499 struct btrfs_root *dev_root = fs_info->dev_root;
6320 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6500 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6321 struct btrfs_device *device; 6501 struct btrfs_device *device;
6502 int stats_cnt;
6322 int ret = 0; 6503 int ret = 0;
6323 6504
6324 mutex_lock(&fs_devices->device_list_mutex); 6505 mutex_lock(&fs_devices->device_list_mutex);
6325 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6506 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6326 if (!device->dev_stats_valid || !device->dev_stats_dirty) 6507 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6327 continue; 6508 continue;
6328 6509
6510 stats_cnt = atomic_read(&device->dev_stats_ccnt);
6329 ret = update_dev_stat_item(trans, dev_root, device); 6511 ret = update_dev_stat_item(trans, dev_root, device);
6330 if (!ret) 6512 if (!ret)
6331 device->dev_stats_dirty = 0; 6513 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6332 } 6514 }
6333 mutex_unlock(&fs_devices->device_list_mutex); 6515 mutex_unlock(&fs_devices->device_list_mutex);
6334 6516
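The dev_stats_dirty flag gives way to a change counter, dev_stats_ccnt, closing a race: with a plain flag, an increment landing between the item update and the flag clear was silently lost. Snapshotting the counter before the write and subtracting that snapshot afterwards means any concurrent increments leave a nonzero remainder, so the device stays dirty for the next commit:

    stats_cnt = atomic_read(&device->dev_stats_ccnt);   /* snapshot */
    ret = update_dev_stat_item(trans, dev_root, device);
    if (!ret)
            atomic_sub(stats_cnt, &device->dev_stats_ccnt);
    /* increments after the snapshot keep ccnt > 0 -> still dirty */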
@@ -6427,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
6427 6609
6428 return 0; 6610 return 0;
6429} 6611}
6612
6613/*
6614 * Update the size of all devices, which is used for writing out the
6615 * super blocks.
6616 */
6617void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6618{
6619 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6620 struct btrfs_device *curr, *next;
6621
6622 if (list_empty(&fs_devices->resized_devices))
6623 return;
6624
6625 mutex_lock(&fs_devices->device_list_mutex);
6626 lock_chunks(fs_info->dev_root);
6627 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6628 resized_list) {
6629 list_del_init(&curr->resized_list);
6630 curr->commit_total_bytes = curr->disk_total_bytes;
6631 }
6632 unlock_chunks(fs_info->dev_root);
6633 mutex_unlock(&fs_devices->device_list_mutex);
6634}
6635
6636/* Must be invoked during the transaction commit */
6637void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6638 struct btrfs_transaction *transaction)
6639{
6640 struct extent_map *em;
6641 struct map_lookup *map;
6642 struct btrfs_device *dev;
6643 int i;
6644
6645 if (list_empty(&transaction->pending_chunks))
6646 return;
6647
 6648 /* In order to kick off the device replace finish process */
6649 lock_chunks(root);
6650 list_for_each_entry(em, &transaction->pending_chunks, list) {
6651 map = (struct map_lookup *)em->bdev;
6652
6653 for (i = 0; i < map->num_stripes; i++) {
6654 dev = map->stripes[i].dev;
6655 dev->commit_bytes_used = dev->bytes_used;
6656 }
6657 }
6658 unlock_chunks(root);
6659}
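These two helpers exist so that super block writeout sees stable numbers: btrfs_update_commit_device_size() folds disk_total_bytes into commit_total_bytes for every queued resized device, and btrfs_update_commit_device_bytes_used() snapshots bytes_used into commit_bytes_used for each chunk pending in the transaction. A plausible placement in the commit path (illustrative; the actual call sites live in the transaction code, which is not part of this diff):

    /* inside btrfs_commit_transaction(), once no further chunk
     * allocations can join this transaction: */
    btrfs_update_commit_device_size(root->fs_info);
    btrfs_update_commit_device_bytes_used(root, cur_trans);
    /* the supers written for this commit then read the commit_*
     * fields, which no longer move underneath the writeout. */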
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c47816..08980fa23039 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
24#include <linux/btrfs.h> 24#include <linux/btrfs.h>
25#include "async-thread.h" 25#include "async-thread.h"
26 26
27extern struct mutex uuid_mutex;
28
27#define BTRFS_STRIPE_LEN (64 * 1024) 29#define BTRFS_STRIPE_LEN (64 * 1024)
28 30
29struct buffer_head; 31struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
32 struct bio *tail; 34 struct bio *tail;
33}; 35};
34 36
37/*
38 * Use sequence counter to get consistent device stat data on
39 * 32-bit processors.
40 */
41#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
42#include <linux/seqlock.h>
43#define __BTRFS_NEED_DEVICE_DATA_ORDERED
44#define btrfs_device_data_ordered_init(device) \
45 seqcount_init(&device->data_seqcount)
46#else
47#define btrfs_device_data_ordered_init(device) do { } while (0)
48#endif
49
35struct btrfs_device { 50struct btrfs_device {
36 struct list_head dev_list; 51 struct list_head dev_list;
37 struct list_head dev_alloc_list; 52 struct list_head dev_alloc_list;
38 struct btrfs_fs_devices *fs_devices; 53 struct btrfs_fs_devices *fs_devices;
54
39 struct btrfs_root *dev_root; 55 struct btrfs_root *dev_root;
40 56
57 struct rcu_string *name;
58
59 u64 generation;
60
61 spinlock_t io_lock ____cacheline_aligned;
62 int running_pending;
41 /* regular prio bios */ 63 /* regular prio bios */
42 struct btrfs_pending_bios pending_bios; 64 struct btrfs_pending_bios pending_bios;
43 /* WRITE_SYNC bios */ 65 /* WRITE_SYNC bios */
44 struct btrfs_pending_bios pending_sync_bios; 66 struct btrfs_pending_bios pending_sync_bios;
45 67
46 u64 generation; 68 struct block_device *bdev;
47 int running_pending; 69
70 /* the mode sent to blkdev_get */
71 fmode_t mode;
72
48 int writeable; 73 int writeable;
49 int in_fs_metadata; 74 int in_fs_metadata;
50 int missing; 75 int missing;
51 int can_discard; 76 int can_discard;
52 int is_tgtdev_for_dev_replace; 77 int is_tgtdev_for_dev_replace;
53 78
54 spinlock_t io_lock; 79#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
55 /* the mode sent to blkdev_get */ 80 seqcount_t data_seqcount;
56 fmode_t mode; 81#endif
57
58 struct block_device *bdev;
59
60
61 struct rcu_string *name;
62 82
63 /* the internal btrfs device id */ 83 /* the internal btrfs device id */
64 u64 devid; 84 u64 devid;
65 85
66 /* size of the device */ 86 /* size of the device in memory */
67 u64 total_bytes; 87 u64 total_bytes;
68 88
69 /* size of the disk */ 89 /* size of the device on disk */
70 u64 disk_total_bytes; 90 u64 disk_total_bytes;
71 91
72 /* bytes used */ 92 /* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
83 /* minimal io size for this device */ 103 /* minimal io size for this device */
84 u32 sector_size; 104 u32 sector_size;
85 105
86
87 /* physical drive uuid (or lvm uuid) */ 106 /* physical drive uuid (or lvm uuid) */
88 u8 uuid[BTRFS_UUID_SIZE]; 107 u8 uuid[BTRFS_UUID_SIZE];
89 108
109 /*
 110 * size of the device in the current transaction
 111 *
 112 * This value is updated when committing the transaction,
 113 * and is protected by device_list_mutex
114 */
115 u64 commit_total_bytes;
116
 117 /* bytes used in the current transaction */
118 u64 commit_bytes_used;
119 /*
 120 * used to track a device that has been resized
121 *
122 * It is protected by chunk_lock.
123 */
124 struct list_head resized_list;
125
90 /* for sending down flush barriers */ 126 /* for sending down flush barriers */
91 int nobarriers; 127 int nobarriers;
92 struct bio *flush_bio; 128 struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
107 struct radix_tree_root reada_zones; 143 struct radix_tree_root reada_zones;
108 struct radix_tree_root reada_extents; 144 struct radix_tree_root reada_extents;
109 145
110
111 /* disk I/O failure stats. For detailed description refer to 146 /* disk I/O failure stats. For detailed description refer to
112 * enum btrfs_dev_stat_values in ioctl.h */ 147 * enum btrfs_dev_stat_values in ioctl.h */
113 int dev_stats_valid; 148 int dev_stats_valid;
114 int dev_stats_dirty; /* counters need to be written to disk */ 149
150 /* Counter to record the change of device stats */
151 atomic_t dev_stats_ccnt;
115 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 152 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
116}; 153};
117 154
155/*
 156 * If we read these values while already holding their protecting lock,
 157 * we needn't use the following helpers; reading them directly is safe.
158 */
159#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
160#define BTRFS_DEVICE_GETSET_FUNCS(name) \
161static inline u64 \
162btrfs_device_get_##name(const struct btrfs_device *dev) \
163{ \
164 u64 size; \
165 unsigned int seq; \
166 \
167 do { \
168 seq = read_seqcount_begin(&dev->data_seqcount); \
169 size = dev->name; \
170 } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
171 return size; \
172} \
173 \
174static inline void \
175btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
176{ \
177 preempt_disable(); \
178 write_seqcount_begin(&dev->data_seqcount); \
179 dev->name = size; \
180 write_seqcount_end(&dev->data_seqcount); \
181 preempt_enable(); \
182}
183#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
184#define BTRFS_DEVICE_GETSET_FUNCS(name) \
185static inline u64 \
186btrfs_device_get_##name(const struct btrfs_device *dev) \
187{ \
188 u64 size; \
189 \
190 preempt_disable(); \
191 size = dev->name; \
192 preempt_enable(); \
193 return size; \
194} \
195 \
196static inline void \
197btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
198{ \
199 preempt_disable(); \
200 dev->name = size; \
201 preempt_enable(); \
202}
203#else
204#define BTRFS_DEVICE_GETSET_FUNCS(name) \
205static inline u64 \
206btrfs_device_get_##name(const struct btrfs_device *dev) \
207{ \
208 return dev->name; \
209} \
210 \
211static inline void \
212btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
213{ \
214 dev->name = size; \
215}
216#endif
217
218BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
219BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
220BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
221
118struct btrfs_fs_devices { 222struct btrfs_fs_devices {
119 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 223 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
120 224
121 /* the device with this id has the most recent copy of the super */
122 u64 latest_devid;
123 u64 latest_trans;
124 u64 num_devices; 225 u64 num_devices;
125 u64 open_devices; 226 u64 open_devices;
126 u64 rw_devices; 227 u64 rw_devices;
127 u64 missing_devices; 228 u64 missing_devices;
128 u64 total_rw_bytes; 229 u64 total_rw_bytes;
129 u64 num_can_discard;
130 u64 total_devices; 230 u64 total_devices;
131 struct block_device *latest_bdev; 231 struct block_device *latest_bdev;
132 232
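BTRFS_DEVICE_GETSET_FUNCS() stamps out accessor pairs for total_bytes, disk_total_bytes and bytes_used in three flavors: a seqcount retry loop on 32-bit SMP, where a bare u64 read can tear; preempt-disabled plain access on 32-bit preemptible kernels; and direct access everywhere else, where 64-bit loads and stores are naturally atomic. Typical use, as seen throughout volumes.c above:

    u64 old_size = btrfs_device_get_total_bytes(device);
    btrfs_device_set_total_bytes(device, old_size + diff);

On 64-bit builds both lines compile down to plain member access, so the abstraction costs nothing where it is not needed. The same hunk also drops the now-unused latest_devid/latest_trans pair and the num_can_discard counter from struct btrfs_fs_devices, matching the volumes.c hunks that stopped maintaining them.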
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
139 struct mutex device_list_mutex; 239 struct mutex device_list_mutex;
140 struct list_head devices; 240 struct list_head devices;
141 241
242 struct list_head resized_devices;
142 /* devices not currently being allocated */ 243 /* devices not currently being allocated */
143 struct list_head alloc_list; 244 struct list_head alloc_list;
144 struct list_head list; 245 struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
167 */ 268 */
168typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); 269typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
169struct btrfs_io_bio { 270struct btrfs_io_bio {
170 unsigned long mirror_num; 271 unsigned int mirror_num;
171 unsigned long stripe_index; 272 unsigned int stripe_index;
273 u64 logical;
172 u8 *csum; 274 u8 *csum;
173 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; 275 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
174 u8 *csum_allocated; 276 u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
325int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 427int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
326int btrfs_init_new_device(struct btrfs_root *root, char *path); 428int btrfs_init_new_device(struct btrfs_root *root, char *path);
327int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 429int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
430 struct btrfs_device *srcdev,
328 struct btrfs_device **device_out); 431 struct btrfs_device **device_out);
329int btrfs_balance(struct btrfs_balance_control *bctl, 432int btrfs_balance(struct btrfs_balance_control *bctl,
330 struct btrfs_ioctl_balance_args *bargs); 433 struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
360int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 463int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
361 struct btrfs_root *extent_root, 464 struct btrfs_root *extent_root,
362 u64 chunk_offset, u64 chunk_size); 465 u64 chunk_offset, u64 chunk_size);
466int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
467 struct btrfs_root *root, u64 chunk_offset);
468
469static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
470{
471 return atomic_read(&dev->dev_stats_ccnt);
472}
473
363static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 474static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
364 int index) 475 int index)
365{ 476{
366 atomic_inc(dev->dev_stat_values + index); 477 atomic_inc(dev->dev_stat_values + index);
367 dev->dev_stats_dirty = 1; 478 smp_mb__before_atomic();
479 atomic_inc(&dev->dev_stats_ccnt);
368} 480}
369 481
370static inline int btrfs_dev_stat_read(struct btrfs_device *dev, 482static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
379 int ret; 491 int ret;
380 492
381 ret = atomic_xchg(dev->dev_stat_values + index, 0); 493 ret = atomic_xchg(dev->dev_stat_values + index, 0);
382 dev->dev_stats_dirty = 1; 494 smp_mb__before_atomic();
495 atomic_inc(&dev->dev_stats_ccnt);
383 return ret; 496 return ret;
384} 497}
385 498
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
387 int index, unsigned long val) 500 int index, unsigned long val)
388{ 501{
389 atomic_set(dev->dev_stat_values + index, val); 502 atomic_set(dev->dev_stat_values + index, val);
390 dev->dev_stats_dirty = 1; 503 smp_mb__before_atomic();
504 atomic_inc(&dev->dev_stats_ccnt);
391} 505}
392 506
393static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, 507static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
395{ 509{
396 btrfs_dev_stat_set(dev, index, 0); 510 btrfs_dev_stat_set(dev, index, 0);
397} 511}
512
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction);
398#endif 516#endif
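Every stat mutator now pairs its data update with smp_mb__before_atomic() ahead of the ccnt increment, ordering the write to dev_stat_values before the counter bump. A commit thread that observes a nonzero ccnt is then expected to also observe the stat values that made it dirty; on the read side, the device_list_mutex serialization in btrfs_run_dev_stats() appears to supply the pairing ordering in practice. The writer-side contract in miniature:

    atomic_inc(dev->dev_stat_values + index);   /* 1: publish the data */
    smp_mb__before_atomic();                    /* order 1 before 2    */
    atomic_inc(&dev->dev_stats_ccnt);           /* 2: signal "dirty"   */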
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d797ea..dcf20131fbe4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
237 * first xattr that we find and walk forward 237 * first xattr that we find and walk forward
238 */ 238 */
239 key.objectid = btrfs_ino(inode); 239 key.objectid = btrfs_ino(inode);
240 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 240 key.type = BTRFS_XATTR_ITEM_KEY;
241 key.offset = 0; 241 key.offset = 0;
242 242
243 path = btrfs_alloc_path(); 243 path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
273 /* check to make sure this item is what we want */ 273 /* check to make sure this item is what we want */
274 if (found_key.objectid != key.objectid) 274 if (found_key.objectid != key.objectid)
275 break; 275 break;
276 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) 276 if (found_key.type != BTRFS_XATTR_ITEM_KEY)
277 break; 277 break;
278 278
279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc81277..759fa4e2de8f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
33#include "compression.h" 33#include "compression.h"
34 34
35struct workspace { 35struct workspace {
36 z_stream inf_strm; 36 z_stream strm;
37 z_stream def_strm;
38 char *buf; 37 char *buf;
39 struct list_head list; 38 struct list_head list;
40}; 39};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
43{ 42{
44 struct workspace *workspace = list_entry(ws, struct workspace, list); 43 struct workspace *workspace = list_entry(ws, struct workspace, list);
45 44
46 vfree(workspace->def_strm.workspace); 45 vfree(workspace->strm.workspace);
47 vfree(workspace->inf_strm.workspace);
48 kfree(workspace->buf); 46 kfree(workspace->buf);
49 kfree(workspace); 47 kfree(workspace);
50} 48}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
52static struct list_head *zlib_alloc_workspace(void) 50static struct list_head *zlib_alloc_workspace(void)
53{ 51{
54 struct workspace *workspace; 52 struct workspace *workspace;
53 int workspacesize;
55 54
56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 55 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
57 if (!workspace) 56 if (!workspace)
58 return ERR_PTR(-ENOMEM); 57 return ERR_PTR(-ENOMEM);
59 58
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( 59 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
61 MAX_WBITS, MAX_MEM_LEVEL)); 60 zlib_inflate_workspacesize());
62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->strm.workspace = vmalloc(workspacesize);
63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
64 if (!workspace->def_strm.workspace || 63 if (!workspace->strm.workspace || !workspace->buf)
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail; 64 goto fail;
67 65
68 INIT_LIST_HEAD(&workspace->list); 66 INIT_LIST_HEAD(&workspace->list);
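With compression and decompression never running concurrently on the same workspace, one z_stream suffices; the single backing allocation is sized for the larger of the two uses, saving the smaller allocation (the inflate workspace) per context. The sizing in isolation (sketch):

    int workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                            zlib_inflate_workspacesize());
    workspace->strm.workspace = vmalloc(workspacesize);
    if (!workspace->strm.workspace)
            goto fail;   /* as in the function above */

Everything from here on is the mechanical def_strm/inf_strm -> strm rename through the compress and decompress loops.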
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
96 *total_out = 0; 94 *total_out = 0;
97 *total_in = 0; 95 *total_in = 0;
98 96
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 97 if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
100 printk(KERN_WARNING "BTRFS: deflateInit failed\n"); 98 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -EIO; 99 ret = -EIO;
102 goto out; 100 goto out;
103 } 101 }
104 102
105 workspace->def_strm.total_in = 0; 103 workspace->strm.total_in = 0;
106 workspace->def_strm.total_out = 0; 104 workspace->strm.total_out = 0;
107 105
108 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 106 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
109 data_in = kmap(in_page); 107 data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
117 pages[0] = out_page; 115 pages[0] = out_page;
118 nr_pages = 1; 116 nr_pages = 1;
119 117
120 workspace->def_strm.next_in = data_in; 118 workspace->strm.next_in = data_in;
121 workspace->def_strm.next_out = cpage_out; 119 workspace->strm.next_out = cpage_out;
122 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 120 workspace->strm.avail_out = PAGE_CACHE_SIZE;
123 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 121 workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
124 122
125 while (workspace->def_strm.total_in < len) { 123 while (workspace->strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 124 ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 125 if (ret != Z_OK) {
128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 126 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 127 ret);
130 zlib_deflateEnd(&workspace->def_strm); 128 zlib_deflateEnd(&workspace->strm);
131 ret = -EIO; 129 ret = -EIO;
132 goto out; 130 goto out;
133 } 131 }
134 132
135 /* we're making it bigger, give up */ 133 /* we're making it bigger, give up */
136 if (workspace->def_strm.total_in > 8192 && 134 if (workspace->strm.total_in > 8192 &&
137 workspace->def_strm.total_in < 135 workspace->strm.total_in <
138 workspace->def_strm.total_out) { 136 workspace->strm.total_out) {
139 ret = -E2BIG; 137 ret = -E2BIG;
140 goto out; 138 goto out;
141 } 139 }
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
143 * before the total_in so we will pull in a new page for 141 * before the total_in so we will pull in a new page for
144 * the stream end if required 142 * the stream end if required
145 */ 143 */
146 if (workspace->def_strm.avail_out == 0) { 144 if (workspace->strm.avail_out == 0) {
147 kunmap(out_page); 145 kunmap(out_page);
148 if (nr_pages == nr_dest_pages) { 146 if (nr_pages == nr_dest_pages) {
149 out_page = NULL; 147 out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
158 cpage_out = kmap(out_page); 156 cpage_out = kmap(out_page);
159 pages[nr_pages] = out_page; 157 pages[nr_pages] = out_page;
160 nr_pages++; 158 nr_pages++;
161 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 159 workspace->strm.avail_out = PAGE_CACHE_SIZE;
162 workspace->def_strm.next_out = cpage_out; 160 workspace->strm.next_out = cpage_out;
163 } 161 }
164 /* we're all done */ 162 /* we're all done */
165 if (workspace->def_strm.total_in >= len) 163 if (workspace->strm.total_in >= len)
166 break; 164 break;
167 165
168 /* we've read in a full page, get a new one */ 166 /* we've read in a full page, get a new one */
169 if (workspace->def_strm.avail_in == 0) { 167 if (workspace->strm.avail_in == 0) {
170 if (workspace->def_strm.total_out > max_out) 168 if (workspace->strm.total_out > max_out)
171 break; 169 break;
172 170
173 bytes_left = len - workspace->def_strm.total_in; 171 bytes_left = len - workspace->strm.total_in;
174 kunmap(in_page); 172 kunmap(in_page);
175 page_cache_release(in_page); 173 page_cache_release(in_page);
176 174
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
178 in_page = find_get_page(mapping, 176 in_page = find_get_page(mapping,
179 start >> PAGE_CACHE_SHIFT); 177 start >> PAGE_CACHE_SHIFT);
180 data_in = kmap(in_page); 178 data_in = kmap(in_page);
181 workspace->def_strm.avail_in = min(bytes_left, 179 workspace->strm.avail_in = min(bytes_left,
182 PAGE_CACHE_SIZE); 180 PAGE_CACHE_SIZE);
183 workspace->def_strm.next_in = data_in; 181 workspace->strm.next_in = data_in;
184 } 182 }
185 } 183 }
186 workspace->def_strm.avail_in = 0; 184 workspace->strm.avail_in = 0;
187 ret = zlib_deflate(&workspace->def_strm, Z_FINISH); 185 ret = zlib_deflate(&workspace->strm, Z_FINISH);
188 zlib_deflateEnd(&workspace->def_strm); 186 zlib_deflateEnd(&workspace->strm);
189 187
190 if (ret != Z_STREAM_END) { 188 if (ret != Z_STREAM_END) {
191 ret = -EIO; 189 ret = -EIO;
192 goto out; 190 goto out;
193 } 191 }
194 192
195 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { 193 if (workspace->strm.total_out >= workspace->strm.total_in) {
196 ret = -E2BIG; 194 ret = -E2BIG;
197 goto out; 195 goto out;
198 } 196 }
199 197
200 ret = 0; 198 ret = 0;
201 *total_out = workspace->def_strm.total_out; 199 *total_out = workspace->strm.total_out;
202 *total_in = workspace->def_strm.total_in; 200 *total_in = workspace->strm.total_in;
203out: 201out:
204 *out_pages = nr_pages; 202 *out_pages = nr_pages;
205 if (out_page) 203 if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
225 size_t total_out = 0; 223 size_t total_out = 0;
226 unsigned long page_in_index = 0; 224 unsigned long page_in_index = 0;
227 unsigned long page_out_index = 0; 225 unsigned long page_out_index = 0;
228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 226 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
229 PAGE_CACHE_SIZE;
230 unsigned long buf_start; 227 unsigned long buf_start;
231 unsigned long pg_offset; 228 unsigned long pg_offset;
232 229
233 data_in = kmap(pages_in[page_in_index]); 230 data_in = kmap(pages_in[page_in_index]);
234 workspace->inf_strm.next_in = data_in; 231 workspace->strm.next_in = data_in;
235 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); 232 workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
236 workspace->inf_strm.total_in = 0; 233 workspace->strm.total_in = 0;
237 234
238 workspace->inf_strm.total_out = 0; 235 workspace->strm.total_out = 0;
239 workspace->inf_strm.next_out = workspace->buf; 236 workspace->strm.next_out = workspace->buf;
240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 237 workspace->strm.avail_out = PAGE_CACHE_SIZE;
241 pg_offset = 0; 238 pg_offset = 0;
242 239
243 /* If it's deflate, and it's got no preset dictionary, then 240 /* If it's deflate, and it's got no preset dictionary, then
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
247 !(((data_in[0]<<8) + data_in[1]) % 31)) { 244 !(((data_in[0]<<8) + data_in[1]) % 31)) {
248 245
249 wbits = -((data_in[0] >> 4) + 8); 246 wbits = -((data_in[0] >> 4) + 8);
250 workspace->inf_strm.next_in += 2; 247 workspace->strm.next_in += 2;
251 workspace->inf_strm.avail_in -= 2; 248 workspace->strm.avail_in -= 2;
252 } 249 }
253 250
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 251 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
255 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 252 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -EIO; 253 return -EIO;
257 } 254 }
258 while (workspace->inf_strm.total_in < srclen) { 255 while (workspace->strm.total_in < srclen) {
259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 256 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
260 if (ret != Z_OK && ret != Z_STREAM_END) 257 if (ret != Z_OK && ret != Z_STREAM_END)
261 break; 258 break;
262 259
263 buf_start = total_out; 260 buf_start = total_out;
264 total_out = workspace->inf_strm.total_out; 261 total_out = workspace->strm.total_out;
265 262
266 /* we didn't make progress in this inflate call, we're done */ 263 /* we didn't make progress in this inflate call, we're done */
267 if (buf_start == total_out) 264 if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
276 goto done; 273 goto done;
277 } 274 }
278 275
279 workspace->inf_strm.next_out = workspace->buf; 276 workspace->strm.next_out = workspace->buf;
280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 277 workspace->strm.avail_out = PAGE_CACHE_SIZE;
281 278
282 if (workspace->inf_strm.avail_in == 0) { 279 if (workspace->strm.avail_in == 0) {
283 unsigned long tmp; 280 unsigned long tmp;
284 kunmap(pages_in[page_in_index]); 281 kunmap(pages_in[page_in_index]);
285 page_in_index++; 282 page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
288 break; 285 break;
289 } 286 }
290 data_in = kmap(pages_in[page_in_index]); 287 data_in = kmap(pages_in[page_in_index]);
291 workspace->inf_strm.next_in = data_in; 288 workspace->strm.next_in = data_in;
292 tmp = srclen - workspace->inf_strm.total_in; 289 tmp = srclen - workspace->strm.total_in;
293 workspace->inf_strm.avail_in = min(tmp, 290 workspace->strm.avail_in = min(tmp,
294 PAGE_CACHE_SIZE); 291 PAGE_CACHE_SIZE);
295 } 292 }
296 } 293 }
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
299 else 296 else
300 ret = 0; 297 ret = 0;
301done: 298done:
302 zlib_inflateEnd(&workspace->inf_strm); 299 zlib_inflateEnd(&workspace->strm);
303 if (data_in) 300 if (data_in)
304 kunmap(pages_in[page_in_index]); 301 kunmap(pages_in[page_in_index]);
305 return ret; 302 return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
317 unsigned long total_out = 0; 314 unsigned long total_out = 0;
318 char *kaddr; 315 char *kaddr;
319 316
320 workspace->inf_strm.next_in = data_in; 317 workspace->strm.next_in = data_in;
321 workspace->inf_strm.avail_in = srclen; 318 workspace->strm.avail_in = srclen;
322 workspace->inf_strm.total_in = 0; 319 workspace->strm.total_in = 0;
323 320
324 workspace->inf_strm.next_out = workspace->buf; 321 workspace->strm.next_out = workspace->buf;
325 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 322 workspace->strm.avail_out = PAGE_CACHE_SIZE;
326 workspace->inf_strm.total_out = 0; 323 workspace->strm.total_out = 0;
327 /* If it's deflate, and it's got no preset dictionary, then 324 /* If it's deflate, and it's got no preset dictionary, then
328 we can tell zlib to skip the adler32 check. */ 325 we can tell zlib to skip the adler32 check. */
329 if (srclen > 2 && !(data_in[1] & PRESET_DICT) && 326 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
331 !(((data_in[0]<<8) + data_in[1]) % 31)) { 328 !(((data_in[0]<<8) + data_in[1]) % 31)) {
332 329
333 wbits = -((data_in[0] >> 4) + 8); 330 wbits = -((data_in[0] >> 4) + 8);
334 workspace->inf_strm.next_in += 2; 331 workspace->strm.next_in += 2;
335 workspace->inf_strm.avail_in -= 2; 332 workspace->strm.avail_in -= 2;
336 } 333 }
337 334
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 335 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
339 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 336 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -EIO; 337 return -EIO;
341 } 338 }
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
346 unsigned long bytes; 343 unsigned long bytes;
347 unsigned long pg_offset = 0; 344 unsigned long pg_offset = 0;
348 345
349 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 346 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
350 if (ret != Z_OK && ret != Z_STREAM_END) 347 if (ret != Z_OK && ret != Z_STREAM_END)
351 break; 348 break;
352 349
353 buf_start = total_out; 350 buf_start = total_out;
354 total_out = workspace->inf_strm.total_out; 351 total_out = workspace->strm.total_out;
355 352
356 if (total_out == buf_start) { 353 if (total_out == buf_start) {
357 ret = -EIO; 354 ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
377 pg_offset += bytes; 374 pg_offset += bytes;
378 bytes_left -= bytes; 375 bytes_left -= bytes;
379next: 376next:
380 workspace->inf_strm.next_out = workspace->buf; 377 workspace->strm.next_out = workspace->buf;
381 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 378 workspace->strm.avail_out = PAGE_CACHE_SIZE;
382 } 379 }
383 380
384 if (ret != Z_STREAM_END && bytes_left != 0) 381 if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
386 else 383 else
387 ret = 0; 384 ret = 0;
388 385
389 zlib_inflateEnd(&workspace->inf_strm); 386 zlib_inflateEnd(&workspace->strm);
390 return ret; 387 return ret;
391} 388}
392 389
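The zlib.c rework above folds the separate inf_strm/def_strm pair into a single strm whose workspace is sized as the max() of the deflate and inflate requirements, on the assumption that a given workspace only ever drives one direction at a time. The other idiom worth calling out is the raw-deflate probe repeated in both decompress paths: a valid RFC 1950 zlib header has (CMF<<8 | FLG) divisible by 31, FLG bit 0x20 marks a preset dictionary, and the high nibble of CMF encodes the window size, so passing -((CMF >> 4) + 8) as windowBits makes inflate consume raw deflate data and skip the adler32 check. A minimal userspace sketch of that probe (the helper name and test data are illustrative, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define PRESET_DICT 0x20  /* FLG bit 5: a preset dictionary follows the header */

/* Mirror of the check in btrfs zlib.c: validate the 2-byte zlib header
 * and derive the negative windowBits value that selects raw deflate. */
static int raw_inflate_wbits(const uint8_t *data, size_t len, int *wbits)
{
    if (len > 2 && !(data[1] & PRESET_DICT) &&
        ((data[0] << 8) + data[1]) % 31 == 0) {
        *wbits = -((data[0] >> 4) + 8);
        return 1;              /* caller may step past the 2 header bytes */
    }
    *wbits = 15;               /* MAX_WBITS: treat as a normal zlib stream */
    return 0;
}

int main(void)
{
    /* 0x78 0x9c is the most common zlib header: deflate, 32K window */
    const uint8_t hdr[] = { 0x78, 0x9c, 0x00 };
    int wbits;

    if (raw_inflate_wbits(hdr, sizeof(hdr), &wbits))
        printf("raw deflate, wbits=%d\n", wbits);   /* prints -15 */
    return 0;
}

On the common 0x78 0x9c header this prints wbits=-15, matching what the code above passes to zlib_inflateInit2() after advancing next_in by two bytes.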
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111bbb8b..44c14a87750e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1022 bh = page_buffers(page); 1022 bh = page_buffers(page);
1023 if (bh->b_size == size) { 1023 if (bh->b_size == size) {
1024 end_block = init_page_buffers(page, bdev, 1024 end_block = init_page_buffers(page, bdev,
1025 index << sizebits, size); 1025 (sector_t)index << sizebits,
1026 size);
1026 goto done; 1027 goto done;
1027 } 1028 }
1028 if (!try_to_free_buffers(page)) 1029 if (!try_to_free_buffers(page))
@@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1043 */ 1044 */
1044 spin_lock(&inode->i_mapping->private_lock); 1045 spin_lock(&inode->i_mapping->private_lock);
1045 link_dev_buffers(page, bh); 1046 link_dev_buffers(page, bh);
1046 end_block = init_page_buffers(page, bdev, index << sizebits, size); 1047 end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1048 size);
1047 spin_unlock(&inode->i_mapping->private_lock); 1049 spin_unlock(&inode->i_mapping->private_lock);
1048done: 1050done:
1049 ret = (block < end_block) ? 1 : -ENXIO; 1051 ret = (block < end_block) ? 1 : -ENXIO;
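The two grow_dev_page() hunks fix a 32-bit truncation: index is a pgoff_t (an unsigned long, 32 bits on 32-bit architectures), so index << sizebits wraps before the result is widened to sector_t, mis-addressing buffers near the top of large block devices. Casting to sector_t first makes the shift happen in 64 bits. A small userspace model of the difference, with uint32_t standing in for a 32-bit pgoff_t:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t index = 0x40000000u;   /* a legal page index on 32-bit */
    unsigned sizebits = 3;          /* e.g. 512-byte blocks, 4K pages */

    /* shift happens in 32 bits, wraps to 0, then is widened too late */
    uint64_t wrong = (uint64_t)(index << sizebits);
    /* widen first, then shift: the whole result survives */
    uint64_t right = (uint64_t)index << sizebits;

    printf("wrong=%llu right=%llu\n",
           (unsigned long long)wrong, (unsigned long long)right);
    return 0;
}

The un-cast shift wraps to 0 while the widened one yields sector 8589934592, which is exactly the class of mis-addressing the (sector_t) cast prevents.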
@@ -1251,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
1251 * a local interrupt disable for that. 1253 * a local interrupt disable for that.
1252 */ 1254 */
1253 1255
1254#define BH_LRU_SIZE 8 1256#define BH_LRU_SIZE 16
1255 1257
1256struct bh_lru { 1258struct bh_lru {
1257 struct buffer_head *bhs[BH_LRU_SIZE]; 1259 struct buffer_head *bhs[BH_LRU_SIZE];
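The hunk above doubles BH_LRU_SIZE from 8 to 16. struct bh_lru is a small per-CPU, move-to-front array of recently used buffer heads, so a larger array trades a few words of per-CPU memory for a better lookup hit rate. A toy sketch of the same shape, with ints standing in for buffer_head pointers (illustrative, not the kernel code):

#include <stdio.h>
#include <string.h>

#define LRU_SIZE 16   /* mirrors the new BH_LRU_SIZE */

/* Slot 0 is the most recently used entry; a hit shifts everything in
 * front of it down by one and reinstalls the entry at the front. */
struct lru {
    int slots[LRU_SIZE];
    int n;
};

static int lru_lookup(struct lru *l, int key)
{
    for (int i = 0; i < l->n; i++) {
        if (l->slots[i] == key) {
            memmove(&l->slots[1], &l->slots[0], i * sizeof(int));
            l->slots[0] = key;
            return 1;                       /* hit, now at the front */
        }
    }
    if (l->n < LRU_SIZE)
        l->n++;
    memmove(&l->slots[1], &l->slots[0], (l->n - 1) * sizeof(int));
    l->slots[0] = key;                      /* miss: install at front */
    return 0;
}

int main(void)
{
    struct lru l = { .n = 0 };
    lru_lookup(&l, 42);
    printf("hit=%d\n", lru_lookup(&l, 42)); /* prints hit=1 */
    return 0;
}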
@@ -2954,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2954 2956
2955/* 2957/*
2956 * This allows us to do IO even on the odd last sectors 2958 * This allows us to do IO even on the odd last sectors
2957 * of a device, even if the bh block size is some multiple 2959 * of a device, even if the block size is some multiple
2958 * of the physical sector size. 2960 * of the physical sector size.
2959 * 2961 *
2960 * We'll just truncate the bio to the size of the device, 2962 * We'll just truncate the bio to the size of the device,
@@ -2964,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2964 * errors, this only handles the "we need to be able to 2966 * errors, this only handles the "we need to be able to
2965 * do IO at the final sector" case. 2967 * do IO at the final sector" case.
2966 */ 2968 */
2967static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) 2969void guard_bio_eod(int rw, struct bio *bio)
2968{ 2970{
2969 sector_t maxsector; 2971 sector_t maxsector;
2970 unsigned bytes; 2972 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2973 unsigned truncated_bytes;
2971 2974
2972 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; 2975 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2973 if (!maxsector) 2976 if (!maxsector)
@@ -2982,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2982 return; 2985 return;
2983 2986
2984 maxsector -= bio->bi_iter.bi_sector; 2987 maxsector -= bio->bi_iter.bi_sector;
2985 bytes = bio->bi_iter.bi_size; 2988 if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2986 if (likely((bytes >> 9) <= maxsector))
2987 return; 2989 return;
2988 2990
2989 /* Uhhuh. We've got a bh that straddles the device size! */ 2991 /* Uhhuh. We've got a bio that straddles the device size! */
2990 bytes = maxsector << 9; 2992 truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2991 2993
2992 /* Truncate the bio.. */ 2994 /* Truncate the bio.. */
2993 bio->bi_iter.bi_size = bytes; 2995 bio->bi_iter.bi_size -= truncated_bytes;
2994 bio->bi_io_vec[0].bv_len = bytes; 2996 bvec->bv_len -= truncated_bytes;
2995 2997
2996 /* ..and clear the end of the buffer for reads */ 2998 /* ..and clear the end of the buffer for reads */
2997 if ((rw & RW_MASK) == READ) { 2999 if ((rw & RW_MASK) == READ) {
2998 void *kaddr = kmap_atomic(bh->b_page); 3000 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2999 memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); 3001 truncated_bytes);
3000 kunmap_atomic(kaddr);
3001 flush_dcache_page(bh->b_page);
3002 } 3002 }
3003} 3003}
3004 3004
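guard_bh_eod() becomes guard_bio_eod() and now works on the bio's last segment instead of assuming a single-buffer bio: it computes how many bytes of the bio hang past the end of the device, shrinks bi_size and the final bvec by that amount, and for reads zeroes the clipped tail with zero_user(). The arithmetic, modeled in plain userspace C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Example: a 4K bio starting 2 sectors before the end of the device */
    uint64_t dev_sectors = 1000;     /* device size in 512-byte sectors */
    uint64_t bi_sector   = 998;      /* bio start sector */
    uint32_t bi_size     = 4096;     /* bio length in bytes */

    uint64_t maxsector = dev_sectors - bi_sector;   /* sectors left: 2 */
    if ((bi_size >> 9) > maxsector) {
        uint32_t truncated = bi_size - (uint32_t)(maxsector << 9);
        bi_size -= truncated;        /* bio now covers only what exists */
        printf("truncated=%u new_size=%u\n", truncated, bi_size);
        /* a read would then zero the last 'truncated' bytes of the
         * final segment, as guard_bio_eod() does with zero_user() */
    }
    return 0;
}

With these numbers the bio is trimmed from 4096 to 1024 bytes, so the request never reads or writes past the final sector.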
@@ -3039,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3039 bio->bi_flags |= bio_flags; 3039 bio->bi_flags |= bio_flags;
3040 3040
3041 /* Take care of bh's that straddle the end of the device */ 3041 /* Take care of bh's that straddle the end of the device */
3042 guard_bh_eod(rw, bio, bh); 3042 guard_bio_eod(rw, bio);
3043 3043
3044 if (buffer_meta(bh)) 3044 if (buffer_meta(bh))
3045 rw |= REQ_META; 3045 rw |= REQ_META;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d749731dc0ee..fbb08e97438d 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
50 cache->brun_percent < 100); 50 cache->brun_percent < 100);
51 51
52 if (*args) { 52 if (*args) {
53 pr_err("'bind' command doesn't take an argument"); 53 pr_err("'bind' command doesn't take an argument\n");
54 return -EINVAL; 54 return -EINVAL;
55 } 55 }
56 56
57 if (!cache->rootdirname) { 57 if (!cache->rootdirname) {
58 pr_err("No cache directory specified"); 58 pr_err("No cache directory specified\n");
59 return -EINVAL; 59 return -EINVAL;
60 } 60 }
61 61
62 /* don't permit already bound caches to be re-bound */ 62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) { 63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 pr_err("Cache already bound"); 64 pr_err("Cache already bound\n");
65 return -EBUSY; 65 return -EBUSY;
66 } 66 }
67 67
@@ -248,7 +248,7 @@ error_open_root:
248 kmem_cache_free(cachefiles_object_jar, fsdef); 248 kmem_cache_free(cachefiles_object_jar, fsdef);
249error_root_object: 249error_root_object:
250 cachefiles_end_secure(cache, saved_cred); 250 cachefiles_end_secure(cache, saved_cred);
251 pr_err("Failed to register: %d", ret); 251 pr_err("Failed to register: %d\n", ret);
252 return ret; 252 return ret;
253} 253}
254 254
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index b078d3081d6c..ce1b115dcc28 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
315static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, 315static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
316 char *args) 316 char *args)
317{ 317{
318 pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%"); 318 pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
319 319
320 return -EINVAL; 320 return -EINVAL;
321} 321}
@@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
475 _enter(",%s", args); 475 _enter(",%s", args);
476 476
477 if (!*args) { 477 if (!*args) {
478 pr_err("Empty directory specified"); 478 pr_err("Empty directory specified\n");
479 return -EINVAL; 479 return -EINVAL;
480 } 480 }
481 481
482 if (cache->rootdirname) { 482 if (cache->rootdirname) {
483 pr_err("Second cache directory specified"); 483 pr_err("Second cache directory specified\n");
484 return -EEXIST; 484 return -EEXIST;
485 } 485 }
486 486
@@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
503 _enter(",%s", args); 503 _enter(",%s", args);
504 504
505 if (!*args) { 505 if (!*args) {
506 pr_err("Empty security context specified"); 506 pr_err("Empty security context specified\n");
507 return -EINVAL; 507 return -EINVAL;
508 } 508 }
509 509
510 if (cache->secctx) { 510 if (cache->secctx) {
511 pr_err("Second security context specified"); 511 pr_err("Second security context specified\n");
512 return -EINVAL; 512 return -EINVAL;
513 } 513 }
514 514
@@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
531 _enter(",%s", args); 531 _enter(",%s", args);
532 532
533 if (!*args) { 533 if (!*args) {
534 pr_err("Empty tag specified"); 534 pr_err("Empty tag specified\n");
535 return -EINVAL; 535 return -EINVAL;
536 } 536 }
537 537
@@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
562 goto inval; 562 goto inval;
563 563
564 if (!test_bit(CACHEFILES_READY, &cache->flags)) { 564 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
565 pr_err("cull applied to unready cache"); 565 pr_err("cull applied to unready cache\n");
566 return -EIO; 566 return -EIO;
567 } 567 }
568 568
569 if (test_bit(CACHEFILES_DEAD, &cache->flags)) { 569 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
570 pr_err("cull applied to dead cache"); 570 pr_err("cull applied to dead cache\n");
571 return -EIO; 571 return -EIO;
572 } 572 }
573 573
@@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
587 587
588notdir: 588notdir:
589 path_put(&path); 589 path_put(&path);
590 pr_err("cull command requires dirfd to be a directory"); 590 pr_err("cull command requires dirfd to be a directory\n");
591 return -ENOTDIR; 591 return -ENOTDIR;
592 592
593inval: 593inval:
594 pr_err("cull command requires dirfd and filename"); 594 pr_err("cull command requires dirfd and filename\n");
595 return -EINVAL; 595 return -EINVAL;
596} 596}
597 597
@@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
614 return 0; 614 return 0;
615 615
616inval: 616inval:
617 pr_err("debug command requires mask"); 617 pr_err("debug command requires mask\n");
618 return -EINVAL; 618 return -EINVAL;
619} 619}
620 620
@@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
634 goto inval; 634 goto inval;
635 635
636 if (!test_bit(CACHEFILES_READY, &cache->flags)) { 636 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
637 pr_err("inuse applied to unready cache"); 637 pr_err("inuse applied to unready cache\n");
638 return -EIO; 638 return -EIO;
639 } 639 }
640 640
641 if (test_bit(CACHEFILES_DEAD, &cache->flags)) { 641 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
642 pr_err("inuse applied to dead cache"); 642 pr_err("inuse applied to dead cache\n");
643 return -EIO; 643 return -EIO;
644 } 644 }
645 645
@@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
659 659
660notdir: 660notdir:
661 path_put(&path); 661 path_put(&path);
662 pr_err("inuse command requires dirfd to be a directory"); 662 pr_err("inuse command requires dirfd to be a directory\n");
663 return -ENOTDIR; 663 return -ENOTDIR;
664 664
665inval: 665inval:
666 pr_err("inuse command requires dirfd and filename"); 666 pr_err("inuse command requires dirfd and filename\n");
667 return -EINVAL; 667 return -EINVAL;
668} 668}
669 669
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 3d50998abf57..8c52472d2efa 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
255 255
256#define cachefiles_io_error(___cache, FMT, ...) \ 256#define cachefiles_io_error(___cache, FMT, ...) \
257do { \ 257do { \
258 pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ 258 pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
259 fscache_io_error(&(___cache)->cache); \ 259 fscache_io_error(&(___cache)->cache); \
260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ 260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
261} while (0) 261} while (0)
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 180edfb45f66..711f13d8c2de 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -84,7 +84,7 @@ error_proc:
84error_object_jar: 84error_object_jar:
85 misc_deregister(&cachefiles_dev); 85 misc_deregister(&cachefiles_dev);
86error_dev: 86error_dev:
87 pr_err("failed to register: %d", ret); 87 pr_err("failed to register: %d\n", ret);
88 return ret; 88 return ret;
89} 89}
90 90
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..dad7d9542a24 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -543,7 +543,7 @@ lookup_again:
543 next, next->d_inode, next->d_inode->i_ino); 543 next, next->d_inode, next->d_inode->i_ino);
544 544
545 } else if (!S_ISDIR(next->d_inode->i_mode)) { 545 } else if (!S_ISDIR(next->d_inode->i_mode)) {
546 pr_err("inode %lu is not a directory", 546 pr_err("inode %lu is not a directory\n",
547 next->d_inode->i_ino); 547 next->d_inode->i_ino);
548 ret = -ENOBUFS; 548 ret = -ENOBUFS;
549 goto error; 549 goto error;
@@ -574,7 +574,7 @@ lookup_again:
574 } else if (!S_ISDIR(next->d_inode->i_mode) && 574 } else if (!S_ISDIR(next->d_inode->i_mode) &&
575 !S_ISREG(next->d_inode->i_mode) 575 !S_ISREG(next->d_inode->i_mode)
576 ) { 576 ) {
577 pr_err("inode %lu is not a file or directory", 577 pr_err("inode %lu is not a file or directory\n",
578 next->d_inode->i_ino); 578 next->d_inode->i_ino);
579 ret = -ENOBUFS; 579 ret = -ENOBUFS;
580 goto error; 580 goto error;
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
768 ASSERT(subdir->d_inode); 768 ASSERT(subdir->d_inode);
769 769
770 if (!S_ISDIR(subdir->d_inode->i_mode)) { 770 if (!S_ISDIR(subdir->d_inode->i_mode)) {
771 pr_err("%s is not a directory", dirname); 771 pr_err("%s is not a directory\n", dirname);
772 ret = -EIO; 772 ret = -EIO;
773 goto check_error; 773 goto check_error;
774 } 774 }
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
779 !subdir->d_inode->i_op->lookup || 779 !subdir->d_inode->i_op->lookup ||
780 !subdir->d_inode->i_op->mkdir || 780 !subdir->d_inode->i_op->mkdir ||
781 !subdir->d_inode->i_op->create || 781 !subdir->d_inode->i_op->create ||
782 !subdir->d_inode->i_op->rename || 782 (!subdir->d_inode->i_op->rename &&
783 !subdir->d_inode->i_op->rename2) ||
783 !subdir->d_inode->i_op->rmdir || 784 !subdir->d_inode->i_op->rmdir ||
784 !subdir->d_inode->i_op->unlink) 785 !subdir->d_inode->i_op->unlink)
785 goto check_error; 786 goto check_error;
@@ -795,13 +796,13 @@ check_error:
795mkdir_error: 796mkdir_error:
796 mutex_unlock(&dir->d_inode->i_mutex); 797 mutex_unlock(&dir->d_inode->i_mutex);
797 dput(subdir); 798 dput(subdir);
798 pr_err("mkdir %s failed with error %d", dirname, ret); 799 pr_err("mkdir %s failed with error %d\n", dirname, ret);
799 return ERR_PTR(ret); 800 return ERR_PTR(ret);
800 801
801lookup_error: 802lookup_error:
802 mutex_unlock(&dir->d_inode->i_mutex); 803 mutex_unlock(&dir->d_inode->i_mutex);
803 ret = PTR_ERR(subdir); 804 ret = PTR_ERR(subdir);
804 pr_err("Lookup %s failed with error %d", dirname, ret); 805 pr_err("Lookup %s failed with error %d\n", dirname, ret);
805 return ERR_PTR(ret); 806 return ERR_PTR(ret);
806 807
807nomem_d_alloc: 808nomem_d_alloc:
@@ -891,7 +892,7 @@ lookup_error:
891 if (ret == -EIO) { 892 if (ret == -EIO) {
892 cachefiles_io_error(cache, "Lookup failed"); 893 cachefiles_io_error(cache, "Lookup failed");
893 } else if (ret != -ENOMEM) { 894 } else if (ret != -ENOMEM) {
894 pr_err("Internal error: %d", ret); 895 pr_err("Internal error: %d\n", ret);
895 ret = -EIO; 896 ret = -EIO;
896 } 897 }
897 898
@@ -950,7 +951,7 @@ error:
950 } 951 }
951 952
952 if (ret != -ENOMEM) { 953 if (ret != -ENOMEM) {
953 pr_err("Internal error: %d", ret); 954 pr_err("Internal error: %d\n", ret);
954 ret = -EIO; 955 ret = -EIO;
955 } 956 }
956 957
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..25e745b8eb1b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
151 struct cachefiles_one_read *monitor; 151 struct cachefiles_one_read *monitor;
152 struct cachefiles_object *object; 152 struct cachefiles_object *object;
153 struct fscache_retrieval *op; 153 struct fscache_retrieval *op;
154 struct pagevec pagevec;
155 int error, max; 154 int error, max;
156 155
157 op = container_of(_op, struct fscache_retrieval, op); 156 op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
160 159
161 _enter("{ino=%lu}", object->backer->d_inode->i_ino); 160 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
162 161
163 pagevec_init(&pagevec, 0);
164
165 max = 8; 162 max = 8;
166 spin_lock_irq(&object->work_lock); 163 spin_lock_irq(&object->work_lock);
167 164
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
396{ 393{
397 struct cachefiles_object *object; 394 struct cachefiles_object *object;
398 struct cachefiles_cache *cache; 395 struct cachefiles_cache *cache;
399 struct pagevec pagevec;
400 struct inode *inode; 396 struct inode *inode;
401 sector_t block0, block; 397 sector_t block0, block;
402 unsigned shift; 398 unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
427 op->op.flags |= FSCACHE_OP_ASYNC; 423 op->op.flags |= FSCACHE_OP_ASYNC;
428 op->op.processor = cachefiles_read_copier; 424 op->op.processor = cachefiles_read_copier;
429 425
430 pagevec_init(&pagevec, 0);
431
432 /* we assume the absence or presence of the first block is a good 426 /* we assume the absence or presence of the first block is a good
433 * enough indication for the page as a whole 427 * enough indication for the page as a whole
434 * - TODO: don't use bmap() for this as it is _not_ actually good 428 * - TODO: don't use bmap() for this as it is _not_ actually good
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 1ad51ffbb275..acbc1f094fb1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
51 } 51 }
52 52
53 if (ret != -EEXIST) { 53 if (ret != -EEXIST) {
54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)", 54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
55 dentry->d_name.len, dentry->d_name.len, 55 dentry->d_name.len, dentry->d_name.len,
56 dentry->d_name.name, dentry->d_inode->i_ino, 56 dentry->d_name.name, dentry->d_inode->i_ino,
57 -ret); 57 -ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
64 if (ret == -ERANGE) 64 if (ret == -ERANGE)
65 goto bad_type_length; 65 goto bad_type_length;
66 66
67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)", 67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
68 dentry->d_name.len, dentry->d_name.len, 68 dentry->d_name.len, dentry->d_name.len,
69 dentry->d_name.name, dentry->d_inode->i_ino, 69 dentry->d_name.name, dentry->d_inode->i_ino,
70 -ret); 70 -ret);
@@ -85,14 +85,14 @@ error:
85 return ret; 85 return ret;
86 86
87bad_type_length: 87bad_type_length:
88 pr_err("Cache object %lu type xattr length incorrect", 88 pr_err("Cache object %lu type xattr length incorrect\n",
89 dentry->d_inode->i_ino); 89 dentry->d_inode->i_ino);
90 ret = -EIO; 90 ret = -EIO;
91 goto error; 91 goto error;
92 92
93bad_type: 93bad_type:
94 xtype[2] = 0; 94 xtype[2] = 0;
95 pr_err("Cache object %*.*s [%lu] type %s not %s", 95 pr_err("Cache object %*.*s [%lu] type %s not %s\n",
96 dentry->d_name.len, dentry->d_name.len, 96 dentry->d_name.len, dentry->d_name.len,
97 dentry->d_name.name, dentry->d_inode->i_ino, 97 dentry->d_name.name, dentry->d_inode->i_ino,
98 xtype, type); 98 xtype, type);
@@ -293,7 +293,7 @@ error:
293 return ret; 293 return ret;
294 294
295bad_type_length: 295bad_type_length:
296 pr_err("Cache object %lu xattr length incorrect", 296 pr_err("Cache object %lu xattr length incorrect\n",
297 dentry->d_inode->i_ino); 297 dentry->d_inode->i_ino);
298 ret = -EIO; 298 ret = -EIO;
299 goto error; 299 goto error;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 603f18a65c12..a2172f3f69e3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -22,6 +22,11 @@ config CIFS
22 support for OS/2 and Windows ME and similar servers is provided as 22 support for OS/2 and Windows ME and similar servers is provided as
23 well. 23 well.
24 24
 25 The module also provides optional support for the follow-on
26 protocols for CIFS including SMB3, which enables
27 useful performance and security features (see the description
28 of CONFIG_CIFS_SMB2).
29
25 The cifs module provides an advanced network file system 30 The cifs module provides an advanced network file system
26 client for mounting to CIFS compliant servers. It includes 31 client for mounting to CIFS compliant servers. It includes
27 support for DFS (hierarchical name space), secure per-user 32 support for DFS (hierarchical name space), secure per-user
@@ -121,7 +126,8 @@ config CIFS_ACL
121 depends on CIFS_XATTR && KEYS 126 depends on CIFS_XATTR && KEYS
122 help 127 help
123 Allows fetching CIFS/NTFS ACL from the server. The DACL blob 128 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
124 is handed over to the application/caller. 129 is handed over to the application/caller. See the man
130 page for getcifsacl for more information.
125 131
126config CIFS_DEBUG 132config CIFS_DEBUG
127 bool "Enable CIFS debugging routines" 133 bool "Enable CIFS debugging routines"
@@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT
162 Allows NFS server to export a CIFS mounted share (nfsd over cifs) 168 Allows NFS server to export a CIFS mounted share (nfsd over cifs)
163 169
164config CIFS_SMB2 170config CIFS_SMB2
165 bool "SMB2 network file system support" 171 bool "SMB2 and SMB3 network file system support"
166 depends on CIFS && INET 172 depends on CIFS && INET
167 select NLS 173 select NLS
168 select KEYS 174 select KEYS
@@ -170,16 +176,21 @@ config CIFS_SMB2
170 select DNS_RESOLVER 176 select DNS_RESOLVER
171 177
172 help 178 help
173 This enables experimental support for the SMB2 (Server Message Block 179 This enables support for the Server Message Block version 2
174 version 2) protocol. The SMB2 protocol is the successor to the 180 family of protocols, including SMB3. SMB3 support is
175 popular CIFS and SMB network file sharing protocols. SMB2 is the 181 enabled on mount by specifying "vers=3.0" in the mount
176 native file sharing mechanism for recent versions of Windows 182 options. These protocols are the successors to the popular
177 operating systems (since Vista). SMB2 enablement will eventually 183 CIFS and SMB network file sharing protocols. SMB3 is the
178 allow users better performance, security and features, than would be 184 native file sharing mechanism for the more recent
179 possible with cifs. Note that smb2 mount options also are simpler 185 versions of Windows (Windows 8 and Windows 2012 and
 180 (compared to cifs) due to protocol improvements. 186 later); Samba and many other servers support SMB3 well.
181 187 In general SMB3 enables better performance, security
 182 Unless you are a developer or tester, say N. 188 and features than would be possible with CIFS (note that
189 when mounting to Samba, due to the CIFS POSIX extensions,
190 CIFS mounts can provide slightly better POSIX compatibility
191 than SMB3 mounts do though). Note that SMB2/SMB3 mount
192 options are also slightly simpler (compared to CIFS) due
193 to protocol improvements.
183 194
184config CIFS_FSCACHE 195config CIFS_FSCACHE
185 bool "Provide CIFS client caching support" 196 bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 85c70d5969ac..9d7996e8e793 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
207 return 0; 207 return 0;
208} 208}
209 209
210static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
211{
212 struct super_block *sb = file->f_path.dentry->d_sb;
213 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
214 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
215 struct TCP_Server_Info *server = tcon->ses->server;
216
217 if (server->ops->fallocate)
218 return server->ops->fallocate(file, tcon, mode, off, len);
219
220 return -EOPNOTSUPP;
221}
222
210static int cifs_permission(struct inode *inode, int mask) 223static int cifs_permission(struct inode *inode, int mask)
211{ 224{
212 struct cifs_sb_info *cifs_sb; 225 struct cifs_sb_info *cifs_sb;
@@ -813,8 +826,9 @@ cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv
813 if (!(S_ISREG(inode->i_mode))) 826 if (!(S_ISREG(inode->i_mode)))
814 return -EINVAL; 827 return -EINVAL;
815 828
816 /* check if file is oplocked */ 829 /* Check if file is oplocked if this is request for new lease */
817 if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || 830 if (arg == F_UNLCK ||
831 ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
818 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) 832 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
819 return generic_setlease(file, arg, lease, priv); 833 return generic_setlease(file, arg, lease, priv);
820 else if (tlink_tcon(cfile->tlink)->local_lease && 834 else if (tlink_tcon(cfile->tlink)->local_lease &&
@@ -909,6 +923,7 @@ const struct file_operations cifs_file_ops = {
909 .unlocked_ioctl = cifs_ioctl, 923 .unlocked_ioctl = cifs_ioctl,
910#endif /* CONFIG_CIFS_POSIX */ 924#endif /* CONFIG_CIFS_POSIX */
911 .setlease = cifs_setlease, 925 .setlease = cifs_setlease,
926 .fallocate = cifs_fallocate,
912}; 927};
913 928
914const struct file_operations cifs_file_strict_ops = { 929const struct file_operations cifs_file_strict_ops = {
@@ -928,6 +943,7 @@ const struct file_operations cifs_file_strict_ops = {
928 .unlocked_ioctl = cifs_ioctl, 943 .unlocked_ioctl = cifs_ioctl,
929#endif /* CONFIG_CIFS_POSIX */ 944#endif /* CONFIG_CIFS_POSIX */
930 .setlease = cifs_setlease, 945 .setlease = cifs_setlease,
946 .fallocate = cifs_fallocate,
931}; 947};
932 948
933const struct file_operations cifs_file_direct_ops = { 949const struct file_operations cifs_file_direct_ops = {
@@ -948,6 +964,7 @@ const struct file_operations cifs_file_direct_ops = {
948#endif /* CONFIG_CIFS_POSIX */ 964#endif /* CONFIG_CIFS_POSIX */
949 .llseek = cifs_llseek, 965 .llseek = cifs_llseek,
950 .setlease = cifs_setlease, 966 .setlease = cifs_setlease,
967 .fallocate = cifs_fallocate,
951}; 968};
952 969
953const struct file_operations cifs_file_nobrl_ops = { 970const struct file_operations cifs_file_nobrl_ops = {
@@ -966,6 +983,7 @@ const struct file_operations cifs_file_nobrl_ops = {
966 .unlocked_ioctl = cifs_ioctl, 983 .unlocked_ioctl = cifs_ioctl,
967#endif /* CONFIG_CIFS_POSIX */ 984#endif /* CONFIG_CIFS_POSIX */
968 .setlease = cifs_setlease, 985 .setlease = cifs_setlease,
986 .fallocate = cifs_fallocate,
969}; 987};
970 988
971const struct file_operations cifs_file_strict_nobrl_ops = { 989const struct file_operations cifs_file_strict_nobrl_ops = {
@@ -984,6 +1002,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
984 .unlocked_ioctl = cifs_ioctl, 1002 .unlocked_ioctl = cifs_ioctl,
985#endif /* CONFIG_CIFS_POSIX */ 1003#endif /* CONFIG_CIFS_POSIX */
986 .setlease = cifs_setlease, 1004 .setlease = cifs_setlease,
1005 .fallocate = cifs_fallocate,
987}; 1006};
988 1007
989const struct file_operations cifs_file_direct_nobrl_ops = { 1008const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -1003,6 +1022,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
1003#endif /* CONFIG_CIFS_POSIX */ 1022#endif /* CONFIG_CIFS_POSIX */
1004 .llseek = cifs_llseek, 1023 .llseek = cifs_llseek,
1005 .setlease = cifs_setlease, 1024 .setlease = cifs_setlease,
1025 .fallocate = cifs_fallocate,
1006}; 1026};
1007 1027
1008const struct file_operations cifs_dir_ops = { 1028const struct file_operations cifs_dir_ops = {
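The new cifs_fallocate() above is a thin dispatcher: it resolves the mount's master tcon and calls the negotiated dialect's ->fallocate method, returning -EOPNOTSUPP when the ops table has none; in this series the SMB3 ops table is the one expected to provide an implementation. From userspace it is reached through the ordinary syscall. A hedged usage sketch (the mount path is hypothetical; the mount must negotiate a dialect whose ops table implements fallocate, e.g. vers=3.0):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Hypothetical file on an SMB3 mount; on older dialects the new
     * cifs_fallocate() hook fails with EOPNOTSUPP. */
    int fd = open("/mnt/smb3/testfile", O_WRONLY | O_CREAT, 0644);
    if (fd < 0)
        return 1;

    /* Preallocate 1 MiB without changing the visible file size */
    if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
        fprintf(stderr, "fallocate: %s\n", strerror(errno));

    close(fd);
    return 0;
}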
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b0fafa499505..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
136extern const struct export_operations cifs_export_ops; 136extern const struct export_operations cifs_export_ops;
137#endif /* CONFIG_CIFS_NFSD_EXPORT */ 137#endif /* CONFIG_CIFS_NFSD_EXPORT */
138 138
139#define CIFS_VERSION "2.04" 139#define CIFS_VERSION "2.05"
140#endif /* _CIFSFS_H */ 140#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0012e1e291d4..25b8392bfdd2 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,11 +70,6 @@
70#define SERVER_NAME_LENGTH 40 70#define SERVER_NAME_LENGTH 40
71#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) 71#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
72 72
73/* used to define string lengths for reversing unicode strings */
74/* (256+1)*2 = 514 */
75/* (max path length + 1 for null) * 2 for unicode */
76#define MAX_NAME 514
77
78/* SMB echo "timeout" -- FIXME: tunable? */ 73/* SMB echo "timeout" -- FIXME: tunable? */
79#define SMB_ECHO_INTERVAL (60 * HZ) 74#define SMB_ECHO_INTERVAL (60 * HZ)
80 75
@@ -409,6 +404,10 @@ struct smb_version_operations {
409 /* get mtu credits */ 404 /* get mtu credits */
410 int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, 405 int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
411 unsigned int *, unsigned int *); 406 unsigned int *, unsigned int *);
407 /* check if we need to issue closedir */
408 bool (*dir_needs_close)(struct cifsFileInfo *);
409 long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
410 loff_t);
412}; 411};
413 412
414struct smb_version_values { 413struct smb_version_values {
@@ -883,6 +882,7 @@ struct cifs_tcon {
883 for this mount even if server would support */ 882 for this mount even if server would support */
884 bool local_lease:1; /* check leases (only) on local system not remote */ 883 bool local_lease:1; /* check leases (only) on local system not remote */
885 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ 884 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
885 bool broken_sparse_sup; /* if server or share does not support sparse */
886 bool need_reconnect:1; /* connection reset, tid now invalid */ 886 bool need_reconnect:1; /* connection reset, tid now invalid */
887#ifdef CONFIG_CIFS_SMB2 887#ifdef CONFIG_CIFS_SMB2
888 bool print:1; /* set if connection to printer share */ 888 bool print:1; /* set if connection to printer share */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 33df36ef9d52..5f9822ac0245 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2253,6 +2253,29 @@ typedef struct {
2253/* minimum includes first three fields, and empty FS Name */ 2253/* minimum includes first three fields, and empty FS Name */
2254#define MIN_FS_ATTR_INFO_SIZE 12 2254#define MIN_FS_ATTR_INFO_SIZE 12
2255 2255
2256
2257/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
2258#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000
2259#define FILE_SUPPORTS_USN_JOURNAL 0x02000000
2260#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000
2261#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000
2262#define FILE_SUPPORTS_HARD_LINKS 0x00400000
2263#define FILE_SUPPORTS_TRANSACTIONS 0x00200000
2264#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000
2265#define FILE_READ_ONLY_VOLUME 0x00080000
2266#define FILE_NAMED_STREAMS 0x00040000
2267#define FILE_SUPPORTS_ENCRYPTION 0x00020000
2268#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
2269#define FILE_VOLUME_IS_COMPRESSED 0x00008000
2270#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
2271#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
2272#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
2273#define FILE_VOLUME_QUOTAS 0x00000020
2274#define FILE_FILE_COMPRESSION 0x00000010
2275#define FILE_PERSISTENT_ACLS 0x00000008
2276#define FILE_UNICODE_ON_DISK 0x00000004
2277#define FILE_CASE_PRESERVED_NAMES 0x00000002
2278#define FILE_CASE_SENSITIVE_SEARCH 0x00000001
2256typedef struct { 2279typedef struct {
2257 __le32 Attributes; 2280 __le32 Attributes;
2258 __le32 MaxPathNameComponentLength; 2281 __le32 MaxPathNameComponentLength;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 03ed8a09581c..36ca2045009b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1600 tmp_end++; 1600 tmp_end++;
1601 if (!(tmp_end < end && tmp_end[1] == delim)) { 1601 if (!(tmp_end < end && tmp_end[1] == delim)) {
1602 /* No it is not. Set the password to NULL */ 1602 /* No it is not. Set the password to NULL */
1603 kfree(vol->password);
1603 vol->password = NULL; 1604 vol->password = NULL;
1604 break; 1605 break;
1605 } 1606 }
@@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1637 options = end; 1638 options = end;
1638 } 1639 }
1639 1640
1641 kfree(vol->password);
1640 /* Now build new password string */ 1642 /* Now build new password string */
1641 temp_len = strlen(value); 1643 temp_len = strlen(value);
1642 vol->password = kzalloc(temp_len+1, GFP_KERNEL); 1644 vol->password = kzalloc(temp_len+1, GFP_KERNEL);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3db0c5fd9a11..6cbd9c688cfe 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
497 goto out; 497 goto out;
498 } 498 }
499 499
500 if (file->f_flags & O_DIRECT &&
501 CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
502 if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
503 file->f_op = &cifs_file_direct_nobrl_ops;
504 else
505 file->f_op = &cifs_file_direct_ops;
506 }
507
500 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); 508 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
501 if (file_info == NULL) { 509 if (file_info == NULL) {
502 if (server->ops->close) 510 if (server->ops->close)
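This hunk and the matching one in fs/cifs/file.c below swap the file's f_op at open time: when the file was opened O_DIRECT on a mount with strict cache semantics, the direct (uncached) operations table is installed, with the nobrl variant chosen when byte-range locking is disabled. The shape of that open-time dispatch, reduced to a toy with illustrative names:

#include <stdio.h>

struct file_ops { const char *name; };

static const struct file_ops direct_ops       = { "direct" };
static const struct file_ops direct_nobrl_ops = { "direct_nobrl" };
static const struct file_ops cached_ops       = { "cached" };

#define O_DIRECT_FLAG 1   /* stand-ins for O_DIRECT and the mount flags */
#define MNT_STRICT_IO 2
#define MNT_NO_BRL    4

/* Pick the ops table once, at open time, from file and mount flags */
static const struct file_ops *pick_ops(int fflags, int mntflags)
{
    if ((fflags & O_DIRECT_FLAG) && (mntflags & MNT_STRICT_IO))
        return (mntflags & MNT_NO_BRL) ? &direct_nobrl_ops : &direct_ops;
    return &cached_ops;
}

int main(void)
{
    printf("%s\n", pick_ops(O_DIRECT_FLAG, MNT_STRICT_IO | MNT_NO_BRL)->name);
    return 0;   /* prints direct_nobrl */
}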
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4ab2f79ffa7a..5f29354b072a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file)
467 cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", 467 cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
468 inode, file->f_flags, full_path); 468 inode, file->f_flags, full_path);
469 469
470 if (file->f_flags & O_DIRECT &&
471 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
472 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
473 file->f_op = &cifs_file_direct_nobrl_ops;
474 else
475 file->f_op = &cifs_file_direct_ops;
476 }
477
470 if (server->oplocks) 478 if (server->oplocks)
471 oplock = REQ_OPLOCK; 479 oplock = REQ_OPLOCK;
472 else 480 else
@@ -762,7 +770,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
762 770
763 cifs_dbg(FYI, "Freeing private data in close dir\n"); 771 cifs_dbg(FYI, "Freeing private data in close dir\n");
764 spin_lock(&cifs_file_list_lock); 772 spin_lock(&cifs_file_list_lock);
765 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { 773 if (server->ops->dir_needs_close(cfile)) {
766 cfile->invalidHandle = true; 774 cfile->invalidHandle = true;
767 spin_unlock(&cifs_file_list_lock); 775 spin_unlock(&cifs_file_list_lock);
768 if (server->ops->close_dir) 776 if (server->ops->close_dir)
@@ -3560,15 +3568,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3560 lru_cache_add_file(page); 3568 lru_cache_add_file(page);
3561 unlock_page(page); 3569 unlock_page(page);
3562 page_cache_release(page); 3570 page_cache_release(page);
3563 if (rc == -EAGAIN)
3564 list_add_tail(&page->lru, &tmplist);
3565 } 3571 }
3572 /* Fallback to the readpage in error/reconnect cases */
3566 kref_put(&rdata->refcount, cifs_readdata_release); 3573 kref_put(&rdata->refcount, cifs_readdata_release);
3567 if (rc == -EAGAIN) {
3568 /* Re-add pages to the page_list and retry */
3569 list_splice(&tmplist, page_list);
3570 continue;
3571 }
3572 break; 3574 break;
3573 } 3575 }
3574 3576
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 426d6c6ad8bf..7899a40465b3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1720,13 +1720,22 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
1720unlink_target: 1720unlink_target:
1721 /* Try unlinking the target dentry if it's not negative */ 1721 /* Try unlinking the target dentry if it's not negative */
1722 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { 1722 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1723 tmprc = cifs_unlink(target_dir, target_dentry); 1723 if (d_is_dir(target_dentry))
1724 tmprc = cifs_rmdir(target_dir, target_dentry);
1725 else
1726 tmprc = cifs_unlink(target_dir, target_dentry);
1724 if (tmprc) 1727 if (tmprc)
1725 goto cifs_rename_exit; 1728 goto cifs_rename_exit;
1726 rc = cifs_do_rename(xid, source_dentry, from_name, 1729 rc = cifs_do_rename(xid, source_dentry, from_name,
1727 target_dentry, to_name); 1730 target_dentry, to_name);
1728 } 1731 }
1729 1732
1733 /* force revalidate to go get info when needed */
1734 CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
1735
1736 source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
1737 target_dir->i_mtime = current_fs_time(source_dir->i_sb);
1738
1730cifs_rename_exit: 1739cifs_rename_exit:
1731 kfree(info_buf_source); 1740 kfree(info_buf_source);
1732 kfree(from_name); 1741 kfree(from_name);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..5657416d3483 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
213 if (rc) 213 if (rc)
214 goto out; 214 goto out;
215 215
216 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, 216 if (tcon->ses->server->ops->create_mf_symlink)
217 fromName, buf, &bytes_written); 217 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
218 cifs_sb, fromName, buf, &bytes_written);
219 else
220 rc = -EOPNOTSUPP;
221
218 if (rc) 222 if (rc)
219 goto out; 223 goto out;
220 224
@@ -339,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
339 if (rc) 343 if (rc)
340 return rc; 344 return rc;
341 345
342 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) 346 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
347 rc = -ENOENT;
343 /* it's not a symlink */ 348 /* it's not a symlink */
344 goto out; 349 goto out;
350 }
345 351
346 io_parms.netfid = fid.netfid; 352 io_parms.netfid = fid.netfid;
347 io_parms.pid = current->tgid; 353 io_parms.pid = current->tgid;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 81340c6253eb..b7415d596dbd 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -574,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
574 cinode->oplock = 0; 574 cinode->oplock = 0;
575} 575}
576 576
577static int
578cifs_oplock_break_wait(void *unused)
579{
580 schedule();
581 return signal_pending(current) ? -ERESTARTSYS : 0;
582}
583
584/* 577/*
585 * We wait for oplock breaks to be processed before we attempt to perform 578 * We wait for oplock breaks to be processed before we attempt to perform
586 * writes. 579 * writes.
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
925 /* BB what about the timezone? BB */ 925 /* BB what about the timezone? BB */
926 926
927 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 927 /* Subtract the NTFS time offset, then convert to 1s intervals. */
928 u64 t; 928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
929
930 /*
931 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
932 * the alternative, do_div, does not work with negative numbers so have
933 * to special case them
934 */
935 if (t < 0) {
936 t = -t;
937 ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
938 ts.tv_nsec = -ts.tv_nsec;
939 ts.tv_sec = -t;
940 } else {
941 ts.tv_nsec = (long)do_div(t, 10000000) * 100;
942 ts.tv_sec = t;
943 }
929 944
930 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
931 ts.tv_nsec = do_div(t, 10000000) * 100;
932 ts.tv_sec = t;
933 return ts; 945 return ts;
934} 946}
935 947
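cifs_NTtimeToUnix() converts NT time, 100 ns ticks since 1601-01-01, to a Unix timespec by subtracting the epoch offset and dividing by 10^7. The rewrite makes t signed and special-cases t < 0 because do_div(), the only 64-by-32 division helper usable on 32-bit kernels, takes an unsigned dividend, so pre-1970 timestamps previously came out garbled. The same conversion in portable userspace C, where signed 64-bit division is available directly:

#include <stdio.h>
#include <stdint.h>

/* Seconds between 1601-01-01 and 1970-01-01, scaled to 100 ns ticks */
#define NTFS_TIME_OFFSET ((int64_t)11644473600 * 10000000)

struct ts { int64_t tv_sec; long tv_nsec; };

static struct ts nt_to_unix(int64_t ntutc)
{
    int64_t t = ntutc - NTFS_TIME_OFFSET;
    struct ts ts;

    if (t < 0) {                 /* timestamp before the Unix epoch */
        t = -t;
        ts.tv_nsec = -(long)((t % 10000000) * 100);
        ts.tv_sec  = -(t / 10000000);
    } else {
        ts.tv_nsec = (long)((t % 10000000) * 100);
        ts.tv_sec  = t / 10000000;
    }
    return ts;
}

int main(void)
{
    struct ts t = nt_to_unix(0);  /* NT epoch, well before 1970 */
    printf("%lld sec\n", (long long)t.tv_sec);  /* prints -11644473600 sec */
    return 0;
}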
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b15862e0f68c..b334a89d6a66 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -593,11 +593,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
593 /* close and restart search */ 593 /* close and restart search */
594 cifs_dbg(FYI, "search backing up - close and restart search\n"); 594 cifs_dbg(FYI, "search backing up - close and restart search\n");
595 spin_lock(&cifs_file_list_lock); 595 spin_lock(&cifs_file_list_lock);
596 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { 596 if (server->ops->dir_needs_close(cfile)) {
597 cfile->invalidHandle = true; 597 cfile->invalidHandle = true;
598 spin_unlock(&cifs_file_list_lock); 598 spin_unlock(&cifs_file_list_lock);
599 if (server->ops->close) 599 if (server->ops->close_dir)
600 server->ops->close(xid, tcon, &cfile->fid); 600 server->ops->close_dir(xid, tcon, &cfile->fid);
601 } else 601 } else
602 spin_unlock(&cifs_file_list_lock); 602 spin_unlock(&cifs_file_list_lock);
603 if (cfile->srch_inf.ntwrk_buf_start) { 603 if (cfile->srch_inf.ntwrk_buf_start) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 39ee32688eac..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
243 kfree(ses->serverOS); 243 kfree(ses->serverOS);
244 244
245 ses->serverOS = kzalloc(len + 1, GFP_KERNEL); 245 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
246 if (ses->serverOS) 246 if (ses->serverOS) {
247 strncpy(ses->serverOS, bcc_ptr, len); 247 strncpy(ses->serverOS, bcc_ptr, len);
248 if (strncmp(ses->serverOS, "OS/2", 4) == 0) 248 if (strncmp(ses->serverOS, "OS/2", 4) == 0)
249 cifs_dbg(FYI, "OS/2 server\n"); 249 cifs_dbg(FYI, "OS/2 server\n");
250 }
250 251
251 bcc_ptr += len + 1; 252 bcc_ptr += len + 1;
252 bleft -= len + 1; 253 bleft -= len + 1;
@@ -744,14 +745,6 @@ out:
744 sess_free_buffer(sess_data); 745 sess_free_buffer(sess_data);
745} 746}
746 747
747#else
748
749static void
750sess_auth_lanman(struct sess_data *sess_data)
751{
752 sess_data->result = -EOPNOTSUPP;
753 sess_data->func = NULL;
754}
755#endif 748#endif
756 749
757static void 750static void
@@ -1102,15 +1095,6 @@ out:
1102 ses->auth_key.response = NULL; 1095 ses->auth_key.response = NULL;
1103} 1096}
1104 1097
1105#else
1106
1107static void
1108sess_auth_kerberos(struct sess_data *sess_data)
1109{
1110 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
1111 sess_data->result = -ENOSYS;
1112 sess_data->func = NULL;
1113}
1114#endif /* ! CONFIG_CIFS_UPCALL */ 1098#endif /* ! CONFIG_CIFS_UPCALL */
1115 1099
1116/* 1100/*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 5e8c22d6c7b9..52131d8cb4d5 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
586 tmprc = CIFS_open(xid, &oparms, &oplock, NULL); 586 tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
587 if (tmprc == -EOPNOTSUPP) 587 if (tmprc == -EOPNOTSUPP)
588 *symlink = true; 588 *symlink = true;
589 else 589 else if (tmprc == 0)
590 CIFSSMBClose(xid, tcon, fid.netfid); 590 CIFSSMBClose(xid, tcon, fid.netfid);
591 } 591 }
592 592
@@ -1015,6 +1015,12 @@ cifs_wp_retry_size(struct inode *inode)
1015 return CIFS_SB(inode->i_sb)->wsize; 1015 return CIFS_SB(inode->i_sb)->wsize;
1016} 1016}
1017 1017
1018static bool
1019cifs_dir_needs_close(struct cifsFileInfo *cfile)
1020{
1021 return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
1022}
1023
1018struct smb_version_operations smb1_operations = { 1024struct smb_version_operations smb1_operations = {
1019 .send_cancel = send_nt_cancel, 1025 .send_cancel = send_nt_cancel,
1020 .compare_fids = cifs_compare_fids, 1026 .compare_fids = cifs_compare_fids,
@@ -1086,6 +1092,7 @@ struct smb_version_operations smb1_operations = {
1086 .create_mf_symlink = cifs_create_mf_symlink, 1092 .create_mf_symlink = cifs_create_mf_symlink,
1087 .is_read_op = cifs_is_read_op, 1093 .is_read_op = cifs_is_read_op,
1088 .wp_retry_size = cifs_wp_retry_size, 1094 .wp_retry_size = cifs_wp_retry_size,
1095 .dir_needs_close = cifs_dir_needs_close,
1089#ifdef CONFIG_CIFS_XATTR 1096#ifdef CONFIG_CIFS_XATTR
1090 .query_all_EAs = CIFSSMBQAllEAs, 1097 .query_all_EAs = CIFSSMBQAllEAs,
1091 .set_EA = CIFSSMBSetEA, 1098 .set_EA = CIFSSMBSetEA,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 3f17b4550831..45992944e238 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
50 goto out; 50 goto out;
51 } 51 }
52 52
53 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 53 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
54 GFP_KERNEL); 54 GFP_KERNEL);
55 if (smb2_data == NULL) { 55 if (smb2_data == NULL) {
56 rc = -ENOMEM; 56 rc = -ENOMEM;
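This MAX_NAME -> PATH_MAX bump (repeated below for smb2inode.c, smb2ops.c and smb2pdu.c) sizes the buffer for smb2_file_all_info, whose fixed header is followed by a variable-length UTF-16 name, hence the `* 2`. The sizing pattern in isolation, with a hypothetical struct standing in for the real wire format:

#include <limits.h>	/* PATH_MAX */
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical fixed header followed by a variable-length UTF-16 name. */
struct file_all_info {
	unsigned int name_len;	/* bytes of name[] actually used */
	unsigned short name[];	/* flexible array member, 2 bytes per unit */
};

int main(void)
{
	/* Worst case: PATH_MAX characters, two bytes each on the wire. */
	size_t sz = sizeof(struct file_all_info) + PATH_MAX * 2;
	struct file_all_info *info = calloc(1, sz);

	if (!info) {
		perror("calloc");
		return 1;
	}
	printf("allocated %zu bytes (%d header + %d name)\n",
	       sz, (int)sizeof(struct file_all_info), PATH_MAX * 2);
	free(info);
	return 0;
}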
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 0150182a4494..899bbc86f73e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
131 *adjust_tz = false; 131 *adjust_tz = false;
132 *symlink = false; 132 *symlink = false;
133 133
134 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 134 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
135 GFP_KERNEL); 135 GFP_KERNEL);
136 if (smb2_data == NULL) 136 if (smb2_data == NULL)
137 return -ENOMEM; 137 return -ENOMEM;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index e31a9dfdcd39..8257a5a97cc0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
214 {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, 214 {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
215 {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, 215 {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
216 {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, 216 {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
217 {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, 217 {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
218 {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, 218 {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
219 {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, 219 {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
220 {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, 220 {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
256 {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, 256 {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
257 "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, 257 "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
258 {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, 258 {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
259 {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
260 "STATUS_REPARSE_NOT_HANDLED"},
259 {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, 261 {STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
260 "STATUS_DEVICE_REQUIRES_CLEANING"}, 262 "STATUS_DEVICE_REQUIRES_CLEANING"},
261 {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, 263 {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
@@ -298,7 +300,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
298 {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, 300 {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"},
299 {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, 301 {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"},
300 {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, 302 {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"},
301 {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"}, 303 {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"},
302 {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, 304 {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"},
303 {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, 305 {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"},
304 {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, 306 {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index f2e6ac29a8d6..4aa7a0f07d6e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length)
178 /* Windows 7 server returns 24 bytes more */ 178 /* Windows 7 server returns 24 bytes more */
179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) 179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
180 return 0; 180 return 0;
181 /* server can return one byte more */ 181 /* server can return one byte more due to implied bcc[0] */
182 if (clc_len == 4 + len + 1) 182 if (clc_len == 4 + len + 1)
183 return 0; 183 return 0;
184
185 /*
186 * MacOS server pads after SMB2.1 write response with 3 bytes
187 * of junk. Other servers match RFC1001 len to actual
188 * SMB2/SMB3 frame length (header + SMB2 response-specific data).
189 * Log the server error (once), but allow it and continue
190 * since the frame is parseable.
191 */
192 if (clc_len < 4 /* RFC1001 header size */ + len) {
193 printk_once(KERN_WARNING
194 "SMB2 server sent bad RFC1001 len %d not %d\n",
195 len, clc_len - 4);
196 return 0;
197 }
198
184 return 1; 199 return 1;
185 } 200 }
186 return 0; 201 return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 77f8aeb9c2fc..f522193b7184 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
389 int rc; 389 int rc;
390 struct smb2_file_all_info *smb2_data; 390 struct smb2_file_all_info *smb2_data;
391 391
392 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 392 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
393 GFP_KERNEL); 393 GFP_KERNEL);
394 if (smb2_data == NULL) 394 if (smb2_data == NULL)
395 return -ENOMEM; 395 return -ENOMEM;
@@ -731,11 +731,72 @@ smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
731 return SMB2_write(xid, parms, written, iov, nr_segs); 731 return SMB2_write(xid, parms, written, iov, nr_segs);
732} 732}
733 733
734/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */
735static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
736 struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse)
737{
738 struct cifsInodeInfo *cifsi;
739 int rc;
740
741 cifsi = CIFS_I(inode);
742
743 /* if file already sparse don't bother setting sparse again */
744 if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse)
745 return true; /* already sparse */
746
747 if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse)
748 return true; /* already not sparse */
749
750 /*
751 * Can't check for sparse support on the share the usual way, via the
752 * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share, since
753 * the Samba server doesn't set the flag on the share yet still
754 * supports the set sparse FSCTL and returns sparse correctly in the
755 * file attributes. If setting sparse fails, though, we mark that the
756 * server does not support sparse files for this share, to avoid
757 * repeatedly sending the unsupported fsctl to the server
758 * if the file is repeatedly extended.
759 */
760 if (tcon->broken_sparse_sup)
761 return false;
762
763 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
764 cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
765 true /* is_fctl */, &setsparse, 1, NULL, NULL);
766 if (rc) {
767 tcon->broken_sparse_sup = true;
768 cifs_dbg(FYI, "set sparse rc = %d\n", rc);
769 return false;
770 }
771
772 if (setsparse)
773 cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE;
774 else
775 cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE);
776
777 return true;
778}
779
734static int 780static int
735smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, 781smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
736 struct cifsFileInfo *cfile, __u64 size, bool set_alloc) 782 struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
737{ 783{
738 __le64 eof = cpu_to_le64(size); 784 __le64 eof = cpu_to_le64(size);
785 struct inode *inode;
786
787 /*
788 * If extending the file by more than 8K (a couple of pages), make it
789 * sparse. Many Linux filesystems make files sparse by default when extending via ftruncate.
790 */
791 inode = cfile->dentry->d_inode;
792
793 if (!set_alloc && (size > inode->i_size + 8192)) {
794 __u8 set_sparse = 1;
795
796 /* whether set sparse succeeds or not, extend the file */
797 smb2_set_sparse(xid, tcon, cfile, inode, set_sparse);
798 }
799
739 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, 800 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
740 cfile->fid.volatile_fid, cfile->pid, &eof, false); 801 cfile->fid.volatile_fid, cfile->pid, &eof, false);
741} 802}
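The new smb2_set_sparse() helper and the 8K heuristic in smb2_set_file_size() lean on the observation that local Linux filesystems leave ftruncate-extended ranges unallocated. That behavior is easy to confirm from userspace on any local filesystem; st_blocks stays near zero after a large extension:

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("sparse-demo.tmp", O_CREAT | O_RDWR | O_TRUNC, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Extend far beyond one page; most Linux fs allocate nothing. */
	if (ftruncate(fd, 64LL * 1024 * 1024) < 0) {
		perror("ftruncate");
		return 1;
	}
	if (fstat(fd, &st) == 0)
		printf("size=%lld blocks=%lld (512-byte units)\n",
		       (long long)st.st_size, (long long)st.st_blocks);
	close(fd);
	unlink("sparse-demo.tmp");
	return 0;
}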
@@ -954,6 +1015,105 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
954 return rc; 1015 return rc;
955} 1016}
956 1017
1018static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
1019 loff_t offset, loff_t len, bool keep_size)
1020{
1021 struct inode *inode;
1022 struct cifsInodeInfo *cifsi;
1023 struct cifsFileInfo *cfile = file->private_data;
1024 struct file_zero_data_information fsctl_buf;
1025 long rc;
1026 unsigned int xid;
1027
1028 xid = get_xid();
1029
1030 inode = cfile->dentry->d_inode;
1031 cifsi = CIFS_I(inode);
1032
1033 /* if the file is not oplocked, we can't be sure whether we are asked to extend its size */
1034 if (!CIFS_CACHE_READ(cifsi))
1035 if (keep_size == false)
1036 return -EOPNOTSUPP;
1037
1038 /*
1039 * Must check whether the file is sparse, since fallocate -z (zero
1040 * range) assumes non-sparse allocation.
1041 */
1042 if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
1043 return -EOPNOTSUPP;
1044
1045 /*
1046 * Need to make sure we are not asked to extend the file, since the SMB3
1047 * fsctl does not change the file size. In the future we could change
1048 * this to zero the first part of the range and then set the file size,
1049 * which for a non-sparse file would zero the newly extended range.
1050 */
1051 if (keep_size == false)
1052 if (i_size_read(inode) < offset + len)
1053 return -EOPNOTSUPP;
1054
1055 cifs_dbg(FYI, "offset %lld len %lld", offset, len);
1056
1057 fsctl_buf.FileOffset = cpu_to_le64(offset);
1058 fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
1059
1060 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
1061 cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
1062 true /* is_fctl */, (char *)&fsctl_buf,
1063 sizeof(struct file_zero_data_information), NULL, NULL);
1064 free_xid(xid);
1065 return rc;
1066}
1067
1068static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
1069 loff_t offset, loff_t len)
1070{
1071 struct inode *inode;
1072 struct cifsInodeInfo *cifsi;
1073 struct cifsFileInfo *cfile = file->private_data;
1074 struct file_zero_data_information fsctl_buf;
1075 long rc;
1076 unsigned int xid;
1077 __u8 set_sparse = 1;
1078
1079 xid = get_xid();
1080
1081 inode = cfile->dentry->d_inode;
1082 cifsi = CIFS_I(inode);
1083
1084 /* Need to make file sparse, if not already, before freeing range. */
1085 /* Consider adding an equivalent for compressed files, since it could also work */
1086 if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
1087 return -EOPNOTSUPP;
1088
1089 cifs_dbg(FYI, "offset %lld len %lld", offset, len);
1090
1091 fsctl_buf.FileOffset = cpu_to_le64(offset);
1092 fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
1093
1094 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
1095 cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
1096 true /* is_fctl */, (char *)&fsctl_buf,
1097 sizeof(struct file_zero_data_information), NULL, NULL);
1098 free_xid(xid);
1099 return rc;
1100}
1101
1102static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
1103 loff_t off, loff_t len)
1104{
1105 /* KEEP_SIZE already checked for by do_fallocate */
1106 if (mode & FALLOC_FL_PUNCH_HOLE)
1107 return smb3_punch_hole(file, tcon, off, len);
1108 else if (mode & FALLOC_FL_ZERO_RANGE) {
1109 if (mode & FALLOC_FL_KEEP_SIZE)
1110 return smb3_zero_range(file, tcon, off, len, true);
1111 return smb3_zero_range(file, tcon, off, len, false);
1112 }
1113
1114 return -EOPNOTSUPP;
1115}
1116
957static void 1117static void
958smb2_downgrade_oplock(struct TCP_Server_Info *server, 1118smb2_downgrade_oplock(struct TCP_Server_Info *server,
959 struct cifsInodeInfo *cinode, bool set_level2) 1119 struct cifsInodeInfo *cinode, bool set_level2)
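smb3_fallocate() above is the handler that ends up behind fallocate(2) on an SMB3 mount: only hole punching and zero-range are wired up, and an extending zero-range is refused unless the oplock and sparse checks pass. A minimal caller exercising both supported modes (runs against any filesystem that implements them, not just cifs):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Punch a 64K hole at offset 0; PUNCH_HOLE requires KEEP_SIZE. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 65536) < 0)
		perror("punch hole");
	/* Zero a 64K range without changing the file size. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      65536, 65536) < 0)
		perror("zero range");
	close(fd);
	return 0;
}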
@@ -1161,6 +1321,12 @@ smb2_wp_retry_size(struct inode *inode)
1161 SMB2_MAX_BUFFER_SIZE); 1321 SMB2_MAX_BUFFER_SIZE);
1162} 1322}
1163 1323
1324static bool
1325smb2_dir_needs_close(struct cifsFileInfo *cfile)
1326{
1327 return !cfile->invalidHandle;
1328}
1329
1164struct smb_version_operations smb20_operations = { 1330struct smb_version_operations smb20_operations = {
1165 .compare_fids = smb2_compare_fids, 1331 .compare_fids = smb2_compare_fids,
1166 .setup_request = smb2_setup_request, 1332 .setup_request = smb2_setup_request,
@@ -1236,6 +1402,7 @@ struct smb_version_operations smb20_operations = {
1236 .parse_lease_buf = smb2_parse_lease_buf, 1402 .parse_lease_buf = smb2_parse_lease_buf,
1237 .clone_range = smb2_clone_range, 1403 .clone_range = smb2_clone_range,
1238 .wp_retry_size = smb2_wp_retry_size, 1404 .wp_retry_size = smb2_wp_retry_size,
1405 .dir_needs_close = smb2_dir_needs_close,
1239}; 1406};
1240 1407
1241struct smb_version_operations smb21_operations = { 1408struct smb_version_operations smb21_operations = {
@@ -1313,6 +1480,7 @@ struct smb_version_operations smb21_operations = {
1313 .parse_lease_buf = smb2_parse_lease_buf, 1480 .parse_lease_buf = smb2_parse_lease_buf,
1314 .clone_range = smb2_clone_range, 1481 .clone_range = smb2_clone_range,
1315 .wp_retry_size = smb2_wp_retry_size, 1482 .wp_retry_size = smb2_wp_retry_size,
1483 .dir_needs_close = smb2_dir_needs_close,
1316}; 1484};
1317 1485
1318struct smb_version_operations smb30_operations = { 1486struct smb_version_operations smb30_operations = {
@@ -1393,6 +1561,8 @@ struct smb_version_operations smb30_operations = {
1393 .clone_range = smb2_clone_range, 1561 .clone_range = smb2_clone_range,
1394 .validate_negotiate = smb3_validate_negotiate, 1562 .validate_negotiate = smb3_validate_negotiate,
1395 .wp_retry_size = smb2_wp_retry_size, 1563 .wp_retry_size = smb2_wp_retry_size,
1564 .dir_needs_close = smb2_dir_needs_close,
1565 .fallocate = smb3_fallocate,
1396}; 1566};
1397 1567
1398struct smb_version_values smb20_values = { 1568struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 42ebc1a8be6c..74b3a6684383 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
530 struct smb2_sess_setup_rsp *rsp = NULL; 530 struct smb2_sess_setup_rsp *rsp = NULL;
531 struct kvec iov[2]; 531 struct kvec iov[2];
532 int rc = 0; 532 int rc = 0;
533 int resp_buftype; 533 int resp_buftype = CIFS_NO_BUFFER;
534 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 534 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
535 struct TCP_Server_Info *server = ses->server; 535 struct TCP_Server_Info *server = ses->server;
536 u16 blob_length = 0; 536 u16 blob_length = 0;
@@ -907,7 +907,8 @@ tcon_exit:
907tcon_error_exit: 907tcon_error_exit:
908 if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { 908 if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
909 cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); 909 cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
910 tcon->bad_network_name = true; 910 if (tcon)
911 tcon->bad_network_name = true;
911 } 912 }
912 goto tcon_exit; 913 goto tcon_exit;
913} 914}
@@ -1224,7 +1225,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1224 1225
1225 cifs_dbg(FYI, "SMB2 IOCTL\n"); 1226 cifs_dbg(FYI, "SMB2 IOCTL\n");
1226 1227
1227 *out_data = NULL; 1228 if (out_data != NULL)
1229 *out_data = NULL;
1230
1228 /* zero out returned data len, in case of error */ 1231 /* zero out returned data len, in case of error */
1229 if (plen) 1232 if (plen)
1230 *plen = 0; 1233 *plen = 0;
@@ -1400,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
1400 rsp = (struct smb2_close_rsp *)iov[0].iov_base; 1403 rsp = (struct smb2_close_rsp *)iov[0].iov_base;
1401 1404
1402 if (rc != 0) { 1405 if (rc != 0) {
1403 if (tcon) 1406 cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
1404 cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
1405 goto close_exit; 1407 goto close_exit;
1406 } 1408 }
1407 1409
@@ -1530,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
1530{ 1532{
1531 return query_info(xid, tcon, persistent_fid, volatile_fid, 1533 return query_info(xid, tcon, persistent_fid, volatile_fid,
1532 FILE_ALL_INFORMATION, 1534 FILE_ALL_INFORMATION,
1533 sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 1535 sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
1534 sizeof(struct smb2_file_all_info), data); 1536 sizeof(struct smb2_file_all_info), data);
1535} 1537}
1536 1538
@@ -2177,6 +2179,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2177 rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; 2179 rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
2178 2180
2179 if (rc) { 2181 if (rc) {
2182 if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
2183 srch_inf->endOfSearch = true;
2184 rc = 0;
2185 }
2180 cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); 2186 cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
2181 goto qdir_exit; 2187 goto qdir_exit;
2182 } 2188 }
@@ -2214,11 +2220,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2214 else 2220 else
2215 cifs_dbg(VFS, "illegal search buffer type\n"); 2221 cifs_dbg(VFS, "illegal search buffer type\n");
2216 2222
2217 if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
2218 srch_inf->endOfSearch = 1;
2219 else
2220 srch_inf->endOfSearch = 0;
2221
2222 return rc; 2223 return rc;
2223 2224
2224qdir_exit: 2225qdir_exit:
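Taken together with the STATUS_NO_MORE_FILES -> -ENODATA remapping in smb2maperror.c, the two smb2pdu.c hunks above move end-of-search detection into the error path: the sentinel status now arrives as -ENODATA and is converted back into endOfSearch = true with rc = 0. The same in-band sentinel pattern, sketched standalone with a hypothetical next_entry() producer:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical producer: yields entry ids 0..4, then "no more files". */
static int next_entry(int *cursor)
{
	if (*cursor >= 5)
		return -ENODATA;	/* plays the STATUS_NO_MORE_FILES role */
	return (*cursor)++;
}

int main(void)
{
	int cursor = 0;
	bool end_of_search = false;

	while (!end_of_search) {
		int rc = next_entry(&cursor);

		if (rc == -ENODATA) {	/* clean termination, not an error */
			end_of_search = true;
		} else if (rc < 0) {
			fprintf(stderr, "enumeration failed: %d\n", rc);
			return 1;
		} else {
			printf("entry %d\n", rc);
		}
	}
	return 0;
}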
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 69f3595d3952..fbe486c285a9 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -573,6 +573,12 @@ struct copychunk_ioctl {
573 __u32 Reserved2; 573 __u32 Reserved2;
574} __packed; 574} __packed;
575 575
576/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
577struct file_zero_data_information {
578 __le64 FileOffset;
579 __le64 BeyondFinalZero;
580} __packed;
581
576struct copychunk_ioctl_rsp { 582struct copychunk_ioctl_rsp {
577 __le32 ChunksWritten; 583 __le32 ChunksWritten;
578 __le32 ChunkBytesWritten; 584 __le32 ChunkBytesWritten;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 0e538b5c9622..83efa59535be 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -63,7 +63,7 @@
63#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ 63#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
64#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ 64#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
65#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ 65#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
66#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ 66#define FSCTL_SET_ZERO_DATA 0x000980C8
67#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ 67#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
68#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ 68#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
69#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ 69#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
diff --git a/fs/dcache.c b/fs/dcache.c
index d30ce699ae4b..cb25a1a5e307 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
106 unsigned int hash) 106 unsigned int hash)
107{ 107{
108 hash += (unsigned long) parent / L1_CACHE_BYTES; 108 hash += (unsigned long) parent / L1_CACHE_BYTES;
109 hash = hash + (hash >> d_hash_shift); 109 return dentry_hashtable + hash_32(hash, d_hash_shift);
110 return dentry_hashtable + (hash & d_hash_mask);
111} 110}
112 111
113/* Statistics gathering. */ 112/* Statistics gathering. */
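The d_hash() rewrite drops the hand-rolled fold (add the shifted high bits, then mask) in favor of hash_32(), the kernel's multiplicative hash, which keeps the best-mixed high bits of the product. A userspace approximation of what hash_32() computed in kernels of this era; treat the exact golden-ratio constant as an assumption rather than gospel:

#include <stdint.h>
#include <stdio.h>

/* Golden-ratio prime used by the kernel's 32-bit hash at the time. */
#define GOLDEN_RATIO_PRIME_32 0x9e370001U

/* Multiply, then keep the top 'bits' bits: the high bits mix best. */
static inline uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
	unsigned int d_hash_shift = 10;	/* e.g. a 1024-bucket table */

	for (uint32_t h = 0; h < 4; h++)
		printf("hash_32(%u, %u) = %u\n", (unsigned)h, d_hash_shift,
		       (unsigned)hash_32(h, d_hash_shift));
	return 0;
}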
@@ -2373,7 +2372,8 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2373} 2372}
2374EXPORT_SYMBOL(dentry_update_name_case); 2373EXPORT_SYMBOL(dentry_update_name_case);
2375 2374
2376static void switch_names(struct dentry *dentry, struct dentry *target) 2375static void switch_names(struct dentry *dentry, struct dentry *target,
2376 bool exchange)
2377{ 2377{
2378 if (dname_external(target)) { 2378 if (dname_external(target)) {
2379 if (dname_external(dentry)) { 2379 if (dname_external(dentry)) {
@@ -2407,13 +2407,19 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2407 */ 2407 */
2408 unsigned int i; 2408 unsigned int i;
2409 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); 2409 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
2410 if (!exchange) {
2411 memcpy(dentry->d_iname, target->d_name.name,
2412 target->d_name.len + 1);
2413 dentry->d_name.hash_len = target->d_name.hash_len;
2414 return;
2415 }
2410 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { 2416 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
2411 swap(((long *) &dentry->d_iname)[i], 2417 swap(((long *) &dentry->d_iname)[i],
2412 ((long *) &target->d_iname)[i]); 2418 ((long *) &target->d_iname)[i]);
2413 } 2419 }
2414 } 2420 }
2415 } 2421 }
2416 swap(dentry->d_name.len, target->d_name.len); 2422 swap(dentry->d_name.hash_len, target->d_name.hash_len);
2417} 2423}
2418 2424
2419static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) 2425static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
@@ -2443,25 +2449,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2443 } 2449 }
2444} 2450}
2445 2451
2446static void dentry_unlock_parents_for_move(struct dentry *dentry, 2452static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
2447 struct dentry *target)
2448{ 2453{
2449 if (target->d_parent != dentry->d_parent) 2454 if (target->d_parent != dentry->d_parent)
2450 spin_unlock(&dentry->d_parent->d_lock); 2455 spin_unlock(&dentry->d_parent->d_lock);
2451 if (target->d_parent != target) 2456 if (target->d_parent != target)
2452 spin_unlock(&target->d_parent->d_lock); 2457 spin_unlock(&target->d_parent->d_lock);
2458 spin_unlock(&target->d_lock);
2459 spin_unlock(&dentry->d_lock);
2453} 2460}
2454 2461
2455/* 2462/*
2456 * When switching names, the actual string doesn't strictly have to 2463 * When switching names, the actual string doesn't strictly have to
2457 * be preserved in the target - because we're dropping the target 2464 * be preserved in the target - because we're dropping the target
2458 * anyway. As such, we can just do a simple memcpy() to copy over 2465 * anyway. As such, we can just do a simple memcpy() to copy over
2459 * the new name before we switch. 2466 * the new name before we switch, unless we are going to rehash
2460 * 2467 * it. Note that if we *do* unhash the target, we are not allowed
2461 * Note that we have to be a lot more careful about getting the hash 2468 * to rehash it without giving it a new name/hash key - whether
2462 * switched - we have to switch the hash value properly even if it 2469 * we swap or overwrite the names here, the resulting name won't match
2463 * then no longer matches the actual (corrupted) string of the target. 2470 * the reality in the filesystem; it's only there for d_path() purposes.
2464 * The hash value has to match the hash queue that the dentry is on.. 2471 * Note that all of this is happening under rename_lock, so
2472 * any hash lookup seeing it in the middle of manipulations will
2473 * be discarded anyway. So we do not care what happens to the hash
2474 * key in that case.
2465 */ 2475 */
2466/* 2476/*
2467 * __d_move - move a dentry 2477 * __d_move - move a dentry
@@ -2507,36 +2517,30 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
2507 d_hash(dentry->d_parent, dentry->d_name.hash)); 2517 d_hash(dentry->d_parent, dentry->d_name.hash));
2508 } 2518 }
2509 2519
2510 list_del(&dentry->d_u.d_child);
2511 list_del(&target->d_u.d_child);
2512
2513 /* Switch the names.. */ 2520 /* Switch the names.. */
2514 switch_names(dentry, target); 2521 switch_names(dentry, target, exchange);
2515 swap(dentry->d_name.hash, target->d_name.hash);
2516 2522
2517 /* ... and switch the parents */ 2523 /* ... and switch them in the tree */
2518 if (IS_ROOT(dentry)) { 2524 if (IS_ROOT(dentry)) {
2525 /* splicing a tree */
2519 dentry->d_parent = target->d_parent; 2526 dentry->d_parent = target->d_parent;
2520 target->d_parent = target; 2527 target->d_parent = target;
2521 INIT_LIST_HEAD(&target->d_u.d_child); 2528 list_del_init(&target->d_u.d_child);
2529 list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2522 } else { 2530 } else {
2531 /* swapping two dentries */
2523 swap(dentry->d_parent, target->d_parent); 2532 swap(dentry->d_parent, target->d_parent);
2524 2533 list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
2525 /* And add them back to the (new) parent lists */ 2534 list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2526 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 2535 if (exchange)
2536 fsnotify_d_move(target);
2537 fsnotify_d_move(dentry);
2527 } 2538 }
2528 2539
2529 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2530
2531 write_seqcount_end(&target->d_seq); 2540 write_seqcount_end(&target->d_seq);
2532 write_seqcount_end(&dentry->d_seq); 2541 write_seqcount_end(&dentry->d_seq);
2533 2542
2534 dentry_unlock_parents_for_move(dentry, target); 2543 dentry_unlock_for_move(dentry, target);
2535 if (exchange)
2536 fsnotify_d_move(target);
2537 spin_unlock(&target->d_lock);
2538 fsnotify_d_move(dentry);
2539 spin_unlock(&dentry->d_lock);
2540} 2544}
2541 2545
2542/* 2546/*
@@ -2634,39 +2638,6 @@ out_err:
2634 return ret; 2638 return ret;
2635} 2639}
2636 2640
2637/*
2638 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
2639 * named dentry in place of the dentry to be replaced.
2640 * returns with anon->d_lock held!
2641 */
2642static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2643{
2644 struct dentry *dparent;
2645
2646 dentry_lock_for_move(anon, dentry);
2647
2648 write_seqcount_begin(&dentry->d_seq);
2649 write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
2650
2651 dparent = dentry->d_parent;
2652
2653 switch_names(dentry, anon);
2654 swap(dentry->d_name.hash, anon->d_name.hash);
2655
2656 dentry->d_parent = dentry;
2657 list_del_init(&dentry->d_u.d_child);
2658 anon->d_parent = dparent;
2659 list_move(&anon->d_u.d_child, &dparent->d_subdirs);
2660
2661 write_seqcount_end(&dentry->d_seq);
2662 write_seqcount_end(&anon->d_seq);
2663
2664 dentry_unlock_parents_for_move(anon, dentry);
2665 spin_unlock(&dentry->d_lock);
2666
2667 /* anon->d_lock still locked, returns locked */
2668}
2669
2670/** 2641/**
2671 * d_splice_alias - splice a disconnected dentry into the tree if one exists 2642 * d_splice_alias - splice a disconnected dentry into the tree if one exists
2672 * @inode: the inode which may have a disconnected dentry 2643 * @inode: the inode which may have a disconnected dentry
@@ -2712,11 +2683,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2712 return ERR_PTR(-EIO); 2683 return ERR_PTR(-EIO);
2713 } 2684 }
2714 write_seqlock(&rename_lock); 2685 write_seqlock(&rename_lock);
2715 __d_materialise_dentry(dentry, new); 2686 __d_move(new, dentry, false);
2716 write_sequnlock(&rename_lock); 2687 write_sequnlock(&rename_lock);
2717 __d_drop(new);
2718 _d_rehash(new);
2719 spin_unlock(&new->d_lock);
2720 spin_unlock(&inode->i_lock); 2688 spin_unlock(&inode->i_lock);
2721 security_d_instantiate(new, inode); 2689 security_d_instantiate(new, inode);
2722 iput(inode); 2690 iput(inode);
@@ -2776,9 +2744,8 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2776 } else if (IS_ROOT(alias)) { 2744 } else if (IS_ROOT(alias)) {
2777 /* Is this an anonymous mountpoint that we 2745 /* Is this an anonymous mountpoint that we
2778 * could splice into our tree? */ 2746 * could splice into our tree? */
2779 __d_materialise_dentry(dentry, alias); 2747 __d_move(alias, dentry, false);
2780 write_sequnlock(&rename_lock); 2748 write_sequnlock(&rename_lock);
2781 __d_drop(alias);
2782 goto found; 2749 goto found;
2783 } else { 2750 } else {
2784 /* Nope, but we must(!) avoid directory 2751 /* Nope, but we must(!) avoid directory
@@ -2804,13 +2771,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2804 actual = __d_instantiate_unique(dentry, inode); 2771 actual = __d_instantiate_unique(dentry, inode);
2805 if (!actual) 2772 if (!actual)
2806 actual = dentry; 2773 actual = dentry;
2807 else
2808 BUG_ON(!d_unhashed(actual));
2809 2774
2810 spin_lock(&actual->d_lock); 2775 d_rehash(actual);
2811found: 2776found:
2812 _d_rehash(actual);
2813 spin_unlock(&actual->d_lock);
2814 spin_unlock(&inode->i_lock); 2777 spin_unlock(&inode->i_lock);
2815out_nolock: 2778out_nolock:
2816 if (actual == dentry) { 2779 if (actual == dentry) {
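The exchange flag threaded through switch_names() and __d_move() above exists to service renameat2(2) with RENAME_EXCHANGE, where both dentries survive the operation and both names must stay valid (hence the full swap instead of the one-way memcpy, and the second fsnotify_d_move()). Driving that path from userspace, via the raw syscall since libc wrappers were not yet universal when this code landed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)	/* from linux/fs.h */
#endif

int main(int argc, char **argv)
{
	if (argc < 3) {
		fprintf(stderr, "usage: %s <path1> <path2>\n", argv[0]);
		return 1;
	}
	/* Atomically swap the two names; both must already exist.
	 * Requires a kernel (>= 3.15) and libc exposing SYS_renameat2. */
	if (syscall(SYS_renameat2, AT_FDCWD, argv[1],
		    AT_FDCWD, argv[2], RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}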
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c3116404ab49..e181b6b2e297 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
158{ 158{
159 ssize_t ret; 159 ssize_t ret;
160 160
161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES, 161 ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
162 &sdio->from); 162 &sdio->from);
163 163
164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index db0fad3269c0..b4b6ab9873ae 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
229 if (rc) { 229 if (rc) {
230 printk(KERN_ERR "%s: Error attempting to initialize " 230 printk(KERN_ERR "%s: Error attempting to initialize "
231 "the lower file for the dentry with name " 231 "the lower file for the dentry with name "
232 "[%s]; rc = [%d]\n", __func__, 232 "[%pd]; rc = [%d]\n", __func__,
233 ecryptfs_dentry->d_name.name, rc); 233 ecryptfs_dentry, rc);
234 goto out_free; 234 goto out_free;
235 } 235 }
236 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) 236 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d4a9431ec73c..1686dc2da9fd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir)
53 53
54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) 54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
55{ 55{
56 if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) 56 return ecryptfs_inode_to_lower(inode) == lower_inode;
57 return 1;
58 return 0;
59} 57}
60 58
61static int ecryptfs_inode_set(struct inode *inode, void *opaque) 59static int ecryptfs_inode_set(struct inode *inode, void *opaque)
@@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode,
192 190
193 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 191 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
194 lower_dir_dentry = lock_parent(lower_dentry); 192 lower_dir_dentry = lock_parent(lower_dentry);
195 if (IS_ERR(lower_dir_dentry)) {
196 ecryptfs_printk(KERN_ERR, "Error locking directory of "
197 "dentry\n");
198 inode = ERR_CAST(lower_dir_dentry);
199 goto out;
200 }
201 rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); 193 rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true);
202 if (rc) { 194 if (rc) {
203 printk(KERN_ERR "%s: Failure to create dentry in lower fs; " 195 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode,
215 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
216out_lock: 208out_lock:
217 unlock_dir(lower_dir_dentry); 209 unlock_dir(lower_dir_dentry);
218out:
219 return inode; 210 return inode;
220} 211}
221 212
@@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
250 if (rc) { 241 if (rc) {
251 printk(KERN_ERR "%s: Error attempting to initialize " 242 printk(KERN_ERR "%s: Error attempting to initialize "
252 "the lower file for the dentry with name " 243 "the lower file for the dentry with name "
253 "[%s]; rc = [%d]\n", __func__, 244 "[%pd]; rc = [%d]\n", __func__,
254 ecryptfs_dentry->d_name.name, rc); 245 ecryptfs_dentry, rc);
255 goto out; 246 goto out;
256 } 247 }
257 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); 248 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
@@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
313 if (rc) { 304 if (rc) {
314 printk(KERN_ERR "%s: Error attempting to initialize " 305 printk(KERN_ERR "%s: Error attempting to initialize "
315 "the lower file for the dentry with name " 306 "the lower file for the dentry with name "
316 "[%s]; rc = [%d]\n", __func__, 307 "[%pd]; rc = [%d]\n", __func__,
317 dentry->d_name.name, rc); 308 dentry, rc);
318 return rc; 309 return rc;
319 } 310 }
320 311
@@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
418 if (IS_ERR(lower_dentry)) { 409 if (IS_ERR(lower_dentry)) {
419 rc = PTR_ERR(lower_dentry); 410 rc = PTR_ERR(lower_dentry);
420 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 411 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
421 "[%d] on lower_dentry = [%s]\n", __func__, rc, 412 "[%d] on lower_dentry = [%pd]\n", __func__, rc,
422 ecryptfs_dentry->d_name.name); 413 ecryptfs_dentry);
423 goto out; 414 goto out;
424 } 415 }
425 if (lower_dentry->d_inode) 416 if (lower_dentry->d_inode)
@@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 } 1030 }
1040 1031
1041 rc = vfs_setxattr(lower_dentry, name, value, size, flags); 1032 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1042 if (!rc) 1033 if (!rc && dentry->d_inode)
1043 fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); 1034 fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
1044out: 1035out:
1045 return rc; 1036 return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 4725a07f003c..635e8e16a5b7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -26,7 +26,6 @@
26 */ 26 */
27 27
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/syscalls.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/key.h> 30#include <linux/key.h>
32#include <linux/random.h> 31#include <linux/random.h>
@@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1846 "(Tag 11 not allowed by itself)\n"); 1845 "(Tag 11 not allowed by itself)\n");
1847 rc = -EIO; 1846 rc = -EIO;
1848 goto out_wipe_list; 1847 goto out_wipe_list;
1849 break;
1850 default: 1848 default:
1851 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " 1849 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
1852 "of the file header; hex value of " 1850 "of the file header; hex value of "
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index e57380e5f6bd..286f10b0363b 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void)
434 mutex_lock(&ecryptfs_msg_ctx_lists_mux); 434 mutex_lock(&ecryptfs_msg_ctx_lists_mux);
435 for (i = 0; i < ecryptfs_message_buf_len; i++) { 435 for (i = 0; i < ecryptfs_message_buf_len; i++) {
436 mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); 436 mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
437 if (ecryptfs_msg_ctx_arr[i].msg) 437 kfree(ecryptfs_msg_ctx_arr[i].msg);
438 kfree(ecryptfs_msg_ctx_arr[i].msg);
439 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); 438 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
440 } 439 }
441 kfree(ecryptfs_msg_ctx_arr); 440 kfree(ecryptfs_msg_ctx_arr);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b10b48c2a7af..7bcfff900f05 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1852 goto error_tgt_fput; 1852 goto error_tgt_fput;
1853 1853
1854 /* Check if EPOLLWAKEUP is allowed */ 1854 /* Check if EPOLLWAKEUP is allowed */
1855 ep_take_care_of_epollwakeup(&epds); 1855 if (ep_op_has_event(op))
1856 ep_take_care_of_epollwakeup(&epds);
1856 1857
1857 /* 1858 /*
1858 * We have to check that the file structure underneath the file descriptor 1859 * We have to check that the file structure underneath the file descriptor
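The ep_op_has_event() guard added above matters because EPOLL_CTL_DEL carries no event structure, so for that op epds holds uninitialized stack and must not be inspected. The newly-guarded case is perfectly legal from userspace:

#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN };
	int epfd = epoll_create1(0);
	int efd = eventfd(0, 0);

	if (epfd < 0 || efd < 0) {
		perror("setup");
		return 1;
	}
	ev.data.fd = efd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev) < 0)
		perror("EPOLL_CTL_ADD");
	/* DEL takes no event: a NULL pointer here is valid per epoll_ctl(2). */
	if (epoll_ctl(epfd, EPOLL_CTL_DEL, efd, NULL) < 0)
		perror("EPOLL_CTL_DEL");
	close(efd);
	close(epfd);
	return 0;
}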
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b88edc05c230..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); 1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
1068 1068
1069 err = percpu_counter_init(&sbi->s_freeblocks_counter, 1069 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1070 ext2_count_free_blocks(sb)); 1070 ext2_count_free_blocks(sb), GFP_KERNEL);
1071 if (!err) { 1071 if (!err) {
1072 err = percpu_counter_init(&sbi->s_freeinodes_counter, 1072 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1073 ext2_count_free_inodes(sb)); 1073 ext2_count_free_inodes(sb), GFP_KERNEL);
1074 } 1074 }
1075 if (!err) { 1075 if (!err) {
1076 err = percpu_counter_init(&sbi->s_dirs_counter, 1076 err = percpu_counter_init(&sbi->s_dirs_counter,
1077 ext2_count_dirs(sb)); 1077 ext2_count_dirs(sb), GFP_KERNEL);
1078 } 1078 }
1079 if (err) { 1079 if (err) {
1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory"); 1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index e85ff15a060e..fc3cdcf24aed 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -237,6 +237,8 @@ struct ext3_new_group_data {
237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
239 239
240/* Number of supported quota types */
241#define EXT3_MAXQUOTAS 2
240 242
241/* 243/*
242 * Mount options 244 * Mount options
@@ -248,7 +250,7 @@ struct ext3_mount_options {
248 unsigned long s_commit_interval; 250 unsigned long s_commit_interval;
249#ifdef CONFIG_QUOTA 251#ifdef CONFIG_QUOTA
250 int s_jquota_fmt; 252 int s_jquota_fmt;
251 char *s_qf_names[MAXQUOTAS]; 253 char *s_qf_names[EXT3_MAXQUOTAS];
252#endif 254#endif
253}; 255};
254 256
@@ -669,7 +671,7 @@ struct ext3_sb_info {
669 unsigned long s_commit_interval; 671 unsigned long s_commit_interval;
670 struct block_device *journal_bdev; 672 struct block_device *journal_bdev;
671#ifdef CONFIG_QUOTA 673#ifdef CONFIG_QUOTA
672 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ 674 char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
673 int s_jquota_fmt; /* Format of quota to use */ 675 int s_jquota_fmt; /* Format of quota to use */
674#endif 676#endif
675}; 677};
@@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations;
1183#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 1185#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
1184#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 1186#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
1185#endif 1187#endif
1186#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) 1188#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
1187#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) 1189#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
1188#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) 1190#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
1189 1191
1190int 1192int
1191ext3_mark_iloc_dirty(handle_t *handle, 1193ext3_mark_iloc_dirty(handle_t *handle,
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 08cdfe5461e3..7015db0bafd1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb)
441 percpu_counter_destroy(&sbi->s_dirs_counter); 441 percpu_counter_destroy(&sbi->s_dirs_counter);
442 brelse(sbi->s_sbh); 442 brelse(sbi->s_sbh);
443#ifdef CONFIG_QUOTA 443#ifdef CONFIG_QUOTA
444 for (i = 0; i < MAXQUOTAS; i++) 444 for (i = 0; i < EXT3_MAXQUOTAS; i++)
445 kfree(sbi->s_qf_names[i]); 445 kfree(sbi->s_qf_names[i]);
446#endif 446#endif
447 447
@@ -1555,7 +1555,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1555 /* Needed for iput() to work correctly and not trash data */ 1555 /* Needed for iput() to work correctly and not trash data */
1556 sb->s_flags |= MS_ACTIVE; 1556 sb->s_flags |= MS_ACTIVE;
1557 /* Turn on quotas so that they are updated correctly */ 1557 /* Turn on quotas so that they are updated correctly */
1558 for (i = 0; i < MAXQUOTAS; i++) { 1558 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1559 if (EXT3_SB(sb)->s_qf_names[i]) { 1559 if (EXT3_SB(sb)->s_qf_names[i]) {
1560 int ret = ext3_quota_on_mount(sb, i); 1560 int ret = ext3_quota_on_mount(sb, i);
1561 if (ret < 0) 1561 if (ret < 0)
@@ -1606,7 +1606,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1606 PLURAL(nr_truncates)); 1606 PLURAL(nr_truncates));
1607#ifdef CONFIG_QUOTA 1607#ifdef CONFIG_QUOTA
1608 /* Turn quotas off */ 1608 /* Turn quotas off */
1609 for (i = 0; i < MAXQUOTAS; i++) { 1609 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1610 if (sb_dqopt(sb)->files[i]) 1610 if (sb_dqopt(sb)->files[i])
1611 dquot_quota_off(sb, i); 1611 dquot_quota_off(sb, i);
1612 } 1612 }
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2039 goto failed_mount2; 2039 goto failed_mount2;
2040 } 2040 }
2041 err = percpu_counter_init(&sbi->s_freeblocks_counter, 2041 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2042 ext3_count_free_blocks(sb)); 2042 ext3_count_free_blocks(sb), GFP_KERNEL);
2043 if (!err) { 2043 if (!err) {
2044 err = percpu_counter_init(&sbi->s_freeinodes_counter, 2044 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2045 ext3_count_free_inodes(sb)); 2045 ext3_count_free_inodes(sb), GFP_KERNEL);
2046 } 2046 }
2047 if (!err) { 2047 if (!err) {
2048 err = percpu_counter_init(&sbi->s_dirs_counter, 2048 err = percpu_counter_init(&sbi->s_dirs_counter,
2049 ext3_count_dirs(sb)); 2049 ext3_count_dirs(sb), GFP_KERNEL);
2050 } 2050 }
2051 if (err) { 2051 if (err) {
2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
@@ -2139,7 +2139,7 @@ failed_mount2:
2139 kfree(sbi->s_group_desc); 2139 kfree(sbi->s_group_desc);
2140failed_mount: 2140failed_mount:
2141#ifdef CONFIG_QUOTA 2141#ifdef CONFIG_QUOTA
2142 for (i = 0; i < MAXQUOTAS; i++) 2142 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2143 kfree(sbi->s_qf_names[i]); 2143 kfree(sbi->s_qf_names[i]);
2144#endif 2144#endif
2145 ext3_blkdev_remove(sbi); 2145 ext3_blkdev_remove(sbi);
@@ -2659,7 +2659,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2659 old_opts.s_commit_interval = sbi->s_commit_interval; 2659 old_opts.s_commit_interval = sbi->s_commit_interval;
2660#ifdef CONFIG_QUOTA 2660#ifdef CONFIG_QUOTA
2661 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 2661 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2662 for (i = 0; i < MAXQUOTAS; i++) 2662 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2663 if (sbi->s_qf_names[i]) { 2663 if (sbi->s_qf_names[i]) {
2664 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], 2664 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
2665 GFP_KERNEL); 2665 GFP_KERNEL);
@@ -2763,7 +2763,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2763 } 2763 }
2764#ifdef CONFIG_QUOTA 2764#ifdef CONFIG_QUOTA
2765 /* Release old quota file names */ 2765 /* Release old quota file names */
2766 for (i = 0; i < MAXQUOTAS; i++) 2766 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2767 kfree(old_opts.s_qf_names[i]); 2767 kfree(old_opts.s_qf_names[i]);
2768#endif 2768#endif
2769 if (enable_quota) 2769 if (enable_quota)
@@ -2777,7 +2777,7 @@ restore_opts:
2777 sbi->s_commit_interval = old_opts.s_commit_interval; 2777 sbi->s_commit_interval = old_opts.s_commit_interval;
2778#ifdef CONFIG_QUOTA 2778#ifdef CONFIG_QUOTA
2779 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 2779 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2780 for (i = 0; i < MAXQUOTAS; i++) { 2780 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
2781 kfree(sbi->s_qf_names[i]); 2781 kfree(sbi->s_qf_names[i]);
2782 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2782 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2783 } 2783 }
@@ -2828,8 +2828,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2828 */ 2828 */
2829 overhead += ngroups * (2 + sbi->s_itb_per_group); 2829 overhead += ngroups * (2 + sbi->s_itb_per_group);
2830 2830
2831 /* Add the journal blocks as well */ 2831 /* Add the internal journal blocks as well */
2832 overhead += sbi->s_journal->j_maxlen; 2832 if (sbi->s_journal && !sbi->journal_bdev)
2833 overhead += sbi->s_journal->j_maxlen;
2833 2834
2834 sbi->s_overhead_last = overhead; 2835 sbi->s_overhead_last = overhead;
2835 smp_wmb(); 2836 smp_wmb();
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5b19760b1de5..b0c225cdb52c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
1825/* 1825/*
1826 * Special error return code only used by dx_probe() and its callers. 1826 * Special error return code only used by dx_probe() and its callers.
1827 */ 1827 */
1828#define ERR_BAD_DX_DIR -75000 1828#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
1829 1829
1830/* 1830/*
1831 * Timeout and state flag for lazy initialization inode thread. 1831 * Timeout and state flag for lazy initialization inode thread.
@@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
2454 up_write(&EXT4_I(inode)->i_data_sem); 2454 up_write(&EXT4_I(inode)->i_data_sem);
2455} 2455}
2456 2456
2457/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
2458static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
2459{
2460 int changed = 0;
2461
2462 if (newsize > inode->i_size) {
2463 i_size_write(inode, newsize);
2464 changed = 1;
2465 }
2466 if (newsize > EXT4_I(inode)->i_disksize) {
2467 ext4_update_i_disksize(inode, newsize);
2468 changed |= 2;
2469 }
2470 return changed;
2471}
2472
2457struct ext4_group_info { 2473struct ext4_group_info {
2458 unsigned long bb_state; 2474 unsigned long bb_state;
2459 struct rb_root bb_free_root; 2475 struct rb_root bb_free_root;
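The new ext4_update_inode_size() helper folds the two size watermarks (in-memory i_size and on-disk i_disksize) into one i_mutex-protected update and reports which of them moved through a two-bit return value; callers test the bits, as the `& 0x1` in the extents code below does. The return-flag convention in isolation, with a toy inode:

#include <stdio.h>

/* Toy stand-in for the two ext4 size watermarks. */
struct toy_inode {
	long long i_size;	/* visible size */
	long long i_disksize;	/* size guaranteed on disk */
};

/* Bit 0: i_size grew; bit 1: i_disksize grew. Mirrors the helper above. */
static int update_inode_size(struct toy_inode *inode, long long newsize)
{
	int changed = 0;

	if (newsize > inode->i_size) {
		inode->i_size = newsize;
		changed = 1;
	}
	if (newsize > inode->i_disksize) {
		inode->i_disksize = newsize;
		changed |= 2;
	}
	return changed;
}

int main(void)
{
	struct toy_inode ino = { .i_size = 4096, .i_disksize = 0 };

	/* Delalloc-style case: the on-disk size lags the visible size. */
	printf("changed=%d\n", update_inode_size(&ino, 2048)); /* 2 */
	printf("changed=%d\n", update_inode_size(&ino, 8192)); /* 3 */
	printf("changed=%d\n", update_inode_size(&ino, 8192)); /* 0 */
	return 0;
}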
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 76c2df382b7d..74292a71b384 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4665,7 +4665,8 @@ retry:
4665} 4665}
4666 4666
4667static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4667static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4668 ext4_lblk_t len, int flags, int mode) 4668 ext4_lblk_t len, loff_t new_size,
4669 int flags, int mode)
4669{ 4670{
4670 struct inode *inode = file_inode(file); 4671 struct inode *inode = file_inode(file);
4671 handle_t *handle; 4672 handle_t *handle;
@@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4674 int retries = 0; 4675 int retries = 0;
4675 struct ext4_map_blocks map; 4676 struct ext4_map_blocks map;
4676 unsigned int credits; 4677 unsigned int credits;
4678 loff_t epos;
4677 4679
4678 map.m_lblk = offset; 4680 map.m_lblk = offset;
4681 map.m_len = len;
4679 /* 4682 /*
4680 * Don't normalize the request if it can fit in one extent so 4683 * Don't normalize the request if it can fit in one extent so
4681 * that it doesn't get unnecessarily split into multiple 4684 * that it doesn't get unnecessarily split into multiple
@@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4690 credits = ext4_chunk_trans_blocks(inode, len); 4693 credits = ext4_chunk_trans_blocks(inode, len);
4691 4694
4692retry: 4695retry:
4693 while (ret >= 0 && ret < len) { 4696 while (ret >= 0 && len) {
4694 map.m_lblk = map.m_lblk + ret;
4695 map.m_len = len = len - ret;
4696 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4697 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4697 credits); 4698 credits);
4698 if (IS_ERR(handle)) { 4699 if (IS_ERR(handle)) {
@@ -4709,6 +4710,21 @@ retry:
4709 ret2 = ext4_journal_stop(handle); 4710 ret2 = ext4_journal_stop(handle);
4710 break; 4711 break;
4711 } 4712 }
4713 map.m_lblk += ret;
4714 map.m_len = len = len - ret;
4715 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4716 inode->i_ctime = ext4_current_time(inode);
4717 if (new_size) {
4718 if (epos > new_size)
4719 epos = new_size;
4720 if (ext4_update_inode_size(inode, epos) & 0x1)
4721 inode->i_mtime = inode->i_ctime;
4722 } else {
4723 if (epos > inode->i_size)
4724 ext4_set_inode_flag(inode,
4725 EXT4_INODE_EOFBLOCKS);
4726 }
4727 ext4_mark_inode_dirty(handle, inode);
4712 ret2 = ext4_journal_stop(handle); 4728 ret2 = ext4_journal_stop(handle);
4713 if (ret2) 4729 if (ret2)
4714 break; 4730 break;
@@ -4731,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4731 loff_t new_size = 0; 4747 loff_t new_size = 0;
4732 int ret = 0; 4748 int ret = 0;
4733 int flags; 4749 int flags;
4734 int partial; 4750 int credits;
4751 int partial_begin, partial_end;
4735 loff_t start, end; 4752 loff_t start, end;
4736 ext4_lblk_t lblk; 4753 ext4_lblk_t lblk;
4737 struct address_space *mapping = inode->i_mapping; 4754 struct address_space *mapping = inode->i_mapping;
@@ -4771,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4771 4788
4772 if (start < offset || end > offset + len) 4789 if (start < offset || end > offset + len)
4773 return -EINVAL; 4790 return -EINVAL;
4774 partial = (offset + len) & ((1 << blkbits) - 1); 4791 partial_begin = offset & ((1 << blkbits) - 1);
4792 partial_end = (offset + len) & ((1 << blkbits) - 1);
4775 4793
4776 lblk = start >> blkbits; 4794 lblk = start >> blkbits;
4777 max_blocks = (end >> blkbits); 4795 max_blocks = (end >> blkbits);
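partial_begin and partial_end are simply the sub-block remainders of the two range endpoints; a nonzero value means that edge cannot be handled by unwritten-extent conversion and needs explicit zeroing. A sketch of the arithmetic, assuming the round_up/round_down of start and end that the function performs just above the quoted hunk:

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4K blocks */
	long long offset = 5000, len = 20000;	/* example range */
	long long mask = (1LL << blkbits) - 1;

	long long partial_begin = offset & mask;
	long long partial_end = (offset + len) & mask;
	/* Round the interior inward to whole blocks. */
	long long start = (offset + mask) & ~mask;	/* round up */
	long long end = (offset + len) & ~mask;		/* round down */

	printf("partial_begin=%lld partial_end=%lld\n",
	       partial_begin, partial_end);
	printf("block-aligned interior: [%lld, %lld)\n", start, end);
	return 0;
}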
@@ -4805,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4805 * If we have a partial block after EOF we have to allocate 4823 * If we have a partial block after EOF we have to allocate
4806 * the entire block. 4824 * the entire block.
4807 */ 4825 */
4808 if (partial) 4826 if (partial_end)
4809 max_blocks += 1; 4827 max_blocks += 1;
4810 } 4828 }
4811 4829
@@ -4813,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4813 4831
4814 /* Now release the pages and zero block aligned part of pages*/ 4832 /* Now release the pages and zero block aligned part of pages*/
4815 truncate_pagecache_range(inode, start, end - 1); 4833 truncate_pagecache_range(inode, start, end - 1);
4834 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4816 4835
4817 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4836 /* Wait all existing dio workers, newcomers will block on i_mutex */
4818 ext4_inode_block_unlocked_dio(inode); 4837 ext4_inode_block_unlocked_dio(inode);
@@ -4825,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4825 if (ret) 4844 if (ret)
4826 goto out_dio; 4845 goto out_dio;
4827 4846
4828 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, 4847 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4829 mode); 4848 flags, mode);
4830 if (ret) 4849 if (ret)
4831 goto out_dio; 4850 goto out_dio;
4832 } 4851 }
4852 if (!partial_begin && !partial_end)
4853 goto out_dio;
4833 4854
4834 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); 4855 /*
4856 * In the worst case we have to write out two nonadjacent unwritten
4857 * blocks and update the inode.
4858 */
4859 credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4860 if (ext4_should_journal_data(inode))
4861 credits += 2;
4862 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4835 if (IS_ERR(handle)) { 4863 if (IS_ERR(handle)) {
4836 ret = PTR_ERR(handle); 4864 ret = PTR_ERR(handle);
4837 ext4_std_error(inode->i_sb, ret); 4865 ext4_std_error(inode->i_sb, ret);
@@ -4839,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4839 } 4867 }
4840 4868
4841 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4869 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4842
4843 if (new_size) { 4870 if (new_size) {
4844 if (new_size > i_size_read(inode)) 4871 ext4_update_inode_size(inode, new_size);
4845 i_size_write(inode, new_size);
4846 if (new_size > EXT4_I(inode)->i_disksize)
4847 ext4_update_i_disksize(inode, new_size);
4848 } else { 4872 } else {
4849 /* 4873 /*
4850 * Mark that we allocate beyond EOF so the subsequent truncate 4874 * Mark that we allocate beyond EOF so the subsequent truncate
@@ -4853,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4853 if ((offset + len) > i_size_read(inode)) 4877 if ((offset + len) > i_size_read(inode))
4854 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4878 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4855 } 4879 }
4856
4857 ext4_mark_inode_dirty(handle, inode); 4880 ext4_mark_inode_dirty(handle, inode);
4858 4881
4859 /* Zero out partial block at the edges of the range */ 4882 /* Zero out partial block at the edges of the range */
@@ -4880,13 +4903,11 @@ out_mutex:
4880long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4903long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4881{ 4904{
4882 struct inode *inode = file_inode(file); 4905 struct inode *inode = file_inode(file);
4883 handle_t *handle;
4884 loff_t new_size = 0; 4906 loff_t new_size = 0;
4885 unsigned int max_blocks; 4907 unsigned int max_blocks;
4886 int ret = 0; 4908 int ret = 0;
4887 int flags; 4909 int flags;
4888 ext4_lblk_t lblk; 4910 ext4_lblk_t lblk;
4889 struct timespec tv;
4890 unsigned int blkbits = inode->i_blkbits; 4911 unsigned int blkbits = inode->i_blkbits;
4891 4912
4892 /* Return error if mode is not supported */ 4913 /* Return error if mode is not supported */
@@ -4937,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4937 goto out; 4958 goto out;
4938 } 4959 }
4939 4960
4940 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); 4961 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4962 flags, mode);
4941 if (ret) 4963 if (ret)
4942 goto out; 4964 goto out;
4943 4965
4944 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4966 if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4945 if (IS_ERR(handle)) 4967 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4946 goto out; 4968 EXT4_I(inode)->i_sync_tid);
4947
4948 tv = inode->i_ctime = ext4_current_time(inode);
4949
4950 if (new_size) {
4951 if (new_size > i_size_read(inode)) {
4952 i_size_write(inode, new_size);
4953 inode->i_mtime = tv;
4954 }
4955 if (new_size > EXT4_I(inode)->i_disksize)
4956 ext4_update_i_disksize(inode, new_size);
4957 } else {
4958 /*
4959 * Mark that we allocate beyond EOF so the subsequent truncate
4960 * can proceed even if the new size is the same as i_size.
4961 */
4962 if ((offset + len) > i_size_read(inode))
4963 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4964 } 4969 }
4965 ext4_mark_inode_dirty(handle, inode);
4966 if (file->f_flags & O_SYNC)
4967 ext4_handle_sync(handle);
4968
4969 ext4_journal_stop(handle);
4970out: 4970out:
4971 mutex_unlock(&inode->i_mutex); 4971 mutex_unlock(&inode->i_mutex);
4972 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4972 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
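The rewritten ext4_fallocate() no longer opens a journal handle of its own: the size and timestamp updates moved into ext4_alloc_file_blocks(), and the O_SYNC guarantee is now provided by waiting on the transaction recorded in i_sync_tid. A minimal sketch of that pattern follows; the wrapper name is hypothetical, only the jbd2_complete_transaction() call is taken from the hunk above.

/* Hypothetical helper showing the O_SYNC wait used above. Assumes
 * i_sync_tid was recorded by the transaction that last touched the
 * inode (done inside ext4_alloc_file_blocks() in this series). */
static int fallocate_wait_for_commit(struct file *file)
{
	struct inode *inode = file_inode(file);
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

	if (!(file->f_flags & O_SYNC) || !journal)
		return 0;
	/* blocks until that tid reaches disk, starting the commit if needed */
	return jbd2_complete_transaction(journal, EXT4_I(inode)->i_sync_tid);
}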
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 367a60c07cf0..3aa26e9117c4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file,
1055 } else 1055 } else
1056 copied = block_write_end(file, mapping, pos, 1056 copied = block_write_end(file, mapping, pos,
1057 len, copied, page, fsdata); 1057 len, copied, page, fsdata);
1058
1059 /* 1058 /*
1060 * No need to use i_size_read() here, the i_size 1059 * it's important to update i_size while still holding page lock:
1061 * cannot change under us because we hole i_mutex.
1062 *
1063 * But it's important to update i_size while still holding page lock:
1064 * page writeout could otherwise come in and zero beyond i_size. 1060 * page writeout could otherwise come in and zero beyond i_size.
1065 */ 1061 */
1066 if (pos + copied > inode->i_size) { 1062 i_size_changed = ext4_update_inode_size(inode, pos + copied);
1067 i_size_write(inode, pos + copied);
1068 i_size_changed = 1;
1069 }
1070
1071 if (pos + copied > EXT4_I(inode)->i_disksize) {
1072 /* We need to mark inode dirty even if
1073 * new_i_size is less that inode->i_size
1074 * but greater than i_disksize. (hint delalloc)
1075 */
1076 ext4_update_i_disksize(inode, (pos + copied));
1077 i_size_changed = 1;
1078 }
1079 unlock_page(page); 1063 unlock_page(page);
1080 page_cache_release(page); 1064 page_cache_release(page);
1081 1065
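Both write_end paths now funnel through ext4_update_inode_size(), whose definition is not part of these hunks. Reconstructed from the call sites (the return value feeds i_size_changed above and size_changed below), a plausible shape is:

/* Reconstruction from the call sites, not the verbatim ext4 helper:
 * grow the in-core i_size and the on-disk i_disksize together, and tell
 * the caller whether anything changed so it can mark the inode dirty. */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
	int changed = 0;

	if (newsize > inode->i_size) {
		i_size_write(inode, newsize);
		changed = 1;
	}
	if (newsize > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, newsize);
		changed |= 2;
	}
	return changed;
}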
@@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file,
1123 int ret = 0, ret2; 1107 int ret = 0, ret2;
1124 int partial = 0; 1108 int partial = 0;
1125 unsigned from, to; 1109 unsigned from, to;
1126 loff_t new_i_size; 1110 int size_changed = 0;
1127 1111
1128 trace_ext4_journalled_write_end(inode, pos, len, copied); 1112 trace_ext4_journalled_write_end(inode, pos, len, copied);
1129 from = pos & (PAGE_CACHE_SIZE - 1); 1113 from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file,
1146 if (!partial) 1130 if (!partial)
1147 SetPageUptodate(page); 1131 SetPageUptodate(page);
1148 } 1132 }
1149 new_i_size = pos + copied; 1133 size_changed = ext4_update_inode_size(inode, pos + copied);
1150 if (new_i_size > inode->i_size)
1151 i_size_write(inode, pos+copied);
1152 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1134 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1153 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1135 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1154 if (new_i_size > EXT4_I(inode)->i_disksize) { 1136 unlock_page(page);
1155 ext4_update_i_disksize(inode, new_i_size); 1137 page_cache_release(page);
1138
1139 if (size_changed) {
1156 ret2 = ext4_mark_inode_dirty(handle, inode); 1140 ret2 = ext4_mark_inode_dirty(handle, inode);
1157 if (!ret) 1141 if (!ret)
1158 ret = ret2; 1142 ret = ret2;
1159 } 1143 }
1160 1144
1161 unlock_page(page);
1162 page_cache_release(page);
1163 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1145 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1164 /* if we have allocated more blocks and copied 1146 /* if we have allocated more blocks and copied
1165 * less. We will have blocks allocated outside 1147 * less. We will have blocks allocated outside
@@ -2095,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2095 struct ext4_map_blocks *map = &mpd->map; 2077 struct ext4_map_blocks *map = &mpd->map;
2096 int err; 2078 int err;
2097 loff_t disksize; 2079 loff_t disksize;
2080 int progress = 0;
2098 2081
2099 mpd->io_submit.io_end->offset = 2082 mpd->io_submit.io_end->offset =
2100 ((loff_t)map->m_lblk) << inode->i_blkbits; 2083 ((loff_t)map->m_lblk) << inode->i_blkbits;
@@ -2111,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2111 * is non-zero, a commit should free up blocks. 2094 * is non-zero, a commit should free up blocks.
2112 */ 2095 */
2113 if ((err == -ENOMEM) || 2096 if ((err == -ENOMEM) ||
2114 (err == -ENOSPC && ext4_count_free_clusters(sb))) 2097 (err == -ENOSPC && ext4_count_free_clusters(sb))) {
2098 if (progress)
2099 goto update_disksize;
2115 return err; 2100 return err;
2101 }
2116 ext4_msg(sb, KERN_CRIT, 2102 ext4_msg(sb, KERN_CRIT,
2117 "Delayed block allocation failed for " 2103 "Delayed block allocation failed for "
2118 "inode %lu at logical offset %llu with" 2104 "inode %lu at logical offset %llu with"
@@ -2129,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2129 *give_up_on_write = true; 2115 *give_up_on_write = true;
2130 return err; 2116 return err;
2131 } 2117 }
2118 progress = 1;
2132 /* 2119 /*
2133 * Update buffer state, submit mapped pages, and get us new 2120 * Update buffer state, submit mapped pages, and get us new
2134 * extent to map 2121 * extent to map
2135 */ 2122 */
2136 err = mpage_map_and_submit_buffers(mpd); 2123 err = mpage_map_and_submit_buffers(mpd);
2137 if (err < 0) 2124 if (err < 0)
2138 return err; 2125 goto update_disksize;
2139 } while (map->m_len); 2126 } while (map->m_len);
2140 2127
2128update_disksize:
2141 /* 2129 /*
2142 * Update on-disk size after IO is submitted. Races with 2130 * Update on-disk size after IO is submitted. Races with
2143 * truncate are avoided by checking i_size under i_data_sem. 2131 * truncate are avoided by checking i_size under i_data_sem.
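The mpage_map_and_submit_extent() change introduces a progress flag so that, once at least one extent has been mapped and its pages submitted, a transient failure jumps to the i_disksize update instead of returning immediately; the on-disk size then covers IO that was already issued. The control flow in minimal form, with every name below an illustrative stand-in for the ext4 internals:

struct mapping_ctx;
int map_next_extent(struct mapping_ctx *c);       /* like mpage_map_one_extent() */
int submit_mapped_buffers(struct mapping_ctx *c); /* like mpage_map_and_submit_buffers() */
int work_remaining(struct mapping_ctx *c);
void publish_new_disksize(struct mapping_ctx *c);

static int map_and_submit(struct mapping_ctx *c)
{
	int progress = 0;
	int err;

	do {
		err = map_next_extent(c);
		if (err) {
			/* transient allocation failure: keep what we did */
			if (progress && (err == -ENOMEM || err == -ENOSPC))
				goto update_disksize;
			return err;
		}
		progress = 1;	/* at least one extent was mapped */
		err = submit_mapped_buffers(c);
		if (err < 0)
			goto update_disksize;
	} while (work_remaining(c));
update_disksize:
	publish_new_disksize(c);	/* i_disksize covers submitted IO */
	return err;
}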
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 956027711faf..8b0f9ef517d6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1412 int last = first + count - 1; 1412 int last = first + count - 1;
1413 struct super_block *sb = e4b->bd_sb; 1413 struct super_block *sb = e4b->bd_sb;
1414 1414
1415 if (WARN_ON(count == 0))
1416 return;
1415 BUG_ON(last >= (sb->s_blocksize << 3)); 1417 BUG_ON(last >= (sb->s_blocksize << 3));
1416 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1418 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1417 /* Don't bother if the block group is corrupt. */ 1419 /* Don't bother if the block group is corrupt. */
@@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3221 int err; 3223 int err;
3222 3224
3223 if (pa == NULL) { 3225 if (pa == NULL) {
3226 if (ac->ac_f_ex.fe_len == 0)
3227 return;
3224 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 3228 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3225 if (err) { 3229 if (err) {
3226 /* 3230 /*
@@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3235 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 3239 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3236 ac->ac_f_ex.fe_len); 3240 ac->ac_f_ex.fe_len);
3237 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 3241 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3242 ext4_mb_unload_buddy(&e4b);
3238 return; 3243 return;
3239 } 3244 }
3240 if (pa->pa_type == MB_INODE_PA) 3245 if (pa->pa_type == MB_INODE_PA)
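The second mballoc hunk plugs a buddy reference leak: ext4_discard_allocated_blocks() took the early return inside the locked region without releasing the buddy it had loaded. Reduced to its essentials, using only what the hunk shows:

/* Every successful ext4_mb_load_buddy() must be paired with
 * ext4_mb_unload_buddy(), including on the early return taken right
 * after freeing the blocks under the group lock. */
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
if (err)
	return;
ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
	       ac->ac_f_ex.fe_len);
ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
ext4_mb_unload_buddy(&e4b);	/* the call this patch adds */
return;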
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b147a67baa0d..603e4ebbd0ac 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1227 buffer */ 1227 buffer */
1228 int num = 0; 1228 int num = 0;
1229 ext4_lblk_t nblocks; 1229 ext4_lblk_t nblocks;
1230 int i, err; 1230 int i, err = 0;
1231 int namelen; 1231 int namelen;
1232 1232
1233 *res_dir = NULL; 1233 *res_dir = NULL;
@@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1264 * return. Otherwise, fall back to doing a search the 1264 * return. Otherwise, fall back to doing a search the
1265 * old fashioned way. 1265 * old fashioned way.
1266 */ 1266 */
1267 if (bh || (err != ERR_BAD_DX_DIR)) 1267 if (err == -ENOENT)
1268 return NULL;
1269 if (err && err != ERR_BAD_DX_DIR)
1270 return ERR_PTR(err);
1271 if (bh)
1268 return bh; 1272 return bh;
1269 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1273 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1270 "falling back\n")); 1274 "falling back\n"));
@@ -1295,6 +1299,11 @@ restart:
1295 } 1299 }
1296 num++; 1300 num++;
1297 bh = ext4_getblk(NULL, dir, b++, 0, &err); 1301 bh = ext4_getblk(NULL, dir, b++, 0, &err);
1302 if (unlikely(err)) {
1303 if (ra_max == 0)
1304 return ERR_PTR(err);
1305 break;
1306 }
1298 bh_use[ra_max] = bh; 1307 bh_use[ra_max] = bh;
1299 if (bh) 1308 if (bh)
1300 ll_rw_block(READ | REQ_META | REQ_PRIO, 1309 ll_rw_block(READ | REQ_META | REQ_PRIO,
@@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1417 return ERR_PTR(-ENAMETOOLONG); 1426 return ERR_PTR(-ENAMETOOLONG);
1418 1427
1419 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 1428 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1429 if (IS_ERR(bh))
1430 return (struct dentry *) bh;
1420 inode = NULL; 1431 inode = NULL;
1421 if (bh) { 1432 if (bh) {
1422 __u32 ino = le32_to_cpu(de->inode); 1433 __u32 ino = le32_to_cpu(de->inode);
@@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1450 struct buffer_head *bh; 1461 struct buffer_head *bh;
1451 1462
1452 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); 1463 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1464 if (IS_ERR(bh))
1465 return (struct dentry *) bh;
1453 if (!bh) 1466 if (!bh)
1454 return ERR_PTR(-ENOENT); 1467 return ERR_PTR(-ENOENT);
1455 ino = le32_to_cpu(de->inode); 1468 ino = le32_to_cpu(de->inode);
@@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2727 2740
2728 retval = -ENOENT; 2741 retval = -ENOENT;
2729 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2742 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2743 if (IS_ERR(bh))
2744 return PTR_ERR(bh);
2730 if (!bh) 2745 if (!bh)
2731 goto end_rmdir; 2746 goto end_rmdir;
2732 2747
@@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2794 2809
2795 retval = -ENOENT; 2810 retval = -ENOENT;
2796 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2811 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2812 if (IS_ERR(bh))
2813 return PTR_ERR(bh);
2797 if (!bh) 2814 if (!bh)
2798 goto end_unlink; 2815 goto end_unlink;
2799 2816
@@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3121 struct ext4_dir_entry_2 *de; 3138 struct ext4_dir_entry_2 *de;
3122 3139
3123 bh = ext4_find_entry(dir, d_name, &de, NULL); 3140 bh = ext4_find_entry(dir, d_name, &de, NULL);
3141 if (IS_ERR(bh))
3142 return PTR_ERR(bh);
3124 if (bh) { 3143 if (bh) {
3125 retval = ext4_delete_entry(handle, dir, de, bh); 3144 retval = ext4_delete_entry(handle, dir, de, bh);
3126 brelse(bh); 3145 brelse(bh);
@@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3128 return retval; 3147 return retval;
3129} 3148}
3130 3149
3131static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) 3150static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
3151 int force_reread)
3132{ 3152{
3133 int retval; 3153 int retval;
3134 /* 3154 /*
@@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
3140 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || 3160 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
3141 ent->de->name_len != ent->dentry->d_name.len || 3161 ent->de->name_len != ent->dentry->d_name.len ||
3142 strncmp(ent->de->name, ent->dentry->d_name.name, 3162 strncmp(ent->de->name, ent->dentry->d_name.name,
3143 ent->de->name_len)) { 3163 ent->de->name_len) ||
3164 force_reread) {
3144 retval = ext4_find_delete_entry(handle, ent->dir, 3165 retval = ext4_find_delete_entry(handle, ent->dir,
3145 &ent->dentry->d_name); 3166 &ent->dentry->d_name);
3146 } else { 3167 } else {
@@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3191 .dentry = new_dentry, 3212 .dentry = new_dentry,
3192 .inode = new_dentry->d_inode, 3213 .inode = new_dentry->d_inode,
3193 }; 3214 };
3215 int force_reread;
3194 int retval; 3216 int retval;
3195 3217
3196 dquot_initialize(old.dir); 3218 dquot_initialize(old.dir);
@@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3202 dquot_initialize(new.inode); 3224 dquot_initialize(new.inode);
3203 3225
3204 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); 3226 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3227 if (IS_ERR(old.bh))
3228 return PTR_ERR(old.bh);
3205 /* 3229 /*
3206 * Check for inode number is _not_ due to possible IO errors. 3230 * Check for inode number is _not_ due to possible IO errors.
3207 * We might rmdir the source, keep it as pwd of some process 3231 * We might rmdir the source, keep it as pwd of some process
@@ -3214,6 +3238,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3214 3238
3215 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3239 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3216 &new.de, &new.inlined); 3240 &new.de, &new.inlined);
3241 if (IS_ERR(new.bh)) {
3242 retval = PTR_ERR(new.bh);
3243 new.bh = NULL;
3244 goto end_rename;
3245 }
3217 if (new.bh) { 3246 if (new.bh) {
3218 if (!new.inode) { 3247 if (!new.inode) {
3219 brelse(new.bh); 3248 brelse(new.bh);
@@ -3246,6 +3275,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3246 if (retval) 3275 if (retval)
3247 goto end_rename; 3276 goto end_rename;
3248 } 3277 }
3278 /*
3279 * If we're renaming a file within an inline_data dir and adding or
3280 * setting the new dirent causes a conversion from inline_data to
3281 * extents/blockmap, we need to force the dirent delete code to
3282 * re-read the directory, or else we end up trying to delete a dirent
3283 * from what is now the extent tree root (or a block map).
3284 */
3285 force_reread = (new.dir->i_ino == old.dir->i_ino &&
3286 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
3249 if (!new.bh) { 3287 if (!new.bh) {
3250 retval = ext4_add_entry(handle, new.dentry, old.inode); 3288 retval = ext4_add_entry(handle, new.dentry, old.inode);
3251 if (retval) 3289 if (retval)
@@ -3256,6 +3294,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3256 if (retval) 3294 if (retval)
3257 goto end_rename; 3295 goto end_rename;
3258 } 3296 }
3297 if (force_reread)
3298 force_reread = !ext4_test_inode_flag(new.dir,
3299 EXT4_INODE_INLINE_DATA);
3259 3300
3260 /* 3301 /*
3261 * Like most other Unix systems, set the ctime for inodes on a 3302 * Like most other Unix systems, set the ctime for inodes on a
@@ -3267,7 +3308,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3267 /* 3308 /*
3268 * ok, that's it 3309 * ok, that's it
3269 */ 3310 */
3270 ext4_rename_delete(handle, &old); 3311 ext4_rename_delete(handle, &old, force_reread);
3271 3312
3272 if (new.inode) { 3313 if (new.inode) {
3273 ext4_dec_count(handle, new.inode); 3314 ext4_dec_count(handle, new.inode);
@@ -3330,6 +3371,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3330 3371
3331 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, 3372 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3332 &old.de, &old.inlined); 3373 &old.de, &old.inlined);
3374 if (IS_ERR(old.bh))
3375 return PTR_ERR(old.bh);
3333 /* 3376 /*
3334 * Check for inode number is _not_ due to possible IO errors. 3377 * Check for inode number is _not_ due to possible IO errors.
3335 * We might rmdir the source, keep it as pwd of some process 3378 * We might rmdir the source, keep it as pwd of some process
@@ -3342,6 +3385,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3342 3385
3343 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3386 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3344 &new.de, &new.inlined); 3387 &new.de, &new.inlined);
3388 if (IS_ERR(new.bh)) {
3389 retval = PTR_ERR(new.bh);
3390 new.bh = NULL;
3391 goto end_rename;
3392 }
3345 3393
3346 /* RENAME_EXCHANGE case: old *and* new must both exist */ 3394 /* RENAME_EXCHANGE case: old *and* new must both exist */
3347 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) 3395 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f03e2e..1e43b905ff98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@ handle_bb:
575 bh = bclean(handle, sb, block); 575 bh = bclean(handle, sb, block);
576 if (IS_ERR(bh)) { 576 if (IS_ERR(bh)) {
577 err = PTR_ERR(bh); 577 err = PTR_ERR(bh);
578 bh = NULL;
578 goto out; 579 goto out;
579 } 580 }
580 overhead = ext4_group_overhead_blocks(sb, group); 581 overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@ handle_ib:
603 bh = bclean(handle, sb, block); 604 bh = bclean(handle, sb, block);
604 if (IS_ERR(bh)) { 605 if (IS_ERR(bh)) {
605 err = PTR_ERR(bh); 606 err = PTR_ERR(bh);
607 bh = NULL;
606 goto out; 608 goto out;
607 } 609 }
608 610
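Both resize.c hunks fix the same cleanup hazard: bclean() returns an ERR_PTR() on failure, and the shared exit label releases bh, so without nulling it brelse() would be handed an error pointer. The shape of the fix:

bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
	err = PTR_ERR(bh);
	bh = NULL;		/* brelse(NULL) below is a safe no-op */
	goto out;
}
/* ... */
out:
	brelse(bh);
	return err;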
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32b43ad154b9..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3181 3181
3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3184 /* journal checksum v2 */ 3184 /* journal checksum v3 */
3185 compat = 0; 3185 compat = 0;
3186 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; 3186 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3187 } else { 3187 } else {
3188 /* journal checksum v1 */ 3188 /* journal checksum v1 */
3189 compat = JBD2_FEATURE_COMPAT_CHECKSUM; 3189 compat = JBD2_FEATURE_COMPAT_CHECKSUM;
@@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3205 jbd2_journal_clear_features(sbi->s_journal, 3205 jbd2_journal_clear_features(sbi->s_journal,
3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3208 JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3208 JBD2_FEATURE_INCOMPAT_CSUM_V2); 3209 JBD2_FEATURE_INCOMPAT_CSUM_V2);
3209 } 3210 }
3210 3211
@@ -3891,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3891 /* Register extent status tree shrinker */ 3892 /* Register extent status tree shrinker */
3892 ext4_es_register_shrinker(sbi); 3893 ext4_es_register_shrinker(sbi);
3893 3894
3894 if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { 3895 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
3896 if (err) {
3895 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3897 ext4_msg(sb, KERN_ERR, "insufficient memory");
3896 goto failed_mount3; 3898 goto failed_mount3;
3897 } 3899 }
@@ -4105,17 +4107,20 @@ no_journal:
4105 block = ext4_count_free_clusters(sb); 4107 block = ext4_count_free_clusters(sb);
4106 ext4_free_blocks_count_set(sbi->s_es, 4108 ext4_free_blocks_count_set(sbi->s_es,
4107 EXT4_C2B(sbi, block)); 4109 EXT4_C2B(sbi, block));
4108 err = percpu_counter_init(&sbi->s_freeclusters_counter, block); 4110 err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4111 GFP_KERNEL);
4109 if (!err) { 4112 if (!err) {
4110 unsigned long freei = ext4_count_free_inodes(sb); 4113 unsigned long freei = ext4_count_free_inodes(sb);
4111 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4114 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4112 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); 4115 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4116 GFP_KERNEL);
4113 } 4117 }
4114 if (!err) 4118 if (!err)
4115 err = percpu_counter_init(&sbi->s_dirs_counter, 4119 err = percpu_counter_init(&sbi->s_dirs_counter,
4116 ext4_count_dirs(sb)); 4120 ext4_count_dirs(sb), GFP_KERNEL);
4117 if (!err) 4121 if (!err)
4118 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); 4122 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4123 GFP_KERNEL);
4119 if (err) { 4124 if (err) {
4120 ext4_msg(sb, KERN_ERR, "insufficient memory"); 4125 ext4_msg(sb, KERN_ERR, "insufficient memory");
4121 goto failed_mount6; 4126 goto failed_mount6;
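These super.c hunks track an API change: percpu_counter_init() gained an explicit gfp_t parameter, so the allocation context is stated by the caller rather than assumed. A minimal, self-contained usage sketch (the counter name is illustrative; mount-time paths like the ones above can safely pass GFP_KERNEL):

#include <linux/percpu_counter.h>

static struct percpu_counter nr_things;

static int things_init(void)
{
	int err = percpu_counter_init(&nr_things, 0, GFP_KERNEL);

	if (err)
		return err;
	percpu_counter_inc(&nr_things);
	percpu_counter_destroy(&nr_things);
	return 0;
}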
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe1054fce..736a348509f7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@ config F2FS_STAT_FS
23 mounted as f2fs. Each file shows the whole f2fs information. 23 mounted as f2fs. Each file shows the whole f2fs information.
24 24
25 /sys/kernel/debug/f2fs/status includes: 25 /sys/kernel/debug/f2fs/status includes:
26 - major file system information managed by f2fs currently 26 - major filesystem information managed by f2fs currently
27 - average SIT information about whole segments 27 - average SIT information about whole segments
28 - current memory footprint consumed by f2fs. 28 - current memory footprint consumed by f2fs.
29 29
@@ -68,6 +68,6 @@ config F2FS_CHECK_FS
68 bool "F2FS consistency checking feature" 68 bool "F2FS consistency checking feature"
69 depends on F2FS_FS 69 depends on F2FS_FS
70 help 70 help
71 Enables BUG_ONs which check the file system consistency in runtime. 71 Enables BUG_ONs which check the filesystem consistency at runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6aeed5bada52..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
72 return page; 72 return page;
73} 73}
74 74
75static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) 75struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
76{
77 bool readahead = false;
78 struct page *page;
79
80 page = find_get_page(META_MAPPING(sbi), index);
81 if (!page || !PageUptodate(page))
82 readahead = true;
83 f2fs_put_page(page, 0);
84
85 if (readahead)
86 ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
87 return get_meta_page(sbi, index);
88}
89
90static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
76{ 91{
77 switch (type) { 92 switch (type) {
78 case META_NAT: 93 case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
82 case META_SSA: 97 case META_SSA:
83 case META_CP: 98 case META_CP:
84 return 0; 99 return 0;
100 case META_POR:
101 return MAX_BLKADDR(sbi);
85 default: 102 default:
86 BUG(); 103 BUG();
87 } 104 }
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
90/* 107/*
91 * Readahead CP/NAT/SIT/SSA pages 108 * Readahead CP/NAT/SIT/SSA pages
92 */ 109 */
93int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) 110int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
94{ 111{
95 block_t prev_blk_addr = 0; 112 block_t prev_blk_addr = 0;
96 struct page *page; 113 struct page *page;
97 int blkno = start; 114 block_t blkno = start;
98 int max_blks = get_max_meta_blks(sbi, type); 115 block_t max_blks = get_max_meta_blks(sbi, type);
99 116
100 struct f2fs_io_info fio = { 117 struct f2fs_io_info fio = {
101 .type = META, 118 .type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
125 break; 142 break;
126 case META_SSA: 143 case META_SSA:
127 case META_CP: 144 case META_CP:
128 /* get ssa/cp block addr */ 145 case META_POR:
146 if (unlikely(blkno >= max_blks))
147 goto out;
148 if (unlikely(blkno < SEG0_BLKADDR(sbi)))
149 goto out;
129 blk_addr = blkno; 150 blk_addr = blkno;
130 break; 151 break;
131 default: 152 default:
@@ -151,8 +172,7 @@ out:
151static int f2fs_write_meta_page(struct page *page, 172static int f2fs_write_meta_page(struct page *page,
152 struct writeback_control *wbc) 173 struct writeback_control *wbc)
153{ 174{
154 struct inode *inode = page->mapping->host; 175 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
155 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
156 176
157 trace_f2fs_writepage(page, META); 177 trace_f2fs_writepage(page, META);
158 178
@@ -160,14 +180,11 @@ static int f2fs_write_meta_page(struct page *page,
160 goto redirty_out; 180 goto redirty_out;
161 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
162 goto redirty_out; 182 goto redirty_out;
163 183 if (unlikely(f2fs_cp_error(sbi)))
164 /* Should not write any meta pages, if any IO error was occurred */ 184 goto redirty_out;
165 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
166 goto no_write;
167 185
168 f2fs_wait_on_page_writeback(page, META); 186 f2fs_wait_on_page_writeback(page, META);
169 write_meta_page(sbi, page); 187 write_meta_page(sbi, page);
170no_write:
171 dec_page_count(sbi, F2FS_DIRTY_META); 188 dec_page_count(sbi, F2FS_DIRTY_META);
172 unlock_page(page); 189 unlock_page(page);
173 return 0; 190 return 0;
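The repeated page->mapping->host->i_sb chains throughout f2fs collapse into accessor helpers in this series. Judging from the call sites, they are thin wrappers of roughly this shape; this is a reconstruction, not the verbatim f2fs.h text:

static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
{
	return F2FS_SB(inode->i_sb);
}

static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
{
	return F2FS_I_SB(mapping->host);
}

static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
{
	return F2FS_M_SB(page->mapping);
}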
@@ -180,7 +197,7 @@ redirty_out:
180static int f2fs_write_meta_pages(struct address_space *mapping, 197static int f2fs_write_meta_pages(struct address_space *mapping,
181 struct writeback_control *wbc) 198 struct writeback_control *wbc)
182{ 199{
183 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 200 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
184 long diff, written; 201 long diff, written;
185 202
186 trace_f2fs_writepages(mapping->host, wbc, META); 203 trace_f2fs_writepages(mapping->host, wbc, META);
@@ -262,15 +279,12 @@ continue_unlock:
262 279
263static int f2fs_set_meta_page_dirty(struct page *page) 280static int f2fs_set_meta_page_dirty(struct page *page)
264{ 281{
265 struct address_space *mapping = page->mapping;
266 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
267
268 trace_f2fs_set_page_dirty(page, META); 282 trace_f2fs_set_page_dirty(page, META);
269 283
270 SetPageUptodate(page); 284 SetPageUptodate(page);
271 if (!PageDirty(page)) { 285 if (!PageDirty(page)) {
272 __set_page_dirty_nobuffers(page); 286 __set_page_dirty_nobuffers(page);
273 inc_page_count(sbi, F2FS_DIRTY_META); 287 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
274 return 1; 288 return 1;
275 } 289 }
276 return 0; 290 return 0;
@@ -348,7 +362,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
348 return e ? true : false; 362 return e ? true : false;
349} 363}
350 364
351static void release_dirty_inode(struct f2fs_sb_info *sbi) 365void release_dirty_inode(struct f2fs_sb_info *sbi)
352{ 366{
353 struct ino_entry *e, *tmp; 367 struct ino_entry *e, *tmp;
354 int i; 368 int i;
@@ -381,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
381void release_orphan_inode(struct f2fs_sb_info *sbi) 395void release_orphan_inode(struct f2fs_sb_info *sbi)
382{ 396{
383 spin_lock(&sbi->ino_lock[ORPHAN_INO]); 397 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
384 f2fs_bug_on(sbi->n_orphans == 0); 398 f2fs_bug_on(sbi, sbi->n_orphans == 0);
385 sbi->n_orphans--; 399 sbi->n_orphans--;
386 spin_unlock(&sbi->ino_lock[ORPHAN_INO]); 400 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
387} 401}
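f2fs_bug_on() now takes the sbi as its first argument so a failed consistency check can be downgraded from a crash to a deferred fsck request. A plausible definition, reconstructed from the converted call sites and the new need_fsck flag written into the checkpoint below; the exact macro in f2fs.h may differ:

#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
#else
#define f2fs_bug_on(sbi, condition)					\
	do {								\
		if (unlikely(condition)) {				\
			WARN_ON(1);					\
			(sbi)->need_fsck = true;			\
		}							\
	} while (0)
#endif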
@@ -401,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
401static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 415static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
402{ 416{
403 struct inode *inode = f2fs_iget(sbi->sb, ino); 417 struct inode *inode = f2fs_iget(sbi->sb, ino);
404 f2fs_bug_on(IS_ERR(inode)); 418 f2fs_bug_on(sbi, IS_ERR(inode));
405 clear_nlink(inode); 419 clear_nlink(inode);
406 420
407 /* truncate all the data during iput */ 421 /* truncate all the data during iput */
@@ -446,8 +460,8 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
446 struct f2fs_orphan_block *orphan_blk = NULL; 460 struct f2fs_orphan_block *orphan_blk = NULL;
447 unsigned int nentries = 0; 461 unsigned int nentries = 0;
448 unsigned short index; 462 unsigned short index;
449 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + 463 unsigned short orphan_blocks =
450 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); 464 (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
451 struct page *page = NULL; 465 struct page *page = NULL;
452 struct ino_entry *orphan = NULL; 466 struct ino_entry *orphan = NULL;
453 467
@@ -462,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
462 list_for_each_entry(orphan, head, list) { 476 list_for_each_entry(orphan, head, list) {
463 if (!page) { 477 if (!page) {
464 page = find_get_page(META_MAPPING(sbi), start_blk++); 478 page = find_get_page(META_MAPPING(sbi), start_blk++);
465 f2fs_bug_on(!page); 479 f2fs_bug_on(sbi, !page);
466 orphan_blk = 480 orphan_blk =
467 (struct f2fs_orphan_block *)page_address(page); 481 (struct f2fs_orphan_block *)page_address(page);
468 memset(orphan_blk, 0, sizeof(*orphan_blk)); 482 memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -622,7 +636,7 @@ fail_no_cp:
622 636
623static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) 637static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
624{ 638{
625 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
626 640
627 if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) 641 if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
628 return -EEXIST; 642 return -EEXIST;
@@ -634,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
634 return 0; 648 return 0;
635} 649}
636 650
637void set_dirty_dir_page(struct inode *inode, struct page *page) 651void update_dirty_page(struct inode *inode, struct page *page)
638{ 652{
639 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 653 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
640 struct dir_inode_entry *new; 654 struct dir_inode_entry *new;
641 int ret = 0; 655 int ret = 0;
642 656
643 if (!S_ISDIR(inode->i_mode)) 657 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
644 return; 658 return;
645 659
660 if (!S_ISDIR(inode->i_mode)) {
661 inode_inc_dirty_pages(inode);
662 goto out;
663 }
664
646 new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 665 new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
647 new->inode = inode; 666 new->inode = inode;
648 INIT_LIST_HEAD(&new->list); 667 INIT_LIST_HEAD(&new->list);
649 668
650 spin_lock(&sbi->dir_inode_lock); 669 spin_lock(&sbi->dir_inode_lock);
651 ret = __add_dirty_inode(inode, new); 670 ret = __add_dirty_inode(inode, new);
652 inode_inc_dirty_dents(inode); 671 inode_inc_dirty_pages(inode);
653 SetPagePrivate(page);
654 spin_unlock(&sbi->dir_inode_lock); 672 spin_unlock(&sbi->dir_inode_lock);
655 673
656 if (ret) 674 if (ret)
657 kmem_cache_free(inode_entry_slab, new); 675 kmem_cache_free(inode_entry_slab, new);
676out:
677 SetPagePrivate(page);
658} 678}
659 679
660void add_dirty_dir_inode(struct inode *inode) 680void add_dirty_dir_inode(struct inode *inode)
661{ 681{
662 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 682 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
663 struct dir_inode_entry *new = 683 struct dir_inode_entry *new =
664 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 684 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
665 int ret = 0; 685 int ret = 0;
@@ -677,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
677 697
678void remove_dirty_dir_inode(struct inode *inode) 698void remove_dirty_dir_inode(struct inode *inode)
679{ 699{
680 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 700 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
681 struct dir_inode_entry *entry; 701 struct dir_inode_entry *entry;
682 702
683 if (!S_ISDIR(inode->i_mode)) 703 if (!S_ISDIR(inode->i_mode))
684 return; 704 return;
685 705
686 spin_lock(&sbi->dir_inode_lock); 706 spin_lock(&sbi->dir_inode_lock);
687 if (get_dirty_dents(inode) || 707 if (get_dirty_pages(inode) ||
688 !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { 708 !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
689 spin_unlock(&sbi->dir_inode_lock); 709 spin_unlock(&sbi->dir_inode_lock);
690 return; 710 return;
@@ -737,7 +757,7 @@ retry:
737/* 757/*
738 * Freeze all the FS-operations for checkpoint. 758 * Freeze all the FS-operations for checkpoint.
739 */ 759 */
740static void block_operations(struct f2fs_sb_info *sbi) 760static int block_operations(struct f2fs_sb_info *sbi)
741{ 761{
742 struct writeback_control wbc = { 762 struct writeback_control wbc = {
743 .sync_mode = WB_SYNC_ALL, 763 .sync_mode = WB_SYNC_ALL,
@@ -745,6 +765,7 @@ static void block_operations(struct f2fs_sb_info *sbi)
745 .for_reclaim = 0, 765 .for_reclaim = 0,
746 }; 766 };
747 struct blk_plug plug; 767 struct blk_plug plug;
768 int err = 0;
748 769
749 blk_start_plug(&plug); 770 blk_start_plug(&plug);
750 771
@@ -754,11 +775,15 @@ retry_flush_dents:
754 if (get_pages(sbi, F2FS_DIRTY_DENTS)) { 775 if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
755 f2fs_unlock_all(sbi); 776 f2fs_unlock_all(sbi);
756 sync_dirty_dir_inodes(sbi); 777 sync_dirty_dir_inodes(sbi);
778 if (unlikely(f2fs_cp_error(sbi))) {
779 err = -EIO;
780 goto out;
781 }
757 goto retry_flush_dents; 782 goto retry_flush_dents;
758 } 783 }
759 784
760 /* 785 /*
761 * POR: we should ensure that there is no dirty node pages 786 * POR: we should ensure that there are no dirty node pages
762 * until finishing nat/sit flush. 787 * until finishing nat/sit flush.
763 */ 788 */
764retry_flush_nodes: 789retry_flush_nodes:
@@ -767,9 +792,16 @@ retry_flush_nodes:
767 if (get_pages(sbi, F2FS_DIRTY_NODES)) { 792 if (get_pages(sbi, F2FS_DIRTY_NODES)) {
768 up_write(&sbi->node_write); 793 up_write(&sbi->node_write);
769 sync_node_pages(sbi, 0, &wbc); 794 sync_node_pages(sbi, 0, &wbc);
795 if (unlikely(f2fs_cp_error(sbi))) {
796 f2fs_unlock_all(sbi);
797 err = -EIO;
798 goto out;
799 }
770 goto retry_flush_nodes; 800 goto retry_flush_nodes;
771 } 801 }
802out:
772 blk_finish_plug(&plug); 803 blk_finish_plug(&plug);
804 return err;
773} 805}
774 806
775static void unblock_operations(struct f2fs_sb_info *sbi) 807static void unblock_operations(struct f2fs_sb_info *sbi)
@@ -793,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
793 finish_wait(&sbi->cp_wait, &wait); 825 finish_wait(&sbi->cp_wait, &wait);
794} 826}
795 827
796static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 828static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
797{ 829{
798 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 830 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
799 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 831 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
800 nid_t last_nid = 0; 832 struct f2fs_nm_info *nm_i = NM_I(sbi);
833 nid_t last_nid = nm_i->next_scan_nid;
801 block_t start_blk; 834 block_t start_blk;
802 struct page *cp_page; 835 struct page *cp_page;
803 unsigned int data_sum_blocks, orphan_blocks; 836 unsigned int data_sum_blocks, orphan_blocks;
@@ -813,8 +846,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
813 discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); 846 discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
814 847
815 /* Flush all the NAT/SIT pages */ 848 /* Flush all the NAT/SIT pages */
816 while (get_pages(sbi, F2FS_DIRTY_META)) 849 while (get_pages(sbi, F2FS_DIRTY_META)) {
817 sync_meta_pages(sbi, META, LONG_MAX); 850 sync_meta_pages(sbi, META, LONG_MAX);
851 if (unlikely(f2fs_cp_error(sbi)))
852 return;
853 }
818 854
819 next_free_nid(sbi, &last_nid); 855 next_free_nid(sbi, &last_nid);
820 856
@@ -825,7 +861,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
825 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); 861 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
826 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); 862 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
827 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); 863 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
828 for (i = 0; i < 3; i++) { 864 for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
829 ckpt->cur_node_segno[i] = 865 ckpt->cur_node_segno[i] =
830 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); 866 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
831 ckpt->cur_node_blkoff[i] = 867 ckpt->cur_node_blkoff[i] =
@@ -833,7 +869,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
833 ckpt->alloc_type[i + CURSEG_HOT_NODE] = 869 ckpt->alloc_type[i + CURSEG_HOT_NODE] =
834 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); 870 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
835 } 871 }
836 for (i = 0; i < 3; i++) { 872 for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
837 ckpt->cur_data_segno[i] = 873 ckpt->cur_data_segno[i] =
838 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); 874 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
839 ckpt->cur_data_blkoff[i] = 875 ckpt->cur_data_blkoff[i] =
@@ -848,24 +884,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
848 884
849 /* 2 cp + n data seg summary + orphan inode blocks */ 885 /* 2 cp + n data seg summary + orphan inode blocks */
850 data_sum_blocks = npages_for_summary_flush(sbi); 886 data_sum_blocks = npages_for_summary_flush(sbi);
851 if (data_sum_blocks < 3) 887 if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
852 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 888 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
853 else 889 else
854 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 890 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
855 891
856 orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) 892 orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
857 / F2FS_ORPHANS_PER_BLOCK;
858 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + 893 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
859 orphan_blocks); 894 orphan_blocks);
860 895
861 if (is_umount) { 896 if (cpc->reason == CP_UMOUNT) {
862 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); 897 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
863 ckpt->cp_pack_total_block_count = cpu_to_le32(2 + 898 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
864 cp_payload_blks + data_sum_blocks + 899 cp_payload_blks + data_sum_blocks +
865 orphan_blocks + NR_CURSEG_NODE_TYPE); 900 orphan_blocks + NR_CURSEG_NODE_TYPE);
866 } else { 901 } else {
867 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); 902 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
868 ckpt->cp_pack_total_block_count = cpu_to_le32(2 + 903 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
869 cp_payload_blks + data_sum_blocks + 904 cp_payload_blks + data_sum_blocks +
870 orphan_blocks); 905 orphan_blocks);
871 } 906 }
@@ -875,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
875 else 910 else
876 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 911 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
877 912
913 if (sbi->need_fsck)
914 set_ckpt_flags(ckpt, CP_FSCK_FLAG);
915
878 /* update SIT/NAT bitmap */ 916 /* update SIT/NAT bitmap */
879 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); 917 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
880 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 918 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -909,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
909 947
910 write_data_summaries(sbi, start_blk); 948 write_data_summaries(sbi, start_blk);
911 start_blk += data_sum_blocks; 949 start_blk += data_sum_blocks;
912 if (is_umount) { 950 if (cpc->reason == CP_UMOUNT) {
913 write_node_summaries(sbi, start_blk); 951 write_node_summaries(sbi, start_blk);
914 start_blk += NR_CURSEG_NODE_TYPE; 952 start_blk += NR_CURSEG_NODE_TYPE;
915 } 953 }
@@ -924,6 +962,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
924 /* wait for previous submitted node/meta pages writeback */ 962 /* wait for previous submitted node/meta pages writeback */
925 wait_on_all_pages_writeback(sbi); 963 wait_on_all_pages_writeback(sbi);
926 964
965 if (unlikely(f2fs_cp_error(sbi)))
966 return;
967
927 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); 968 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
928 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); 969 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
929 970
@@ -934,27 +975,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
934 /* Here, we only have one bio having CP pack */ 975 /* Here, we only have one bio having CP pack */
935 sync_meta_pages(sbi, META_FLUSH, LONG_MAX); 976 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
936 977
937 if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 978 release_dirty_inode(sbi);
938 clear_prefree_segments(sbi); 979
939 release_dirty_inode(sbi); 980 if (unlikely(f2fs_cp_error(sbi)))
940 F2FS_RESET_SB_DIRT(sbi); 981 return;
941 } 982
983 clear_prefree_segments(sbi);
984 F2FS_RESET_SB_DIRT(sbi);
942} 985}
943 986
944/* 987/*
945 * We guarantee that this checkpoint procedure should not fail. 988 * We guarantee that this checkpoint procedure will not fail.
946 */ 989 */
947void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 990void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
948{ 991{
949 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 992 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
950 unsigned long long ckpt_ver; 993 unsigned long long ckpt_ver;
951 994
952 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); 995 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
953 996
954 mutex_lock(&sbi->cp_mutex); 997 mutex_lock(&sbi->cp_mutex);
955 block_operations(sbi);
956 998
957 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); 999 if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
1000 goto out;
1001 if (unlikely(f2fs_cp_error(sbi)))
1002 goto out;
1003 if (block_operations(sbi))
1004 goto out;
1005
1006 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
958 1007
959 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1008 f2fs_submit_merged_bio(sbi, DATA, WRITE);
960 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1009 f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -970,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
970 1019
971 /* write cached NAT/SIT entries to NAT/SIT area */ 1020 /* write cached NAT/SIT entries to NAT/SIT area */
972 flush_nat_entries(sbi); 1021 flush_nat_entries(sbi);
973 flush_sit_entries(sbi); 1022 flush_sit_entries(sbi, cpc);
974 1023
975 /* unlock all the fs_lock[] in do_checkpoint() */ 1024 /* unlock all the fs_lock[] in do_checkpoint() */
976 do_checkpoint(sbi, is_umount); 1025 do_checkpoint(sbi, cpc);
977 1026
978 unblock_operations(sbi); 1027 unblock_operations(sbi);
979 mutex_unlock(&sbi->cp_mutex);
980
981 stat_inc_cp_count(sbi->stat_info); 1028 stat_inc_cp_count(sbi->stat_info);
982 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 1029out:
1030 mutex_unlock(&sbi->cp_mutex);
1031 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
983} 1032}
984 1033
985void init_ino_entry_info(struct f2fs_sb_info *sbi) 1034void init_ino_entry_info(struct f2fs_sb_info *sbi)
@@ -999,8 +1048,8 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
999 * for cp pack we can have max 1020*504 orphan entries 1048 * for cp pack we can have max 1020*504 orphan entries
1000 */ 1049 */
1001 sbi->n_orphans = 0; 1050 sbi->n_orphans = 0;
1002 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) 1051 sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
1003 * F2FS_ORPHANS_PER_BLOCK; 1052 NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
1004} 1053}
1005 1054
1006int __init create_checkpoint_caches(void) 1055int __init create_checkpoint_caches(void)
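The write_checkpoint() signature change replaces the old is_umount boolean with a struct cp_control, so callers can express more than two checkpoint reasons. Only CP_UMOUNT and CP_DISCARD are visible in these hunks; the other fields below are assumptions based on flush_sit_entries(sbi, cpc) taking the structure for the discard path:

/* Implied shape, reconstructed from the call sites above: */
struct cp_control {
	int reason;		/* CP_UMOUNT, CP_DISCARD, ... */
	__u64 trim_start;	/* assumed: consumed by the CP_DISCARD path */
	__u64 trim_end;
	__u64 trim_minlen;
};

A caller then looks like:

	struct cp_control cpc = { .reason = CP_UMOUNT };
	write_checkpoint(sbi, &cpc);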
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 03313099c51c..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
53 struct page *page = bvec->bv_page; 53 struct page *page = bvec->bv_page;
54 54
55 if (unlikely(err)) { 55 if (unlikely(err)) {
56 SetPageError(page); 56 set_page_dirty(page);
57 set_bit(AS_EIO, &page->mapping->flags); 57 set_bit(AS_EIO, &page->mapping->flags);
58 f2fs_stop_checkpoint(sbi); 58 f2fs_stop_checkpoint(sbi);
59 } 59 }
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
85 bio = bio_alloc(GFP_NOIO, npages); 85 bio = bio_alloc(GFP_NOIO, npages);
86 86
87 bio->bi_bdev = sbi->sb->s_bdev; 87 bio->bi_bdev = sbi->sb->s_bdev;
88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; 89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
90 bio->bi_private = sbi; 90 bio->bi_private = sbi;
91 91
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
193 __submit_merged_bio(io); 193 __submit_merged_bio(io);
194alloc_new: 194alloc_new:
195 if (io->bio == NULL) { 195 if (io->bio == NULL) {
196 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 196 int bio_blocks = MAX_BIO_BLOCKS(sbi);
197 197
198 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); 198 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
199 io->fio = *fio; 199 io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
236 236
237int reserve_new_block(struct dnode_of_data *dn) 237int reserve_new_block(struct dnode_of_data *dn)
238{ 238{
239 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
240 240
241 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 241 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
242 return -EPERM; 242 return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
258 int err; 258 int err;
259 259
260 /* if inode_page exists, index should be zero */ 260 /* if inode_page exists, index should be zero */
261 f2fs_bug_on(!need_put && index); 261 f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
262 262
263 err = get_dnode_of_data(dn, index, ALLOC_NODE); 263 err = get_dnode_of_data(dn, index, ALLOC_NODE);
264 if (err) 264 if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
321 block_t start_blkaddr, end_blkaddr; 321 block_t start_blkaddr, end_blkaddr;
322 int need_update = true; 322 int need_update = true;
323 323
324 f2fs_bug_on(blk_addr == NEW_ADDR); 324 f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
325 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 325 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
326 dn->ofs_in_node; 326 dn->ofs_in_node;
327 327
@@ -396,7 +396,6 @@ end_update:
396 396
397struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 397struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
398{ 398{
399 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
400 struct address_space *mapping = inode->i_mapping; 399 struct address_space *mapping = inode->i_mapping;
401 struct dnode_of_data dn; 400 struct dnode_of_data dn;
402 struct page *page; 401 struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
429 return page; 428 return page;
430 } 429 }
431 430
432 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 431 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
433 sync ? READ_SYNC : READA); 432 sync ? READ_SYNC : READA);
434 if (err) 433 if (err)
435 return ERR_PTR(err); 434 return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
451 */ 450 */
452struct page *get_lock_data_page(struct inode *inode, pgoff_t index) 451struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
453{ 452{
454 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
455 struct address_space *mapping = inode->i_mapping; 453 struct address_space *mapping = inode->i_mapping;
456 struct dnode_of_data dn; 454 struct dnode_of_data dn;
457 struct page *page; 455 struct page *page;
@@ -490,7 +488,8 @@ repeat:
490 return page; 488 return page;
491 } 489 }
492 490
493 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); 491 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
492 dn.data_blkaddr, READ_SYNC);
494 if (err) 493 if (err)
495 return ERR_PTR(err); 494 return ERR_PTR(err);
496 495
@@ -517,7 +516,6 @@ repeat:
517struct page *get_new_data_page(struct inode *inode, 516struct page *get_new_data_page(struct inode *inode,
518 struct page *ipage, pgoff_t index, bool new_i_size) 517 struct page *ipage, pgoff_t index, bool new_i_size)
519{ 518{
520 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
521 struct address_space *mapping = inode->i_mapping; 519 struct address_space *mapping = inode->i_mapping;
522 struct page *page; 520 struct page *page;
523 struct dnode_of_data dn; 521 struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
541 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 539 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
542 SetPageUptodate(page); 540 SetPageUptodate(page);
543 } else { 541 } else {
544 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 542 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
545 READ_SYNC); 543 dn.data_blkaddr, READ_SYNC);
546 if (err) 544 if (err)
547 goto put_err; 545 goto put_err;
548 546
@@ -573,10 +571,12 @@ put_err:
573 571
574static int __allocate_data_block(struct dnode_of_data *dn) 572static int __allocate_data_block(struct dnode_of_data *dn)
575{ 573{
576 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 574 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
575 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
577 struct f2fs_summary sum; 576 struct f2fs_summary sum;
578 block_t new_blkaddr; 577 block_t new_blkaddr;
579 struct node_info ni; 578 struct node_info ni;
579 pgoff_t fofs;
580 int type; 580 int type;
581 581
582 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 582 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
599 update_extent_cache(new_blkaddr, dn); 599 update_extent_cache(new_blkaddr, dn);
600 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); 600 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
601 601
602 /* update i_size */
603 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
604 dn->ofs_in_node;
605 if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
606 i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
607
602 dn->data_blkaddr = new_blkaddr; 608 dn->data_blkaddr = new_blkaddr;
603 return 0; 609 return 0;
604} 610}
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
614static int __get_data_block(struct inode *inode, sector_t iblock, 620static int __get_data_block(struct inode *inode, sector_t iblock,
615 struct buffer_head *bh_result, int create, bool fiemap) 621 struct buffer_head *bh_result, int create, bool fiemap)
616{ 622{
617 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
618 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 623 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
619 unsigned maxblocks = bh_result->b_size >> blkbits; 624 unsigned maxblocks = bh_result->b_size >> blkbits;
620 struct dnode_of_data dn; 625 struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
630 goto out; 635 goto out;
631 636
632 if (create) { 637 if (create) {
633 f2fs_balance_fs(sbi); 638 f2fs_balance_fs(F2FS_I_SB(inode));
634 f2fs_lock_op(sbi); 639 f2fs_lock_op(F2FS_I_SB(inode));
635 } 640 }
636 641
637 /* When reading holes, we need its node page */ 642 /* When reading holes, we need its node page */
@@ -691,7 +696,7 @@ get_next:
691 allocated = true; 696 allocated = true;
692 blkaddr = dn.data_blkaddr; 697 blkaddr = dn.data_blkaddr;
693 } 698 }
694 /* Give more consecutive addresses for the read ahead */ 699 /* Give more consecutive addresses for the readahead */
695 if (blkaddr == (bh_result->b_blocknr + ofs)) { 700 if (blkaddr == (bh_result->b_blocknr + ofs)) {
696 ofs++; 701 ofs++;
697 dn.ofs_in_node++; 702 dn.ofs_in_node++;
@@ -707,7 +712,7 @@ put_out:
707 f2fs_put_dnode(&dn); 712 f2fs_put_dnode(&dn);
708unlock_out: 713unlock_out:
709 if (create) 714 if (create)
710 f2fs_unlock_op(sbi); 715 f2fs_unlock_op(F2FS_I_SB(inode));
711out: 716out:
712 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 717 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
713 return err; 718 return err;
@@ -739,7 +744,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
739 744
740 trace_f2fs_readpage(page, DATA); 745 trace_f2fs_readpage(page, DATA);
741 746
742 /* If the file has inline data, try to read it directlly */ 747 /* If the file has inline data, try to read it directly */
743 if (f2fs_has_inline_data(inode)) 748 if (f2fs_has_inline_data(inode))
744 ret = f2fs_read_inline_data(inode, page); 749 ret = f2fs_read_inline_data(inode, page);
745 else 750 else
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
804 struct writeback_control *wbc) 809 struct writeback_control *wbc)
805{ 810{
806 struct inode *inode = page->mapping->host; 811 struct inode *inode = page->mapping->host;
807 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 812 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
808 loff_t i_size = i_size_read(inode); 813 loff_t i_size = i_size_read(inode);
809 const pgoff_t end_index = ((unsigned long long) i_size) 814 const pgoff_t end_index = ((unsigned long long) i_size)
810 >> PAGE_CACHE_SHIFT; 815 >> PAGE_CACHE_SHIFT;
@@ -836,10 +841,19 @@ write:
836 841
837 /* Dentry blocks are controlled by checkpoint */ 842 /* Dentry blocks are controlled by checkpoint */
838 if (S_ISDIR(inode->i_mode)) { 843 if (S_ISDIR(inode->i_mode)) {
844 if (unlikely(f2fs_cp_error(sbi)))
845 goto redirty_out;
839 err = do_write_data_page(page, &fio); 846 err = do_write_data_page(page, &fio);
840 goto done; 847 goto done;
841 } 848 }
842 849
850 /* we should bypass data pages to let the kworker jobs proceed */
851 if (unlikely(f2fs_cp_error(sbi))) {
852 SetPageError(page);
853 unlock_page(page);
854 goto out;
855 }
856
843 if (!wbc->for_reclaim) 857 if (!wbc->for_reclaim)
844 need_balance_fs = true; 858 need_balance_fs = true;
845 else if (has_not_enough_free_secs(sbi, 0)) 859 else if (has_not_enough_free_secs(sbi, 0))
@@ -857,7 +871,7 @@ done:
857 871
858 clear_cold_data(page); 872 clear_cold_data(page);
859out: 873out:
860 inode_dec_dirty_dents(inode); 874 inode_dec_dirty_pages(inode);
861 unlock_page(page); 875 unlock_page(page);
862 if (need_balance_fs) 876 if (need_balance_fs)
863 f2fs_balance_fs(sbi); 877 f2fs_balance_fs(sbi);
@@ -883,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
883 struct writeback_control *wbc) 897 struct writeback_control *wbc)
884{ 898{
885 struct inode *inode = mapping->host; 899 struct inode *inode = mapping->host;
886 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 900 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
887 bool locked = false; 901 bool locked = false;
888 int ret; 902 int ret;
889 long diff; 903 long diff;
@@ -895,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
895 return 0; 909 return 0;
896 910
897 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && 911 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
898 get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && 912 get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
899 available_free_memory(sbi, DIRTY_DENTS)) 913 available_free_memory(sbi, DIRTY_DENTS))
900 goto skip_write; 914 goto skip_write;
901 915
@@ -917,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
917 return ret; 931 return ret;
918 932
919skip_write: 933skip_write:
920 wbc->pages_skipped += get_dirty_dents(inode); 934 wbc->pages_skipped += get_dirty_pages(inode);
921 return 0; 935 return 0;
922} 936}
923 937
@@ -927,7 +941,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
927 941
928 if (to > inode->i_size) { 942 if (to > inode->i_size) {
929 truncate_pagecache(inode, inode->i_size); 943 truncate_pagecache(inode, inode->i_size);
930 truncate_blocks(inode, inode->i_size); 944 truncate_blocks(inode, inode->i_size, true);
931 } 945 }
932} 946}
933 947
@@ -936,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
936 struct page **pagep, void **fsdata) 950 struct page **pagep, void **fsdata)
937{ 951{
938 struct inode *inode = mapping->host; 952 struct inode *inode = mapping->host;
939 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 953 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
940 struct page *page; 954 struct page *page;
941 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; 955 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
942 struct dnode_of_data dn; 956 struct dnode_of_data dn;
@@ -946,7 +960,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
946 960
947 f2fs_balance_fs(sbi); 961 f2fs_balance_fs(sbi);
948repeat: 962repeat:
949 err = f2fs_convert_inline_data(inode, pos + len); 963 err = f2fs_convert_inline_data(inode, pos + len, NULL);
950 if (err) 964 if (err)
951 goto fail; 965 goto fail;
952 966
@@ -1038,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
1038 1052
1039 trace_f2fs_write_end(inode, pos, len, copied); 1053 trace_f2fs_write_end(inode, pos, len, copied);
1040 1054
1041 set_page_dirty(page); 1055 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
1056 register_inmem_page(inode, page);
1057 else
1058 set_page_dirty(page);
1042 1059
1043 if (pos + copied > i_size_read(inode)) { 1060 if (pos + copied > i_size_read(inode)) {
1044 i_size_write(inode, pos + copied); 1061 i_size_write(inode, pos + copied);
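For atomic and volatile files, write_end now queues the page on the inode's new inmem_pages list instead of dirtying it, so nothing reaches disk until an explicit commit. A sketch of a commit step assembled from symbols this patch adds; the pairing with f2fs_sync_file() and the meaning of the bool argument are assumptions:

	/* Hypothetical commit helper, not the patch's exact ioctl body. */
	static int my_commit_atomic_write(struct file *filp)
	{
		struct inode *inode = file_inode(filp);

		if (!f2fs_is_atomic_file(inode))
			return 0;			/* nothing staged */

		commit_inmem_pages(inode, false);	/* false assumed = commit */
		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
		return f2fs_sync_file(filp, 0, LLONG_MAX, 0);
	}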
@@ -1083,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1083 if (check_direct_IO(inode, rw, iter, offset)) 1100 if (check_direct_IO(inode, rw, iter, offset))
1084 return 0; 1101 return 0;
1085 1102
1086 /* clear fsync mark to recover these blocks */
1087 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
1088
1089 trace_f2fs_direct_IO_enter(inode, offset, count, rw); 1103 trace_f2fs_direct_IO_enter(inode, offset, count, rw);
1090 1104
1091 err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); 1105 err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1101,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
1101 unsigned int length) 1115 unsigned int length)
1102{ 1116{
1103 struct inode *inode = page->mapping->host; 1117 struct inode *inode = page->mapping->host;
1118
1119 if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
1120 return;
1121
1104 if (PageDirty(page)) 1122 if (PageDirty(page))
1105 inode_dec_dirty_dents(inode); 1123 inode_dec_dirty_pages(inode);
1106 ClearPagePrivate(page); 1124 ClearPagePrivate(page);
1107} 1125}
1108 1126
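The new early return means a sub-page invalidation, as punch-hole can produce, keeps the page's dirty accounting and PagePrivate intact; only a whole-page invalidation drops them. Hypothetical call patterns for contrast:

	/* Hypothetical offsets, contrasting the two cases of the guard: */
	f2fs_invalidate_data_page(page, 512, 1024);	/* partial: early
							 * return, accounting
							 * preserved */
	f2fs_invalidate_data_page(page, 0, PAGE_CACHE_SIZE); /* full page:
							 * dirty count dropped */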
@@ -1124,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
1124 1142
1125 if (!PageDirty(page)) { 1143 if (!PageDirty(page)) {
1126 __set_page_dirty_nobuffers(page); 1144 __set_page_dirty_nobuffers(page);
1127 set_dirty_dir_page(inode, page); 1145 update_dirty_page(inode, page);
1128 return 1; 1146 return 1;
1129 } 1147 }
1130 return 0; 1148 return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a441ba33be11..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
32 struct f2fs_stat_info *si = F2FS_STAT(sbi); 32 struct f2fs_stat_info *si = F2FS_STAT(sbi);
33 int i; 33 int i;
34 34
35 /* valid check of the segment numbers */ 35 /* validation check of the segment numbers */
36 si->hit_ext = sbi->read_hit_ext; 36 si->hit_ext = sbi->read_hit_ext;
37 si->total_ext = sbi->total_hit_ext; 37 si->total_ext = sbi->total_hit_ext;
38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); 38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
93 total_vblocks = 0; 93 total_vblocks = 0;
94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); 94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
95 hblks_per_sec = blks_per_sec / 2; 95 hblks_per_sec = blks_per_sec / 2;
96 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 96 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
97 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); 97 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
98 dist = abs(vblocks - hblks_per_sec); 98 dist = abs(vblocks - hblks_per_sec);
99 bimodal += dist * dist; 99 bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
103 ndirty++; 103 ndirty++;
104 } 104 }
105 } 105 }
106 dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; 106 dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
107 si->bimodal = bimodal / dist; 107 si->bimodal = bimodal / dist;
108 if (si->dirty_count) 108 if (si->dirty_count)
109 si->avg_vblocks = total_vblocks / ndirty; 109 si->avg_vblocks = total_vblocks / ndirty;
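For reference, the bimodality figure computed in this function reduces to a normalized second moment; this is a reading of the code above, not text from the patch:

	/*
	 * bimodal = 100 * sum_i (v_i - B/2)^2 / (N * (B/2)^2)
	 *
	 * where v_i = valid blocks in section i, B = blocks per section
	 * (so B/2 = hblks_per_sec), and N = MAIN_SECS(sbi). The result
	 * is 0 when every section is exactly half valid and approaches
	 * 100 as sections become all-valid or all-free, i.e. as the
	 * utilization distribution turns bimodal.
	 */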
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
131 131
132 /* build sit */ 132 /* build sit */
133 si->base_mem += sizeof(struct sit_info); 133 si->base_mem += sizeof(struct sit_info);
134 si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); 134 si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
135 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); 135 si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
136 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); 136 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
137 if (sbi->segs_per_sec > 1) 137 if (sbi->segs_per_sec > 1)
138 si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); 138 si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
139 si->base_mem += __bitmap_size(sbi, SIT_BITMAP); 139 si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
140 140
141 /* build free segmap */ 141 /* build free segmap */
142 si->base_mem += sizeof(struct free_segmap_info); 142 si->base_mem += sizeof(struct free_segmap_info);
143 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); 143 si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
144 si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); 144 si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
145 145
146 /* build curseg */ 146 /* build curseg */
147 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; 147 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,10 +149,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
149 149
150 /* build dirty segmap */ 150 /* build dirty segmap */
151 si->base_mem += sizeof(struct dirty_seglist_info); 151 si->base_mem += sizeof(struct dirty_seglist_info);
152 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); 152 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
153 si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); 153 si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
154 154
155 /* buld nm */ 155 /* build nm */
156 si->base_mem += sizeof(struct f2fs_nm_info); 156 si->base_mem += sizeof(struct f2fs_nm_info);
157 si->base_mem += __bitmap_size(sbi, NAT_BITMAP); 157 si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
158 158
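The TOTAL_SEGS/TOTAL_SECS to MAIN_SEGS/MAIN_SECS rename narrows these loops and sizings to the main area, excluding metadata segments. The macro bodies live in segment.h and are not part of this diff; a plausible shape, stated purely as an assumption:

	/* Assumed definitions (segment.h is not shown in this diff): */
	#define MAIN_SEGS(sbi)	(SM_I(sbi)->main_segments)
	#define MAIN_SECS(sbi)	((sbi)->total_sections)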
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index bcf893c3d903..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -124,9 +124,9 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
124 124
125 /* 125 /*
126 * For the most part, it should be a bug when name_len is zero. 126 * For the most part, it should be a bug when name_len is zero.
127 * We stop here for figuring out where the bugs are occurred. 127 * We stop here for figuring out where the bugs have occurred.
128 */ 128 */
129 f2fs_bug_on(!de->name_len); 129 f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
130 130
131 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 131 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
132 } 132 }
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
151 bool room = false; 151 bool room = false;
152 int max_slots = 0; 152 int max_slots = 0;
153 153
154 f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); 154 f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
155 155
156 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); 156 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
157 nblock = bucket_blocks(level); 157 nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
284 284
285int update_dent_inode(struct inode *inode, const struct qstr *name) 285int update_dent_inode(struct inode *inode, const struct qstr *name)
286{ 286{
287 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
288 struct page *page; 287 struct page *page;
289 288
290 page = get_node_page(sbi, inode->i_ino); 289 page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
291 if (IS_ERR(page)) 290 if (IS_ERR(page))
292 return PTR_ERR(page); 291 return PTR_ERR(page);
293 292
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
337static struct page *init_inode_metadata(struct inode *inode, 336static struct page *init_inode_metadata(struct inode *inode,
338 struct inode *dir, const struct qstr *name) 337 struct inode *dir, const struct qstr *name)
339{ 338{
340 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
341 struct page *page; 339 struct page *page;
342 int err; 340 int err;
343 341
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
360 if (err) 358 if (err)
361 goto put_error; 359 goto put_error;
362 } else { 360 } else {
363 page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); 361 page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
364 if (IS_ERR(page)) 362 if (IS_ERR(page))
365 return page; 363 return page;
366 364
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
381 * we should remove this inode from orphan list. 379 * we should remove this inode from orphan list.
382 */ 380 */
383 if (inode->i_nlink == 0) 381 if (inode->i_nlink == 0)
384 remove_orphan_inode(sbi, inode->i_ino); 382 remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
385 inc_nlink(inode); 383 inc_nlink(inode);
386 } 384 }
387 return page; 385 return page;
@@ -391,7 +389,7 @@ put_error:
391error: 389error:
392 /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ 390 /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
393 truncate_inode_pages(&inode->i_data, 0); 391 truncate_inode_pages(&inode->i_data, 0);
394 truncate_blocks(inode, 0); 392 truncate_blocks(inode, 0, false);
395 remove_dirty_dir_inode(inode); 393 remove_dirty_dir_inode(inode);
396 remove_inode_page(inode); 394 remove_inode_page(inode);
397 return ERR_PTR(err); 395 return ERR_PTR(err);
@@ -563,7 +561,7 @@ fail:
563} 561}
564 562
565/* 563/*
566 * It only removes the dentry from the dentry page,corresponding name 564 * It only removes the dentry from the dentry page, corresponding name
567 * entry in name page does not need to be touched during deletion. 565 * entry in name page does not need to be touched during deletion.
568 */ 566 */
569void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, 567void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
571{ 569{
572 struct f2fs_dentry_block *dentry_blk; 570 struct f2fs_dentry_block *dentry_blk;
573 unsigned int bit_pos; 571 unsigned int bit_pos;
574 struct address_space *mapping = page->mapping; 572 struct inode *dir = page->mapping->host;
575 struct inode *dir = mapping->host;
576 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); 573 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
577 int i; 574 int i;
578 575
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
594 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 591 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
595 592
596 if (inode) { 593 if (inode) {
597 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 594 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
598 595
599 down_write(&F2FS_I(inode)->i_sem); 596 down_write(&F2FS_I(inode)->i_sem);
600 597
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
621 truncate_hole(dir, page->index, page->index + 1); 618 truncate_hole(dir, page->index, page->index + 1);
622 clear_page_dirty_for_io(page); 619 clear_page_dirty_for_io(page);
623 ClearPageUptodate(page); 620 ClearPageUptodate(page);
624 inode_dec_dirty_dents(dir); 621 inode_dec_dirty_pages(dir);
625 } 622 }
626 f2fs_put_page(page, 1); 623 f2fs_put_page(page, 1);
627} 624}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4dab5338a97a..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y) 25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
26#else 26#else
27#define f2fs_bug_on(condition) 27#define f2fs_bug_on(sbi, condition) \
28 do { \
29 if (unlikely(condition)) { \
30 WARN_ON(1); \
31 sbi->need_fsck = true; \
32 } \
33 } while (0)
28#define f2fs_down_write(x, y) down_write(x) 34#define f2fs_down_write(x, y) down_write(x)
29#endif 35#endif
30 36
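With CONFIG_F2FS_CHECK_FS disabled, a failed invariant no longer panics the machine: the fallback warns once and flags the superblock so fsck.f2fs can repair the image offline. The contrast, using an assertion that appears later in this patch:

	/*
	 * Same assertion, two behaviors:
	 *   CONFIG_F2FS_CHECK_FS=y : BUG_ON(count < 0), the kernel panics.
	 *   otherwise              : WARN_ON(1), sbi->need_fsck = true,
	 *                            and execution continues.
	 */
	f2fs_bug_on(sbi, count < 0);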
@@ -90,6 +96,20 @@ enum {
90 SIT_BITMAP 96 SIT_BITMAP
91}; 97};
92 98
99enum {
100 CP_UMOUNT,
101 CP_SYNC,
102 CP_DISCARD,
103};
104
105struct cp_control {
106 int reason;
107 __u64 trim_start;
108 __u64 trim_end;
109 __u64 trim_minlen;
110 __u64 trimmed;
111};
112
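write_checkpoint() callers now describe what they want through cp_control rather than a bare bool. A sketch of a CP_DISCARD (FITRIM-style) caller; the variable names and field units are assumptions, not something this hunk specifies:

	/* Sketch only; start_segno, end_segno, minlen_blocks are assumed. */
	struct cp_control cpc = {
		.reason		= CP_DISCARD,
		.trim_start	= start_segno,
		.trim_end	= end_segno,
		.trim_minlen	= minlen_blocks,
	};
	write_checkpoint(sbi, &cpc);
	/* cpc.trimmed presumably reports back how much was discarded */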
93/* 113/*
94 * For CP/NAT/SIT/SSA readahead 114 * For CP/NAT/SIT/SSA readahead
95 */ 115 */
@@ -97,7 +117,8 @@ enum {
97 META_CP, 117 META_CP,
98 META_NAT, 118 META_NAT,
99 META_SIT, 119 META_SIT,
100 META_SSA 120 META_SSA,
121 META_POR,
101}; 122};
102 123
103/* for the list of ino */ 124/* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
130struct fsync_inode_entry { 151struct fsync_inode_entry {
131 struct list_head list; /* list head */ 152 struct list_head list; /* list head */
132 struct inode *inode; /* vfs inode pointer */ 153 struct inode *inode; /* vfs inode pointer */
133 block_t blkaddr; /* block address locating the last inode */ 154 block_t blkaddr; /* block address locating the last fsync */
155 block_t last_dentry; /* block address locating the last dentry */
156 block_t last_inode; /* block address locating the last inode */
134}; 157};
135 158
136#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) 159#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
141#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) 164#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
142#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) 165#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
143 166
167#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
168#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
169
144static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) 170static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
145{ 171{
146 int before = nats_in_cursum(rs); 172 int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
155 return before; 181 return before;
156} 182}
157 183
184static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
185 int type)
186{
187 if (type == NAT_JOURNAL)
188 return size <= MAX_NAT_JENTRIES(sum);
189 return size <= MAX_SIT_JENTRIES(sum);
190}
191
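__has_cursum_space() centralizes the test "does the journal area of the current summary block still fit N more entries". A hypothetical call site:

	/* Hypothetical call site with made-up helper names: journal NAT
	 * updates in place only while the summary block has room. */
	if (__has_cursum_space(sum, nr_dirty_nats, NAT_JOURNAL))
		journal_nat_entries(sum, nr_dirty_nats);
	else
		flush_nat_blocks(sbi);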
158/* 192/*
159 * ioctl commands 193 * ioctl commands
160 */ 194 */
161#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS 195#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
162#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS 196#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
197
198#define F2FS_IOCTL_MAGIC 0xf5
199#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
200#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
201#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
163 202
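The three new ioctls give userspace transactional writes: begin, buffer pages in memory, commit atomically. A userspace sketch; it assumes the numbers above are mirrored into a uapi header (they are redefined locally here), with error handling elided:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* Mirrors the kernel definitions above. */
	#define F2FS_IOCTL_MAGIC		0xf5
	#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
	#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

	static int atomic_update(const char *path, const void *buf, size_t len)
	{
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return -1;
		ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);	/* begin transaction */
		pwrite(fd, buf, len, 0);		/* staged in memory  */
		ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);/* all-or-nothing    */
		close(fd);
		return 0;
	}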
164#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 203#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
165/* 204/*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
222 /* Use below internally in f2fs */ 261 /* Use below internally in f2fs */
223 unsigned long flags; /* use to pass per-file flags */ 262 unsigned long flags; /* use to pass per-file flags */
224 struct rw_semaphore i_sem; /* protect fi info */ 263 struct rw_semaphore i_sem; /* protect fi info */
225 atomic_t dirty_dents; /* # of dirty dentry pages */ 264 atomic_t dirty_pages; /* # of dirty pages */
226 f2fs_hash_t chash; /* hash value of given file name */ 265 f2fs_hash_t chash; /* hash value of given file name */
227 unsigned int clevel; /* maximum level of given file name */ 266 unsigned int clevel; /* maximum level of given file name */
228 nid_t i_xattr_nid; /* node id that contains xattrs */ 267 nid_t i_xattr_nid; /* node id that contains xattrs */
229 unsigned long long xattr_ver; /* cp version of xattr modification */ 268 unsigned long long xattr_ver; /* cp version of xattr modification */
230 struct extent_info ext; /* in-memory extent cache entry */ 269 struct extent_info ext; /* in-memory extent cache entry */
231 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ 270 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
271
272 struct list_head inmem_pages; /* inmemory pages managed by f2fs */
273 struct mutex inmem_lock; /* lock for inmemory pages */
232}; 274};
233 275
234static inline void get_extent_info(struct extent_info *ext, 276static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
260 302
261 /* NAT cache management */ 303 /* NAT cache management */
262 struct radix_tree_root nat_root;/* root of the nat entry cache */ 304 struct radix_tree_root nat_root;/* root of the nat entry cache */
305 struct radix_tree_root nat_set_root;/* root of the nat set cache */
263 rwlock_t nat_tree_lock; /* protect nat entry cache */ 306 rwlock_t nat_tree_lock; /* protect nat entry cache */
264 unsigned int nat_cnt; /* the # of cached nat entries */
265 struct list_head nat_entries; /* cached nat entry list (clean) */ 307 struct list_head nat_entries; /* cached nat entry list (clean) */
266 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ 308 unsigned int nat_cnt; /* the # of cached nat entries */
267 struct list_head nat_entry_set; /* nat entry set list */
268 unsigned int dirty_nat_cnt; /* total num of nat entries in set */ 309 unsigned int dirty_nat_cnt; /* total num of nat entries in set */
269 310
270 /* free node ids management */ 311 /* free node ids management */
@@ -332,18 +373,16 @@ enum {
332}; 373};
333 374
334struct flush_cmd { 375struct flush_cmd {
335 struct flush_cmd *next;
336 struct completion wait; 376 struct completion wait;
377 struct llist_node llnode;
337 int ret; 378 int ret;
338}; 379};
339 380
340struct flush_cmd_control { 381struct flush_cmd_control {
341 struct task_struct *f2fs_issue_flush; /* flush thread */ 382 struct task_struct *f2fs_issue_flush; /* flush thread */
342 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ 383 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
343 struct flush_cmd *issue_list; /* list for command issue */ 384 struct llist_head issue_list; /* list for command issue */
344 struct flush_cmd *dispatch_list; /* list for command dispatch */ 385 struct llist_node *dispatch_list; /* list for command dispatch */
345 spinlock_t issue_lock; /* for issue list lock */
346 struct flush_cmd *issue_tail; /* list tail of issue list */
347}; 386};
348 387
349struct f2fs_sm_info { 388struct f2fs_sm_info {
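Replacing the hand-rolled list plus spinlock with an llist makes the producer side lock-free. A sketch of both halves of the pattern; the helper names are hypothetical (the real ones live in segment.c, outside this hunk):

	/* Producer: lockless push, then sleep until the flusher answers. */
	static int my_issue_flush(struct flush_cmd_control *fcc)
	{
		struct flush_cmd cmd;

		init_completion(&cmd.wait);
		llist_add(&cmd.llnode, &fcc->issue_list);	/* atomic push */
		wake_up(&fcc->flush_wait_queue);
		wait_for_completion(&cmd.wait);
		return cmd.ret;
	}

	/* Consumer: detach the whole pending list in one atomic operation. */
	static void my_dispatch_flushes(struct flush_cmd_control *fcc)
	{
		struct flush_cmd *cmd, *next;

		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
		llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) {
			cmd->ret = 0;	/* the flush bio would be issued here */
			complete(&cmd->wait);
		}
	}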
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
369 int nr_discards; /* # of discards in the list */ 408 int nr_discards; /* # of discards in the list */
370 int max_discards; /* max. discards to be issued */ 409 int max_discards; /* max. discards to be issued */
371 410
411 struct list_head sit_entry_set; /* sit entry set list */
412
372 unsigned int ipu_policy; /* in-place-update policy */ 413 unsigned int ipu_policy; /* in-place-update policy */
373 unsigned int min_ipu_util; /* in-place-update threshold */ 414 unsigned int min_ipu_util; /* in-place-update threshold */
415 unsigned int min_fsync_blocks; /* threshold for fsync */
374 416
375 /* for flush command control */ 417 /* for flush command control */
376 struct flush_cmd_control *cmd_control_info; 418 struct flush_cmd_control *cmd_control_info;
@@ -395,7 +437,7 @@ enum count_type {
395}; 437};
396 438
397/* 439/*
398 * The below are the page types of bios used in submti_bio(). 440 * The below are the page types of bios used in submit_bio().
399 * The available types are: 441 * The available types are:
400 * DATA User data pages. It operates as async mode. 442 * DATA User data pages. It operates as async mode.
401 * NODE Node pages. It operates as async mode. 443 * NODE Node pages. It operates as async mode.
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
434 struct buffer_head *raw_super_buf; /* buffer head of raw sb */ 476 struct buffer_head *raw_super_buf; /* buffer head of raw sb */
435 struct f2fs_super_block *raw_super; /* raw super block pointer */ 477 struct f2fs_super_block *raw_super; /* raw super block pointer */
436 int s_dirty; /* dirty flag for checkpoint */ 478 int s_dirty; /* dirty flag for checkpoint */
479 bool need_fsck; /* need fsck.f2fs to fix */
437 480
438 /* for node-related operations */ 481 /* for node-related operations */
439 struct f2fs_nm_info *nm_info; /* node manager */ 482 struct f2fs_nm_info *nm_info; /* node manager */
@@ -470,7 +513,7 @@ struct f2fs_sb_info {
470 struct list_head dir_inode_list; /* dir inode list */ 513 struct list_head dir_inode_list; /* dir inode list */
471 spinlock_t dir_inode_lock; /* for dir inode list lock */ 514 spinlock_t dir_inode_lock; /* for dir inode list lock */
472 515
473 /* basic file system units */ 516 /* basic filesystem units */
474 unsigned int log_sectors_per_block; /* log2 sectors per block */ 517 unsigned int log_sectors_per_block; /* log2 sectors per block */
475 unsigned int log_blocksize; /* log2 block size */ 518 unsigned int log_blocksize; /* log2 block size */
476 unsigned int blocksize; /* block size */ 519 unsigned int blocksize; /* block size */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
539 return sb->s_fs_info; 582 return sb->s_fs_info;
540} 583}
541 584
585static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
586{
587 return F2FS_SB(inode->i_sb);
588}
589
590static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
591{
592 return F2FS_I_SB(mapping->host);
593}
594
595static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
596{
597 return F2FS_M_SB(page->mapping);
598}
599
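The three accessors encode the derivation chain page -> mapping -> host inode -> super_block -> f2fs_sb_info, which is why so many F2FS_SB(inode->i_sb) calls elsewhere in this patch collapse to a single helper. Illustrative uses:

	/* Pick the accessor matching what the context already has in hand. */
	struct f2fs_sb_info *sbi;

	sbi = F2FS_I_SB(inode);		/* from an inode         */
	sbi = F2FS_M_SB(mapping);	/* from an address_space */
	sbi = F2FS_P_SB(page);		/* from a mapped page    */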
542static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) 600static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
543{ 601{
544 return (struct f2fs_super_block *)(sbi->raw_super); 602 return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
703 blkcnt_t count) 761 blkcnt_t count)
704{ 762{
705 spin_lock(&sbi->stat_lock); 763 spin_lock(&sbi->stat_lock);
706 f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); 764 f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
707 f2fs_bug_on(inode->i_blocks < count); 765 f2fs_bug_on(sbi, inode->i_blocks < count);
708 inode->i_blocks -= count; 766 inode->i_blocks -= count;
709 sbi->total_valid_block_count -= (block_t)count; 767 sbi->total_valid_block_count -= (block_t)count;
710 spin_unlock(&sbi->stat_lock); 768 spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
716 F2FS_SET_SB_DIRT(sbi); 774 F2FS_SET_SB_DIRT(sbi);
717} 775}
718 776
719static inline void inode_inc_dirty_dents(struct inode *inode) 777static inline void inode_inc_dirty_pages(struct inode *inode)
720{ 778{
721 inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 779 atomic_inc(&F2FS_I(inode)->dirty_pages);
722 atomic_inc(&F2FS_I(inode)->dirty_dents); 780 if (S_ISDIR(inode->i_mode))
781 inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
723} 782}
724 783
725static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) 784static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
727 atomic_dec(&sbi->nr_pages[count_type]); 786 atomic_dec(&sbi->nr_pages[count_type]);
728} 787}
729 788
730static inline void inode_dec_dirty_dents(struct inode *inode) 789static inline void inode_dec_dirty_pages(struct inode *inode)
731{ 790{
732 if (!S_ISDIR(inode->i_mode)) 791 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
733 return; 792 return;
734 793
735 dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 794 atomic_dec(&F2FS_I(inode)->dirty_pages);
736 atomic_dec(&F2FS_I(inode)->dirty_dents); 795
796 if (S_ISDIR(inode->i_mode))
797 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
737} 798}
738 799
739static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) 800static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
741 return atomic_read(&sbi->nr_pages[count_type]); 802 return atomic_read(&sbi->nr_pages[count_type]);
742} 803}
743 804
744static inline int get_dirty_dents(struct inode *inode) 805static inline int get_dirty_pages(struct inode *inode)
745{ 806{
746 return atomic_read(&F2FS_I(inode)->dirty_dents); 807 return atomic_read(&F2FS_I(inode)->dirty_pages);
747} 808}
748 809
749static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) 810static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -799,7 +860,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
799 860
800 /* 861 /*
801 * odd numbered checkpoint should be at cp segment 0 862 * odd numbered checkpoint should be at cp segment 0
802 * and even segent must be at cp segment 1 863 * and even segment must be at cp segment 1
803 */ 864 */
804 if (!(ckpt_version & 1)) 865 if (!(ckpt_version & 1))
805 start_addr += sbi->blocks_per_seg; 866 start_addr += sbi->blocks_per_seg;
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
848{ 909{
849 spin_lock(&sbi->stat_lock); 910 spin_lock(&sbi->stat_lock);
850 911
851 f2fs_bug_on(!sbi->total_valid_block_count); 912 f2fs_bug_on(sbi, !sbi->total_valid_block_count);
852 f2fs_bug_on(!sbi->total_valid_node_count); 913 f2fs_bug_on(sbi, !sbi->total_valid_node_count);
853 f2fs_bug_on(!inode->i_blocks); 914 f2fs_bug_on(sbi, !inode->i_blocks);
854 915
855 inode->i_blocks--; 916 inode->i_blocks--;
856 sbi->total_valid_node_count--; 917 sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
867static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) 928static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
868{ 929{
869 spin_lock(&sbi->stat_lock); 930 spin_lock(&sbi->stat_lock);
870 f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); 931 f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
871 sbi->total_valid_inode_count++; 932 sbi->total_valid_inode_count++;
872 spin_unlock(&sbi->stat_lock); 933 spin_unlock(&sbi->stat_lock);
873} 934}
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
875static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) 936static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
876{ 937{
877 spin_lock(&sbi->stat_lock); 938 spin_lock(&sbi->stat_lock);
878 f2fs_bug_on(!sbi->total_valid_inode_count); 939 f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
879 sbi->total_valid_inode_count--; 940 sbi->total_valid_inode_count--;
880 spin_unlock(&sbi->stat_lock); 941 spin_unlock(&sbi->stat_lock);
881} 942}
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
891 return; 952 return;
892 953
893 if (unlock) { 954 if (unlock) {
894 f2fs_bug_on(!PageLocked(page)); 955 f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
895 unlock_page(page); 956 unlock_page(page);
896 } 957 }
897 page_cache_release(page); 958 page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
998 FI_INLINE_DATA, /* used for inline data*/ 1059 FI_INLINE_DATA, /* used for inline data*/
999 FI_APPEND_WRITE, /* inode has appended data */ 1060 FI_APPEND_WRITE, /* inode has appended data */
1000 FI_UPDATE_WRITE, /* inode has in-place-update data */ 1061 FI_UPDATE_WRITE, /* inode has in-place-update data */
1001 FI_NEED_IPU, /* used fo ipu for fdatasync */ 1062 FI_NEED_IPU, /* used for ipu per file */
1063 FI_ATOMIC_FILE, /* indicate atomic file */
1064 FI_VOLATILE_FILE, /* indicate volatile file */
1002}; 1065};
1003 1066
1004static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 1067static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
1085 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); 1148 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1086} 1149}
1087 1150
1151static inline bool f2fs_is_atomic_file(struct inode *inode)
1152{
1153 return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
1154}
1155
1156static inline bool f2fs_is_volatile_file(struct inode *inode)
1157{
1158 return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
1159}
1160
1088static inline void *inline_data_addr(struct page *page) 1161static inline void *inline_data_addr(struct page *page)
1089{ 1162{
1090 struct f2fs_inode *ri = F2FS_INODE(page); 1163 struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1096,6 +1169,11 @@ static inline int f2fs_readonly(struct super_block *sb)
1096 return sb->s_flags & MS_RDONLY; 1169 return sb->s_flags & MS_RDONLY;
1097} 1170}
1098 1171
1172static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
1173{
1174 return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
1175}
1176
1099static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) 1177static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1100{ 1178{
1101 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 1179 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
@@ -1117,7 +1195,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1117 */ 1195 */
1118int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1196int f2fs_sync_file(struct file *, loff_t, loff_t, int);
1119void truncate_data_blocks(struct dnode_of_data *); 1197void truncate_data_blocks(struct dnode_of_data *);
1120int truncate_blocks(struct inode *, u64); 1198int truncate_blocks(struct inode *, u64, bool);
1121void f2fs_truncate(struct inode *); 1199void f2fs_truncate(struct inode *);
1122int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1200int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
1123int f2fs_setattr(struct dentry *, struct iattr *); 1201int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1136,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
1136void update_inode_page(struct inode *); 1214void update_inode_page(struct inode *);
1137int f2fs_write_inode(struct inode *, struct writeback_control *); 1215int f2fs_write_inode(struct inode *, struct writeback_control *);
1138void f2fs_evict_inode(struct inode *); 1216void f2fs_evict_inode(struct inode *);
1217void handle_failed_inode(struct inode *);
1139 1218
1140/* 1219/*
1141 * namei.c 1220 * namei.c
@@ -1183,9 +1262,9 @@ struct dnode_of_data;
1183struct node_info; 1262struct node_info;
1184 1263
1185bool available_free_memory(struct f2fs_sb_info *, int); 1264bool available_free_memory(struct f2fs_sb_info *, int);
1186int is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1265bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
1187bool fsync_mark_done(struct f2fs_sb_info *, nid_t); 1266bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
1188void fsync_mark_clear(struct f2fs_sb_info *, nid_t); 1267bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
1189void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1268void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
1190int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1269int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1191int truncate_inode_blocks(struct inode *, pgoff_t); 1270int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1202,10 +1281,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
1202bool alloc_nid(struct f2fs_sb_info *, nid_t *); 1281bool alloc_nid(struct f2fs_sb_info *, nid_t *);
1203void alloc_nid_done(struct f2fs_sb_info *, nid_t); 1282void alloc_nid_done(struct f2fs_sb_info *, nid_t);
1204void alloc_nid_failed(struct f2fs_sb_info *, nid_t); 1283void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
1205void recover_node_page(struct f2fs_sb_info *, struct page *,
1206 struct f2fs_summary *, struct node_info *, block_t);
1207void recover_inline_xattr(struct inode *, struct page *); 1284void recover_inline_xattr(struct inode *, struct page *);
1208bool recover_xattr_data(struct inode *, struct page *, block_t); 1285void recover_xattr_data(struct inode *, struct page *, block_t);
1209int recover_inode_page(struct f2fs_sb_info *, struct page *); 1286int recover_inode_page(struct f2fs_sb_info *, struct page *);
1210int restore_node_summary(struct f2fs_sb_info *, unsigned int, 1287int restore_node_summary(struct f2fs_sb_info *, unsigned int,
1211 struct f2fs_summary_block *); 1288 struct f2fs_summary_block *);
@@ -1218,6 +1295,8 @@ void destroy_node_manager_caches(void);
1218/* 1295/*
1219 * segment.c 1296 * segment.c
1220 */ 1297 */
1298void register_inmem_page(struct inode *, struct page *);
1299void commit_inmem_pages(struct inode *, bool);
1221void f2fs_balance_fs(struct f2fs_sb_info *); 1300void f2fs_balance_fs(struct f2fs_sb_info *);
1222void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1301void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1223int f2fs_issue_flush(struct f2fs_sb_info *); 1302int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1226,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
1226void invalidate_blocks(struct f2fs_sb_info *, block_t); 1305void invalidate_blocks(struct f2fs_sb_info *, block_t);
1227void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); 1306void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
1228void clear_prefree_segments(struct f2fs_sb_info *); 1307void clear_prefree_segments(struct f2fs_sb_info *);
1308void release_discard_addrs(struct f2fs_sb_info *);
1229void discard_next_dnode(struct f2fs_sb_info *, block_t); 1309void discard_next_dnode(struct f2fs_sb_info *, block_t);
1230int npages_for_summary_flush(struct f2fs_sb_info *); 1310int npages_for_summary_flush(struct f2fs_sb_info *);
1231void allocate_new_segments(struct f2fs_sb_info *); 1311void allocate_new_segments(struct f2fs_sb_info *);
1312int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
1232struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1313struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1233void write_meta_page(struct f2fs_sb_info *, struct page *); 1314void write_meta_page(struct f2fs_sb_info *, struct page *);
1234void write_node_page(struct f2fs_sb_info *, struct page *, 1315void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1238,8 +1319,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1238void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); 1319void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1239void recover_data_page(struct f2fs_sb_info *, struct page *, 1320void recover_data_page(struct f2fs_sb_info *, struct page *,
1240 struct f2fs_summary *, block_t, block_t); 1321 struct f2fs_summary *, block_t, block_t);
1241void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1242 struct f2fs_summary *, block_t, block_t);
1243void allocate_data_block(struct f2fs_sb_info *, struct page *, 1322void allocate_data_block(struct f2fs_sb_info *, struct page *,
1244 block_t, block_t *, struct f2fs_summary *, int); 1323 block_t, block_t *, struct f2fs_summary *, int);
1245void f2fs_wait_on_page_writeback(struct page *, enum page_type); 1324void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1247,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
1247void write_node_summaries(struct f2fs_sb_info *, block_t); 1326void write_node_summaries(struct f2fs_sb_info *, block_t);
1248int lookup_journal_in_cursum(struct f2fs_summary_block *, 1327int lookup_journal_in_cursum(struct f2fs_summary_block *,
1249 int, unsigned int, int); 1328 int, unsigned int, int);
1250void flush_sit_entries(struct f2fs_sb_info *); 1329void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
1251int build_segment_manager(struct f2fs_sb_info *); 1330int build_segment_manager(struct f2fs_sb_info *);
1252void destroy_segment_manager(struct f2fs_sb_info *); 1331void destroy_segment_manager(struct f2fs_sb_info *);
1253int __init create_segment_manager_caches(void); 1332int __init create_segment_manager_caches(void);
@@ -1258,10 +1337,12 @@ void destroy_segment_manager_caches(void);
1258 */ 1337 */
1259struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); 1338struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
1260struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1339struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
1261int ra_meta_pages(struct f2fs_sb_info *, int, int, int); 1340struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
1341int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
1262long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1342long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
1263void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); 1343void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1264void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); 1344void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1345void release_dirty_inode(struct f2fs_sb_info *);
1265bool exist_written_data(struct f2fs_sb_info *, nid_t, int); 1346bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
1266int acquire_orphan_inode(struct f2fs_sb_info *); 1347int acquire_orphan_inode(struct f2fs_sb_info *);
1267void release_orphan_inode(struct f2fs_sb_info *); 1348void release_orphan_inode(struct f2fs_sb_info *);
@@ -1269,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1269void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1350void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1270void recover_orphan_inodes(struct f2fs_sb_info *); 1351void recover_orphan_inodes(struct f2fs_sb_info *);
1271int get_valid_checkpoint(struct f2fs_sb_info *); 1352int get_valid_checkpoint(struct f2fs_sb_info *);
1272void set_dirty_dir_page(struct inode *, struct page *); 1353void update_dirty_page(struct inode *, struct page *);
1273void add_dirty_dir_inode(struct inode *); 1354void add_dirty_dir_inode(struct inode *);
1274void remove_dirty_dir_inode(struct inode *); 1355void remove_dirty_dir_inode(struct inode *);
1275void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1356void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1276void write_checkpoint(struct f2fs_sb_info *, bool); 1357void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
1277void init_ino_entry_info(struct f2fs_sb_info *); 1358void init_ino_entry_info(struct f2fs_sb_info *);
1278int __init create_checkpoint_caches(void); 1359int __init create_checkpoint_caches(void);
1279void destroy_checkpoint_caches(void); 1360void destroy_checkpoint_caches(void);
@@ -1357,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1357#define stat_inc_inline_inode(inode) \ 1438#define stat_inc_inline_inode(inode) \
1358 do { \ 1439 do { \
1359 if (f2fs_has_inline_data(inode)) \ 1440 if (f2fs_has_inline_data(inode)) \
1360 ((F2FS_SB(inode->i_sb))->inline_inode++); \ 1441 ((F2FS_I_SB(inode))->inline_inode++); \
1361 } while (0) 1442 } while (0)
1362#define stat_dec_inline_inode(inode) \ 1443#define stat_dec_inline_inode(inode) \
1363 do { \ 1444 do { \
1364 if (f2fs_has_inline_data(inode)) \ 1445 if (f2fs_has_inline_data(inode)) \
1365 ((F2FS_SB(inode->i_sb))->inline_inode--); \ 1446 ((F2FS_I_SB(inode))->inline_inode--); \
1366 } while (0) 1447 } while (0)
1367 1448
1368#define stat_inc_seg_type(sbi, curseg) \ 1449#define stat_inc_seg_type(sbi, curseg) \
@@ -1439,8 +1520,8 @@ extern const struct inode_operations f2fs_special_inode_operations;
1439 */ 1520 */
1440bool f2fs_may_inline(struct inode *); 1521bool f2fs_may_inline(struct inode *);
1441int f2fs_read_inline_data(struct inode *, struct page *); 1522int f2fs_read_inline_data(struct inode *, struct page *);
1442int f2fs_convert_inline_data(struct inode *, pgoff_t); 1523int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
1443int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); 1524int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1444void truncate_inline_data(struct inode *, u64); 1525void truncate_inline_data(struct inode *, u64);
1445int recover_inline_data(struct inode *, struct page *); 1526bool recover_inline_data(struct inode *, struct page *);
1446#endif 1527#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 208f1a9bd569..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33{ 33{
34 struct page *page = vmf->page; 34 struct page *page = vmf->page;
35 struct inode *inode = file_inode(vma->vm_file); 35 struct inode *inode = file_inode(vma->vm_file);
36 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 36 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
37 struct dnode_of_data dn; 37 struct dnode_of_data dn;
38 int err; 38 int err;
39 39
@@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
41 41
42 sb_start_pagefault(inode->i_sb); 42 sb_start_pagefault(inode->i_sb);
43 43
 44 /* force conversion to normal data indices */
45 err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
46 if (err)
47 goto out;
48
44 /* block allocation */ 49 /* block allocation */
45 f2fs_lock_op(sbi); 50 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 51 set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,11 +115,31 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
110 return 1; 115 return 1;
111} 116}
112 117
118static inline bool need_do_checkpoint(struct inode *inode)
119{
120 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
121 bool need_cp = false;
122
123 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
124 need_cp = true;
125 else if (file_wrong_pino(inode))
126 need_cp = true;
127 else if (!space_for_roll_forward(sbi))
128 need_cp = true;
129 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
130 need_cp = true;
131 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
132 need_cp = true;
133
134 return need_cp;
135}
136
113int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 137int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
114{ 138{
115 struct inode *inode = file->f_mapping->host; 139 struct inode *inode = file->f_mapping->host;
116 struct f2fs_inode_info *fi = F2FS_I(inode); 140 struct f2fs_inode_info *fi = F2FS_I(inode);
117 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 141 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
142 nid_t ino = inode->i_ino;
118 int ret = 0; 143 int ret = 0;
119 bool need_cp = false; 144 bool need_cp = false;
120 struct writeback_control wbc = { 145 struct writeback_control wbc = {
@@ -129,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
129 trace_f2fs_sync_file_enter(inode); 154 trace_f2fs_sync_file_enter(inode);
130 155
131 /* if fdatasync is triggered, let's do in-place-update */ 156 /* if fdatasync is triggered, let's do in-place-update */
132 if (datasync) 157 if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
133 set_inode_flag(fi, FI_NEED_IPU); 158 set_inode_flag(fi, FI_NEED_IPU);
134
135 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 159 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
136 if (datasync) 160 clear_inode_flag(fi, FI_NEED_IPU);
137 clear_inode_flag(fi, FI_NEED_IPU); 161
138 if (ret) { 162 if (ret) {
139 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 163 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
140 return ret; 164 return ret;
@@ -144,33 +168,31 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
144 * if there is no written data, don't waste time to write recovery info. 168 * if there is no written data, don't waste time to write recovery info.
145 */ 169 */
146 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && 170 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
147 !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { 171 !exist_written_data(sbi, ino, APPEND_INO)) {
172 struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
173
 174 /* But we need to check for pending inode updates */
175 if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
176 f2fs_put_page(i, 0);
177 goto go_write;
178 }
179 f2fs_put_page(i, 0);
180
148 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || 181 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
149 exist_written_data(sbi, inode->i_ino, UPDATE_INO)) 182 exist_written_data(sbi, ino, UPDATE_INO))
150 goto flush_out; 183 goto flush_out;
151 goto out; 184 goto out;
152 } 185 }
153 186go_write:
154 /* guarantee free sections for fsync */ 187 /* guarantee free sections for fsync */
155 f2fs_balance_fs(sbi); 188 f2fs_balance_fs(sbi);
156 189
157 down_read(&fi->i_sem);
158
159 /* 190 /*
160 * Both fdatasync() and fsync() can be recovered from 191 * Both fdatasync() and fsync() can be recovered from
161 * sudden power-off. 192 * sudden power-off.
162 */ 193 */
163 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 194 down_read(&fi->i_sem);
164 need_cp = true; 195 need_cp = need_do_checkpoint(inode);
165 else if (file_wrong_pino(inode))
166 need_cp = true;
167 else if (!space_for_roll_forward(sbi))
168 need_cp = true;
169 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
170 need_cp = true;
171 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
172 need_cp = true;
173
174 up_read(&fi->i_sem); 196 up_read(&fi->i_sem);
175 197
176 if (need_cp) { 198 if (need_cp) {
@@ -194,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
194 up_write(&fi->i_sem); 216 up_write(&fi->i_sem);
195 } 217 }
196 } else { 218 } else {
197 /* if there is no written node page, write its inode page */ 219sync_nodes:
198 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 220 sync_node_pages(sbi, ino, &wbc);
199 if (fsync_mark_done(sbi, inode->i_ino)) 221
200 goto out; 222 if (need_inode_block_update(sbi, ino)) {
201 mark_inode_dirty_sync(inode); 223 mark_inode_dirty_sync(inode);
202 ret = f2fs_write_inode(inode, NULL); 224 ret = f2fs_write_inode(inode, NULL);
203 if (ret) 225 if (ret)
204 goto out; 226 goto out;
227 goto sync_nodes;
205 } 228 }
206 ret = wait_on_node_pages_writeback(sbi, inode->i_ino); 229
230 ret = wait_on_node_pages_writeback(sbi, ino);
207 if (ret) 231 if (ret)
208 goto out; 232 goto out;
209 233
210 /* once recovery info is written, don't need to track this */ 234 /* once recovery info is written, don't need to track this */
211 remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); 235 remove_dirty_inode(sbi, ino, APPEND_INO);
212 clear_inode_flag(fi, FI_APPEND_WRITE); 236 clear_inode_flag(fi, FI_APPEND_WRITE);
213flush_out: 237flush_out:
214 remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); 238 remove_dirty_inode(sbi, ino, UPDATE_INO);
215 clear_inode_flag(fi, FI_UPDATE_WRITE); 239 clear_inode_flag(fi, FI_UPDATE_WRITE);
216 ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); 240 ret = f2fs_issue_flush(F2FS_I_SB(inode));
217 } 241 }
218out: 242out:
219 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 243 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -288,7 +312,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
288 if (err && err != -ENOENT) { 312 if (err && err != -ENOENT) {
289 goto fail; 313 goto fail;
290 } else if (err == -ENOENT) { 314 } else if (err == -ENOENT) {
291 /* direct node is not exist */ 315 /* direct node does not exist */
292 if (whence == SEEK_DATA) { 316 if (whence == SEEK_DATA) {
293 pgofs = PGOFS_OF_NEXT_DNODE(pgofs, 317 pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
294 F2FS_I(inode)); 318 F2FS_I(inode));
@@ -340,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
340 maxbytes, i_size_read(inode)); 364 maxbytes, i_size_read(inode));
341 case SEEK_DATA: 365 case SEEK_DATA:
342 case SEEK_HOLE: 366 case SEEK_HOLE:
367 if (offset < 0)
368 return -ENXIO;
343 return f2fs_seek_block(file, offset, whence); 369 return f2fs_seek_block(file, offset, whence);
344 } 370 }
345 371
@@ -356,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
356int truncate_data_blocks_range(struct dnode_of_data *dn, int count) 382int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
357{ 383{
358 int nr_free = 0, ofs = dn->ofs_in_node; 384 int nr_free = 0, ofs = dn->ofs_in_node;
359 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 385 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
360 struct f2fs_node *raw_node; 386 struct f2fs_node *raw_node;
361 __le32 *addr; 387 __le32 *addr;
362 388
@@ -417,9 +443,9 @@ out:
417 f2fs_put_page(page, 1); 443 f2fs_put_page(page, 1);
418} 444}
419 445
420int truncate_blocks(struct inode *inode, u64 from) 446int truncate_blocks(struct inode *inode, u64 from, bool lock)
421{ 447{
422 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 448 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
423 unsigned int blocksize = inode->i_sb->s_blocksize; 449 unsigned int blocksize = inode->i_sb->s_blocksize;
424 struct dnode_of_data dn; 450 struct dnode_of_data dn;
425 pgoff_t free_from; 451 pgoff_t free_from;
@@ -433,14 +459,16 @@ int truncate_blocks(struct inode *inode, u64 from)
433 free_from = (pgoff_t) 459 free_from = (pgoff_t)
434 ((from + blocksize - 1) >> (sbi->log_blocksize)); 460 ((from + blocksize - 1) >> (sbi->log_blocksize));
435 461
436 f2fs_lock_op(sbi); 462 if (lock)
463 f2fs_lock_op(sbi);
437 464
438 set_new_dnode(&dn, inode, NULL, NULL, 0); 465 set_new_dnode(&dn, inode, NULL, NULL, 0);
439 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 466 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
440 if (err) { 467 if (err) {
441 if (err == -ENOENT) 468 if (err == -ENOENT)
442 goto free_next; 469 goto free_next;
443 f2fs_unlock_op(sbi); 470 if (lock)
471 f2fs_unlock_op(sbi);
444 trace_f2fs_truncate_blocks_exit(inode, err); 472 trace_f2fs_truncate_blocks_exit(inode, err);
445 return err; 473 return err;
446 } 474 }
@@ -448,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from)
448 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 476 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
449 477
450 count -= dn.ofs_in_node; 478 count -= dn.ofs_in_node;
451 f2fs_bug_on(count < 0); 479 f2fs_bug_on(sbi, count < 0);
452 480
453 if (dn.ofs_in_node || IS_INODE(dn.node_page)) { 481 if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
454 truncate_data_blocks_range(&dn, count); 482 truncate_data_blocks_range(&dn, count);
@@ -458,7 +486,8 @@ int truncate_blocks(struct inode *inode, u64 from)
458 f2fs_put_dnode(&dn); 486 f2fs_put_dnode(&dn);
459free_next: 487free_next:
460 err = truncate_inode_blocks(inode, free_from); 488 err = truncate_inode_blocks(inode, free_from);
461 f2fs_unlock_op(sbi); 489 if (lock)
490 f2fs_unlock_op(sbi);
462done: 491done:
463 /* lastly zero out the first data page */ 492 /* lastly zero out the first data page */
464 truncate_partial_data_page(inode, from); 493 truncate_partial_data_page(inode, from);
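The new bool tells truncate_blocks() whether to take f2fs_lock_op() itself. Both variants as they appear in this patch; the reason for the false case, that the caller already holds the op lock, is an inference from the error path in init_inode_metadata():

	/* Regular truncate: take and drop f2fs_lock_op() internally. */
	truncate_blocks(inode, i_size_read(inode), true);

	/* Error path in init_inode_metadata(): presumably already under
	 * the caller's f2fs_lock_op(), so do not re-acquire it. */
	truncate_blocks(inode, 0, false);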
@@ -475,7 +504,7 @@ void f2fs_truncate(struct inode *inode)
475 504
476 trace_f2fs_truncate(inode); 505 trace_f2fs_truncate(inode);
477 506
478 if (!truncate_blocks(inode, i_size_read(inode))) { 507 if (!truncate_blocks(inode, i_size_read(inode), true)) {
479 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 508 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
480 mark_inode_dirty(inode); 509 mark_inode_dirty(inode);
481 } 510 }
@@ -531,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
531 if (err) 560 if (err)
532 return err; 561 return err;
533 562
534 if ((attr->ia_valid & ATTR_SIZE) && 563 if (attr->ia_valid & ATTR_SIZE) {
535 attr->ia_size != i_size_read(inode)) { 564 err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
536 err = f2fs_convert_inline_data(inode, attr->ia_size);
537 if (err) 565 if (err)
538 return err; 566 return err;
539 567
540 truncate_setsize(inode, attr->ia_size); 568 if (attr->ia_size != i_size_read(inode)) {
541 f2fs_truncate(inode); 569 truncate_setsize(inode, attr->ia_size);
542 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 570 f2fs_truncate(inode);
571 f2fs_balance_fs(F2FS_I_SB(inode));
572 } else {
573 /*
574 * giving a chance to truncate blocks past EOF which
575 * are fallocated with FALLOC_FL_KEEP_SIZE.
576 */
577 f2fs_truncate(inode);
578 }
543 } 579 }
544 580
545 __setattr_copy(inode, attr); 581 __setattr_copy(inode, attr);
@@ -573,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
573static void fill_zero(struct inode *inode, pgoff_t index, 609static void fill_zero(struct inode *inode, pgoff_t index,
574 loff_t start, loff_t len) 610 loff_t start, loff_t len)
575{ 611{
576 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 612 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
577 struct page *page; 613 struct page *page;
578 614
579 if (!len) 615 if (!len)
@@ -622,7 +658,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
622 loff_t off_start, off_end; 658 loff_t off_start, off_end;
623 int ret = 0; 659 int ret = 0;
624 660
625 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); 661 if (!S_ISREG(inode->i_mode))
662 return -EOPNOTSUPP;
663
664 /* skip punching hole beyond i_size */
665 if (offset >= inode->i_size)
666 return ret;
667
668 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
626 if (ret) 669 if (ret)
627 return ret; 670 return ret;
628 671
@@ -645,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
645 if (pg_start < pg_end) { 688 if (pg_start < pg_end) {
646 struct address_space *mapping = inode->i_mapping; 689 struct address_space *mapping = inode->i_mapping;
647 loff_t blk_start, blk_end; 690 loff_t blk_start, blk_end;
648 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 691 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
649 692
650 f2fs_balance_fs(sbi); 693 f2fs_balance_fs(sbi);
651 694
@@ -666,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
666static int expand_inode_data(struct inode *inode, loff_t offset, 709static int expand_inode_data(struct inode *inode, loff_t offset,
667 loff_t len, int mode) 710 loff_t len, int mode)
668{ 711{
669 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 712 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
670 pgoff_t index, pg_start, pg_end; 713 pgoff_t index, pg_start, pg_end;
671 loff_t new_size = i_size_read(inode); 714 loff_t new_size = i_size_read(inode);
672 loff_t off_start, off_end; 715 loff_t off_start, off_end;
@@ -678,7 +721,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
678 if (ret) 721 if (ret)
679 return ret; 722 return ret;
680 723
681 ret = f2fs_convert_inline_data(inode, offset + len); 724 ret = f2fs_convert_inline_data(inode, offset + len, NULL);
682 if (ret) 725 if (ret)
683 return ret; 726 return ret;
684 727
@@ -762,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
762 return flags & F2FS_OTHER_FLMASK; 805 return flags & F2FS_OTHER_FLMASK;
763} 806}
764 807
765long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 808static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
809{
810 struct inode *inode = file_inode(filp);
811 struct f2fs_inode_info *fi = F2FS_I(inode);
812 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
813 return put_user(flags, (int __user *)arg);
814}
815
816static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
766{ 817{
767 struct inode *inode = file_inode(filp); 818 struct inode *inode = file_inode(filp);
768 struct f2fs_inode_info *fi = F2FS_I(inode); 819 struct f2fs_inode_info *fi = F2FS_I(inode);
769 unsigned int flags; 820 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
821 unsigned int oldflags;
770 int ret; 822 int ret;
771 823
772 switch (cmd) { 824 ret = mnt_want_write_file(filp);
773 case F2FS_IOC_GETFLAGS: 825 if (ret)
774 flags = fi->i_flags & FS_FL_USER_VISIBLE; 826 return ret;
775 return put_user(flags, (int __user *) arg);
776 case F2FS_IOC_SETFLAGS:
777 {
778 unsigned int oldflags;
779 827
780 ret = mnt_want_write_file(filp); 828 if (!inode_owner_or_capable(inode)) {
781 if (ret) 829 ret = -EACCES;
782 return ret; 830 goto out;
831 }
783 832
784 if (!inode_owner_or_capable(inode)) { 833 if (get_user(flags, (int __user *)arg)) {
785 ret = -EACCES; 834 ret = -EFAULT;
786 goto out; 835 goto out;
787 } 836 }
837
838 flags = f2fs_mask_flags(inode->i_mode, flags);
839
840 mutex_lock(&inode->i_mutex);
841
842 oldflags = fi->i_flags;
788 843
789 if (get_user(flags, (int __user *) arg)) { 844 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
790 ret = -EFAULT; 845 if (!capable(CAP_LINUX_IMMUTABLE)) {
846 mutex_unlock(&inode->i_mutex);
847 ret = -EPERM;
791 goto out; 848 goto out;
792 } 849 }
850 }
793 851
794 flags = f2fs_mask_flags(inode->i_mode, flags); 852 flags = flags & FS_FL_USER_MODIFIABLE;
853 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
854 fi->i_flags = flags;
855 mutex_unlock(&inode->i_mutex);
795 856
796 mutex_lock(&inode->i_mutex); 857 f2fs_set_inode_flags(inode);
858 inode->i_ctime = CURRENT_TIME;
859 mark_inode_dirty(inode);
860out:
861 mnt_drop_write_file(filp);
862 return ret;
863}
797 864
798 oldflags = fi->i_flags; 865static int f2fs_ioc_start_atomic_write(struct file *filp)
866{
867 struct inode *inode = file_inode(filp);
868 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
799 869
800 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 870 if (!inode_owner_or_capable(inode))
801 if (!capable(CAP_LINUX_IMMUTABLE)) { 871 return -EACCES;
802 mutex_unlock(&inode->i_mutex); 872
803 ret = -EPERM; 873 f2fs_balance_fs(sbi);
804 goto out;
805 }
806 }
807 874
808 flags = flags & FS_FL_USER_MODIFIABLE; 875 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
809 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
810 fi->i_flags = flags;
811 mutex_unlock(&inode->i_mutex);
812 876
813 f2fs_set_inode_flags(inode); 877 return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
814 inode->i_ctime = CURRENT_TIME; 878}
815 mark_inode_dirty(inode); 879
816out: 880static int f2fs_ioc_commit_atomic_write(struct file *filp)
817 mnt_drop_write_file(filp); 881{
882 struct inode *inode = file_inode(filp);
883 int ret;
884
885 if (!inode_owner_or_capable(inode))
886 return -EACCES;
887
888 if (f2fs_is_volatile_file(inode))
889 return 0;
890
891 ret = mnt_want_write_file(filp);
892 if (ret)
818 return ret; 893 return ret;
819 } 894
895 if (f2fs_is_atomic_file(inode))
896 commit_inmem_pages(inode, false);
897
898 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
899 mnt_drop_write_file(filp);
900 return ret;
901}
902
903static int f2fs_ioc_start_volatile_write(struct file *filp)
904{
905 struct inode *inode = file_inode(filp);
906
907 if (!inode_owner_or_capable(inode))
908 return -EACCES;
909
910 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
911 return 0;
912}
913
914static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
915{
916 struct inode *inode = file_inode(filp);
917 struct super_block *sb = inode->i_sb;
918 struct request_queue *q = bdev_get_queue(sb->s_bdev);
919 struct fstrim_range range;
920 int ret;
921
922 if (!capable(CAP_SYS_ADMIN))
923 return -EPERM;
924
925 if (!blk_queue_discard(q))
926 return -EOPNOTSUPP;
927
928 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
929 sizeof(range)))
930 return -EFAULT;
931
932 range.minlen = max((unsigned int)range.minlen,
933 q->limits.discard_granularity);
934 ret = f2fs_trim_fs(F2FS_SB(sb), &range);
935 if (ret < 0)
936 return ret;
937
938 if (copy_to_user((struct fstrim_range __user *)arg, &range,
939 sizeof(range)))
940 return -EFAULT;
941 return 0;
942}
943
944long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
945{
946 switch (cmd) {
947 case F2FS_IOC_GETFLAGS:
948 return f2fs_ioc_getflags(filp, arg);
949 case F2FS_IOC_SETFLAGS:
950 return f2fs_ioc_setflags(filp, arg);
951 case F2FS_IOC_START_ATOMIC_WRITE:
952 return f2fs_ioc_start_atomic_write(filp);
953 case F2FS_IOC_COMMIT_ATOMIC_WRITE:
954 return f2fs_ioc_commit_atomic_write(filp);
955 case F2FS_IOC_START_VOLATILE_WRITE:
956 return f2fs_ioc_start_volatile_write(filp);
957 case FITRIM:
958 return f2fs_ioc_fitrim(filp, arg);
820 default: 959 default:
821 return -ENOTTY; 960 return -ENOTTY;
822 } 961 }
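
f2fs_ioc_fitrim() is the kernel half of the standard FITRIM interface. A minimal sketch of the userspace half, assuming an f2fs mount at /mnt/f2fs; FITRIM and struct fstrim_range come from <linux/fs.h>:

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/f2fs", O_RDONLY);	/* assumed mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* kernel raises this to the discard granularity */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);

	close(fd);
	return 0;
}
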
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d7947d90ccc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@ static int gc_thread_func(void *data)
58 * 3. IO subsystem is idle by checking the # of requests in 58 * 3. IO subsystem is idle by checking the # of requests in
59 * bdev's request list. 59 * bdev's request list.
60 * 60 *
61 * Note) We have to avoid triggering GCs too much frequently. 61 * Note) We have to avoid triggering GCs frequently.
62 * Because it is possible that some segments can be 62 * Because it is possible that some segments can be
63 * invalidated soon after by user update or deletion. 63 * invalidated soon after by user update or deletion.
64 * So, I'd like to wait some time to collect dirty segments. 64 * So, I'd like to wait some time to collect dirty segments.
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
193 * selected by background GC before. 193 * selected by background GC before.
194 * Those segments guarantee they have small valid blocks. 194 * Those segments guarantee they have small valid blocks.
195 */ 195 */
196 for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { 196 for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
197 if (sec_usage_check(sbi, secno)) 197 if (sec_usage_check(sbi, secno))
198 continue; 198 continue;
199 clear_bit(secno, dirty_i->victim_secmap); 199 clear_bit(secno, dirty_i->victim_secmap);
@@ -222,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
222 222
223 u = (vblocks * 100) >> sbi->log_blocks_per_seg; 223 u = (vblocks * 100) >> sbi->log_blocks_per_seg;
224 224
225 /* Handle if the system time is changed by user */ 225 /* Handle if the system time has been changed by the user */
226 if (mtime < sit_i->min_mtime) 226 if (mtime < sit_i->min_mtime)
227 sit_i->min_mtime = mtime; 227 sit_i->min_mtime = mtime;
228 if (mtime > sit_i->max_mtime) 228 if (mtime > sit_i->max_mtime)
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
263 unsigned int secno, max_cost; 263 unsigned int secno, max_cost;
264 int nsearched = 0; 264 int nsearched = 0;
265 265
266 mutex_lock(&dirty_i->seglist_lock);
267
266 p.alloc_mode = alloc_mode; 268 p.alloc_mode = alloc_mode;
267 select_policy(sbi, gc_type, type, &p); 269 select_policy(sbi, gc_type, type, &p);
268 270
269 p.min_segno = NULL_SEGNO; 271 p.min_segno = NULL_SEGNO;
270 p.min_cost = max_cost = get_max_cost(sbi, &p); 272 p.min_cost = max_cost = get_max_cost(sbi, &p);
271 273
272 mutex_lock(&dirty_i->seglist_lock);
273
274 if (p.alloc_mode == LFS && gc_type == FG_GC) { 274 if (p.alloc_mode == LFS && gc_type == FG_GC) {
275 p.min_segno = check_bg_victims(sbi); 275 p.min_segno = check_bg_victims(sbi);
276 if (p.min_segno != NULL_SEGNO) 276 if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
281 unsigned long cost; 281 unsigned long cost;
282 unsigned int segno; 282 unsigned int segno;
283 283
284 segno = find_next_bit(p.dirty_segmap, 284 segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
285 TOTAL_SEGS(sbi), p.offset); 285 if (segno >= MAIN_SEGS(sbi)) {
286 if (segno >= TOTAL_SEGS(sbi)) {
287 if (sbi->last_victim[p.gc_mode]) { 286 if (sbi->last_victim[p.gc_mode]) {
288 sbi->last_victim[p.gc_mode] = 0; 287 sbi->last_victim[p.gc_mode] = 0;
289 p.offset = 0; 288 p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
423 if (IS_ERR(node_page)) 422 if (IS_ERR(node_page))
424 continue; 423 continue;
425 424
425 /* block may become invalid during get_node_page */
426 if (check_valid_map(sbi, segno, off) == 0) {
427 f2fs_put_page(node_page, 1);
428 continue;
429 }
430
426 /* set page dirty and write it */ 431 /* set page dirty and write it */
427 if (gc_type == FG_GC) { 432 if (gc_type == FG_GC) {
428 f2fs_wait_on_page_writeback(node_page, NODE); 433 f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
531 f2fs_wait_on_page_writeback(page, DATA); 536 f2fs_wait_on_page_writeback(page, DATA);
532 537
533 if (clear_page_dirty_for_io(page)) 538 if (clear_page_dirty_for_io(page))
534 inode_dec_dirty_dents(inode); 539 inode_dec_dirty_pages(inode);
535 set_cold_data(page); 540 set_cold_data(page);
536 do_write_data_page(page, &fio); 541 do_write_data_page(page, &fio);
537 clear_cold_data(page); 542 clear_cold_data(page);
@@ -593,7 +598,7 @@ next_step:
593 598
594 if (phase == 2) { 599 if (phase == 2) {
595 inode = f2fs_iget(sb, dni.ino); 600 inode = f2fs_iget(sb, dni.ino);
596 if (IS_ERR(inode)) 601 if (IS_ERR(inode) || is_bad_inode(inode))
597 continue; 602 continue;
598 603
599 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); 604 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -688,17 +693,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
688 int gc_type = BG_GC; 693 int gc_type = BG_GC;
689 int nfree = 0; 694 int nfree = 0;
690 int ret = -1; 695 int ret = -1;
696 struct cp_control cpc = {
697 .reason = CP_SYNC,
698 };
691 699
692 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
693gc_more: 701gc_more:
694 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
695 goto stop; 703 goto stop;
696 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) 704 if (unlikely(f2fs_cp_error(sbi)))
697 goto stop; 705 goto stop;
698 706
699 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 707 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
700 gc_type = FG_GC; 708 gc_type = FG_GC;
701 write_checkpoint(sbi, false); 709 write_checkpoint(sbi, &cpc);
702 } 710 }
703 711
704 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 712 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
723 goto gc_more; 731 goto gc_more;
724 732
725 if (gc_type == FG_GC) 733 if (gc_type == FG_GC)
726 write_checkpoint(sbi, false); 734 write_checkpoint(sbi, &cpc);
727stop: 735stop:
728 mutex_unlock(&sbi->gc_mutex); 736 mutex_unlock(&sbi->gc_mutex);
729 737
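
The get_cb_cost() hunk above keeps the utilization step u = (vblocks * 100) >> log_blocks_per_seg. A toy check of that arithmetic, assuming 512 blocks per segment (log_blocks_per_seg == 9); the block counts are made up:

#include <stdio.h>

int main(void)
{
	unsigned int log_blocks_per_seg = 9;	/* 512 blocks per 2MB segment */
	unsigned int vblocks = 128;		/* assumed valid blocks in the victim */
	unsigned int u = (vblocks * 100) >> log_blocks_per_seg;

	/* 128 * 100 = 12800; 12800 >> 9 = 25, i.e. the segment is 25% valid */
	printf("u = %u%%\n", u);
	return 0;
}
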
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb6047bf4..16f0b2b22999 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
91 block_t invalid_user_blocks = sbi->user_block_count - 91 block_t invalid_user_blocks = sbi->user_block_count -
92 written_block_count(sbi); 92 written_block_count(sbi);
93 /* 93 /*
94 * Background GC is triggered with the following condition. 94 * Background GC is triggered with the following conditions.
95 * 1. There are a number of invalid blocks. 95 * 1. There are a number of invalid blocks.
96 * 2. There is not enough free space. 96 * 2. There is not enough free space.
97 */ 97 */
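
has_enough_invalid_blocks() compares the invalid user blocks computed above against a threshold before letting background GC run. A toy computation with assumed block counts, just to make the quantities concrete:

#include <stdio.h>

int main(void)
{
	unsigned long long user_block_count = 1000000;	/* assumed capacity */
	unsigned long long written = 800000;		/* currently valid blocks */
	unsigned long long invalid = user_block_count - written;

	/* 20% of user blocks already invalidated: worth cleaning */
	printf("invalid_user_blocks = %llu (%.0f%%)\n", invalid,
	       100.0 * invalid / user_block_count);
	return 0;
}
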
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 948d17bf7281..a844fcfb9a8d 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[])
42 buf[1] += b1; 42 buf[1] += b1;
43} 43}
44 44
45static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) 45static void str2hashbuf(const unsigned char *msg, size_t len,
46 unsigned int *buf, int num)
46{ 47{
47 unsigned pad, val; 48 unsigned pad, val;
48 int i; 49 int i;
@@ -73,9 +74,9 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
73{ 74{
74 __u32 hash; 75 __u32 hash;
75 f2fs_hash_t f2fs_hash; 76 f2fs_hash_t f2fs_hash;
76 const char *p; 77 const unsigned char *p;
77 __u32 in[8], buf[4]; 78 __u32 in[8], buf[4];
78 const char *name = name_info->name; 79 const unsigned char *name = name_info->name;
79 size_t len = name_info->len; 80 size_t len = name_info->len;
80 81
81 if ((len <= 2) && (name[0] == '.') && 82 if ((len <= 2) && (name[0] == '.') &&
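
The switch from char to unsigned char matters because str2hashbuf() accumulates bytes as val = msg[i] + (val << 8); on targets where plain char is signed, bytes >= 0x80 sign-extend and corrupt the hash. A standalone sketch of just that widening step (the byte value is illustrative):

#include <stdio.h>

int main(void)
{
	char s = (char)0xe9;		/* a high byte, e.g. from a UTF-8 name */
	unsigned char u = 0xe9;
	unsigned int val = 0;

	/* the str2hashbuf() accumulation shape: val = byte + (val << 8) */
	unsigned int via_signed = s + (val << 8);	/* sign-extends */
	unsigned int via_unsigned = u + (val << 8);	/* zero-extends */

	printf("signed char  : %08x\n", via_signed);	/* ffffffe9 */
	printf("unsigned char: %08x\n", via_unsigned);	/* 000000e9 */
	return 0;
}
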
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5beeccef9ae1..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
15 15
16bool f2fs_may_inline(struct inode *inode) 16bool f2fs_may_inline(struct inode *inode)
17{ 17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks; 18 block_t nr_blocks;
20 loff_t i_size; 19 loff_t i_size;
21 20
22 if (!test_opt(sbi, INLINE_DATA)) 21 if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
22 return false;
23
24 if (f2fs_is_atomic_file(inode))
23 return false; 25 return false;
24 26
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; 27 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
35 37
36int f2fs_read_inline_data(struct inode *inode, struct page *page) 38int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{ 39{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage; 40 struct page *ipage;
40 void *src_addr, *dst_addr; 41 void *src_addr, *dst_addr;
41 42
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
44 goto out; 45 goto out;
45 } 46 }
46 47
47 ipage = get_node_page(sbi, inode->i_ino); 48 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
48 if (IS_ERR(ipage)) { 49 if (IS_ERR(ipage)) {
49 unlock_page(page); 50 unlock_page(page);
50 return PTR_ERR(ipage); 51 return PTR_ERR(ipage);
@@ -68,12 +69,12 @@ out:
68 69
69static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) 70static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
70{ 71{
71 int err; 72 int err = 0;
72 struct page *ipage; 73 struct page *ipage;
73 struct dnode_of_data dn; 74 struct dnode_of_data dn;
74 void *src_addr, *dst_addr; 75 void *src_addr, *dst_addr;
75 block_t new_blk_addr; 76 block_t new_blk_addr;
76 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 77 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
77 struct f2fs_io_info fio = { 78 struct f2fs_io_info fio = {
78 .type = DATA, 79 .type = DATA,
79 .rw = WRITE_SYNC | REQ_PRIO, 80 .rw = WRITE_SYNC | REQ_PRIO,
@@ -86,6 +87,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
86 goto out; 87 goto out;
87 } 88 }
88 89
90 /* someone else converted inline_data already */
91 if (!f2fs_has_inline_data(inode))
92 goto out;
93
89 /* 94 /*
90 * i_addr[0] is not used for inline data, 95 * i_addr[0] is not used for inline data,
91 * so reserving new block will not destroy inline data 96 * so reserving new block will not destroy inline data
@@ -124,9 +129,10 @@ out:
124 return err; 129 return err;
125} 130}
126 131
127int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) 132int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
133 struct page *page)
128{ 134{
129 struct page *page; 135 struct page *new_page = page;
130 int err; 136 int err;
131 137
132 if (!f2fs_has_inline_data(inode)) 138 if (!f2fs_has_inline_data(inode))
@@ -134,17 +140,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
134 else if (to_size <= MAX_INLINE_DATA) 140 else if (to_size <= MAX_INLINE_DATA)
135 return 0; 141 return 0;
136 142
137 page = grab_cache_page(inode->i_mapping, 0); 143 if (!page || page->index != 0) {
138 if (!page) 144 new_page = grab_cache_page(inode->i_mapping, 0);
139 return -ENOMEM; 145 if (!new_page)
146 return -ENOMEM;
147 }
140 148
141 err = __f2fs_convert_inline_data(inode, page); 149 err = __f2fs_convert_inline_data(inode, new_page);
142 f2fs_put_page(page, 1); 150 if (!page || page->index != 0)
151 f2fs_put_page(new_page, 1);
143 return err; 152 return err;
144} 153}
145 154
146int f2fs_write_inline_data(struct inode *inode, 155int f2fs_write_inline_data(struct inode *inode,
147 struct page *page, unsigned size) 156 struct page *page, unsigned size)
148{ 157{
149 void *src_addr, *dst_addr; 158 void *src_addr, *dst_addr;
150 struct page *ipage; 159 struct page *ipage;
@@ -181,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
181 190
182void truncate_inline_data(struct inode *inode, u64 from) 191void truncate_inline_data(struct inode *inode, u64 from)
183{ 192{
184 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
185 struct page *ipage; 193 struct page *ipage;
186 194
187 if (from >= MAX_INLINE_DATA) 195 if (from >= MAX_INLINE_DATA)
188 return; 196 return;
189 197
190 ipage = get_node_page(sbi, inode->i_ino); 198 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
191 if (IS_ERR(ipage)) 199 if (IS_ERR(ipage))
192 return; 200 return;
193 201
@@ -199,9 +207,9 @@ void truncate_inline_data(struct inode *inode, u64 from)
199 f2fs_put_page(ipage, 1); 207 f2fs_put_page(ipage, 1);
200} 208}
201 209
202int recover_inline_data(struct inode *inode, struct page *npage) 210bool recover_inline_data(struct inode *inode, struct page *npage)
203{ 211{
204 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
205 struct f2fs_inode *ri = NULL; 213 struct f2fs_inode *ri = NULL;
206 void *src_addr, *dst_addr; 214 void *src_addr, *dst_addr;
207 struct page *ipage; 215 struct page *ipage;
@@ -218,10 +226,10 @@ int recover_inline_data(struct inode *inode, struct page *npage)
218 ri = F2FS_INODE(npage); 226 ri = F2FS_INODE(npage);
219 227
220 if (f2fs_has_inline_data(inode) && 228 if (f2fs_has_inline_data(inode) &&
221 ri && ri->i_inline & F2FS_INLINE_DATA) { 229 ri && (ri->i_inline & F2FS_INLINE_DATA)) {
222process_inline: 230process_inline:
223 ipage = get_node_page(sbi, inode->i_ino); 231 ipage = get_node_page(sbi, inode->i_ino);
224 f2fs_bug_on(IS_ERR(ipage)); 232 f2fs_bug_on(sbi, IS_ERR(ipage));
225 233
226 f2fs_wait_on_page_writeback(ipage, NODE); 234 f2fs_wait_on_page_writeback(ipage, NODE);
227 235
@@ -230,22 +238,22 @@ process_inline:
230 memcpy(dst_addr, src_addr, MAX_INLINE_DATA); 238 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
231 update_inode(inode, ipage); 239 update_inode(inode, ipage);
232 f2fs_put_page(ipage, 1); 240 f2fs_put_page(ipage, 1);
233 return -1; 241 return true;
234 } 242 }
235 243
236 if (f2fs_has_inline_data(inode)) { 244 if (f2fs_has_inline_data(inode)) {
237 ipage = get_node_page(sbi, inode->i_ino); 245 ipage = get_node_page(sbi, inode->i_ino);
238 f2fs_bug_on(IS_ERR(ipage)); 246 f2fs_bug_on(sbi, IS_ERR(ipage));
239 f2fs_wait_on_page_writeback(ipage, NODE); 247 f2fs_wait_on_page_writeback(ipage, NODE);
240 zero_user_segment(ipage, INLINE_DATA_OFFSET, 248 zero_user_segment(ipage, INLINE_DATA_OFFSET,
241 INLINE_DATA_OFFSET + MAX_INLINE_DATA); 249 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
242 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); 250 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
243 update_inode(inode, ipage); 251 update_inode(inode, ipage);
244 f2fs_put_page(ipage, 1); 252 f2fs_put_page(ipage, 1);
245 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { 253 } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
246 truncate_blocks(inode, 0); 254 truncate_blocks(inode, 0, false);
247 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); 255 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
248 goto process_inline; 256 goto process_inline;
249 } 257 }
250 return 0; 258 return false;
251} 259}
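
The new "someone else converted inline_data already" re-check in __f2fs_convert_inline_data() is the usual check/lock/re-check pattern. A userspace pthread sketch of the same shape; all names here are stand-ins, not f2fs APIs:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool has_inline_data = true;

static void convert_once(void)
{
	if (!has_inline_data)		/* unlocked fast path */
		return;

	pthread_mutex_lock(&lock);
	if (has_inline_data) {		/* re-check: a racer may have won */
		/* ... move the inline payload out to a data block ... */
		has_inline_data = false;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	convert_once();
	convert_once();			/* second call returns immediately */
	printf("converted: %s\n", has_inline_data ? "no" : "yes");
	return 0;
}
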
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
69 69
70static int do_read_inode(struct inode *inode) 70static int do_read_inode(struct inode *inode)
71{ 71{
72 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 72 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
73 struct f2fs_inode_info *fi = F2FS_I(inode); 73 struct f2fs_inode_info *fi = F2FS_I(inode);
74 struct page *node_page; 74 struct page *node_page;
75 struct f2fs_inode *ri; 75 struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
218 218
219void update_inode_page(struct inode *inode) 219void update_inode_page(struct inode *inode)
220{ 220{
221 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 221 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
222 struct page *node_page; 222 struct page *node_page;
223retry: 223retry:
224 node_page = get_node_page(sbi, inode->i_ino); 224 node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
238 238
239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) 239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
240{ 240{
241 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 241 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
242 242
243 if (inode->i_ino == F2FS_NODE_INO(sbi) || 243 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
244 inode->i_ino == F2FS_META_INO(sbi)) 244 inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
266 */ 266 */
267void f2fs_evict_inode(struct inode *inode) 267void f2fs_evict_inode(struct inode *inode)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 nid_t xnid = F2FS_I(inode)->i_xattr_nid; 270 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
271 271
272 /* some remaining atomic pages should be discarded */
273 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
274 commit_inmem_pages(inode, true);
275
272 trace_f2fs_evict_inode(inode); 276 trace_f2fs_evict_inode(inode);
273 truncate_inode_pages_final(&inode->i_data); 277 truncate_inode_pages_final(&inode->i_data);
274 278
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
276 inode->i_ino == F2FS_META_INO(sbi)) 280 inode->i_ino == F2FS_META_INO(sbi))
277 goto out_clear; 281 goto out_clear;
278 282
279 f2fs_bug_on(get_dirty_dents(inode)); 283 f2fs_bug_on(sbi, get_dirty_pages(inode));
280 remove_dirty_dir_inode(inode); 284 remove_dirty_dir_inode(inode);
281 285
282 if (inode->i_nlink || is_bad_inode(inode)) 286 if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
306out_clear: 310out_clear:
307 clear_inode(inode); 311 clear_inode(inode);
308} 312}
313
314/* caller should call f2fs_lock_op() */
315void handle_failed_inode(struct inode *inode)
316{
317 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
318
319 clear_nlink(inode);
320 make_bad_inode(inode);
321 unlock_new_inode(inode);
322
323 i_size_write(inode, 0);
324 if (F2FS_HAS_BLOCKS(inode))
325 f2fs_truncate(inode);
326
327 remove_inode_page(inode);
328 stat_dec_inline_inode(inode);
329
330 alloc_nid_failed(sbi, inode->i_ino);
331 f2fs_unlock_op(sbi);
332
333 /* iput will drop the inode object */
334 iput(inode);
335}
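
f2fs_evict_inode() now calls commit_inmem_pages(inode, true) so pages staged by an atomic or volatile writer are dropped rather than written back. A toy model of that commit-versus-abort life cycle; the structures are stand-ins, not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 4

static bool inmem[NPAGES];	/* pages staged by an atomic writer */

static void commit_inmem(bool drop)
{
	for (int i = 0; i < NPAGES; i++) {
		if (!inmem[i])
			continue;
		printf("page %d %s\n", i, drop ? "dropped" : "written");
		inmem[i] = false;
	}
}

int main(void)
{
	inmem[0] = inmem[2] = true;	/* two dirty staged pages */
	commit_inmem(true);		/* the eviction path: discard both */
	return 0;
}
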
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 27b03776ffd2..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
23 23
24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) 24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
25{ 25{
26 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 26 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
27 nid_t ino; 27 nid_t ino;
28 struct inode *inode; 28 struct inode *inode;
29 bool nid_free = false; 29 bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
103 bool excl) 103 bool excl)
104{ 104{
105 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 105 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
106 struct inode *inode; 106 struct inode *inode;
107 nid_t ino = 0; 107 nid_t ino = 0;
108 int err; 108 int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
123 123
124 f2fs_lock_op(sbi); 124 f2fs_lock_op(sbi);
125 err = f2fs_add_link(dentry, inode); 125 err = f2fs_add_link(dentry, inode);
126 f2fs_unlock_op(sbi);
127 if (err) 126 if (err)
128 goto out; 127 goto out;
128 f2fs_unlock_op(sbi);
129 129
130 alloc_nid_done(sbi, ino); 130 alloc_nid_done(sbi, ino);
131 131
@@ -133,11 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
133 unlock_new_inode(inode); 133 unlock_new_inode(inode);
134 return 0; 134 return 0;
135out: 135out:
136 clear_nlink(inode); 136 handle_failed_inode(inode);
137 unlock_new_inode(inode);
138 make_bad_inode(inode);
139 iput(inode);
140 alloc_nid_failed(sbi, ino);
141 return err; 137 return err;
142} 138}
143 139
@@ -145,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
145 struct dentry *dentry) 141 struct dentry *dentry)
146{ 142{
147 struct inode *inode = old_dentry->d_inode; 143 struct inode *inode = old_dentry->d_inode;
148 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 144 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
149 int err; 145 int err;
150 146
151 f2fs_balance_fs(sbi); 147 f2fs_balance_fs(sbi);
@@ -156,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
156 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 152 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
157 f2fs_lock_op(sbi); 153 f2fs_lock_op(sbi);
158 err = f2fs_add_link(dentry, inode); 154 err = f2fs_add_link(dentry, inode);
159 f2fs_unlock_op(sbi);
160 if (err) 155 if (err)
161 goto out; 156 goto out;
157 f2fs_unlock_op(sbi);
162 158
163 d_instantiate(dentry, inode); 159 d_instantiate(dentry, inode);
164 return 0; 160 return 0;
165out: 161out:
166 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 162 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
167 iput(inode); 163 iput(inode);
164 f2fs_unlock_op(sbi);
168 return err; 165 return err;
169} 166}
170 167
@@ -205,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
205 202
206static int f2fs_unlink(struct inode *dir, struct dentry *dentry) 203static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
207{ 204{
208 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 205 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
209 struct inode *inode = dentry->d_inode; 206 struct inode *inode = dentry->d_inode;
210 struct f2fs_dir_entry *de; 207 struct f2fs_dir_entry *de;
211 struct page *page; 208 struct page *page;
@@ -229,7 +226,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
229 f2fs_delete_entry(de, page, inode); 226 f2fs_delete_entry(de, page, inode);
230 f2fs_unlock_op(sbi); 227 f2fs_unlock_op(sbi);
231 228
232 /* In order to evict this inode, we set it dirty */ 229 /* In order to evict this inode, we set it dirty */
233 mark_inode_dirty(inode); 230 mark_inode_dirty(inode);
234fail: 231fail:
235 trace_f2fs_unlink_exit(inode, err); 232 trace_f2fs_unlink_exit(inode, err);
@@ -239,7 +236,7 @@ fail:
239static int f2fs_symlink(struct inode *dir, struct dentry *dentry, 236static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
240 const char *symname) 237 const char *symname)
241{ 238{
242 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
243 struct inode *inode; 240 struct inode *inode;
244 size_t symlen = strlen(symname) + 1; 241 size_t symlen = strlen(symname) + 1;
245 int err; 242 int err;
@@ -255,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
255 252
256 f2fs_lock_op(sbi); 253 f2fs_lock_op(sbi);
257 err = f2fs_add_link(dentry, inode); 254 err = f2fs_add_link(dentry, inode);
258 f2fs_unlock_op(sbi);
259 if (err) 255 if (err)
260 goto out; 256 goto out;
257 f2fs_unlock_op(sbi);
261 258
262 err = page_symlink(inode, symname, symlen); 259 err = page_symlink(inode, symname, symlen);
263 alloc_nid_done(sbi, inode->i_ino); 260 alloc_nid_done(sbi, inode->i_ino);
@@ -266,17 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
266 unlock_new_inode(inode); 263 unlock_new_inode(inode);
267 return err; 264 return err;
268out: 265out:
269 clear_nlink(inode); 266 handle_failed_inode(inode);
270 unlock_new_inode(inode);
271 make_bad_inode(inode);
272 iput(inode);
273 alloc_nid_failed(sbi, inode->i_ino);
274 return err; 267 return err;
275} 268}
276 269
277static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 270static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
278{ 271{
279 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 272 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
280 struct inode *inode; 273 struct inode *inode;
281 int err; 274 int err;
282 275
@@ -294,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
294 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 287 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
295 f2fs_lock_op(sbi); 288 f2fs_lock_op(sbi);
296 err = f2fs_add_link(dentry, inode); 289 err = f2fs_add_link(dentry, inode);
297 f2fs_unlock_op(sbi);
298 if (err) 290 if (err)
299 goto out_fail; 291 goto out_fail;
292 f2fs_unlock_op(sbi);
300 293
301 alloc_nid_done(sbi, inode->i_ino); 294 alloc_nid_done(sbi, inode->i_ino);
302 295
@@ -307,11 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
307 300
308out_fail: 301out_fail:
309 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 302 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
310 clear_nlink(inode); 303 handle_failed_inode(inode);
311 unlock_new_inode(inode);
312 make_bad_inode(inode);
313 iput(inode);
314 alloc_nid_failed(sbi, inode->i_ino);
315 return err; 304 return err;
316} 305}
317 306
@@ -326,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
326static int f2fs_mknod(struct inode *dir, struct dentry *dentry, 315static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
327 umode_t mode, dev_t rdev) 316 umode_t mode, dev_t rdev)
328{ 317{
329 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 318 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
330 struct inode *inode; 319 struct inode *inode;
331 int err = 0; 320 int err = 0;
332 321
@@ -344,27 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
344 333
345 f2fs_lock_op(sbi); 334 f2fs_lock_op(sbi);
346 err = f2fs_add_link(dentry, inode); 335 err = f2fs_add_link(dentry, inode);
347 f2fs_unlock_op(sbi);
348 if (err) 336 if (err)
349 goto out; 337 goto out;
338 f2fs_unlock_op(sbi);
350 339
351 alloc_nid_done(sbi, inode->i_ino); 340 alloc_nid_done(sbi, inode->i_ino);
352 d_instantiate(dentry, inode); 341 d_instantiate(dentry, inode);
353 unlock_new_inode(inode); 342 unlock_new_inode(inode);
354 return 0; 343 return 0;
355out: 344out:
356 clear_nlink(inode); 345 handle_failed_inode(inode);
357 unlock_new_inode(inode);
358 make_bad_inode(inode);
359 iput(inode);
360 alloc_nid_failed(sbi, inode->i_ino);
361 return err; 346 return err;
362} 347}
363 348
364static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, 349static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
365 struct inode *new_dir, struct dentry *new_dentry) 350 struct inode *new_dir, struct dentry *new_dentry)
366{ 351{
367 struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); 352 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
368 struct inode *old_inode = old_dentry->d_inode; 353 struct inode *old_inode = old_dentry->d_inode;
369 struct inode *new_inode = new_dentry->d_inode; 354 struct inode *new_inode = new_dentry->d_inode;
370 struct page *old_dir_page; 355 struct page *old_dir_page;
@@ -488,8 +473,7 @@ out:
488static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, 473static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
489 struct inode *new_dir, struct dentry *new_dentry) 474 struct inode *new_dir, struct dentry *new_dentry)
490{ 475{
491 struct super_block *sb = old_dir->i_sb; 476 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
492 struct f2fs_sb_info *sbi = F2FS_SB(sb);
493 struct inode *old_inode = old_dentry->d_inode; 477 struct inode *old_inode = old_dentry->d_inode;
494 struct inode *new_inode = new_dentry->d_inode; 478 struct inode *new_inode = new_dentry->d_inode;
495 struct page *old_dir_page, *new_dir_page; 479 struct page *old_dir_page, *new_dir_page;
@@ -650,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
650 634
651static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 635static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
652{ 636{
653 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 637 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
654 struct inode *inode; 638 struct inode *inode;
655 int err; 639 int err;
656 640
@@ -686,12 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
686release_out: 670release_out:
687 release_orphan_inode(sbi); 671 release_orphan_inode(sbi);
688out: 672out:
689 f2fs_unlock_op(sbi); 673 handle_failed_inode(inode);
690 clear_nlink(inode);
691 unlock_new_inode(inode);
692 make_bad_inode(inode);
693 iput(inode);
694 alloc_nid_failed(sbi, inode->i_ino);
695 return err; 674 return err;
696} 675}
697 676
@@ -704,7 +683,6 @@ const struct inode_operations f2fs_dir_inode_operations = {
704 .mkdir = f2fs_mkdir, 683 .mkdir = f2fs_mkdir,
705 .rmdir = f2fs_rmdir, 684 .rmdir = f2fs_rmdir,
706 .mknod = f2fs_mknod, 685 .mknod = f2fs_mknod,
707 .rename = f2fs_rename,
708 .rename2 = f2fs_rename2, 686 .rename2 = f2fs_rename2,
709 .tmpfile = f2fs_tmpfile, 687 .tmpfile = f2fs_tmpfile,
710 .getattr = f2fs_getattr, 688 .getattr = f2fs_getattr,
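
Each namei hunk moves f2fs_unlock_op() below the error check: on failure, handle_failed_inode() must tear the inode down inside the same locked operation and release the lock itself. A userspace pthread analog of that ownership transfer (all names are stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t op_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_failed(void)
{
	printf("teardown still under the op lock\n");
	pthread_mutex_unlock(&op_lock);	/* the helper owns the unlock */
}

static int create_file(int fail)
{
	pthread_mutex_lock(&op_lock);	/* f2fs_lock_op() analog */
	if (fail) {
		handle_failed();	/* do not unlock before this */
		return -1;
	}
	pthread_mutex_unlock(&op_lock);	/* success path unlocks here */
	printf("created\n");
	return 0;
}

int main(void)
{
	create_file(1);
	create_file(0);
	return 0;
}
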
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d3d90d284631..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
54static void clear_node_page_dirty(struct page *page) 54static void clear_node_page_dirty(struct page *page)
55{ 55{
56 struct address_space *mapping = page->mapping; 56 struct address_space *mapping = page->mapping;
57 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
58 unsigned int long flags; 57 unsigned int long flags;
59 58
60 if (PageDirty(page)) { 59 if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
65 spin_unlock_irqrestore(&mapping->tree_lock, flags); 64 spin_unlock_irqrestore(&mapping->tree_lock, flags);
66 65
67 clear_page_dirty_for_io(page); 66 clear_page_dirty_for_io(page);
68 dec_page_count(sbi, F2FS_DIRTY_NODES); 67 dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
69 } 68 }
70 ClearPageUptodate(page); 69 ClearPageUptodate(page);
71} 70}
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
92 /* get current nat block page with lock */ 91 /* get current nat block page with lock */
93 src_page = get_meta_page(sbi, src_off); 92 src_page = get_meta_page(sbi, src_off);
94 dst_page = grab_meta_page(sbi, dst_off); 93 dst_page = grab_meta_page(sbi, dst_off);
95 f2fs_bug_on(PageDirty(src_page)); 94 f2fs_bug_on(sbi, PageDirty(src_page));
96 95
97 src_addr = page_address(src_page); 96 src_addr = page_address(src_page);
98 dst_addr = page_address(dst_page); 97 dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
124 kmem_cache_free(nat_entry_slab, e); 123 kmem_cache_free(nat_entry_slab, e);
125} 124}
126 125
127int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) 126static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
127 struct nat_entry *ne)
128{
129 nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
130 struct nat_entry_set *head;
131
132 if (get_nat_flag(ne, IS_DIRTY))
133 return;
134retry:
135 head = radix_tree_lookup(&nm_i->nat_set_root, set);
136 if (!head) {
137 head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
138
139 INIT_LIST_HEAD(&head->entry_list);
140 INIT_LIST_HEAD(&head->set_list);
141 head->set = set;
142 head->entry_cnt = 0;
143
144 if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
145 cond_resched();
146 goto retry;
147 }
148 }
149 list_move_tail(&ne->list, &head->entry_list);
150 nm_i->dirty_nat_cnt++;
151 head->entry_cnt++;
152 set_nat_flag(ne, IS_DIRTY, true);
153}
154
155static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
156 struct nat_entry *ne)
157{
158 nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
159 struct nat_entry_set *head;
160
161 head = radix_tree_lookup(&nm_i->nat_set_root, set);
162 if (head) {
163 list_move_tail(&ne->list, &nm_i->nat_entries);
164 set_nat_flag(ne, IS_DIRTY, false);
165 head->entry_cnt--;
166 nm_i->dirty_nat_cnt--;
167 }
168}
169
170static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
171 nid_t start, unsigned int nr, struct nat_entry_set **ep)
172{
173 return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
174 start, nr);
175}
176
177bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
128{ 178{
129 struct f2fs_nm_info *nm_i = NM_I(sbi); 179 struct f2fs_nm_info *nm_i = NM_I(sbi);
130 struct nat_entry *e; 180 struct nat_entry *e;
131 int is_cp = 1; 181 bool is_cp = true;
132 182
133 read_lock(&nm_i->nat_tree_lock); 183 read_lock(&nm_i->nat_tree_lock);
134 e = __lookup_nat_cache(nm_i, nid); 184 e = __lookup_nat_cache(nm_i, nid);
135 if (e && !e->checkpointed) 185 if (e && !get_nat_flag(e, IS_CHECKPOINTED))
136 is_cp = 0; 186 is_cp = false;
137 read_unlock(&nm_i->nat_tree_lock); 187 read_unlock(&nm_i->nat_tree_lock);
138 return is_cp; 188 return is_cp;
139} 189}
140 190
141bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) 191bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
142{ 192{
143 struct f2fs_nm_info *nm_i = NM_I(sbi); 193 struct f2fs_nm_info *nm_i = NM_I(sbi);
144 struct nat_entry *e; 194 struct nat_entry *e;
145 bool fsync_done = false; 195 bool fsynced = false;
146 196
147 read_lock(&nm_i->nat_tree_lock); 197 read_lock(&nm_i->nat_tree_lock);
148 e = __lookup_nat_cache(nm_i, nid); 198 e = __lookup_nat_cache(nm_i, ino);
149 if (e) 199 if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
150 fsync_done = e->fsync_done; 200 fsynced = true;
151 read_unlock(&nm_i->nat_tree_lock); 201 read_unlock(&nm_i->nat_tree_lock);
152 return fsync_done; 202 return fsynced;
153} 203}
154 204
155void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) 205bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
156{ 206{
157 struct f2fs_nm_info *nm_i = NM_I(sbi); 207 struct f2fs_nm_info *nm_i = NM_I(sbi);
158 struct nat_entry *e; 208 struct nat_entry *e;
209 bool need_update = true;
159 210
160 write_lock(&nm_i->nat_tree_lock); 211 read_lock(&nm_i->nat_tree_lock);
161 e = __lookup_nat_cache(nm_i, nid); 212 e = __lookup_nat_cache(nm_i, ino);
162 if (e) 213 if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
163 e->fsync_done = false; 214 (get_nat_flag(e, IS_CHECKPOINTED) ||
164 write_unlock(&nm_i->nat_tree_lock); 215 get_nat_flag(e, HAS_FSYNCED_INODE)))
216 need_update = false;
217 read_unlock(&nm_i->nat_tree_lock);
218 return need_update;
165} 219}
166 220
167static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 221static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
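
nat_entry_set groups dirty NAT entries by the NAT block that holds them, so flushing a set dirties one on-disk block. A toy of the set computation; 455 entries per 4KB NAT block matches the 9-byte raw entry size of this era, but treat the constant as illustrative:

#include <stdio.h>

#define NAT_ENTRY_PER_BLOCK 455	/* ~4096 / 9-byte raw NAT entries */

int main(void)
{
	unsigned int nids[] = { 3, 454, 455, 900, 910 };

	for (unsigned int i = 0; i < sizeof(nids) / sizeof(nids[0]); i++)
		printf("nid %u -> set %u\n", nids[i],
		       nids[i] / NAT_ENTRY_PER_BLOCK);
	/* nids 3 and 454 land in set 0: one dirty NAT block, one write */
	return 0;
}
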
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
177 } 231 }
178 memset(new, 0, sizeof(struct nat_entry)); 232 memset(new, 0, sizeof(struct nat_entry));
179 nat_set_nid(new, nid); 233 nat_set_nid(new, nid);
180 new->checkpointed = true; 234 nat_reset_flag(new);
181 list_add_tail(&new->list, &nm_i->nat_entries); 235 list_add_tail(&new->list, &nm_i->nat_entries);
182 nm_i->nat_cnt++; 236 nm_i->nat_cnt++;
183 return new; 237 return new;
@@ -216,7 +270,7 @@ retry:
216 goto retry; 270 goto retry;
217 } 271 }
218 e->ni = *ni; 272 e->ni = *ni;
219 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 273 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
220 } else if (new_blkaddr == NEW_ADDR) { 274 } else if (new_blkaddr == NEW_ADDR) {
221 /* 275 /*
222 * when nid is reallocated, 276 * when nid is reallocated,
@@ -224,20 +278,20 @@ retry:
224 * So, reinitialize it with new information. 278 * So, reinitialize it with new information.
225 */ 279 */
226 e->ni = *ni; 280 e->ni = *ni;
227 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 281 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
228 } 282 }
229 283
230 /* sanity check */ 284 /* sanity check */
231 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); 285 f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
232 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && 286 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
233 new_blkaddr == NULL_ADDR); 287 new_blkaddr == NULL_ADDR);
234 f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && 288 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
235 new_blkaddr == NEW_ADDR); 289 new_blkaddr == NEW_ADDR);
236 f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && 290 f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
237 nat_get_blkaddr(e) != NULL_ADDR && 291 nat_get_blkaddr(e) != NULL_ADDR &&
238 new_blkaddr == NEW_ADDR); 292 new_blkaddr == NEW_ADDR);
239 293
240 /* increament version no as node is removed */ 294 /* increment version no as node is removed */
241 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { 295 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
242 unsigned char version = nat_get_version(e); 296 unsigned char version = nat_get_version(e);
243 nat_set_version(e, inc_node_version(version)); 297 nat_set_version(e, inc_node_version(version));
@@ -245,12 +299,17 @@ retry:
245 299
246 /* change address */ 300 /* change address */
247 nat_set_blkaddr(e, new_blkaddr); 301 nat_set_blkaddr(e, new_blkaddr);
302 if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
303 set_nat_flag(e, IS_CHECKPOINTED, false);
248 __set_nat_cache_dirty(nm_i, e); 304 __set_nat_cache_dirty(nm_i, e);
249 305
250 /* update fsync_mark if its inode nat entry is still alive */ 306 /* update fsync_mark if its inode nat entry is still alive */
251 e = __lookup_nat_cache(nm_i, ni->ino); 307 e = __lookup_nat_cache(nm_i, ni->ino);
252 if (e) 308 if (e) {
253 e->fsync_done = fsync_done; 309 if (fsync_done && ni->nid == ni->ino)
310 set_nat_flag(e, HAS_FSYNCED_INODE, true);
311 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
312 }
254 write_unlock(&nm_i->nat_tree_lock); 313 write_unlock(&nm_i->nat_tree_lock);
255} 314}
256 315
@@ -274,7 +333,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
274} 333}
275 334
276/* 335/*
277 * This function returns always success 336 * This function always returns success
278 */ 337 */
279void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) 338void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
280{ 339{
@@ -411,7 +470,7 @@ got:
411 */ 470 */
412int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 471int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
413{ 472{
414 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 473 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
415 struct page *npage[4]; 474 struct page *npage[4];
416 struct page *parent; 475 struct page *parent;
417 int offset[4]; 476 int offset[4];
@@ -504,15 +563,15 @@ release_out:
504 563
505static void truncate_node(struct dnode_of_data *dn) 564static void truncate_node(struct dnode_of_data *dn)
506{ 565{
507 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 566 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
508 struct node_info ni; 567 struct node_info ni;
509 568
510 get_node_info(sbi, dn->nid, &ni); 569 get_node_info(sbi, dn->nid, &ni);
511 if (dn->inode->i_blocks == 0) { 570 if (dn->inode->i_blocks == 0) {
512 f2fs_bug_on(ni.blk_addr != NULL_ADDR); 571 f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
513 goto invalidate; 572 goto invalidate;
514 } 573 }
515 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 574 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
516 575
517 /* Deallocate node address */ 576 /* Deallocate node address */
518 invalidate_blocks(sbi, ni.blk_addr); 577 invalidate_blocks(sbi, ni.blk_addr);
@@ -540,14 +599,13 @@ invalidate:
540 599
541static int truncate_dnode(struct dnode_of_data *dn) 600static int truncate_dnode(struct dnode_of_data *dn)
542{ 601{
543 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
544 struct page *page; 602 struct page *page;
545 603
546 if (dn->nid == 0) 604 if (dn->nid == 0)
547 return 1; 605 return 1;
548 606
549 /* get direct node */ 607 /* get direct node */
550 page = get_node_page(sbi, dn->nid); 608 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
551 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) 609 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
552 return 1; 610 return 1;
553 else if (IS_ERR(page)) 611 else if (IS_ERR(page))
@@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
564static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, 622static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
565 int ofs, int depth) 623 int ofs, int depth)
566{ 624{
567 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
568 struct dnode_of_data rdn = *dn; 625 struct dnode_of_data rdn = *dn;
569 struct page *page; 626 struct page *page;
570 struct f2fs_node *rn; 627 struct f2fs_node *rn;
@@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
578 635
579 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); 636 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
580 637
581 page = get_node_page(sbi, dn->nid); 638 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
582 if (IS_ERR(page)) { 639 if (IS_ERR(page)) {
583 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); 640 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
584 return PTR_ERR(page); 641 return PTR_ERR(page);
@@ -636,7 +693,6 @@ out_err:
636static int truncate_partial_nodes(struct dnode_of_data *dn, 693static int truncate_partial_nodes(struct dnode_of_data *dn,
637 struct f2fs_inode *ri, int *offset, int depth) 694 struct f2fs_inode *ri, int *offset, int depth)
638{ 695{
639 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
640 struct page *pages[2]; 696 struct page *pages[2];
641 nid_t nid[3]; 697 nid_t nid[3];
642 nid_t child_nid; 698 nid_t child_nid;
@@ -650,8 +706,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
650 706
651 /* get indirect nodes in the path */ 707 /* get indirect nodes in the path */
652 for (i = 0; i < idx + 1; i++) { 708 for (i = 0; i < idx + 1; i++) {
653 /* refernece count'll be increased */ 709 /* reference count will be increased */
654 pages[i] = get_node_page(sbi, nid[i]); 710 pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
655 if (IS_ERR(pages[i])) { 711 if (IS_ERR(pages[i])) {
656 err = PTR_ERR(pages[i]); 712 err = PTR_ERR(pages[i]);
657 idx = i - 1; 713 idx = i - 1;
@@ -696,7 +752,7 @@ fail:
696 */ 752 */
697int truncate_inode_blocks(struct inode *inode, pgoff_t from) 753int truncate_inode_blocks(struct inode *inode, pgoff_t from)
698{ 754{
699 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 755 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
700 int err = 0, cont = 1; 756 int err = 0, cont = 1;
701 int level, offset[4], noffset[4]; 757 int level, offset[4], noffset[4];
702 unsigned int nofs = 0; 758 unsigned int nofs = 0;
@@ -792,7 +848,7 @@ fail:
792 848
793int truncate_xattr_node(struct inode *inode, struct page *page) 849int truncate_xattr_node(struct inode *inode, struct page *page)
794{ 850{
795 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 851 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
796 nid_t nid = F2FS_I(inode)->i_xattr_nid; 852 nid_t nid = F2FS_I(inode)->i_xattr_nid;
797 struct dnode_of_data dn; 853 struct dnode_of_data dn;
798 struct page *npage; 854 struct page *npage;
@@ -823,22 +879,27 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
823 */ 879 */
824void remove_inode_page(struct inode *inode) 880void remove_inode_page(struct inode *inode)
825{ 881{
826 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
827 struct page *page;
828 nid_t ino = inode->i_ino;
829 struct dnode_of_data dn; 882 struct dnode_of_data dn;
830 883
831 page = get_node_page(sbi, ino); 884 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
832 if (IS_ERR(page)) 885 if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
833 return; 886 return;
834 887
835 if (truncate_xattr_node(inode, page)) { 888 if (truncate_xattr_node(inode, dn.inode_page)) {
836 f2fs_put_page(page, 1); 889 f2fs_put_dnode(&dn);
837 return; 890 return;
838 } 891 }
839 /* 0 is possible, after f2fs_new_inode() is failed */ 892
840 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 893 /* remove potential inline_data blocks */
841 set_new_dnode(&dn, inode, page, page, ino); 894 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
895 S_ISLNK(inode->i_mode))
896 truncate_data_blocks_range(&dn, 1);
897
898 /* 0 is possible, after f2fs_new_inode() has failed */
899 f2fs_bug_on(F2FS_I_SB(inode),
900 inode->i_blocks != 0 && inode->i_blocks != 1);
901
902 /* will put inode & node pages */
842 truncate_node(&dn); 903 truncate_node(&dn);
843} 904}
844 905
@@ -856,7 +917,7 @@ struct page *new_inode_page(struct inode *inode)
856struct page *new_node_page(struct dnode_of_data *dn, 917struct page *new_node_page(struct dnode_of_data *dn,
857 unsigned int ofs, struct page *ipage) 918 unsigned int ofs, struct page *ipage)
858{ 919{
859 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 920 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
860 struct node_info old_ni, new_ni; 921 struct node_info old_ni, new_ni;
861 struct page *page; 922 struct page *page;
862 int err; 923 int err;
@@ -876,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
876 get_node_info(sbi, dn->nid, &old_ni); 937 get_node_info(sbi, dn->nid, &old_ni);
877 938
878 /* Reinitialize old_ni with new node page */ 939 /* Reinitialize old_ni with new node page */
879 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 940 f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
880 new_ni = old_ni; 941 new_ni = old_ni;
881 new_ni.ino = dn->inode->i_ino; 942 new_ni.ino = dn->inode->i_ino;
882 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 943 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -914,7 +975,7 @@ fail:
914 */ 975 */
915static int read_node_page(struct page *page, int rw) 976static int read_node_page(struct page *page, int rw)
916{ 977{
917 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 978 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
918 struct node_info ni; 979 struct node_info ni;
919 980
920 get_node_info(sbi, page->index, &ni); 981 get_node_info(sbi, page->index, &ni);
@@ -990,7 +1051,7 @@ got_it:
990 */ 1051 */
991struct page *get_node_page_ra(struct page *parent, int start) 1052struct page *get_node_page_ra(struct page *parent, int start)
992{ 1053{
993 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 1054 struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
994 struct blk_plug plug; 1055 struct blk_plug plug;
995 struct page *page; 1056 struct page *page;
996 int err, i, end; 1057 int err, i, end;
@@ -1120,17 +1181,24 @@ continue_unlock:
1120 1181
1121 /* called by fsync() */ 1182 /* called by fsync() */
1122 if (ino && IS_DNODE(page)) { 1183 if (ino && IS_DNODE(page)) {
1123 int mark = !is_checkpointed_node(sbi, ino);
1124 set_fsync_mark(page, 1); 1184 set_fsync_mark(page, 1);
1125 if (IS_INODE(page)) 1185 if (IS_INODE(page)) {
1126 set_dentry_mark(page, mark); 1186 if (!is_checkpointed_node(sbi, ino) &&
1187 !has_fsynced_inode(sbi, ino))
1188 set_dentry_mark(page, 1);
1189 else
1190 set_dentry_mark(page, 0);
1191 }
1127 nwritten++; 1192 nwritten++;
1128 } else { 1193 } else {
1129 set_fsync_mark(page, 0); 1194 set_fsync_mark(page, 0);
1130 set_dentry_mark(page, 0); 1195 set_dentry_mark(page, 0);
1131 } 1196 }
1132 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); 1197
1133 wrote++; 1198 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
1199 unlock_page(page);
1200 else
1201 wrote++;
1134 1202
1135 if (--wbc->nr_to_write == 0) 1203 if (--wbc->nr_to_write == 0)
1136 break; 1204 break;
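
The hunk above makes the node-page sync loop check the ->writepage() return, unlocking pages that failed instead of counting them as written. A toy loop with the same accounting shape (the failure is simulated):

#include <stdio.h>

static int writepage(int i)
{
	return i == 2 ? -1 : 0;		/* pretend one writeout fails */
}

int main(void)
{
	int wrote = 0;

	for (int i = 0; i < 4; i++) {
		if (writepage(i))
			printf("page %d unlocked, not counted\n", i);
		else
			wrote++;
	}
	printf("wrote %d pages\n", wrote);
	return 0;
}
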
@@ -1199,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1199static int f2fs_write_node_page(struct page *page, 1267static int f2fs_write_node_page(struct page *page,
1200 struct writeback_control *wbc) 1268 struct writeback_control *wbc)
1201{ 1269{
1202 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1270 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1203 nid_t nid; 1271 nid_t nid;
1204 block_t new_addr; 1272 block_t new_addr;
1205 struct node_info ni; 1273 struct node_info ni;
@@ -1212,12 +1280,14 @@ static int f2fs_write_node_page(struct page *page,
1212 1280
1213 if (unlikely(sbi->por_doing)) 1281 if (unlikely(sbi->por_doing))
1214 goto redirty_out; 1282 goto redirty_out;
1283 if (unlikely(f2fs_cp_error(sbi)))
1284 goto redirty_out;
1215 1285
1216 f2fs_wait_on_page_writeback(page, NODE); 1286 f2fs_wait_on_page_writeback(page, NODE);
1217 1287
1218 /* get old block addr of this node page */ 1288 /* get old block addr of this node page */
1219 nid = nid_of_node(page); 1289 nid = nid_of_node(page);
1220 f2fs_bug_on(page->index != nid); 1290 f2fs_bug_on(sbi, page->index != nid);
1221 1291
1222 get_node_info(sbi, nid, &ni); 1292 get_node_info(sbi, nid, &ni);
1223 1293
@@ -1248,7 +1318,7 @@ redirty_out:
1248static int f2fs_write_node_pages(struct address_space *mapping, 1318static int f2fs_write_node_pages(struct address_space *mapping,
1249 struct writeback_control *wbc) 1319 struct writeback_control *wbc)
1250{ 1320{
1251 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1321 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1252 long diff; 1322 long diff;
1253 1323
1254 trace_f2fs_writepages(mapping->host, wbc, NODE); 1324 trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1273,15 +1343,12 @@ skip_write:
1273 1343
1274static int f2fs_set_node_page_dirty(struct page *page) 1344static int f2fs_set_node_page_dirty(struct page *page)
1275{ 1345{
1276 struct address_space *mapping = page->mapping;
1277 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1278
1279 trace_f2fs_set_page_dirty(page, NODE); 1346 trace_f2fs_set_page_dirty(page, NODE);
1280 1347
1281 SetPageUptodate(page); 1348 SetPageUptodate(page);
1282 if (!PageDirty(page)) { 1349 if (!PageDirty(page)) {
1283 __set_page_dirty_nobuffers(page); 1350 __set_page_dirty_nobuffers(page);
1284 inc_page_count(sbi, F2FS_DIRTY_NODES); 1351 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1285 SetPagePrivate(page); 1352 SetPagePrivate(page);
1286 return 1; 1353 return 1;
1287 } 1354 }
@@ -1292,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1292 unsigned int length) 1359 unsigned int length)
1293{ 1360{
1294 struct inode *inode = page->mapping->host; 1361 struct inode *inode = page->mapping->host;
1295 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1296 if (PageDirty(page)) 1362 if (PageDirty(page))
1297 dec_page_count(sbi, F2FS_DIRTY_NODES); 1363 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
1298 ClearPagePrivate(page); 1364 ClearPagePrivate(page);
1299} 1365}
1300 1366
@@ -1347,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1347 read_lock(&nm_i->nat_tree_lock); 1413 read_lock(&nm_i->nat_tree_lock);
1348 ne = __lookup_nat_cache(nm_i, nid); 1414 ne = __lookup_nat_cache(nm_i, nid);
1349 if (ne && 1415 if (ne &&
1350 (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) 1416 (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1417 nat_get_blkaddr(ne) != NULL_ADDR))
1351 allocated = true; 1418 allocated = true;
1352 read_unlock(&nm_i->nat_tree_lock); 1419 read_unlock(&nm_i->nat_tree_lock);
1353 if (allocated) 1420 if (allocated)
@@ -1404,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
1404 break; 1471 break;
1405 1472
1406 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1473 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1407 f2fs_bug_on(blk_addr == NEW_ADDR); 1474 f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1408 if (blk_addr == NULL_ADDR) { 1475 if (blk_addr == NULL_ADDR) {
1409 if (add_free_nid(sbi, start_nid, true) < 0) 1476 if (add_free_nid(sbi, start_nid, true) < 0)
1410 break; 1477 break;
@@ -1474,12 +1541,12 @@ retry:
1474 1541
1475 /* We should not use stale free nids created by build_free_nids */ 1542 /* We should not use stale free nids created by build_free_nids */
1476 if (nm_i->fcnt && !on_build_free_nids(nm_i)) { 1543 if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1477 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1544 f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
1478 list_for_each_entry(i, &nm_i->free_nid_list, list) 1545 list_for_each_entry(i, &nm_i->free_nid_list, list)
1479 if (i->state == NID_NEW) 1546 if (i->state == NID_NEW)
1480 break; 1547 break;
1481 1548
1482 f2fs_bug_on(i->state != NID_NEW); 1549 f2fs_bug_on(sbi, i->state != NID_NEW);
1483 *nid = i->nid; 1550 *nid = i->nid;
1484 i->state = NID_ALLOC; 1551 i->state = NID_ALLOC;
1485 nm_i->fcnt--; 1552 nm_i->fcnt--;
@@ -1505,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1505 1572
1506 spin_lock(&nm_i->free_nid_list_lock); 1573 spin_lock(&nm_i->free_nid_list_lock);
1507 i = __lookup_free_nid_list(nm_i, nid); 1574 i = __lookup_free_nid_list(nm_i, nid);
1508 f2fs_bug_on(!i || i->state != NID_ALLOC); 1575 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1509 __del_from_free_nid_list(nm_i, i); 1576 __del_from_free_nid_list(nm_i, i);
1510 spin_unlock(&nm_i->free_nid_list_lock); 1577 spin_unlock(&nm_i->free_nid_list_lock);
1511 1578
@@ -1526,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1526 1593
1527 spin_lock(&nm_i->free_nid_list_lock); 1594 spin_lock(&nm_i->free_nid_list_lock);
1528 i = __lookup_free_nid_list(nm_i, nid); 1595 i = __lookup_free_nid_list(nm_i, nid);
1529 f2fs_bug_on(!i || i->state != NID_ALLOC); 1596 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1530 if (!available_free_memory(sbi, FREE_NIDS)) { 1597 if (!available_free_memory(sbi, FREE_NIDS)) {
1531 __del_from_free_nid_list(nm_i, i); 1598 __del_from_free_nid_list(nm_i, i);
1532 need_free = true; 1599 need_free = true;
@@ -1540,35 +1607,21 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1540 kmem_cache_free(free_nid_slab, i); 1607 kmem_cache_free(free_nid_slab, i);
1541} 1608}
1542 1609
1543void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1544 struct f2fs_summary *sum, struct node_info *ni,
1545 block_t new_blkaddr)
1546{
1547 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1548 set_node_addr(sbi, ni, new_blkaddr, false);
1549 clear_node_page_dirty(page);
1550}
1551
1552void recover_inline_xattr(struct inode *inode, struct page *page) 1610void recover_inline_xattr(struct inode *inode, struct page *page)
1553{ 1611{
1554 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1555 void *src_addr, *dst_addr; 1612 void *src_addr, *dst_addr;
1556 size_t inline_size; 1613 size_t inline_size;
1557 struct page *ipage; 1614 struct page *ipage;
1558 struct f2fs_inode *ri; 1615 struct f2fs_inode *ri;
1559 1616
1560 if (!f2fs_has_inline_xattr(inode)) 1617 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
1561 return; 1618 f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
1562
1563 if (!IS_INODE(page))
1564 return;
1565 1619
1566 ri = F2FS_INODE(page); 1620 ri = F2FS_INODE(page);
1567 if (!(ri->i_inline & F2FS_INLINE_XATTR)) 1621 if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
1568 return; 1622 clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
1569 1623 goto update_inode;
1570 ipage = get_node_page(sbi, inode->i_ino); 1624 }
1571 f2fs_bug_on(IS_ERR(ipage));
1572 1625
1573 dst_addr = inline_xattr_addr(ipage); 1626 dst_addr = inline_xattr_addr(ipage);
1574 src_addr = inline_xattr_addr(page); 1627 src_addr = inline_xattr_addr(page);
@@ -1576,28 +1629,25 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
1576 1629
1577 f2fs_wait_on_page_writeback(ipage, NODE); 1630 f2fs_wait_on_page_writeback(ipage, NODE);
1578 memcpy(dst_addr, src_addr, inline_size); 1631 memcpy(dst_addr, src_addr, inline_size);
1579 1632update_inode:
1580 update_inode(inode, ipage); 1633 update_inode(inode, ipage);
1581 f2fs_put_page(ipage, 1); 1634 f2fs_put_page(ipage, 1);
1582} 1635}
1583 1636
1584bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) 1637void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1585{ 1638{
1586 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1587 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; 1640 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1588 nid_t new_xnid = nid_of_node(page); 1641 nid_t new_xnid = nid_of_node(page);
1589 struct node_info ni; 1642 struct node_info ni;
1590 1643
1591 if (!f2fs_has_xattr_block(ofs_of_node(page)))
1592 return false;
1593
1594 /* 1: invalidate the previous xattr nid */ 1644 /* 1: invalidate the previous xattr nid */
1595 if (!prev_xnid) 1645 if (!prev_xnid)
1596 goto recover_xnid; 1646 goto recover_xnid;
1597 1647
1598 /* Deallocate node address */ 1648 /* Deallocate node address */
1599 get_node_info(sbi, prev_xnid, &ni); 1649 get_node_info(sbi, prev_xnid, &ni);
1600 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 1650 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
1601 invalidate_blocks(sbi, ni.blk_addr); 1651 invalidate_blocks(sbi, ni.blk_addr);
1602 dec_valid_node_count(sbi, inode); 1652 dec_valid_node_count(sbi, inode);
1603 set_node_addr(sbi, &ni, NULL_ADDR, false); 1653 set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1605,7 +1655,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1605recover_xnid: 1655recover_xnid:
1606 /* 2: allocate new xattr nid */ 1656 /* 2: allocate new xattr nid */
1607 if (unlikely(!inc_valid_node_count(sbi, inode))) 1657 if (unlikely(!inc_valid_node_count(sbi, inode)))
1608 f2fs_bug_on(1); 1658 f2fs_bug_on(sbi, 1);
1609 1659
1610 remove_free_nid(NM_I(sbi), new_xnid); 1660 remove_free_nid(NM_I(sbi), new_xnid);
1611 get_node_info(sbi, new_xnid, &ni); 1661 get_node_info(sbi, new_xnid, &ni);
@@ -1618,7 +1668,6 @@ recover_xnid:
1618 set_node_addr(sbi, &ni, blkaddr, false); 1668 set_node_addr(sbi, &ni, blkaddr, false);
1619 1669
1620 update_inode_page(inode); 1670 update_inode_page(inode);
1621 return true;
1622} 1671}
1623 1672
1624int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1673int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1637,7 +1686,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1637 if (!ipage) 1686 if (!ipage)
1638 return -ENOMEM; 1687 return -ENOMEM;
1639 1688
1640 /* Should not use this inode from free nid list */ 1689 /* Should not use this inode from free nid list */
1641 remove_free_nid(NM_I(sbi), ino); 1690 remove_free_nid(NM_I(sbi), ino);
1642 1691
1643 SetPageUptodate(ipage); 1692 SetPageUptodate(ipage);
@@ -1651,6 +1700,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1651 dst->i_blocks = cpu_to_le64(1); 1700 dst->i_blocks = cpu_to_le64(1);
1652 dst->i_links = cpu_to_le32(1); 1701 dst->i_links = cpu_to_le32(1);
1653 dst->i_xattr_nid = 0; 1702 dst->i_xattr_nid = 0;
1703 dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
1654 1704
1655 new_ni = old_ni; 1705 new_ni = old_ni;
1656 new_ni.ino = ino; 1706 new_ni.ino = ino;
@@ -1659,13 +1709,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1659 WARN_ON(1); 1709 WARN_ON(1);
1660 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 1710 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1661 inc_valid_inode_count(sbi); 1711 inc_valid_inode_count(sbi);
1712 set_page_dirty(ipage);
1662 f2fs_put_page(ipage, 1); 1713 f2fs_put_page(ipage, 1);
1663 return 0; 1714 return 0;
1664} 1715}
1665 1716
1666/* 1717/*
1667 * ra_sum_pages() merge contiguous pages into one bio and submit. 1718 * ra_sum_pages() merge contiguous pages into one bio and submit.
1668 * these pre-readed pages are alloced in bd_inode's mapping tree. 1719 * these pre-read pages are allocated in bd_inode's mapping tree.
1669 */ 1720 */
1670static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, 1721static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
1671 int start, int nrpages) 1722 int start, int nrpages)
@@ -1697,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1697 struct f2fs_summary *sum_entry; 1748 struct f2fs_summary *sum_entry;
1698 struct inode *inode = sbi->sb->s_bdev->bd_inode; 1749 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1699 block_t addr; 1750 block_t addr;
1700 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1751 int bio_blocks = MAX_BIO_BLOCKS(sbi);
1701 struct page *pages[bio_blocks]; 1752 struct page *pages[bio_blocks];
1702 int i, idx, last_offset, nrpages, err = 0; 1753 int i, idx, last_offset, nrpages, err = 0;
1703 1754
@@ -1709,7 +1760,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1709 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { 1760 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
1710 nrpages = min(last_offset - i, bio_blocks); 1761 nrpages = min(last_offset - i, bio_blocks);
1711 1762
1712 /* read ahead node pages */ 1763 /* readahead node pages */
1713 nrpages = ra_sum_pages(sbi, pages, addr, nrpages); 1764 nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
1714 if (!nrpages) 1765 if (!nrpages)
1715 return -ENOMEM; 1766 return -ENOMEM;
@@ -1739,89 +1790,6 @@ skip:
1739 return err; 1790 return err;
1740} 1791}
1741 1792
1742static struct nat_entry_set *grab_nat_entry_set(void)
1743{
1744 struct nat_entry_set *nes =
1745 f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
1746
1747 nes->entry_cnt = 0;
1748 INIT_LIST_HEAD(&nes->set_list);
1749 INIT_LIST_HEAD(&nes->entry_list);
1750 return nes;
1751}
1752
1753static void release_nat_entry_set(struct nat_entry_set *nes,
1754 struct f2fs_nm_info *nm_i)
1755{
1756 f2fs_bug_on(!list_empty(&nes->entry_list));
1757
1758 nm_i->dirty_nat_cnt -= nes->entry_cnt;
1759 list_del(&nes->set_list);
1760 kmem_cache_free(nat_entry_set_slab, nes);
1761}
1762
1763static void adjust_nat_entry_set(struct nat_entry_set *nes,
1764 struct list_head *head)
1765{
1766 struct nat_entry_set *next = nes;
1767
1768 if (list_is_last(&nes->set_list, head))
1769 return;
1770
1771 list_for_each_entry_continue(next, head, set_list)
1772 if (nes->entry_cnt <= next->entry_cnt)
1773 break;
1774
1775 list_move_tail(&nes->set_list, &next->set_list);
1776}
1777
1778static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
1779{
1780 struct nat_entry_set *nes;
1781 nid_t start_nid = START_NID(ne->ni.nid);
1782
1783 list_for_each_entry(nes, head, set_list) {
1784 if (nes->start_nid == start_nid) {
1785 list_move_tail(&ne->list, &nes->entry_list);
1786 nes->entry_cnt++;
1787 adjust_nat_entry_set(nes, head);
1788 return;
1789 }
1790 }
1791
1792 nes = grab_nat_entry_set();
1793
1794 nes->start_nid = start_nid;
1795 list_move_tail(&ne->list, &nes->entry_list);
1796 nes->entry_cnt++;
1797 list_add(&nes->set_list, head);
1798}
1799
1800static void merge_nats_in_set(struct f2fs_sb_info *sbi)
1801{
1802 struct f2fs_nm_info *nm_i = NM_I(sbi);
1803 struct list_head *dirty_list = &nm_i->dirty_nat_entries;
1804 struct list_head *set_list = &nm_i->nat_entry_set;
1805 struct nat_entry *ne, *tmp;
1806
1807 write_lock(&nm_i->nat_tree_lock);
1808 list_for_each_entry_safe(ne, tmp, dirty_list, list) {
1809 if (nat_get_blkaddr(ne) == NEW_ADDR)
1810 continue;
1811 add_nat_entry(ne, set_list);
1812 nm_i->dirty_nat_cnt++;
1813 }
1814 write_unlock(&nm_i->nat_tree_lock);
1815}
1816
1817static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
1818{
1819 if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
1820 return true;
1821 else
1822 return false;
1823}
1824
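
The two-argument, NAT-only helper removed above is superseded by a three-argument
__has_cursum_space(sum, size, type) used by the new code below. A plausible shared
form is sketched here; the SIT names are assumed counterparts of the NAT ones
visible in this diff, not confirmed by it:

	static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
						int size, int type)
	{
		if (type == NAT_JOURNAL)
			return nats_in_cursum(sum) + size <= MAX_NAT_JENTRIES(sum);
		/* sits_in_cursum()/MAX_SIT_JENTRIES() assumed SIT counterparts */
		return sits_in_cursum(sum) + size <= MAX_SIT_JENTRIES(sum);
	}
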
1825static void remove_nats_in_journal(struct f2fs_sb_info *sbi) 1793static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1826{ 1794{
1827 struct f2fs_nm_info *nm_i = NM_I(sbi); 1795 struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1856,99 +1824,130 @@ found:
1856 mutex_unlock(&curseg->curseg_mutex); 1824 mutex_unlock(&curseg->curseg_mutex);
1857} 1825}
1858 1826
1859/* 1827static void __adjust_nat_entry_set(struct nat_entry_set *nes,
1860 * This function is called during the checkpointing process. 1828 struct list_head *head, int max)
1861 */
1862void flush_nat_entries(struct f2fs_sb_info *sbi)
1863{ 1829{
1864 struct f2fs_nm_info *nm_i = NM_I(sbi); 1830 struct nat_entry_set *cur;
1865 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1866 struct f2fs_summary_block *sum = curseg->sum_blk;
1867 struct nat_entry_set *nes, *tmp;
1868 struct list_head *head = &nm_i->nat_entry_set;
1869 bool to_journal = true;
1870 1831
1871 /* merge nat entries of dirty list to nat entry set temporarily */ 1832 if (nes->entry_cnt >= max)
1872 merge_nats_in_set(sbi); 1833 goto add_out;
1873 1834
1874 /* 1835 list_for_each_entry(cur, head, set_list) {
1875 * if there are no enough space in journal to store dirty nat 1836 if (cur->entry_cnt >= nes->entry_cnt) {
1876 * entries, remove all entries from journal and merge them 1837 list_add(&nes->set_list, cur->set_list.prev);
1877 * into nat entry set. 1838 return;
1878 */ 1839 }
1879 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
1880 remove_nats_in_journal(sbi);
1881
1882 /*
1883 * merge nat entries of dirty list to nat entry set temporarily
1884 */
1885 merge_nats_in_set(sbi);
1886 } 1840 }
1841add_out:
1842 list_add_tail(&nes->set_list, head);
1843}
1887 1844
1888 if (!nm_i->dirty_nat_cnt) 1845static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1889 return; 1846 struct nat_entry_set *set)
1847{
1848 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1849 struct f2fs_summary_block *sum = curseg->sum_blk;
1850 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
1851 bool to_journal = true;
1852 struct f2fs_nat_block *nat_blk;
1853 struct nat_entry *ne, *cur;
1854 struct page *page = NULL;
1890 1855
1891 /* 1856 /*
1892 * there are two steps to flush nat entries: 1857 * there are two steps to flush nat entries:
1893 * #1, flush nat entries to journal in current hot data summary block. 1858 * #1, flush nat entries to journal in current hot data summary block.
1894 * #2, flush nat entries to nat page. 1859 * #2, flush nat entries to nat page.
1895 */ 1860 */
1896 list_for_each_entry_safe(nes, tmp, head, set_list) { 1861 if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
1897 struct f2fs_nat_block *nat_blk; 1862 to_journal = false;
1898 struct nat_entry *ne, *cur;
1899 struct page *page;
1900 nid_t start_nid = nes->start_nid;
1901 1863
1902 if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) 1864 if (to_journal) {
1903 to_journal = false; 1865 mutex_lock(&curseg->curseg_mutex);
1866 } else {
1867 page = get_next_nat_page(sbi, start_nid);
1868 nat_blk = page_address(page);
1869 f2fs_bug_on(sbi, !nat_blk);
1870 }
1871
1872 /* flush dirty nats in nat entry set */
1873 list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
1874 struct f2fs_nat_entry *raw_ne;
1875 nid_t nid = nat_get_nid(ne);
1876 int offset;
1877
1878 if (nat_get_blkaddr(ne) == NEW_ADDR)
1879 continue;
1904 1880
1905 if (to_journal) { 1881 if (to_journal) {
1906 mutex_lock(&curseg->curseg_mutex); 1882 offset = lookup_journal_in_cursum(sum,
1883 NAT_JOURNAL, nid, 1);
1884 f2fs_bug_on(sbi, offset < 0);
1885 raw_ne = &nat_in_journal(sum, offset);
1886 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1907 } else { 1887 } else {
1908 page = get_next_nat_page(sbi, start_nid); 1888 raw_ne = &nat_blk->entries[nid - start_nid];
1909 nat_blk = page_address(page);
1910 f2fs_bug_on(!nat_blk);
1911 } 1889 }
1890 raw_nat_from_node_info(raw_ne, &ne->ni);
1912 1891
1913 /* flush dirty nats in nat entry set */ 1892 write_lock(&NM_I(sbi)->nat_tree_lock);
1914 list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { 1893 nat_reset_flag(ne);
1915 struct f2fs_nat_entry *raw_ne; 1894 __clear_nat_cache_dirty(NM_I(sbi), ne);
1916 nid_t nid = nat_get_nid(ne); 1895 write_unlock(&NM_I(sbi)->nat_tree_lock);
1917 int offset;
1918 1896
1919 if (to_journal) { 1897 if (nat_get_blkaddr(ne) == NULL_ADDR)
1920 offset = lookup_journal_in_cursum(sum, 1898 add_free_nid(sbi, nid, false);
1921 NAT_JOURNAL, nid, 1); 1899 }
1922 f2fs_bug_on(offset < 0);
1923 raw_ne = &nat_in_journal(sum, offset);
1924 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1925 } else {
1926 raw_ne = &nat_blk->entries[nid - start_nid];
1927 }
1928 raw_nat_from_node_info(raw_ne, &ne->ni);
1929 1900
1930 if (nat_get_blkaddr(ne) == NULL_ADDR && 1901 if (to_journal)
1931 add_free_nid(sbi, nid, false) <= 0) { 1902 mutex_unlock(&curseg->curseg_mutex);
1932 write_lock(&nm_i->nat_tree_lock); 1903 else
1933 __del_from_nat_cache(nm_i, ne); 1904 f2fs_put_page(page, 1);
1934 write_unlock(&nm_i->nat_tree_lock);
1935 } else {
1936 write_lock(&nm_i->nat_tree_lock);
1937 __clear_nat_cache_dirty(nm_i, ne);
1938 write_unlock(&nm_i->nat_tree_lock);
1939 }
1940 }
1941 1905
1942 if (to_journal) 1906 if (!set->entry_cnt) {
1943 mutex_unlock(&curseg->curseg_mutex); 1907 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
1944 else 1908 kmem_cache_free(nat_entry_set_slab, set);
1945 f2fs_put_page(page, 1); 1909 }
1910}
1946 1911
1947 release_nat_entry_set(nes, nm_i); 1912/*
1913 * This function is called during the checkpointing process.
1914 */
1915void flush_nat_entries(struct f2fs_sb_info *sbi)
1916{
1917 struct f2fs_nm_info *nm_i = NM_I(sbi);
1918 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1919 struct f2fs_summary_block *sum = curseg->sum_blk;
1920 struct nat_entry_set *setvec[NATVEC_SIZE];
1921 struct nat_entry_set *set, *tmp;
1922 unsigned int found;
1923 nid_t set_idx = 0;
1924 LIST_HEAD(sets);
1925
1926 /*
 1927	 * if there is not enough space in the journal to store dirty nat
 1928	 * entries, remove all entries from the journal and merge them
1929 * into nat entry set.
1930 */
1931 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1932 remove_nats_in_journal(sbi);
1933
1934 if (!nm_i->dirty_nat_cnt)
1935 return;
1936
1937 while ((found = __gang_lookup_nat_set(nm_i,
1938 set_idx, NATVEC_SIZE, setvec))) {
1939 unsigned idx;
1940 set_idx = setvec[found - 1]->set + 1;
1941 for (idx = 0; idx < found; idx++)
1942 __adjust_nat_entry_set(setvec[idx], &sets,
1943 MAX_NAT_JENTRIES(sum));
1948 } 1944 }
1949 1945
1950 f2fs_bug_on(!list_empty(head)); 1946 /* flush dirty nats in nat entry set */
1951 f2fs_bug_on(nm_i->dirty_nat_cnt); 1947 list_for_each_entry_safe(set, tmp, &sets, set_list)
1948 __flush_nat_entry_set(sbi, set);
1949
1950 f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
1952} 1951}
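
This rewrite replaces the single dirty_nat_entries list with per-NAT-block sets
kept in nm_i->nat_set_root, a radix tree keyed by set number (nid divided by
NAT_ENTRY_PER_BLOCK). At checkpoint time the sets are collected with a gang
lookup, inserted into a local list in ascending entry_cnt order by
__adjust_nat_entry_set(), and flushed set by set: the sets small enough for the
remaining journal room go to the journal, and only the rest pay for a NAT page
write. __gang_lookup_nat_set() is presumably a thin wrapper, roughly:

	/* assumed wrapper over the radix tree introduced below */
	static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
			nid_t start, unsigned int nr, struct nat_entry_set **ep)
	{
		return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
						start, nr);
	}

Grouping dirty entries by NAT block also means each __flush_nat_entry_set() call
touches at most one NAT page.
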
1953 1952
1954static int init_node_manager(struct f2fs_sb_info *sbi) 1953static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1967,7 +1966,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1967 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1966 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
1968 1967
1969 /* not used nids: 0, node, meta, (and root counted as valid node) */ 1968 /* not used nids: 0, node, meta, (and root counted as valid node) */
1970 nm_i->available_nids = nm_i->max_nid - 3; 1969 nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
1971 nm_i->fcnt = 0; 1970 nm_i->fcnt = 0;
1972 nm_i->nat_cnt = 0; 1971 nm_i->nat_cnt = 0;
1973 nm_i->ram_thresh = DEF_RAM_THRESHOLD; 1972 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
@@ -1975,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1975 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 1974 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1976 INIT_LIST_HEAD(&nm_i->free_nid_list); 1975 INIT_LIST_HEAD(&nm_i->free_nid_list);
1977 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1976 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1977 INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
1978 INIT_LIST_HEAD(&nm_i->nat_entries); 1978 INIT_LIST_HEAD(&nm_i->nat_entries);
1979 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1980 INIT_LIST_HEAD(&nm_i->nat_entry_set);
1981 1979
1982 mutex_init(&nm_i->build_lock); 1980 mutex_init(&nm_i->build_lock);
1983 spin_lock_init(&nm_i->free_nid_list_lock); 1981 spin_lock_init(&nm_i->free_nid_list_lock);
@@ -2026,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2026 /* destroy free nid list */ 2024 /* destroy free nid list */
2027 spin_lock(&nm_i->free_nid_list_lock); 2025 spin_lock(&nm_i->free_nid_list_lock);
2028 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 2026 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2029 f2fs_bug_on(i->state == NID_ALLOC); 2027 f2fs_bug_on(sbi, i->state == NID_ALLOC);
2030 __del_from_free_nid_list(nm_i, i); 2028 __del_from_free_nid_list(nm_i, i);
2031 nm_i->fcnt--; 2029 nm_i->fcnt--;
2032 spin_unlock(&nm_i->free_nid_list_lock); 2030 spin_unlock(&nm_i->free_nid_list_lock);
2033 kmem_cache_free(free_nid_slab, i); 2031 kmem_cache_free(free_nid_slab, i);
2034 spin_lock(&nm_i->free_nid_list_lock); 2032 spin_lock(&nm_i->free_nid_list_lock);
2035 } 2033 }
2036 f2fs_bug_on(nm_i->fcnt); 2034 f2fs_bug_on(sbi, nm_i->fcnt);
2037 spin_unlock(&nm_i->free_nid_list_lock); 2035 spin_unlock(&nm_i->free_nid_list_lock);
2038 2036
2039 /* destroy nat cache */ 2037 /* destroy nat cache */
@@ -2045,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2045 for (idx = 0; idx < found; idx++) 2043 for (idx = 0; idx < found; idx++)
2046 __del_from_nat_cache(nm_i, natvec[idx]); 2044 __del_from_nat_cache(nm_i, natvec[idx]);
2047 } 2045 }
2048 f2fs_bug_on(nm_i->nat_cnt); 2046 f2fs_bug_on(sbi, nm_i->nat_cnt);
2049 write_unlock(&nm_i->nat_tree_lock); 2047 write_unlock(&nm_i->nat_tree_lock);
2050 2048
2051 kfree(nm_i->nat_bitmap); 2049 kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8a116a407599..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
39 unsigned char version; /* version of the node */ 39 unsigned char version; /* version of the node */
40}; 40};
41 41
42enum {
43 IS_CHECKPOINTED, /* is it checkpointed before? */
44 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
45 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
 46 IS_DIRTY, /* is this nat entry dirty? */
47};
48
42struct nat_entry { 49struct nat_entry {
43 struct list_head list; /* for clean or dirty nat list */ 50 struct list_head list; /* for clean or dirty nat list */
44 bool checkpointed; /* whether it is checkpointed or not */ 51 unsigned char flag; /* for node information bits */
45 bool fsync_done; /* whether the latest node has fsync mark */
46 struct node_info ni; /* in-memory node information */ 52 struct node_info ni; /* in-memory node information */
47}; 53};
48 54
@@ -55,18 +61,32 @@ struct nat_entry {
55#define nat_get_version(nat) (nat->ni.version) 61#define nat_get_version(nat) (nat->ni.version)
56#define nat_set_version(nat, v) (nat->ni.version = v) 62#define nat_set_version(nat, v) (nat->ni.version = v)
57 63
58#define __set_nat_cache_dirty(nm_i, ne) \
59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0)
63#define __clear_nat_cache_dirty(nm_i, ne) \
64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0)
68#define inc_node_version(version) (++version) 64#define inc_node_version(version) (++version)
69 65
66static inline void set_nat_flag(struct nat_entry *ne,
67 unsigned int type, bool set)
68{
69 unsigned char mask = 0x01 << type;
70 if (set)
71 ne->flag |= mask;
72 else
73 ne->flag &= ~mask;
74}
75
76static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
77{
78 unsigned char mask = 0x01 << type;
79 return ne->flag & mask;
80}
81
82static inline void nat_reset_flag(struct nat_entry *ne)
83{
84 /* these states can be set only after checkpoint was done */
85 set_nat_flag(ne, IS_CHECKPOINTED, true);
86 set_nat_flag(ne, HAS_FSYNCED_INODE, false);
87 set_nat_flag(ne, HAS_LAST_FSYNC, true);
88}
89
70static inline void node_info_from_raw_nat(struct node_info *ni, 90static inline void node_info_from_raw_nat(struct node_info *ni,
71 struct f2fs_nat_entry *raw_ne) 91 struct f2fs_nat_entry *raw_ne)
72{ 92{
@@ -90,9 +110,9 @@ enum mem_type {
90}; 110};
91 111
92struct nat_entry_set { 112struct nat_entry_set {
93 struct list_head set_list; /* link with all nat sets */ 113 struct list_head set_list; /* link with other nat sets */
94 struct list_head entry_list; /* link with dirty nat entries */ 114 struct list_head entry_list; /* link with dirty nat entries */
 95 nid_t start_nid; /* start nid of nats in set */ 115 nid_t set; /* set number */
96 unsigned int entry_cnt; /* the # of nat entries in set */ 116 unsigned int entry_cnt; /* the # of nat entries in set */
97}; 117};
98 118
@@ -110,18 +130,19 @@ struct free_nid {
110 int state; /* in use or not: NID_NEW or NID_ALLOC */ 130 int state; /* in use or not: NID_NEW or NID_ALLOC */
111}; 131};
112 132
113static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) 133static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
114{ 134{
115 struct f2fs_nm_info *nm_i = NM_I(sbi); 135 struct f2fs_nm_info *nm_i = NM_I(sbi);
116 struct free_nid *fnid; 136 struct free_nid *fnid;
117 137
118 if (nm_i->fcnt <= 0)
119 return -1;
120 spin_lock(&nm_i->free_nid_list_lock); 138 spin_lock(&nm_i->free_nid_list_lock);
139 if (nm_i->fcnt <= 0) {
140 spin_unlock(&nm_i->free_nid_list_lock);
141 return;
142 }
121 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); 143 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
122 *nid = fnid->nid; 144 *nid = fnid->nid;
123 spin_unlock(&nm_i->free_nid_list_lock); 145 spin_unlock(&nm_i->free_nid_list_lock);
124 return 0;
125} 146}
126 147
127/* 148/*
@@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
197 218
198static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) 219static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
199{ 220{
200 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 221 struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
201 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
202 struct f2fs_node *rn = F2FS_NODE(page); 222 struct f2fs_node *rn = F2FS_NODE(page);
203 223
204 rn->footer.cp_ver = ckpt->checkpoint_ver; 224 rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fe1c6d921ba2..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
14#include "node.h" 14#include "node.h"
15#include "segment.h" 15#include "segment.h"
16 16
17/*
18 * Roll forward recovery scenarios.
19 *
20 * [Term] F: fsync_mark, D: dentry_mark
21 *
22 * 1. inode(x) | CP | inode(x) | dnode(F)
23 * -> Update the latest inode(x).
24 *
25 * 2. inode(x) | CP | inode(F) | dnode(F)
26 * -> No problem.
27 *
28 * 3. inode(x) | CP | dnode(F) | inode(x)
29 * -> Recover to the latest dnode(F), and drop the last inode(x)
30 *
31 * 4. inode(x) | CP | dnode(F) | inode(F)
32 * -> No problem.
33 *
34 * 5. CP | inode(x) | dnode(F)
35 * -> The inode(DF) was missing. Should drop this dnode(F).
36 *
37 * 6. CP | inode(DF) | dnode(F)
38 * -> No problem.
39 *
40 * 7. CP | dnode(F) | inode(DF)
41 * -> If f2fs_iget fails, then goto next to find inode(DF).
42 *
43 * 8. CP | dnode(F) | inode(x)
44 * -> If f2fs_iget fails, then goto next to find inode(DF).
45 * But it will fail due to no inode(DF).
46 */
47
17static struct kmem_cache *fsync_entry_slab; 48static struct kmem_cache *fsync_entry_slab;
18 49
19bool space_for_roll_forward(struct f2fs_sb_info *sbi) 50bool space_for_roll_forward(struct f2fs_sb_info *sbi)
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
36 return NULL; 67 return NULL;
37} 68}
38 69
39static int recover_dentry(struct page *ipage, struct inode *inode) 70static int recover_dentry(struct inode *inode, struct page *ipage)
40{ 71{
41 struct f2fs_inode *raw_inode = F2FS_INODE(ipage); 72 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
42 nid_t pino = le32_to_cpu(raw_inode->i_pino); 73 nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -62,8 +93,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 } 93 }
63retry: 94retry:
64 de = f2fs_find_entry(dir, &name, &page); 95 de = f2fs_find_entry(dir, &name, &page);
65 if (de && inode->i_ino == le32_to_cpu(de->ino)) 96 if (de && inode->i_ino == le32_to_cpu(de->ino)) {
97 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
66 goto out_unmap_put; 98 goto out_unmap_put;
99 }
67 if (de) { 100 if (de) {
68 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); 101 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
69 if (IS_ERR(einode)) { 102 if (IS_ERR(einode)) {
@@ -73,7 +106,7 @@ retry:
73 err = -EEXIST; 106 err = -EEXIST;
74 goto out_unmap_put; 107 goto out_unmap_put;
75 } 108 }
76 err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); 109 err = acquire_orphan_inode(F2FS_I_SB(inode));
77 if (err) { 110 if (err) {
78 iput(einode); 111 iput(einode);
79 goto out_unmap_put; 112 goto out_unmap_put;
@@ -108,35 +141,28 @@ out:
108 return err; 141 return err;
109} 142}
110 143
111static int recover_inode(struct inode *inode, struct page *node_page) 144static void recover_inode(struct inode *inode, struct page *page)
112{ 145{
113 struct f2fs_inode *raw_inode = F2FS_INODE(node_page); 146 struct f2fs_inode *raw = F2FS_INODE(page);
114 147
115 if (!IS_INODE(node_page)) 148 inode->i_mode = le16_to_cpu(raw->i_mode);
116 return 0; 149 i_size_write(inode, le64_to_cpu(raw->i_size));
117 150 inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
118 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 151 inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
119 i_size_write(inode, le64_to_cpu(raw_inode->i_size)); 152 inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
120 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 153 inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
121 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 154 inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
122 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 155 inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
123 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
124 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
125 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
126
127 if (is_dent_dnode(node_page))
128 return recover_dentry(node_page, inode);
129 156
130 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", 157 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
131 ino_of_node(node_page), raw_inode->i_name); 158 ino_of_node(page), F2FS_INODE(page)->i_name);
132 return 0;
133} 159}
134 160
135static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 161static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
136{ 162{
137 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 163 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
138 struct curseg_info *curseg; 164 struct curseg_info *curseg;
139 struct page *page; 165 struct page *page = NULL;
140 block_t blkaddr; 166 block_t blkaddr;
141 int err = 0; 167 int err = 0;
142 168
@@ -144,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
144 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 170 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
145 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 171 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
146 172
147 /* read node page */
148 page = alloc_page(GFP_F2FS_ZERO);
149 if (!page)
150 return -ENOMEM;
151 lock_page(page);
152
153 while (1) { 173 while (1) {
154 struct fsync_inode_entry *entry; 174 struct fsync_inode_entry *entry;
155 175
156 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 176 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
157 if (err) 177 return 0;
158 return err;
159 178
160 lock_page(page); 179 page = get_meta_page_ra(sbi, blkaddr);
161 180
162 if (cp_ver != cpver_of_node(page)) 181 if (cp_ver != cpver_of_node(page))
163 break; 182 break;
@@ -178,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
178 } 197 }
179 198
180 /* add this fsync inode to the list */ 199 /* add this fsync inode to the list */
181 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 200 entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
182 if (!entry) { 201 if (!entry) {
183 err = -ENOMEM; 202 err = -ENOMEM;
184 break; 203 break;
185 } 204 }
186 205 /*
206 * CP | dnode(F) | inode(DF)
207 * For this case, we should not give up now.
208 */
187 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 209 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
188 if (IS_ERR(entry->inode)) { 210 if (IS_ERR(entry->inode)) {
189 err = PTR_ERR(entry->inode); 211 err = PTR_ERR(entry->inode);
190 kmem_cache_free(fsync_entry_slab, entry); 212 kmem_cache_free(fsync_entry_slab, entry);
213 if (err == -ENOENT)
214 goto next;
191 break; 215 break;
192 } 216 }
193 list_add_tail(&entry->list, head); 217 list_add_tail(&entry->list, head);
194 } 218 }
195 entry->blkaddr = blkaddr; 219 entry->blkaddr = blkaddr;
196 220
197 err = recover_inode(entry->inode, page); 221 if (IS_INODE(page)) {
198 if (err && err != -ENOENT) 222 entry->last_inode = blkaddr;
199 break; 223 if (is_dent_dnode(page))
224 entry->last_dentry = blkaddr;
225 }
200next: 226next:
201 /* check next segment */ 227 /* check next segment */
202 blkaddr = next_blkaddr_of_node(page); 228 blkaddr = next_blkaddr_of_node(page);
229 f2fs_put_page(page, 1);
203 } 230 }
204 231 f2fs_put_page(page, 1);
205 unlock_page(page);
206 __free_pages(page, 0);
207
208 return err; 232 return err;
209} 233}
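
Instead of allocating a private page and issuing a synchronous bio for every node
block, the walk now pulls each block through the meta mapping with readahead
(get_meta_page_ra), and the MAIN_BLKADDR/MAX_BLKADDR bounds check stops the chain
cleanly if a corrupted next_blkaddr pointer ever leaves the main area. A condensed
skeleton of the loop, simplified from the hunk above:

	while (1) {
		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
			break;			/* out-of-range link: stop */

		page = get_meta_page_ra(sbi, blkaddr);	/* cached + readahead */
		if (cp_ver != cpver_of_node(page)) {
			f2fs_put_page(page, 1);
			break;			/* past the fsynced chain */
		}
		/* ... record or recover this node block ... */
		blkaddr = next_blkaddr_of_node(page);
		f2fs_put_page(page, 1);
	}
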
210 234
@@ -277,16 +301,30 @@ got_it:
277 ino = ino_of_node(node_page); 301 ino = ino_of_node(node_page);
278 f2fs_put_page(node_page, 1); 302 f2fs_put_page(node_page, 1);
279 303
280 /* Deallocate previous index in the node page */ 304 if (ino != dn->inode->i_ino) {
281 inode = f2fs_iget(sbi->sb, ino); 305 /* Deallocate previous index in the node page */
282 if (IS_ERR(inode)) 306 inode = f2fs_iget(sbi->sb, ino);
283 return PTR_ERR(inode); 307 if (IS_ERR(inode))
308 return PTR_ERR(inode);
309 } else {
310 inode = dn->inode;
311 }
284 312
285 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 313 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
286 le16_to_cpu(sum.ofs_in_node); 314 le16_to_cpu(sum.ofs_in_node);
287 315
288 truncate_hole(inode, bidx, bidx + 1); 316 if (ino != dn->inode->i_ino) {
289 iput(inode); 317 truncate_hole(inode, bidx, bidx + 1);
318 iput(inode);
319 } else {
320 struct dnode_of_data tdn;
321 set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
322 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
323 return 0;
324 if (tdn.data_blkaddr != NULL_ADDR)
325 truncate_data_blocks_range(&tdn, 1);
326 f2fs_put_page(tdn.node_page, 1);
327 }
290 return 0; 328 return 0;
291} 329}
292 330
@@ -300,14 +338,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 struct node_info ni; 338 struct node_info ni;
301 int err = 0, recovered = 0; 339 int err = 0, recovered = 0;
302 340
303 recover_inline_xattr(inode, page); 341 /* step 1: recover xattr */
304 342 if (IS_INODE(page)) {
305 if (recover_inline_data(inode, page)) 343 recover_inline_xattr(inode, page);
344 } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
345 recover_xattr_data(inode, page, blkaddr);
306 goto out; 346 goto out;
347 }
307 348
308 if (recover_xattr_data(inode, page, blkaddr)) 349 /* step 2: recover inline data */
350 if (recover_inline_data(inode, page))
309 goto out; 351 goto out;
310 352
353 /* step 3: recover data indices */
311 start = start_bidx_of_node(ofs_of_node(page), fi); 354 start = start_bidx_of_node(ofs_of_node(page), fi);
312 end = start + ADDRS_PER_PAGE(page, fi); 355 end = start + ADDRS_PER_PAGE(page, fi);
313 356
@@ -324,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
324 f2fs_wait_on_page_writeback(dn.node_page, NODE); 367 f2fs_wait_on_page_writeback(dn.node_page, NODE);
325 368
326 get_node_info(sbi, dn.nid, &ni); 369 get_node_info(sbi, dn.nid, &ni);
327 f2fs_bug_on(ni.ino != ino_of_node(page)); 370 f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
328 f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); 371 f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
329 372
330 for (; start < end; start++) { 373 for (; start < end; start++) {
331 block_t src, dest; 374 block_t src, dest;
@@ -337,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
337 if (src == NULL_ADDR) { 380 if (src == NULL_ADDR) {
338 err = reserve_new_block(&dn); 381 err = reserve_new_block(&dn);
339 /* We should not get -ENOSPC */ 382 /* We should not get -ENOSPC */
340 f2fs_bug_on(err); 383 f2fs_bug_on(sbi, err);
341 } 384 }
342 385
343 /* Check the previous node page having this index */ 386 /* Check the previous node page having this index */
@@ -364,8 +407,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
364 fill_node_footer(dn.node_page, dn.nid, ni.ino, 407 fill_node_footer(dn.node_page, dn.nid, ni.ino,
365 ofs_of_node(page), false); 408 ofs_of_node(page), false);
366 set_page_dirty(dn.node_page); 409 set_page_dirty(dn.node_page);
367
368 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
369err: 410err:
370 f2fs_put_dnode(&dn); 411 f2fs_put_dnode(&dn);
371 f2fs_unlock_op(sbi); 412 f2fs_unlock_op(sbi);
@@ -381,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
381{ 422{
382 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 423 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
383 struct curseg_info *curseg; 424 struct curseg_info *curseg;
384 struct page *page; 425 struct page *page = NULL;
385 int err = 0; 426 int err = 0;
386 block_t blkaddr; 427 block_t blkaddr;
387 428
@@ -389,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
389 curseg = CURSEG_I(sbi, type); 430 curseg = CURSEG_I(sbi, type);
390 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 431 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
391 432
392 /* read node page */
393 page = alloc_page(GFP_F2FS_ZERO);
394 if (!page)
395 return -ENOMEM;
396
397 lock_page(page);
398
399 while (1) { 433 while (1) {
400 struct fsync_inode_entry *entry; 434 struct fsync_inode_entry *entry;
401 435
402 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 436 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
403 if (err) 437 break;
404 return err;
405 438
406 lock_page(page); 439 page = get_meta_page_ra(sbi, blkaddr);
407 440
408 if (cp_ver != cpver_of_node(page)) 441 if (cp_ver != cpver_of_node(page)) {
442 f2fs_put_page(page, 1);
409 break; 443 break;
444 }
410 445
411 entry = get_fsync_inode(head, ino_of_node(page)); 446 entry = get_fsync_inode(head, ino_of_node(page));
412 if (!entry) 447 if (!entry)
413 goto next; 448 goto next;
414 449 /*
450 * inode(x) | CP | inode(x) | dnode(F)
451 * In this case, we can lose the latest inode(x).
452 * So, call recover_inode for the inode update.
453 */
454 if (entry->last_inode == blkaddr)
455 recover_inode(entry->inode, page);
456 if (entry->last_dentry == blkaddr) {
457 err = recover_dentry(entry->inode, page);
458 if (err) {
459 f2fs_put_page(page, 1);
460 break;
461 }
462 }
415 err = do_recover_data(sbi, entry->inode, page, blkaddr); 463 err = do_recover_data(sbi, entry->inode, page, blkaddr);
416 if (err) 464 if (err) {
465 f2fs_put_page(page, 1);
417 break; 466 break;
467 }
418 468
419 if (entry->blkaddr == blkaddr) { 469 if (entry->blkaddr == blkaddr) {
420 iput(entry->inode); 470 iput(entry->inode);
@@ -424,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
424next: 474next:
425 /* check next segment */ 475 /* check next segment */
426 blkaddr = next_blkaddr_of_node(page); 476 blkaddr = next_blkaddr_of_node(page);
477 f2fs_put_page(page, 1);
427 } 478 }
428
429 unlock_page(page);
430 __free_pages(page, 0);
431
432 if (!err) 479 if (!err)
433 allocate_new_segments(sbi); 480 allocate_new_segments(sbi);
434 return err; 481 return err;
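
find_fsync_dnodes() (earlier hunk) now records where each inode's newest inode
block and newest dentry-marked block live, and this loop replays recover_inode()
and recover_dentry() only when the walk reaches exactly those addresses, which
implements cases 1 and 3 of the scenario table at the top of the file. From the
fields referenced here, the bookkeeping presumably lives in fsync_inode_entry,
roughly:

	struct fsync_inode_entry {
		struct list_head list;	/* links entries on inode_list */
		struct inode *inode;	/* vfs inode pointer */
		block_t blkaddr;	/* block of the last fsynced dnode */
		block_t last_dentry;	/* block of the last dentry-marked inode block */
		block_t last_inode;	/* block of the last inode block */
	};
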
@@ -452,6 +499,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
452 /* step #1: find fsynced inode numbers */ 499 /* step #1: find fsynced inode numbers */
453 sbi->por_doing = true; 500 sbi->por_doing = true;
454 501
502 /* prevent checkpoint */
503 mutex_lock(&sbi->cp_mutex);
504
455 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 505 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
456 506
457 err = find_fsync_dnodes(sbi, &inode_list); 507 err = find_fsync_dnodes(sbi, &inode_list);
@@ -465,11 +515,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
465 515
466 /* step #2: recover data */ 516 /* step #2: recover data */
467 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 517 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
468 f2fs_bug_on(!list_empty(&inode_list)); 518 if (!err)
519 f2fs_bug_on(sbi, !list_empty(&inode_list));
469out: 520out:
470 destroy_fsync_dnodes(&inode_list); 521 destroy_fsync_dnodes(&inode_list);
471 kmem_cache_destroy(fsync_entry_slab); 522 kmem_cache_destroy(fsync_entry_slab);
472 523
524 /* truncate meta pages to be used by the recovery */
525 truncate_inode_pages_range(META_MAPPING(sbi),
526 MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
527
473 if (err) { 528 if (err) {
474 truncate_inode_pages_final(NODE_MAPPING(sbi)); 529 truncate_inode_pages_final(NODE_MAPPING(sbi));
475 truncate_inode_pages_final(META_MAPPING(sbi)); 530 truncate_inode_pages_final(META_MAPPING(sbi));
@@ -482,8 +537,16 @@ out:
482 /* Flush all the NAT/SIT pages */ 537 /* Flush all the NAT/SIT pages */
483 while (get_pages(sbi, F2FS_DIRTY_META)) 538 while (get_pages(sbi, F2FS_DIRTY_META))
484 sync_meta_pages(sbi, META, LONG_MAX); 539 sync_meta_pages(sbi, META, LONG_MAX);
540 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
541 mutex_unlock(&sbi->cp_mutex);
485 } else if (need_writecp) { 542 } else if (need_writecp) {
486 write_checkpoint(sbi, false); 543 struct cp_control cpc = {
544 .reason = CP_SYNC,
545 };
546 mutex_unlock(&sbi->cp_mutex);
547 write_checkpoint(sbi, &cpc);
548 } else {
549 mutex_unlock(&sbi->cp_mutex);
487 } 550 }
488 return err; 551 return err;
489} 552}
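
Recovery now serializes against checkpointing by holding sbi->cp_mutex for the
whole roll-forward: on failure it latches CP_ERROR_FLAG, and on success it drops
the mutex and requests a synchronous checkpoint through the new cp_control
argument. From the fields this diff actually touches (reason, trim_start,
trim_minlen, trimmed), the struct is presumably along these lines; the field
types are assumed:

	struct cp_control {
		int reason;		/* CP_SYNC here; CP_DISCARD for trim */
		__u64 trim_start;	/* first segment considered for discard */
		__u64 trim_minlen;	/* smallest extent worth discarding */
		__u64 trimmed;		/* blocks queued for discard, reported back */
	};
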
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0dfeebae2a50..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
25#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
26 26
27static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *sit_entry_set_slab;
29static struct kmem_cache *inmem_entry_slab;
28 30
29/* 31/*
30 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 32 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -62,7 +64,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
62} 64}
63 65
64/* 66/*
65 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue 67 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
66 * f2fs_set_bit makes MSB and LSB reversed in a byte. 68 * f2fs_set_bit makes MSB and LSB reversed in a byte.
67 * Example: 69 * Example:
68 * LSB <--> MSB 70 * LSB <--> MSB
@@ -172,6 +174,60 @@ found_middle:
172 return result + __reverse_ffz(tmp); 174 return result + __reverse_ffz(tmp);
173} 175}
174 176
177void register_inmem_page(struct inode *inode, struct page *page)
178{
179 struct f2fs_inode_info *fi = F2FS_I(inode);
180 struct inmem_pages *new;
181
182 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
183
184 /* add atomic page indices to the list */
185 new->page = page;
186 INIT_LIST_HEAD(&new->list);
187
188 /* increase reference count with clean state */
189 mutex_lock(&fi->inmem_lock);
190 get_page(page);
191 list_add_tail(&new->list, &fi->inmem_pages);
192 mutex_unlock(&fi->inmem_lock);
193}
194
195void commit_inmem_pages(struct inode *inode, bool abort)
196{
197 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
198 struct f2fs_inode_info *fi = F2FS_I(inode);
199 struct inmem_pages *cur, *tmp;
200 bool submit_bio = false;
201 struct f2fs_io_info fio = {
202 .type = DATA,
203 .rw = WRITE_SYNC,
204 };
205
206 f2fs_balance_fs(sbi);
207 f2fs_lock_op(sbi);
208
209 mutex_lock(&fi->inmem_lock);
210 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
211 lock_page(cur->page);
212 if (!abort && cur->page->mapping == inode->i_mapping) {
213 f2fs_wait_on_page_writeback(cur->page, DATA);
214 if (clear_page_dirty_for_io(cur->page))
215 inode_dec_dirty_pages(inode);
216 do_write_data_page(cur->page, &fio);
217 submit_bio = true;
218 }
219 f2fs_put_page(cur->page, 1);
220 list_del(&cur->list);
221 kmem_cache_free(inmem_entry_slab, cur);
222 }
223 if (submit_bio)
224 f2fs_submit_merged_bio(sbi, DATA, WRITE);
225 mutex_unlock(&fi->inmem_lock);
226
227 filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
228 f2fs_unlock_op(sbi);
229}
230
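
register_inmem_page() pins a dirty data page on a per-inode list, and
commit_inmem_pages() later writes the whole list inside one locked operation (or,
with abort set, simply drops it). A hedged usage sketch; the trigger points are
assumptions, not confirmed by this diff:

	/* when a page of an atomic-write file is dirtied */
	register_inmem_page(inode, page);  /* get_page() + queue on fi->inmem_pages */

	/* when the application commits the batch */
	commit_inmem_pages(inode, false);  /* write every queued page */

	/* when the application aborts, or the file closes uncommitted */
	commit_inmem_pages(inode, true);   /* unpin and drop the queued pages */

Because the queued pages are all written inside a single f2fs_lock_op() /
f2fs_unlock_op() span, no checkpoint can land between them, which is what makes
the batch atomic across a crash.
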
175/* 231/*
176 * This function balances dirty node and dentry pages. 232 * This function balances dirty node and dentry pages.
177 * In addition, it controls garbage collection. 233 * In addition, it controls garbage collection.
@@ -205,24 +261,20 @@ repeat:
205 if (kthread_should_stop()) 261 if (kthread_should_stop())
206 return 0; 262 return 0;
207 263
208 spin_lock(&fcc->issue_lock); 264 if (!llist_empty(&fcc->issue_list)) {
209 if (fcc->issue_list) {
210 fcc->dispatch_list = fcc->issue_list;
211 fcc->issue_list = fcc->issue_tail = NULL;
212 }
213 spin_unlock(&fcc->issue_lock);
214
215 if (fcc->dispatch_list) {
216 struct bio *bio = bio_alloc(GFP_NOIO, 0); 265 struct bio *bio = bio_alloc(GFP_NOIO, 0);
217 struct flush_cmd *cmd, *next; 266 struct flush_cmd *cmd, *next;
218 int ret; 267 int ret;
219 268
269 fcc->dispatch_list = llist_del_all(&fcc->issue_list);
270 fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
271
220 bio->bi_bdev = sbi->sb->s_bdev; 272 bio->bi_bdev = sbi->sb->s_bdev;
221 ret = submit_bio_wait(WRITE_FLUSH, bio); 273 ret = submit_bio_wait(WRITE_FLUSH, bio);
222 274
223 for (cmd = fcc->dispatch_list; cmd; cmd = next) { 275 llist_for_each_entry_safe(cmd, next,
276 fcc->dispatch_list, llnode) {
224 cmd->ret = ret; 277 cmd->ret = ret;
225 next = cmd->next;
226 complete(&cmd->wait); 278 complete(&cmd->wait);
227 } 279 }
228 bio_put(bio); 280 bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
230 } 282 }
231 283
232 wait_event_interruptible(*q, 284 wait_event_interruptible(*q,
233 kthread_should_stop() || fcc->issue_list); 285 kthread_should_stop() || !llist_empty(&fcc->issue_list));
234 goto repeat; 286 goto repeat;
235} 287}
236 288
@@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
249 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); 301 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
250 302
251 init_completion(&cmd.wait); 303 init_completion(&cmd.wait);
252 cmd.next = NULL;
253 304
254 spin_lock(&fcc->issue_lock); 305 llist_add(&cmd.llnode, &fcc->issue_list);
255 if (fcc->issue_list)
256 fcc->issue_tail->next = &cmd;
257 else
258 fcc->issue_list = &cmd;
259 fcc->issue_tail = &cmd;
260 spin_unlock(&fcc->issue_lock);
261 306
262 if (!fcc->dispatch_list) 307 if (!fcc->dispatch_list)
263 wake_up(&fcc->flush_wait_queue); 308 wake_up(&fcc->flush_wait_queue);
@@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
276 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); 321 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
277 if (!fcc) 322 if (!fcc)
278 return -ENOMEM; 323 return -ENOMEM;
279 spin_lock_init(&fcc->issue_lock);
280 init_waitqueue_head(&fcc->flush_wait_queue); 324 init_waitqueue_head(&fcc->flush_wait_queue);
325 init_llist_head(&fcc->issue_list);
281 SM_I(sbi)->cmd_control_info = fcc; 326 SM_I(sbi)->cmd_control_info = fcc;
282 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, 327 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
283 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); 328 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
@@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
317 struct seg_entry *sentry = get_seg_entry(sbi, segno); 362 struct seg_entry *sentry = get_seg_entry(sbi, segno);
318 enum dirty_type t = sentry->type; 363 enum dirty_type t = sentry->type;
319 364
365 if (unlikely(t >= DIRTY)) {
366 f2fs_bug_on(sbi, 1);
367 return;
368 }
320 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) 369 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
321 dirty_i->nr_dirty[t]++; 370 dirty_i->nr_dirty[t]++;
322 } 371 }
@@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
376static int f2fs_issue_discard(struct f2fs_sb_info *sbi, 425static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
377 block_t blkstart, block_t blklen) 426 block_t blkstart, block_t blklen)
378{ 427{
379 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); 428 sector_t start = SECTOR_FROM_BLOCK(blkstart);
380 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); 429 sector_t len = SECTOR_FROM_BLOCK(blklen);
381 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); 430 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
382 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); 431 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
383} 432}
@@ -392,21 +441,47 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
392 } 441 }
393} 442}
394 443
395static void add_discard_addrs(struct f2fs_sb_info *sbi, 444static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
396 unsigned int segno, struct seg_entry *se)
397{ 445{
398 struct list_head *head = &SM_I(sbi)->discard_list; 446 struct list_head *head = &SM_I(sbi)->discard_list;
399 struct discard_entry *new; 447 struct discard_entry *new;
400 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 448 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
401 int max_blocks = sbi->blocks_per_seg; 449 int max_blocks = sbi->blocks_per_seg;
450 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
402 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 451 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
403 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 452 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
404 unsigned long dmap[entries]; 453 unsigned long dmap[entries];
405 unsigned int start = 0, end = -1; 454 unsigned int start = 0, end = -1;
455 bool force = (cpc->reason == CP_DISCARD);
406 int i; 456 int i;
407 457
408 if (!test_opt(sbi, DISCARD)) 458 if (!force && !test_opt(sbi, DISCARD))
459 return;
460
461 if (force && !se->valid_blocks) {
462 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
463 /*
464 * if this segment is registered in the prefree list, then
465 * we should skip adding a discard candidate, and let the
466 * checkpoint do that later.
467 */
468 mutex_lock(&dirty_i->seglist_lock);
469 if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
470 mutex_unlock(&dirty_i->seglist_lock);
471 cpc->trimmed += sbi->blocks_per_seg;
472 return;
473 }
474 mutex_unlock(&dirty_i->seglist_lock);
475
476 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
477 INIT_LIST_HEAD(&new->list);
478 new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
479 new->len = sbi->blocks_per_seg;
480 list_add_tail(&new->list, head);
481 SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
482 cpc->trimmed += sbi->blocks_per_seg;
409 return; 483 return;
484 }
410 485
411 /* zero block will be discarded through the prefree list */ 486 /* zero block will be discarded through the prefree list */
412 if (!se->valid_blocks || se->valid_blocks == max_blocks) 487 if (!se->valid_blocks || se->valid_blocks == max_blocks)
@@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
416 for (i = 0; i < entries; i++) 491 for (i = 0; i < entries; i++)
417 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; 492 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
418 493
419 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { 494 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
420 start = __find_rev_next_bit(dmap, max_blocks, end + 1); 495 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
421 if (start >= max_blocks) 496 if (start >= max_blocks)
422 break; 497 break;
423 498
424 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); 499 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
425 500
501 if (end - start < cpc->trim_minlen)
502 continue;
503
426 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); 504 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
427 INIT_LIST_HEAD(&new->list); 505 INIT_LIST_HEAD(&new->list);
428 new->blkaddr = START_BLOCK(sbi, segno) + start; 506 new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
429 new->len = end - start; 507 new->len = end - start;
508 cpc->trimmed += end - start;
430 509
431 list_add_tail(&new->list, head); 510 list_add_tail(&new->list, head);
432 SM_I(sbi)->nr_discards += end - start; 511 SM_I(sbi)->nr_discards += end - start;
433 } 512 }
434} 513}
435 514
515void release_discard_addrs(struct f2fs_sb_info *sbi)
516{
517 struct list_head *head = &(SM_I(sbi)->discard_list);
518 struct discard_entry *entry, *this;
519
520 /* drop caches */
521 list_for_each_entry_safe(entry, this, head, list) {
522 list_del(&entry->list);
523 kmem_cache_free(discard_entry_slab, entry);
524 }
525}
526
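
add_discard_addrs() now serves two callers: the regular checkpoint path (only
when the discard mount option is set) and a trim path signalled by
cpc->reason == CP_DISCARD, where force bypasses the mount option, a wholly empty
segment that is not already prefree becomes one segment-sized candidate, and
extents shorter than cpc->trim_minlen are skipped. cpc->trimmed accumulates the
queued blocks. A hedged sketch of how an FITRIM handler might drive it; the
handler shape is assumed, not shown in this diff:

	struct cp_control cpc = {
		.reason = CP_DISCARD,
		.trim_minlen = minlen_in_blocks,	/* from fstrim_range */
	};
	unsigned int segno;

	for (segno = start_segno; segno <= end_segno; segno++) {
		cpc.trim_start = segno;
		add_discard_addrs(sbi, &cpc);	/* queue candidates, bump trimmed */
	}
	/* the queued extents are issued via f2fs_issue_discard() at checkpoint,
	 * and cpc.trimmed (in blocks) is what gets reported back to userspace */
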
436/* 527/*
437 * Should call clear_prefree_segments after checkpoint is done. 528 * Should call clear_prefree_segments after checkpoint is done.
438 */ 529 */
@@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
440{ 531{
441 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 532 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
442 unsigned int segno; 533 unsigned int segno;
443 unsigned int total_segs = TOTAL_SEGS(sbi);
444 534
445 mutex_lock(&dirty_i->seglist_lock); 535 mutex_lock(&dirty_i->seglist_lock);
446 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) 536 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
447 __set_test_and_free(sbi, segno); 537 __set_test_and_free(sbi, segno);
448 mutex_unlock(&dirty_i->seglist_lock); 538 mutex_unlock(&dirty_i->seglist_lock);
449} 539}
@@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
454 struct discard_entry *entry, *this; 544 struct discard_entry *entry, *this;
455 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 545 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
456 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 546 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
457 unsigned int total_segs = TOTAL_SEGS(sbi);
458 unsigned int start = 0, end = -1; 547 unsigned int start = 0, end = -1;
459 548
460 mutex_lock(&dirty_i->seglist_lock); 549 mutex_lock(&dirty_i->seglist_lock);
461 550
462 while (1) { 551 while (1) {
463 int i; 552 int i;
464 start = find_next_bit(prefree_map, total_segs, end + 1); 553 start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
465 if (start >= total_segs) 554 if (start >= MAIN_SEGS(sbi))
466 break; 555 break;
467 end = find_next_zero_bit(prefree_map, total_segs, start + 1); 556 end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
557 start + 1);
468 558
469 for (i = start; i < end; i++) 559 for (i = start; i < end; i++)
470 clear_bit(i, prefree_map); 560 clear_bit(i, prefree_map);
@@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
488 } 578 }
489} 579}
490 580
491static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 581static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
492{ 582{
493 struct sit_info *sit_i = SIT_I(sbi); 583 struct sit_info *sit_i = SIT_I(sbi);
494 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) 584
585 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
495 sit_i->dirty_sentries++; 586 sit_i->dirty_sentries++;
587 return false;
588 }
589
590 return true;
496} 591}
497 592
498static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, 593static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
516 new_vblocks = se->valid_blocks + del; 611 new_vblocks = se->valid_blocks + del;
517 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); 612 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
518 613
519 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 614 f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
520 (new_vblocks > sbi->blocks_per_seg))); 615 (new_vblocks > sbi->blocks_per_seg)));
521 616
522 se->valid_blocks = new_vblocks; 617 se->valid_blocks = new_vblocks;
@@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
526 /* Update valid block bitmap */ 621 /* Update valid block bitmap */
527 if (del > 0) { 622 if (del > 0) {
528 if (f2fs_set_bit(offset, se->cur_valid_map)) 623 if (f2fs_set_bit(offset, se->cur_valid_map))
529 BUG(); 624 f2fs_bug_on(sbi, 1);
530 } else { 625 } else {
531 if (!f2fs_clear_bit(offset, se->cur_valid_map)) 626 if (!f2fs_clear_bit(offset, se->cur_valid_map))
532 BUG(); 627 f2fs_bug_on(sbi, 1);
533 } 628 }
534 if (!f2fs_test_bit(offset, se->ckpt_valid_map)) 629 if (!f2fs_test_bit(offset, se->ckpt_valid_map))
535 se->ckpt_valid_blocks += del; 630 se->ckpt_valid_blocks += del;
@@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
558 unsigned int segno = GET_SEGNO(sbi, addr); 653 unsigned int segno = GET_SEGNO(sbi, addr);
559 struct sit_info *sit_i = SIT_I(sbi); 654 struct sit_info *sit_i = SIT_I(sbi);
560 655
561 f2fs_bug_on(addr == NULL_ADDR); 656 f2fs_bug_on(sbi, addr == NULL_ADDR);
562 if (addr == NEW_ADDR) 657 if (addr == NEW_ADDR)
563 return; 658 return;
564 659
@@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
634 unsigned int segno = curseg->segno + 1; 729 unsigned int segno = curseg->segno + 1;
635 struct free_segmap_info *free_i = FREE_I(sbi); 730 struct free_segmap_info *free_i = FREE_I(sbi);
636 731
637 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) 732 if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
638 return !test_bit(segno, free_i->free_segmap); 733 return !test_bit(segno, free_i->free_segmap);
639 return 0; 734 return 0;
640} 735}
@@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
648{ 743{
649 struct free_segmap_info *free_i = FREE_I(sbi); 744 struct free_segmap_info *free_i = FREE_I(sbi);
650 unsigned int segno, secno, zoneno; 745 unsigned int segno, secno, zoneno;
651 unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; 746 unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
652 unsigned int hint = *newseg / sbi->segs_per_sec; 747 unsigned int hint = *newseg / sbi->segs_per_sec;
653 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); 748 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
654 unsigned int left_start = hint; 749 unsigned int left_start = hint;
@@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
660 755
661 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 756 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
662 segno = find_next_zero_bit(free_i->free_segmap, 757 segno = find_next_zero_bit(free_i->free_segmap,
663 TOTAL_SEGS(sbi), *newseg + 1); 758 MAIN_SEGS(sbi), *newseg + 1);
664 if (segno - *newseg < sbi->segs_per_sec - 759 if (segno - *newseg < sbi->segs_per_sec -
665 (*newseg % sbi->segs_per_sec)) 760 (*newseg % sbi->segs_per_sec))
666 goto got_it; 761 goto got_it;
667 } 762 }
668find_other_zone: 763find_other_zone:
669 secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); 764 secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
670 if (secno >= TOTAL_SECS(sbi)) { 765 if (secno >= MAIN_SECS(sbi)) {
671 if (dir == ALLOC_RIGHT) { 766 if (dir == ALLOC_RIGHT) {
672 secno = find_next_zero_bit(free_i->free_secmap, 767 secno = find_next_zero_bit(free_i->free_secmap,
673 TOTAL_SECS(sbi), 0); 768 MAIN_SECS(sbi), 0);
674 f2fs_bug_on(secno >= TOTAL_SECS(sbi)); 769 f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
675 } else { 770 } else {
676 go_left = 1; 771 go_left = 1;
677 left_start = hint - 1; 772 left_start = hint - 1;
@@ -686,8 +781,8 @@ find_other_zone:
686 continue; 781 continue;
687 } 782 }
688 left_start = find_next_zero_bit(free_i->free_secmap, 783 left_start = find_next_zero_bit(free_i->free_secmap,
689 TOTAL_SECS(sbi), 0); 784 MAIN_SECS(sbi), 0);
690 f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); 785 f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
691 break; 786 break;
692 } 787 }
693 secno = left_start; 788 secno = left_start;
@@ -726,7 +821,7 @@ skip_left:
726 } 821 }
727got_it: 822got_it:
728 /* set it as dirty segment in free segmap */ 823 /* set it as dirty segment in free segmap */
729 f2fs_bug_on(test_bit(segno, free_i->free_segmap)); 824 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
730 __set_inuse(sbi, segno); 825 __set_inuse(sbi, segno);
731 *newseg = segno; 826 *newseg = segno;
732 write_unlock(&free_i->segmap_lock); 827 write_unlock(&free_i->segmap_lock);
@@ -808,7 +903,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
808} 903}
809 904
810/* 905/*
811 * This function always allocates a used segment (from dirty seglist) by SSR 906 * This function always allocates a used segment(from dirty seglist) by SSR
812 * manner, so it should recover the existing segment information of valid blocks 907 * manner, so it should recover the existing segment information of valid blocks
813 */ 908 */
814static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) 909static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
898 .allocate_segment = allocate_segment_by_default, 993 .allocate_segment = allocate_segment_by_default,
899}; 994};
900 995
996int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
997{
998 __u64 start = range->start >> sbi->log_blocksize;
999 __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
1000 unsigned int start_segno, end_segno;
1001 struct cp_control cpc;
1002
1003 if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
1004 range->len < sbi->blocksize)
1005 return -EINVAL;
1006
1007 if (end <= MAIN_BLKADDR(sbi))
1008 goto out;
1009
1010 /* start/end segment number in main_area */
1011 start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
1012 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
1013 GET_SEGNO(sbi, end);
1014 cpc.reason = CP_DISCARD;
1015 cpc.trim_start = start_segno;
1016 cpc.trim_end = end_segno;
1017 cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
1018 cpc.trimmed = 0;
1019
1020 /* do checkpoint to issue discard commands safely */
1021 write_checkpoint(sbi, &cpc);
1022out:
1023 range->len = cpc.trimmed << sbi->log_blocksize;
1024 return 0;
1025}
1026
901static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 1027static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
902{ 1028{
903 struct curseg_info *curseg = CURSEG_I(sbi, type); 1029 struct curseg_info *curseg = CURSEG_I(sbi, type);
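
The new f2fs_trim_fs() above maps the byte range handed in by FITRIM onto main-area segment numbers and then drives a CP_DISCARD checkpoint, so discards are only issued against a consistent, checkpointed state. Note that on the early "goto out" path (end <= MAIN_BLKADDR) cpc.trimmed is read before it has been initialized. Below is a minimal user-space model of the range-to-segment arithmetic only; the geometry constants and names are illustrative assumptions, not the kernel's definitions.

	#include <stdint.h>
	#include <stdio.h>

	#define LOG_BLOCKSIZE       12    /* assumed: 4 KiB blocks */
	#define LOG_BLOCKS_PER_SEG   9    /* assumed: 512 blocks (2 MiB) per segment */
	#define MAIN_BLKADDR_      512    /* assumed first block of the main area */
	#define MAIN_SEGS_        1024    /* assumed number of main-area segments */

	int main(void)
	{
		/* FITRIM range: start at 16 MiB, length 64 MiB, converted to blocks */
		uint64_t start = (16ULL << 20) >> LOG_BLOCKSIZE;
		uint64_t end = start + ((64ULL << 20) >> LOG_BLOCKSIZE) - 1;

		if (end <= MAIN_BLKADDR_)
			return 0;	/* nothing inside the main area to trim */

		/* clamp the block range to main-area segments, as f2fs_trim_fs does */
		uint32_t start_segno = (start <= MAIN_BLKADDR_) ? 0 :
			(uint32_t)((start - MAIN_BLKADDR_) >> LOG_BLOCKS_PER_SEG);
		uint32_t end_segno = (uint32_t)((end - MAIN_BLKADDR_) >> LOG_BLOCKS_PER_SEG);

		if (end_segno >= MAIN_SEGS_)
			end_segno = MAIN_SEGS_ - 1;

		printf("trim segments %u..%u\n", start_segno, end_segno);
		return 0;
	}
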
@@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
953 1079
954static int __get_segment_type(struct page *page, enum page_type p_type) 1080static int __get_segment_type(struct page *page, enum page_type p_type)
955{ 1081{
956 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1082 switch (F2FS_P_SB(page)->active_logs) {
957 switch (sbi->active_logs) {
958 case 2: 1083 case 2:
959 return __get_segment_type_2(page, p_type); 1084 return __get_segment_type_2(page, p_type);
960 case 4: 1085 case 4:
961 return __get_segment_type_4(page, p_type); 1086 return __get_segment_type_4(page, p_type);
962 } 1087 }
963 /* NR_CURSEG_TYPE(6) logs by default */ 1088 /* NR_CURSEG_TYPE(6) logs by default */
964 f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); 1089 f2fs_bug_on(F2FS_P_SB(page),
1090 F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
965 return __get_segment_type_6(page, p_type); 1091 return __get_segment_type_6(page, p_type);
966} 1092}
967 1093
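
__get_segment_type() picks which active log a page is written to: the 2- and 4-log variants collapse the temperature classes, while the default six logs split data and node pages into hot/warm/cold each. For reference, the six curseg types this dispatch resolves to are defined elsewhere in segment.h (not shown in this diff) roughly as:

	enum {
		CURSEG_HOT_DATA = 0,	/* directory entry blocks */
		CURSEG_WARM_DATA,	/* regular data blocks */
		CURSEG_COLD_DATA,	/* multimedia or moved (GC'd) data blocks */
		CURSEG_HOT_NODE,	/* direct node blocks of directories */
		CURSEG_WARM_NODE,	/* direct node blocks of normal files */
		CURSEG_COLD_NODE,	/* indirect node blocks */
		NR_CURSEG_TYPE,		/* == 6, the default active_logs */
	};
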
@@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
1041void write_data_page(struct page *page, struct dnode_of_data *dn, 1167void write_data_page(struct page *page, struct dnode_of_data *dn,
1042 block_t *new_blkaddr, struct f2fs_io_info *fio) 1168 block_t *new_blkaddr, struct f2fs_io_info *fio)
1043{ 1169{
1044 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 1170 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1045 struct f2fs_summary sum; 1171 struct f2fs_summary sum;
1046 struct node_info ni; 1172 struct node_info ni;
1047 1173
1048 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); 1174 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
1049 get_node_info(sbi, dn->nid, &ni); 1175 get_node_info(sbi, dn->nid, &ni);
1050 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1176 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1051 1177
@@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
1055void rewrite_data_page(struct page *page, block_t old_blkaddr, 1181void rewrite_data_page(struct page *page, block_t old_blkaddr,
1056 struct f2fs_io_info *fio) 1182 struct f2fs_io_info *fio)
1057{ 1183{
1058 struct inode *inode = page->mapping->host; 1184 f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
1059 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1060 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
1061} 1185}
1062 1186
1063void recover_data_page(struct f2fs_sb_info *sbi, 1187void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1103,55 +1227,6 @@ void recover_data_page(struct f2fs_sb_info *sbi,
1103 mutex_unlock(&curseg->curseg_mutex); 1227 mutex_unlock(&curseg->curseg_mutex);
1104} 1228}
1105 1229
1106void rewrite_node_page(struct f2fs_sb_info *sbi,
1107 struct page *page, struct f2fs_summary *sum,
1108 block_t old_blkaddr, block_t new_blkaddr)
1109{
1110 struct sit_info *sit_i = SIT_I(sbi);
1111 int type = CURSEG_WARM_NODE;
1112 struct curseg_info *curseg;
1113 unsigned int segno, old_cursegno;
1114 block_t next_blkaddr = next_blkaddr_of_node(page);
1115 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1116 struct f2fs_io_info fio = {
1117 .type = NODE,
1118 .rw = WRITE_SYNC,
1119 };
1120
1121 curseg = CURSEG_I(sbi, type);
1122
1123 mutex_lock(&curseg->curseg_mutex);
1124 mutex_lock(&sit_i->sentry_lock);
1125
1126 segno = GET_SEGNO(sbi, new_blkaddr);
1127 old_cursegno = curseg->segno;
1128
1129 /* change the current segment */
1130 if (segno != curseg->segno) {
1131 curseg->next_segno = segno;
1132 change_curseg(sbi, type, true);
1133 }
1134 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1135 __add_sum_entry(sbi, type, sum);
1136
1137 /* change the current log to the next block addr in advance */
1138 if (next_segno != segno) {
1139 curseg->next_segno = next_segno;
1140 change_curseg(sbi, type, true);
1141 }
1142 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
1143
1144 /* rewrite node page */
1145 set_page_writeback(page);
1146 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
1147 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1148 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
1149 locate_dirty_segment(sbi, old_cursegno);
1150
1151 mutex_unlock(&sit_i->sentry_lock);
1152 mutex_unlock(&curseg->curseg_mutex);
1153}
1154
1155static inline bool is_merged_page(struct f2fs_sb_info *sbi, 1230static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1156 struct page *page, enum page_type type) 1231 struct page *page, enum page_type type)
1157{ 1232{
@@ -1179,8 +1254,9 @@ out:
1179void f2fs_wait_on_page_writeback(struct page *page, 1254void f2fs_wait_on_page_writeback(struct page *page,
1180 enum page_type type) 1255 enum page_type type)
1181{ 1256{
1182 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1183 if (PageWriteback(page)) { 1257 if (PageWriteback(page)) {
1258 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1259
1184 if (is_merged_page(sbi, page, type)) 1260 if (is_merged_page(sbi, page, type))
1185 f2fs_submit_merged_bio(sbi, type, WRITE); 1261 f2fs_submit_merged_bio(sbi, type, WRITE);
1186 wait_on_page_writeback(page); 1262 wait_on_page_writeback(page);
@@ -1449,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
1449 unsigned int segno) 1525 unsigned int segno)
1450{ 1526{
1451 struct sit_info *sit_i = SIT_I(sbi); 1527 struct sit_info *sit_i = SIT_I(sbi);
1452 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); 1528 unsigned int offset = SIT_BLOCK_OFFSET(segno);
1453 block_t blk_addr = sit_i->sit_base_addr + offset; 1529 block_t blk_addr = sit_i->sit_base_addr + offset;
1454 1530
1455 check_seg_range(sbi, segno); 1531 check_seg_range(sbi, segno);
@@ -1475,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1475 /* get current sit block page without lock */ 1551 /* get current sit block page without lock */
1476 src_page = get_meta_page(sbi, src_off); 1552 src_page = get_meta_page(sbi, src_off);
1477 dst_page = grab_meta_page(sbi, dst_off); 1553 dst_page = grab_meta_page(sbi, dst_off);
1478 f2fs_bug_on(PageDirty(src_page)); 1554 f2fs_bug_on(sbi, PageDirty(src_page));
1479 1555
1480 src_addr = page_address(src_page); 1556 src_addr = page_address(src_page);
1481 dst_addr = page_address(dst_page); 1557 dst_addr = page_address(dst_page);
@@ -1489,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1489 return dst_page; 1565 return dst_page;
1490} 1566}
1491 1567
1492static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) 1568static struct sit_entry_set *grab_sit_entry_set(void)
1569{
1570 struct sit_entry_set *ses =
1571 f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
1572
1573 ses->entry_cnt = 0;
1574 INIT_LIST_HEAD(&ses->set_list);
1575 return ses;
1576}
1577
1578static void release_sit_entry_set(struct sit_entry_set *ses)
1579{
1580 list_del(&ses->set_list);
1581 kmem_cache_free(sit_entry_set_slab, ses);
1582}
1583
1584static void adjust_sit_entry_set(struct sit_entry_set *ses,
1585 struct list_head *head)
1586{
1587 struct sit_entry_set *next = ses;
1588
1589 if (list_is_last(&ses->set_list, head))
1590 return;
1591
1592 list_for_each_entry_continue(next, head, set_list)
1593 if (ses->entry_cnt <= next->entry_cnt)
1594 break;
1595
1596 list_move_tail(&ses->set_list, &next->set_list);
1597}
1598
1599static void add_sit_entry(unsigned int segno, struct list_head *head)
1600{
1601 struct sit_entry_set *ses;
1602 unsigned int start_segno = START_SEGNO(segno);
1603
1604 list_for_each_entry(ses, head, set_list) {
1605 if (ses->start_segno == start_segno) {
1606 ses->entry_cnt++;
1607 adjust_sit_entry_set(ses, head);
1608 return;
1609 }
1610 }
1611
1612 ses = grab_sit_entry_set();
1613
1614 ses->start_segno = start_segno;
1615 ses->entry_cnt++;
1616 list_add(&ses->set_list, head);
1617}
1618
1619static void add_sits_in_set(struct f2fs_sb_info *sbi)
1620{
1621 struct f2fs_sm_info *sm_info = SM_I(sbi);
1622 struct list_head *set_list = &sm_info->sit_entry_set;
1623 unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
1624 unsigned int segno;
1625
1626 for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
1627 add_sit_entry(segno, set_list);
1628}
1629
1630static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
1493{ 1631{
1494 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1632 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1495 struct f2fs_summary_block *sum = curseg->sum_blk; 1633 struct f2fs_summary_block *sum = curseg->sum_blk;
1496 int i; 1634 int i;
1497 1635
1498 /* 1636 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
1499 * If the journal area in the current summary is full of sit entries, 1637 unsigned int segno;
1500 * all the sit entries will be flushed. Otherwise the sit entries 1638 bool dirtied;
1501 * are not able to replace with newly hot sit entries. 1639
1502 */ 1640 segno = le32_to_cpu(segno_in_journal(sum, i));
1503 if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { 1641 dirtied = __mark_sit_entry_dirty(sbi, segno);
1504 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { 1642
1505 unsigned int segno; 1643 if (!dirtied)
1506 segno = le32_to_cpu(segno_in_journal(sum, i)); 1644 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
1507 __mark_sit_entry_dirty(sbi, segno);
1508 }
1509 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1510 return true;
1511 } 1645 }
1512 return false; 1646 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1513} 1647}
1514 1648
1515/* 1649/*
1516 * CP calls this function, which flushes SIT entries including sit_journal, 1650 * CP calls this function, which flushes SIT entries including sit_journal,
1517 * and moves prefree segs to free segs. 1651 * and moves prefree segs to free segs.
1518 */ 1652 */
1519void flush_sit_entries(struct f2fs_sb_info *sbi) 1653void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1520{ 1654{
1521 struct sit_info *sit_i = SIT_I(sbi); 1655 struct sit_info *sit_i = SIT_I(sbi);
1522 unsigned long *bitmap = sit_i->dirty_sentries_bitmap; 1656 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1523 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1657 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1524 struct f2fs_summary_block *sum = curseg->sum_blk; 1658 struct f2fs_summary_block *sum = curseg->sum_blk;
1525 unsigned long nsegs = TOTAL_SEGS(sbi); 1659 struct sit_entry_set *ses, *tmp;
1526 struct page *page = NULL; 1660 struct list_head *head = &SM_I(sbi)->sit_entry_set;
1527 struct f2fs_sit_block *raw_sit = NULL; 1661 bool to_journal = true;
1528 unsigned int start = 0, end = 0; 1662 struct seg_entry *se;
1529 unsigned int segno;
1530 bool flushed;
1531 1663
1532 mutex_lock(&curseg->curseg_mutex); 1664 mutex_lock(&curseg->curseg_mutex);
1533 mutex_lock(&sit_i->sentry_lock); 1665 mutex_lock(&sit_i->sentry_lock);
1534 1666
1535 /* 1667 /*
1536 * "flushed" indicates whether sit entries in journal are flushed 1668 * add and account sit entries of dirty bitmap in sit entry
1537 * to the SIT area or not. 1669 * set temporarily
1538 */ 1670 */
1539 flushed = flush_sits_in_journal(sbi); 1671 add_sits_in_set(sbi);
1540 1672
1541 for_each_set_bit(segno, bitmap, nsegs) { 1673 /*
1542 struct seg_entry *se = get_seg_entry(sbi, segno); 1674 * if there is not enough space in the journal to store dirty sit
1543 int sit_offset, offset; 1675 * entries, remove all entries from the journal and add and account
1676 * them in the sit entry set.
1677 */
1678 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
1679 remove_sits_in_journal(sbi);
1544 1680
1545 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1681 if (!sit_i->dirty_sentries)
1682 goto out;
1546 1683
1547 /* add discard candidates */ 1684 /*
1548 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) 1685 * there are two steps to flush sit entries:
1549 add_discard_addrs(sbi, segno, se); 1686 * #1, flush sit entries to journal in current cold data summary block.
1687 * #2, flush sit entries to sit page.
1688 */
1689 list_for_each_entry_safe(ses, tmp, head, set_list) {
1690 struct page *page;
1691 struct f2fs_sit_block *raw_sit = NULL;
1692 unsigned int start_segno = ses->start_segno;
1693 unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
1694 (unsigned long)MAIN_SEGS(sbi));
1695 unsigned int segno = start_segno;
1696
1697 if (to_journal &&
1698 !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
1699 to_journal = false;
1700
1701 if (!to_journal) {
1702 page = get_next_sit_page(sbi, start_segno);
1703 raw_sit = page_address(page);
1704 }
1550 1705
1551 if (flushed) 1706 /* flush dirty sit entries in region of current sit set */
1552 goto to_sit_page; 1707 for_each_set_bit_from(segno, bitmap, end) {
1708 int offset, sit_offset;
1553 1709
1554 offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); 1710 se = get_seg_entry(sbi, segno);
1555 if (offset >= 0) { 1711
1556 segno_in_journal(sum, offset) = cpu_to_le32(segno); 1712 /* add discard candidates */
1557 seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); 1713 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
1558 goto flush_done; 1714 cpc->trim_start = segno;
1559 } 1715 add_discard_addrs(sbi, cpc);
1560to_sit_page:
1561 if (!page || (start > segno) || (segno > end)) {
1562 if (page) {
1563 f2fs_put_page(page, 1);
1564 page = NULL;
1565 } 1716 }
1566 1717
1567 start = START_SEGNO(sit_i, segno); 1718 if (to_journal) {
1568 end = start + SIT_ENTRY_PER_BLOCK - 1; 1719 offset = lookup_journal_in_cursum(sum,
1720 SIT_JOURNAL, segno, 1);
1721 f2fs_bug_on(sbi, offset < 0);
1722 segno_in_journal(sum, offset) =
1723 cpu_to_le32(segno);
1724 seg_info_to_raw_sit(se,
1725 &sit_in_journal(sum, offset));
1726 } else {
1727 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1728 seg_info_to_raw_sit(se,
1729 &raw_sit->entries[sit_offset]);
1730 }
1569 1731
1570 /* read sit block that will be updated */ 1732 __clear_bit(segno, bitmap);
1571 page = get_next_sit_page(sbi, start); 1733 sit_i->dirty_sentries--;
1572 raw_sit = page_address(page); 1734 ses->entry_cnt--;
1573 } 1735 }
1574 1736
1575 /* update entry in SIT block */ 1737 if (!to_journal)
1576 seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); 1738 f2fs_put_page(page, 1);
1577flush_done: 1739
1578 __clear_bit(segno, bitmap); 1740 f2fs_bug_on(sbi, ses->entry_cnt);
1579 sit_i->dirty_sentries--; 1741 release_sit_entry_set(ses);
1742 }
1743
1744 f2fs_bug_on(sbi, !list_empty(head));
1745 f2fs_bug_on(sbi, sit_i->dirty_sentries);
1746out:
1747 if (cpc->reason == CP_DISCARD) {
1748 for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
1749 add_discard_addrs(sbi, cpc);
1580 } 1750 }
1581 mutex_unlock(&sit_i->sentry_lock); 1751 mutex_unlock(&sit_i->sentry_lock);
1582 mutex_unlock(&curseg->curseg_mutex); 1752 mutex_unlock(&curseg->curseg_mutex);
1583 1753
1584 /* writeout last modified SIT block */
1585 f2fs_put_page(page, 1);
1586
1587 set_prefree_as_free_segments(sbi); 1754 set_prefree_as_free_segments(sbi);
1588} 1755}
1589 1756
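
The reworked flush path is easiest to read as a two-pass split: dirty SIT entries are grouped into per-SIT-block sets kept sorted by ascending entry count, the smallest sets are flushed into the SIT journal of the cold-data summary block while they still fit, and once a set no longer fits every remaining set goes to SIT pages. A minimal user-space model of that journal-first split, with an assumed journal capacity:

	#include <stdio.h>

	int main(void)
	{
		/* per-set entry counts, ascending as adjust_sit_entry_set keeps them */
		int sets[] = { 1, 2, 4, 9, 30 };
		int journal_room = 8;	/* assumed journal capacity, illustrative only */
		int to_journal = 1;

		for (int i = 0; i < 5; i++) {
			/* one-way flag: once a set spills to SIT pages, all later sets do */
			if (to_journal && sets[i] > journal_room)
				to_journal = 0;
			if (to_journal)
				journal_room -= sets[i];
			printf("set of %2d entries -> %s\n", sets[i],
					to_journal ? "journal" : "sit page");
		}
		return 0;
	}
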
@@ -1603,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1603 1770
1604 SM_I(sbi)->sit_info = sit_i; 1771 SM_I(sbi)->sit_info = sit_i;
1605 1772
1606 sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); 1773 sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
1607 if (!sit_i->sentries) 1774 if (!sit_i->sentries)
1608 return -ENOMEM; 1775 return -ENOMEM;
1609 1776
1610 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1777 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1611 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); 1778 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1612 if (!sit_i->dirty_sentries_bitmap) 1779 if (!sit_i->dirty_sentries_bitmap)
1613 return -ENOMEM; 1780 return -ENOMEM;
1614 1781
1615 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1782 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1616 sit_i->sentries[start].cur_valid_map 1783 sit_i->sentries[start].cur_valid_map
1617 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); 1784 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1618 sit_i->sentries[start].ckpt_valid_map 1785 sit_i->sentries[start].ckpt_valid_map
@@ -1623,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1623 } 1790 }
1624 1791
1625 if (sbi->segs_per_sec > 1) { 1792 if (sbi->segs_per_sec > 1) {
1626 sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * 1793 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
1627 sizeof(struct sec_entry)); 1794 sizeof(struct sec_entry));
1628 if (!sit_i->sec_entries) 1795 if (!sit_i->sec_entries)
1629 return -ENOMEM; 1796 return -ENOMEM;
@@ -1658,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1658 1825
1659static int build_free_segmap(struct f2fs_sb_info *sbi) 1826static int build_free_segmap(struct f2fs_sb_info *sbi)
1660{ 1827{
1661 struct f2fs_sm_info *sm_info = SM_I(sbi);
1662 struct free_segmap_info *free_i; 1828 struct free_segmap_info *free_i;
1663 unsigned int bitmap_size, sec_bitmap_size; 1829 unsigned int bitmap_size, sec_bitmap_size;
1664 1830
@@ -1669,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1669 1835
1670 SM_I(sbi)->free_info = free_i; 1836 SM_I(sbi)->free_info = free_i;
1671 1837
1672 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1838 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1673 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); 1839 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
1674 if (!free_i->free_segmap) 1840 if (!free_i->free_segmap)
1675 return -ENOMEM; 1841 return -ENOMEM;
1676 1842
1677 sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1843 sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1678 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); 1844 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
1679 if (!free_i->free_secmap) 1845 if (!free_i->free_secmap)
1680 return -ENOMEM; 1846 return -ENOMEM;
@@ -1684,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1684 memset(free_i->free_secmap, 0xff, sec_bitmap_size); 1850 memset(free_i->free_secmap, 0xff, sec_bitmap_size);
1685 1851
1686 /* init free segmap information */ 1852 /* init free segmap information */
1687 free_i->start_segno = 1853 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
1688 (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
1689 free_i->free_segments = 0; 1854 free_i->free_segments = 0;
1690 free_i->free_sections = 0; 1855 free_i->free_sections = 0;
1691 rwlock_init(&free_i->segmap_lock); 1856 rwlock_init(&free_i->segmap_lock);
@@ -1722,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1722 int sit_blk_cnt = SIT_BLK_CNT(sbi); 1887 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1723 unsigned int i, start, end; 1888 unsigned int i, start, end;
1724 unsigned int readed, start_blk = 0; 1889 unsigned int readed, start_blk = 0;
1725 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1890 int nrpages = MAX_BIO_BLOCKS(sbi);
1726 1891
1727 do { 1892 do {
1728 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); 1893 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1730,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1730 start = start_blk * sit_i->sents_per_block; 1895 start = start_blk * sit_i->sents_per_block;
1731 end = (start_blk + readed) * sit_i->sents_per_block; 1896 end = (start_blk + readed) * sit_i->sents_per_block;
1732 1897
1733 for (; start < end && start < TOTAL_SEGS(sbi); start++) { 1898 for (; start < end && start < MAIN_SEGS(sbi); start++) {
1734 struct seg_entry *se = &sit_i->sentries[start]; 1899 struct seg_entry *se = &sit_i->sentries[start];
1735 struct f2fs_sit_block *sit_blk; 1900 struct f2fs_sit_block *sit_blk;
1736 struct f2fs_sit_entry sit; 1901 struct f2fs_sit_entry sit;
@@ -1768,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
1768 unsigned int start; 1933 unsigned int start;
1769 int type; 1934 int type;
1770 1935
1771 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1936 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1772 struct seg_entry *sentry = get_seg_entry(sbi, start); 1937 struct seg_entry *sentry = get_seg_entry(sbi, start);
1773 if (!sentry->valid_blocks) 1938 if (!sentry->valid_blocks)
1774 __set_free(sbi, start); 1939 __set_free(sbi, start);
@@ -1785,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1785{ 1950{
1786 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1951 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1787 struct free_segmap_info *free_i = FREE_I(sbi); 1952 struct free_segmap_info *free_i = FREE_I(sbi);
1788 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); 1953 unsigned int segno = 0, offset = 0;
1789 unsigned short valid_blocks; 1954 unsigned short valid_blocks;
1790 1955
1791 while (1) { 1956 while (1) {
1792 /* find dirty segment based on free segmap */ 1957 /* find dirty segment based on free segmap */
1793 segno = find_next_inuse(free_i, total_segs, offset); 1958 segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
1794 if (segno >= total_segs) 1959 if (segno >= MAIN_SEGS(sbi))
1795 break; 1960 break;
1796 offset = segno + 1; 1961 offset = segno + 1;
1797 valid_blocks = get_valid_blocks(sbi, segno, 0); 1962 valid_blocks = get_valid_blocks(sbi, segno, 0);
1798 if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) 1963 if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
1964 continue;
1965 if (valid_blocks > sbi->blocks_per_seg) {
1966 f2fs_bug_on(sbi, 1);
1799 continue; 1967 continue;
1968 }
1800 mutex_lock(&dirty_i->seglist_lock); 1969 mutex_lock(&dirty_i->seglist_lock);
1801 __locate_dirty_segment(sbi, segno, DIRTY); 1970 __locate_dirty_segment(sbi, segno, DIRTY);
1802 mutex_unlock(&dirty_i->seglist_lock); 1971 mutex_unlock(&dirty_i->seglist_lock);
@@ -1806,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1806static int init_victim_secmap(struct f2fs_sb_info *sbi) 1975static int init_victim_secmap(struct f2fs_sb_info *sbi)
1807{ 1976{
1808 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1977 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1809 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1978 unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1810 1979
1811 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); 1980 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
1812 if (!dirty_i->victim_secmap) 1981 if (!dirty_i->victim_secmap)
@@ -1827,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
1827 SM_I(sbi)->dirty_info = dirty_i; 1996 SM_I(sbi)->dirty_info = dirty_i;
1828 mutex_init(&dirty_i->seglist_lock); 1997 mutex_init(&dirty_i->seglist_lock);
1829 1998
1830 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1999 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1831 2000
1832 for (i = 0; i < NR_DIRTY_TYPE; i++) { 2001 for (i = 0; i < NR_DIRTY_TYPE; i++) {
1833 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); 2002 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1851,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
1851 2020
1852 sit_i->min_mtime = LLONG_MAX; 2021 sit_i->min_mtime = LLONG_MAX;
1853 2022
1854 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 2023 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
1855 unsigned int i; 2024 unsigned int i;
1856 unsigned long long mtime = 0; 2025 unsigned long long mtime = 0;
1857 2026
@@ -1889,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1889 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 2058 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1890 sm_info->rec_prefree_segments = sm_info->main_segments * 2059 sm_info->rec_prefree_segments = sm_info->main_segments *
1891 DEF_RECLAIM_PREFREE_SEGMENTS / 100; 2060 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1892 sm_info->ipu_policy = F2FS_IPU_DISABLE; 2061 sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
1893 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 2062 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
2063 sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
1894 2064
1895 INIT_LIST_HEAD(&sm_info->discard_list); 2065 INIT_LIST_HEAD(&sm_info->discard_list);
1896 sm_info->nr_discards = 0; 2066 sm_info->nr_discards = 0;
1897 sm_info->max_discards = 0; 2067 sm_info->max_discards = 0;
1898 2068
2069 INIT_LIST_HEAD(&sm_info->sit_entry_set);
2070
1899 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { 2071 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
1900 err = create_flush_cmd_control(sbi); 2072 err = create_flush_cmd_control(sbi);
1901 if (err) 2073 if (err)
@@ -1991,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
1991 return; 2163 return;
1992 2164
1993 if (sit_i->sentries) { 2165 if (sit_i->sentries) {
1994 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 2166 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1995 kfree(sit_i->sentries[start].cur_valid_map); 2167 kfree(sit_i->sentries[start].cur_valid_map);
1996 kfree(sit_i->sentries[start].ckpt_valid_map); 2168 kfree(sit_i->sentries[start].ckpt_valid_map);
1997 } 2169 }
@@ -2025,11 +2197,30 @@ int __init create_segment_manager_caches(void)
2025 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 2197 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
2026 sizeof(struct discard_entry)); 2198 sizeof(struct discard_entry));
2027 if (!discard_entry_slab) 2199 if (!discard_entry_slab)
2028 return -ENOMEM; 2200 goto fail;
2201
2202 sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
2203 sizeof(struct sit_entry_set));
2204 if (!sit_entry_set_slab)
2205 goto destroy_discard_entry;
2206
2207 inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
2208 sizeof(struct inmem_pages));
2209 if (!inmem_entry_slab)
2210 goto destroy_sit_entry_set;
2029 return 0; 2211 return 0;
2212
2213destroy_sit_entry_set:
2214 kmem_cache_destroy(sit_entry_set_slab);
2215destroy_discard_entry:
2216 kmem_cache_destroy(discard_entry_slab);
2217fail:
2218 return -ENOMEM;
2030} 2219}
2031 2220
2032void destroy_segment_manager_caches(void) 2221void destroy_segment_manager_caches(void)
2033{ 2222{
2223 kmem_cache_destroy(sit_entry_set_slab);
2034 kmem_cache_destroy(discard_entry_slab); 2224 kmem_cache_destroy(discard_entry_slab);
2225 kmem_cache_destroy(inmem_entry_slab);
2035} 2226}
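
create_segment_manager_caches() above grows from one slab to three, so its error path switches from a bare return to a goto-unwind ladder: each label releases exactly what was allocated before the failing step, in reverse order. The same idiom as a self-contained sketch (names and sizes are illustrative):

	#include <stdlib.h>

	static void *a, *b, *c;

	static int create_three_caches(void)
	{
		a = malloc(16);
		if (!a)
			goto fail;
		b = malloc(16);
		if (!b)
			goto free_a;
		c = malloc(16);
		if (!c)
			goto free_b;
		return 0;	/* caller owns a, b, c on success */

	free_b:
		free(b);
	free_a:
		free(a);
	fail:
		return -1;	/* -ENOMEM in the kernel */
	}

	int main(void)
	{
		return create_three_caches() ? 1 : 0;
	}
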
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 55973f7b0330..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ 45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
46 sbi->segs_per_sec)) \ 46 sbi->segs_per_sec)) \
47 47
48#define START_BLOCK(sbi, segno) \ 48#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
49 (SM_I(sbi)->seg0_blkaddr + \ 49#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
50
51#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
52#define MAIN_SECS(sbi) (sbi->total_sections)
53
54#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
55#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
56
57#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
58#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
59 sbi->log_blocks_per_seg))
60
61#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
50 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) 62 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
63
51#define NEXT_FREE_BLKADDR(sbi, curseg) \ 64#define NEXT_FREE_BLKADDR(sbi, curseg) \
52 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) 65 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
53 66
54#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) 67#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
55
56#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 68#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 69 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ 70#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
@@ -77,23 +87,21 @@
77 87
78#define SIT_ENTRY_OFFSET(sit_i, segno) \ 88#define SIT_ENTRY_OFFSET(sit_i, segno) \
79 (segno % sit_i->sents_per_block) 89 (segno % sit_i->sents_per_block)
80#define SIT_BLOCK_OFFSET(sit_i, segno) \ 90#define SIT_BLOCK_OFFSET(segno) \
81 (segno / SIT_ENTRY_PER_BLOCK) 91 (segno / SIT_ENTRY_PER_BLOCK)
82#define START_SEGNO(sit_i, segno) \ 92#define START_SEGNO(segno) \
83 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 93 (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
84#define SIT_BLK_CNT(sbi) \ 94#define SIT_BLK_CNT(sbi) \
85 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) 95 ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 96#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 97 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections)
90 98
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 99#define SECTOR_FROM_BLOCK(blk_addr) \
92 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) 100 (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 101#define SECTOR_TO_BLOCK(sectors) \
94 (sectors >> (sbi)->log_sectors_per_block) 102 (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 103#define MAX_BIO_BLOCKS(sbi) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 104 ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
97 105
98/* 106/*
99 * indicate a block allocation direction: RIGHT and LEFT. 107 * indicate a block allocation direction: RIGHT and LEFT.
@@ -167,6 +175,11 @@ struct segment_allocation {
167 void (*allocate_segment)(struct f2fs_sb_info *, int, bool); 175 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
168}; 176};
169 177
178struct inmem_pages {
179 struct list_head list;
180 struct page *page;
181};
182
170struct sit_info { 183struct sit_info {
171 const struct segment_allocation *s_ops; 184 const struct segment_allocation *s_ops;
172 185
@@ -237,6 +250,12 @@ struct curseg_info {
237 unsigned int next_segno; /* preallocated segment */ 250 unsigned int next_segno; /* preallocated segment */
238}; 251};
239 252
253struct sit_entry_set {
254 struct list_head set_list; /* link with all sit sets */
255 unsigned int start_segno; /* start segno of sits in set */
256 unsigned int entry_cnt; /* the # of sit entries in set */
257};
258
240/* 259/*
241 * inline functions 260 * inline functions
242 */ 261 */
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
316 clear_bit(segno, free_i->free_segmap); 335 clear_bit(segno, free_i->free_segmap);
317 free_i->free_segments++; 336 free_i->free_segments++;
318 337
319 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); 338 next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
320 if (next >= start_segno + sbi->segs_per_sec) { 339 if (next >= start_segno + sbi->segs_per_sec) {
321 clear_bit(secno, free_i->free_secmap); 340 clear_bit(secno, free_i->free_secmap);
322 free_i->free_sections++; 341 free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
430 449
431static inline bool need_SSR(struct f2fs_sb_info *sbi) 450static inline bool need_SSR(struct f2fs_sb_info *sbi)
432{ 451{
433 return (prefree_segments(sbi) / sbi->segs_per_sec) 452 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
434 + free_sections(sbi) < overprovision_sections(sbi); 453 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
454 return free_sections(sbi) <= (node_secs + 2 * dent_secs +
455 reserved_sections(sbi) + 1);
435} 456}
436 457
437static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 458static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
466 * F2FS_IPU_UTIL - if FS utilization is over threshold, 487 * F2FS_IPU_UTIL - if FS utilization is over threshold,
467 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over 488 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
468 * threshold, 489 * threshold,
490 * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
491 * storages. IPU will be triggered only if the # of dirty
492 * pages exceeds min_fsync_blocks.
469 * F2FS_IPU_DISABLE - disable IPU. (=default option) 493 * F2FS_IPU_DISABLE - disable IPU. (=default option)
470 */ 494 */
471#define DEF_MIN_IPU_UTIL 70 495#define DEF_MIN_IPU_UTIL 70
496#define DEF_MIN_FSYNC_BLOCKS 8
472 497
473enum { 498enum {
474 F2FS_IPU_FORCE, 499 F2FS_IPU_FORCE,
475 F2FS_IPU_SSR, 500 F2FS_IPU_SSR,
476 F2FS_IPU_UTIL, 501 F2FS_IPU_UTIL,
477 F2FS_IPU_SSR_UTIL, 502 F2FS_IPU_SSR_UTIL,
478 F2FS_IPU_DISABLE, 503 F2FS_IPU_FSYNC,
479}; 504};
480 505
481static inline bool need_inplace_update(struct inode *inode) 506static inline bool need_inplace_update(struct inode *inode)
482{ 507{
483 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 508 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
509 unsigned int policy = SM_I(sbi)->ipu_policy;
484 510
485 /* IPU can be done only for the user data */ 511 /* IPU can be done only for the user data */
486 if (S_ISDIR(inode->i_mode)) 512 if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
487 return false; 513 return false;
488 514
489 /* this is only set during fdatasync */ 515 if (policy & (0x1 << F2FS_IPU_FORCE))
490 if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) 516 return true;
517 if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
518 return true;
519 if (policy & (0x1 << F2FS_IPU_UTIL) &&
520 utilization(sbi) > SM_I(sbi)->min_ipu_util)
521 return true;
522 if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
523 utilization(sbi) > SM_I(sbi)->min_ipu_util)
491 return true; 524 return true;
492 525
493 switch (SM_I(sbi)->ipu_policy) { 526 /* this is only set during fdatasync */
494 case F2FS_IPU_FORCE: 527 if (policy & (0x1 << F2FS_IPU_FSYNC) &&
528 is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
495 return true; 529 return true;
496 case F2FS_IPU_SSR: 530
497 if (need_SSR(sbi))
498 return true;
499 break;
500 case F2FS_IPU_UTIL:
501 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
502 return true;
503 break;
504 case F2FS_IPU_SSR_UTIL:
505 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
506 return true;
507 break;
508 case F2FS_IPU_DISABLE:
509 break;
510 }
511 return false; 531 return false;
512} 532}
513 533
@@ -534,28 +554,21 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
534#ifdef CONFIG_F2FS_CHECK_FS 554#ifdef CONFIG_F2FS_CHECK_FS
535static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) 555static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
536{ 556{
537 unsigned int end_segno = SM_I(sbi)->segment_count - 1; 557 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
538 BUG_ON(segno > end_segno);
539} 558}
540 559
541static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) 560static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
542{ 561{
543 struct f2fs_sm_info *sm_info = SM_I(sbi); 562 BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
544 block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; 563 BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
545 block_t start_addr = sm_info->seg0_blkaddr;
546 block_t end_addr = start_addr + total_blks - 1;
547 BUG_ON(blk_addr < start_addr);
548 BUG_ON(blk_addr > end_addr);
549} 564}
550 565
551/* 566/*
552 * Summary block is always treated as invalid block 567 * Summary block is always treated as an invalid block
553 */ 568 */
554static inline void check_block_count(struct f2fs_sb_info *sbi, 569static inline void check_block_count(struct f2fs_sb_info *sbi,
555 int segno, struct f2fs_sit_entry *raw_sit) 570 int segno, struct f2fs_sit_entry *raw_sit)
556{ 571{
557 struct f2fs_sm_info *sm_info = SM_I(sbi);
558 unsigned int end_segno = sm_info->segment_count - 1;
559 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; 572 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
560 int valid_blocks = 0; 573 int valid_blocks = 0;
561 int cur_pos = 0, next_pos; 574 int cur_pos = 0, next_pos;
@@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
564 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); 577 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
565 578
566 /* check boundary of a given segment number */ 579 /* check boundary of a given segment number */
567 BUG_ON(segno > end_segno); 580 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
568 581
569 /* check bitmap with valid block count */ 582 /* check bitmap with valid block count */
570 do { 583 do {
@@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
583 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); 596 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
584} 597}
585#else 598#else
586#define check_seg_range(sbi, segno) 599static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
587#define verify_block_addr(sbi, blk_addr) 600{
588#define check_block_count(sbi, segno, raw_sit) 601 if (segno > TOTAL_SEGS(sbi) - 1)
602 sbi->need_fsck = true;
603}
604
605static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
606{
607 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
608 sbi->need_fsck = true;
609}
610
611/*
612 * Summary block is always treated as an invalid block
613 */
614static inline void check_block_count(struct f2fs_sb_info *sbi,
615 int segno, struct f2fs_sit_entry *raw_sit)
616{
617 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true;
620
621 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true;
624}
589#endif 625#endif
590 626
591static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, 627static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
592 unsigned int start) 628 unsigned int start)
593{ 629{
594 struct sit_info *sit_i = SIT_I(sbi); 630 struct sit_info *sit_i = SIT_I(sbi);
595 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); 631 unsigned int offset = SIT_BLOCK_OFFSET(start);
596 block_t blk_addr = sit_i->sit_base_addr + offset; 632 block_t blk_addr = sit_i->sit_base_addr + offset;
597 633
598 check_seg_range(sbi, start); 634 check_seg_range(sbi, start);
@@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
619 655
620static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) 656static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
621{ 657{
622 unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); 658 unsigned int block_off = SIT_BLOCK_OFFSET(start);
623 659
624 if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) 660 if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
625 f2fs_clear_bit(block_off, sit_i->sit_bitmap); 661 f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
666{ 702{
667 struct block_device *bdev = sbi->sb->s_bdev; 703 struct block_device *bdev = sbi->sb->s_bdev;
668 struct request_queue *q = bdev_get_queue(bdev); 704 struct request_queue *q = bdev_get_queue(bdev);
669 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 705 return SECTOR_TO_BLOCK(queue_max_sectors(q));
670} 706}
671 707
672/* 708/*
@@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
683 else if (type == NODE) 719 else if (type == NODE)
684 return 3 * sbi->blocks_per_seg; 720 return 3 * sbi->blocks_per_seg;
685 else if (type == META) 721 else if (type == META)
686 return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 722 return MAX_BIO_BLOCKS(sbi);
687 else 723 else
688 return 0; 724 return 0;
689} 725}
@@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
706 else if (type == NODE) 742 else if (type == NODE)
707 desired = 3 * max_hw_blocks(sbi); 743 desired = 3 * max_hw_blocks(sbi);
708 else 744 else
709 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 745 desired = MAX_BIO_BLOCKS(sbi);
710 746
711 wbc->nr_to_write = desired; 747 wbc->nr_to_write = desired;
712 return desired - nr_to_write; 748 return desired - nr_to_write;
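
The need_inplace_update() rewrite above also changes the contract of the ipu_policy sysfs knob: it was a single enum value consumed by a switch, and is now a bitmask, so several IPU policies can be enabled at once (and a value of 0 effectively disables IPU, replacing the old F2FS_IPU_DISABLE member). A sketch of the new encoding, using the enum order from this diff:

	#include <stdio.h>

	enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL,
	       F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC };

	int main(void)
	{
		/* e.g. "echo 18 > .../ipu_policy" sets SSR | FSYNC (2 + 16) */
		unsigned int policy = (1 << F2FS_IPU_SSR) | (1 << F2FS_IPU_FSYNC);

		printf("force: %d\n", !!(policy & (1 << F2FS_IPU_FORCE)));
		printf("ssr:   %d\n", !!(policy & (1 << F2FS_IPU_SSR)));
		printf("fsync: %d\n", !!(policy & (1 << F2FS_IPU_FSYNC)));
		return 0;
	}
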
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 657582fc7601..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
193F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 194F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
194F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 196F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
204 ATTR_LIST(max_small_discards), 205 ATTR_LIST(max_small_discards),
205 ATTR_LIST(ipu_policy), 206 ATTR_LIST(ipu_policy),
206 ATTR_LIST(min_ipu_util), 207 ATTR_LIST(min_ipu_util),
208 ATTR_LIST(min_fsync_blocks),
207 ATTR_LIST(max_victim_search), 209 ATTR_LIST(max_victim_search),
208 ATTR_LIST(dir_level), 210 ATTR_LIST(dir_level),
209 ATTR_LIST(ram_thresh), 211 ATTR_LIST(ram_thresh),
@@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
366 368
367 /* Initialize f2fs-specific inode info */ 369 /* Initialize f2fs-specific inode info */
368 fi->vfs_inode.i_version = 1; 370 fi->vfs_inode.i_version = 1;
369 atomic_set(&fi->dirty_dents, 0); 371 atomic_set(&fi->dirty_pages, 0);
370 fi->i_current_depth = 1; 372 fi->i_current_depth = 1;
371 fi->i_advise = 0; 373 fi->i_advise = 0;
372 rwlock_init(&fi->ext.ext_lock); 374 rwlock_init(&fi->ext.ext_lock);
373 init_rwsem(&fi->i_sem); 375 init_rwsem(&fi->i_sem);
376 INIT_LIST_HEAD(&fi->inmem_pages);
377 mutex_init(&fi->inmem_lock);
374 378
375 set_inode_flag(fi, FI_NEW_INODE); 379 set_inode_flag(fi, FI_NEW_INODE);
376 380
@@ -432,8 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
432 stop_gc_thread(sbi); 436 stop_gc_thread(sbi);
433 437
434 /* We don't need to do checkpoint when it's clean */ 438 /* We don't need to do checkpoint when it's clean */
435 if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) 439 if (sbi->s_dirty) {
436 write_checkpoint(sbi, true); 440 struct cp_control cpc = {
441 .reason = CP_UMOUNT,
442 };
443 write_checkpoint(sbi, &cpc);
444 }
445
446 /*
447 * normally superblock is clean, so we need to release this.
448 * In addition, when EIO occurs the checkpoint is skipped, so we need this as well.
449 */
450 release_dirty_inode(sbi);
451 release_discard_addrs(sbi);
437 452
438 iput(sbi->node_inode); 453 iput(sbi->node_inode);
439 iput(sbi->meta_inode); 454 iput(sbi->meta_inode);
@@ -457,12 +472,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
457 472
458 trace_f2fs_sync_fs(sb, sync); 473 trace_f2fs_sync_fs(sb, sync);
459 474
460 if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
461 return 0;
462
463 if (sync) { 475 if (sync) {
476 struct cp_control cpc = {
477 .reason = CP_SYNC,
478 };
464 mutex_lock(&sbi->gc_mutex); 479 mutex_lock(&sbi->gc_mutex);
465 write_checkpoint(sbi, false); 480 write_checkpoint(sbi, &cpc);
466 mutex_unlock(&sbi->gc_mutex); 481 mutex_unlock(&sbi->gc_mutex);
467 } else { 482 } else {
468 f2fs_balance_fs(sbi); 483 f2fs_balance_fs(sbi);
@@ -505,8 +520,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
505 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; 520 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
506 buf->f_bavail = user_block_count - valid_user_blocks(sbi); 521 buf->f_bavail = user_block_count - valid_user_blocks(sbi);
507 522
508 buf->f_files = sbi->total_node_count; 523 buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
509 buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); 524 buf->f_ffree = buf->f_files - valid_inode_count(sbi);
510 525
511 buf->f_namelen = F2FS_NAME_LEN; 526 buf->f_namelen = F2FS_NAME_LEN;
512 buf->f_fsid.val[0] = (u32)id; 527 buf->f_fsid.val[0] = (u32)id;
@@ -613,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
613 org_mount_opt = sbi->mount_opt; 628 org_mount_opt = sbi->mount_opt;
614 active_logs = sbi->active_logs; 629 active_logs = sbi->active_logs;
615 630
631 sbi->mount_opt.opt = 0;
632 sbi->active_logs = NR_CURSEG_TYPE;
633
616 /* parse mount options */ 634 /* parse mount options */
617 err = parse_options(sb, data); 635 err = parse_options(sb, data);
618 if (err) 636 if (err)
@@ -663,7 +681,7 @@ restore_gc:
663 if (need_restart_gc) { 681 if (need_restart_gc) {
664 if (start_gc_thread(sbi)) 682 if (start_gc_thread(sbi))
665 f2fs_msg(sbi->sb, KERN_WARNING, 683 f2fs_msg(sbi->sb, KERN_WARNING,
666 "background gc thread is stop"); 684 "background gc thread has stopped");
667 } else if (need_stop_gc) { 685 } else if (need_stop_gc) {
668 stop_gc_thread(sbi); 686 stop_gc_thread(sbi);
669 } 687 }
@@ -783,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
783 return 1; 801 return 1;
784 } 802 }
785 803
786 if (le32_to_cpu(raw_super->log_sectorsize) != 804 /* Currently, support 512/1024/2048/4096 bytes sector size */
787 F2FS_LOG_SECTOR_SIZE) { 805 if (le32_to_cpu(raw_super->log_sectorsize) >
788 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); 806 F2FS_MAX_LOG_SECTOR_SIZE ||
807 le32_to_cpu(raw_super->log_sectorsize) <
808 F2FS_MIN_LOG_SECTOR_SIZE) {
809 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
810 le32_to_cpu(raw_super->log_sectorsize));
789 return 1; 811 return 1;
790 } 812 }
791 if (le32_to_cpu(raw_super->log_sectors_per_block) != 813 if (le32_to_cpu(raw_super->log_sectors_per_block) +
792 F2FS_LOG_SECTORS_PER_BLOCK) { 814 le32_to_cpu(raw_super->log_sectorsize) !=
793 f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); 815 F2FS_MAX_LOG_SECTOR_SIZE) {
816 f2fs_msg(sb, KERN_INFO,
817 "Invalid log sectors per block(%u) log sectorsize(%u)",
818 le32_to_cpu(raw_super->log_sectors_per_block),
819 le32_to_cpu(raw_super->log_sectorsize));
794 return 1; 820 return 1;
795 } 821 }
796 return 0; 822 return 0;
@@ -812,7 +838,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
812 if (unlikely(fsmeta >= total)) 838 if (unlikely(fsmeta >= total))
813 return 1; 839 return 1;
814 840
815 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { 841 if (unlikely(f2fs_cp_error(sbi))) {
816 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 842 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
817 return 1; 843 return 1;
818 } 844 }
@@ -846,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
846 atomic_set(&sbi->nr_pages[i], 0); 872 atomic_set(&sbi->nr_pages[i], 0);
847 873
848 sbi->dir_level = DEF_DIR_LEVEL; 874 sbi->dir_level = DEF_DIR_LEVEL;
875 sbi->need_fsck = false;
849} 876}
850 877
851/* 878/*
@@ -899,8 +926,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
899 struct buffer_head *raw_super_buf; 926 struct buffer_head *raw_super_buf;
900 struct inode *root; 927 struct inode *root;
901 long err = -EINVAL; 928 long err = -EINVAL;
929 bool retry = true;
902 int i; 930 int i;
903 931
932try_onemore:
904 /* allocate memory for f2fs-specific super block info */ 933 /* allocate memory for f2fs-specific super block info */
905 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 934 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
906 if (!sbi) 935 if (!sbi)
@@ -1077,12 +1106,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1077 if (err) 1106 if (err)
1078 goto free_proc; 1107 goto free_proc;
1079 1108
1109 if (!retry)
1110 sbi->need_fsck = true;
1111
1080 /* recover fsynced data */ 1112 /* recover fsynced data */
1081 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1113 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1082 err = recover_fsync_data(sbi); 1114 err = recover_fsync_data(sbi);
1083 if (err) 1115 if (err) {
1084 f2fs_msg(sb, KERN_ERR, 1116 f2fs_msg(sb, KERN_ERR,
1085 "Cannot recover all fsync data errno=%ld", err); 1117 "Cannot recover all fsync data errno=%ld", err);
1118 goto free_kobj;
1119 }
1086 } 1120 }
1087 1121
1088 /* 1122 /*
@@ -1123,6 +1157,13 @@ free_sb_buf:
1123 brelse(raw_super_buf); 1157 brelse(raw_super_buf);
1124free_sbi: 1158free_sbi:
1125 kfree(sbi); 1159 kfree(sbi);
1160
 1161 /* give only one more chance */
1162 if (retry) {
1163 retry = 0;
1164 shrink_dcache_sb(sb);
1165 goto try_onemore;
1166 }
1126 return err; 1167 return err;
1127} 1168}
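
The mount path now retries exactly once: `retry` starts true, a failure tears the partial state down, shrinks the dcache and jumps back to `try_onemore`, and on the second pass `need_fsck` is set so roll-forward recovery errors become fatal rather than being logged and ignored. A compact sketch of that retry-once shape; `setup()` here is a placeholder, not an f2fs function:

    #include <stdbool.h>
    #include <stdio.h>

    static int setup(bool need_fsck) { return need_fsck ? 0 : -1; } /* placeholder */

    /* Retry-once pattern: the first failure frees state and loops back;
     * the second attempt runs in a more conservative mode and its error
     * is final. */
    static int fill_super(void)
    {
        bool retry = true;
        int err;

    try_onemore:
        err = setup(!retry);      /* second pass flags the fs for fsck */
        if (err) {
            if (retry) {          /* give only one more chance */
                retry = false;
                goto try_onemore;
            }
            return err;
        }
        return 0;
    }

    int main(void) { printf("fill_super() = %d\n", fill_super()); return 0; }
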
1128 1169
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941ee309..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
266 266
267static void *read_all_xattrs(struct inode *inode, struct page *ipage) 267static void *read_all_xattrs(struct inode *inode, struct page *ipage)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 struct f2fs_xattr_header *header; 270 struct f2fs_xattr_header *header;
271 size_t size = PAGE_SIZE, inline_size = 0; 271 size_t size = PAGE_SIZE, inline_size = 0;
272 void *txattr_addr; 272 void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
325static inline int write_all_xattrs(struct inode *inode, __u32 hsize, 325static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
326 void *txattr_addr, struct page *ipage) 326 void *txattr_addr, struct page *ipage)
327{ 327{
328 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 328 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
329 size_t inline_size = 0; 329 size_t inline_size = 0;
330 void *xattr_addr; 330 void *xattr_addr;
331 struct page *xpage; 331 struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
373 alloc_nid_failed(sbi, new_nid); 373 alloc_nid_failed(sbi, new_nid);
374 return PTR_ERR(xpage); 374 return PTR_ERR(xpage);
375 } 375 }
376 f2fs_bug_on(new_nid); 376 f2fs_bug_on(sbi, new_nid);
377 f2fs_wait_on_page_writeback(xpage, NODE); 377 f2fs_wait_on_page_writeback(xpage, NODE);
378 } else { 378 } else {
379 struct dnode_of_data dn; 379 struct dnode_of_data dn;
@@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
528 int free; 528 int free;
529 /* 529 /*
 530 * If value is NULL, it is a remove operation. 530 * If value is NULL, it is a remove operation.
531 * In case of update operation, we caculate free. 531 * In case of update operation, we calculate free.
532 */ 532 */
533 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); 533 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
534 if (found) 534 if (found)
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
596 const void *value, size_t size, 596 const void *value, size_t size,
597 struct page *ipage, int flags) 597 struct page *ipage, int flags)
598{ 598{
599 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 599 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
600 int err; 600 int err;
601 601
602 /* this case is only from init_inode_metadata */ 602 /* this case is only from init_inode_metadata */
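
The xattr paths swap the open-coded `F2FS_SB(inode->i_sb)` for an `F2FS_I_SB(inode)` helper, so the inode-to-sb-info mapping lives in one place. A sketch of that accessor-wrapping pattern with stand-in types (none of these are the f2fs definitions):

    #include <stdio.h>

    struct sb_info { int id; };                   /* stand-in for f2fs_sb_info */
    struct super_block { struct sb_info *s_fs_info; };
    struct inode { struct super_block *i_sb; };

    static struct sb_info *SB_INFO(struct super_block *sb) { return sb->s_fs_info; }

    /* One helper instead of SB_INFO(inode->i_sb) at every call site. */
    static struct sb_info *I_SB(struct inode *inode) { return SB_INFO(inode->i_sb); }

    int main(void)
    {
        struct sb_info si = { 42 };
        struct super_block sb = { &si };
        struct inode in = { &sb };
        printf("sb id via inode: %d\n", I_SB(&in)->id);
        return 0;
    }
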
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..0bab12b20460 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages)
331 331
332 n = (mempages * (PAGE_SIZE / 1024)) / 10; 332 n = (mempages * (PAGE_SIZE / 1024)) / 10;
333 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 333 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
334 percpu_counter_init(&nr_files, 0); 334 percpu_counter_init(&nr_files, 0, GFP_KERNEL);
335} 335}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
982submit_op_failed: 982submit_op_failed:
983 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); 983 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
984 spin_unlock(&cookie->lock); 984 spin_unlock(&cookie->lock);
985 fscache_unuse_cookie(object);
985 kfree(op); 986 kfree(op);
986 _leave(" [EIO]"); 987 _leave(" [EIO]");
987 return transit_to(KILL_OBJECT); 988 return transit_to(KILL_OBJECT);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 85332b9d19d1..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
44EXPORT_SYMBOL(__fscache_wait_on_page_write); 44EXPORT_SYMBOL(__fscache_wait_on_page_write);
45 45
46/* 46/*
47 * wait for a page to finish being written to the cache. Put a timeout here
48 * since we might be called recursively via parent fs.
49 */
50static
51bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
52{
53 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
54
55 return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
56 HZ);
57}
58
59/*
47 * decide whether a page can be released, possibly by cancelling a store to it 60 * decide whether a page can be released, possibly by cancelling a store to it
48 * - we're allowed to sleep if __GFP_WAIT is flagged 61 * - we're allowed to sleep if __GFP_WAIT is flagged
49 */ 62 */
@@ -115,7 +128,10 @@ page_busy:
115 } 128 }
116 129
117 fscache_stat(&fscache_n_store_vmscan_wait); 130 fscache_stat(&fscache_n_store_vmscan_wait);
118 __fscache_wait_on_page_write(cookie, page); 131 if (!release_page_wait_timeout(cookie, page))
132 _debug("fscache writeout timeout page: %p{%lx}",
133 page, page->index);
134
119 gfp &= ~__GFP_WAIT; 135 gfp &= ~__GFP_WAIT;
120 goto try_again; 136 goto try_again;
121} 137}
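
Bounding the wait with `wait_event_timeout(..., HZ)` keeps page reclaim from stalling indefinitely on a cache write, which can otherwise deadlock when fscache is re-entered through the parent filesystem. A userspace sketch of the same wait-with-deadline shape, using polling where the kernel sleeps on a waitqueue:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /* Poll a condition with a deadline; returns true if it became true in
     * time, false on timeout (the caller then just carries on). */
    static bool wait_timeout(bool (*cond)(void), double seconds)
    {
        struct timespec start, now;
        clock_gettime(CLOCK_MONOTONIC, &start);
        for (;;) {
            if (cond())
                return true;
            clock_gettime(CLOCK_MONOTONIC, &now);
            double elapsed = (now.tv_sec - start.tv_sec) +
                             (now.tv_nsec - start.tv_nsec) / 1e9;
            if (elapsed >= seconds)
                return false;
            nanosleep(&(struct timespec){ .tv_nsec = 1000000 }, NULL); /* 1 ms */
        }
    }

    static bool never_done(void) { return false; }

    int main(void)
    {
        if (!wait_timeout(never_done, 0.05))
            printf("timed out, not deadlocked\n");
        return 0;
    }
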
@@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
182{ 198{
183 struct fscache_operation *op; 199 struct fscache_operation *op;
184 struct fscache_object *object; 200 struct fscache_object *object;
185 bool wake_cookie; 201 bool wake_cookie = false;
186 202
187 _enter("%p", cookie); 203 _enter("%p", cookie);
188 204
@@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
212 228
213 __fscache_use_cookie(cookie); 229 __fscache_use_cookie(cookie);
214 if (fscache_submit_exclusive_op(object, op) < 0) 230 if (fscache_submit_exclusive_op(object, op) < 0)
215 goto nobufs; 231 goto nobufs_dec;
216 spin_unlock(&cookie->lock); 232 spin_unlock(&cookie->lock);
217 fscache_stat(&fscache_n_attr_changed_ok); 233 fscache_stat(&fscache_n_attr_changed_ok);
218 fscache_put_operation(op); 234 fscache_put_operation(op);
219 _leave(" = 0"); 235 _leave(" = 0");
220 return 0; 236 return 0;
221 237
222nobufs: 238nobufs_dec:
223 wake_cookie = __fscache_unuse_cookie(cookie); 239 wake_cookie = __fscache_unuse_cookie(cookie);
240nobufs:
224 spin_unlock(&cookie->lock); 241 spin_unlock(&cookie->lock);
225 kfree(op); 242 kfree(op);
226 if (wake_cookie) 243 if (wake_cookie)
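
The `__fscache_attr_changed()` fix is a goto-ladder repair: the cookie use count is taken before submission, so late failures must land on `nobufs_dec` to drop it, early failures skip straight to `nobufs`, and `wake_cookie` is initialized so the early path never tests an uninitialized flag. A generic sketch of that layered-cleanup idiom (all names illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    static int uses;
    static void use(void)   { uses++; }
    static bool unuse(void) { return --uses == 0; }   /* true: wake waiters */

    static int do_op(bool fail_early, bool fail_late)
    {
        bool wake = false;        /* initialized: early exit never reads junk */

        if (fail_early)
            goto nobufs;          /* nothing taken yet, nothing to undo */

        use();
        if (fail_late)
            goto nobufs_dec;      /* must undo the use() above */

        return 0;

    nobufs_dec:
        wake = unuse();
    nobufs:
        if (wake)
            printf("waking cookie waiters\n");
        return -1;
    }

    int main(void)
    {
        do_op(true, false);   /* early failure: no unuse, no wake */
        do_op(false, true);   /* late failure: balanced use/unuse */
        return 0;
    }
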
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 912061ac4baf..caa8d95b24e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1305,6 +1305,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1305 size_t start; 1305 size_t start;
1306 ssize_t ret = iov_iter_get_pages(ii, 1306 ssize_t ret = iov_iter_get_pages(ii,
1307 &req->pages[req->num_pages], 1307 &req->pages[req->num_pages],
1308 *nbytesp - nbytes,
1308 req->max_pages - req->num_pages, 1309 req->max_pages - req->num_pages,
1309 &start); 1310 &start);
1310 if (ret < 0) 1311 if (ret < 0)
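
The extra `*nbytesp - nbytes` argument caps `iov_iter_get_pages()` at the bytes the request still needs, instead of pinning pages up to `max_pages` regardless. The budget arithmetic, with made-up numbers:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        size_t nbytesp = 1 << 20;   /* total bytes this request may carry */
        size_t nbytes  = 768 << 10; /* bytes already gathered */

        /* cap the next gather by what is still outstanding */
        size_t max = nbytesp - nbytes;
        printf("ask for at most %zu bytes (256 KiB)\n", max);
        return 0;
    }
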
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6e8d99..f0b945ab853e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
359 * Returns: The length of the extent (minimum of one block) 359 * Returns: The length of the extent (minimum of one block)
360 */ 360 */
361 361
362static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) 362static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
363{ 363{
364 const __be64 *end = (start + len); 364 const __be64 *end = (start + len);
365 const __be64 *first = ptr; 365 const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
449 struct buffer_head *bh_map, struct metapath *mp, 449 struct buffer_head *bh_map, struct metapath *mp,
450 const unsigned int sheight, 450 const unsigned int sheight,
451 const unsigned int height, 451 const unsigned int height,
452 const unsigned int maxlen) 452 const size_t maxlen)
453{ 453{
454 struct gfs2_inode *ip = GFS2_I(inode); 454 struct gfs2_inode *ip = GFS2_I(inode);
455 struct gfs2_sbd *sdp = GFS2_SB(inode); 455 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
483 } else { 483 } else {
484 /* Need to allocate indirect blocks */ 484 /* Need to allocate indirect blocks */
485 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; 485 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
486 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); 486 dblks = min(maxlen, (size_t)(ptrs_per_blk -
487 mp->mp_list[end_of_metadata]));
487 if (height == ip->i_height) { 488 if (height == ip->i_height) {
488 /* Writing into existing tree, extend tree down */ 489 /* Writing into existing tree, extend tree down */
489 iblks = height - sheight; 490 iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
605 struct gfs2_inode *ip = GFS2_I(inode); 606 struct gfs2_inode *ip = GFS2_I(inode);
606 struct gfs2_sbd *sdp = GFS2_SB(inode); 607 struct gfs2_sbd *sdp = GFS2_SB(inode);
607 unsigned int bsize = sdp->sd_sb.sb_bsize; 608 unsigned int bsize = sdp->sd_sb.sb_bsize;
608 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 609 const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
609 const u64 *arr = sdp->sd_heightsize; 610 const u64 *arr = sdp->sd_heightsize;
610 __be64 *ptr; 611 __be64 *ptr;
611 u64 size; 612 u64 size;
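
Widening `maxlen` to `size_t` avoids truncating `bh_map->b_size >> inode->i_blkbits` when a very large mapping is requested; shifted into a 32-bit local, the block count would silently wrap. A demonstration of the hazard, assuming LP64 (32-bit unsigned int, 64-bit size_t):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        size_t b_size = 16ULL << 40;             /* a huge mapping request */
        int blkbits = 12;                        /* 4 KiB blocks */

        unsigned int narrow = b_size >> blkbits; /* wraps modulo 2^32 */
        size_t wide = b_size >> blkbits;

        printf("narrow maxlen: %u blocks\n", narrow);  /* wrong: 0 */
        printf("wide   maxlen: %zu blocks\n", wide);   /* 4294967296 */
        return 0;
    }
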
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9a9685..5d4261ff5d23 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2100 } 2100 }
2101 if (IS_ERR(dent)) 2101 if (IS_ERR(dent))
2102 return PTR_ERR(dent); 2102 return PTR_ERR(dent);
2103 da->bh = bh; 2103
2104 da->dent = dent; 2104 if (da->save_loc) {
2105 da->bh = bh;
2106 da->dent = dent;
2107 } else {
2108 brelse(bh);
2109 }
2105 return 0; 2110 return 0;
2106} 2111}
2107 2112
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 126c65dda028..e1b309c24dab 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,6 +23,7 @@ struct gfs2_diradd {
23 unsigned nr_blocks; 23 unsigned nr_blocks;
24 struct gfs2_dirent *dent; 24 struct gfs2_dirent *dent;
25 struct buffer_head *bh; 25 struct buffer_head *bh;
26 int save_loc;
26}; 27};
27 28
28extern struct inode *gfs2_dir_search(struct inode *dir, 29extern struct inode *gfs2_dir_search(struct inode *dir,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2c02478a86b0..80dd44dca028 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
26#include <linux/dlm.h> 26#include <linux/dlm.h>
27#include <linux/dlm_plock.h> 27#include <linux/dlm_plock.h>
28#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/delay.h>
29 30
30#include "gfs2.h" 31#include "gfs2.h"
31#include "incore.h" 32#include "incore.h"
@@ -959,9 +960,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
959 unsigned int state; 960 unsigned int state;
960 int flags; 961 int flags;
961 int error = 0; 962 int error = 0;
963 int sleeptime;
962 964
963 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 965 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
964 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; 966 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
965 967
966 mutex_lock(&fp->f_fl_mutex); 968 mutex_lock(&fp->f_fl_mutex);
967 969
@@ -981,7 +983,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
981 gfs2_holder_init(gl, state, flags, fl_gh); 983 gfs2_holder_init(gl, state, flags, fl_gh);
982 gfs2_glock_put(gl); 984 gfs2_glock_put(gl);
983 } 985 }
984 error = gfs2_glock_nq(fl_gh); 986 for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
987 error = gfs2_glock_nq(fl_gh);
988 if (error != GLR_TRYFAILED)
989 break;
990 fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
991 fl_gh->gh_error = 0;
992 msleep(sleeptime);
993 }
985 if (error) { 994 if (error) {
986 gfs2_holder_uninit(fl_gh); 995 gfs2_holder_uninit(fl_gh);
987 if (error == GLR_TRYFAILED) 996 if (error == GLR_TRYFAILED)
@@ -1004,7 +1013,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
1004 mutex_lock(&fp->f_fl_mutex); 1013 mutex_lock(&fp->f_fl_mutex);
1005 flock_lock_file_wait(file, fl); 1014 flock_lock_file_wait(file, fl);
1006 if (fl_gh->gh_gl) { 1015 if (fl_gh->gh_gl) {
1007 gfs2_glock_dq_wait(fl_gh); 1016 gfs2_glock_dq(fl_gh);
1008 gfs2_holder_uninit(fl_gh); 1017 gfs2_holder_uninit(fl_gh);
1009 } 1018 }
1010 mutex_unlock(&fp->f_fl_mutex); 1019 mutex_unlock(&fp->f_fl_mutex);
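
do_flock() now requests the glock with LM_FLAG_TRY_1CB (so the current holder gets a callback), and on GLR_TRYFAILED retries with plain LM_FLAG_TRY after sleeping 1, 2 and 4 ms, instead of blocking indefinitely in gfs2_glock_nq(). A userspace sketch of that bounded exponential backoff; `try_lock()` is a stub standing in for the glock request:

    #include <stdio.h>
    #include <time.h>

    #define TRYFAILED 1

    static int attempts;
    static int try_lock(void) { return ++attempts < 3 ? TRYFAILED : 0; } /* stub */

    static int lock_with_backoff(void)
    {
        int error = TRYFAILED;

        /* sleep 1, 2, then 4 ms between trylock attempts */
        for (int sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
            error = try_lock();
            if (error != TRYFAILED)
                break;
            nanosleep(&(struct timespec){ .tv_nsec = sleeptime * 1000000L }, NULL);
        }
        return error;
    }

    int main(void)
    {
        printf("lock result: %d after %d attempts\n", lock_with_backoff(), attempts);
        return 0;
    }
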
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7f513b1ceb2c..8f0c19d1d943 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
811{ 811{
812 INIT_LIST_HEAD(&gh->gh_list); 812 INIT_LIST_HEAD(&gh->gh_list);
813 gh->gh_gl = gl; 813 gh->gh_gl = gl;
814 gh->gh_ip = (unsigned long)__builtin_return_address(0); 814 gh->gh_ip = _RET_IP_;
815 gh->gh_owner_pid = get_pid(task_pid(current)); 815 gh->gh_owner_pid = get_pid(task_pid(current));
816 gh->gh_state = state; 816 gh->gh_state = state;
817 gh->gh_flags = flags; 817 gh->gh_flags = flags;
@@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
835 gh->gh_state = state; 835 gh->gh_state = state;
836 gh->gh_flags = flags; 836 gh->gh_flags = flags;
837 gh->gh_iflags = 0; 837 gh->gh_iflags = 0;
838 gh->gh_ip = (unsigned long)__builtin_return_address(0); 838 gh->gh_ip = _RET_IP_;
839 if (gh->gh_owner_pid) 839 if (gh->gh_owner_pid)
840 put_pid(gh->gh_owner_pid); 840 put_pid(gh->gh_owner_pid);
841 gh->gh_owner_pid = get_pid(task_pid(current)); 841 gh->gh_owner_pid = get_pid(task_pid(current));
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2ffc67dce87f..1cc0bba6313f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
93 * tr->alloced is not set since the transaction structure is 93 * tr->alloced is not set since the transaction structure is
94 * on the stack */ 94 * on the stack */
95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); 95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
96 tr.tr_ip = (unsigned long)__builtin_return_address(0); 96 tr.tr_ip = _RET_IP_;
97 sb_start_intwrite(sdp->sd_vfs); 97 sb_start_intwrite(sdp->sd_vfs);
98 if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { 98 if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
99 sb_end_intwrite(sdp->sd_vfs); 99 sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c9ada3..39e7e9959b74 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
262 unsigned long gh_ip; 262 unsigned long gh_ip;
263}; 263};
264 264
265/* Number of quota types we support */
266#define GFS2_MAXQUOTAS 2
267
265/* Resource group multi-block reservation, in order of appearance: 268/* Resource group multi-block reservation, in order of appearance:
266 269
267 Step 1. Function prepares to write, allocates a mb, sets the size hint. 270 Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
282 u64 rs_inum; /* Inode number for reservation */ 285 u64 rs_inum; /* Inode number for reservation */
283 286
284 /* ancillary quota stuff */ 287 /* ancillary quota stuff */
285 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; 288 struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
286 struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; 289 struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
287 unsigned int rs_qa_qd_num; 290 unsigned int rs_qa_qd_num;
288}; 291};
289 292
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e59477884..fcf42eadb69c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
600 int error, free_vfs_inode = 0; 600 int error, free_vfs_inode = 0;
601 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1; 602 unsigned blocks = 1;
603 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
604 604
605 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
606 return -ENAMETOOLONG; 606 return -ENAMETOOLONG;
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
626 if (!IS_ERR(inode)) { 626 if (!IS_ERR(inode)) {
627 d = d_splice_alias(inode, dentry); 627 d = d_splice_alias(inode, dentry);
628 error = PTR_ERR(d); 628 error = PTR_ERR(d);
629 if (IS_ERR(d)) 629 if (IS_ERR(d)) {
630 inode = ERR_CAST(d);
630 goto fail_gunlock; 631 goto fail_gunlock;
632 }
631 error = 0; 633 error = 0;
632 if (file) { 634 if (file) {
633 if (S_ISREG(inode->i_mode)) { 635 if (S_ISREG(inode->i_mode)) {
@@ -670,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
670 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 672 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
671 gfs2_set_inode_blocks(inode, 1); 673 gfs2_set_inode_blocks(inode, 1);
672 munge_mode_uid_gid(dip, inode); 674 munge_mode_uid_gid(dip, inode);
675 check_and_update_goal(dip);
673 ip->i_goal = dip->i_goal; 676 ip->i_goal = dip->i_goal;
674 ip->i_diskflags = 0; 677 ip->i_diskflags = 0;
675 ip->i_eattr = 0; 678 ip->i_eattr = 0;
@@ -840,8 +843,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
840 int error; 843 int error;
841 844
842 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 845 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
843 if (!inode) 846 if (inode == NULL) {
847 d_add(dentry, NULL);
844 return NULL; 848 return NULL;
849 }
845 if (IS_ERR(inode)) 850 if (IS_ERR(inode))
846 return ERR_CAST(inode); 851 return ERR_CAST(inode);
847 852
@@ -854,7 +859,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
854 859
855 d = d_splice_alias(inode, dentry); 860 d = d_splice_alias(inode, dentry);
856 if (IS_ERR(d)) { 861 if (IS_ERR(d)) {
857 iput(inode);
858 gfs2_glock_dq_uninit(&gh); 862 gfs2_glock_dq_uninit(&gh);
859 return d; 863 return d;
860 } 864 }
@@ -896,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
896 struct gfs2_inode *ip = GFS2_I(inode); 900 struct gfs2_inode *ip = GFS2_I(inode);
897 struct gfs2_holder ghs[2]; 901 struct gfs2_holder ghs[2];
898 struct buffer_head *dibh; 902 struct buffer_head *dibh;
899 struct gfs2_diradd da = { .bh = NULL, }; 903 struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
900 int error; 904 int error;
901 905
902 if (S_ISDIR(inode->i_mode)) 906 if (S_ISDIR(inode->i_mode))
@@ -1334,7 +1338,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1334 struct gfs2_rgrpd *nrgd; 1338 struct gfs2_rgrpd *nrgd;
1335 unsigned int num_gh; 1339 unsigned int num_gh;
1336 int dir_rename = 0; 1340 int dir_rename = 0;
1337 struct gfs2_diradd da = { .nr_blocks = 0, }; 1341 struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
1338 unsigned int x; 1342 unsigned int x;
1339 int error; 1343 int error;
1340 1344
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f4cb9c0d6bbd..7474c413ffd1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
577 return rgd; 577 return rgd;
578} 578}
579 579
580void check_and_update_goal(struct gfs2_inode *ip)
581{
582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
583 if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
584 ip->i_goal = ip->i_no_addr;
585}
586
580void gfs2_free_clones(struct gfs2_rgrpd *rgd) 587void gfs2_free_clones(struct gfs2_rgrpd *rgd)
581{ 588{
582 int x; 589 int x;
@@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1910 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { 1917 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1911 rs->rs_rbm.rgd = begin = ip->i_rgd; 1918 rs->rs_rbm.rgd = begin = ip->i_rgd;
1912 } else { 1919 } else {
1920 check_and_update_goal(ip);
1913 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1921 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1914 } 1922 }
1915 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) 1923 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
@@ -2089,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2089 u32 blen, unsigned char new_state) 2097 u32 blen, unsigned char new_state)
2090{ 2098{
2091 struct gfs2_rbm rbm; 2099 struct gfs2_rbm rbm;
2092 struct gfs2_bitmap *bi; 2100 struct gfs2_bitmap *bi, *bi_prev = NULL;
2093 2101
2094 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); 2102 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
2095 if (!rbm.rgd) { 2103 if (!rbm.rgd) {
@@ -2098,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2098 return NULL; 2106 return NULL;
2099 } 2107 }
2100 2108
2109 gfs2_rbm_from_block(&rbm, bstart);
2101 while (blen--) { 2110 while (blen--) {
2102 gfs2_rbm_from_block(&rbm, bstart);
2103 bi = rbm_bi(&rbm); 2111 bi = rbm_bi(&rbm);
2104 bstart++; 2112 if (bi != bi_prev) {
2105 if (!bi->bi_clone) { 2113 if (!bi->bi_clone) {
2106 bi->bi_clone = kmalloc(bi->bi_bh->b_size, 2114 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2107 GFP_NOFS | __GFP_NOFAIL); 2115 GFP_NOFS | __GFP_NOFAIL);
2108 memcpy(bi->bi_clone + bi->bi_offset, 2116 memcpy(bi->bi_clone + bi->bi_offset,
2109 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 2117 bi->bi_bh->b_data + bi->bi_offset,
2118 bi->bi_len);
2119 }
2120 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2121 bi_prev = bi;
2110 } 2122 }
2111 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2112 gfs2_setbit(&rbm, false, new_state); 2123 gfs2_setbit(&rbm, false, new_state);
2124 gfs2_rbm_incr(&rbm);
2113 } 2125 }
2114 2126
2115 return rbm.rgd; 2127 return rbm.rgd;
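
The rgblk_free() rewrite computes the bitmap position once and then advances it with gfs2_rbm_incr(), and it clones and journals a bitmap buffer only when the walk crosses into a new one (tracked in bi_prev), rather than doing both for every block. A generic sketch of that do-setup-only-on-group-change loop:

    #include <stdio.h>

    struct grp { int prepared; };

    static void prepare(struct grp *g) { g->prepared = 1; } /* clone + journal */

    int main(void)
    {
        struct grp groups[3] = { {0}, {0}, {0} };
        /* which bitmap group each of 8 consecutive blocks lives in */
        int block_group[8] = { 0, 0, 0, 1, 1, 1, 2, 2 };

        struct grp *prev = NULL;
        int preps = 0;
        for (int i = 0; i < 8; i++) {
            struct grp *g = &groups[block_group[i]];
            if (g != prev) {        /* expensive setup only on group change */
                prepare(g);
                preps++;
                prev = g;
            }
            /* ...cheap per-block bit flip would go here... */
        }
        printf("prepared %d groups for 8 blocks\n", preps);  /* 3, not 8 */
        return 0;
    }
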
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 463ab2e95d1c..5d8f085f7ade 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
80 return rs && !RB_EMPTY_NODE(&rs->rs_node); 80 return rs && !RB_EMPTY_NODE(&rs->rs_node);
81} 81}
82 82
83extern void check_and_update_goal(struct gfs2_inode *ip);
83#endif /* __RGRP_DOT_H__ */ 84#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2607ff13d486..a346f56c4c6d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1294 int val; 1294 int val;
1295 1295
1296 if (is_ancestor(root, sdp->sd_master_dir)) 1296 if (is_ancestor(root, sdp->sd_master_dir))
1297 seq_printf(s, ",meta"); 1297 seq_puts(s, ",meta");
1298 if (args->ar_lockproto[0]) 1298 if (args->ar_lockproto[0])
1299 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 1299 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
1300 if (args->ar_locktable[0]) 1300 if (args->ar_locktable[0])
@@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1302 if (args->ar_hostdata[0]) 1302 if (args->ar_hostdata[0])
1303 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1303 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1304 if (args->ar_spectator) 1304 if (args->ar_spectator)
1305 seq_printf(s, ",spectator"); 1305 seq_puts(s, ",spectator");
1306 if (args->ar_localflocks) 1306 if (args->ar_localflocks)
1307 seq_printf(s, ",localflocks"); 1307 seq_puts(s, ",localflocks");
1308 if (args->ar_debug) 1308 if (args->ar_debug)
1309 seq_printf(s, ",debug"); 1309 seq_puts(s, ",debug");
1310 if (args->ar_posix_acl) 1310 if (args->ar_posix_acl)
1311 seq_printf(s, ",acl"); 1311 seq_puts(s, ",acl");
1312 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1312 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1313 char *state; 1313 char *state;
1314 switch (args->ar_quota) { 1314 switch (args->ar_quota) {
@@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1328 seq_printf(s, ",quota=%s", state); 1328 seq_printf(s, ",quota=%s", state);
1329 } 1329 }
1330 if (args->ar_suiddir) 1330 if (args->ar_suiddir)
1331 seq_printf(s, ",suiddir"); 1331 seq_puts(s, ",suiddir");
1332 if (args->ar_data != GFS2_DATA_DEFAULT) { 1332 if (args->ar_data != GFS2_DATA_DEFAULT) {
1333 char *state; 1333 char *state;
1334 switch (args->ar_data) { 1334 switch (args->ar_data) {
@@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1345 seq_printf(s, ",data=%s", state); 1345 seq_printf(s, ",data=%s", state);
1346 } 1346 }
1347 if (args->ar_discard) 1347 if (args->ar_discard)
1348 seq_printf(s, ",discard"); 1348 seq_puts(s, ",discard");
1349 val = sdp->sd_tune.gt_logd_secs; 1349 val = sdp->sd_tune.gt_logd_secs;
1350 if (val != 30) 1350 if (val != 30)
1351 seq_printf(s, ",commit=%d", val); 1351 seq_printf(s, ",commit=%d", val);
@@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1376 seq_printf(s, ",errors=%s", state); 1376 seq_printf(s, ",errors=%s", state);
1377 } 1377 }
1378 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1378 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1379 seq_printf(s, ",nobarrier"); 1379 seq_puts(s, ",nobarrier");
1380 if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) 1380 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1381 seq_printf(s, ",demote_interface_used"); 1381 seq_puts(s, ",demote_interface_used");
1382 if (args->ar_rgrplvb) 1382 if (args->ar_rgrplvb)
1383 seq_printf(s, ",rgrplvb"); 1383 seq_puts(s, ",rgrplvb");
1384 return 0; 1384 return 0;
1385} 1385}
1386 1386
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 0546ab4e28e8..42bfd3361979 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
44 if (!tr) 44 if (!tr)
45 return -ENOMEM; 45 return -ENOMEM;
46 46
47 tr->tr_ip = (unsigned long)__builtin_return_address(0); 47 tr->tr_ip = _RET_IP_;
48 tr->tr_blocks = blocks; 48 tr->tr_blocks = blocks;
49 tr->tr_revokes = revokes; 49 tr->tr_revokes = revokes;
50 tr->tr_reserved = 1; 50 tr->tr_reserved = 1;
diff --git a/fs/internal.h b/fs/internal.h
index e325b4f9c799..b2623200107b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
35#endif 35#endif
36 36
37/* 37/*
38 * buffer.c
39 */
40extern void guard_bio_eod(int rw, struct bio *bio);
41
42/*
38 * char_dev.c 43 * char_dev.c
39 */ 44 */
40extern void __init chrdev_init(void); 45extern void __init chrdev_init(void);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4556ce1af5b0..5ddaf8625d3b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -61,7 +61,7 @@ static void isofs_put_super(struct super_block *sb)
61 return; 61 return;
62} 62}
63 63
64static int isofs_read_inode(struct inode *); 64static int isofs_read_inode(struct inode *, int relocated);
65static int isofs_statfs (struct dentry *, struct kstatfs *); 65static int isofs_statfs (struct dentry *, struct kstatfs *);
66 66
67static struct kmem_cache *isofs_inode_cachep; 67static struct kmem_cache *isofs_inode_cachep;
@@ -1259,7 +1259,7 @@ out_toomany:
1259 goto out; 1259 goto out;
1260} 1260}
1261 1261
1262static int isofs_read_inode(struct inode *inode) 1262static int isofs_read_inode(struct inode *inode, int relocated)
1263{ 1263{
1264 struct super_block *sb = inode->i_sb; 1264 struct super_block *sb = inode->i_sb;
1265 struct isofs_sb_info *sbi = ISOFS_SB(sb); 1265 struct isofs_sb_info *sbi = ISOFS_SB(sb);
@@ -1404,7 +1404,7 @@ static int isofs_read_inode(struct inode *inode)
1404 */ 1404 */
1405 1405
1406 if (!high_sierra) { 1406 if (!high_sierra) {
1407 parse_rock_ridge_inode(de, inode); 1407 parse_rock_ridge_inode(de, inode, relocated);
1408 /* if we want uid/gid set, override the rock ridge setting */ 1408 /* if we want uid/gid set, override the rock ridge setting */
1409 if (sbi->s_uid_set) 1409 if (sbi->s_uid_set)
1410 inode->i_uid = sbi->s_uid; 1410 inode->i_uid = sbi->s_uid;
@@ -1483,9 +1483,10 @@ static int isofs_iget5_set(struct inode *ino, void *data)
1483 * offset that point to the underlying meta-data for the inode. The 1483 * offset that point to the underlying meta-data for the inode. The
1484 * code below is otherwise similar to the iget() code in 1484 * code below is otherwise similar to the iget() code in
1485 * include/linux/fs.h */ 1485 * include/linux/fs.h */
1486struct inode *isofs_iget(struct super_block *sb, 1486struct inode *__isofs_iget(struct super_block *sb,
1487 unsigned long block, 1487 unsigned long block,
1488 unsigned long offset) 1488 unsigned long offset,
1489 int relocated)
1489{ 1490{
1490 unsigned long hashval; 1491 unsigned long hashval;
1491 struct inode *inode; 1492 struct inode *inode;
@@ -1507,7 +1508,7 @@ struct inode *isofs_iget(struct super_block *sb,
1507 return ERR_PTR(-ENOMEM); 1508 return ERR_PTR(-ENOMEM);
1508 1509
1509 if (inode->i_state & I_NEW) { 1510 if (inode->i_state & I_NEW) {
1510 ret = isofs_read_inode(inode); 1511 ret = isofs_read_inode(inode, relocated);
1511 if (ret < 0) { 1512 if (ret < 0) {
1512 iget_failed(inode); 1513 iget_failed(inode);
1513 inode = ERR_PTR(ret); 1514 inode = ERR_PTR(ret);
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 99167238518d..0ac4c1f73fbd 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -107,7 +107,7 @@ extern int iso_date(char *, int);
107 107
108struct inode; /* To make gcc happy */ 108struct inode; /* To make gcc happy */
109 109
110extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); 110extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
111extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); 111extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
112extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); 112extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);
113 113
@@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int
118extern struct buffer_head *isofs_bread(struct inode *, sector_t); 118extern struct buffer_head *isofs_bread(struct inode *, sector_t);
119extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); 119extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);
120 120
121extern struct inode *isofs_iget(struct super_block *sb, 121struct inode *__isofs_iget(struct super_block *sb,
122 unsigned long block, 122 unsigned long block,
123 unsigned long offset); 123 unsigned long offset,
124 int relocated);
125
126static inline struct inode *isofs_iget(struct super_block *sb,
127 unsigned long block,
128 unsigned long offset)
129{
130 return __isofs_iget(sb, block, offset, 0);
131}
132
133static inline struct inode *isofs_iget_reloc(struct super_block *sb,
134 unsigned long block,
135 unsigned long offset)
136{
137 return __isofs_iget(sb, block, offset, 1);
138}
124 139
125/* Because the inode number is no longer relevant to finding the 140/* Because the inode number is no longer relevant to finding the
126 * underlying meta-data for an inode, we are free to choose a more 141 * underlying meta-data for an inode, we are free to choose a more
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c0bf42472e40..f488bbae541a 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -288,12 +288,16 @@ eio:
288 goto out; 288 goto out;
289} 289}
290 290
291#define RR_REGARD_XA 1
292#define RR_RELOC_DE 2
293
291static int 294static int
292parse_rock_ridge_inode_internal(struct iso_directory_record *de, 295parse_rock_ridge_inode_internal(struct iso_directory_record *de,
293 struct inode *inode, int regard_xa) 296 struct inode *inode, int flags)
294{ 297{
295 int symlink_len = 0; 298 int symlink_len = 0;
296 int cnt, sig; 299 int cnt, sig;
300 unsigned int reloc_block;
297 struct inode *reloc; 301 struct inode *reloc;
298 struct rock_ridge *rr; 302 struct rock_ridge *rr;
299 int rootflag; 303 int rootflag;
@@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
305 309
306 init_rock_state(&rs, inode); 310 init_rock_state(&rs, inode);
307 setup_rock_ridge(de, inode, &rs); 311 setup_rock_ridge(de, inode, &rs);
308 if (regard_xa) { 312 if (flags & RR_REGARD_XA) {
309 rs.chr += 14; 313 rs.chr += 14;
310 rs.len -= 14; 314 rs.len -= 14;
311 if (rs.len < 0) 315 if (rs.len < 0)
@@ -485,12 +489,22 @@ repeat:
485 "relocated directory\n"); 489 "relocated directory\n");
486 goto out; 490 goto out;
487 case SIG('C', 'L'): 491 case SIG('C', 'L'):
488 ISOFS_I(inode)->i_first_extent = 492 if (flags & RR_RELOC_DE) {
489 isonum_733(rr->u.CL.location); 493 printk(KERN_ERR
490 reloc = 494 "ISOFS: Recursive directory relocation "
491 isofs_iget(inode->i_sb, 495 "is not supported\n");
492 ISOFS_I(inode)->i_first_extent, 496 goto eio;
493 0); 497 }
498 reloc_block = isonum_733(rr->u.CL.location);
499 if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
500 ISOFS_I(inode)->i_iget5_offset == 0) {
501 printk(KERN_ERR
502 "ISOFS: Directory relocation points to "
503 "itself\n");
504 goto eio;
505 }
506 ISOFS_I(inode)->i_first_extent = reloc_block;
507 reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
494 if (IS_ERR(reloc)) { 508 if (IS_ERR(reloc)) {
495 ret = PTR_ERR(reloc); 509 ret = PTR_ERR(reloc);
496 goto out; 510 goto out;
@@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
637 return rpnt; 651 return rpnt;
638} 652}
639 653
640int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) 654int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
655 int relocated)
641{ 656{
642 int result = parse_rock_ridge_inode_internal(de, inode, 0); 657 int flags = relocated ? RR_RELOC_DE : 0;
658 int result = parse_rock_ridge_inode_internal(de, inode, flags);
643 659
644 /* 660 /*
645 * if rockridge flag was reset and we didn't look for attributes 661 * if rockridge flag was reset and we didn't look for attributes
@@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
647 */ 663 */
648 if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) 664 if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1)
649 && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { 665 && (ISOFS_SB(inode->i_sb)->s_rock == 2)) {
650 result = parse_rock_ridge_inode_internal(de, inode, 14); 666 result = parse_rock_ridge_inode_internal(de, inode,
667 flags | RR_REGARD_XA);
651 } 668 }
652 return result; 669 return result;
653} 670}
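
The Rock Ridge change closes an unbounded-recursion hole: a crafted image could chain CL (child link) relocations, so the parser now carries RR_RELOC_DE into the relocated directory and refuses a second relocation, alongside the cheap self-reference check. A minimal sketch of the guard; `parse_record()` is illustrative, not the isofs function:

    #include <stdio.h>

    #define RR_RELOC_DE 1  /* set when this record was reached via relocation */

    /* reloc_target < 0 means "no CL entry in this record" */
    static int parse_record(int block, int reloc_target, int flags)
    {
        if (reloc_target >= 0) {
            if (flags & RR_RELOC_DE) {
                fprintf(stderr, "block %d: recursive relocation rejected\n", block);
                return -1;
            }
            if (reloc_target == block) {
                fprintf(stderr, "block %d: relocation points to itself\n", block);
                return -1;
            }
            /* follow the relocation once; the target inherits the flag */
            return parse_record(reloc_target, -1, RR_RELOC_DE);
        }
        printf("block %d parsed\n", block);
        return 0;
    }

    int main(void)
    {
        parse_record(5, 5, 0);             /* self-loop: rejected */
        parse_record(5, 9, 0);             /* one legitimate hop: ok */
        parse_record(9, 12, RR_RELOC_DE);  /* nested relocation: rejected */
        return 0;
    }
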
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6fac74349856..b73e0215baa7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
97 struct commit_header *h; 97 struct commit_header *h;
98 __u32 csum; 98 __u32 csum;
99 99
100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!jbd2_journal_has_csum_v2or3(j))
101 return; 101 return;
102 102
103 h = (struct commit_header *)(bh->b_data); 103 h = (struct commit_header *)(bh->b_data);
@@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
313 return checksum; 313 return checksum;
314} 314}
315 315
316static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 316static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
317 unsigned long long block) 317 unsigned long long block)
318{ 318{
319 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 319 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
320 if (tag_bytes > JBD2_TAG_SIZE32) 320 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
322} 322}
323 323
@@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j,
327 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
328 __u32 csum; 328 __u32 csum;
329 329
330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!jbd2_journal_has_csum_v2or3(j))
331 return; 331 return;
332 332
333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
@@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j,
340static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 340static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
341 struct buffer_head *bh, __u32 sequence) 341 struct buffer_head *bh, __u32 sequence)
342{ 342{
343 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
343 struct page *page = bh->b_page; 344 struct page *page = bh->b_page;
344 __u8 *addr; 345 __u8 *addr;
345 __u32 csum32; 346 __u32 csum32;
346 __be32 seq; 347 __be32 seq;
347 348
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 349 if (!jbd2_journal_has_csum_v2or3(j))
349 return; 350 return;
350 351
351 seq = cpu_to_be32(sequence); 352 seq = cpu_to_be32(sequence);
@@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
355 bh->b_size); 356 bh->b_size);
356 kunmap_atomic(addr); 357 kunmap_atomic(addr);
357 358
358 /* We only have space to store the lower 16 bits of the crc32c. */ 359 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
359 tag->t_checksum = cpu_to_be16(csum32); 360 tag3->t_checksum = cpu_to_be32(csum32);
361 else
362 tag->t_checksum = cpu_to_be16(csum32);
360} 363}
361/* 364/*
362 * jbd2_journal_commit_transaction 365 * jbd2_journal_commit_transaction
@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
396 LIST_HEAD(io_bufs); 399 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs); 400 LIST_HEAD(log_bufs);
398 401
399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 402 if (jbd2_journal_has_csum_v2or3(journal))
400 csum_size = sizeof(struct jbd2_journal_block_tail); 403 csum_size = sizeof(struct jbd2_journal_block_tail);
401 404
402 /* 405 /*
@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
690 tag_flag |= JBD2_FLAG_SAME_UUID; 693 tag_flag |= JBD2_FLAG_SAME_UUID;
691 694
692 tag = (journal_block_tag_t *) tagp; 695 tag = (journal_block_tag_t *) tagp;
693 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 696 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
694 tag->t_flags = cpu_to_be16(tag_flag); 697 tag->t_flags = cpu_to_be16(tag_flag);
695 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 698 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
696 commit_transaction->t_tid); 699 commit_transaction->t_tid);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e303946c..19d74d86d99c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
124/* Checksumming functions */ 124/* Checksumming functions */
125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!jbd2_journal_has_csum_v2or3(j))
128 return 1; 128 return 1;
129 129
130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; 130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
145 145
146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!jbd2_journal_has_csum_v2or3(j))
149 return 1; 149 return 1;
150 150
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
153 153
154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!jbd2_journal_has_csum_v2or3(j))
157 return; 157 return;
158 158
159 sb->s_checksum = jbd2_superblock_csum(j, sb); 159 sb->s_checksum = jbd2_superblock_csum(j, sb);
@@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal)
1522 goto out; 1522 goto out;
1523 } 1523 }
1524 1524
1525 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && 1525 if (jbd2_journal_has_csum_v2or3(journal) &&
1526 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1526 JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
1527 /* Can't have checksum v1 and v2 on at the same time! */ 1527 /* Can't have checksum v1 and v2 on at the same time! */
1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
1529 "at the same time!\n"); 1529 "at the same time!\n");
1530 goto out; 1530 goto out;
1531 } 1531 }
1532 1532
1533 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
1534 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1535 /* Can't have checksum v2 and v3 at the same time! */
1536 printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
1537 "at the same time!\n");
1538 goto out;
1539 }
1540
1533 if (!jbd2_verify_csum_type(journal, sb)) { 1541 if (!jbd2_verify_csum_type(journal, sb)) {
1534 printk(KERN_ERR "JBD2: Unknown checksum type\n"); 1542 printk(KERN_ERR "JBD2: Unknown checksum type\n");
1535 goto out; 1543 goto out;
1536 } 1544 }
1537 1545
1538 /* Load the checksum driver */ 1546 /* Load the checksum driver */
1539 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1547 if (jbd2_journal_has_csum_v2or3(journal)) {
1540 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 1548 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1541 if (IS_ERR(journal->j_chksum_driver)) { 1549 if (IS_ERR(journal->j_chksum_driver)) {
1542 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); 1550 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
1553 } 1561 }
1554 1562
1555 /* Precompute checksum seed for all metadata */ 1563 /* Precompute checksum seed for all metadata */
1556 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 1564 if (jbd2_journal_has_csum_v2or3(journal))
1557 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, 1565 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1558 sizeof(sb->s_uuid)); 1566 sizeof(sb->s_uuid));
1559 1567
@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1813 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 1821 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1814 return 0; 1822 return 0;
1815 1823
1816 /* Asking for checksumming v2 and v1? Only give them v2. */ 1824 /* If enabling v2 checksums, turn on v3 instead */
1817 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && 1825 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
1826 incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
1827 incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
1828 }
1829
1830 /* Asking for checksumming v3 and v1? Only give them v3. */
1831 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
1818 compat & JBD2_FEATURE_COMPAT_CHECKSUM) 1832 compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1819 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; 1833 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1820 1834
@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1823 1837
1824 sb = journal->j_superblock; 1838 sb = journal->j_superblock;
1825 1839
1826 /* If enabling v2 checksums, update superblock */ 1840 /* If enabling v3 checksums, update superblock */
1827 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1841 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1828 sb->s_checksum_type = JBD2_CRC32C_CHKSUM; 1842 sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1829 sb->s_feature_compat &= 1843 sb->s_feature_compat &=
1830 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); 1844 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1842 } 1856 }
1843 1857
1844 /* Precompute checksum seed for all metadata */ 1858 /* Precompute checksum seed for all metadata */
1845 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 1859 if (jbd2_journal_has_csum_v2or3(journal))
1846 JBD2_FEATURE_INCOMPAT_CSUM_V2))
1847 journal->j_csum_seed = jbd2_chksum(journal, ~0, 1860 journal->j_csum_seed = jbd2_chksum(journal, ~0,
1848 sb->s_uuid, 1861 sb->s_uuid,
1849 sizeof(sb->s_uuid)); 1862 sizeof(sb->s_uuid));
@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1852 /* If enabling v1 checksums, downgrade superblock */ 1865 /* If enabling v1 checksums, downgrade superblock */
1853 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) 1866 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1854 sb->s_feature_incompat &= 1867 sb->s_feature_incompat &=
1855 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); 1868 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
1869 JBD2_FEATURE_INCOMPAT_CSUM_V3);
1856 1870
1857 sb->s_feature_compat |= cpu_to_be32(compat); 1871 sb->s_feature_compat |= cpu_to_be32(compat);
1858 sb->s_feature_ro_compat |= cpu_to_be32(ro); 1872 sb->s_feature_ro_compat |= cpu_to_be32(ro);
@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
2165 */ 2179 */
2166size_t journal_tag_bytes(journal_t *journal) 2180size_t journal_tag_bytes(journal_t *journal)
2167{ 2181{
2168 journal_block_tag_t tag; 2182 size_t sz;
2169 size_t x = 0; 2183
2184 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
2185 return sizeof(journal_block_tag3_t);
2186
2187 sz = sizeof(journal_block_tag_t);
2170 2188
2171 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 2189 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
2172 x += sizeof(tag.t_checksum); 2190 sz += sizeof(__u16);
2173 2191
2174 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 2192 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
2175 return x + JBD2_TAG_SIZE64; 2193 return sz;
2176 else 2194 else
2177 return x + JBD2_TAG_SIZE32; 2195 return sz - sizeof(__u32);
2178} 2196}
2179 2197
2180/* 2198/*
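
journal_tag_bytes() now reads: a v3 journal always uses the fixed-size tag3; otherwise start from the v2-era tag struct, add two bytes when the 16-bit checksum is in use, and drop the high block-number word on 32-bit journals. A standalone re-derivation of that arithmetic; the structs mirror the jbd2 layouts but are local to the sketch:

    #include <stdio.h>
    #include <stdint.h>

    #define F_CSUM_V2 0x1
    #define F_CSUM_V3 0x2
    #define F_64BIT   0x4

    struct tag3 { uint32_t blocknr, flags, blocknr_high, checksum; }; /* 16 bytes */
    struct tag  { uint32_t blocknr; uint16_t checksum, flags;
                  uint32_t blocknr_high; };                           /* 12 bytes */

    static size_t tag_bytes(unsigned feat)
    {
        if (feat & F_CSUM_V3)
            return sizeof(struct tag3);   /* v3 tags are always full size */

        size_t sz = sizeof(struct tag);
        if (feat & F_CSUM_V2)
            sz += sizeof(uint16_t);       /* extra room for the 16-bit csum */
        if (feat & F_64BIT)
            return sz;
        return sz - sizeof(uint32_t);     /* 32-bit: no high blocknr word */
    }

    int main(void)
    {
        printf("v1, 32-bit: %zu\n", tag_bytes(0));                   /* 8  */
        printf("v2, 64-bit: %zu\n", tag_bytes(F_CSUM_V2 | F_64BIT)); /* 14 */
        printf("v3:         %zu\n", tag_bytes(F_CSUM_V3));           /* 16 */
        return 0;
    }
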
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3b6bb19d60b1..9b329b55ffe3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
181 __be32 provided; 181 __be32 provided;
182 __u32 calculated; 182 __u32 calculated;
183 183
184 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 184 if (!jbd2_journal_has_csum_v2or3(j))
185 return 1; 185 return 1;
186 186
187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - 187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
205 int nr = 0, size = journal->j_blocksize; 205 int nr = 0, size = journal->j_blocksize;
206 int tag_bytes = journal_tag_bytes(journal); 206 int tag_bytes = journal_tag_bytes(journal);
207 207
208 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 208 if (jbd2_journal_has_csum_v2or3(journal))
209 size -= sizeof(struct jbd2_journal_block_tail); 209 size -= sizeof(struct jbd2_journal_block_tail);
210 210
211 tagp = &bh->b_data[sizeof(journal_header_t)]; 211 tagp = &bh->b_data[sizeof(journal_header_t)];
@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
338 return err; 338 return err;
339} 339}
340 340
341static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) 341static inline unsigned long long read_tag_block(journal_t *journal,
342 journal_block_tag_t *tag)
342{ 343{
343 unsigned long long block = be32_to_cpu(tag->t_blocknr); 344 unsigned long long block = be32_to_cpu(tag->t_blocknr);
344 if (tag_bytes > JBD2_TAG_SIZE32) 345 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
345 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; 346 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
346 return block; 347 return block;
347} 348}
@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
384 __be32 provided; 385 __be32 provided;
385 __u32 calculated; 386 __u32 calculated;
386 387
387 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 388 if (!jbd2_journal_has_csum_v2or3(j))
388 return 1; 389 return 1;
389 390
390 h = buf; 391 h = buf;
@@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 400static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 401 void *buf, __u32 sequence)
401{ 402{
403 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
402 __u32 csum32; 404 __u32 csum32;
403 __be32 seq; 405 __be32 seq;
404 406
405 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 407 if (!jbd2_journal_has_csum_v2or3(j))
406 return 1; 408 return 1;
407 409
408 seq = cpu_to_be32(sequence); 410 seq = cpu_to_be32(sequence);
409 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 411 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); 412 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 413
412 return tag->t_checksum == cpu_to_be16(csum32); 414 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
415 return tag3->t_checksum == cpu_to_be32(csum32);
416 else
417 return tag->t_checksum == cpu_to_be16(csum32);
413} 418}
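
Verification must now compare at the right width: a v3 tag stores the whole crc32c, while a v2 tag keeps only its low 16 bits, so the computed value is folded before the compare. A small sketch of that branch (the crc constant is just an example):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static bool verify(uint32_t computed, uint32_t stored, bool v3)
    {
        if (v3)
            return stored == computed;                 /* full 32-bit compare */
        return (uint16_t)stored == (uint16_t)computed; /* v2: low 16 bits only */
    }

    int main(void)
    {
        uint32_t crc = 0xdeadbeef;                     /* example crc32c value */
        printf("v2 match: %d\n", verify(crc, 0x0000beef, false)); /* 1 */
        printf("v3 miss:  %d\n", verify(crc, 0x0000beef, true));  /* 0 */
        printf("v3 match: %d\n", verify(crc, 0xdeadbeef, true));  /* 1 */
        return 0;
    }
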
414 419
415static int do_one_pass(journal_t *journal, 420static int do_one_pass(journal_t *journal,
@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
426 int tag_bytes = journal_tag_bytes(journal); 431 int tag_bytes = journal_tag_bytes(journal);
427 __u32 crc32_sum = ~0; /* Transactional Checksums */ 432 __u32 crc32_sum = ~0; /* Transactional Checksums */
428 int descr_csum_size = 0; 433 int descr_csum_size = 0;
434 int block_error = 0;
429 435
430 /* 436 /*
431 * First thing is to establish what we expect to find in the log 437 * First thing is to establish what we expect to find in the log
@@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal,
512 switch(blocktype) { 518 switch(blocktype) {
513 case JBD2_DESCRIPTOR_BLOCK: 519 case JBD2_DESCRIPTOR_BLOCK:
514 /* Verify checksum first */ 520 /* Verify checksum first */
515 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 521 if (jbd2_journal_has_csum_v2or3(journal))
516 JBD2_FEATURE_INCOMPAT_CSUM_V2))
517 descr_csum_size = 522 descr_csum_size =
518 sizeof(struct jbd2_journal_block_tail); 523 sizeof(struct jbd2_journal_block_tail);
519 if (descr_csum_size > 0 && 524 if (descr_csum_size > 0 &&
@@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal,
574 unsigned long long blocknr; 579 unsigned long long blocknr;
575 580
576 J_ASSERT(obh != NULL); 581 J_ASSERT(obh != NULL);
577 blocknr = read_tag_block(tag_bytes, 582 blocknr = read_tag_block(journal,
578 tag); 583 tag);
579 584
580 /* If the block has been 585 /* If the block has been
@@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal,
598 "checksum recovering " 603 "checksum recovering "
599 "block %llu in log\n", 604 "block %llu in log\n",
600 blocknr); 605 blocknr);
601 continue; 606 block_error = 1;
607 goto skip_write;
602 } 608 }
603 609
604 /* Find a buffer for the new 610 /* Find a buffer for the new
@@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal,
797 success = -EIO; 803 success = -EIO;
798 } 804 }
799 } 805 }
800 806 if (block_error && success == 0)
807 success = -EIO;
801 return success; 808 return success;
802 809
803 failed: 810 failed:
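
Recovery used to `continue` past a block with a bad tag checksum and could still report overall success; the fix latches `block_error` and converts it to -EIO once the pass finishes, so every block is still examined but the failure is surfaced. The record-now-fail-later shape in isolation:

    #include <stdio.h>

    #define EIO 5

    static int bad(int i) { return i == 3; }   /* pretend block 3 is corrupt */

    static int scan(void)
    {
        int success = 0, block_error = 0;

        for (int i = 0; i < 8; i++) {
            if (bad(i)) {
                fprintf(stderr, "checksum error on block %d\n", i);
                block_error = 1;    /* remember, but keep scanning */
                continue;
            }
            /* ...replay block... */
        }
        if (block_error && success == 0)
            success = -EIO;         /* surface the failure at the end */
        return success;
    }

    int main(void) { printf("scan() = %d\n", scan()); return 0; }
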
@@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
811 __be32 provided; 818 __be32 provided;
812 __u32 calculated; 819 __u32 calculated;
813 820
814 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 821 if (!jbd2_journal_has_csum_v2or3(j))
815 return 1; 822 return 1;
816 823
817 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - 824 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 198c9c10276d..d5e95a175c92 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -91,8 +91,8 @@
91#include <linux/list.h> 91#include <linux/list.h>
92#include <linux/init.h> 92#include <linux/init.h>
93#include <linux/bio.h> 93#include <linux/bio.h>
94#endif
95#include <linux/log2.h> 94#include <linux/log2.h>
95#endif
96 96
97static struct kmem_cache *jbd2_revoke_record_cache; 97static struct kmem_cache *jbd2_revoke_record_cache;
98static struct kmem_cache *jbd2_revoke_table_cache; 98static struct kmem_cache *jbd2_revoke_table_cache;
@@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal,
597 offset = *offsetp; 597 offset = *offsetp;
598 598
599 /* Do we need to leave space at the end for a checksum? */ 599 /* Do we need to leave space at the end for a checksum? */
600 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 600 if (jbd2_journal_has_csum_v2or3(journal))
601 csum_size = sizeof(struct jbd2_journal_revoke_tail); 601 csum_size = sizeof(struct jbd2_journal_revoke_tail);
602 602
603 /* Make sure we have a descriptor with space left for the record */ 603 /* Make sure we have a descriptor with space left for the record */
@@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
644 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
645 __u32 csum; 645 __u32 csum;
646 646
647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!jbd2_journal_has_csum_v2or3(j))
648 return; 648 return;
649 649
650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o
9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs-$(CONFIG_PROC_FS) += procfs.o
10lockd-objs := $(lockd-objs-y) 11lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
159 159
160 msg.rpc_proc = &clnt->cl_procinfo[proc]; 160 msg.rpc_proc = &clnt->cl_procinfo[proc];
161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); 161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
162 if (status == -ECONNREFUSED) {
163 dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
164 status);
165 rpc_force_rebind(clnt);
166 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
167 }
162 if (status < 0) 168 if (status < 0)
163 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 169 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
164 status); 170 status);
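A restarted statd may re-register with rpcbind on a different port, so the first soft-connect call can hit a stale binding and fail with ECONNREFUSED; the hunk above forces a rebind and retries exactly once. The pattern in isolation, as a sketch (nsm_call_once is a hypothetical wrapper; clnt and msg are as in the hunk):

static int nsm_call_once(struct rpc_clnt *clnt, struct rpc_message *msg)
{
	int status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);

	if (status == -ECONNREFUSED) {
		rpc_force_rebind(clnt);
		status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);
	}
	return status;
}
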
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
11 11
12 struct delayed_work grace_period_end; 12 struct delayed_work grace_period_end;
13 struct lock_manager lockd_manager; 13 struct lock_manager lockd_manager;
14 struct list_head grace_list;
15 14
16 spinlock_t nsm_clnt_lock; 15 spinlock_t nsm_clnt_lock;
17 unsigned int nsm_users; 16 unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
1/*
2 * Procfs support for lockd
3 *
4 * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
5 */
6
7#include <linux/fs.h>
8#include <linux/proc_fs.h>
9#include <linux/module.h>
10#include <linux/nsproxy.h>
11#include <net/net_namespace.h>
12
13#include "netns.h"
14#include "procfs.h"
15
16/*
17 * We only allow strings that start with 'Y', 'y', or '1'.
18 */
19static ssize_t
20nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
21 loff_t *pos)
22{
23 char *data;
24 struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
25 lockd_net_id);
26
27 if (size < 1)
28 return -EINVAL;
29
30 data = simple_transaction_get(file, buf, size);
31 if (IS_ERR(data))
32 return PTR_ERR(data);
33
34 switch (data[0]) {
35 case 'Y':
36 case 'y':
37 case '1':
38 locks_end_grace(&ln->lockd_manager);
39 break;
40 default:
41 return -EINVAL;
42 }
43
44 return size;
45}
46
47static ssize_t
48nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
49 loff_t *pos)
50{
51 struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
52 lockd_net_id);
53 char resp[3];
54
55 resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
56 resp[1] = '\n';
57 resp[2] = '\0';
58
59 return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
60}
61
62static const struct file_operations lockd_end_grace_operations = {
63 .write = nlm_end_grace_write,
64 .read = nlm_end_grace_read,
65 .llseek = default_llseek,
66 .release = simple_transaction_release,
67 .owner = THIS_MODULE,
68};
69
70int __init
71lockd_create_procfs(void)
72{
73 struct proc_dir_entry *entry;
74
75 entry = proc_mkdir("fs/lockd", NULL);
76 if (!entry)
77 return -ENOMEM;
78 entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
79 &lockd_end_grace_operations);
80 if (!entry) {
81 remove_proc_entry("fs/lockd", NULL);
82 return -ENOMEM;
83 }
84 return 0;
85}
86
87void __exit
88lockd_remove_procfs(void)
89{
90 remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
91 remove_proc_entry("fs/lockd", NULL);
92}
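The new file accepts strings starting with 'Y', 'y', or '1' to end the NLM grace period early, and reads back 'Y' or 'N' depending on whether the grace period has already ended. A minimal userspace sketch exercising it, with the path taken from lockd_create_procfs() above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/fs/lockd/nlm_end_grace", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "Y", 1) != 1) {	/* end the grace period now */
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
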
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
1/*
2 * Procfs support for lockd
3 *
4 * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
5 */
6#ifndef _LOCKD_PROCFS_H
7#define _LOCKD_PROCFS_H
8
9#include <linux/kconfig.h>
10
11#if IS_ENABLED(CONFIG_PROC_FS)
12int lockd_create_procfs(void);
13void lockd_remove_procfs(void);
14#else
15static inline int
16lockd_create_procfs(void)
17{
18 return 0;
19}
20
21static inline void
22lockd_remove_procfs(void)
23{
24 return;
25}
26#endif /* IS_ENABLED(CONFIG_PROC_FS) */
27
28#endif /* _LOCKD_PROCFS_H */
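The #else stubs are the usual idiom for optional kernel features: callers in svc.c invoke lockd_create_procfs() unconditionally, and the calls compile away when CONFIG_PROC_FS is off, with no #ifdef at the call site. The general form of the pattern, sketched with a hypothetical CONFIG_FOO:

#if IS_ENABLED(CONFIG_FOO)
int foo_init(void);		/* real implementation elsewhere */
#else
static inline int foo_init(void)
{
	return 0;		/* success; nothing to set up */
}
#endif
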
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 09857b48d0c3..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
36#include <linux/nfs.h> 36#include <linux/nfs.h>
37 37
38#include "netns.h" 38#include "netns.h"
39#include "procfs.h"
39 40
40#define NLMDBG_FACILITY NLMDBG_SVC 41#define NLMDBG_FACILITY NLMDBG_SVC
41#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) 42#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -253,13 +254,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
253 254
254 error = make_socks(serv, net); 255 error = make_socks(serv, net);
255 if (error < 0) 256 if (error < 0)
256 goto err_socks; 257 goto err_bind;
257 set_grace_period(net); 258 set_grace_period(net);
258 dprintk("lockd_up_net: per-net data created; net=%p\n", net); 259 dprintk("lockd_up_net: per-net data created; net=%p\n", net);
259 return 0; 260 return 0;
260 261
261err_socks:
262 svc_rpcb_cleanup(serv, net);
263err_bind: 262err_bind:
264 ln->nlmsvc_users--; 263 ln->nlmsvc_users--;
265 return error; 264 return error;
@@ -586,7 +585,7 @@ static int lockd_init_net(struct net *net)
586 struct lockd_net *ln = net_generic(net, lockd_net_id); 585 struct lockd_net *ln = net_generic(net, lockd_net_id);
587 586
588 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 587 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
589 INIT_LIST_HEAD(&ln->grace_list); 588 INIT_LIST_HEAD(&ln->lockd_manager.list);
590 spin_lock_init(&ln->nsm_clnt_lock); 589 spin_lock_init(&ln->nsm_clnt_lock);
591 return 0; 590 return 0;
592} 591}
@@ -620,8 +619,15 @@ static int __init init_nlm(void)
620 err = register_pernet_subsys(&lockd_net_ops); 619 err = register_pernet_subsys(&lockd_net_ops);
621 if (err) 620 if (err)
622 goto err_pernet; 621 goto err_pernet;
622
623 err = lockd_create_procfs();
624 if (err)
625 goto err_procfs;
626
623 return 0; 627 return 0;
624 628
629err_procfs:
630 unregister_pernet_subsys(&lockd_net_ops);
625err_pernet: 631err_pernet:
626#ifdef CONFIG_SYSCTL 632#ifdef CONFIG_SYSCTL
627 unregister_sysctl_table(nlm_sysctl_table); 633 unregister_sysctl_table(nlm_sysctl_table);
@@ -634,6 +640,7 @@ static void __exit exit_nlm(void)
634{ 640{
635 /* FIXME: delete all NLM clients */ 641 /* FIXME: delete all NLM clients */
636 nlm_shutdown_hosts(); 642 nlm_shutdown_hosts();
643 lockd_remove_procfs();
637 unregister_pernet_subsys(&lockd_net_ops); 644 unregister_pernet_subsys(&lockd_net_ops);
638#ifdef CONFIG_SYSCTL 645#ifdef CONFIG_SYSCTL
639 unregister_sysctl_table(nlm_sysctl_table); 646 unregister_sysctl_table(nlm_sysctl_table);
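Note that the error unwind in init_nlm() remains a strict mirror of the setup order: the new lockd_create_procfs() step gets its own label, and a failure there unregisters only what was already registered. The general shape, sketched with hypothetical names:

static int __init init_example(void)
{
	int err;

	err = register_pernet_subsys(&example_net_ops);	/* step 1 */
	if (err)
		goto err_pernet;
	err = example_create_procfs();			/* step 2, new */
	if (err)
		goto err_procfs;
	return 0;

err_procfs:
	unregister_pernet_subsys(&example_net_ops);	/* undo step 1 */
err_pernet:
	return err;
}
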
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..3e79220babac 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h> 30#include <linux/cleancache.h>
31#include "internal.h"
31 32
32/* 33/*
33 * I/O completion handler for multipage BIOs. 34 * I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
57static struct bio *mpage_bio_submit(int rw, struct bio *bio) 58static struct bio *mpage_bio_submit(int rw, struct bio *bio)
58{ 59{
59 bio->bi_end_io = mpage_end_io; 60 bio->bi_end_io = mpage_end_io;
61 guard_bio_eod(rw, bio);
60 submit_bio(rw, bio); 62 submit_bio(rw, bio);
61 return NULL; 63 return NULL;
62} 64}
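guard_bio_eod() clamps a bio so that I/O never runs past the last sector of the underlying device (the helper itself is factored out in fs/buffer.c within this series, hence the new "internal.h" include). Roughly, as an illustrative sketch over the bvec/iter fields used elsewhere in this diff:

static void clamp_bio_to_eod(struct bio *bio)
{
	sector_t maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
	unsigned int truncated;

	if (!maxsector || bio->bi_iter.bi_sector >= maxsector)
		return;		/* size unknown, or nothing salvageable */
	maxsector -= bio->bi_iter.bi_sector;
	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
		return;		/* fits entirely on the device */
	/* trim the overhang off the final segment */
	truncated = bio->bi_iter.bi_size - (unsigned int)(maxsector << 9);
	bio->bi_iter.bi_size -= truncated;
	bio->bi_io_vec[bio->bi_vcnt - 1].bv_len -= truncated;
}
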
diff --git a/fs/namei.c b/fs/namei.c
index a996bb48dfab..a7b05bf82d31 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h> 35#include <linux/fs_struct.h>
36#include <linux/posix_acl.h> 36#include <linux/posix_acl.h>
37#include <linux/hash.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
39#include "internal.h" 40#include "internal.h"
@@ -643,24 +644,22 @@ static int complete_walk(struct nameidata *nd)
643 644
644static __always_inline void set_root(struct nameidata *nd) 645static __always_inline void set_root(struct nameidata *nd)
645{ 646{
646 if (!nd->root.mnt) 647 get_fs_root(current->fs, &nd->root);
647 get_fs_root(current->fs, &nd->root);
648} 648}
649 649
650static int link_path_walk(const char *, struct nameidata *); 650static int link_path_walk(const char *, struct nameidata *);
651 651
652static __always_inline void set_root_rcu(struct nameidata *nd) 652static __always_inline unsigned set_root_rcu(struct nameidata *nd)
653{ 653{
654 if (!nd->root.mnt) { 654 struct fs_struct *fs = current->fs;
655 struct fs_struct *fs = current->fs; 655 unsigned seq, res;
656 unsigned seq;
657 656
658 do { 657 do {
659 seq = read_seqcount_begin(&fs->seq); 658 seq = read_seqcount_begin(&fs->seq);
660 nd->root = fs->root; 659 nd->root = fs->root;
661 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); 660 res = __read_seqcount_begin(&nd->root.dentry->d_seq);
662 } while (read_seqcount_retry(&fs->seq, seq)); 661 } while (read_seqcount_retry(&fs->seq, seq));
663 } 662 return res;
664} 663}
665 664
666static void path_put_conditional(struct path *path, struct nameidata *nd) 665static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -860,7 +859,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
860 return PTR_ERR(s); 859 return PTR_ERR(s);
861 } 860 }
862 if (*s == '/') { 861 if (*s == '/') {
863 set_root(nd); 862 if (!nd->root.mnt)
863 set_root(nd);
864 path_put(&nd->path); 864 path_put(&nd->path);
865 nd->path = nd->root; 865 nd->path = nd->root;
866 path_get(&nd->root); 866 path_get(&nd->root);
@@ -1137,13 +1137,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1137 */ 1137 */
1138 *inode = path->dentry->d_inode; 1138 *inode = path->dentry->d_inode;
1139 } 1139 }
1140 return read_seqretry(&mount_lock, nd->m_seq) && 1140 return !read_seqretry(&mount_lock, nd->m_seq) &&
1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); 1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1142} 1142}
1143 1143
1144static int follow_dotdot_rcu(struct nameidata *nd) 1144static int follow_dotdot_rcu(struct nameidata *nd)
1145{ 1145{
1146 set_root_rcu(nd); 1146 struct inode *inode = nd->inode;
1147 if (!nd->root.mnt)
1148 set_root_rcu(nd);
1147 1149
1148 while (1) { 1150 while (1) {
1149 if (nd->path.dentry == nd->root.dentry && 1151 if (nd->path.dentry == nd->root.dentry &&
@@ -1155,6 +1157,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1155 struct dentry *parent = old->d_parent; 1157 struct dentry *parent = old->d_parent;
1156 unsigned seq; 1158 unsigned seq;
1157 1159
1160 inode = parent->d_inode;
1158 seq = read_seqcount_begin(&parent->d_seq); 1161 seq = read_seqcount_begin(&parent->d_seq);
1159 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1162 if (read_seqcount_retry(&old->d_seq, nd->seq))
1160 goto failed; 1163 goto failed;
@@ -1164,6 +1167,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1164 } 1167 }
1165 if (!follow_up_rcu(&nd->path)) 1168 if (!follow_up_rcu(&nd->path))
1166 break; 1169 break;
1170 inode = nd->path.dentry->d_inode;
1167 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1171 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1168 } 1172 }
1169 while (d_mountpoint(nd->path.dentry)) { 1173 while (d_mountpoint(nd->path.dentry)) {
@@ -1173,11 +1177,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1173 break; 1177 break;
1174 nd->path.mnt = &mounted->mnt; 1178 nd->path.mnt = &mounted->mnt;
1175 nd->path.dentry = mounted->mnt.mnt_root; 1179 nd->path.dentry = mounted->mnt.mnt_root;
1180 inode = nd->path.dentry->d_inode;
1176 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1181 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1177 if (!read_seqretry(&mount_lock, nd->m_seq)) 1182 if (read_seqretry(&mount_lock, nd->m_seq))
1178 goto failed; 1183 goto failed;
1179 } 1184 }
1180 nd->inode = nd->path.dentry->d_inode; 1185 nd->inode = inode;
1181 return 0; 1186 return 0;
1182 1187
1183failed: 1188failed:
@@ -1256,7 +1261,8 @@ static void follow_mount(struct path *path)
1256 1261
1257static void follow_dotdot(struct nameidata *nd) 1262static void follow_dotdot(struct nameidata *nd)
1258{ 1263{
1259 set_root(nd); 1264 if (!nd->root.mnt)
1265 set_root(nd);
1260 1266
1261 while(1) { 1267 while(1) {
1262 struct dentry *old = nd->path.dentry; 1268 struct dentry *old = nd->path.dentry;
@@ -1634,8 +1640,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1634 1640
1635static inline unsigned int fold_hash(unsigned long hash) 1641static inline unsigned int fold_hash(unsigned long hash)
1636{ 1642{
1637 hash += hash >> (8*sizeof(int)); 1643 return hash_64(hash, 32);
1638 return hash;
1639} 1644}
1640 1645
1641#else /* 32-bit case */ 1646#else /* 32-bit case */
@@ -1669,9 +1674,9 @@ EXPORT_SYMBOL(full_name_hash);
1669 1674
1670/* 1675/*
1671 * Calculate the length and hash of the path component, and 1676 * Calculate the length and hash of the path component, and
1672 * return the length of the component; 1677 * return the "hash_len" as the result.
1673 */ 1678 */
1674static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1679static inline u64 hash_name(const char *name)
1675{ 1680{
1676 unsigned long a, b, adata, bdata, mask, hash, len; 1681 unsigned long a, b, adata, bdata, mask, hash, len;
1677 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; 1682 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
@@ -1691,9 +1696,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1691 mask = create_zero_mask(adata | bdata); 1696 mask = create_zero_mask(adata | bdata);
1692 1697
1693 hash += a & zero_bytemask(mask); 1698 hash += a & zero_bytemask(mask);
1694 *hashp = fold_hash(hash); 1699 len += find_zero(mask);
1695 1700 return hashlen_create(fold_hash(hash), len);
1696 return len + find_zero(mask);
1697} 1701}
1698 1702
1699#else 1703#else
@@ -1711,7 +1715,7 @@ EXPORT_SYMBOL(full_name_hash);
1711 * We know there's a real path component here of at least 1715 * We know there's a real path component here of at least
1712 * one character. 1716 * one character.
1713 */ 1717 */
1714static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1718static inline u64 hash_name(const char *name)
1715{ 1719{
1716 unsigned long hash = init_name_hash(); 1720 unsigned long hash = init_name_hash();
1717 unsigned long len = 0, c; 1721 unsigned long len = 0, c;
@@ -1722,8 +1726,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1722 hash = partial_name_hash(c, hash); 1726 hash = partial_name_hash(c, hash);
1723 c = (unsigned char)name[len]; 1727 c = (unsigned char)name[len];
1724 } while (c && c != '/'); 1728 } while (c && c != '/');
1725 *hashp = end_name_hash(hash); 1729 return hashlen_create(end_name_hash(hash), len);
1726 return len;
1727} 1730}
1728 1731
1729#endif 1732#endif
@@ -1748,20 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1748 1751
1749 /* At this point we know we have a real path component. */ 1752 /* At this point we know we have a real path component. */
1750 for(;;) { 1753 for(;;) {
1751 struct qstr this; 1754 u64 hash_len;
1752 long len;
1753 int type; 1755 int type;
1754 1756
1755 err = may_lookup(nd); 1757 err = may_lookup(nd);
1756 if (err) 1758 if (err)
1757 break; 1759 break;
1758 1760
1759 len = hash_name(name, &this.hash); 1761 hash_len = hash_name(name);
1760 this.name = name;
1761 this.len = len;
1762 1762
1763 type = LAST_NORM; 1763 type = LAST_NORM;
1764 if (name[0] == '.') switch (len) { 1764 if (name[0] == '.') switch (hashlen_len(hash_len)) {
1765 case 2: 1765 case 2:
1766 if (name[1] == '.') { 1766 if (name[1] == '.') {
1767 type = LAST_DOTDOT; 1767 type = LAST_DOTDOT;
@@ -1775,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1775 struct dentry *parent = nd->path.dentry; 1775 struct dentry *parent = nd->path.dentry;
1776 nd->flags &= ~LOOKUP_JUMPED; 1776 nd->flags &= ~LOOKUP_JUMPED;
1777 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1777 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1778 struct qstr this = { { .hash_len = hash_len }, .name = name };
1778 err = parent->d_op->d_hash(parent, &this); 1779 err = parent->d_op->d_hash(parent, &this);
1779 if (err < 0) 1780 if (err < 0)
1780 break; 1781 break;
1782 hash_len = this.hash_len;
1783 name = this.name;
1781 } 1784 }
1782 } 1785 }
1783 1786
1784 nd->last = this; 1787 nd->last.hash_len = hash_len;
1788 nd->last.name = name;
1785 nd->last_type = type; 1789 nd->last_type = type;
1786 1790
1787 if (!name[len]) 1791 name += hashlen_len(hash_len);
1792 if (!*name)
1788 return 0; 1793 return 0;
1789 /* 1794 /*
1790 * If it wasn't NUL, we know it was '/'. Skip that 1795 * If it wasn't NUL, we know it was '/'. Skip that
1791 * slash, and continue until no more slashes. 1796 * slash, and continue until no more slashes.
1792 */ 1797 */
1793 do { 1798 do {
1794 len++; 1799 name++;
1795 } while (unlikely(name[len] == '/')); 1800 } while (unlikely(*name == '/'));
1796 if (!name[len]) 1801 if (!*name)
1797 return 0; 1802 return 0;
1798 1803
1799 name += len;
1800
1801 err = walk_component(nd, &next, LOOKUP_FOLLOW); 1804 err = walk_component(nd, &next, LOOKUP_FOLLOW);
1802 if (err < 0) 1805 if (err < 0)
1803 return err; 1806 return err;
@@ -1852,7 +1855,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1852 if (*name=='/') { 1855 if (*name=='/') {
1853 if (flags & LOOKUP_RCU) { 1856 if (flags & LOOKUP_RCU) {
1854 rcu_read_lock(); 1857 rcu_read_lock();
1855 set_root_rcu(nd); 1858 nd->seq = set_root_rcu(nd);
1856 } else { 1859 } else {
1857 set_root(nd); 1860 set_root(nd);
1858 path_get(&nd->root); 1861 path_get(&nd->root);
@@ -1903,7 +1906,14 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1903 } 1906 }
1904 1907
1905 nd->inode = nd->path.dentry->d_inode; 1908 nd->inode = nd->path.dentry->d_inode;
1906 return 0; 1909 if (!(flags & LOOKUP_RCU))
1910 return 0;
1911 if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
1912 return 0;
1913 if (!(nd->flags & LOOKUP_ROOT))
1914 nd->root.mnt = NULL;
1915 rcu_read_unlock();
1916 return -ECHILD;
1907} 1917}
1908 1918
1909static inline int lookup_last(struct nameidata *nd, struct path *path) 1919static inline int lookup_last(struct nameidata *nd, struct path *path)
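The payoff of returning a u64 from hash_name() is that the component's hash and length travel together in one register, and nd->last.hash_len is stored with a single write. The packing macros used above (hashlen_create, hashlen_len) live alongside struct qstr in include/linux/dcache.h; their shape, as a sketch:

#define hashlen_hash(hashlen)		((u32)(hashlen))
#define hashlen_len(hashlen)		((u32)((hashlen) >> 32))
#define hashlen_create(hash, len)	(((u64)(len) << 32) | (u32)(hash))
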
diff --git a/fs/namespace.c b/fs/namespace.c
index a01c7730e9af..ef42d9bee212 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1217,6 +1217,11 @@ static void namespace_unlock(void)
1217 head.first->pprev = &head.first; 1217 head.first->pprev = &head.first;
1218 INIT_HLIST_HEAD(&unmounted); 1218 INIT_HLIST_HEAD(&unmounted);
1219 1219
1220 /* undo decrements we'd done in umount_tree() */
1221 hlist_for_each_entry(mnt, &head, mnt_hash)
1222 if (mnt->mnt_ex_mountpoint.mnt)
1223 mntget(mnt->mnt_ex_mountpoint.mnt);
1224
1220 up_write(&namespace_sem); 1225 up_write(&namespace_sem);
1221 1226
1222 synchronize_rcu(); 1227 synchronize_rcu();
@@ -1253,6 +1258,9 @@ void umount_tree(struct mount *mnt, int how)
1253 hlist_add_head(&p->mnt_hash, &tmp_list); 1258 hlist_add_head(&p->mnt_hash, &tmp_list);
1254 } 1259 }
1255 1260
1261 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1262 list_del_init(&p->mnt_child);
1263
1256 if (how) 1264 if (how)
1257 propagate_umount(&tmp_list); 1265 propagate_umount(&tmp_list);
1258 1266
@@ -1263,9 +1271,9 @@ void umount_tree(struct mount *mnt, int how)
1263 p->mnt_ns = NULL; 1271 p->mnt_ns = NULL;
1264 if (how < 2) 1272 if (how < 2)
1265 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1273 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1266 list_del_init(&p->mnt_child);
1267 if (mnt_has_parent(p)) { 1274 if (mnt_has_parent(p)) {
1268 put_mountpoint(p->mnt_mp); 1275 put_mountpoint(p->mnt_mp);
1276 mnt_add_count(p->mnt_parent, -1);
1269 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1277 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1270 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1278 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1271 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1279 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
2# Makefile for the pNFS block layout driver kernel module 2# Makefile for the pNFS block layout driver kernel module
3# 3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o 4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o 5
6blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h> 38#include <linux/prefetch.h>
40#include <linux/pagevec.h> 39#include <linux/pagevec.h>
41 40
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 49MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
51MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 50MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
52 51
53static void print_page(struct page *page) 52static bool is_hole(struct pnfs_block_extent *be)
54{ 53{
55 dprintk("PRINTPAGE page %p\n", page); 54 switch (be->be_state) {
56 dprintk(" PagePrivate %d\n", PagePrivate(page)); 55 case PNFS_BLOCK_NONE_DATA:
57 dprintk(" PageUptodate %d\n", PageUptodate(page)); 56 return true;
58 dprintk(" PageError %d\n", PageError(page)); 57 case PNFS_BLOCK_INVALID_DATA:
59 dprintk(" PageDirty %d\n", PageDirty(page)); 58 return be->be_tag ? false : true;
60 dprintk(" PageReferenced %d\n", PageReferenced(page)); 59 default:
61 dprintk(" PageLocked %d\n", PageLocked(page)); 60 return false;
62 dprintk(" PageWriteback %d\n", PageWriteback(page)); 61 }
63 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
64 dprintk("\n");
65}
66
67/* Given the be associated with isect, determine if page data needs to be
68 * initialized.
69 */
70static int is_hole(struct pnfs_block_extent *be, sector_t isect)
71{
72 if (be->be_state == PNFS_BLOCK_NONE_DATA)
73 return 1;
74 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
75 return 0;
76 else
77 return !bl_is_sector_init(be->be_inval, isect);
78}
79
80/* Given the be associated with isect, determine if page data can be
81 * written to disk.
82 */
83static int is_writable(struct pnfs_block_extent *be, sector_t isect)
84{
85 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
86 be->be_state == PNFS_BLOCK_INVALID_DATA);
87} 62}
88 63
89/* The data we are handed might be spread across several bios. We need 64/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
91 */ 66 */
92struct parallel_io { 67struct parallel_io {
93 struct kref refcnt; 68 struct kref refcnt;
94 void (*pnfs_callback) (void *data, int num_se); 69 void (*pnfs_callback) (void *data);
95 void *data; 70 void *data;
96 int bse_count;
97}; 71};
98 72
99static inline struct parallel_io *alloc_parallel(void *data) 73static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
104 if (rv) { 78 if (rv) {
105 rv->data = data; 79 rv->data = data;
106 kref_init(&rv->refcnt); 80 kref_init(&rv->refcnt);
107 rv->bse_count = 0;
108 } 81 }
109 return rv; 82 return rv;
110} 83}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
119 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 92 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
120 93
121 dprintk("%s enter\n", __func__); 94 dprintk("%s enter\n", __func__);
122 p->pnfs_callback(p->data, p->bse_count); 95 p->pnfs_callback(p->data);
123 kfree(p); 96 kfree(p);
124} 97}
125 98
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
141 return NULL; 114 return NULL;
142} 115}
143 116
144static struct bio *bl_alloc_init_bio(int npg, sector_t isect, 117static struct bio *
145 struct pnfs_block_extent *be, 118bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
146 void (*end_io)(struct bio *, int err), 119 void (*end_io)(struct bio *, int err), struct parallel_io *par)
147 struct parallel_io *par)
148{ 120{
149 struct bio *bio; 121 struct bio *bio;
150 122
@@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
156 } 128 }
157 129
158 if (bio) { 130 if (bio) {
159 bio->bi_iter.bi_sector = isect - be->be_f_offset + 131 bio->bi_iter.bi_sector = disk_sector;
160 be->be_v_offset; 132 bio->bi_bdev = bdev;
161 bio->bi_bdev = be->be_mdev;
162 bio->bi_end_io = end_io; 133 bio->bi_end_io = end_io;
163 bio->bi_private = par; 134 bio->bi_private = par;
164 } 135 }
165 return bio; 136 return bio;
166} 137}
167 138
168static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, 139static struct bio *
169 sector_t isect, struct page *page, 140do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
170 struct pnfs_block_extent *be, 141 struct page *page, struct pnfs_block_dev_map *map,
171 void (*end_io)(struct bio *, int err), 142 struct pnfs_block_extent *be,
172 struct parallel_io *par, 143 void (*end_io)(struct bio *, int err),
173 unsigned int offset, int len) 144 struct parallel_io *par, unsigned int offset, int *len)
174{ 145{
175 isect = isect + (offset >> SECTOR_SHIFT); 146 struct pnfs_block_dev *dev =
147 container_of(be->be_device, struct pnfs_block_dev, node);
148 u64 disk_addr, end;
149
176 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, 150 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
177 npg, rw, (unsigned long long)isect, offset, len); 151 npg, rw, (unsigned long long)isect, offset, *len);
152
153 /* translate to device offset */
154 isect += be->be_v_offset;
155 isect -= be->be_f_offset;
156
157 /* translate to physical disk offset */
158 disk_addr = (u64)isect << SECTOR_SHIFT;
159 if (disk_addr < map->start || disk_addr >= map->start + map->len) {
160 if (!dev->map(dev, disk_addr, map))
161 return ERR_PTR(-EIO);
162 bio = bl_submit_bio(rw, bio);
163 }
164 disk_addr += map->disk_offset;
165 disk_addr -= map->start;
166
167 /* limit length to what the device mapping allows */
168 end = disk_addr + *len;
169 if (end >= map->start + map->len)
170 *len = map->start + map->len - disk_addr;
171
178retry: 172retry:
179 if (!bio) { 173 if (!bio) {
180 bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 174 bio = bl_alloc_init_bio(npg, map->bdev,
175 disk_addr >> SECTOR_SHIFT, end_io, par);
181 if (!bio) 176 if (!bio)
182 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
183 } 178 }
184 if (bio_add_page(bio, page, len, offset) < len) { 179 if (bio_add_page(bio, page, *len, offset) < *len) {
185 bio = bl_submit_bio(rw, bio); 180 bio = bl_submit_bio(rw, bio);
186 goto retry; 181 goto retry;
187 } 182 }
188 return bio; 183 return bio;
189} 184}
190 185
191static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
192 sector_t isect, struct page *page,
193 struct pnfs_block_extent *be,
194 void (*end_io)(struct bio *, int err),
195 struct parallel_io *par)
196{
197 return do_add_page_to_bio(bio, npg, rw, isect, page, be,
198 end_io, par, 0, PAGE_CACHE_SIZE);
199}
200
201/* This is basically copied from mpage_end_io_read */
202static void bl_end_io_read(struct bio *bio, int err) 186static void bl_end_io_read(struct bio *bio, int err)
203{ 187{
204 struct parallel_io *par = bio->bi_private; 188 struct parallel_io *par = bio->bi_private;
205 struct bio_vec *bvec;
206 int i;
207
208 if (!err)
209 bio_for_each_segment_all(bvec, bio, i)
210 SetPageUptodate(bvec->bv_page);
211 189
212 if (err) { 190 if (err) {
213 struct nfs_pgio_header *header = par->data; 191 struct nfs_pgio_header *header = par->data;
@@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err)
216 header->pnfs_error = -EIO; 194 header->pnfs_error = -EIO;
217 pnfs_set_lo_fail(header->lseg); 195 pnfs_set_lo_fail(header->lseg);
218 } 196 }
197
219 bio_put(bio); 198 bio_put(bio);
220 put_parallel(par); 199 put_parallel(par);
221} 200}
@@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work)
231} 210}
232 211
233static void 212static void
234bl_end_par_io_read(void *data, int unused) 213bl_end_par_io_read(void *data)
235{ 214{
236 struct nfs_pgio_header *hdr = data; 215 struct nfs_pgio_header *hdr = data;
237 216
@@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused)
241} 220}
242 221
243static enum pnfs_try_status 222static enum pnfs_try_status
244bl_read_pagelist(struct nfs_pgio_header *hdr) 223bl_read_pagelist(struct nfs_pgio_header *header)
245{ 224{
246 struct nfs_pgio_header *header = hdr; 225 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
247 int i, hole; 226 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
248 struct bio *bio = NULL; 227 struct bio *bio = NULL;
249 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 228 struct pnfs_block_extent be;
250 sector_t isect, extent_length = 0; 229 sector_t isect, extent_length = 0;
251 struct parallel_io *par; 230 struct parallel_io *par;
252 loff_t f_offset = hdr->args.offset; 231 loff_t f_offset = header->args.offset;
253 size_t bytes_left = hdr->args.count; 232 size_t bytes_left = header->args.count;
254 unsigned int pg_offset, pg_len; 233 unsigned int pg_offset, pg_len;
255 struct page **pages = hdr->args.pages; 234 struct page **pages = header->args.pages;
256 int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT; 235 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
257 const bool is_dio = (header->dreq != NULL); 236 const bool is_dio = (header->dreq != NULL);
237 struct blk_plug plug;
238 int i;
258 239
259 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 240 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
260 hdr->page_array.npages, f_offset, 241 header->page_array.npages, f_offset,
261 (unsigned int)hdr->args.count); 242 (unsigned int)header->args.count);
262 243
263 par = alloc_parallel(hdr); 244 par = alloc_parallel(header);
264 if (!par) 245 if (!par)
265 goto use_mds; 246 return PNFS_NOT_ATTEMPTED;
266 par->pnfs_callback = bl_end_par_io_read; 247 par->pnfs_callback = bl_end_par_io_read;
267 /* At this point, we can no longer jump to use_mds */ 248
249 blk_start_plug(&plug);
268 250
269 isect = (sector_t) (f_offset >> SECTOR_SHIFT); 251 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
270 /* Code assumes extents are page-aligned */ 252 /* Code assumes extents are page-aligned */
271 for (i = pg_index; i < hdr->page_array.npages; i++) { 253 for (i = pg_index; i < header->page_array.npages; i++) {
272 if (!extent_length) { 254 if (extent_length <= 0) {
273 /* We've used up the previous extent */ 255 /* We've used up the previous extent */
274 bl_put_extent(be);
275 bl_put_extent(cow_read);
276 bio = bl_submit_bio(READ, bio); 256 bio = bl_submit_bio(READ, bio);
257
277 /* Get the next one */ 258 /* Get the next one */
278 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 259 if (!ext_tree_lookup(bl, isect, &be, false)) {
279 isect, &cow_read);
280 if (!be) {
281 header->pnfs_error = -EIO; 260 header->pnfs_error = -EIO;
282 goto out; 261 goto out;
283 } 262 }
284 extent_length = be->be_length - 263 extent_length = be.be_length - (isect - be.be_f_offset);
285 (isect - be->be_f_offset);
286 if (cow_read) {
287 sector_t cow_length = cow_read->be_length -
288 (isect - cow_read->be_f_offset);
289 extent_length = min(extent_length, cow_length);
290 }
291 } 264 }
292 265
266 pg_offset = f_offset & ~PAGE_CACHE_MASK;
293 if (is_dio) { 267 if (is_dio) {
294 pg_offset = f_offset & ~PAGE_CACHE_MASK;
295 if (pg_offset + bytes_left > PAGE_CACHE_SIZE) 268 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
296 pg_len = PAGE_CACHE_SIZE - pg_offset; 269 pg_len = PAGE_CACHE_SIZE - pg_offset;
297 else 270 else
298 pg_len = bytes_left; 271 pg_len = bytes_left;
299
300 f_offset += pg_len;
301 bytes_left -= pg_len;
302 isect += (pg_offset >> SECTOR_SHIFT);
303 } else { 272 } else {
304 pg_offset = 0; 273 BUG_ON(pg_offset != 0);
305 pg_len = PAGE_CACHE_SIZE; 274 pg_len = PAGE_CACHE_SIZE;
306 } 275 }
307 276
308 hole = is_hole(be, isect); 277 isect += (pg_offset >> SECTOR_SHIFT);
309 if (hole && !cow_read) { 278 extent_length -= (pg_offset >> SECTOR_SHIFT);
279
280 if (is_hole(&be)) {
310 bio = bl_submit_bio(READ, bio); 281 bio = bl_submit_bio(READ, bio);
311 /* Fill hole w/ zeroes w/o accessing device */ 282 /* Fill hole w/ zeroes w/o accessing device */
312 dprintk("%s Zeroing page for hole\n", __func__); 283 dprintk("%s Zeroing page for hole\n", __func__);
313 zero_user_segment(pages[i], pg_offset, pg_len); 284 zero_user_segment(pages[i], pg_offset, pg_len);
314 print_page(pages[i]);
315 SetPageUptodate(pages[i]);
316 } else {
317 struct pnfs_block_extent *be_read;
318 285
319 be_read = (hole && cow_read) ? cow_read : be; 286 /* invalidate map */
287 map.start = NFS4_MAX_UINT64;
288 } else {
320 bio = do_add_page_to_bio(bio, 289 bio = do_add_page_to_bio(bio,
321 hdr->page_array.npages - i, 290 header->page_array.npages - i,
322 READ, 291 READ,
323 isect, pages[i], be_read, 292 isect, pages[i], &map, &be,
324 bl_end_io_read, par, 293 bl_end_io_read, par,
325 pg_offset, pg_len); 294 pg_offset, &pg_len);
326 if (IS_ERR(bio)) { 295 if (IS_ERR(bio)) {
327 header->pnfs_error = PTR_ERR(bio); 296 header->pnfs_error = PTR_ERR(bio);
328 bio = NULL; 297 bio = NULL;
@@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
330 } 299 }
331 } 300 }
332 isect += (pg_len >> SECTOR_SHIFT); 301 isect += (pg_len >> SECTOR_SHIFT);
333 extent_length -= PAGE_CACHE_SECTORS; 302 extent_length -= (pg_len >> SECTOR_SHIFT);
303 f_offset += pg_len;
304 bytes_left -= pg_len;
334 } 305 }
335 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 306 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
336 hdr->res.eof = 1; 307 header->res.eof = 1;
337 hdr->res.count = header->inode->i_size - hdr->args.offset; 308 header->res.count = header->inode->i_size - header->args.offset;
338 } else { 309 } else {
339 hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset; 310 header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
340 } 311 }
341out: 312out:
342 bl_put_extent(be);
343 bl_put_extent(cow_read);
344 bl_submit_bio(READ, bio); 313 bl_submit_bio(READ, bio);
314 blk_finish_plug(&plug);
345 put_parallel(par); 315 put_parallel(par);
346 return PNFS_ATTEMPTED; 316 return PNFS_ATTEMPTED;
347
348 use_mds:
349 dprintk("Giving up and using normal NFS\n");
350 return PNFS_NOT_ATTEMPTED;
351}
352
353static void mark_extents_written(struct pnfs_block_layout *bl,
354 __u64 offset, __u32 count)
355{
356 sector_t isect, end;
357 struct pnfs_block_extent *be;
358 struct pnfs_block_short_extent *se;
359
360 dprintk("%s(%llu, %u)\n", __func__, offset, count);
361 if (count == 0)
362 return;
363 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
364 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
365 end >>= SECTOR_SHIFT;
366 while (isect < end) {
367 sector_t len;
368 be = bl_find_get_extent(bl, isect, NULL);
369 BUG_ON(!be); /* FIXME */
370 len = min(end, be->be_f_offset + be->be_length) - isect;
371 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
372 se = bl_pop_one_short_extent(be->be_inval);
373 BUG_ON(!se);
374 bl_mark_for_commit(be, isect, len, se);
375 }
376 isect += len;
377 bl_put_extent(be);
378 }
379}
380
381static void bl_end_io_write_zero(struct bio *bio, int err)
382{
383 struct parallel_io *par = bio->bi_private;
384 struct bio_vec *bvec;
385 int i;
386
387 bio_for_each_segment_all(bvec, bio, i) {
388 /* This is the zeroing page we added */
389 end_page_writeback(bvec->bv_page);
390 page_cache_release(bvec->bv_page);
391 }
392
393 if (unlikely(err)) {
394 struct nfs_pgio_header *header = par->data;
395
396 if (!header->pnfs_error)
397 header->pnfs_error = -EIO;
398 pnfs_set_lo_fail(header->lseg);
399 }
400 bio_put(bio);
401 put_parallel(par);
402} 317}
403 318
404static void bl_end_io_write(struct bio *bio, int err) 319static void bl_end_io_write(struct bio *bio, int err)
@@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
421 */ 336 */
422static void bl_write_cleanup(struct work_struct *work) 337static void bl_write_cleanup(struct work_struct *work)
423{ 338{
424 struct rpc_task *task; 339 struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
425 struct nfs_pgio_header *hdr; 340 struct nfs_pgio_header *hdr =
341 container_of(task, struct nfs_pgio_header, task);
342
426 dprintk("%s enter\n", __func__); 343 dprintk("%s enter\n", __func__);
427 task = container_of(work, struct rpc_task, u.tk_work); 344
428 hdr = container_of(task, struct nfs_pgio_header, task);
429 if (likely(!hdr->pnfs_error)) { 345 if (likely(!hdr->pnfs_error)) {
430 /* Marks for LAYOUTCOMMIT */ 346 struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
431 mark_extents_written(BLK_LSEG2EXT(hdr->lseg), 347 u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
432 hdr->args.offset, hdr->args.count); 348 u64 end = (hdr->args.offset + hdr->args.count +
349 PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
350
351 ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
352 (end - start) >> SECTOR_SHIFT);
433 } 353 }
354
434 pnfs_ld_write_done(hdr); 355 pnfs_ld_write_done(hdr);
435} 356}
436 357
437/* Called when last of bios associated with a bl_write_pagelist call finishes */ 358/* Called when last of bios associated with a bl_write_pagelist call finishes */
438static void bl_end_par_io_write(void *data, int num_se) 359static void bl_end_par_io_write(void *data)
439{ 360{
440 struct nfs_pgio_header *hdr = data; 361 struct nfs_pgio_header *hdr = data;
441 362
442 if (unlikely(hdr->pnfs_error)) {
443 bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
444 num_se);
445 }
446
447 hdr->task.tk_status = hdr->pnfs_error; 363 hdr->task.tk_status = hdr->pnfs_error;
448 hdr->verf.committed = NFS_FILE_SYNC; 364 hdr->verf.committed = NFS_FILE_SYNC;
449 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); 365 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
450 schedule_work(&hdr->task.u.tk_work); 366 schedule_work(&hdr->task.u.tk_work);
451} 367}
452 368
453/* FIXME STUB - mark intersection of layout and page as bad, so is not
454 * used again.
455 */
456static void mark_bad_read(void)
457{
458 return;
459}
460
461/*
462 * map_block: map a requested I/0 block (isect) into an offset in the LVM
463 * block_device
464 */
465static void
466map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
467{
468 dprintk("%s enter be=%p\n", __func__, be);
469
470 set_buffer_mapped(bh);
471 bh->b_bdev = be->be_mdev;
472 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
473 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
474
475 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
476 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
477 bh->b_size);
478 return;
479}
480
481static void
482bl_read_single_end_io(struct bio *bio, int error)
483{
484 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
485 struct page *page = bvec->bv_page;
486
487 /* Only one page in bvec */
488 unlock_page(page);
489}
490
491static int
492bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
493 unsigned int offset, unsigned int len)
494{
495 struct bio *bio;
496 struct page *shadow_page;
497 sector_t isect;
498 char *kaddr, *kshadow_addr;
499 int ret = 0;
500
501 dprintk("%s: offset %u len %u\n", __func__, offset, len);
502
503 shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
504 if (shadow_page == NULL)
505 return -ENOMEM;
506
507 bio = bio_alloc(GFP_NOIO, 1);
508 if (bio == NULL)
509 return -ENOMEM;
510
511 isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
512 (offset / SECTOR_SIZE);
513
514 bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
515 bio->bi_bdev = be->be_mdev;
516 bio->bi_end_io = bl_read_single_end_io;
517
518 lock_page(shadow_page);
519 if (bio_add_page(bio, shadow_page,
520 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
521 unlock_page(shadow_page);
522 bio_put(bio);
523 return -EIO;
524 }
525
526 submit_bio(READ, bio);
527 wait_on_page_locked(shadow_page);
528 if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
529 ret = -EIO;
530 } else {
531 kaddr = kmap_atomic(page);
532 kshadow_addr = kmap_atomic(shadow_page);
533 memcpy(kaddr + offset, kshadow_addr + offset, len);
534 kunmap_atomic(kshadow_addr);
535 kunmap_atomic(kaddr);
536 }
537 __free_page(shadow_page);
538 bio_put(bio);
539
540 return ret;
541}
542
543static int
544bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
545 unsigned int dirty_offset, unsigned int dirty_len,
546 bool full_page)
547{
548 int ret = 0;
549 unsigned int start, end;
550
551 if (full_page) {
552 start = 0;
553 end = PAGE_CACHE_SIZE;
554 } else {
555 start = round_down(dirty_offset, SECTOR_SIZE);
556 end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
557 }
558
559 dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
560 if (!be) {
561 zero_user_segments(page, start, dirty_offset,
562 dirty_offset + dirty_len, end);
563 if (start == 0 && end == PAGE_CACHE_SIZE &&
564 trylock_page(page)) {
565 SetPageUptodate(page);
566 unlock_page(page);
567 }
568 return ret;
569 }
570
571 if (start != dirty_offset)
572 ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
573
574 if (!ret && (dirty_offset + dirty_len < end))
575 ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
576 end - dirty_offset - dirty_len);
577
578 return ret;
579}
580
581/* Given an unmapped page, zero it or read in page for COW, page is locked
582 * by caller.
583 */
584static int
585init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
586{
587 struct buffer_head *bh = NULL;
588 int ret = 0;
589 sector_t isect;
590
591 dprintk("%s enter, %p\n", __func__, page);
592 BUG_ON(PageUptodate(page));
593 if (!cow_read) {
594 zero_user_segment(page, 0, PAGE_SIZE);
595 SetPageUptodate(page);
596 goto cleanup;
597 }
598
599 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
600 if (!bh) {
601 ret = -ENOMEM;
602 goto cleanup;
603 }
604
605 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
606 map_block(bh, isect, cow_read);
607 if (!bh_uptodate_or_lock(bh))
608 ret = bh_submit_read(bh);
609 if (ret)
610 goto cleanup;
611 SetPageUptodate(page);
612
613cleanup:
614 if (bh)
615 free_buffer_head(bh);
616 if (ret) {
617 /* Need to mark layout with bad read...should now
618 * just use nfs4 for reads and writes.
619 */
620 mark_bad_read();
621 }
622 return ret;
623}
624
625/* Find or create a zeroing page marked being writeback.
626 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
627 * to indicate write out.
628 */
629static struct page *
630bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
631 struct pnfs_block_extent *cow_read)
632{
633 struct page *page;
634 int locked = 0;
635 page = find_get_page(inode->i_mapping, index);
636 if (page)
637 goto check_page;
638
639 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
640 if (unlikely(!page)) {
641 dprintk("%s oom\n", __func__);
642 return ERR_PTR(-ENOMEM);
643 }
644 locked = 1;
645
646check_page:
647 /* PageDirty: Other will write this out
648 * PageWriteback: Other is writing this out
649 * PageUptodate: It was read before
650 */
651 if (PageDirty(page) || PageWriteback(page)) {
652 print_page(page);
653 if (locked)
654 unlock_page(page);
655 page_cache_release(page);
656 return NULL;
657 }
658
659 if (!locked) {
660 lock_page(page);
661 locked = 1;
662 goto check_page;
663 }
664 if (!PageUptodate(page)) {
665 /* New page, readin or zero it */
666 init_page_for_write(page, cow_read);
667 }
668 set_page_writeback(page);
669 unlock_page(page);
670
671 return page;
672}
673
674static enum pnfs_try_status 369static enum pnfs_try_status
675bl_write_pagelist(struct nfs_pgio_header *header, int sync) 370bl_write_pagelist(struct nfs_pgio_header *header, int sync)
676{ 371{
677 int i, ret, npg_zero, pg_index, last = 0; 372 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
373 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
678 struct bio *bio = NULL; 374 struct bio *bio = NULL;
679 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 375 struct pnfs_block_extent be;
680 sector_t isect, last_isect = 0, extent_length = 0; 376 sector_t isect, extent_length = 0;
681 struct parallel_io *par = NULL; 377 struct parallel_io *par = NULL;
682 loff_t offset = header->args.offset; 378 loff_t offset = header->args.offset;
683 size_t count = header->args.count; 379 size_t count = header->args.count;
684 unsigned int pg_offset, pg_len, saved_len;
685 struct page **pages = header->args.pages; 380 struct page **pages = header->args.pages;
687 struct page *page; 381 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
687 pgoff_t index; 382 unsigned int pg_len;
688 u64 temp; 383 struct blk_plug plug;
689 int npg_per_block = 384 int i;
690 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
691 385
692 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 386 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
693 387
694 if (header->dreq != NULL &&
695 (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
696 !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
697 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
698 goto out_mds;
699 }
700 /* At this point, header->page_array is a (sequential) list of nfs_pages. 388 /* At this point, header->page_array is a (sequential) list of nfs_pages.
701 * We want to write each, and if there is an error set pnfs_error 389 * We want to write each, and if there is an error set pnfs_error
702 * to have it redone using nfs. 390 * to have it redone using nfs.
703 */ 391 */
704 par = alloc_parallel(header); 392 par = alloc_parallel(header);
705 if (!par) 393 if (!par)
706 goto out_mds; 394 return PNFS_NOT_ATTEMPTED;
707 par->pnfs_callback = bl_end_par_io_write; 395 par->pnfs_callback = bl_end_par_io_write;
708 /* At this point, have to be more careful with error handling */
709 396
710 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 397 blk_start_plug(&plug);
711 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
712 if (!be || !is_writable(be, isect)) {
713 dprintk("%s no matching extents!\n", __func__);
714 goto out_mds;
715 }
716 398
717 /* First page inside INVALID extent */ 399 /* we always write out the whole page */
718 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 400 offset = offset & (loff_t)PAGE_CACHE_MASK;
719 if (likely(!bl_push_one_short_extent(be->be_inval))) 401 isect = offset >> SECTOR_SHIFT;
720 par->bse_count++;
721 else
722 goto out_mds;
723 temp = offset >> PAGE_CACHE_SHIFT;
724 npg_zero = do_div(temp, npg_per_block);
725 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
726 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
727 extent_length = be->be_length - (isect - be->be_f_offset);
728
729fill_invalid_ext:
730 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
731 for (;npg_zero > 0; npg_zero--) {
732 if (bl_is_sector_init(be->be_inval, isect)) {
733 dprintk("isect %llu already init\n",
734 (unsigned long long)isect);
735 goto next_page;
736 }
737 /* page ref released in bl_end_io_write_zero */
738 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
739 dprintk("%s zero %dth page: index %lu isect %llu\n",
740 __func__, npg_zero, index,
741 (unsigned long long)isect);
742 page = bl_find_get_zeroing_page(header->inode, index,
743 cow_read);
744 if (unlikely(IS_ERR(page))) {
745 header->pnfs_error = PTR_ERR(page);
746 goto out;
747 } else if (page == NULL)
748 goto next_page;
749
750 ret = bl_mark_sectors_init(be->be_inval, isect,
751 PAGE_CACHE_SECTORS);
752 if (unlikely(ret)) {
753 dprintk("%s bl_mark_sectors_init fail %d\n",
754 __func__, ret);
755 end_page_writeback(page);
756 page_cache_release(page);
757 header->pnfs_error = ret;
758 goto out;
759 }
760 if (likely(!bl_push_one_short_extent(be->be_inval)))
761 par->bse_count++;
762 else {
763 end_page_writeback(page);
764 page_cache_release(page);
765 header->pnfs_error = -ENOMEM;
766 goto out;
767 }
768 /* FIXME: This should be done in bi_end_io */
769 mark_extents_written(BLK_LSEG2EXT(header->lseg),
770 page->index << PAGE_CACHE_SHIFT,
771 PAGE_CACHE_SIZE);
772
773 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
774 isect, page, be,
775 bl_end_io_write_zero, par);
776 if (IS_ERR(bio)) {
777 header->pnfs_error = PTR_ERR(bio);
778 bio = NULL;
779 goto out;
780 }
781next_page:
782 isect += PAGE_CACHE_SECTORS;
783 extent_length -= PAGE_CACHE_SECTORS;
784 }
785 if (last)
786 goto write_done;
787 }
788 bio = bl_submit_bio(WRITE, bio);
789 402
790 /* Middle pages */
791 pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
792 for (i = pg_index; i < header->page_array.npages; i++) { 403 for (i = pg_index; i < header->page_array.npages; i++) {
793 if (!extent_length) { 404 if (extent_length <= 0) {
794 /* We've used up the previous extent */ 405 /* We've used up the previous extent */
795 bl_put_extent(be);
796 bl_put_extent(cow_read);
797 bio = bl_submit_bio(WRITE, bio); 406 bio = bl_submit_bio(WRITE, bio);
798 /* Get the next one */ 407 /* Get the next one */
799 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 408 if (!ext_tree_lookup(bl, isect, &be, true)) {
800 isect, &cow_read);
801 if (!be || !is_writable(be, isect)) {
802 header->pnfs_error = -EINVAL; 409 header->pnfs_error = -EINVAL;
803 goto out; 410 goto out;
804 } 411 }
805 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
806 if (likely(!bl_push_one_short_extent(
807 be->be_inval)))
808 par->bse_count++;
809 else {
810 header->pnfs_error = -ENOMEM;
811 goto out;
812 }
813 }
814 extent_length = be->be_length -
815 (isect - be->be_f_offset);
816 }
817
818 dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
819 pg_offset = offset & ~PAGE_CACHE_MASK;
820 if (pg_offset + count > PAGE_CACHE_SIZE)
821 pg_len = PAGE_CACHE_SIZE - pg_offset;
822 else
823 pg_len = count;
824
825 saved_len = pg_len;
826 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
827 !bl_is_sector_init(be->be_inval, isect)) {
828 ret = bl_read_partial_page_sync(pages[i], cow_read,
829 pg_offset, pg_len, true);
830 if (ret) {
831 dprintk("%s bl_read_partial_page_sync fail %d\n",
832 __func__, ret);
833 header->pnfs_error = ret;
834 goto out;
835 }
836
837 ret = bl_mark_sectors_init(be->be_inval, isect,
838 PAGE_CACHE_SECTORS);
839 if (unlikely(ret)) {
840 dprintk("%s bl_mark_sectors_init fail %d\n",
841 __func__, ret);
842 header->pnfs_error = ret;
843 goto out;
844 }
845 412
846 /* Expand to full page write */ 413 extent_length = be.be_length - (isect - be.be_f_offset);
847 pg_offset = 0;
848 pg_len = PAGE_CACHE_SIZE;
849 } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
850 (pg_len & (SECTOR_SIZE - 1))){
851 /* ahh, nasty case. We have to do sync full sector
852 * read-modify-write cycles.
853 */
854 unsigned int saved_offset = pg_offset;
855 ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
856 pg_len, false);
857 pg_offset = round_down(pg_offset, SECTOR_SIZE);
858 pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
859 - pg_offset;
860 } 414 }
861 415
862 416 pg_len = PAGE_CACHE_SIZE;
863 bio = do_add_page_to_bio(bio, header->page_array.npages - i, 417 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
864 WRITE, 418 WRITE, isect, pages[i], &map, &be,
865 isect, pages[i], be,
866 bl_end_io_write, par, 419 bl_end_io_write, par,
867 pg_offset, pg_len); 420 0, &pg_len);
868 if (IS_ERR(bio)) { 421 if (IS_ERR(bio)) {
869 header->pnfs_error = PTR_ERR(bio); 422 header->pnfs_error = PTR_ERR(bio);
870 bio = NULL; 423 bio = NULL;
871 goto out; 424 goto out;
872 } 425 }
873 offset += saved_len;
874 count -= saved_len;
875 isect += PAGE_CACHE_SECTORS;
876 last_isect = isect;
877 extent_length -= PAGE_CACHE_SECTORS;
878 }
879 426
880 /* Last page inside INVALID extent */ 427 offset += pg_len;
881 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 428 count -= pg_len;
882 bio = bl_submit_bio(WRITE, bio); 429 isect += (pg_len >> SECTOR_SHIFT);
883 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; 430 extent_length -= (pg_len >> SECTOR_SHIFT);
884 npg_zero = npg_per_block - do_div(temp, npg_per_block);
885 if (npg_zero < npg_per_block) {
886 last = 1;
887 goto fill_invalid_ext;
888 }
889 } 431 }
890 432
891write_done:
892 header->res.count = header->args.count; 433 header->res.count = header->args.count;
893out: 434out:
894 bl_put_extent(be);
895 bl_put_extent(cow_read);
896 bl_submit_bio(WRITE, bio); 435 bl_submit_bio(WRITE, bio);
436 blk_finish_plug(&plug);
897 put_parallel(par); 437 put_parallel(par);
898 return PNFS_ATTEMPTED; 438 return PNFS_ATTEMPTED;
899out_mds:
900 bl_put_extent(be);
901 bl_put_extent(cow_read);
902 kfree(par);
903 return PNFS_NOT_ATTEMPTED;
904}
905
906/* FIXME - range ignored */
907static void
908release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
909{
910 int i;
911 struct pnfs_block_extent *be;
912
913 spin_lock(&bl->bl_ext_lock);
914 for (i = 0; i < EXTENT_LISTS; i++) {
915 while (!list_empty(&bl->bl_extents[i])) {
916 be = list_first_entry(&bl->bl_extents[i],
917 struct pnfs_block_extent,
918 be_node);
919 list_del(&be->be_node);
920 bl_put_extent(be);
921 }
922 }
923 spin_unlock(&bl->bl_ext_lock);
924}
925
926static void
927release_inval_marks(struct pnfs_inval_markings *marks)
928{
929 struct pnfs_inval_tracking *pos, *temp;
930 struct pnfs_block_short_extent *se, *stemp;
931
932 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
933 list_del(&pos->it_link);
934 kfree(pos);
935 }
936
937 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
938 list_del(&se->bse_node);
939 kfree(se);
940 }
941 return;
942} 439}
943 440
944static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) 441static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
945{ 442{
946 struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 443 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
444 int err;
947 445
948 dprintk("%s enter\n", __func__); 446 dprintk("%s enter\n", __func__);
949 release_extents(bl, NULL); 447
950 release_inval_marks(&bl->bl_inval); 448 err = ext_tree_remove(bl, true, 0, LLONG_MAX);
449 WARN_ON(err);
450
951 kfree(bl); 451 kfree(bl);
952} 452}
953 453
@@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
960 bl = kzalloc(sizeof(*bl), gfp_flags); 460 bl = kzalloc(sizeof(*bl), gfp_flags);
961 if (!bl) 461 if (!bl)
962 return NULL; 462 return NULL;
463
464 bl->bl_ext_rw = RB_ROOT;
465 bl->bl_ext_ro = RB_ROOT;
963 spin_lock_init(&bl->bl_ext_lock); 466 spin_lock_init(&bl->bl_ext_lock);
964 INIT_LIST_HEAD(&bl->bl_extents[0]); 467
965 INIT_LIST_HEAD(&bl->bl_extents[1]);
966 INIT_LIST_HEAD(&bl->bl_commit);
967 INIT_LIST_HEAD(&bl->bl_committing);
968 bl->bl_count = 0;
969 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
970 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
971 return &bl->bl_layout; 468 return &bl->bl_layout;
972} 469}
973 470
@@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
977 kfree(lseg); 474 kfree(lseg);
978} 475}
979 476
980/* We pretty much ignore lseg, and store all data layout wide, so we 477/* Tracks info needed to ensure extents in layout obey constraints of spec */
981 * can correctly merge. 478struct layout_verification {
982 */ 479 u32 mode; /* R or RW */
983static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, 480 u64 start; /* Expected start of next non-COW extent */
984 struct nfs4_layoutget_res *lgr, 481 u64 inval; /* Start of INVAL coverage */
985 gfp_t gfp_flags) 482 u64 cowread; /* End of COW read coverage */
986{ 483};
987 struct pnfs_layout_segment *lseg;
988 int status;
989 484
990 dprintk("%s enter\n", __func__); 485/* Verify the extent meets the layout requirements of the pnfs-block draft,
991 lseg = kzalloc(sizeof(*lseg), gfp_flags); 486 * section 2.3.1.
992 if (!lseg) 487 */
993 return ERR_PTR(-ENOMEM); 488static int verify_extent(struct pnfs_block_extent *be,
994 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); 489 struct layout_verification *lv)
995 if (status) { 490{
996 /* We don't want to call the full-blown bl_free_lseg, 491 if (lv->mode == IOMODE_READ) {
997 * since on error extents were not touched. 492 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
998 */ 493 be->be_state == PNFS_BLOCK_INVALID_DATA)
999 kfree(lseg); 494 return -EIO;
1000 return ERR_PTR(status); 495 if (be->be_f_offset != lv->start)
496 return -EIO;
497 lv->start += be->be_length;
498 return 0;
1001 } 499 }
1002 return lseg; 500 /* lv->mode == IOMODE_RW */
501 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
502 if (be->be_f_offset != lv->start)
503 return -EIO;
504 if (lv->cowread > lv->start)
505 return -EIO;
506 lv->start += be->be_length;
507 lv->inval = lv->start;
508 return 0;
509 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
510 if (be->be_f_offset != lv->start)
511 return -EIO;
512 lv->start += be->be_length;
513 return 0;
514 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
515 if (be->be_f_offset > lv->start)
516 return -EIO;
517 if (be->be_f_offset < lv->inval)
518 return -EIO;
519 if (be->be_f_offset < lv->cowread)
520 return -EIO;
521 /* It looks like you might want to min this with lv->start,
522 * but you really don't.
523 */
524 lv->inval = lv->inval + be->be_length;
525 lv->cowread = be->be_f_offset + be->be_length;
526 return 0;
527 } else
528 return -EIO;
1003} 529}
1004 530
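The state machine above is easiest to follow with concrete numbers. Below is a hedged, userspace-only sketch (not part of this patch; simplified names) that runs an RW-mode layout through the same checks: an INVALID_DATA extent, a READ_DATA extent supplying its COW coverage, then a READWRITE_DATA extent.

/* Userspace sketch of the RW-mode checks in verify_extent(); illustrative only. */
#include <stdio.h>
#include <stdint.h>

enum { RW_DATA, READ_DATA, INVALID_DATA, NONE_DATA };

struct lv { uint64_t start, inval, cowread; };

static int verify_rw(int state, uint64_t off, uint64_t len, struct lv *lv)
{
    switch (state) {
    case RW_DATA:
        if (off != lv->start || lv->cowread > lv->start)
            return -1;
        lv->start += len;
        lv->inval = lv->start;
        return 0;
    case INVALID_DATA:
        if (off != lv->start)
            return -1;
        lv->start += len;
        return 0;
    case READ_DATA:
        if (off > lv->start || off < lv->inval || off < lv->cowread)
            return -1;
        lv->inval += len;
        lv->cowread = off + len;
        return 0;
    default:
        return -1;
    }
}

int main(void)
{
    struct lv lv = { 0, 0, 0 };
    /* INVAL [0,8) needing COW, READ [0,8) backing it, then RW [8,24). */
    int ok = verify_rw(INVALID_DATA, 0, 8, &lv) == 0 &&
             verify_rw(READ_DATA, 0, 8, &lv) == 0 &&
             verify_rw(RW_DATA, 8, 16, &lv) == 0;
    printf("layout %s, next expected offset %llu\n",
           ok ? "valid" : "invalid", (unsigned long long)lv.start);
    return 0;
}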
1005static void 531static int decode_sector_number(__be32 **rp, sector_t *sp)
1006bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
1007 const struct nfs4_layoutcommit_args *arg)
1008{ 532{
1009 dprintk("%s enter\n", __func__); 533 uint64_t s;
1010 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); 534
535 *rp = xdr_decode_hyper(*rp, &s);
536 if (s & 0x1ff) {
537 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
538 return -1;
539 }
540 *sp = s >> SECTOR_SHIFT;
541 return 0;
1011} 542}
1012 543
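For reference, the conversion contract here is simple: server offsets arrive in bytes, must be a multiple of 512, and are stored internally as sector counts. A minimal userspace analogue (illustrative, not kernel code):

/* Userspace analogue of decode_sector_number(); SECTOR_SHIFT is 9 (512 bytes). */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

static int bytes_to_sector(uint64_t bytes, uint64_t *sector)
{
    if (bytes & 0x1ff)          /* not a multiple of 512 */
        return -1;
    *sector = bytes >> SECTOR_SHIFT;
    return 0;
}

int main(void)
{
    uint64_t s;

    if (bytes_to_sector(4096, &s) == 0)
        printf("4096 bytes -> sector %llu\n", (unsigned long long)s); /* 8 */
    if (bytes_to_sector(4097, &s) < 0)
        printf("4097 bytes rejected: not sector aligned\n");
    return 0;
}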
1013static void 544static int
1014bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) 545bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
546 struct layout_verification *lv, struct list_head *extents,
547 gfp_t gfp_mask)
1015{ 548{
1016 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; 549 struct pnfs_block_extent *be;
550 struct nfs4_deviceid id;
551 int error;
552 __be32 *p;
1017 553
1018 dprintk("%s enter\n", __func__); 554 p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
1019 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); 555 if (!p)
1020} 556 return -EIO;
1021 557
1022static void free_blk_mountid(struct block_mount_id *mid) 558 be = kzalloc(sizeof(*be), GFP_NOFS);
1023{ 559 if (!be)
1024 if (mid) { 560 return -ENOMEM;
1025 struct pnfs_block_dev *dev, *tmp;
1026 561
1027 /* No need to take bm_lock as we are last user freeing bm_devlist */ 562 memcpy(&id, p, NFS4_DEVICEID4_SIZE);
1028 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { 563 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
1029 list_del(&dev->bm_node); 564
1030 bl_free_block_dev(dev); 565 error = -EIO;
1031 } 566 be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
1032 kfree(mid); 567 lo->plh_lc_cred, gfp_mask);
568 if (!be->be_device)
569 goto out_free_be;
570
571 /*
572 * The next three values are read in as bytes, but stored in the
573 * extent structure in 512-byte granularity.
574 */
575 if (decode_sector_number(&p, &be->be_f_offset) < 0)
576 goto out_put_deviceid;
577 if (decode_sector_number(&p, &be->be_length) < 0)
578 goto out_put_deviceid;
579 if (decode_sector_number(&p, &be->be_v_offset) < 0)
580 goto out_put_deviceid;
581 be->be_state = be32_to_cpup(p++);
582
583 error = verify_extent(be, lv);
584 if (error) {
585 dprintk("%s: extent verification failed\n", __func__);
586 goto out_put_deviceid;
1033 } 587 }
588
589 list_add_tail(&be->be_list, extents);
590 return 0;
591
592out_put_deviceid:
593 nfs4_put_deviceid_node(be->be_device);
594out_free_be:
595 kfree(be);
596 return error;
1034} 597}
1035 598
1036/* This is mostly copied from the filelayout_get_device_info function. 599static struct pnfs_layout_segment *
1037 * It seems much of this should be at the generic pnfs level. 600bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
1038 */ 601 gfp_t gfp_mask)
1039static struct pnfs_block_dev *
1040nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1041 struct nfs4_deviceid *d_id)
1042{ 602{
1043 struct pnfs_device *dev; 603 struct layout_verification lv = {
1044 struct pnfs_block_dev *rv; 604 .mode = lgr->range.iomode,
1045 u32 max_resp_sz; 605 .start = lgr->range.offset >> SECTOR_SHIFT,
1046 int max_pages; 606 .inval = lgr->range.offset >> SECTOR_SHIFT,
1047 struct page **pages = NULL; 607 .cowread = lgr->range.offset >> SECTOR_SHIFT,
1048 int i, rc; 608 };
609 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
610 struct pnfs_layout_segment *lseg;
611 struct xdr_buf buf;
612 struct xdr_stream xdr;
613 struct page *scratch;
614 int status, i;
615 uint32_t count;
616 __be32 *p;
617 LIST_HEAD(extents);
618
619 dprintk("---> %s\n", __func__);
620
621 lseg = kzalloc(sizeof(*lseg), gfp_mask);
622 if (!lseg)
623 return ERR_PTR(-ENOMEM);
624
625 status = -ENOMEM;
626 scratch = alloc_page(gfp_mask);
627 if (!scratch)
628 goto out;
629
630 xdr_init_decode_pages(&xdr, &buf,
631 lgr->layoutp->pages, lgr->layoutp->len);
632 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
633
634 status = -EIO;
635 p = xdr_inline_decode(&xdr, 4);
636 if (unlikely(!p))
637 goto out_free_scratch;
638
639 count = be32_to_cpup(p++);
640 dprintk("%s: number of extents %d\n", __func__, count);
1049 641
1050 /* 642 /*
1051 * Use the session max response size as the basis for setting 643 * Decode individual extents, putting them in temporary staging area
1052 * GETDEVICEINFO's maxcount 644 * until whole layout is decoded to make error recovery easier.
1053 */ 645 */
1054 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 646 for (i = 0; i < count; i++) {
1055 max_pages = nfs_page_array_len(0, max_resp_sz); 647 status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
1056 dprintk("%s max_resp_sz %u max_pages %d\n", 648 if (status)
1057 __func__, max_resp_sz, max_pages); 649 goto process_extents;
1058
1059 dev = kmalloc(sizeof(*dev), GFP_NOFS);
1060 if (!dev) {
1061 dprintk("%s kmalloc failed\n", __func__);
1062 return ERR_PTR(-ENOMEM);
1063 } 650 }
1064 651
1065 pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS); 652 if (lgr->range.offset + lgr->range.length !=
1066 if (pages == NULL) { 653 lv.start << SECTOR_SHIFT) {
1067 kfree(dev); 654 dprintk("%s Final length mismatch\n", __func__);
1068 return ERR_PTR(-ENOMEM); 655 status = -EIO;
656 goto process_extents;
1069 } 657 }
1070 for (i = 0; i < max_pages; i++) { 658
1071 pages[i] = alloc_page(GFP_NOFS); 659 if (lv.start < lv.cowread) {
1072 if (!pages[i]) { 660 dprintk("%s Final uncovered COW extent\n", __func__);
1073 rv = ERR_PTR(-ENOMEM); 661 status = -EIO;
1074 goto out_free;
1075 }
1076 } 662 }
1077 663
1078 memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 664process_extents:
1079 dev->layout_type = LAYOUT_BLOCK_VOLUME; 665 while (!list_empty(&extents)) {
1080 dev->pages = pages; 666 struct pnfs_block_extent *be =
1081 dev->pgbase = 0; 667 list_first_entry(&extents, struct pnfs_block_extent,
1082 dev->pglen = PAGE_SIZE * max_pages; 668 be_list);
1083 dev->mincount = 0; 669 list_del(&be->be_list);
1084 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; 670
1085 671 if (!status)
1086 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 672 status = ext_tree_insert(bl, be);
1087 rc = nfs4_proc_getdeviceinfo(server, dev, NULL); 673
1088 dprintk("%s getdevice info returns %d\n", __func__, rc); 674 if (status) {
1089 if (rc) { 675 nfs4_put_deviceid_node(be->be_device);
1090 rv = ERR_PTR(rc); 676 kfree(be);
1091 goto out_free; 677 }
1092 } 678 }
1093 679
1094 rv = nfs4_blk_decode_device(server, dev); 680out_free_scratch:
1095 out_free: 681 __free_page(scratch);
1096 for (i = 0; i < max_pages; i++) 682out:
1097 __free_page(pages[i]); 683 dprintk("%s returns %d\n", __func__, status);
1098 kfree(pages); 684 if (status) {
1099 kfree(dev); 685 kfree(lseg);
1100 return rv; 686 return ERR_PTR(status);
687 }
688 return lseg;
1101} 689}
1102 690
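The decode loop above deliberately stages extents on a local list and publishes them into the tree only while status is still zero; once an error is seen, the remaining staged extents are simply released. A toy userspace sketch of this stage-then-commit shape (hypothetical names, not the driver's types):

/* Stage entries locally; publish only if no error occurred, else free them. */
#include <stdio.h>
#include <stdlib.h>

struct ext { struct ext *next; int id; };

int main(void)
{
    struct ext *staged = NULL, *published = NULL;
    int status = 0, i;

    for (i = 0; i < 3; i++) {           /* "decode" three extents */
        struct ext *e = malloc(sizeof(*e));
        if (!e) { status = -1; break; }
        e->id = i;
        e->next = staged;
        staged = e;
    }

    while (staged) {                    /* publish or discard every entry */
        struct ext *e = staged;
        staged = e->next;
        if (!status) {
            e->next = published;
            published = e;
        } else {
            free(e);
        }
    }
    printf("status %d\n", status);
    return 0;
}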
1103static int 691static void
1104bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) 692bl_return_range(struct pnfs_layout_hdr *lo,
693 struct pnfs_layout_range *range)
1105{ 694{
1106 struct block_mount_id *b_mt_id = NULL; 695 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
1107 struct pnfs_devicelist *dlist = NULL; 696 sector_t offset = range->offset >> SECTOR_SHIFT, end;
1108 struct pnfs_block_dev *bdev;
1109 LIST_HEAD(block_disklist);
1110 int status, i;
1111
1112 dprintk("%s enter\n", __func__);
1113 697
1114 if (server->pnfs_blksize == 0) { 698 if (range->offset % 8) {
1115 dprintk("%s Server did not return blksize\n", __func__); 699 dprintk("%s: offset %lld not block size aligned\n",
1116 return -EINVAL; 700 __func__, range->offset);
1117 } 701 return;
1118 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
1119 if (!b_mt_id) {
1120 status = -ENOMEM;
1121 goto out_error;
1122 }
1123 /* Initialize nfs4 block layout mount id */
1124 spin_lock_init(&b_mt_id->bm_lock);
1125 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
1126
1127 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
1128 if (!dlist) {
1129 status = -ENOMEM;
1130 goto out_error;
1131 } 702 }
1132 dlist->eof = 0; 703
1133 while (!dlist->eof) { 704 if (range->length != NFS4_MAX_UINT64) {
1134 status = nfs4_proc_getdevicelist(server, fh, dlist); 705 if (range->length % 8) {
1135 if (status) 706 dprintk("%s: length %lld not block size aligned\n",
1136 goto out_error; 707 __func__, range->length);
1137 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", 708 return;
1138 __func__, dlist->num_devs, dlist->eof);
1139 for (i = 0; i < dlist->num_devs; i++) {
1140 bdev = nfs4_blk_get_deviceinfo(server, fh,
1141 &dlist->dev_id[i]);
1142 if (IS_ERR(bdev)) {
1143 status = PTR_ERR(bdev);
1144 goto out_error;
1145 }
1146 spin_lock(&b_mt_id->bm_lock);
1147 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
1148 spin_unlock(&b_mt_id->bm_lock);
1149 } 709 }
1150 }
1151 dprintk("%s SUCCESS\n", __func__);
1152 server->pnfs_ld_data = b_mt_id;
1153 710
1154 out_return: 711 end = offset + (range->length >> SECTOR_SHIFT);
1155 kfree(dlist); 712 } else {
1156 return status; 713 end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
714 }
1157 715
1158 out_error: 716 ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
1159 free_blk_mountid(b_mt_id);
1160 goto out_return;
1161} 717}
1162 718
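The byte-to-sector conversion above has one special case: a length of NFS4_MAX_UINT64 means "to end of file", and the end value is rounded down to a page multiple so it stays representable. A hedged userspace rendering of the same arithmetic (PAGE_SIZE_ is a local stand-in for the kernel's PAGE_SIZE):

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT    9
#define NFS4_MAX_UINT64 (~0ULL)
#define PAGE_SIZE_      4096ULL

int main(void)
{
    uint64_t offset = 8192, length = NFS4_MAX_UINT64;
    uint64_t start = offset >> SECTOR_SHIFT, end;

    if (length != NFS4_MAX_UINT64)
        end = start + (length >> SECTOR_SHIFT);
    else    /* whole file: round down so "end" stays representable */
        end = (NFS4_MAX_UINT64 / PAGE_SIZE_) * PAGE_SIZE_;

    printf("return sectors [%llu, %llu)\n",
           (unsigned long long)start, (unsigned long long)end);
    return 0;
}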
1163static int 719static int
1164bl_clear_layoutdriver(struct nfs_server *server) 720bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
721{
722 return ext_tree_prepare_commit(arg);
723}
724
725static void
726bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
1165{ 727{
1166 struct block_mount_id *b_mt_id = server->pnfs_ld_data; 728 ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
729}
1167 730
731static int
732bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
733{
1168 dprintk("%s enter\n", __func__); 734 dprintk("%s enter\n", __func__);
1169 free_blk_mountid(b_mt_id); 735
1170 dprintk("%s RETURNS\n", __func__); 736 if (server->pnfs_blksize == 0) {
737 dprintk("%s Server did not return blksize\n", __func__);
738 return -EINVAL;
739 }
740 if (server->pnfs_blksize > PAGE_SIZE) {
741 printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
742 __func__, server->pnfs_blksize);
743 return -EINVAL;
744 }
745
1171 return 0; 746 return 0;
1172} 747}
1173 748
1174static bool 749static bool
1175is_aligned_req(struct nfs_page *req, unsigned int alignment) 750is_aligned_req(struct nfs_pageio_descriptor *pgio,
751 struct nfs_page *req, unsigned int alignment)
1176{ 752{
1177 return IS_ALIGNED(req->wb_offset, alignment) && 753 /*
1178 IS_ALIGNED(req->wb_bytes, alignment); 754 * Always accept buffered writes, higher layers take care of the
755 * right alignment.
756 */
757 if (pgio->pg_dreq == NULL)
758 return true;
759
760 if (!IS_ALIGNED(req->wb_offset, alignment))
761 return false;
762
763 if (IS_ALIGNED(req->wb_bytes, alignment))
764 return true;
765
766 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
767 /*
768 * If the write goes up to the inode size, just write
769 * the full page. Data past the inode size is
770 * guaranteed to be zeroed by the higher level client
771 * code, and this behaviour is mandated by RFC 5663
772 * section 2.3.2.
773 */
774 return true;
775 }
776
777 return false;
1179} 778}
1180 779
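Concretely, a direct-I/O request passes if both its offset and length are aligned, or if its unaligned tail lands exactly on the inode size (RFC 5663 section 2.3.2 guarantees data past the inode size reads as zero). A userspace sketch of the predicate, with assumed simplified parameters:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

static bool aligned_req(bool direct_io, uint64_t offset, uint64_t bytes,
                        uint64_t isize, uint64_t alignment)
{
    if (!direct_io)             /* buffered I/O is always accepted */
        return true;
    if (offset % alignment)
        return false;
    if (bytes % alignment == 0)
        return true;
    /* An unaligned tail is fine iff it ends exactly at the file size. */
    return offset + bytes == isize;
}

int main(void)
{
    printf("%d\n", aligned_req(true, 0, 4096, 8192, 4096)); /* 1: aligned */
    printf("%d\n", aligned_req(true, 0, 3000, 3000, 4096)); /* 1: ends at EOF */
    printf("%d\n", aligned_req(true, 0, 3000, 8192, 4096)); /* 0: short of EOF */
    return 0;
}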
1181static void 780static void
1182bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 781bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1183{ 782{
1184 if (pgio->pg_dreq != NULL && 783 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
1185 !is_aligned_req(req, SECTOR_SIZE))
1186 nfs_pageio_reset_read_mds(pgio); 784 nfs_pageio_reset_read_mds(pgio);
1187 else 785 return;
1188 pnfs_generic_pg_init_read(pgio, req); 786 }
787
788 pnfs_generic_pg_init_read(pgio, req);
1189} 789}
1190 790
1191/* 791/*
@@ -1196,10 +796,8 @@ static size_t
1196bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 796bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1197 struct nfs_page *req) 797 struct nfs_page *req)
1198{ 798{
1199 if (pgio->pg_dreq != NULL && 799 if (!is_aligned_req(pgio, req, SECTOR_SIZE))
1200 !is_aligned_req(req, SECTOR_SIZE))
1201 return 0; 800 return 0;
1202
1203 return pnfs_generic_pg_test(pgio, prev, req); 801 return pnfs_generic_pg_test(pgio, prev, req);
1204} 802}
1205 803
@@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1229static void 827static void
1230bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 828bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1231{ 829{
1232 if (pgio->pg_dreq != NULL && 830 u64 wb_size;
1233 !is_aligned_req(req, PAGE_CACHE_SIZE)) { 831
832 if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
1234 nfs_pageio_reset_write_mds(pgio); 833 nfs_pageio_reset_write_mds(pgio);
1235 } else { 834 return;
1236 u64 wb_size;
1237 if (pgio->pg_dreq == NULL)
1238 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
1239 req->wb_index);
1240 else
1241 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1242
1243 pnfs_generic_pg_init_write(pgio, req, wb_size);
1244 } 835 }
836
837 if (pgio->pg_dreq == NULL)
838 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
839 req->wb_index);
840 else
841 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
842
843 pnfs_generic_pg_init_write(pgio, req, wb_size);
1245} 844}
1246 845
1247/* 846/*
@@ -1252,10 +851,8 @@ static size_t
1252bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 851bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1253 struct nfs_page *req) 852 struct nfs_page *req)
1254{ 853{
1255 if (pgio->pg_dreq != NULL && 854 if (!is_aligned_req(pgio, req, PAGE_SIZE))
1256 !is_aligned_req(req, PAGE_CACHE_SIZE))
1257 return 0; 855 return 0;
1258
1259 return pnfs_generic_pg_test(pgio, prev, req); 856 return pnfs_generic_pg_test(pgio, prev, req);
1260} 857}
1261 858
@@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
1275 .id = LAYOUT_BLOCK_VOLUME, 872 .id = LAYOUT_BLOCK_VOLUME,
1276 .name = "LAYOUT_BLOCK_VOLUME", 873 .name = "LAYOUT_BLOCK_VOLUME",
1277 .owner = THIS_MODULE, 874 .owner = THIS_MODULE,
875 .flags = PNFS_LAYOUTRET_ON_SETATTR |
876 PNFS_READ_WHOLE_PAGE,
1278 .read_pagelist = bl_read_pagelist, 877 .read_pagelist = bl_read_pagelist,
1279 .write_pagelist = bl_write_pagelist, 878 .write_pagelist = bl_write_pagelist,
1280 .alloc_layout_hdr = bl_alloc_layout_hdr, 879 .alloc_layout_hdr = bl_alloc_layout_hdr,
1281 .free_layout_hdr = bl_free_layout_hdr, 880 .free_layout_hdr = bl_free_layout_hdr,
1282 .alloc_lseg = bl_alloc_lseg, 881 .alloc_lseg = bl_alloc_lseg,
1283 .free_lseg = bl_free_lseg, 882 .free_lseg = bl_free_lseg,
1284 .encode_layoutcommit = bl_encode_layoutcommit, 883 .return_range = bl_return_range,
884 .prepare_layoutcommit = bl_prepare_layoutcommit,
1285 .cleanup_layoutcommit = bl_cleanup_layoutcommit, 885 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
1286 .set_layoutdriver = bl_set_layoutdriver, 886 .set_layoutdriver = bl_set_layoutdriver,
1287 .clear_layoutdriver = bl_clear_layoutdriver, 887 .alloc_deviceid_node = bl_alloc_deviceid_node,
888 .free_deviceid_node = bl_free_deviceid_node,
1288 .pg_read_ops = &bl_pg_read_ops, 889 .pg_read_ops = &bl_pg_read_ops,
1289 .pg_write_ops = &bl_pg_write_ops, 890 .pg_write_ops = &bl_pg_write_ops,
1290}; 891};
1291 892
1292static const struct rpc_pipe_ops bl_upcall_ops = {
1293 .upcall = rpc_pipe_generic_upcall,
1294 .downcall = bl_pipe_downcall,
1295 .destroy_msg = bl_pipe_destroy_msg,
1296};
1297
1298static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1299 struct rpc_pipe *pipe)
1300{
1301 struct dentry *dir, *dentry;
1302
1303 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1304 if (dir == NULL)
1305 return ERR_PTR(-ENOENT);
1306 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1307 dput(dir);
1308 return dentry;
1309}
1310
1311static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1312 struct rpc_pipe *pipe)
1313{
1314 if (pipe->dentry)
1315 rpc_unlink(pipe->dentry);
1316}
1317
1318static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1319 void *ptr)
1320{
1321 struct super_block *sb = ptr;
1322 struct net *net = sb->s_fs_info;
1323 struct nfs_net *nn = net_generic(net, nfs_net_id);
1324 struct dentry *dentry;
1325 int ret = 0;
1326
1327 if (!try_module_get(THIS_MODULE))
1328 return 0;
1329
1330 if (nn->bl_device_pipe == NULL) {
1331 module_put(THIS_MODULE);
1332 return 0;
1333 }
1334
1335 switch (event) {
1336 case RPC_PIPEFS_MOUNT:
1337 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1338 if (IS_ERR(dentry)) {
1339 ret = PTR_ERR(dentry);
1340 break;
1341 }
1342 nn->bl_device_pipe->dentry = dentry;
1343 break;
1344 case RPC_PIPEFS_UMOUNT:
1345 if (nn->bl_device_pipe->dentry)
1346 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1347 break;
1348 default:
1349 ret = -ENOTSUPP;
1350 break;
1351 }
1352 module_put(THIS_MODULE);
1353 return ret;
1354}
1355
1356static struct notifier_block nfs4blocklayout_block = {
1357 .notifier_call = rpc_pipefs_event,
1358};
1359
1360static struct dentry *nfs4blocklayout_register_net(struct net *net,
1361 struct rpc_pipe *pipe)
1362{
1363 struct super_block *pipefs_sb;
1364 struct dentry *dentry;
1365
1366 pipefs_sb = rpc_get_sb_net(net);
1367 if (!pipefs_sb)
1368 return NULL;
1369 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1370 rpc_put_sb_net(net);
1371 return dentry;
1372}
1373
1374static void nfs4blocklayout_unregister_net(struct net *net,
1375 struct rpc_pipe *pipe)
1376{
1377 struct super_block *pipefs_sb;
1378
1379 pipefs_sb = rpc_get_sb_net(net);
1380 if (pipefs_sb) {
1381 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1382 rpc_put_sb_net(net);
1383 }
1384}
1385
1386static int nfs4blocklayout_net_init(struct net *net)
1387{
1388 struct nfs_net *nn = net_generic(net, nfs_net_id);
1389 struct dentry *dentry;
1390
1391 init_waitqueue_head(&nn->bl_wq);
1392 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1393 if (IS_ERR(nn->bl_device_pipe))
1394 return PTR_ERR(nn->bl_device_pipe);
1395 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1396 if (IS_ERR(dentry)) {
1397 rpc_destroy_pipe_data(nn->bl_device_pipe);
1398 return PTR_ERR(dentry);
1399 }
1400 nn->bl_device_pipe->dentry = dentry;
1401 return 0;
1402}
1403
1404static void nfs4blocklayout_net_exit(struct net *net)
1405{
1406 struct nfs_net *nn = net_generic(net, nfs_net_id);
1407
1408 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1409 rpc_destroy_pipe_data(nn->bl_device_pipe);
1410 nn->bl_device_pipe = NULL;
1411}
1412
1413static struct pernet_operations nfs4blocklayout_net_ops = {
1414 .init = nfs4blocklayout_net_init,
1415 .exit = nfs4blocklayout_net_exit,
1416};
1417
1418static int __init nfs4blocklayout_init(void) 893static int __init nfs4blocklayout_init(void)
1419{ 894{
1420 int ret; 895 int ret;
@@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
1424 ret = pnfs_register_layoutdriver(&blocklayout_type); 899 ret = pnfs_register_layoutdriver(&blocklayout_type);
1425 if (ret) 900 if (ret)
1426 goto out; 901 goto out;
1427 902 ret = bl_init_pipefs();
1428 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1429 if (ret) 903 if (ret)
1430 goto out_remove; 904 goto out_unregister;
1431 ret = register_pernet_subsys(&nfs4blocklayout_net_ops); 905 return 0;
1432 if (ret)
1433 goto out_notifier;
1434out:
1435 return ret;
1436 906
1437out_notifier: 907out_unregister:
1438 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1439out_remove:
1440 pnfs_unregister_layoutdriver(&blocklayout_type); 908 pnfs_unregister_layoutdriver(&blocklayout_type);
909out:
1441 return ret; 910 return ret;
1442} 911}
1443 912
@@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
1446 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 915 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1447 __func__); 916 __func__);
1448 917
1449 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 918 bl_cleanup_pipefs();
1450 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1451 pnfs_unregister_layoutdriver(&blocklayout_type); 919 pnfs_unregister_layoutdriver(&blocklayout_type);
1452} 920}
1453 921
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
45#define SECTOR_SIZE (1 << SECTOR_SHIFT) 45#define SECTOR_SIZE (1 << SECTOR_SHIFT)
46 46
47struct block_mount_id { 47struct pnfs_block_dev;
48 spinlock_t bm_lock; /* protects list */
49 struct list_head bm_devlist; /* holds pnfs_block_dev */
50};
51 48
52struct pnfs_block_dev { 49enum pnfs_block_volume_type {
53 struct list_head bm_node; 50 PNFS_BLOCK_VOLUME_SIMPLE = 0,
54 struct nfs4_deviceid bm_mdevid; /* associated devid */ 51 PNFS_BLOCK_VOLUME_SLICE = 1,
55 struct block_device *bm_mdev; /* meta device itself */ 52 PNFS_BLOCK_VOLUME_CONCAT = 2,
56 struct net *net; 53 PNFS_BLOCK_VOLUME_STRIPE = 3,
57}; 54};
58 55
59enum exstate4 { 56#define PNFS_BLOCK_MAX_UUIDS 4
60 PNFS_BLOCK_READWRITE_DATA = 0, 57#define PNFS_BLOCK_MAX_DEVICES 64
61 PNFS_BLOCK_READ_DATA = 1, 58
62 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ 59/*
63 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ 60 * Random upper cap for the uuid length to avoid unbounded allocation.
61 * Not actually limited by the protocol.
62 */
63#define PNFS_BLOCK_UUID_LEN 128
64
65
66struct pnfs_block_volume {
67 enum pnfs_block_volume_type type;
68 union {
69 struct {
70 int len;
71 int nr_sigs;
72 struct {
73 u64 offset;
74 u32 sig_len;
75 u8 sig[PNFS_BLOCK_UUID_LEN];
76 } sigs[PNFS_BLOCK_MAX_UUIDS];
77 } simple;
78 struct {
79 u64 start;
80 u64 len;
81 u32 volume;
82 } slice;
83 struct {
84 u32 volumes_count;
85 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
86 } concat;
87 struct {
88 u64 chunk_size;
89 u32 volumes_count;
90 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
91 } stripe;
92 };
64}; 93};
65 94
66#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ 95struct pnfs_block_dev_map {
96 sector_t start;
97 sector_t len;
67 98
68struct my_tree { 99 sector_t disk_offset;
69 sector_t mtt_step_size; /* Internal sector alignment */ 100 struct block_device *bdev;
70 struct list_head mtt_stub; /* Should be a radix tree */
71}; 101};
72 102
73struct pnfs_inval_markings { 103struct pnfs_block_dev {
74 spinlock_t im_lock; 104 struct nfs4_deviceid_node node;
75 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 105
76 sector_t im_block_size; /* Server blocksize in sectors */ 106 u64 start;
77 struct list_head im_extents; /* Short extents for INVAL->RW conversion */ 107 u64 len;
108
109 u32 nr_children;
110 struct pnfs_block_dev *children;
111 u64 chunk_size;
112
113 struct block_device *bdev;
114 u64 disk_offset;
115
116 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
117 struct pnfs_block_dev_map *map);
78}; 118};
79 119
80struct pnfs_inval_tracking { 120enum exstate4 {
81 struct list_head it_link; 121 PNFS_BLOCK_READWRITE_DATA = 0,
82 int it_sector; 122 PNFS_BLOCK_READ_DATA = 1,
83 int it_tags; 123 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
124 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
84}; 125};
85 126
86/* sector_t fields are all in 512-byte sectors */ 127/* sector_t fields are all in 512-byte sectors */
87struct pnfs_block_extent { 128struct pnfs_block_extent {
88 struct kref be_refcnt; 129 union {
89 struct list_head be_node; /* link into lseg list */ 130 struct rb_node be_node;
90 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ 131 struct list_head be_list;
91 struct block_device *be_mdev; 132 };
133 struct nfs4_deviceid_node *be_device;
92 sector_t be_f_offset; /* the starting offset in the file */ 134 sector_t be_f_offset; /* the starting offset in the file */
93 sector_t be_length; /* the size of the extent */ 135 sector_t be_length; /* the size of the extent */
94 sector_t be_v_offset; /* the starting offset in the volume */ 136 sector_t be_v_offset; /* the starting offset in the volume */
95 enum exstate4 be_state; /* the state of this extent */ 137 enum exstate4 be_state; /* the state of this extent */
96 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ 138#define EXTENT_WRITTEN 1
139#define EXTENT_COMMITTING 2
140 unsigned int be_tag;
97}; 141};
98 142
99/* Shortened extent used by LAYOUTCOMMIT */ 143/* on the wire size of the extent */
100struct pnfs_block_short_extent { 144#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
101 struct list_head bse_node;
102 struct nfs4_deviceid bse_devid;
103 struct block_device *bse_mdev;
104 sector_t bse_f_offset; /* the starting offset in the file */
105 sector_t bse_length; /* the size of the extent */
106};
107
108static inline void
109BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
110{
111 spin_lock_init(&marks->im_lock);
112 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
113 INIT_LIST_HEAD(&marks->im_extents);
114 marks->im_block_size = blocksize;
115 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
116 blocksize);
117}
118
119enum extentclass4 {
120	RW_EXTENT = 0, /* READWRITE and INVAL */
121 RO_EXTENT = 1, /* READ and NONE */
122 EXTENT_LISTS = 2,
123};
124
125static inline int bl_choose_list(enum exstate4 state)
126{
127 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
128 return RO_EXTENT;
129 else
130 return RW_EXTENT;
131}
132 145
133struct pnfs_block_layout { 146struct pnfs_block_layout {
134 struct pnfs_layout_hdr bl_layout; 147 struct pnfs_layout_hdr bl_layout;
135 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ 148 struct rb_root bl_ext_rw;
149 struct rb_root bl_ext_ro;
136 spinlock_t bl_ext_lock; /* Protects list manipulation */ 150 spinlock_t bl_ext_lock; /* Protects list manipulation */
137 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
138 struct list_head bl_commit; /* Needs layout commit */
139 struct list_head bl_committing; /* Layout committing */
140 unsigned int bl_count; /* entries in bl_commit */
141 sector_t bl_blocksize; /* Server blocksize in sectors */
142}; 151};
143 152
144#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
145
146static inline struct pnfs_block_layout * 153static inline struct pnfs_block_layout *
147BLK_LO2EXT(struct pnfs_layout_hdr *lo) 154BLK_LO2EXT(struct pnfs_layout_hdr *lo)
148{ 155{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
171#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ 178#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
172#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 179#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
173 180
174/* blocklayoutdev.c */ 181/* dev.c */
175ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 182struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
176void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 183 struct pnfs_device *pdev, gfp_t gfp_mask);
177void nfs4_blkdev_put(struct block_device *bdev); 184void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 185
179 struct pnfs_device *dev); 186/* extent_tree.c */
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 187int ext_tree_insert(struct pnfs_block_layout *bl,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 188 struct pnfs_block_extent *new);
182 189int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
183/* blocklayoutdm.c */ 190 sector_t end);
184void bl_free_block_dev(struct pnfs_block_dev *bdev); 191int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
185 192 sector_t len);
186/* extents.c */ 193bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
187struct pnfs_block_extent * 194 struct pnfs_block_extent *ret, bool rw);
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 195int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
189 struct pnfs_block_extent **cow_read); 196void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 197
191 sector_t offset, sector_t length); 198/* rpc_pipefs.c */
192void bl_put_extent(struct pnfs_block_extent *be); 199dev_t bl_resolve_deviceid(struct nfs_server *server,
193struct pnfs_block_extent *bl_alloc_extent(void); 200 struct pnfs_block_volume *b, gfp_t gfp_mask);
194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 201int __init bl_init_pipefs(void);
195int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 202void __exit bl_cleanup_pipefs(void);
196 struct xdr_stream *xdr,
197 const struct nfs4_layoutcommit_args *arg);
198void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
199 const struct nfs4_layoutcommit_args *arg,
200 int status);
201int bl_add_merge_extent(struct pnfs_block_layout *bl,
202 struct pnfs_block_extent *new);
203int bl_mark_for_commit(struct pnfs_block_extent *be,
204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
210 203
211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 204#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 file layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/*
57 * Release the block device
58 */
59void nfs4_blkdev_put(struct block_device *bdev)
60{
61 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
62 MINOR(bdev->bd_dev));
63 blkdev_put(bdev, FMODE_READ);
64}
65
66ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
67 size_t mlen)
68{
69 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
70 nfs_net_id);
71
72 if (mlen != sizeof (struct bl_dev_msg))
73 return -EINVAL;
74
75 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
76 return -EFAULT;
77
78 wake_up(&nn->bl_wq);
79
80 return mlen;
81}
82
83void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
84{
85 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
86
87 if (msg->errno >= 0)
88 return;
89 wake_up(bl_pipe_msg->bl_wq);
90}
91
92/*
93 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
94 */
95struct pnfs_block_dev *
96nfs4_blk_decode_device(struct nfs_server *server,
97 struct pnfs_device *dev)
98{
99 struct pnfs_block_dev *rv;
100 struct block_device *bd = NULL;
101 struct bl_pipe_msg bl_pipe_msg;
102 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
103 struct bl_msg_hdr bl_msg = {
104 .type = BL_DEVICE_MOUNT,
105 .totallen = dev->mincount,
106 };
107 uint8_t *dataptr;
108 DECLARE_WAITQUEUE(wq, current);
109 int offset, len, i, rc;
110 struct net *net = server->nfs_client->cl_net;
111 struct nfs_net *nn = net_generic(net, nfs_net_id);
112 struct bl_dev_msg *reply = &nn->bl_mount_reply;
113
114 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
115 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
116 dev->mincount);
117
118 bl_pipe_msg.bl_wq = &nn->bl_wq;
119 memset(msg, 0, sizeof(*msg));
120 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
121 if (!msg->data) {
122 rv = ERR_PTR(-ENOMEM);
123 goto out;
124 }
125
126 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
127 dataptr = (uint8_t *) msg->data;
128 len = dev->mincount;
129 offset = sizeof(bl_msg);
130 for (i = 0; len > 0; i++) {
131 memcpy(&dataptr[offset], page_address(dev->pages[i]),
132 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
133 len -= PAGE_CACHE_SIZE;
134 offset += PAGE_CACHE_SIZE;
135 }
136 msg->len = sizeof(bl_msg) + dev->mincount;
137
138 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
139 add_wait_queue(&nn->bl_wq, &wq);
140 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
141 if (rc < 0) {
142 remove_wait_queue(&nn->bl_wq, &wq);
143 rv = ERR_PTR(rc);
144 goto out;
145 }
146
147 set_current_state(TASK_UNINTERRUPTIBLE);
148 schedule();
149 __set_current_state(TASK_RUNNING);
150 remove_wait_queue(&nn->bl_wq, &wq);
151
152 if (reply->status != BL_DEVICE_REQUEST_PROC) {
153 dprintk("%s failed to open device: %d\n",
154 __func__, reply->status);
155 rv = ERR_PTR(-EINVAL);
156 goto out;
157 }
158
159 bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
160 FMODE_READ, NULL);
161 if (IS_ERR(bd)) {
162 dprintk("%s failed to open device : %ld\n", __func__,
163 PTR_ERR(bd));
164 rv = ERR_CAST(bd);
165 goto out;
166 }
167
168 rv = kzalloc(sizeof(*rv), GFP_NOFS);
169 if (!rv) {
170 rv = ERR_PTR(-ENOMEM);
171 goto out;
172 }
173
174 rv->bm_mdev = bd;
175 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
176 rv->net = net;
177 dprintk("%s Created device %s with bd_block_size %u\n",
178 __func__,
179 bd->bd_disk->disk_name,
180 bd->bd_block_size);
181
182out:
183 kfree(msg->data);
184 return rv;
185}
186
187/* Map deviceid returned by the server to constructed block_device */
188static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
189 struct nfs4_deviceid *id)
190{
191 struct block_device *rv = NULL;
192 struct block_mount_id *mid;
193 struct pnfs_block_dev *dev;
194
195 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
196 mid = BLK_ID(lo);
197 spin_lock(&mid->bm_lock);
198 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
199 if (memcmp(id->data, dev->bm_mdevid.data,
200 NFS4_DEVICEID4_SIZE) == 0) {
201 rv = dev->bm_mdev;
202 goto out;
203 }
204 }
205 out:
206 spin_unlock(&mid->bm_lock);
207 dprintk("%s returning %p\n", __func__, rv);
208 return rv;
209}
210
211/* Tracks info needed to ensure extents in layout obey constraints of spec */
212struct layout_verification {
213 u32 mode; /* R or RW */
214 u64 start; /* Expected start of next non-COW extent */
215 u64 inval; /* Start of INVAL coverage */
216 u64 cowread; /* End of COW read coverage */
217};
218
219/* Verify the extent meets the layout requirements of the pnfs-block draft,
220 * section 2.3.1.
221 */
222static int verify_extent(struct pnfs_block_extent *be,
223 struct layout_verification *lv)
224{
225 if (lv->mode == IOMODE_READ) {
226 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
227 be->be_state == PNFS_BLOCK_INVALID_DATA)
228 return -EIO;
229 if (be->be_f_offset != lv->start)
230 return -EIO;
231 lv->start += be->be_length;
232 return 0;
233 }
234 /* lv->mode == IOMODE_RW */
235 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
236 if (be->be_f_offset != lv->start)
237 return -EIO;
238 if (lv->cowread > lv->start)
239 return -EIO;
240 lv->start += be->be_length;
241 lv->inval = lv->start;
242 return 0;
243 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
244 if (be->be_f_offset != lv->start)
245 return -EIO;
246 lv->start += be->be_length;
247 return 0;
248 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
249 if (be->be_f_offset > lv->start)
250 return -EIO;
251 if (be->be_f_offset < lv->inval)
252 return -EIO;
253 if (be->be_f_offset < lv->cowread)
254 return -EIO;
255 /* It looks like you might want to min this with lv->start,
256 * but you really don't.
257 */
258 lv->inval = lv->inval + be->be_length;
259 lv->cowread = be->be_f_offset + be->be_length;
260 return 0;
261 } else
262 return -EIO;
263}
264
265/* XDR decode pnfs_block_layout4 structure */
266int
267nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
268 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
269{
270 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
271 int i, status = -EIO;
272 uint32_t count;
273 struct pnfs_block_extent *be = NULL, *save;
274 struct xdr_stream stream;
275 struct xdr_buf buf;
276 struct page *scratch;
277 __be32 *p;
278 struct layout_verification lv = {
279 .mode = lgr->range.iomode,
280 .start = lgr->range.offset >> SECTOR_SHIFT,
281 .inval = lgr->range.offset >> SECTOR_SHIFT,
282 .cowread = lgr->range.offset >> SECTOR_SHIFT,
283 };
284 LIST_HEAD(extents);
285
286 dprintk("---> %s\n", __func__);
287
288 scratch = alloc_page(gfp_flags);
289 if (!scratch)
290 return -ENOMEM;
291
292 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
293 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
294
295 p = xdr_inline_decode(&stream, 4);
296 if (unlikely(!p))
297 goto out_err;
298
299 count = be32_to_cpup(p++);
300
301 dprintk("%s enter, number of extents %i\n", __func__, count);
302 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
303 if (unlikely(!p))
304 goto out_err;
305
306 /* Decode individual extents, putting them in temporary
307 * staging area until whole layout is decoded to make error
308 * recovery easier.
309 */
310 for (i = 0; i < count; i++) {
311 be = bl_alloc_extent();
312 if (!be) {
313 status = -ENOMEM;
314 goto out_err;
315 }
316 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
317 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
318 be->be_mdev = translate_devid(lo, &be->be_devid);
319 if (!be->be_mdev)
320 goto out_err;
321
322 /* The next three values are read in as bytes,
323 * but stored as 512-byte sector lengths
324 */
325 if (decode_sector_number(&p, &be->be_f_offset) < 0)
326 goto out_err;
327 if (decode_sector_number(&p, &be->be_length) < 0)
328 goto out_err;
329 if (decode_sector_number(&p, &be->be_v_offset) < 0)
330 goto out_err;
331 be->be_state = be32_to_cpup(p++);
332 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
333 be->be_inval = &bl->bl_inval;
334 if (verify_extent(be, &lv)) {
335 dprintk("%s verify failed\n", __func__);
336 goto out_err;
337 }
338 list_add_tail(&be->be_node, &extents);
339 }
340 if (lgr->range.offset + lgr->range.length !=
341 lv.start << SECTOR_SHIFT) {
342 dprintk("%s Final length mismatch\n", __func__);
343 be = NULL;
344 goto out_err;
345 }
346 if (lv.start < lv.cowread) {
347 dprintk("%s Final uncovered COW extent\n", __func__);
348 be = NULL;
349 goto out_err;
350 }
351 /* Extents decoded properly, now try to merge them in to
352 * existing layout extents.
353 */
354 spin_lock(&bl->bl_ext_lock);
355 list_for_each_entry_safe(be, save, &extents, be_node) {
356 list_del(&be->be_node);
357 status = bl_add_merge_extent(bl, be);
358 if (status) {
359 spin_unlock(&bl->bl_ext_lock);
360 /* This is a fairly catastrophic error, as the
361 * entire layout extent lists are now corrupted.
362 * We should have some way to distinguish this.
363 */
364 be = NULL;
365 goto out_err;
366 }
367 }
368 spin_unlock(&bl->bl_ext_lock);
369 status = 0;
370 out:
371 __free_page(scratch);
372 dprintk("%s returns %i\n", __func__, status);
373 return status;
374
375 out_err:
376 bl_put_extent(be);
377 while (!list_empty(&extents)) {
378 be = list_first_entry(&extents, struct pnfs_block_extent,
379 be_node);
380 list_del(&be->be_node);
381 bl_put_extent(be);
382 }
383 goto out;
384}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(struct net *net, dev_t dev)
42{
43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
45 struct bl_dev_msg bl_umount_request;
46 struct bl_msg_hdr bl_msg = {
47 .type = BL_DEVICE_UMOUNT,
48 .totallen = sizeof(bl_umount_request),
49 };
50 uint8_t *dataptr;
51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
53
54 dprintk("Entering %s\n", __func__);
55
56 bl_pipe_msg.bl_wq = &nn->bl_wq;
57 memset(msg, 0, sizeof(*msg));
58 msg->len = sizeof(bl_msg) + bl_msg.totallen;
59 msg->data = kzalloc(msg->len, GFP_NOFS);
60 if (!msg->data)
61 goto out;
62
63 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
64 bl_umount_request.major = MAJOR(dev);
65 bl_umount_request.minor = MINOR(dev);
66
67 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
68 dataptr = (uint8_t *) msg->data;
69 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
70
71 add_wait_queue(&nn->bl_wq, &wq);
72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
73 remove_wait_queue(&nn->bl_wq, &wq);
74 goto out;
75 }
76
77 set_current_state(TASK_UNINTERRUPTIBLE);
78 schedule();
79 __set_current_state(TASK_RUNNING);
80 remove_wait_queue(&nn->bl_wq, &wq);
81
82out:
83 kfree(msg->data);
84}
85
86/*
87 * Release meta device
88 */
89static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90{
91 dprintk("%s Releasing\n", __func__);
92 nfs4_blkdev_put(bdev->bm_mdev);
93 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
94}
95
96void bl_free_block_dev(struct pnfs_block_dev *bdev)
97{
98 if (bdev) {
99 if (bdev->bm_mdev) {
100 dprintk("%s Removing DM device: %d:%d\n",
101 __func__,
102 MAJOR(bdev->bm_mdev->bd_dev),
103 MINOR(bdev->bm_mdev->bd_dev));
104 nfs4_blk_metadev_release(bdev);
105 }
106 kfree(bdev);
107 }
108}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h>
6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h>
9
10#include "blocklayout.h"
11
12#define NFSDBG_FACILITY NFSDBG_PNFS_LD
13
14static void
15bl_free_device(struct pnfs_block_dev *dev)
16{
17 if (dev->nr_children) {
18 int i;
19
20 for (i = 0; i < dev->nr_children; i++)
21 bl_free_device(&dev->children[i]);
22 kfree(dev->children);
23 } else {
24 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ);
26 }
27}
28
29void
30bl_free_deviceid_node(struct nfs4_deviceid_node *d)
31{
32 struct pnfs_block_dev *dev =
33 container_of(d, struct pnfs_block_dev, node);
34
35 bl_free_device(dev);
36 kfree(dev);
37}
38
39static int
40nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
41{
42 __be32 *p;
43 int i;
44
45 p = xdr_inline_decode(xdr, 4);
46 if (!p)
47 return -EIO;
48 b->type = be32_to_cpup(p++);
49
50 switch (b->type) {
51 case PNFS_BLOCK_VOLUME_SIMPLE:
52 p = xdr_inline_decode(xdr, 4);
53 if (!p)
54 return -EIO;
55 b->simple.nr_sigs = be32_to_cpup(p++);
56 if (!b->simple.nr_sigs) {
57 dprintk("no signature\n");
58 return -EIO;
59 }
60
61 b->simple.len = 4 + 4;
62 for (i = 0; i < b->simple.nr_sigs; i++) {
63 p = xdr_inline_decode(xdr, 8 + 4);
64 if (!p)
65 return -EIO;
66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
67 b->simple.sigs[i].sig_len = be32_to_cpup(p++);
68
69 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
70 if (!p)
71 return -EIO;
72 memcpy(&b->simple.sigs[i].sig, p,
73 b->simple.sigs[i].sig_len);
74
75 b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
76 }
77 break;
78 case PNFS_BLOCK_VOLUME_SLICE:
79 p = xdr_inline_decode(xdr, 8 + 8 + 4);
80 if (!p)
81 return -EIO;
82 p = xdr_decode_hyper(p, &b->slice.start);
83 p = xdr_decode_hyper(p, &b->slice.len);
84 b->slice.volume = be32_to_cpup(p++);
85 break;
86 case PNFS_BLOCK_VOLUME_CONCAT:
87 p = xdr_inline_decode(xdr, 4);
88 if (!p)
89 return -EIO;
90 b->concat.volumes_count = be32_to_cpup(p++);
91
92 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
93 if (!p)
94 return -EIO;
95 for (i = 0; i < b->concat.volumes_count; i++)
96 b->concat.volumes[i] = be32_to_cpup(p++);
97 break;
98 case PNFS_BLOCK_VOLUME_STRIPE:
99 p = xdr_inline_decode(xdr, 8 + 4);
100 if (!p)
101 return -EIO;
102 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
103 b->stripe.volumes_count = be32_to_cpup(p++);
104
105 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
106 if (!p)
107 return -EIO;
108 for (i = 0; i < b->stripe.volumes_count; i++)
109 b->stripe.volumes[i] = be32_to_cpup(p++);
110 break;
111 default:
112 dprintk("unknown volume type!\n");
113 return -EIO;
114 }
115
116 return 0;
117}
118
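Each volume entry is self-describing XDR: a 4-byte type discriminator followed by type-specific fields. As an illustration (not part of this patch), a userspace encoder for one SLICE entry matching the layout decoded above — type, 8-byte start, 8-byte length, 4-byte index of the underlying volume:

#include <stdio.h>
#include <stdint.h>

static uint8_t *put32(uint8_t *p, uint32_t v)
{
    p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
    return p + 4;
}

static uint8_t *put64(uint8_t *p, uint64_t v)
{
    p = put32(p, v >> 32);
    return put32(p, (uint32_t)v);
}

int main(void)
{
    uint8_t buf[24], *p = buf;

    p = put32(p, 1);            /* PNFS_BLOCK_VOLUME_SLICE */
    p = put64(p, 0);            /* start */
    p = put64(p, 1 << 20);      /* len */
    p = put32(p, 0);            /* index of the underlying volume */

    printf("encoded %zu bytes\n", (size_t)(p - buf));   /* 24 */
    return 0;
}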
119static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
120 struct pnfs_block_dev_map *map)
121{
122 map->start = dev->start;
123 map->len = dev->len;
124 map->disk_offset = dev->disk_offset;
125 map->bdev = dev->bdev;
126 return true;
127}
128
129static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
130 struct pnfs_block_dev_map *map)
131{
132 int i;
133
134 for (i = 0; i < dev->nr_children; i++) {
135 struct pnfs_block_dev *child = &dev->children[i];
136
137 if (child->start > offset ||
138 child->start + child->len <= offset)
139 continue;
140
141 child->map(child, offset - child->start, map);
142 return true;
143 }
144
145 dprintk("%s: ran off loop!\n", __func__);
146 return false;
147}
148
149static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
150 struct pnfs_block_dev_map *map)
151{
152 struct pnfs_block_dev *child;
153 u64 chunk;
154 u32 chunk_idx;
155 u64 disk_offset;
156
157 chunk = div_u64(offset, dev->chunk_size);
158 div_u64_rem(chunk, dev->nr_children, &chunk_idx);
159
160	if (chunk_idx >= dev->nr_children) {
161 dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
162 __func__, chunk_idx, offset, dev->chunk_size);
163 /* error, should not happen */
164 return false;
165 }
166
167 /* truncate offset to the beginning of the stripe */
168 offset = chunk * dev->chunk_size;
169
170 /* disk offset of the stripe */
171 disk_offset = div_u64(offset, dev->nr_children);
172
173 child = &dev->children[chunk_idx];
174 child->map(child, disk_offset, map);
175
176 map->start += offset;
177 map->disk_offset += disk_offset;
178 map->len = dev->chunk_size;
179 return true;
180}
181
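The stripe mapping reduces to integer arithmetic: the chunk index selects a child round-robin, and the disk offset of a stripe is the stripe's start offset divided by the child count. A worked userspace example with assumed values (chunk_size 64 sectors, 4 children):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t chunk_size = 64, nr_children = 4;
    uint64_t offset = 300;  /* logical offset into the striped device */

    uint64_t chunk = offset / chunk_size;               /* 4 */
    uint64_t chunk_idx = chunk % nr_children;           /* child 0 */
    uint64_t stripe_start = chunk * chunk_size;         /* 256 */
    uint64_t disk_offset = stripe_start / nr_children;  /* 64 */

    printf("offset %llu -> child %llu, stripe [%llu,%llu), disk off %llu\n",
           (unsigned long long)offset,
           (unsigned long long)chunk_idx,
           (unsigned long long)stripe_start,
           (unsigned long long)(stripe_start + chunk_size),
           (unsigned long long)disk_offset);
    return 0;
}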
182static int
183bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
184 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
185
186
187static int
188bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
189 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
190{
191 struct pnfs_block_volume *v = &volumes[idx];
192 dev_t dev;
193
194 dev = bl_resolve_deviceid(server, v, gfp_mask);
195 if (!dev)
196 return -EIO;
197
198 d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
199 if (IS_ERR(d->bdev)) {
200 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
201 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
202 return PTR_ERR(d->bdev);
203 }
204
205
206 d->len = i_size_read(d->bdev->bd_inode);
207 d->map = bl_map_simple;
208
209 printk(KERN_INFO "pNFS: using block device %s\n",
210 d->bdev->bd_disk->disk_name);
211 return 0;
212}
213
214static int
215bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
216 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
217{
218 struct pnfs_block_volume *v = &volumes[idx];
219 int ret;
220
221 ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
222 if (ret)
223 return ret;
224
225 d->disk_offset = v->slice.start;
226 d->len = v->slice.len;
227 return 0;
228}
229
230static int
231bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
232 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
233{
234 struct pnfs_block_volume *v = &volumes[idx];
235 u64 len = 0;
236 int ret, i;
237
238 d->children = kcalloc(v->concat.volumes_count,
239 sizeof(struct pnfs_block_dev), GFP_KERNEL);
240 if (!d->children)
241 return -ENOMEM;
242
243 for (i = 0; i < v->concat.volumes_count; i++) {
244 ret = bl_parse_deviceid(server, &d->children[i],
245 volumes, v->concat.volumes[i], gfp_mask);
246 if (ret)
247 return ret;
248
249 d->nr_children++;
250 d->children[i].start += len;
251 len += d->children[i].len;
252 }
253
254 d->len = len;
255 d->map = bl_map_concat;
256 return 0;
257}
258
259static int
260bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
261 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
262{
263 struct pnfs_block_volume *v = &volumes[idx];
264 u64 len = 0;
265 int ret, i;
266
267 d->children = kcalloc(v->stripe.volumes_count,
268 sizeof(struct pnfs_block_dev), GFP_KERNEL);
269 if (!d->children)
270 return -ENOMEM;
271
272 for (i = 0; i < v->stripe.volumes_count; i++) {
273 ret = bl_parse_deviceid(server, &d->children[i],
274 volumes, v->stripe.volumes[i], gfp_mask);
275 if (ret)
276 return ret;
277
278 d->nr_children++;
279 len += d->children[i].len;
280 }
281
282 d->len = len;
283 d->chunk_size = v->stripe.chunk_size;
284 d->map = bl_map_stripe;
285 return 0;
286}
287
288static int
289bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
290 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
291{
292 switch (volumes[idx].type) {
293 case PNFS_BLOCK_VOLUME_SIMPLE:
294 return bl_parse_simple(server, d, volumes, idx, gfp_mask);
295 case PNFS_BLOCK_VOLUME_SLICE:
296 return bl_parse_slice(server, d, volumes, idx, gfp_mask);
297 case PNFS_BLOCK_VOLUME_CONCAT:
298 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
299 case PNFS_BLOCK_VOLUME_STRIPE:
300 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
301 default:
302 dprintk("unsupported volume type: %d\n", volumes[idx].type);
303 return -EIO;
304 }
305}
306
307struct nfs4_deviceid_node *
308bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
309 gfp_t gfp_mask)
310{
311 struct nfs4_deviceid_node *node = NULL;
312 struct pnfs_block_volume *volumes;
313 struct pnfs_block_dev *top;
314 struct xdr_stream xdr;
315 struct xdr_buf buf;
316 struct page *scratch;
317 int nr_volumes, ret, i;
318 __be32 *p;
319
320 scratch = alloc_page(gfp_mask);
321 if (!scratch)
322 goto out;
323
324 xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
325 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
326
327 p = xdr_inline_decode(&xdr, sizeof(__be32));
328 if (!p)
329 goto out_free_scratch;
330 nr_volumes = be32_to_cpup(p++);
331
332 volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
333 gfp_mask);
334 if (!volumes)
335 goto out_free_scratch;
336
337 for (i = 0; i < nr_volumes; i++) {
338 ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
339 if (ret < 0)
340 goto out_free_volumes;
341 }
342
343 top = kzalloc(sizeof(*top), gfp_mask);
344 if (!top)
345 goto out_free_volumes;
346
347 ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
348 if (ret) {
349 bl_free_device(top);
350 kfree(top);
351 goto out_free_volumes;
352 }
353
354 node = &top->node;
355 nfs4_init_deviceid_node(node, server, &pdev->dev_id);
356
357out_free_volumes:
358 kfree(volumes);
359out_free_scratch:
360 __free_page(scratch);
361out:
362 return node;
363}
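Note that the root volume is parsed at index nr_volumes - 1: the wire format lists component volumes before any volume that references them, so recursion only ever descends to earlier indices and always terminates. A toy userspace model of that convention (hypothetical struct, not the driver's):

#include <stdio.h>

struct vol { int nr_kids; int kids[4]; int len; };

static int total_len(const struct vol *v, int idx)
{
    const struct vol *e = &v[idx];
    int sum = e->len, i;

    for (i = 0; i < e->nr_kids; i++)
        sum += total_len(v, e->kids[i]);    /* kids[i] < idx by construction */
    return sum;
}

int main(void)
{
    /* two simple volumes, concatenated by the root entry at index 2 */
    struct vol vols[] = {
        { 0, {0}, 100 },
        { 0, {0}, 200 },
        { 2, {0, 1}, 0 },
    };
    int nr = 3;

    printf("root length %d\n", total_len(vols, nr - 1));    /* 300 */
    return 0;
}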
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4
5#include <linux/vmalloc.h>
6
7#include "blocklayout.h"
8
9#define NFSDBG_FACILITY NFSDBG_PNFS_LD
10
11static inline struct pnfs_block_extent *
12ext_node(struct rb_node *node)
13{
14 return rb_entry(node, struct pnfs_block_extent, be_node);
15}
16
17static struct pnfs_block_extent *
18ext_tree_first(struct rb_root *root)
19{
20 struct rb_node *node = rb_first(root);
21 return node ? ext_node(node) : NULL;
22}
23
24static struct pnfs_block_extent *
25ext_tree_prev(struct pnfs_block_extent *be)
26{
27 struct rb_node *node = rb_prev(&be->be_node);
28 return node ? ext_node(node) : NULL;
29}
30
31static struct pnfs_block_extent *
32ext_tree_next(struct pnfs_block_extent *be)
33{
34 struct rb_node *node = rb_next(&be->be_node);
35 return node ? ext_node(node) : NULL;
36}
37
38static inline sector_t
39ext_f_end(struct pnfs_block_extent *be)
40{
41 return be->be_f_offset + be->be_length;
42}
43
44static struct pnfs_block_extent *
45__ext_tree_search(struct rb_root *root, sector_t start)
46{
47 struct rb_node *node = root->rb_node;
48 struct pnfs_block_extent *be = NULL;
49
50 while (node) {
51 be = ext_node(node);
52 if (start < be->be_f_offset)
53 node = node->rb_left;
54 else if (start >= ext_f_end(be))
55 node = node->rb_right;
56 else
57 return be;
58 }
59
60 if (be) {
61 if (start < be->be_f_offset)
62 return be;
63
64 if (start >= ext_f_end(be))
65 return ext_tree_next(be);
66 }
67
68 return NULL;
69}
70
71static bool
72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
73{
74 if (be1->be_state != be2->be_state)
75 return false;
76 if (be1->be_device != be2->be_device)
77 return false;
78
79 if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
80 return false;
81
82 if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
83 (be1->be_v_offset + be1->be_length != be2->be_v_offset))
84 return false;
85
86 if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
87 be1->be_tag != be2->be_tag)
88 return false;
89
90 return true;
91}
92
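ext_can_merge() only coalesces neighbours that match in state and device, are contiguous in the file, are also contiguous on disk unless the state is NONE_DATA (a hole has no disk address), and share the same tag when INVALID_DATA commit bookkeeping is in play. A standalone model of the predicate; the struct is a loose sketch of pnfs_block_extent, not the kernel definition:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum state { NONE_DATA, READ_DATA, INVALID_DATA, RW_DATA };

struct ext {
	uint64_t f_offset, v_offset, length;	/* in 512-byte sectors */
	enum state state;
	unsigned long tag;
	int device;
};

static bool can_merge(const struct ext *a, const struct ext *b)
{
	if (a->state != b->state || a->device != b->device)
		return false;
	if (a->f_offset + a->length != b->f_offset)
		return false;		/* not contiguous in the file */
	if (a->state != NONE_DATA &&
	    a->v_offset + a->length != b->v_offset)
		return false;		/* not contiguous on disk */
	if (a->state == INVALID_DATA && a->tag != b->tag)
		return false;		/* different commit bookkeeping */
	return true;
}

int main(void)
{
	struct ext a = { 0, 1000, 8, RW_DATA, 0, 1 };
	struct ext b = { 8, 1008, 8, RW_DATA, 0, 1 };

	printf("mergeable: %d\n", can_merge(&a, &b));	/* 1 */
	b.v_offset = 2000;				/* disk gap */
	printf("mergeable: %d\n", can_merge(&a, &b));	/* 0 */
	return 0;
}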
93static struct pnfs_block_extent *
94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
95{
96 struct pnfs_block_extent *left = ext_tree_prev(be);
97
98 if (left && ext_can_merge(left, be)) {
99 left->be_length += be->be_length;
100 rb_erase(&be->be_node, root);
101 nfs4_put_deviceid_node(be->be_device);
102 kfree(be);
103 return left;
104 }
105
106 return be;
107}
108
109static struct pnfs_block_extent *
110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
111{
112 struct pnfs_block_extent *right = ext_tree_next(be);
113
114 if (right && ext_can_merge(be, right)) {
115 be->be_length += right->be_length;
116 rb_erase(&right->be_node, root);
117 nfs4_put_deviceid_node(right->be_device);
118 kfree(right);
119 }
120
121 return be;
122}
123
124static void
125__ext_tree_insert(struct rb_root *root,
126 struct pnfs_block_extent *new, bool merge_ok)
127{
128 struct rb_node **p = &root->rb_node, *parent = NULL;
129 struct pnfs_block_extent *be;
130
131 while (*p) {
132 parent = *p;
133 be = ext_node(parent);
134
135 if (new->be_f_offset < be->be_f_offset) {
136 if (merge_ok && ext_can_merge(new, be)) {
137 be->be_f_offset = new->be_f_offset;
138 if (be->be_state != PNFS_BLOCK_NONE_DATA)
139 be->be_v_offset = new->be_v_offset;
140 be->be_length += new->be_length;
141 be = ext_try_to_merge_left(root, be);
142 goto free_new;
143 }
144 p = &(*p)->rb_left;
145 } else if (new->be_f_offset >= ext_f_end(be)) {
146 if (merge_ok && ext_can_merge(be, new)) {
147 be->be_length += new->be_length;
148 be = ext_try_to_merge_right(root, be);
149 goto free_new;
150 }
151 p = &(*p)->rb_right;
152 } else {
153 BUG();
154 }
155 }
156
157 rb_link_node(&new->be_node, parent, p);
158 rb_insert_color(&new->be_node, root);
159 return;
160free_new:
161 nfs4_put_deviceid_node(new->be_device);
162 kfree(new);
163}
164
165static int
166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
167{
168 struct pnfs_block_extent *be;
169 sector_t len1 = 0, len2 = 0;
170 sector_t orig_v_offset;
171 sector_t orig_len;
172
173 be = __ext_tree_search(root, start);
174 if (!be)
175 return 0;
176 if (be->be_f_offset >= end)
177 return 0;
178
179 orig_v_offset = be->be_v_offset;
180 orig_len = be->be_length;
181
182 if (start > be->be_f_offset)
183 len1 = start - be->be_f_offset;
184 if (ext_f_end(be) > end)
185 len2 = ext_f_end(be) - end;
186
187 if (len2 > 0) {
188 if (len1 > 0) {
189 struct pnfs_block_extent *new;
190
191 new = kzalloc(sizeof(*new), GFP_ATOMIC);
192 if (!new)
193 return -ENOMEM;
194
195 be->be_length = len1;
196
197 new->be_f_offset = end;
198 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
199 new->be_v_offset =
200 orig_v_offset + orig_len - len2;
201 }
202 new->be_length = len2;
203 new->be_state = be->be_state;
204 new->be_tag = be->be_tag;
205 new->be_device = nfs4_get_deviceid(be->be_device);
206
207 __ext_tree_insert(root, new, true);
208 } else {
209 be->be_f_offset = end;
210 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
211 be->be_v_offset =
212 orig_v_offset + orig_len - len2;
213 }
214 be->be_length = len2;
215 }
216 } else {
217 if (len1 > 0) {
218 be->be_length = len1;
219 be = ext_tree_next(be);
220 }
221
222 while (be && ext_f_end(be) <= end) {
223 struct pnfs_block_extent *next = ext_tree_next(be);
224
225 rb_erase(&be->be_node, root);
226 nfs4_put_deviceid_node(be->be_device);
227 kfree(be);
228 be = next;
229 }
230
231 if (be && be->be_f_offset < end) {
232 len1 = ext_f_end(be) - end;
233 be->be_f_offset = end;
234 if (be->be_state != PNFS_BLOCK_NONE_DATA)
235 be->be_v_offset += be->be_length - len1;
236 be->be_length = len1;
237 }
238 }
239
240 return 0;
241}
242
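__ext_tree_remove() above leaves up to two pieces of an extent overlapping the removed range [start, end): a head of length start - be_f_offset, a tail of length ext_f_end(be) - end (whose be_v_offset is advanced by the same amount the file offset moved), or nothing at all. The length arithmetic in isolation, as a sketch with hypothetical values (sector units):

#include <stdint.h>
#include <stdio.h>

/* Model of the case analysis in __ext_tree_remove(): removing
 * [start, end) from an extent [f, f+len) keeps at most a head and
 * a tail piece. */
static void punch(uint64_t f, uint64_t len, uint64_t start, uint64_t end)
{
	uint64_t fend = f + len;
	uint64_t len1 = start > f ? start - f : 0;	/* head kept */
	uint64_t len2 = fend > end ? fend - end : 0;	/* tail kept */

	if (len1)
		printf("keep head [%llu, %llu)\n",
		       (unsigned long long)f, (unsigned long long)(f + len1));
	if (len2)	/* tail: be_v_offset also moves up by len - len2 */
		printf("keep tail [%llu, %llu)\n",
		       (unsigned long long)end, (unsigned long long)fend);
	if (!len1 && !len2)
		printf("extent fully removed\n");
}

int main(void)
{
	punch(0, 100, 10, 20);	/* split: both pieces survive */
	punch(0, 100, 0, 20);	/* trim head */
	punch(0, 100, 0, 100);	/* drop whole extent */
	return 0;
}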
243int
244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
245{
246 struct pnfs_block_extent *be;
247 struct rb_root *root;
248 int err = 0;
249
250 switch (new->be_state) {
251 case PNFS_BLOCK_READWRITE_DATA:
252 case PNFS_BLOCK_INVALID_DATA:
253 root = &bl->bl_ext_rw;
254 break;
255 case PNFS_BLOCK_READ_DATA:
256 case PNFS_BLOCK_NONE_DATA:
257 root = &bl->bl_ext_ro;
258 break;
259 default:
260 dprintk("invalid extent type\n");
261 return -EINVAL;
262 }
263
264 spin_lock(&bl->bl_ext_lock);
265retry:
266 be = __ext_tree_search(root, new->be_f_offset);
267 if (!be || be->be_f_offset >= ext_f_end(new)) {
268 __ext_tree_insert(root, new, true);
269 } else if (new->be_f_offset >= be->be_f_offset) {
270 if (ext_f_end(new) <= ext_f_end(be)) {
271 nfs4_put_deviceid_node(new->be_device);
272 kfree(new);
273 } else {
274 sector_t new_len = ext_f_end(new) - ext_f_end(be);
275 sector_t diff = new->be_length - new_len;
276
277 new->be_f_offset += diff;
278 new->be_v_offset += diff;
279 new->be_length = new_len;
280 goto retry;
281 }
282 } else if (ext_f_end(new) <= ext_f_end(be)) {
283 new->be_length = be->be_f_offset - new->be_f_offset;
284 __ext_tree_insert(root, new, true);
285 } else {
286 struct pnfs_block_extent *split;
287 sector_t new_len = ext_f_end(new) - ext_f_end(be);
288 sector_t diff = new->be_length - new_len;
289
290 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
291 if (!split) {
292 err = -EINVAL;
293 goto out;
294 }
295
296 split->be_length = be->be_f_offset - split->be_f_offset;
297 split->be_device = nfs4_get_deviceid(new->be_device);
298 __ext_tree_insert(root, split, true);
299
300 new->be_f_offset += diff;
301 new->be_v_offset += diff;
302 new->be_length = new_len;
303 goto retry;
304 }
305out:
306 spin_unlock(&bl->bl_ext_lock);
307 return err;
308}
309
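The retry step in ext_tree_insert() is worth isolating: when the tail of a new extent extends past an existing one, be_f_offset and be_v_offset are advanced in lockstep so the file-to-disk mapping stays intact, and the shortened extent is re-searched. A userspace sketch of just that adjustment, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t f_offset, v_offset, length; };

/* Advance 'new' past an existing extent that ends at be_end, keeping
 * file and disk offsets in step, as in the retry path above. */
static void advance_past(struct range *new, uint64_t be_end)
{
	uint64_t new_end = new->f_offset + new->length;
	uint64_t new_len = new_end - be_end;	/* part beyond 'be' */
	uint64_t diff = new->length - new_len;

	new->f_offset += diff;	/* file offset moves up... */
	new->v_offset += diff;	/* ...and the disk offset with it */
	new->length = new_len;
}

int main(void)
{
	struct range new = { .f_offset = 0, .v_offset = 500, .length = 30 };

	advance_past(&new, 20);	/* existing extent covers up to 20 */
	printf("retry with [%llu,+%llu) v=%llu\n",
	       (unsigned long long)new.f_offset,
	       (unsigned long long)new.length,
	       (unsigned long long)new.v_offset);	/* [20,+10) v=520 */
	return 0;
}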
310static bool
311__ext_tree_lookup(struct rb_root *root, sector_t isect,
312 struct pnfs_block_extent *ret)
313{
314 struct rb_node *node;
315 struct pnfs_block_extent *be;
316
317 node = root->rb_node;
318 while (node) {
319 be = ext_node(node);
320 if (isect < be->be_f_offset)
321 node = node->rb_left;
322 else if (isect >= ext_f_end(be))
323 node = node->rb_right;
324 else {
325 *ret = *be;
326 return true;
327 }
328 }
329
330 return false;
331}
332
333bool
334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
335 struct pnfs_block_extent *ret, bool rw)
336{
337 bool found = false;
338
339 spin_lock(&bl->bl_ext_lock);
340 if (!rw)
341 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
342 if (!found)
343 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
344 spin_unlock(&bl->bl_ext_lock);
345
346 return found;
347}
348
349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
350 sector_t start, sector_t end)
351{
352 int err, err2;
353
354 spin_lock(&bl->bl_ext_lock);
355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
356 if (rw) {
357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
358 if (!err)
359 err = err2;
360 }
361 spin_unlock(&bl->bl_ext_lock);
362
363 return err;
364}
365
366static int
367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
368 sector_t split)
369{
370 struct pnfs_block_extent *new;
371 sector_t orig_len = be->be_length;
372
373 new = kzalloc(sizeof(*new), GFP_ATOMIC);
374 if (!new)
375 return -ENOMEM;
376
377 be->be_length = split - be->be_f_offset;
378
379 new->be_f_offset = split;
380 if (be->be_state != PNFS_BLOCK_NONE_DATA)
381 new->be_v_offset = be->be_v_offset + be->be_length;
382 new->be_length = orig_len - be->be_length;
383 new->be_state = be->be_state;
384 new->be_tag = be->be_tag;
385 new->be_device = nfs4_get_deviceid(be->be_device);
386
387 __ext_tree_insert(root, new, false);
388 return 0;
389}
390
391int
392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
393 sector_t len)
394{
395 struct rb_root *root = &bl->bl_ext_rw;
396 sector_t end = start + len;
397 struct pnfs_block_extent *be;
398 int err = 0;
399
400 spin_lock(&bl->bl_ext_lock);
401 /*
402 * First remove all COW extents or holes from the written-to range.
403 */
404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
405 if (err)
406 goto out;
407
408 /*
409 * Then mark all invalid extents in the range as written to.
410 */
411 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
412 if (be->be_f_offset >= end)
413 break;
414
415 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
416 continue;
417
418 if (be->be_f_offset < start) {
419 struct pnfs_block_extent *left = ext_tree_prev(be);
420
421 if (left && ext_can_merge(left, be)) {
422 sector_t diff = start - be->be_f_offset;
423
424 left->be_length += diff;
425
426 be->be_f_offset += diff;
427 be->be_v_offset += diff;
428 be->be_length -= diff;
429 } else {
430 err = ext_tree_split(root, be, start);
431 if (err)
432 goto out;
433 }
434 }
435
436 if (ext_f_end(be) > end) {
437 struct pnfs_block_extent *right = ext_tree_next(be);
438
439 if (right && ext_can_merge(be, right)) {
440 sector_t diff = end - be->be_f_offset;
441
442 be->be_length -= diff;
443
444 right->be_f_offset -= diff;
445 right->be_v_offset -= diff;
446 right->be_length += diff;
447 } else {
448 err = ext_tree_split(root, be, end);
449 if (err)
450 goto out;
451 }
452 }
453
454 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
455 be->be_tag = EXTENT_WRITTEN;
456 be = ext_try_to_merge_left(root, be);
457 be = ext_try_to_merge_right(root, be);
458 }
459 }
460out:
461 spin_unlock(&bl->bl_ext_lock);
462 return err;
463}
464
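One subtlety in ext_tree_mark_written() above: rather than always splitting at start, it will slide the boundary into a mergeable left neighbour, which absorbs the head of the INVALID extent while the extent shrinks from the front. That boundary slide, modelled on its own (sketch; hypothetical sector values):

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t f_offset, v_offset, length; };

/* Donate the head of 'be' (everything before 'start') to its left
 * neighbour instead of allocating a split extent. */
static void donate_head(struct ext *left, struct ext *be, uint64_t start)
{
	uint64_t diff = start - be->f_offset;

	left->length += diff;	/* left neighbour grows */
	be->f_offset += diff;	/* extent shrinks from the front */
	be->v_offset += diff;
	be->length -= diff;
}

int main(void)
{
	struct ext left = { 0, 100, 10 };
	struct ext be = { 10, 110, 20 };

	donate_head(&left, &be, 16);
	printf("left [0,+%llu)  be [%llu,+%llu)\n",
	       (unsigned long long)left.length,
	       (unsigned long long)be.f_offset,
	       (unsigned long long)be.length);	/* left 16, be [16,+14) */
	return 0;
}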
465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
466 size_t buffer_size)
467{
468 if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
469 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
470
471 for (i = 0; i < nr_pages; i++)
472 put_page(arg->layoutupdate_pages[i]);
473 kfree(arg->layoutupdate_pages);
474 } else {
475 put_page(arg->layoutupdate_page);
476 }
477}
478
479static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
480 size_t buffer_size, size_t *count)
481{
482 struct pnfs_block_extent *be;
483 int ret = 0;
484
485 spin_lock(&bl->bl_ext_lock);
486 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
487 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
488 be->be_tag != EXTENT_WRITTEN)
489 continue;
490
491 (*count)++;
492 if (*count * BL_EXTENT_SIZE > buffer_size) {
493 /* keep counting.. */
494 ret = -ENOSPC;
495 continue;
496 }
497
498 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
499 NFS4_DEVICEID4_SIZE);
500 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
501 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
502 p = xdr_encode_hyper(p, 0LL);
503 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
504
505 be->be_tag = EXTENT_COMMITTING;
506 }
507 spin_unlock(&bl->bl_ext_lock);
508
509 return ret;
510}
511
512int
513ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
514{
515 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
516 size_t count = 0, buffer_size = PAGE_SIZE;
517 __be32 *start_p;
518 int ret;
519
520 dprintk("%s enter\n", __func__);
521
522 arg->layoutupdate_page = alloc_page(GFP_NOFS);
523 if (!arg->layoutupdate_page)
524 return -ENOMEM;
525 start_p = page_address(arg->layoutupdate_page);
526 arg->layoutupdate_pages = &arg->layoutupdate_page;
527
528retry:
529 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
530 if (unlikely(ret)) {
531 ext_tree_free_commitdata(arg, buffer_size);
532
533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
534 count = 0;
535
536 arg->layoutupdate_pages =
537 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
538 sizeof(struct page *), GFP_NOFS);
539 if (!arg->layoutupdate_pages)
540 return -ENOMEM;
541
542 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
543 if (!start_p) {
544 kfree(arg->layoutupdate_pages);
545 return -ENOMEM;
546 }
547
548 goto retry;
549 }
550
551 *start_p = cpu_to_be32(count);
552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
553
554 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
555 __be32 *p = start_p;
556 int i = 0;
557
558 for (p = start_p;
559 p < start_p + arg->layoutupdate_len;
560 p += PAGE_SIZE) {
561 arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
562 }
563 }
564
565 dprintk("%s found %zu ranges\n", __func__, count);
566 return 0;
567}
568
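ext_tree_prepare_commit() encodes optimistically into a single page; when the counting pass reports -ENOSPC, the retry allocates a buffer sized exactly from the count. The sizing arithmetic, where BL_EXTENT_SIZE is assumed to be the per-extent wire cost implied by ext_tree_encode_commit() (a 16-byte deviceid, three 64-bit hypers and a 32-bit state):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define BL_EXTENT_SIZE	44	/* 16-byte deviceid + 3 hypers + u32 state
				 * (assumed value for this sketch) */

int main(void)
{
	size_t count = 150;	/* extents seen by the counting pass */
	size_t buffer_size = sizeof(uint32_t) + BL_EXTENT_SIZE * count;
	size_t nr_pages = (buffer_size + PAGE_SIZE - 1) / PAGE_SIZE;

	/* 4 + 6600 = 6604 bytes -> 2 pages */
	printf("need %zu bytes -> %zu page(s)\n", buffer_size, nr_pages);
	return 0;
}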
569void
570ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
571{
572 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
573 struct rb_root *root = &bl->bl_ext_rw;
574 struct pnfs_block_extent *be;
575
576 dprintk("%s status %d\n", __func__, status);
577
578 ext_tree_free_commitdata(arg, arg->layoutupdate_len);
579
580 spin_lock(&bl->bl_ext_lock);
581 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
582 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
583 be->be_tag != EXTENT_COMMITTING)
584 continue;
585
586 if (status) {
587 /*
588 * Mark as written and try again.
589 *
590 * XXX: some real error handling here wouldn't hurt..
591 */
592 be->be_tag = EXTENT_WRITTEN;
593 } else {
594 be->be_state = PNFS_BLOCK_READWRITE_DATA;
595 be->be_tag = 0;
596 }
597
598 be = ext_try_to_merge_left(root, be);
599 be = ext_try_to_merge_right(root, be);
600 }
601 spin_unlock(&bl->bl_ext_lock);
602}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
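The tracker being deleted below encodes per-sector state as tag bits, stacking one private INTERNAL_EXISTS bit above the public tags and masking it off before callers see the value. A self-contained illustration of the bit layout; MY_MAX_TAGS lives in blocklayout.h, and the value 3 here is an assumption:

#include <stdio.h>

#define EXTENT_INITIALIZED	0
#define EXTENT_WRITTEN		1
#define EXTENT_IN_COMMIT	2
#define MY_MAX_TAGS		3	/* assumed for this sketch */
#define INTERNAL_EXISTS		MY_MAX_TAGS
#define INTERNAL_MASK		((1 << INTERNAL_EXISTS) - 1)

int main(void)
{
	int tags = (1 << EXTENT_INITIALIZED) | (1 << EXTENT_WRITTEN) |
		   (1 << INTERNAL_EXISTS);

	/* _find_entry() strips the internal bit before reporting */
	printf("public tags: 0x%x\n", tags & INTERNAL_MASK);	/* 0x3 */
	printf("written?     %d\n", !!(tags & (1 << EXTENT_WRITTEN)));
	return 0;
}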
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - sector_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
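normalize() and normalize_up() round a sector count down or up to a multiple of base; sector_div() is needed in the kernel only because open-coded 64-bit division is not available on every 32-bit target. A plain-C equivalent for checking the arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Round a sector count down (or up) to a multiple of 'base'. */
static uint64_t normalize(uint64_t s, int base)
{
	return s - (s % base);
}

static uint64_t normalize_up(uint64_t s, int base)
{
	return normalize(s + base - 1, base);
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)normalize(1000, 8),		/* 1000 */
	       (unsigned long long)normalize_up(1001, 8));	/* 1008 */
	return 0;
}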
55/* Complete stub using a list while determining the API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 new = storage;
114 new->it_sector = s;
115 new->it_tags = (1 << tag);
116 list_add(&new->it_link, &pos->it_link);
117 return 1;
118 }
119}
120
121/* XXXX Really want option to not create */
122/* Over range, unions tag with existing entries, else creates entry with tag */
123static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
124{
125 u64 i;
126
127 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
128 for (i = normalize(s, tree->mtt_step_size); i < s + length;
129 i += tree->mtt_step_size)
130 if (_add_entry(tree, i, tag, NULL))
131 return -ENOMEM;
132 return 0;
133}
134
135/* Ensure that future operations on given range of tree will not malloc */
136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
138{
139 u64 start, end, s;
140 int count, i, used = 0, status = -ENOMEM;
141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
143
144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
145 start = normalize(offset, tree->mtt_step_size);
146 end = normalize_up(offset + length, tree->mtt_step_size);
147 count = (int)(end - start) / (int)tree->mtt_step_size;
148
149 /* Pre-malloc what memory we might need */
150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage)
152 return -ENOMEM;
153 for (i = 0; i < count; i++) {
154 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
155 GFP_NOFS);
156 if (!storage[i])
157 goto out_cleanup;
158 }
159
160 spin_lock_bh(&marks->im_lock);
161 for (s = start; s < end; s += tree->mtt_step_size)
162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
164
165 status = 0;
166
167 out_cleanup:
168 for (i = used; i < count; i++) {
169 if (!storage[i])
170 break;
171 kfree(storage[i]);
172 }
173 kfree(storage);
174 return status;
175}
176
177/* We are relying on page lock to serialize this */
178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
179{
180 int rv;
181
182 spin_lock_bh(&marks->im_lock);
183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184 spin_unlock_bh(&marks->im_lock);
185 return rv;
186}
187
188/* Assume start, end already sector aligned */
189static int
190_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
191{
192 struct pnfs_inval_tracking *pos;
193 u64 expect = 0;
194
195 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
196 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
197 if (pos->it_sector >= end)
198 continue;
199 if (!expect) {
200 if ((pos->it_sector == end - tree->mtt_step_size) &&
201 (pos->it_tags & (1 << tag))) {
202 expect = pos->it_sector - tree->mtt_step_size;
203 if (pos->it_sector < tree->mtt_step_size || expect < start)
204 return 1;
205 continue;
206 } else {
207 return 0;
208 }
209 }
210 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
211 return 0;
212 expect -= tree->mtt_step_size;
213 if (expect < start)
214 return 1;
215 }
216 return 0;
217}
218
219static int is_range_written(struct pnfs_inval_markings *marks,
220 sector_t start, sector_t end)
221{
222 int rv;
223
224 spin_lock_bh(&marks->im_lock);
225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226 spin_unlock_bh(&marks->im_lock);
227 return rv;
228}
229
230/* Marks sectors in [offset, offset + length) as having been initialized.
231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
232 * Currently assumes offset is page-aligned.
233 */
234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
235 sector_t offset, sector_t length)
236{
237 sector_t start, end;
238
239 dprintk("%s(offset=%llu,len=%llu) enter\n",
240 __func__, (u64)offset, (u64)length);
241
242 start = normalize(offset, marks->im_block_size);
243 end = normalize_up(offset + length, marks->im_block_size);
244 if (_preload_range(marks, start, end - start))
245 goto outerr;
246
247 spin_lock_bh(&marks->im_lock);
248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 goto out_unlock;
250 spin_unlock_bh(&marks->im_lock);
251
252 return 0;
253
254out_unlock:
255 spin_unlock_bh(&marks->im_lock);
256outerr:
257 return -ENOMEM;
258}
259
260/* Marks sectors in [offset, offset+length) as having been written to disk.
261 * All lengths should be block aligned.
262 */
263static int mark_written_sectors(struct pnfs_inval_markings *marks,
264 sector_t offset, sector_t length)
265{
266 int status;
267
268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 (u64)offset, (u64)length);
270 spin_lock_bh(&marks->im_lock);
271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272 spin_unlock_bh(&marks->im_lock);
273 return status;
274}
275
276static void print_short_extent(struct pnfs_block_short_extent *be)
277{
278 dprintk("PRINT SHORT EXTENT extent %p\n", be);
279 if (be) {
280 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
281 dprintk(" be_length %llu\n", (u64)be->bse_length);
282 }
283}
284
285static void print_clist(struct list_head *list, unsigned int count)
286{
287 struct pnfs_block_short_extent *be;
288 unsigned int i = 0;
289
290 ifdebug(FACILITY) {
291 printk(KERN_DEBUG "****************\n");
292 printk(KERN_DEBUG "Extent list looks like:\n");
293 list_for_each_entry(be, list, bse_node) {
294 i++;
295 print_short_extent(be);
296 }
297 if (i != count)
298 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
299 printk(KERN_DEBUG "****************\n");
300 }
301}
302
303/* Note: In theory, we should do more checking that devids match between
304 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
305 */
306/* Note this is very similar to bl_add_merge_extent */
307static void add_to_commitlist(struct pnfs_block_layout *bl,
308 struct pnfs_block_short_extent *new)
309{
310 struct list_head *clist = &bl->bl_commit;
311 struct pnfs_block_short_extent *old, *save;
312 sector_t end = new->bse_f_offset + new->bse_length;
313
314 dprintk("%s enter\n", __func__);
315 print_short_extent(new);
316 print_clist(clist, bl->bl_count);
317 bl->bl_count++;
318 /* Scan for proper place to insert, extending new to the left
319 * as much as possible.
320 */
321 list_for_each_entry_safe(old, save, clist, bse_node) {
322 if (new->bse_f_offset < old->bse_f_offset)
323 break;
324 if (end <= old->bse_f_offset + old->bse_length) {
325 /* Range is already in list */
326 bl->bl_count--;
327 kfree(new);
328 return;
329 } else if (new->bse_f_offset <=
330 old->bse_f_offset + old->bse_length) {
331 /* new overlaps or abuts existing be */
332 if (new->bse_mdev == old->bse_mdev) {
333 /* extend new to fully replace old */
334 new->bse_length += new->bse_f_offset -
335 old->bse_f_offset;
336 new->bse_f_offset = old->bse_f_offset;
337 list_del(&old->bse_node);
338 bl->bl_count--;
339 kfree(old);
340 }
341 }
342 }
343 /* Note that if we never hit the above break, old will not point to a
344 * valid extent. However, in that case &old->bse_node==list.
345 */
346 list_add_tail(&new->bse_node, &old->bse_node);
347 /* Scan forward for overlaps. If we find any, extend new and
348 * remove the overlapped extent.
349 */
350 old = list_prepare_entry(new, clist, bse_node);
351 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
352 if (end < old->bse_f_offset)
353 break;
354 /* new overlaps or abuts old */
355 if (new->bse_mdev == old->bse_mdev) {
356 if (end < old->bse_f_offset + old->bse_length) {
357 /* extend new to fully cover old */
358 end = old->bse_f_offset + old->bse_length;
359 new->bse_length = end - new->bse_f_offset;
360 }
361 list_del(&old->bse_node);
362 bl->bl_count--;
363 kfree(old);
364 }
365 }
366 dprintk("%s: after merging\n", __func__);
367 print_clist(clist, bl->bl_count);
368}
369
370/* Note the range described by offset, length is guaranteed to be contained
371 * within be.
372 * new will be freed, either by this function or add_to_commitlist if they
373 * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
374 */
375int bl_mark_for_commit(struct pnfs_block_extent *be,
376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
378{
379 sector_t new_end, end = offset + length;
380 struct pnfs_block_layout *bl = container_of(be->be_inval,
381 struct pnfs_block_layout,
382 bl_inval);
383
384 mark_written_sectors(be->be_inval, offset, length);
385 /* We want to add the range to commit list, but it must be
386 * block-normalized, and verified that the normalized range has
387 * been entirely written to disk.
388 */
389 new->bse_f_offset = offset;
390 offset = normalize(offset, bl->bl_blocksize);
391 if (offset < new->bse_f_offset) {
392 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
393 new->bse_f_offset = offset;
394 else
395 new->bse_f_offset = offset + bl->bl_blocksize;
396 }
397 new_end = normalize_up(end, bl->bl_blocksize);
398 if (end < new_end) {
399 if (is_range_written(be->be_inval, end, new_end))
400 end = new_end;
401 else
402 end = new_end - bl->bl_blocksize;
403 }
404 if (end <= new->bse_f_offset) {
405 kfree(new);
406 return 0;
407 }
408 new->bse_length = end - new->bse_f_offset;
409 new->bse_devid = be->be_devid;
410 new->bse_mdev = be->be_mdev;
411
412 spin_lock(&bl->bl_ext_lock);
413 add_to_commitlist(bl, new);
414 spin_unlock(&bl->bl_ext_lock);
415 return 0;
416}
417
418static void print_bl_extent(struct pnfs_block_extent *be)
419{
420 dprintk("PRINT EXTENT extent %p\n", be);
421 if (be) {
422 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
423 dprintk(" be_length %llu\n", (u64)be->be_length);
424 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
425 dprintk(" be_state %d\n", be->be_state);
426 }
427}
428
429static void
430destroy_extent(struct kref *kref)
431{
432 struct pnfs_block_extent *be;
433
434 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
435 dprintk("%s be=%p\n", __func__, be);
436 kfree(be);
437}
438
439void
440bl_put_extent(struct pnfs_block_extent *be)
441{
442 if (be) {
443 dprintk("%s enter %p (%i)\n", __func__, be,
444 atomic_read(&be->be_refcnt.refcount));
445 kref_put(&be->be_refcnt, destroy_extent);
446 }
447}
448
449struct pnfs_block_extent *bl_alloc_extent(void)
450{
451 struct pnfs_block_extent *be;
452
453 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
454 if (!be)
455 return NULL;
456 INIT_LIST_HEAD(&be->be_node);
457 kref_init(&be->be_refcnt);
458 be->be_inval = NULL;
459 return be;
460}
461
462static void print_elist(struct list_head *list)
463{
464 struct pnfs_block_extent *be;
465 dprintk("****************\n");
466 dprintk("Extent list looks like:\n");
467 list_for_each_entry(be, list, be_node) {
468 print_bl_extent(be);
469 }
470 dprintk("****************\n");
471}
472
473static inline int
474extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
475{
476 /* Note this assumes new->be_f_offset >= old->be_f_offset */
477 return (new->be_state == old->be_state) &&
478 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
479 ((new->be_v_offset - old->be_v_offset ==
480 new->be_f_offset - old->be_f_offset) &&
481 new->be_mdev == old->be_mdev));
482}
483
484/* Adds new to appropriate list in bl, modifying new and removing existing
485 * extents as appropriate to deal with overlaps.
486 *
487 * See bl_find_get_extent for list constraints.
488 *
489 * Refcount on new is already set. If we end up not using it, or we error
490 * out, we need to put the reference.
491 *
492 * bl->bl_ext_lock is held by caller.
493 */
494int
495bl_add_merge_extent(struct pnfs_block_layout *bl,
496 struct pnfs_block_extent *new)
497{
498 struct pnfs_block_extent *be, *tmp;
499 sector_t end = new->be_f_offset + new->be_length;
500 struct list_head *list;
501
502 dprintk("%s enter with be=%p\n", __func__, new);
503 print_bl_extent(new);
504 list = &bl->bl_extents[bl_choose_list(new->be_state)];
505 print_elist(list);
506
507 /* Scan for proper place to insert, extending new to the left
508 * as much as possible.
509 */
510 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
511 if (new->be_f_offset >= be->be_f_offset + be->be_length)
512 break;
513 if (new->be_f_offset >= be->be_f_offset) {
514 if (end <= be->be_f_offset + be->be_length) {
515 /* new is a subset of existing be */
516 if (extents_consistent(be, new)) {
517 dprintk("%s: new is subset, ignoring\n",
518 __func__);
519 bl_put_extent(new);
520 return 0;
521 } else {
522 goto out_err;
523 }
524 } else {
525 /* |<-- be -->|
526 * |<-- new -->| */
527 if (extents_consistent(be, new)) {
528 /* extend new to fully replace be */
529 new->be_length += new->be_f_offset -
530 be->be_f_offset;
531 new->be_f_offset = be->be_f_offset;
532 new->be_v_offset = be->be_v_offset;
533 dprintk("%s: removing %p\n", __func__, be);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 } else {
537 goto out_err;
538 }
539 }
540 } else if (end >= be->be_f_offset + be->be_length) {
541 /* new extent overlaps existing be */
542 if (extents_consistent(be, new)) {
543 /* extend new to fully replace be */
544 dprintk("%s: removing %p\n", __func__, be);
545 list_del(&be->be_node);
546 bl_put_extent(be);
547 } else {
548 goto out_err;
549 }
550 } else if (end > be->be_f_offset) {
551 /* |<-- be -->|
552 *|<-- new -->| */
553 if (extents_consistent(new, be)) {
554 /* extend new to fully replace be */
555 new->be_length += be->be_f_offset + be->be_length -
556 new->be_f_offset - new->be_length;
557 dprintk("%s: removing %p\n", __func__, be);
558 list_del(&be->be_node);
559 bl_put_extent(be);
560 } else {
561 goto out_err;
562 }
563 }
564 }
565 /* Note that if we never hit the above break, be will not point to a
566 * valid extent. However, in that case &be->be_node==list.
567 */
568 list_add(&new->be_node, &be->be_node);
569 dprintk("%s: inserting new\n", __func__);
570 print_elist(list);
571 /* FIXME - The per-list consistency checks have all been done,
572 * should now check cross-list consistency.
573 */
574 return 0;
575
576 out_err:
577 bl_put_extent(new);
578 return -EIO;
579}
580
581/* Returns extent, or NULL. If a second READ extent exists, it is returned
582 * in cow_read, if given.
583 *
584 * The extents are kept in two separate ordered lists, one for READ and NONE,
585 * one for READWRITE and INVALID. Within each list, we assume:
586 * 1. Extents are ordered by file offset.
587 * 2. For any given isect, there is at most one extent that matches.
588 */
589struct pnfs_block_extent *
590bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
591 struct pnfs_block_extent **cow_read)
592{
593 struct pnfs_block_extent *be, *cow, *ret;
594 int i;
595
596 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
597 cow = ret = NULL;
598 spin_lock(&bl->bl_ext_lock);
599 for (i = 0; i < EXTENT_LISTS; i++) {
600 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
601 if (isect >= be->be_f_offset + be->be_length)
602 break;
603 if (isect >= be->be_f_offset) {
604 /* We have found an extent */
605 dprintk("%s Get %p (%i)\n", __func__, be,
606 atomic_read(&be->be_refcnt.refcount));
607 kref_get(&be->be_refcnt);
608 if (!ret)
609 ret = be;
610 else if (be->be_state != PNFS_BLOCK_READ_DATA)
611 bl_put_extent(be);
612 else
613 cow = be;
614 break;
615 }
616 }
617 if (ret &&
618 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
619 break;
620 }
621 spin_unlock(&bl->bl_ext_lock);
622 if (cow_read)
623 *cow_read = cow;
624 print_bl_extent(ret);
625 return ret;
626}
627
628/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
629static struct pnfs_block_extent *
630bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
631{
632 struct pnfs_block_extent *be, *ret = NULL;
633 int i;
634
635 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
636 for (i = 0; i < EXTENT_LISTS; i++) {
637 if (ret)
638 break;
639 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
640 if (isect >= be->be_f_offset + be->be_length)
641 break;
642 if (isect >= be->be_f_offset) {
643 /* We have found an extent */
644 dprintk("%s Get %p (%i)\n", __func__, be,
645 atomic_read(&be->be_refcnt.refcount));
646 kref_get(&be->be_refcnt);
647 ret = be;
648 break;
649 }
650 }
651 }
652 print_bl_extent(ret);
653 return ret;
654}
655
656int
657encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
658 struct xdr_stream *xdr,
659 const struct nfs4_layoutcommit_args *arg)
660{
661 struct pnfs_block_short_extent *lce, *save;
662 unsigned int count = 0;
663 __be32 *p, *xdr_start;
664
665 dprintk("%s enter\n", __func__);
666 /* BUG - creation of bl_commit is buggy - need to wait for
667 * entire block to be marked WRITTEN before it can be added.
668 */
669 spin_lock(&bl->bl_ext_lock);
670 /* Want to adjust for possible truncate */
671 /* We now want to adjust argument range */
672
673 /* XDR encode the ranges found */
674 xdr_start = xdr_reserve_space(xdr, 8);
675 if (!xdr_start)
676 goto out;
677 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
678 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
679 if (!p)
680 break;
681 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
682 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 bl->bl_count--;
688 count++;
689 }
690 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
691 xdr_start[1] = cpu_to_be32(count);
692out:
693 spin_unlock(&bl->bl_ext_lock);
694 dprintk("%s found %i ranges\n", __func__, count);
695 return 0;
696}
697
698/* Helper function to set_to_rw that initializes a new extent */
699static void
700_prep_new_extent(struct pnfs_block_extent *new,
701 struct pnfs_block_extent *orig,
702 sector_t offset, sector_t length, int state)
703{
704 kref_init(&new->be_refcnt);
705 /* don't need to INIT_LIST_HEAD(&new->be_node) */
706 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
707 new->be_mdev = orig->be_mdev;
708 new->be_f_offset = offset;
709 new->be_length = length;
710 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
711 new->be_state = state;
712 new->be_inval = orig->be_inval;
713}
714
715/* Tries to merge be with extent in front of it in list.
716 * Frees storage if not used.
717 */
718static struct pnfs_block_extent *
719_front_merge(struct pnfs_block_extent *be, struct list_head *head,
720 struct pnfs_block_extent *storage)
721{
722 struct pnfs_block_extent *prev;
723
724 if (!storage)
725 goto no_merge;
726 if (&be->be_node == head || be->be_node.prev == head)
727 goto no_merge;
728 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
729 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
730 !extents_consistent(prev, be))
731 goto no_merge;
732 _prep_new_extent(storage, prev, prev->be_f_offset,
733 prev->be_length + be->be_length, prev->be_state);
734 list_replace(&prev->be_node, &storage->be_node);
735 bl_put_extent(prev);
736 list_del(&be->be_node);
737 bl_put_extent(be);
738 return storage;
739
740 no_merge:
741 kfree(storage);
742 return be;
743}
744
745static u64
746set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
747{
748 u64 rv = offset + length;
749 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
750 struct pnfs_block_extent *children[3];
751 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
752 int i = 0, j;
753
754 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
755 /* Create storage for up to three new extents e1, e2, e3 */
756 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
757 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
758 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
759 /* BUG - we are ignoring any failure */
760 if (!e1 || !e2 || !e3)
761 goto out_nosplit;
762
763 spin_lock(&bl->bl_ext_lock);
764 be = bl_find_get_extent_locked(bl, offset);
765 rv = be->be_f_offset + be->be_length;
766 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
767 spin_unlock(&bl->bl_ext_lock);
768 goto out_nosplit;
769 }
770 /* Add e* to children, bumping e*'s krefs */
771 if (be->be_f_offset != offset) {
772 _prep_new_extent(e1, be, be->be_f_offset,
773 offset - be->be_f_offset,
774 PNFS_BLOCK_INVALID_DATA);
775 children[i++] = e1;
776 print_bl_extent(e1);
777 } else
778 merge1 = e1;
779 _prep_new_extent(e2, be, offset,
780 min(length, be->be_f_offset + be->be_length - offset),
781 PNFS_BLOCK_READWRITE_DATA);
782 children[i++] = e2;
783 print_bl_extent(e2);
784 if (offset + length < be->be_f_offset + be->be_length) {
785 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
786 be->be_f_offset + be->be_length -
787 offset - length,
788 PNFS_BLOCK_INVALID_DATA);
789 children[i++] = e3;
790 print_bl_extent(e3);
791 } else
792 merge2 = e3;
793
794 /* Remove be from list, and insert the e* */
795 /* We don't get refs on e*, since this list is the base reference
796 * set when init'ed.
797 */
798 if (i < 3)
799 children[i] = NULL;
800 new = children[0];
801 list_replace(&be->be_node, &new->be_node);
802 bl_put_extent(be);
803 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
804 for (j = 1; j < i; j++) {
805 old = new;
806 new = children[j];
807 list_add(&new->be_node, &old->be_node);
808 }
809 if (merge2) {
810 /* This is a HACK, should just create a _back_merge function */
811 new = list_entry(new->be_node.next,
812 struct pnfs_block_extent, be_node);
813 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
814 }
815 spin_unlock(&bl->bl_ext_lock);
816
817 /* Since we removed the base reference above, be is now scheduled for
818 * destruction.
819 */
820 bl_put_extent(be);
821 dprintk("%s returns %llu after split\n", __func__, rv);
822 return rv;
823
824 out_nosplit:
825 kfree(e1);
826 kfree(e2);
827 kfree(e3);
828 dprintk("%s returns %llu without splitting\n", __func__, rv);
829 return rv;
830}
831
832void
833clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
834 const struct nfs4_layoutcommit_args *arg,
835 int status)
836{
837 struct pnfs_block_short_extent *lce, *save;
838
839 dprintk("%s status %d\n", __func__, status);
840 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
841 if (likely(!status)) {
842 u64 offset = lce->bse_f_offset;
843 u64 end = offset + lce->bse_length;
844
845 do {
846 offset = set_to_rw(bl, offset, end - offset);
847 } while (offset < end);
848 list_del(&lce->bse_node);
849
850 kfree(lce);
851 } else {
852 list_del(&lce->bse_node);
853 spin_lock(&bl->bl_ext_lock);
854 add_to_commitlist(bl, lce);
855 spin_unlock(&bl->bl_ext_lock);
856 }
857 }
858}
859
860int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
861{
862 struct pnfs_block_short_extent *new;
863
864 new = kmalloc(sizeof(*new), GFP_NOFS);
865 if (unlikely(!new))
866 return -ENOMEM;
867
868 spin_lock_bh(&marks->im_lock);
869 list_add(&new->bse_node, &marks->im_extents);
870 spin_unlock_bh(&marks->im_lock);
871
872 return 0;
873}
874
875struct pnfs_block_short_extent *
876bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
877{
878 struct pnfs_block_short_extent *rv = NULL;
879
880 spin_lock_bh(&marks->im_lock);
881 if (!list_empty(&marks->im_extents)) {
882 rv = list_entry((&marks->im_extents)->next,
883 struct pnfs_block_short_extent, bse_node);
884 list_del_init(&rv->bse_node);
885 }
886 spin_unlock_bh(&marks->im_lock);
887
888 return rv;
889}
890
891void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
892{
893 struct pnfs_block_short_extent *se = NULL, *tmp;
894
895 if (num_to_free <= 0)
896 return;
897
898 spin_lock(&marks->im_lock);
899 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
900 list_del(&se->bse_node);
901 kfree(se);
902 if (--num_to_free == 0)
903 break;
904 }
905 spin_unlock(&marks->im_lock);
906
907 BUG_ON(num_to_free > 0);
908}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..8d04bda2bd2e
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,285 @@
1/*
2 * Copyright (c) 2006,2007 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Andy Adamson <andros@citi.umich.edu>
6 * Fred Isaman <iisaman@umich.edu>
7 *
8 * permission is granted to use, copy, create derivative works and
9 * redistribute this software and such derivative works for any purpose,
10 * so long as the name of the university of michigan is not used in
11 * any advertising or publicity pertaining to the use or distribution
12 * of this software without specific, written prior authorization. if
13 * the above copyright notice or any other identification of the
14 * university of michigan is included in any copy of any portion of
15 * this software, then the disclaimer below must also be included.
16 *
17 * this software is provided as is, without representation from the
18 * university of michigan as to its fitness for any purpose, and without
19 * warranty by the university of michigan of any kind, either express
20 * or implied, including without limitation the implied warranties of
21 * merchantability and fitness for a particular purpose. the regents
22 * of the university of michigan shall not be liable for any damages,
23 * including special, indirect, incidental, or consequential damages,
24 * with respect to any claim arising out or in connection with the use
25 * of the software, even if it has been or is hereafter advised of the
26 * possibility of such damages.
27 */
28
29#include <linux/module.h>
30#include <linux/genhd.h>
31#include <linux/blkdev.h>
32
33#include "blocklayout.h"
34
35#define NFSDBG_FACILITY NFSDBG_PNFS_LD
36
37static void
38nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
39{
40 int i;
41
42 *p++ = cpu_to_be32(1);
43 *p++ = cpu_to_be32(b->type);
44 *p++ = cpu_to_be32(b->simple.nr_sigs);
45 for (i = 0; i < b->simple.nr_sigs; i++) {
46 p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
47 p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
48 b->simple.sigs[i].sig_len);
49 }
50}
51
52dev_t
53bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
54 gfp_t gfp_mask)
55{
56 struct net *net = server->nfs_client->cl_net;
57 struct nfs_net *nn = net_generic(net, nfs_net_id);
58 struct bl_dev_msg *reply = &nn->bl_mount_reply;
59 struct bl_pipe_msg bl_pipe_msg;
60 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
61 struct bl_msg_hdr *bl_msg;
62 DECLARE_WAITQUEUE(wq, current);
63 dev_t dev = 0;
64 int rc;
65
66 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
67
68 bl_pipe_msg.bl_wq = &nn->bl_wq;
69
70 b->simple.len += 4; /* single volume */
71 if (b->simple.len > PAGE_SIZE)
72 return -EIO;
73
74 memset(msg, 0, sizeof(*msg));
75 msg->len = sizeof(*bl_msg) + b->simple.len;
76 msg->data = kzalloc(msg->len, gfp_mask);
77 if (!msg->data)
78 goto out;
79
80 bl_msg = msg->data;
81 bl_msg->type = BL_DEVICE_MOUNT;
82 bl_msg->totallen = b->simple.len;
83 nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
84
85 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
86 add_wait_queue(&nn->bl_wq, &wq);
87 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
88 if (rc < 0) {
89 remove_wait_queue(&nn->bl_wq, &wq);
90 goto out;
91 }
92
93 set_current_state(TASK_UNINTERRUPTIBLE);
94 schedule();
95 __set_current_state(TASK_RUNNING);
96 remove_wait_queue(&nn->bl_wq, &wq);
97
98 if (reply->status != BL_DEVICE_REQUEST_PROC) {
99 printk(KERN_WARNING "%s failed to decode device: %d\n",
100 __func__, reply->status);
101 goto out;
102 }
103
104 dev = MKDEV(reply->major, reply->minor);
105out:
106 kfree(msg->data);
107 return dev;
108}
109
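The upcall assembled above is a small fixed header followed by the XDR-encoded simple volume, which is how the userspace blkmapd daemon frames the message before parsing it. The struct below is an illustrative stand-in for bl_msg_hdr, whose real layout is in blocklayout.h; it is a sketch of the framing, not a stable ABI description:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical header shape: a message type byte plus the length of
 * the XDR payload that follows, as bl_resolve_deviceid() fills in. */
struct msg_hdr {
	uint8_t  type;		/* e.g. BL_DEVICE_MOUNT */
	uint8_t  pad[3];
	uint32_t totallen;	/* length of the XDR payload */
};

int main(void)
{
	unsigned char buf[64];
	struct msg_hdr hdr = { .type = 1 /* stands in for BL_DEVICE_MOUNT */ };
	uint32_t payload_len = 24;	/* b->simple.len in the kernel code */

	hdr.totallen = payload_len;
	memcpy(buf, &hdr, sizeof(hdr));
	/* the XDR payload would be appended here, as nfs4_encode_simple()
	 * does right after the header */
	printf("message length: %zu\n", sizeof(hdr) + (size_t)payload_len);
	return 0;
}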
110static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
111 size_t mlen)
112{
113 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
114 nfs_net_id);
115
116 if (mlen != sizeof (struct bl_dev_msg))
117 return -EINVAL;
118
119 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
120 return -EFAULT;
121
122 wake_up(&nn->bl_wq);
123
124 return mlen;
125}
126
127static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
128{
129 struct bl_pipe_msg *bl_pipe_msg =
130 container_of(msg, struct bl_pipe_msg, msg);
131
132 if (msg->errno >= 0)
133 return;
134 wake_up(bl_pipe_msg->bl_wq);
135}
136
137static const struct rpc_pipe_ops bl_upcall_ops = {
138 .upcall = rpc_pipe_generic_upcall,
139 .downcall = bl_pipe_downcall,
140 .destroy_msg = bl_pipe_destroy_msg,
141};
142
143static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
144 struct rpc_pipe *pipe)
145{
146 struct dentry *dir, *dentry;
147
148 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
149 if (dir == NULL)
150 return ERR_PTR(-ENOENT);
151 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
152 dput(dir);
153 return dentry;
154}
155
156static void nfs4blocklayout_unregister_sb(struct super_block *sb,
157 struct rpc_pipe *pipe)
158{
159 if (pipe->dentry)
160 rpc_unlink(pipe->dentry);
161}
162
163static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
164 void *ptr)
165{
166 struct super_block *sb = ptr;
167 struct net *net = sb->s_fs_info;
168 struct nfs_net *nn = net_generic(net, nfs_net_id);
169 struct dentry *dentry;
170 int ret = 0;
171
172 if (!try_module_get(THIS_MODULE))
173 return 0;
174
175 if (nn->bl_device_pipe == NULL) {
176 module_put(THIS_MODULE);
177 return 0;
178 }
179
180 switch (event) {
181 case RPC_PIPEFS_MOUNT:
182 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
183 if (IS_ERR(dentry)) {
184 ret = PTR_ERR(dentry);
185 break;
186 }
187 nn->bl_device_pipe->dentry = dentry;
188 break;
189 case RPC_PIPEFS_UMOUNT:
190 if (nn->bl_device_pipe->dentry)
191 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
192 break;
193 default:
194 ret = -ENOTSUPP;
195 break;
196 }
197 module_put(THIS_MODULE);
198 return ret;
199}
200
201static struct notifier_block nfs4blocklayout_block = {
202 .notifier_call = rpc_pipefs_event,
203};
204
205static struct dentry *nfs4blocklayout_register_net(struct net *net,
206 struct rpc_pipe *pipe)
207{
208 struct super_block *pipefs_sb;
209 struct dentry *dentry;
210
211 pipefs_sb = rpc_get_sb_net(net);
212 if (!pipefs_sb)
213 return NULL;
214 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
215 rpc_put_sb_net(net);
216 return dentry;
217}
218
219static void nfs4blocklayout_unregister_net(struct net *net,
220 struct rpc_pipe *pipe)
221{
222 struct super_block *pipefs_sb;
223
224 pipefs_sb = rpc_get_sb_net(net);
225 if (pipefs_sb) {
226 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
227 rpc_put_sb_net(net);
228 }
229}
230
231static int nfs4blocklayout_net_init(struct net *net)
232{
233 struct nfs_net *nn = net_generic(net, nfs_net_id);
234 struct dentry *dentry;
235
236 init_waitqueue_head(&nn->bl_wq);
237 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
238 if (IS_ERR(nn->bl_device_pipe))
239 return PTR_ERR(nn->bl_device_pipe);
240 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
241 if (IS_ERR(dentry)) {
242 rpc_destroy_pipe_data(nn->bl_device_pipe);
243 return PTR_ERR(dentry);
244 }
245 nn->bl_device_pipe->dentry = dentry;
246 return 0;
247}
248
249static void nfs4blocklayout_net_exit(struct net *net)
250{
251 struct nfs_net *nn = net_generic(net, nfs_net_id);
252
253 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
254 rpc_destroy_pipe_data(nn->bl_device_pipe);
255 nn->bl_device_pipe = NULL;
256}
257
258static struct pernet_operations nfs4blocklayout_net_ops = {
259 .init = nfs4blocklayout_net_init,
260 .exit = nfs4blocklayout_net_exit,
261};
262
263int __init bl_init_pipefs(void)
264{
265 int ret;
266
267 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
268 if (ret)
269 goto out;
270 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
271 if (ret)
272 goto out_unregister_notifier;
273 return 0;
274
275out_unregister_notifier:
276 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
277out:
278 return ret;
279}
280
281void __exit bl_cleanup_pipefs(void)
282{
283 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
284 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
285}
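
bl_init_pipefs() and bl_cleanup_pipefs() follow the usual register-then-unwind idiom: on failure, each label undoes exactly the registrations that succeeded before it, in reverse order. The shape of that pattern, with stub registrations standing in for the notifier and pernet subsystem:

#include <stdio.h>

/* Stubs standing in for rpc_pipefs_notifier_register() and
 * register_pernet_subsys(); the second one simulates a failure. */
static int register_notifier(void) { return 0; }
static void unregister_notifier(void) { }
static int register_pernet(void) { return -1; }

static int init(void)
{
	int ret;

	ret = register_notifier();
	if (ret)
		goto out;
	ret = register_pernet();
	if (ret)
		goto out_unregister_notifier;
	return 0;

out_unregister_notifier:
	unregister_notifier();	/* undo only what succeeded */
out:
	return ret;
}

int main(void)
{
	printf("init: %d\n", init());	/* -1, notifier rolled back */
	return 0;
}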
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		goto out;
 
 	ino = lo->plh_inode;
+
+	spin_lock(&ino->i_lock);
+	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+	spin_unlock(&ino->i_lock);
+
+	pnfs_layoutcommit_inode(ino, false);
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-					&args->cbl_range))
+					&args->cbl_range)) {
 		rv = NFS4ERR_DELAY;
-	else
-		rv = NFS4ERR_NOMATCHING_LAYOUT;
-	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+		goto unlock;
+	}
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+				&args->cbl_range);
+	}
+unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
 	}
 
 found:
-	if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-		dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
-			"deleting instead\n", __func__);
 	nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
 	}
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1c5ff6d58385..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the server list and return the first item
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_server_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
@@ -1318,7 +1320,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
  */
 static int nfs_volume_list_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &nfs_server_list_ops,
+	return seq_open_net(inode, file, &nfs_volume_list_ops,
 			   sizeof(struct seq_net_private));
 }
 
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the volume list and return the first item
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_volume_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
@@ -1412,24 +1416,18 @@ int nfs_fs_proc_net_init(struct net *net)
 	p = proc_create("volumes", S_IFREG|S_IRUGO,
 			nn->proc_nfsfs, &nfs_volume_list_fops);
 	if (!p)
-		goto error_2;
+		goto error_1;
 	return 0;
 
-error_2:
-	remove_proc_entry("servers", nn->proc_nfsfs);
 error_1:
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("nfsfs", net->proc_net);
 error_0:
 	return -ENOMEM;
 }
 
 void nfs_fs_proc_net_exit(struct net *net)
 {
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	remove_proc_entry("volumes", nn->proc_nfsfs);
-	remove_proc_entry("servers", nn->proc_nfsfs);
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("nfsfs", net->proc_net);
 }
 
 /*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 /*
  * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
  * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 	WARN_ON_ONCE(verfp->committed < 0);
 	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
 }
-#endif
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
 	return result;
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 }
 
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
-	nfs_direct_complete(dreq, true);
-}
-#endif
-
 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8c4048ecdad1..4ea92ce0537f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#include "nfstrace.h" 41#include "nfstrace.h"
41 42
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
327 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 328 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
328 unsigned int end = offset + len; 329 unsigned int end = offset + len;
329 330
331 if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
332 if (!PageUptodate(page))
333 return 1;
334 return 0;
335 }
336
330 if ((file->f_mode & FMODE_READ) && /* open for read? */ 337 if ((file->f_mode & FMODE_READ) && /* open for read? */
331 !PageUptodate(page) && /* Uptodate? */ 338 !PageUptodate(page) && /* Uptodate? */
332 !PagePrivate(page) && /* i/o request already? */ 339 !PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
468 475
469 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
470 477
471 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Always try to initiate a 'commit' if relevant, but only
472 * doing this memory reclaim for a fs-related allocation. 479 * wait for it if __GFP_WAIT is set. Even then, only wait 1
480 * second and only if the 'bdi' is not congested.
481 * Waiting indefinitely can cause deadlocks when the NFS
482 * server is on this machine, when a new TCP connection is
483 * needed and in other rare cases. There is no particular
484 * need to wait extensively here. A short wait has the
485 * benefit that someone else can worry about the freezer.
473 */ 486 */
474 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 487 if (mapping) {
475 !(current->flags & PF_FSTRANS)) { 488 struct nfs_server *nfss = NFS_SERVER(mapping->host);
476 int how = FLUSH_SYNC; 489 nfs_commit_inode(mapping->host, 0);
477 490 if ((gfp & __GFP_WAIT) &&
478 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 491 !bdi_write_congested(&nfss->backing_dev_info)) {
479 if (current_is_kswapd()) 492 wait_on_page_bit_killable_timeout(page, PG_private,
480 how = 0; 493 HZ);
481 nfs_commit_inode(mapping->host, how); 494 if (PagePrivate(page))
495 set_bdi_congested(&nfss->backing_dev_info,
496 BLK_RW_ASYNC);
497 }
482 } 498 }
483 /* If PagePrivate() is set, then the page is not freeable */ 499 /* If PagePrivate() is set, then the page is not freeable */
484 if (PagePrivate(page)) 500 if (PagePrivate(page))
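The replacement comment spells out the policy: always start a commit, but only wait for it when __GFP_WAIT allows, never longer than one second (HZ), and not at all once the bdi is congested, since an unbounded wait can deadlock against a loopback-mounted server. A rough userspace analogue of bounding the wait, assuming nothing about the kernel APIs and using pthread_cond_timedwait() in their place:

    /* Kick off work, then wait at most one second for it instead of
     * blocking indefinitely -- mirroring the bounded
     * wait_on_page_bit_killable_timeout() call above. */
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int done;

    static void *worker(void *arg)
    {
        (void)arg;
        sleep(2);                       /* pretend the commit is slow */
        pthread_mutex_lock(&lock);
        done = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        struct timespec deadline;

        pthread_create(&t, NULL, worker, NULL);
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 1;           /* wait at most 1s, like HZ above */

        pthread_mutex_lock(&lock);
        while (!done && pthread_cond_timedwait(&cond, &lock, &deadline) == 0)
            ;
        if (done)
            printf("page freed\n");
        else
            printf("still busy, back off\n");
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);          /* build with -lpthread */
        return 0;
    }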
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
539static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 555static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
540 sector_t *span) 556 sector_t *span)
541{ 557{
558 int ret;
559 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
560
542 *span = sis->pages; 561 *span = sis->pages;
543 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 562
563 rcu_read_lock();
564 ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
565 rcu_read_unlock();
566
567 return ret;
544} 568}
545 569
546static void nfs_swap_deactivate(struct file *file) 570static void nfs_swap_deactivate(struct file *file)
547{ 571{
548 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 572 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
573
574 rcu_read_lock();
575 xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
576 rcu_read_unlock();
549} 577}
550#endif 578#endif
551 579
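Both swap hooks now take rcu_read_lock() and fetch clnt->cl_xprt through rcu_dereference() instead of reading the RCU-managed pointer bare, so a concurrent transport swap cannot hand them a stale xprt. The shape of the read-side discipline, sketched with C11 acquire loads standing in for rcu_dereference() (real RCU needs kernel or liburcu support; this only mirrors the load-once-then-use rule):

    #include <stdatomic.h>
    #include <stdio.h>

    struct xprt { const char *name; };

    static _Atomic(struct xprt *) cl_xprt;

    static void reader(void)
    {
        /* one acquire load, then use the snapshot -- never re-read the
         * shared pointer mid-operation */
        struct xprt *x = atomic_load_explicit(&cl_xprt, memory_order_acquire);
        if (x)
            printf("swapper on %s\n", x->name);
    }

    int main(void)
    {
        static struct xprt tcp = { "tcp" };
        atomic_store_explicit(&cl_xprt, &tcp, memory_order_release);
        reader();
        return 0;
    }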
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1359c4a27393..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
265{ 265{
266 266
267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
268 hdr->res.verf->committed == NFS_FILE_SYNC) 268 hdr->res.verf->committed != NFS_DATA_SYNC)
269 return; 269 return;
270 270
271 pnfs_set_layoutcommit(hdr); 271 pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
403 return -EAGAIN; 403 return -EAGAIN;
404 } 404 }
405 405
406 if (data->verf.committed == NFS_UNSTABLE)
407 pnfs_commit_set_layoutcommit(data);
408
406 return 0; 409 return 0;
407} 410}
408 411
@@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
646 } 649 }
647 650
648 /* find and reference the deviceid */ 651 /* find and reference the deviceid */
649 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 652 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
650 NFS_SERVER(lo->plh_inode)->nfs_client, id); 653 lo->plh_lc_cred, gfp_flags);
651 if (d == NULL) { 654 if (d == NULL)
652 dsaddr = filelayout_get_device_info(lo->plh_inode, id, 655 goto out;
653 lo->plh_lc_cred, gfp_flags); 656
654 if (dsaddr == NULL) 657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
655 goto out;
656 } else
657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
658 /* Found deviceid is unavailable */ 658 /* Found deviceid is unavailable */
659 if (filelayout_test_devid_unavailable(&dsaddr->id_node)) 659 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
660 goto out_put; 660 goto out_put;
661 661
662 fl->dsaddr = dsaddr; 662 fl->dsaddr = dsaddr;
663 663
@@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) 1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1270{ 1270{
1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; 1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1272 struct pnfs_commit_bucket *bucket = fl_cinfo->buckets; 1272 struct pnfs_commit_bucket *bucket;
1273 struct pnfs_layout_segment *freeme; 1273 struct pnfs_layout_segment *freeme;
1274 int i; 1274 int i;
1275 1275
1276 for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) { 1276 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1277 bucket = &fl_cinfo->buckets[i];
1277 if (list_empty(&bucket->committing)) 1278 if (list_empty(&bucket->committing))
1278 continue; 1279 continue;
1279 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); 1280 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
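The old loop initialized bucket to &fl_cinfo->buckets[0] but started the index at idx, so whenever idx was nonzero the pointer and the index walked different elements; computing &fl_cinfo->buckets[i] on each iteration keeps them in step. A minimal reproduction of the off-by-idx walk:

    #include <stdio.h>

    int main(void)
    {
        int buckets[4] = { 10, 11, 12, 13 };
        int idx = 2, i;
        int *bucket = buckets;          /* buggy: should be &buckets[idx] */

        for (i = idx; i < 4; i++, bucket++)
            printf("i=%d sees %d (wanted %d)\n", i, *bucket, buckets[i]);
        return 0;
    }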
@@ -1367,6 +1368,17 @@ out:
1367 cinfo->ds->ncommitting = 0; 1368 cinfo->ds->ncommitting = 0;
1368 return PNFS_ATTEMPTED; 1369 return PNFS_ATTEMPTED;
1369} 1370}
1371static struct nfs4_deviceid_node *
1372filelayout_alloc_deviceid_node(struct nfs_server *server,
1373 struct pnfs_device *pdev, gfp_t gfp_flags)
1374{
1375 struct nfs4_file_layout_dsaddr *dsaddr;
1376
1377 dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
1378 if (!dsaddr)
1379 return NULL;
1380 return &dsaddr->id_node;
1381}
1370 1382
1371static void 1383static void
1372filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) 1384filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
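filelayout_alloc_deviceid_node() allocates the outer nfs4_file_layout_dsaddr yet hands back only the embedded id_node; callers such as filelayout_check_layout() recover the outer structure with container_of(). A self-contained demo of the embed-and-recover idiom:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct id_node { int id; };
    struct dsaddr {
        int ds_num;
        struct id_node id_node;         /* embedded, like nfs4_deviceid_node */
    };

    static struct id_node *alloc_node(struct dsaddr *d) { return &d->id_node; }

    int main(void)
    {
        struct dsaddr d = { .ds_num = 4, .id_node = { .id = 7 } };
        struct id_node *n = alloc_node(&d);
        struct dsaddr *back = container_of(n, struct dsaddr, id_node);

        printf("ds_num=%d id=%d\n", back->ds_num, n->id);
        return 0;
    }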
@@ -1419,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1419 .commit_pagelist = filelayout_commit_pagelist, 1431 .commit_pagelist = filelayout_commit_pagelist,
1420 .read_pagelist = filelayout_read_pagelist, 1432 .read_pagelist = filelayout_read_pagelist,
1421 .write_pagelist = filelayout_write_pagelist, 1433 .write_pagelist = filelayout_write_pagelist,
1434 .alloc_deviceid_node = filelayout_alloc_deviceid_node,
1422 .free_deviceid_node = filelayout_free_deveiceid_node, 1435 .free_deviceid_node = filelayout_free_deveiceid_node,
1423}; 1436};
1424 1437
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
149 u32 ds_idx); 149 u32 ds_idx);
150
151extern struct nfs4_file_layout_dsaddr *
152nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
153 struct pnfs_device *pdev, gfp_t gfp_flags);
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 154extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 155extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
155 156
156#endif /* FS_NFS_NFS4FILELAYOUT_H */ 157#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
484} 484}
485 485
486/* Decode opaque device data and return the result */ 486/* Decode opaque device data and return the result */
487static struct nfs4_file_layout_dsaddr* 487struct nfs4_file_layout_dsaddr *
488decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) 488nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
489 gfp_t gfp_flags)
489{ 490{
490 int i; 491 int i;
491 u32 cnt, num; 492 u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
570 dsaddr->stripe_indices = stripe_indices; 571 dsaddr->stripe_indices = stripe_indices;
571 stripe_indices = NULL; 572 stripe_indices = NULL;
572 dsaddr->ds_num = num; 573 dsaddr->ds_num = num;
573 nfs4_init_deviceid_node(&dsaddr->id_node, 574 nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
574 NFS_SERVER(ino)->pnfs_curr_ld,
575 NFS_SERVER(ino)->nfs_client,
576 &pdev->dev_id);
577 575
578 INIT_LIST_HEAD(&dsaddrs); 576 INIT_LIST_HEAD(&dsaddrs);
579 577
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
587 585
588 mp_count = be32_to_cpup(p); /* multipath count */ 586 mp_count = be32_to_cpup(p); /* multipath count */
589 for (j = 0; j < mp_count; j++) { 587 for (j = 0; j < mp_count; j++) {
590 da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, 588 da = decode_ds_addr(server->nfs_client->cl_net,
591 &stream, gfp_flags); 589 &stream, gfp_flags);
592 if (da) 590 if (da)
593 list_add_tail(&da->da_node, &dsaddrs); 591 list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
637 return NULL; 635 return NULL;
638} 636}
639 637
640/*
641 * Decode the opaque device specified in 'dev' and add it to the cache of
642 * available devices.
643 */
644static struct nfs4_file_layout_dsaddr *
645decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646{
647 struct nfs4_deviceid_node *d;
648 struct nfs4_file_layout_dsaddr *n, *new;
649
650 new = decode_device(inode, dev, gfp_flags);
651 if (!new) {
652 printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 __func__);
654 return NULL;
655 }
656
657 d = nfs4_insert_deviceid_node(&new->id_node);
658 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
659 if (n != new) {
660 nfs4_fl_free_deviceid(new);
661 return n;
662 }
663
664 return new;
665}
666
667/*
668 * Retrieve the information for dev_id, add it to the list
669 * of available devices, and return it.
670 */
671struct nfs4_file_layout_dsaddr *
672filelayout_get_device_info(struct inode *inode,
673 struct nfs4_deviceid *dev_id,
674 struct rpc_cred *cred,
675 gfp_t gfp_flags)
676{
677 struct pnfs_device *pdev = NULL;
678 u32 max_resp_sz;
679 int max_pages;
680 struct page **pages = NULL;
681 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
682 int rc, i;
683 struct nfs_server *server = NFS_SERVER(inode);
684
685 /*
686 * Use the session max response size as the basis for setting
687 * GETDEVICEINFO's maxcount
688 */
689 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690 max_pages = nfs_page_array_len(0, max_resp_sz);
691 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
692 __func__, inode, max_resp_sz, max_pages);
693
694 pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 if (pdev == NULL)
696 return NULL;
697
698 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 if (pages == NULL) {
700 kfree(pdev);
701 return NULL;
702 }
703 for (i = 0; i < max_pages; i++) {
704 pages[i] = alloc_page(gfp_flags);
705 if (!pages[i])
706 goto out_free;
707 }
708
709 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
710 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
711 pdev->pages = pages;
712 pdev->pgbase = 0;
713 pdev->pglen = max_resp_sz;
714 pdev->mincount = 0;
715 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716
717 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 dprintk("%s getdevice info returns %d\n", __func__, rc);
719 if (rc)
720 goto out_free;
721
722 /*
723 * Found new device, need to decode it and then add it to the
724 * list of known devices for this mountpoint.
725 */
726 dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727out_free:
728 for (i = 0; i < max_pages; i++)
729 __free_page(pages[i]);
730 kfree(pages);
731 kfree(pdev);
732 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
733 return dsaddr;
734}
735
736void 638void
737nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 639nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738{ 640{
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
74 struct nfs_server_key *key = buffer; 74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key); 75 uint16_t len = sizeof(struct nfs_server_key);
76 76
77 memset(key, 0, len);
77 key->nfsversion = clp->rpc_ops->version; 78 key->nfsversion = clp->rpc_ops->version;
78 key->family = clp->cl_addr.ss_family; 79 key->family = clp->cl_addr.ss_family;
79 80
80 memset(key, 0, len);
81
82 switch (clp->cl_addr.ss_family) { 81 switch (clp->cl_addr.ss_family) {
83 case AF_INET: 82 case AF_INET:
84 key->port = sin->sin_port; 83 key->port = sin->sin_port;
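The fix is purely an ordering one: the key was being populated and then zeroed, so nfsversion and family were wiped before the switch on the address family ran. A small reproduction of why the memset() must come first:

    #include <stdio.h>
    #include <string.h>

    struct key { unsigned char version, family; };

    int main(void)
    {
        struct key k;

        k.version = 4;                  /* buggy order: assign ... */
        k.family = 2;
        memset(&k, 0, sizeof(k));       /* ... then wipe everything */
        printf("version=%u family=%u\n", k.version, k.family);  /* 0 0 */

        memset(&k, 0, sizeof(k));       /* fixed order: wipe first */
        k.version = 4;
        k.family = 2;
        printf("version=%u family=%u\n", k.version, k.family);  /* 4 2 */
        return 0;
    }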
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
505 attr->ia_valid &= ~ATTR_MODE; 505 attr->ia_valid &= ~ATTR_MODE;
506 506
507 if (attr->ia_valid & ATTR_SIZE) { 507 if (attr->ia_valid & ATTR_SIZE) {
508 if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) 508 BUG_ON(!S_ISREG(inode->i_mode));
509
510 if (attr->ia_size == i_size_read(inode))
509 attr->ia_valid &= ~ATTR_SIZE; 511 attr->ia_valid &= ~ATTR_SIZE;
510 } 512 }
511 513
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 94d922ebb5ac..efaa31c70fbe 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); 218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
219#endif 219#endif
220 220
221/* nfs3client.c */
222#if IS_ENABLED(CONFIG_NFS_V3)
223struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
224struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
225 struct nfs_fattr *, rpc_authflavor_t);
226#endif
227
228/* callback_xdr.c */ 221/* callback_xdr.c */
229extern struct svc_version nfs4_callback_version1; 222extern struct svc_version nfs4_callback_version1;
230extern struct svc_version nfs4_callback_version4; 223extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) 2014 Anna Schumaker.
3 *
4 * NFSv3-specific filesystem definitions and declarations
5 */
6#ifndef __LINUX_FS_NFS_NFS3_FS_H
7#define __LINUX_FS_NFS_NFS3_FS_H
8
9/*
10 * nfs3acl.c
11 */
12#ifdef CONFIG_NFS_V3_ACL
13extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
14extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
15extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
16 struct posix_acl *dfacl);
17extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
18extern const struct xattr_handler *nfs3_xattr_handlers[];
19#else
20static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
21 struct posix_acl *dfacl)
22{
23 return 0;
24}
25#define nfs3_listxattr NULL
26#endif /* CONFIG_NFS_V3_ACL */
27
28/* nfs3client.c */
29struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
30struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
31 struct nfs_fattr *, rpc_authflavor_t);
32
33
34#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index d0fec260132a..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
7#include <linux/nfsacl.h> 7#include <linux/nfsacl.h>
8 8
9#include "internal.h" 9#include "internal.h"
10#include "nfs3_fs.h"
10 11
11#define NFSDBG_FACILITY NFSDBG_PROC 12#define NFSDBG_FACILITY NFSDBG_PROC
12 13
@@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
129 .rpc_argp = &args, 130 .rpc_argp = &args,
130 .rpc_resp = &fattr, 131 .rpc_resp = &fattr,
131 }; 132 };
132 int status; 133 int status = 0;
134
135 if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
136 goto out;
133 137
134 status = -EOPNOTSUPP; 138 status = -EOPNOTSUPP;
135 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 139 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
1#include <linux/nfs_fs.h> 1#include <linux/nfs_fs.h>
2#include <linux/nfs_mount.h> 2#include <linux/nfs_mount.h>
3#include "internal.h" 3#include "internal.h"
4#include "nfs3_fs.h"
4 5
5#ifdef CONFIG_NFS_V3_ACL 6#ifdef CONFIG_NFS_V3_ACL
6static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; 7static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
22 22
23#include "iostat.h" 23#include "iostat.h"
24#include "internal.h" 24#include "internal.h"
25#include "nfs3_fs.h"
25 26
26#define NFSDBG_FACILITY NFSDBG_PROC 27#define NFSDBG_FACILITY NFSDBG_PROC
27 28
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
6#include "internal.h" 6#include "internal.h"
7#include "nfs3_fs.h"
7#include "nfs.h" 8#include "nfs.h"
8 9
9static struct nfs_subversion nfs_v3 = { 10static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 92193eddb41d..a8b855ab4e22 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -130,16 +130,15 @@ enum {
130 */ 130 */
131 131
132struct nfs4_lock_state { 132struct nfs4_lock_state {
133 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
134 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
135#define NFS_LOCK_INITIALIZED 0 135#define NFS_LOCK_INITIALIZED 0
136#define NFS_LOCK_LOST 1 136#define NFS_LOCK_LOST 1
137 unsigned long ls_flags; 137 unsigned long ls_flags;
138 struct nfs_seqid_counter ls_seqid; 138 struct nfs_seqid_counter ls_seqid;
139 nfs4_stateid ls_stateid; 139 nfs4_stateid ls_stateid;
140 atomic_t ls_count; 140 atomic_t ls_count;
141 fl_owner_t ls_owner; 141 fl_owner_t ls_owner;
142 struct work_struct ls_release;
143}; 142};
144 143
145/* bits for nfs4_state->flags */ 144/* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 53e435a95260..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
482 482
483 spin_lock(&nn->nfs_client_lock); 483 spin_lock(&nn->nfs_client_lock);
484 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { 484 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
485
486 if (pos->rpc_ops != new->rpc_ops)
487 continue;
488
489 if (pos->cl_proto != new->cl_proto)
490 continue;
491
492 if (pos->cl_minorversion != new->cl_minorversion)
493 continue;
494
485 /* If "pos" isn't marked ready, we can't trust the 495 /* If "pos" isn't marked ready, we can't trust the
486 * remaining fields in "pos" */ 496 * remaining fields in "pos" */
487 if (pos->cl_cons_state > NFS_CS_READY) { 497 if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
501 if (pos->cl_cons_state != NFS_CS_READY) 511 if (pos->cl_cons_state != NFS_CS_READY)
502 continue; 512 continue;
503 513
504 if (pos->rpc_ops != new->rpc_ops)
505 continue;
506
507 if (pos->cl_proto != new->cl_proto)
508 continue;
509
510 if (pos->cl_minorversion != new->cl_minorversion)
511 continue;
512
513 if (pos->cl_clientid != new->cl_clientid) 514 if (pos->cl_clientid != new->cl_clientid)
514 continue; 515 continue;
515 516
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
622 623
623 spin_lock(&nn->nfs_client_lock); 624 spin_lock(&nn->nfs_client_lock);
624 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { 625 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
626
627 if (pos->rpc_ops != new->rpc_ops)
628 continue;
629
630 if (pos->cl_proto != new->cl_proto)
631 continue;
632
633 if (pos->cl_minorversion != new->cl_minorversion)
634 continue;
635
625 /* If "pos" isn't marked ready, we can't trust the 636 /* If "pos" isn't marked ready, we can't trust the
626 * remaining fields in "pos", especially the client 637 * remaining fields in "pos", especially the client
627 * ID and serverowner fields. Wait for CREATE_SESSION 638 * ID and serverowner fields. Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
647 if (pos->cl_cons_state != NFS_CS_READY) 658 if (pos->cl_cons_state != NFS_CS_READY)
648 continue; 659 continue;
649 660
650 if (pos->rpc_ops != new->rpc_ops)
651 continue;
652
653 if (pos->cl_proto != new->cl_proto)
654 continue;
655
656 if (pos->cl_minorversion != new->cl_minorversion)
657 continue;
658
659 if (!nfs4_match_clientids(pos, new)) 661 if (!nfs4_match_clientids(pos, new))
660 continue; 662 continue;
661 663
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75ae8d22f067..5aa55c132aa2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
77static int _nfs4_proc_open(struct nfs4_opendata *data); 77static int _nfs4_proc_open(struct nfs4_opendata *data);
78static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 78static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); 82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); 83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
314 kunmap_atomic(start); 314 kunmap_atomic(start);
315} 315}
316 316
317static long nfs4_update_delay(long *timeout)
318{
319 long ret;
320 if (!timeout)
321 return NFS4_POLL_RETRY_MAX;
322 if (*timeout <= 0)
323 *timeout = NFS4_POLL_RETRY_MIN;
324 if (*timeout > NFS4_POLL_RETRY_MAX)
325 *timeout = NFS4_POLL_RETRY_MAX;
326 ret = *timeout;
327 *timeout <<= 1;
328 return ret;
329}
330
317static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) 331static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
318{ 332{
319 int res = 0; 333 int res = 0;
320 334
321 might_sleep(); 335 might_sleep();
322 336
323 if (*timeout <= 0) 337 freezable_schedule_timeout_killable_unsafe(
324 *timeout = NFS4_POLL_RETRY_MIN; 338 nfs4_update_delay(timeout));
325 if (*timeout > NFS4_POLL_RETRY_MAX)
326 *timeout = NFS4_POLL_RETRY_MAX;
327 freezable_schedule_timeout_killable_unsafe(*timeout);
328 if (fatal_signal_pending(current)) 339 if (fatal_signal_pending(current))
329 res = -ERESTARTSYS; 340 res = -ERESTARTSYS;
330 *timeout <<= 1;
331 return res; 341 return res;
332} 342}
333 343
@@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1307 int ret = -EAGAIN; 1317 int ret = -EAGAIN;
1308 1318
1309 for (;;) { 1319 for (;;) {
1320 spin_lock(&state->owner->so_lock);
1310 if (can_open_cached(state, fmode, open_mode)) { 1321 if (can_open_cached(state, fmode, open_mode)) {
1311 spin_lock(&state->owner->so_lock); 1322 update_open_stateflags(state, fmode);
1312 if (can_open_cached(state, fmode, open_mode)) {
1313 update_open_stateflags(state, fmode);
1314 spin_unlock(&state->owner->so_lock);
1315 goto out_return_state;
1316 }
1317 spin_unlock(&state->owner->so_lock); 1323 spin_unlock(&state->owner->so_lock);
1324 goto out_return_state;
1318 } 1325 }
1326 spin_unlock(&state->owner->so_lock);
1319 rcu_read_lock(); 1327 rcu_read_lock();
1320 delegation = rcu_dereference(nfsi->delegation); 1328 delegation = rcu_dereference(nfsi->delegation);
1321 if (!can_open_delegated(delegation, fmode)) { 1329 if (!can_open_delegated(delegation, fmode)) {
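The simplification above drops the unlocked peek at can_open_cached() followed by a locked re-check: taking so_lock first leaves a single authoritative test, at the cost of briefly holding the lock on the miss path. The same check-under-lock shape in runnable form:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t so_lock = PTHREAD_MUTEX_INITIALIZER;
    static int n_opens;

    static int try_open_cached(void)
    {
        int ok;

        pthread_mutex_lock(&so_lock);
        ok = (n_opens > 0);             /* the only check that counts */
        if (ok)
            n_opens++;                  /* update_open_stateflags() analogue */
        pthread_mutex_unlock(&so_lock);
        return ok;
    }

    int main(void)
    {
        n_opens = 1;
        printf("cached open %s\n", try_open_cached() ? "hit" : "miss");
        return 0;
    }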
@@ -2226,9 +2234,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2226 ret = _nfs4_proc_open(opendata); 2234 ret = _nfs4_proc_open(opendata);
2227 if (ret != 0) { 2235 if (ret != 0) {
2228 if (ret == -ENOENT) { 2236 if (ret == -ENOENT) {
2229 d_drop(opendata->dentry); 2237 dentry = opendata->dentry;
2230 d_add(opendata->dentry, NULL); 2238 if (dentry->d_inode)
2231 nfs_set_verifier(opendata->dentry, 2239 d_delete(dentry);
2240 else if (d_unhashed(dentry))
2241 d_add(dentry, NULL);
2242
2243 nfs_set_verifier(dentry,
2232 nfs_save_change_attribute(opendata->dir->d_inode)); 2244 nfs_save_change_attribute(opendata->dir->d_inode));
2233 } 2245 }
2234 goto out; 2246 goto out;
@@ -2560,6 +2572,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2560 struct nfs4_closedata *calldata = data; 2572 struct nfs4_closedata *calldata = data;
2561 struct nfs4_state *state = calldata->state; 2573 struct nfs4_state *state = calldata->state;
2562 struct nfs_server *server = NFS_SERVER(calldata->inode); 2574 struct nfs_server *server = NFS_SERVER(calldata->inode);
2575 nfs4_stateid *res_stateid = NULL;
2563 2576
2564 dprintk("%s: begin!\n", __func__); 2577 dprintk("%s: begin!\n", __func__);
2565 if (!nfs4_sequence_done(task, &calldata->res.seq_res)) 2578 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -2570,12 +2583,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2570 */ 2583 */
2571 switch (task->tk_status) { 2584 switch (task->tk_status) {
2572 case 0: 2585 case 0:
2573 if (calldata->roc) 2586 res_stateid = &calldata->res.stateid;
2587 if (calldata->arg.fmode == 0 && calldata->roc)
2574 pnfs_roc_set_barrier(state->inode, 2588 pnfs_roc_set_barrier(state->inode,
2575 calldata->roc_barrier); 2589 calldata->roc_barrier);
2576 nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
2577 renew_lease(server, calldata->timestamp); 2590 renew_lease(server, calldata->timestamp);
2578 goto out_release; 2591 break;
2579 case -NFS4ERR_ADMIN_REVOKED: 2592 case -NFS4ERR_ADMIN_REVOKED:
2580 case -NFS4ERR_STALE_STATEID: 2593 case -NFS4ERR_STALE_STATEID:
2581 case -NFS4ERR_OLD_STATEID: 2594 case -NFS4ERR_OLD_STATEID:
@@ -2584,12 +2597,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2584 if (calldata->arg.fmode == 0) 2597 if (calldata->arg.fmode == 0)
2585 break; 2598 break;
2586 default: 2599 default:
2587 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { 2600 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
2588 rpc_restart_call_prepare(task); 2601 rpc_restart_call_prepare(task);
2589 goto out_release; 2602 goto out_release;
2590 } 2603 }
2591 } 2604 }
2592 nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); 2605 nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
2593out_release: 2606out_release:
2594 nfs_release_seqid(calldata->arg.seqid); 2607 nfs_release_seqid(calldata->arg.seqid);
2595 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2608 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2601,6 +2614,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2601 struct nfs4_closedata *calldata = data; 2614 struct nfs4_closedata *calldata = data;
2602 struct nfs4_state *state = calldata->state; 2615 struct nfs4_state *state = calldata->state;
2603 struct inode *inode = calldata->inode; 2616 struct inode *inode = calldata->inode;
2617 bool is_rdonly, is_wronly, is_rdwr;
2604 int call_close = 0; 2618 int call_close = 0;
2605 2619
2606 dprintk("%s: begin!\n", __func__); 2620 dprintk("%s: begin!\n", __func__);
@@ -2608,21 +2622,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2608 goto out_wait; 2622 goto out_wait;
2609 2623
2610 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 2624 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
2611 calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
2612 spin_lock(&state->owner->so_lock); 2625 spin_lock(&state->owner->so_lock);
2626 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2627 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2628 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2613 /* Calculate the change in open mode */ 2629 /* Calculate the change in open mode */
2630 calldata->arg.fmode = 0;
2614 if (state->n_rdwr == 0) { 2631 if (state->n_rdwr == 0) {
2615 if (state->n_rdonly == 0) { 2632 if (state->n_rdonly == 0)
2616 call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); 2633 call_close |= is_rdonly;
2617 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); 2634 else if (is_rdonly)
2618 calldata->arg.fmode &= ~FMODE_READ; 2635 calldata->arg.fmode |= FMODE_READ;
2619 } 2636 if (state->n_wronly == 0)
2620 if (state->n_wronly == 0) { 2637 call_close |= is_wronly;
2621 call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); 2638 else if (is_wronly)
2622 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); 2639 calldata->arg.fmode |= FMODE_WRITE;
2623 calldata->arg.fmode &= ~FMODE_WRITE; 2640 } else if (is_rdwr)
2624 } 2641 calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
2625 } 2642
2643 if (calldata->arg.fmode == 0)
2644 call_close |= is_rdwr;
2645
2626 if (!nfs4_valid_open_stateid(state)) 2646 if (!nfs4_valid_open_stateid(state))
2627 call_close = 0; 2647 call_close = 0;
2628 spin_unlock(&state->owner->so_lock); 2648 spin_unlock(&state->owner->so_lock);
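The rewritten nfs4_close_prepare() snapshots the three open-state bits under so_lock, then builds the downgrade mode additively: each mode with remaining opens contributes its FMODE bit, and an empty fmode combined with a held read-write stateid turns the OPEN_DOWNGRADE into a full CLOSE. The decision logic, lifted into a standalone sketch:

    #include <stdio.h>

    #define FMODE_READ  1
    #define FMODE_WRITE 2

    static int downgrade(int n_rdonly, int n_wronly, int n_rdwr,
                         int is_rdonly, int is_wronly, int is_rdwr,
                         int *call_close)
    {
        int fmode = 0;

        *call_close = 0;
        if (n_rdwr == 0) {
            if (n_rdonly == 0)
                *call_close |= is_rdonly;
            else if (is_rdonly)
                fmode |= FMODE_READ;
            if (n_wronly == 0)
                *call_close |= is_wronly;
            else if (is_wronly)
                fmode |= FMODE_WRITE;
        } else if (is_rdwr)
            fmode |= FMODE_READ | FMODE_WRITE;
        if (fmode == 0)
            *call_close |= is_rdwr;
        return fmode;
    }

    int main(void)
    {
        int close;
        /* readers remain, writers gone: downgrade to READ, close needed */
        printf("fmode=%d close=%d\n",
               downgrade(1, 0, 0, 1, 1, 1, &close), close);
        return 0;
    }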
@@ -3205,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
3205 struct nfs4_label *label = NULL; 3225 struct nfs4_label *label = NULL;
3206 int status; 3226 int status;
3207 3227
3208 if (pnfs_ld_layoutret_on_setattr(inode)) 3228 if (pnfs_ld_layoutret_on_setattr(inode) &&
3229 sattr->ia_valid & ATTR_SIZE &&
3230 sattr->ia_size < i_size_read(inode))
3209 pnfs_commit_and_return_layout(inode); 3231 pnfs_commit_and_return_layout(inode);
3210 3232
3211 nfs_fattr_init(fattr); 3233 nfs_fattr_init(fattr);
@@ -3564,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
3564 3586
3565 if (!nfs4_sequence_done(task, &res->seq_res)) 3587 if (!nfs4_sequence_done(task, &res->seq_res))
3566 return 0; 3588 return 0;
3567 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3589 if (nfs4_async_handle_error(task, res->server, NULL,
3590 &data->timeout) == -EAGAIN)
3568 return 0; 3591 return 0;
3569 update_changeattr(dir, &res->cinfo); 3592 update_changeattr(dir, &res->cinfo);
3570 return 1; 3593 return 1;
@@ -3597,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3597 3620
3598 if (!nfs4_sequence_done(task, &res->seq_res)) 3621 if (!nfs4_sequence_done(task, &res->seq_res))
3599 return 0; 3622 return 0;
3600 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3623 if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
3601 return 0; 3624 return 0;
3602 3625
3603 update_changeattr(old_dir, &res->old_cinfo); 3626 update_changeattr(old_dir, &res->old_cinfo);
@@ -4101,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
4101 4124
4102 trace_nfs4_read(hdr, task->tk_status); 4125 trace_nfs4_read(hdr, task->tk_status);
4103 if (nfs4_async_handle_error(task, server, 4126 if (nfs4_async_handle_error(task, server,
4104 hdr->args.context->state) == -EAGAIN) { 4127 hdr->args.context->state,
4128 NULL) == -EAGAIN) {
4105 rpc_restart_call_prepare(task); 4129 rpc_restart_call_prepare(task);
4106 return -EAGAIN; 4130 return -EAGAIN;
4107 } 4131 }
@@ -4169,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
4169 struct nfs_pgio_header *hdr) 4193 struct nfs_pgio_header *hdr)
4170{ 4194{
4171 struct inode *inode = hdr->inode; 4195 struct inode *inode = hdr->inode;
4172 4196
4173 trace_nfs4_write(hdr, task->tk_status); 4197 trace_nfs4_write(hdr, task->tk_status);
4174 if (nfs4_async_handle_error(task, NFS_SERVER(inode), 4198 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4175 hdr->args.context->state) == -EAGAIN) { 4199 hdr->args.context->state,
4200 NULL) == -EAGAIN) {
4176 rpc_restart_call_prepare(task); 4201 rpc_restart_call_prepare(task);
4177 return -EAGAIN; 4202 return -EAGAIN;
4178 } 4203 }
@@ -4252,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
4252 struct inode *inode = data->inode; 4277 struct inode *inode = data->inode;
4253 4278
4254 trace_nfs4_commit(data, task->tk_status); 4279 trace_nfs4_commit(data, task->tk_status);
4255 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 4280 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4281 NULL, NULL) == -EAGAIN) {
4256 rpc_restart_call_prepare(task); 4282 rpc_restart_call_prepare(task);
4257 return -EAGAIN; 4283 return -EAGAIN;
4258 } 4284 }
@@ -4805,7 +4831,8 @@ out:
4805 4831
4806 4832
4807static int 4833static int
4808nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 4834nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4835 struct nfs4_state *state, long *timeout)
4809{ 4836{
4810 struct nfs_client *clp = server->nfs_client; 4837 struct nfs_client *clp = server->nfs_client;
4811 4838
@@ -4855,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4855#endif /* CONFIG_NFS_V4_1 */ 4882#endif /* CONFIG_NFS_V4_1 */
4856 case -NFS4ERR_DELAY: 4883 case -NFS4ERR_DELAY:
4857 nfs_inc_server_stats(server, NFSIOS_DELAY); 4884 nfs_inc_server_stats(server, NFSIOS_DELAY);
4885 rpc_delay(task, nfs4_update_delay(timeout));
4886 goto restart_call;
4858 case -NFS4ERR_GRACE: 4887 case -NFS4ERR_GRACE:
4859 rpc_delay(task, NFS4_POLL_RETRY_MAX); 4888 rpc_delay(task, NFS4_POLL_RETRY_MAX);
4860 case -NFS4ERR_RETRY_UNCACHED_REP: 4889 case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -5095,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5095 pnfs_roc_set_barrier(data->inode, data->roc_barrier); 5124 pnfs_roc_set_barrier(data->inode, data->roc_barrier);
5096 break; 5125 break;
5097 default: 5126 default:
5098 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5127 if (nfs4_async_handle_error(task, data->res.server,
5099 -EAGAIN) { 5128 NULL, NULL) == -EAGAIN) {
5100 rpc_restart_call_prepare(task); 5129 rpc_restart_call_prepare(task);
5101 return; 5130 return;
5102 } 5131 }
@@ -5360,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5360 case -NFS4ERR_EXPIRED: 5389 case -NFS4ERR_EXPIRED:
5361 break; 5390 break;
5362 default: 5391 default:
5363 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 5392 if (nfs4_async_handle_error(task, calldata->server,
5393 NULL, NULL) == -EAGAIN)
5364 rpc_restart_call_prepare(task); 5394 rpc_restart_call_prepare(task);
5365 } 5395 }
5366 nfs_release_seqid(calldata->arg.seqid); 5396 nfs_release_seqid(calldata->arg.seqid);
@@ -5966,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5966 break; 5996 break;
5967 case -NFS4ERR_LEASE_MOVED: 5997 case -NFS4ERR_LEASE_MOVED:
5968 case -NFS4ERR_DELAY: 5998 case -NFS4ERR_DELAY:
5969 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) 5999 if (nfs4_async_handle_error(task, server,
6000 NULL, NULL) == -EAGAIN)
5970 rpc_restart_call_prepare(task); 6001 rpc_restart_call_prepare(task);
5971 } 6002 }
5972} 6003}
@@ -7341,7 +7372,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
7341 int ret = 0; 7372 int ret = 0;
7342 7373
7343 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) 7374 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
7344 return 0; 7375 return -EAGAIN;
7345 task = _nfs41_proc_sequence(clp, cred, false); 7376 task = _nfs41_proc_sequence(clp, cred, false);
7346 if (IS_ERR(task)) 7377 if (IS_ERR(task))
7347 ret = PTR_ERR(task); 7378 ret = PTR_ERR(task);
@@ -7571,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7571 } else { 7602 } else {
7572 LIST_HEAD(head); 7603 LIST_HEAD(head);
7573 7604
7605 /*
7606 * Mark the bad layout state as invalid, then retry
7607 * with the current stateid.
7608 */
7574 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); 7609 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
7575 spin_unlock(&inode->i_lock); 7610 spin_unlock(&inode->i_lock);
7576 /* Mark the bad layout state as invalid, then
7577 * retry using the open stateid. */
7578 pnfs_free_lseg_list(&head); 7611 pnfs_free_lseg_list(&head);
7612
7613 task->tk_status = 0;
7614 rpc_restart_call_prepare(task);
7579 } 7615 }
7580 } 7616 }
7581 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) 7617 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
7582 rpc_restart_call_prepare(task); 7618 rpc_restart_call_prepare(task);
7583out: 7619out:
7584 dprintk("<-- %s\n", __func__); 7620 dprintk("<-- %s\n", __func__);
@@ -7738,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
7738 case 0: 7774 case 0:
7739 break; 7775 break;
7740 case -NFS4ERR_DELAY: 7776 case -NFS4ERR_DELAY:
7741 if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) 7777 if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
7742 break; 7778 break;
7743 rpc_restart_call_prepare(task); 7779 rpc_restart_call_prepare(task);
7744 return; 7780 return;
@@ -7797,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7797 return status; 7833 return status;
7798} 7834}
7799 7835
7800/*
7801 * Retrieve the list of Data Server devices from the MDS.
7802 */
7803static int _nfs4_getdevicelist(struct nfs_server *server,
7804 const struct nfs_fh *fh,
7805 struct pnfs_devicelist *devlist)
7806{
7807 struct nfs4_getdevicelist_args args = {
7808 .fh = fh,
7809 .layoutclass = server->pnfs_curr_ld->id,
7810 };
7811 struct nfs4_getdevicelist_res res = {
7812 .devlist = devlist,
7813 };
7814 struct rpc_message msg = {
7815 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
7816 .rpc_argp = &args,
7817 .rpc_resp = &res,
7818 };
7819 int status;
7820
7821 dprintk("--> %s\n", __func__);
7822 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
7823 &res.seq_res, 0);
7824 dprintk("<-- %s status=%d\n", __func__, status);
7825 return status;
7826}
7827
7828int nfs4_proc_getdevicelist(struct nfs_server *server,
7829 const struct nfs_fh *fh,
7830 struct pnfs_devicelist *devlist)
7831{
7832 struct nfs4_exception exception = { };
7833 int err;
7834
7835 do {
7836 err = nfs4_handle_exception(server,
7837 _nfs4_getdevicelist(server, fh, devlist),
7838 &exception);
7839 } while (exception.retry);
7840
7841 dprintk("%s: err=%d, num_devs=%u\n", __func__,
7842 err, devlist->num_devs);
7843
7844 return err;
7845}
7846EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
7847
7848static int 7836static int
7849_nfs4_proc_getdeviceinfo(struct nfs_server *server, 7837_nfs4_proc_getdeviceinfo(struct nfs_server *server,
7850 struct pnfs_device *pdev, 7838 struct pnfs_device *pdev,
@@ -7917,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
7917 case 0: 7905 case 0:
7918 break; 7906 break;
7919 default: 7907 default:
7920 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7908 if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
7921 rpc_restart_call_prepare(task); 7909 rpc_restart_call_prepare(task);
7922 return; 7910 return;
7923 } 7911 }
@@ -8213,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
8213 8201
8214 switch (task->tk_status) { 8202 switch (task->tk_status) {
8215 case -NFS4ERR_DELAY: 8203 case -NFS4ERR_DELAY:
8216 if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) 8204 if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
8217 rpc_restart_call_prepare(task); 8205 rpc_restart_call_prepare(task);
8218 } 8206 }
8219} 8207}
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
88 } 88 }
89 nfs_expire_all_delegations(clp); 89 nfs_expire_all_delegations(clp);
90 } else { 90 } else {
91 int ret;
92
91 /* Queue an asynchronous RENEW. */ 93 /* Queue an asynchronous RENEW. */
92 ops->sched_state_renewal(clp, cred, renew_flags); 94 ret = ops->sched_state_renewal(clp, cred, renew_flags);
93 put_rpccred(cred); 95 put_rpccred(cred);
94 goto out_exp; 96 switch (ret) {
97 default:
98 goto out_exp;
99 case -EAGAIN:
100 case -ENOMEM:
101 break;
102 }
95 } 103 }
96 } else { 104 } else {
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 105 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a043f618cd5a..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
799 return NULL; 799 return NULL;
800} 800}
801 801
802static void
803free_lock_state_work(struct work_struct *work)
804{
805 struct nfs4_lock_state *lsp = container_of(work,
806 struct nfs4_lock_state, ls_release);
807 struct nfs4_state *state = lsp->ls_state;
808 struct nfs_server *server = state->owner->so_server;
809 struct nfs_client *clp = server->nfs_client;
810
811 clp->cl_mvops->free_lock_state(server, lsp);
812}
813
814/* 802/*
815 * Return a compatible lock_state. If no initialized lock_state structure 803 * Return a compatible lock_state. If no initialized lock_state structure
816 * exists, return an uninitialized one. 804 * exists, return an uninitialized one.
@@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
832 if (lsp->ls_seqid.owner_id < 0) 820 if (lsp->ls_seqid.owner_id < 0)
833 goto out_free; 821 goto out_free;
834 INIT_LIST_HEAD(&lsp->ls_locks); 822 INIT_LIST_HEAD(&lsp->ls_locks);
835 INIT_WORK(&lsp->ls_release, free_lock_state_work);
836 return lsp; 823 return lsp;
837out_free: 824out_free:
838 kfree(lsp); 825 kfree(lsp);
@@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
896 if (list_empty(&state->lock_states)) 883 if (list_empty(&state->lock_states))
897 clear_bit(LK_STATE_IN_USE, &state->flags); 884 clear_bit(LK_STATE_IN_USE, &state->flags);
898 spin_unlock(&state->state_lock); 885 spin_unlock(&state->state_lock);
899 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) 886 server = state->owner->so_server;
900 queue_work(nfsiod_workqueue, &lsp->ls_release); 887 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
901 else { 888 struct nfs_client *clp = server->nfs_client;
902 server = state->owner->so_server; 889
890 clp->cl_mvops->free_lock_state(server, lsp);
891 } else
903 nfs4_free_lock_state(server, lsp); 892 nfs4_free_lock_state(server, lsp);
904 }
905} 893}
906 894
907static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 895static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -1717,7 +1705,8 @@ restart:
1717 if (status < 0) { 1705 if (status < 0) {
1718 set_bit(ops->owner_flag_bit, &sp->so_flags); 1706 set_bit(ops->owner_flag_bit, &sp->so_flags);
1719 nfs4_put_state_owner(sp); 1707 nfs4_put_state_owner(sp);
1720 return nfs4_recovery_handle_error(clp, status); 1708 status = nfs4_recovery_handle_error(clp, status);
1709 return (status != 0) ? status : -EAGAIN;
1721 } 1710 }
1722 1711
1723 nfs4_put_state_owner(sp); 1712 nfs4_put_state_owner(sp);
@@ -1726,7 +1715,7 @@ restart:
1726 spin_unlock(&clp->cl_lock); 1715 spin_unlock(&clp->cl_lock);
1727 } 1716 }
1728 rcu_read_unlock(); 1717 rcu_read_unlock();
1729 return status; 1718 return 0;
1730} 1719}
1731 1720
1732static int nfs4_check_lease(struct nfs_client *clp) 1721static int nfs4_check_lease(struct nfs_client *clp)
@@ -1773,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1773 break; 1762 break;
1774 case -NFS4ERR_STALE_CLIENTID: 1763 case -NFS4ERR_STALE_CLIENTID:
1775 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 1764 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
1776 nfs4_state_clear_reclaim_reboot(clp);
1777 nfs4_state_start_reclaim_reboot(clp); 1765 nfs4_state_start_reclaim_reboot(clp);
1778 break; 1766 break;
1779 case -NFS4ERR_CLID_INUSE: 1767 case -NFS4ERR_CLID_INUSE:
@@ -2357,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2357 status = nfs4_check_lease(clp); 2345 status = nfs4_check_lease(clp);
2358 if (status < 0) 2346 if (status < 0)
2359 goto out_error; 2347 goto out_error;
2348 continue;
2360 } 2349 }
2361 2350
2362 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { 2351 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2378,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
2378 section = "reclaim reboot"; 2367 section = "reclaim reboot";
2379 status = nfs4_do_reclaim(clp, 2368 status = nfs4_do_reclaim(clp,
2380 clp->cl_mvops->reboot_recovery_ops); 2369 clp->cl_mvops->reboot_recovery_ops);
2381 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 2370 if (status == -EAGAIN)
2382 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
2383 continue;
2384 nfs4_state_end_reclaim_reboot(clp);
2385 if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
2386 continue; 2371 continue;
2387 if (status < 0) 2372 if (status < 0)
2388 goto out_error; 2373 goto out_error;
2374 nfs4_state_end_reclaim_reboot(clp);
2389 } 2375 }
2390 2376
2391 /* Now recover expired state... */ 2377 /* Now recover expired state... */
@@ -2393,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2393 section = "reclaim nograce"; 2379 section = "reclaim nograce";
2394 status = nfs4_do_reclaim(clp, 2380 status = nfs4_do_reclaim(clp,
2395 clp->cl_mvops->nograce_recovery_ops); 2381 clp->cl_mvops->nograce_recovery_ops);
2396 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 2382 if (status == -EAGAIN)
2397 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
2398 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
2399 continue; 2383 continue;
2400 if (status < 0) 2384 if (status < 0)
2401 goto out_error; 2385 goto out_error;
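With nfs4_do_reclaim() now folding every "state changed underneath me" case into -EAGAIN, the state manager loop no longer re-tests individual NFS4CLNT_* bits after each phase: -EAGAIN restarts the machine, any other negative status is fatal, and success falls through to nfs4_state_end_reclaim_reboot(). The resulting control flow, as a sketch:

    #include <errno.h>
    #include <stdio.h>

    static int attempts;

    static int do_reclaim(void)
    {
        return ++attempts < 3 ? -EAGAIN : 0;    /* succeed on 3rd pass */
    }

    int main(void)
    {
        for (;;) {
            int status = do_reclaim();

            if (status == -EAGAIN)
                continue;               /* state moved, go around */
            if (status < 0)
                return 1;               /* out_error */
            break;                      /* reclaim complete */
        }
        printf("reclaimed after %d attempts\n", attempts);
        return 0;
    }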
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
365#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ 365#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
366 encode_verifier_maxsz) 366 XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
367#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ 367 1 /* layout type */ + \
368 2 /* nfs_cookie4 gdlr_cookie */ + \ 368 1 /* maxcount */ + \
369 decode_verifier_maxsz \ 369 1 /* bitmap size */ + \
370 /* verifier4 gdlr_verifier */ + \ 370 1 /* notification bitmap length */ + \
371 1 /* gdlr_deviceid_list count */ + \ 371 1 /* notification bitmap, word 0 */)
372 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
373 NFS4_DEVICEID4_SIZE) \
374 /* gdlr_deviceid_list */ + \
375 1 /* bool gdlr_eof */)
376#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
377 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
378#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 372#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
379 1 /* layout type */ + \ 373 1 /* layout type */ + \
380 1 /* opaque devaddr4 length */ + \ 374 1 /* opaque devaddr4 length */ + \
381 /* devaddr4 payload is read into page */ \ 375 /* devaddr4 payload is read into page */ \
382 1 /* notification bitmap length */ + \ 376 1 /* notification bitmap length */ + \
383 1 /* notification bitmap */) 377 1 /* notification bitmap, word 0 */)
384#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ 378#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
385 encode_stateid_maxsz) 379 encode_stateid_maxsz)
386#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 380#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
395 2 /* last byte written */ + \ 389 2 /* last byte written */ + \
396 1 /* nt_timechanged (false) */ + \ 390 1 /* nt_timechanged (false) */ + \
397 1 /* layoutupdate4 layout type */ + \ 391 1 /* layoutupdate4 layout type */ + \
398 1 /* NULL filelayout layoutupdate4 payload */) 392 1 /* layoutupdate4 opaqueue len */)
393 /* the actual content of layoutupdate4 should
394 be allocated by drivers and spliced in
395 using xdr_write_pages */
399#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 396#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
400#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ 397#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
401 encode_stateid_maxsz + \ 398 encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
809#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 806#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
810 decode_sequence_maxsz + \ 807 decode_sequence_maxsz + \
811 decode_reclaim_complete_maxsz) 808 decode_reclaim_complete_maxsz)
812#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
813 encode_sequence_maxsz + \
814 encode_putfh_maxsz + \
815 encode_getdevicelist_maxsz)
816#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
817 decode_sequence_maxsz + \
818 decode_putfh_maxsz + \
819 decode_getdevicelist_maxsz)
820#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 809#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz +\ 810 encode_sequence_maxsz +\
822 encode_getdeviceinfo_maxsz) 811 encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
1927 1916
1928#ifdef CONFIG_NFS_V4_1 1917#ifdef CONFIG_NFS_V4_1
1929static void 1918static void
1930encode_getdevicelist(struct xdr_stream *xdr,
1931 const struct nfs4_getdevicelist_args *args,
1932 struct compound_hdr *hdr)
1933{
1934 __be32 *p;
1935 nfs4_verifier dummy = {
1936 .data = "dummmmmy",
1937 };
1938
1939 encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
1940 p = reserve_space(xdr, 16);
1941 *p++ = cpu_to_be32(args->layoutclass);
1942 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1943 xdr_encode_hyper(p, 0ULL); /* cookie */
1944 encode_nfs4_verifier(xdr, &dummy);
1945}
1946
1947static void
1948encode_getdeviceinfo(struct xdr_stream *xdr, 1919encode_getdeviceinfo(struct xdr_stream *xdr,
1949 const struct nfs4_getdeviceinfo_args *args, 1920 const struct nfs4_getdeviceinfo_args *args,
1950 struct compound_hdr *hdr) 1921 struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1952 __be32 *p; 1923 __be32 *p;
1953 1924
1954 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); 1925 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
1955 p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); 1926 p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
1956 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1927 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1957 NFS4_DEVICEID4_SIZE); 1928 NFS4_DEVICEID4_SIZE);
1958 *p++ = cpu_to_be32(args->pdev->layout_type); 1929 *p++ = cpu_to_be32(args->pdev->layout_type);
1959 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ 1930 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
1960 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1931
1932 p = reserve_space(xdr, 4 + 4);
1933 *p++ = cpu_to_be32(1); /* bitmap length */
1934 *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
1961} 1935}
1962 1936
1963static void 1937static void
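
The encode_getdeviceinfo() change above stops sending an empty attribute bitmap and instead requests the two device notifications the client can act on. A rough userspace sketch of the resulting XDR layout follows — encode_getdeviceinfo_body(), htonl() and the flat word buffer are stand-ins for the kernel's xdr_stream/reserve_space()/cpu_to_be32() machinery, not kernel API:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>  /* htonl() stands in for cpu_to_be32() */

    #define NOTIFY_DEVICEID4_CHANGE (1 << 1)
    #define NOTIFY_DEVICEID4_DELETE (1 << 2)
    #define NFS4_DEVICEID4_SIZE     16

    /* Encode a GETDEVICEINFO4args body into 'p'; returns words written. */
    static size_t encode_getdeviceinfo_body(uint32_t *p,
            const uint8_t dev_id[NFS4_DEVICEID4_SIZE],
            uint32_t layout_type, uint32_t maxcount)
    {
        uint32_t *start = p;

        memcpy(p, dev_id, NFS4_DEVICEID4_SIZE); /* deviceid4, fixed opaque */
        p += NFS4_DEVICEID4_SIZE / 4;
        *p++ = htonl(layout_type);              /* gdia_layout_type */
        *p++ = htonl(maxcount);                 /* gdia_maxcount */
        *p++ = htonl(1);                        /* bitmap4 length: one word now */
        *p++ = htonl(NOTIFY_DEVICEID4_CHANGE |  /* gdia_notify_types */
                     NOTIFY_DEVICEID4_DELETE);
        return p - start;
    }

    int main(void)
    {
        uint8_t id[NFS4_DEVICEID4_SIZE] = { 0 };
        uint32_t buf[16];

        printf("%zu words\n", encode_getdeviceinfo_body(buf, id, 1, 4096));
        return 0;
    }

Asking for NOTIFY_DEVICEID4_CHANGE and NOTIFY_DEVICEID4_DELETE lets the server keep the client's deviceid cache coherent, which is what makes it safe to drop the GETDEVICELIST polling removed elsewhere in this series.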
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
1990static int 1964static int
1991encode_layoutcommit(struct xdr_stream *xdr, 1965encode_layoutcommit(struct xdr_stream *xdr,
1992 struct inode *inode, 1966 struct inode *inode,
1993 const struct nfs4_layoutcommit_args *args, 1967 struct nfs4_layoutcommit_args *args,
1994 struct compound_hdr *hdr) 1968 struct compound_hdr *hdr)
1995{ 1969{
1996 __be32 *p; 1970 __be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
2011 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1985 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
2012 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1986 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
2013 1987
2014 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) 1988 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
2015 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( 1989 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
2016 NFS_I(inode)->layout, xdr, args); 1990 NFS_I(inode)->layout, xdr, args);
2017 else 1991 } else {
2018 encode_uint32(xdr, 0); /* no layout-type payload */ 1992 encode_uint32(xdr, args->layoutupdate_len);
1993 if (args->layoutupdate_pages) {
1994 xdr_write_pages(xdr, args->layoutupdate_pages, 0,
1995 args->layoutupdate_len);
1996 }
1997 }
2019 1998
2020 return 0; 1999 return 0;
2021} 2000}
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2893} 2872}
2894 2873
2895/* 2874/*
2896 * Encode GETDEVICELIST request
2897 */
2898static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2899 struct xdr_stream *xdr,
2900 struct nfs4_getdevicelist_args *args)
2901{
2902 struct compound_hdr hdr = {
2903 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2904 };
2905
2906 encode_compound_hdr(xdr, req, &hdr);
2907 encode_sequence(xdr, &args->seq_args, &hdr);
2908 encode_putfh(xdr, args->fh, &hdr);
2909 encode_getdevicelist(xdr, args, &hdr);
2910 encode_nops(&hdr);
2911}
2912
2913/*
2914 * Encode GETDEVICEINFO request 2875 * Encode GETDEVICEINFO request
2915 */ 2876 */
2916static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2877static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
5765} 5726}
5766 5727
5767#if defined(CONFIG_NFS_V4_1) 5728#if defined(CONFIG_NFS_V4_1)
5768/*
5769 * TODO: Need to handle case when EOF != true;
5770 */
5771static int decode_getdevicelist(struct xdr_stream *xdr,
5772 struct pnfs_devicelist *res)
5773{
5774 __be32 *p;
5775 int status, i;
5776 nfs4_verifier verftemp;
5777
5778 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5779 if (status)
5780 return status;
5781
5782 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5783 if (unlikely(!p))
5784 goto out_overflow;
5785
5786 /* TODO: Skip cookie for now */
5787 p += 2;
5788
5789 /* Read verifier */
5790 p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
5791
5792 res->num_devs = be32_to_cpup(p);
5793
5794 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5795
5796 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5797 printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
5798 __func__, res->num_devs);
5799 return -EIO;
5800 }
5801
5802 p = xdr_inline_decode(xdr,
5803 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5804 if (unlikely(!p))
5805 goto out_overflow;
5806 for (i = 0; i < res->num_devs; i++)
5807 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5808 NFS4_DEVICEID4_SIZE);
5809 res->eof = be32_to_cpup(p);
5810 return 0;
5811out_overflow:
5812 print_overflow_msg(__func__, xdr);
5813 return -EIO;
5814}
5815
5816static int decode_getdeviceinfo(struct xdr_stream *xdr, 5729static int decode_getdeviceinfo(struct xdr_stream *xdr,
5817 struct pnfs_device *pdev) 5730 struct pnfs_device *pdev)
5818{ 5731{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
5862 p = xdr_inline_decode(xdr, 4 * len); 5775 p = xdr_inline_decode(xdr, 4 * len);
5863 if (unlikely(!p)) 5776 if (unlikely(!p))
5864 goto out_overflow; 5777 goto out_overflow;
5865 for (i = 0; i < len; i++, p++) { 5778
5866 if (be32_to_cpup(p)) { 5779 if (be32_to_cpup(p++) &
5867 dprintk("%s: notifications not supported\n", 5780 ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
5781 dprintk("%s: unsupported notification\n",
5782 __func__);
5783 }
5784
5785 for (i = 1; i < len; i++) {
5786 if (be32_to_cpup(p++)) {
5787 dprintk("%s: unsupported notification\n",
5868 __func__); 5788 __func__);
5869 return -EIO; 5789 return -EIO;
5870 } 5790 }
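
On the decode side, the loop above used to reject any set notification bit; it now masks word 0 of the returned bitmap4 against the two bits that were requested and only treats other bits (or any set bit in a later word) as an error. A standalone sketch of that check, with check_notify_bitmap() as a made-up name:

    #include <stdint.h>
    #include <stdio.h>

    #define NOTIFY_DEVICEID4_CHANGE (1 << 1)
    #define NOTIFY_DEVICEID4_DELETE (1 << 2)

    /* Word 0 may carry the two notification types we asked for; any
     * other bit anywhere in the bitmap is unsupported. */
    static int check_notify_bitmap(const uint32_t *words, unsigned len)
    {
        unsigned i;

        if (len && (words[0] & ~(NOTIFY_DEVICEID4_CHANGE |
                                 NOTIFY_DEVICEID4_DELETE)))
            fprintf(stderr, "unsupported notification in word 0\n");

        for (i = 1; i < len; i++)
            if (words[i])
                return -1;      /* -EIO in the kernel */
        return 0;
    }

    int main(void)
    {
        uint32_t ok[1]  = { NOTIFY_DEVICEID4_CHANGE };
        uint32_t bad[2] = { 0, 0x10 };

        printf("%d %d\n", check_notify_bitmap(ok, 1),
                          check_notify_bitmap(bad, 2));
        return 0;
    }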
@@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
7097} 7017}
7098 7018
7099/* 7019/*
7100 * Decode GETDEVICELIST response
7101 */
7102static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
7103 struct xdr_stream *xdr,
7104 struct nfs4_getdevicelist_res *res)
7105{
7106 struct compound_hdr hdr;
7107 int status;
7108
7109 dprintk("encoding getdevicelist!\n");
7110
7111 status = decode_compound_hdr(xdr, &hdr);
7112 if (status != 0)
7113 goto out;
7114 status = decode_sequence(xdr, &res->seq_res, rqstp);
7115 if (status != 0)
7116 goto out;
7117 status = decode_putfh(xdr);
7118 if (status != 0)
7119 goto out;
7120 status = decode_getdevicelist(xdr, res->devlist);
7121out:
7122 return status;
7123}
7124
7125/*
7126 * Decode GETDEVINFO response 7020 * Decode GETDEVINFO response
7127 */ 7021 */
7128static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 7022static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = {
7490 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), 7384 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7491 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), 7385 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7492 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), 7386 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7493 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
7494 PROC(BIND_CONN_TO_SESSION, 7387 PROC(BIND_CONN_TO_SESSION,
7495 enc_bind_conn_to_session, dec_bind_conn_to_session), 7388 enc_bind_conn_to_session, dec_bind_conn_to_session),
7496 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), 7389 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60 kfree(de); 60 kfree(de);
61} 61}
62 62
63static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
64 const struct nfs4_deviceid *d_id)
65{
66 struct nfs4_deviceid_node *d;
67 struct objio_dev_ent *de;
68
69 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
70 if (!d)
71 return NULL;
72
73 de = container_of(d, struct objio_dev_ent, id_node);
74 return de;
75}
76
77static struct objio_dev_ent *
78_dev_list_add(const struct nfs_server *nfss,
79 const struct nfs4_deviceid *d_id, struct osd_dev *od,
80 gfp_t gfp_flags)
81{
82 struct nfs4_deviceid_node *d;
83 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
84 struct objio_dev_ent *n;
85
86 if (!de) {
87 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
88 return NULL;
89 }
90
91 dprintk("%s: Adding od=%p\n", __func__, od);
92 nfs4_init_deviceid_node(&de->id_node,
93 nfss->pnfs_curr_ld,
94 nfss->nfs_client,
95 d_id);
96 de->od.od = od;
97
98 d = nfs4_insert_deviceid_node(&de->id_node);
99 n = container_of(d, struct objio_dev_ent, id_node);
100 if (n != de) {
101 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
102 objio_free_deviceid_node(&de->id_node);
103 de = n;
104 }
105
106 return de;
107}
108
109struct objio_segment { 63struct objio_segment {
110 struct pnfs_layout_segment lseg; 64 struct pnfs_layout_segment lseg;
111 65
@@ -130,29 +84,24 @@ struct objio_state {
130 84
131/* Send and wait for a get_device_info of devices in the layout, 85/* Send and wait for a get_device_info of devices in the layout,
132 then look them up with the osd_initiator library */ 86 then look them up with the osd_initiator library */
133static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 87struct nfs4_deviceid_node *
134 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, 88objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
135 gfp_t gfp_flags) 89 gfp_t gfp_flags)
136{ 90{
137 struct pnfs_osd_deviceaddr *deviceaddr; 91 struct pnfs_osd_deviceaddr *deviceaddr;
138 struct objio_dev_ent *ode; 92 struct objio_dev_ent *ode = NULL;
139 struct osd_dev *od; 93 struct osd_dev *od;
140 struct osd_dev_info odi; 94 struct osd_dev_info odi;
141 bool retry_flag = true; 95 bool retry_flag = true;
96 __be32 *p;
142 int err; 97 int err;
143 98
144 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 99 deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
145 if (ode) { 100 if (!deviceaddr)
146 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ 101 return NULL;
147 return 0;
148 }
149 102
150 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 103 p = page_address(pdev->pages[0]);
151 if (unlikely(err)) { 104 pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
152 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
153 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
154 return err;
155 }
156 105
157 odi.systemid_len = deviceaddr->oda_systemid.len; 106 odi.systemid_len = deviceaddr->oda_systemid.len;
158 if (odi.systemid_len > sizeof(odi.systemid)) { 107 if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
188 goto out; 137 goto out;
189 } 138 }
190 139
191 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
192 gfp_flags);
193 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
194 dprintk("Adding new dev_id(%llx:%llx)\n", 140 dprintk("Adding new dev_id(%llx:%llx)\n",
195 _DEVID_LO(d_id), _DEVID_HI(d_id)); 141 _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
142
143 ode = kzalloc(sizeof(*ode), gfp_flags);
144 if (!ode) {
145 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
146 goto out;
147 }
148
149 nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
150 kfree(deviceaddr);
151
152 ode->od.od = od;
153 return &ode->id_node;
154
196out: 155out:
197 objlayout_put_deviceinfo(deviceaddr); 156 kfree(deviceaddr);
198 return err; 157 return NULL;
199} 158}
200 159
201static void copy_single_comp(struct ore_components *oc, unsigned c, 160static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
254 struct xdr_stream *xdr, 213 struct xdr_stream *xdr,
255 gfp_t gfp_flags) 214 gfp_t gfp_flags)
256{ 215{
216 struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
257 struct objio_segment *objio_seg; 217 struct objio_segment *objio_seg;
258 struct pnfs_osd_xdr_decode_layout_iter iter; 218 struct pnfs_osd_xdr_decode_layout_iter iter;
259 struct pnfs_osd_layout layout; 219 struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
283 objio_seg->oc.first_dev = layout.olo_comps_index; 243 objio_seg->oc.first_dev = layout.olo_comps_index;
284 cur_comp = 0; 244 cur_comp = 0;
285 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { 245 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
246 struct nfs4_deviceid_node *d;
247 struct objio_dev_ent *ode;
248
286 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); 249 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
287 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, 250
288 &src_comp.oc_object_id.oid_device_id, 251 d = nfs4_find_get_deviceid(server,
289 gfp_flags); 252 &src_comp.oc_object_id.oid_device_id,
290 if (err) 253 pnfslay->plh_lc_cred, gfp_flags);
254 if (!d) {
255 err = -ENXIO;
291 goto err; 256 goto err;
292 ++cur_comp; 257 }
258
259 ode = container_of(d, struct objio_dev_ent, id_node);
260 objio_seg->oc.ods[cur_comp++] = &ode->od;
293 } 261 }
294 /* pnfs_osd_xdr_decode_layout_comp returns false on error */ 262 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
295 if (unlikely(err)) 263 if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
653 .flags = PNFS_LAYOUTRET_ON_SETATTR | 621 .flags = PNFS_LAYOUTRET_ON_SETATTR |
654 PNFS_LAYOUTRET_ON_ERROR, 622 PNFS_LAYOUTRET_ON_ERROR,
655 623
624 .max_deviceinfo_size = PAGE_SIZE,
656 .owner = THIS_MODULE, 625 .owner = THIS_MODULE,
657 .alloc_layout_hdr = objlayout_alloc_layout_hdr, 626 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
658 .free_layout_hdr = objlayout_free_layout_hdr, 627 .free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
574 dprintk("%s: Return\n", __func__); 574 dprintk("%s: Return\n", __func__);
575} 575}
576 576
577
578/*
579 * Get Device Info API for io engines
580 */
581struct objlayout_deviceinfo {
582 struct page *page;
583 struct pnfs_osd_deviceaddr da; /* This must be last */
584};
585
586/* Initialize and call nfs_getdeviceinfo, then decode and return a
587 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
588 * should be called.
589 */
590int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
591 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
592 gfp_t gfp_flags)
593{
594 struct objlayout_deviceinfo *odi;
595 struct pnfs_device pd;
596 struct page *page, **pages;
597 u32 *p;
598 int err;
599
600 page = alloc_page(gfp_flags);
601 if (!page)
602 return -ENOMEM;
603
604 pages = &page;
605 pd.pages = pages;
606
607 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
608 pd.layout_type = LAYOUT_OSD2_OBJECTS;
609 pd.pages = &page;
610 pd.pgbase = 0;
611 pd.pglen = PAGE_SIZE;
612 pd.mincount = 0;
613 pd.maxcount = PAGE_SIZE;
614
615 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
616 pnfslay->plh_lc_cred);
617 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
618 if (err)
619 goto err_out;
620
621 p = page_address(page);
622 odi = kzalloc(sizeof(*odi), gfp_flags);
623 if (!odi) {
624 err = -ENOMEM;
625 goto err_out;
626 }
627 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
628 odi->page = page;
629 *deviceaddr = &odi->da;
630 return 0;
631
632err_out:
633 __free_page(page);
634 return err;
635}
636
637void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
638{
639 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
640 struct objlayout_deviceinfo,
641 da);
642
643 __free_page(odi->page);
644 kfree(odi);
645}
646
647enum { 577enum {
648 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, 578 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
649 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, 579 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
149extern void objlayout_write_done(struct objlayout_io_res *oir, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/* 152/*
158 * exported generic objects function vectors 153 * exported generic objects function vectors
159 */ 154 */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ba491926df5f..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -116,7 +116,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
116 if (atomic_read(&c->io_count) == 0) 116 if (atomic_read(&c->io_count) == 0)
117 break; 117 break;
118 ret = nfs_wait_bit_killable(&q.key); 118 ret = nfs_wait_bit_killable(&q.key);
119 } while (atomic_read(&c->io_count) != 0); 119 } while (atomic_read(&c->io_count) != 0 && !ret);
120 finish_wait(wq, &q.wait); 120 finish_wait(wq, &q.wait);
121 return ret; 121 return ret;
122} 122}
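
The __nfs_iocounter_wait() hunk is a one-character logic fix: nfs_wait_bit_killable() can return an error when a fatal signal arrives, and without "&& !ret" the do/while discarded that result and spun again. A compilable sketch of the corrected loop shape, where wait_killable() is a stub standing in for the killable wait:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Stand-in for nfs_wait_bit_killable(): pretend a fatal signal hit. */
    static int wait_killable(void) { return -512; /* -ERESTARTSYS */ }

    /* Without "&& !ret" a fatal signal would leave ret set but loop
     * again instead of bailing out. */
    static int iocounter_wait(atomic_int *io_count)
    {
        int ret = 0;

        do {
            if (atomic_load(io_count) == 0)
                break;
            ret = wait_killable();
        } while (atomic_load(io_count) != 0 && !ret);

        return ret;
    }

    int main(void)
    {
        atomic_int busy = 1;

        printf("ret=%d\n", iocounter_wait(&busy)); /* terminates, -512 */
        return 0;
    }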
@@ -139,26 +139,49 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
139/* 139/*
140 * nfs_page_group_lock - lock the head of the page group 140 * nfs_page_group_lock - lock the head of the page group
141 * @req - request in group that is to be locked 141 * @req - request in group that is to be locked
142 * @nonblock - if true don't block waiting for lock
142 * 143 *
143 * this lock must be held if modifying the page group list 144 * this lock must be held if modifying the page group list
144 * 145 *
145 returns result from wait_on_bit_lock: 0 on success, < 0 on error 146 return 0 on success, < 0 on error: -EAGAIN if nonblocking or the
147 * result from wait_on_bit_lock
148 *
149 * NOTE: calling with nonblock=false should always have set the
150 * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
151 * with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
146 */ 152 */
147int 153int
148nfs_page_group_lock(struct nfs_page *req, bool wait) 154nfs_page_group_lock(struct nfs_page *req, bool nonblock)
149{ 155{
150 struct nfs_page *head = req->wb_head; 156 struct nfs_page *head = req->wb_head;
151 int ret;
152 157
153 WARN_ON_ONCE(head != head->wb_head); 158 WARN_ON_ONCE(head != head->wb_head);
154 159
155 do { 160 if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
156 ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 161 return 0;
157 TASK_UNINTERRUPTIBLE);
158 } while (wait && ret != 0);
159 162
160 WARN_ON_ONCE(ret > 0); 163 if (!nonblock)
161 return ret; 164 return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
165 TASK_UNINTERRUPTIBLE);
166
167 return -EAGAIN;
168}
169
170/*
171 * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
172 * @req - a request in the group
173 *
174 * This is a blocking call to wait for the group lock to be cleared.
175 */
176void
177nfs_page_group_lock_wait(struct nfs_page *req)
178{
179 struct nfs_page *head = req->wb_head;
180
181 WARN_ON_ONCE(head != head->wb_head);
182
183 wait_on_bit(&head->wb_flags, PG_HEADLOCK,
184 TASK_UNINTERRUPTIBLE);
162} 185}
163 186
164/* 187/*
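
nfs_page_group_lock() now tries test_and_set_bit() first, so the common uncontended case never enters the wait path; contended callers either sleep in wait_on_bit_lock() or, when nonblock is set, get -EAGAIN back and can drop their other locks before waiting via nfs_page_group_lock_wait(). A userspace analogue of the protocol, with an atomic_flag and sched_yield() standing in for the page-bit wait queue:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>

    static atomic_flag headlock = ATOMIC_FLAG_INIT;

    static int group_lock(bool nonblock)
    {
        if (!atomic_flag_test_and_set(&headlock))
            return 0;               /* fast path: got it */
        if (nonblock)
            return -EAGAIN;         /* caller still holds other locks */
        while (atomic_flag_test_and_set(&headlock))
            sched_yield();          /* wait_on_bit_lock() stand-in */
        return 0;
    }

    static void group_unlock(void)
    {
        atomic_flag_clear(&headlock);
    }

    int main(void)
    {
        group_lock(false);                            /* take it */
        printf("nonblock: %d\n", group_lock(true));   /* -EAGAIN */
        group_unlock();
        printf("retry:    %d\n", group_lock(true));   /* 0 */
        return 0;
    }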
@@ -219,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
219{ 242{
220 bool ret; 243 bool ret;
221 244
222 nfs_page_group_lock(req, true); 245 nfs_page_group_lock(req, false);
223 ret = nfs_page_group_sync_on_bit_locked(req, bit); 246 ret = nfs_page_group_sync_on_bit_locked(req, bit);
224 nfs_page_group_unlock(req); 247 nfs_page_group_unlock(req);
225 248
@@ -458,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
458 return 0; 481 return 0;
459 } 482 }
460 483
484 /*
485 * Limit the request size so that we can still allocate a page array
486 * for it without upsetting the slab allocator.
487 */
488 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
489 sizeof(struct page *) > PAGE_SIZE)
490 return 0;
491
461 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 492 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
462} 493}
463EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 494EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
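
The new size check caps a coalesced request so that the page-pointer array nfs_generic_pgio() later kmallocs still fits in a single page — with 4 KiB pages and 8-byte pointers that is 512 pages, i.e. a 2 MiB I/O. A small sketch of the arithmetic, with pagevec_too_big() as a made-up helper:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Would an I/O of (count + extra) bytes need a page-pointer array
     * that no longer fits in one page? Assumes 8-byte pointers, as on
     * 64-bit kernels. */
    static int pagevec_too_big(unsigned long count, unsigned long extra)
    {
        return ((count + extra) >> PAGE_SHIFT) * sizeof(void *) > PAGE_SIZE;
    }

    int main(void)
    {
        /* 512 pages -> 512 * 8 == 4096 == PAGE_SIZE: still allowed */
        printf("%d\n", pagevec_too_big(511 * PAGE_SIZE, PAGE_SIZE));
        /* 513 pages -> 4104 bytes of pointers: rejected */
        printf("%d\n", pagevec_too_big(512 * PAGE_SIZE, PAGE_SIZE));
        return 0;
    }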
@@ -701,10 +732,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
701 struct nfs_pgio_header *hdr) 732 struct nfs_pgio_header *hdr)
702{ 733{
703 struct nfs_page *req; 734 struct nfs_page *req;
704 struct page **pages; 735 struct page **pages,
736 *last_page;
705 struct list_head *head = &desc->pg_list; 737 struct list_head *head = &desc->pg_list;
706 struct nfs_commit_info cinfo; 738 struct nfs_commit_info cinfo;
707 unsigned int pagecount; 739 unsigned int pagecount, pageused;
708 740
709 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 741 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
710 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 742 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
@@ -712,12 +744,23 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
712 744
713 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); 745 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
714 pages = hdr->page_array.pagevec; 746 pages = hdr->page_array.pagevec;
747 last_page = NULL;
748 pageused = 0;
715 while (!list_empty(head)) { 749 while (!list_empty(head)) {
716 req = nfs_list_entry(head->next); 750 req = nfs_list_entry(head->next);
717 nfs_list_remove_request(req); 751 nfs_list_remove_request(req);
718 nfs_list_add_request(req, &hdr->pages); 752 nfs_list_add_request(req, &hdr->pages);
719 *pages++ = req->wb_page; 753
754 if (WARN_ON_ONCE(pageused >= pagecount))
755 return nfs_pgio_error(desc, hdr);
756
757 if (!last_page || last_page != req->wb_page) {
758 *pages++ = last_page = req->wb_page;
759 pageused++;
760 }
720 } 761 }
762 if (WARN_ON_ONCE(pageused != pagecount))
763 return nfs_pgio_error(desc, hdr);
721 764
722 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 765 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
723 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) 766 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
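
Because page groups can now contain several subpage requests sharing one struct page, the pagevec fill above tracks last_page and stores each page only once, with the WARN_ON_ONCE() checks catching any mismatch against the precomputed pagecount. The same dedup logic in isolation — fill_pagevec() is illustrative, not kernel API:

    #include <stdio.h>

    /* Fill a page vector from an ordered request list, storing each
     * page once even when consecutive requests point at the same page. */
    static size_t fill_pagevec(const void **vec, size_t max,
                               const void *const *req_pages, size_t nreq)
    {
        const void *last = NULL;
        size_t used = 0, i;

        for (i = 0; i < nreq; i++) {
            if (req_pages[i] == last)
                continue;               /* subrequest on the same page */
            if (used >= max)
                return (size_t)-1;      /* kernel: WARN_ON_ONCE + error */
            vec[used++] = last = req_pages[i];
        }
        return used;
    }

    int main(void)
    {
        int a, b;
        const void *reqs[] = { &a, &a, &b }; /* two subrequests share a */
        const void *vec[3];

        printf("%zu\n", fill_pagevec(vec, 3, reqs, 3)); /* 2 */
        return 0;
    }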
@@ -788,6 +831,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
788 return false; 831 return false;
789 if (req_offset(req) != req_offset(prev) + prev->wb_bytes) 832 if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
790 return false; 833 return false;
834 if (req->wb_page == prev->wb_page) {
835 if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
836 return false;
837 } else {
838 if (req->wb_pgbase != 0 ||
839 prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
840 return false;
841 }
791 } 842 }
792 size = pgio->pg_ops->pg_test(pgio, prev, req); 843 size = pgio->pg_ops->pg_test(pgio, prev, req);
793 WARN_ON_ONCE(size > req->wb_bytes); 844 WARN_ON_ONCE(size > req->wb_bytes);
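
The added coalescing rule spells out byte contiguity for subpage requests: within one page the new request must start exactly where the previous one ended, and across a page boundary the previous request must end at PAGE_CACHE_SIZE with the new one starting at offset 0. A self-contained rendering of the predicate:

    #include <stdio.h>
    #include <stdbool.h>

    #define PAGE_CACHE_SIZE 4096u

    struct req { const void *page; unsigned pgbase, bytes; };

    /* Contiguity rule added to nfs_can_coalesce_requests() above. */
    static bool contiguous(const struct req *prev, const struct req *req)
    {
        if (req->page == prev->page)
            return req->pgbase == prev->pgbase + prev->bytes;
        return req->pgbase == 0 &&
               prev->pgbase + prev->bytes == PAGE_CACHE_SIZE;
    }

    int main(void)
    {
        int a, b;
        struct req p = { &a, 0, 512 },    q = { &a, 512, 512 };
        struct req r = { &a, 3584, 512 }, s = { &b, 0, 512 };

        printf("%d %d\n", contiguous(&p, &q), contiguous(&r, &s)); /* 1 1 */
        return 0;
    }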
@@ -858,13 +909,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
858 struct nfs_page *subreq; 909 struct nfs_page *subreq;
859 unsigned int bytes_left = 0; 910 unsigned int bytes_left = 0;
860 unsigned int offset, pgbase; 911 unsigned int offset, pgbase;
861 int ret;
862 912
863 ret = nfs_page_group_lock(req, false); 913 nfs_page_group_lock(req, false);
864 if (ret < 0) {
865 desc->pg_error = ret;
866 return 0;
867 }
868 914
869 subreq = req; 915 subreq = req;
870 bytes_left = subreq->wb_bytes; 916 bytes_left = subreq->wb_bytes;
@@ -886,11 +932,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
886 if (desc->pg_recoalesce) 932 if (desc->pg_recoalesce)
887 return 0; 933 return 0;
888 /* retry add_request for this subreq */ 934 /* retry add_request for this subreq */
889 ret = nfs_page_group_lock(req, false); 935 nfs_page_group_lock(req, false);
890 if (ret < 0) {
891 desc->pg_error = ret;
892 return 0;
893 }
894 continue; 936 continue;
895 } 937 }
896 938
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
594 dprintk("%s freeing layout for inode %lu\n", __func__, 594 dprintk("%s freeing layout for inode %lu\n", __func__,
595 lo->plh_inode->i_ino); 595 lo->plh_inode->i_ino);
596 inode = lo->plh_inode; 596 inode = lo->plh_inode;
597
598 pnfs_layoutcommit_inode(inode, false);
599
597 spin_lock(&inode->i_lock); 600 spin_lock(&inode->i_lock);
598 list_del_init(&lo->plh_bulk_destroy); 601 list_del_init(&lo->plh_bulk_destroy);
599 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 602 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
682 return (s32)(s1 - s2) > 0; 685 return (s32)(s1 - s2) > 0;
683} 686}
684 687
685static void
686pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
687 const nfs4_stateid *new,
688 struct list_head *free_me_list)
689{
690 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
691 return;
692 /* Layout is new! Kill existing layout segments */
693 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
694}
695
696/* update lo->plh_stateid with new if is more recent */ 688/* update lo->plh_stateid with new if is more recent */
697void 689void
698pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 690pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
749 status = -EAGAIN; 741 status = -EAGAIN;
750 } else if (!nfs4_valid_open_stateid(open_state)) { 742 } else if (!nfs4_valid_open_stateid(open_state)) {
751 status = -EBADF; 743 status = -EBADF;
752 } else if (list_empty(&lo->plh_segs)) { 744 } else if (list_empty(&lo->plh_segs) ||
745 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
753 int seq; 746 int seq;
754 747
755 do { 748 do {
@@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
864 empty = list_empty(&lo->plh_segs); 857 empty = list_empty(&lo->plh_segs);
865 pnfs_clear_layoutcommit(ino, &tmp_list); 858 pnfs_clear_layoutcommit(ino, &tmp_list);
866 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 859 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
860
861 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
862 struct pnfs_layout_range range = {
863 .iomode = IOMODE_ANY,
864 .offset = 0,
865 .length = NFS4_MAX_UINT64,
866 };
867 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
868 }
869
867 /* Don't send a LAYOUTRETURN if list was initially empty */ 870 /* Don't send a LAYOUTRETURN if list was initially empty */
868 if (empty) { 871 if (empty) {
869 spin_unlock(&ino->i_lock); 872 spin_unlock(&ino->i_lock);
@@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino)
871 dprintk("NFS: %s no layout segments to return\n", __func__); 874 dprintk("NFS: %s no layout segments to return\n", __func__);
872 goto out; 875 goto out;
873 } 876 }
877
878 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
874 lo->plh_block_lgets++; 879 lo->plh_block_lgets++;
875 spin_unlock(&ino->i_lock); 880 spin_unlock(&ino->i_lock);
876 pnfs_free_lseg_list(&tmp_list); 881 pnfs_free_lseg_list(&tmp_list);
@@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1358 goto out; 1363 goto out;
1359 } 1364 }
1360 1365
1366 init_lseg(lo, lseg);
1367 lseg->pls_range = res->range;
1368
1361 spin_lock(&ino->i_lock); 1369 spin_lock(&ino->i_lock);
1362 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1370 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1363 dprintk("%s forget reply due to recall\n", __func__); 1371 dprintk("%s forget reply due to recall\n", __func__);
1364 goto out_forget_reply; 1372 goto out_forget_reply;
1365 } 1373 }
1366 1374
1367 if (pnfs_layoutgets_blocked(lo, 1) || 1375 if (pnfs_layoutgets_blocked(lo, 1)) {
1368 pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1369 dprintk("%s forget reply due to state\n", __func__); 1376 dprintk("%s forget reply due to state\n", __func__);
1370 goto out_forget_reply; 1377 goto out_forget_reply;
1371 } 1378 }
1372 1379
1373 /* Check that the new stateid matches the old stateid */ 1380 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1374 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); 1381 /* existing state ID, make sure the sequence number matches. */
1375 /* Done processing layoutget. Set the layout stateid */ 1382 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1376 pnfs_set_layout_stateid(lo, &res->stateid, false); 1383 dprintk("%s forget reply due to sequence\n", __func__);
1384 goto out_forget_reply;
1385 }
1386 pnfs_set_layout_stateid(lo, &res->stateid, false);
1387 } else {
1388 /*
1389 * We got an entirely new state ID. Mark all segments for the
1390 * inode invalid, and don't bother validating the stateid
1391 * sequence number.
1392 */
1393 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1394
1395 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1396 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1397 }
1398
1399 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1377 1400
1378 init_lseg(lo, lseg);
1379 lseg->pls_range = res->range;
1380 pnfs_get_lseg(lseg); 1401 pnfs_get_lseg(lseg);
1381 pnfs_layout_insert_lseg(lo, lseg); 1402 pnfs_layout_insert_lseg(lo, lseg);
1382 1403
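
The reworked pnfs_layout_process() splits the "same stateid, verify the seqid" case from the "entirely new stateid" case, in which all cached segments are invalidated and the barrier is reset. The seqid comparison it relies on (pnfs_seqid_is_newer() in the context above) is plain serial-number arithmetic, shown standalone here:

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* The subtraction is evaluated modulo 2^32, so the comparison
     * stays correct across seqid wraparound. */
    static bool seqid_is_newer(uint32_t s1, uint32_t s2)
    {
        return (int32_t)(s1 - s2) > 0;
    }

    int main(void)
    {
        printf("%d\n", seqid_is_newer(2, 1));          /* 1 */
        printf("%d\n", seqid_is_newer(0, UINT32_MAX)); /* 1: 0 follows 2^32-1 */
        printf("%d\n", seqid_is_newer(1, 2));          /* 0 */
        return 0;
    }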
@@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
1797} 1818}
1798EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1819EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1799 1820
1821void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
1822{
1823 struct inode *inode = data->inode;
1824 struct nfs_inode *nfsi = NFS_I(inode);
1825 bool mark_as_dirty = false;
1826
1827 spin_lock(&inode->i_lock);
1828 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1829 mark_as_dirty = true;
1830 dprintk("%s: Set layoutcommit for inode %lu ",
1831 __func__, inode->i_ino);
1832 }
1833 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
1834 /* references matched in nfs4_layoutcommit_release */
1835 pnfs_get_lseg(data->lseg);
1836 }
1837 if (data->lwb > nfsi->layout->plh_lwb)
1838 nfsi->layout->plh_lwb = data->lwb;
1839 spin_unlock(&inode->i_lock);
1840 dprintk("%s: lseg %p end_pos %llu\n",
1841 __func__, data->lseg, nfsi->layout->plh_lwb);
1842
1843 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1844 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
1845 if (mark_as_dirty)
1846 mark_inode_dirty_sync(inode);
1847}
1848EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
1849
1800void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) 1850void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1801{ 1851{
1802 struct nfs_server *nfss = NFS_SERVER(data->args.inode); 1852 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1817int 1867int
1818pnfs_layoutcommit_inode(struct inode *inode, bool sync) 1868pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1819{ 1869{
1870 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1820 struct nfs4_layoutcommit_data *data; 1871 struct nfs4_layoutcommit_data *data;
1821 struct nfs_inode *nfsi = NFS_I(inode); 1872 struct nfs_inode *nfsi = NFS_I(inode);
1822 loff_t end_pos; 1873 loff_t end_pos;
@@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1867 data->args.lastbytewritten = end_pos - 1; 1918 data->args.lastbytewritten = end_pos - 1;
1868 data->res.server = NFS_SERVER(inode); 1919 data->res.server = NFS_SERVER(inode);
1869 1920
1921 if (ld->prepare_layoutcommit) {
1922 status = ld->prepare_layoutcommit(&data->args);
1923 if (status) {
1924 spin_lock(&inode->i_lock);
1925 if (end_pos < nfsi->layout->plh_lwb)
1926 nfsi->layout->plh_lwb = end_pos;
1927 spin_unlock(&inode->i_lock);
1928 put_rpccred(data->cred);
1929 set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
1930 goto clear_layoutcommitting;
1931 }
1932 }
1933
1934
1870 status = nfs4_proc_layoutcommit(data, sync); 1935 status = nfs4_proc_layoutcommit(data, sync);
1871out: 1936out:
1872 if (status) 1937 if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,12 +65,15 @@ enum {
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
68}; 69};
69 70
70enum layoutdriver_policy_flags { 71enum layoutdriver_policy_flags {
71 /* Should the pNFS client commit and return the layout upon a setattr */ 72 /* Should the pNFS client commit and return the layout upon truncate to
73 * a smaller size */
72 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, 74 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
73 PNFS_LAYOUTRET_ON_ERROR = 1 << 1, 75 PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
76 PNFS_READ_WHOLE_PAGE = 1 << 2,
74}; 77};
75 78
76struct nfs4_deviceid_node; 79struct nfs4_deviceid_node;
@@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type {
82 const char *name; 85 const char *name;
83 struct module *owner; 86 struct module *owner;
84 unsigned flags; 87 unsigned flags;
88 unsigned max_deviceinfo_size;
85 89
86 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); 90 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
87 int (*clear_layoutdriver) (struct nfs_server *); 91 int (*clear_layoutdriver) (struct nfs_server *);
@@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type {
92 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 96 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
93 void (*free_lseg) (struct pnfs_layout_segment *lseg); 97 void (*free_lseg) (struct pnfs_layout_segment *lseg);
94 98
99 void (*return_range) (struct pnfs_layout_hdr *lo,
100 struct pnfs_layout_range *range);
101
95 /* test for nfs page cache coalescing */ 102 /* test for nfs page cache coalescing */
96 const struct nfs_pageio_ops *pg_read_ops; 103 const struct nfs_pageio_ops *pg_read_ops;
97 const struct nfs_pageio_ops *pg_write_ops; 104 const struct nfs_pageio_ops *pg_write_ops;
@@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type {
121 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); 128 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
122 129
123 void (*free_deviceid_node) (struct nfs4_deviceid_node *); 130 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
131 struct nfs4_deviceid_node * (*alloc_deviceid_node)
132 (struct nfs_server *server, struct pnfs_device *pdev,
133 gfp_t gfp_flags);
124 134
125 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, 135 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
126 struct xdr_stream *xdr, 136 struct xdr_stream *xdr,
127 const struct nfs4_layoutreturn_args *args); 137 const struct nfs4_layoutreturn_args *args);
128 138
129 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); 139 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
130 140 int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
131 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 141 void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
132 struct xdr_stream *xdr, 142 struct xdr_stream *xdr,
133 const struct nfs4_layoutcommit_args *args); 143 const struct nfs4_layoutcommit_args *args);
134}; 144};
@@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
171extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 181extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
172 182
173/* nfs4proc.c */ 183/* nfs4proc.c */
174extern int nfs4_proc_getdevicelist(struct nfs_server *server,
175 const struct nfs_fh *fh,
176 struct pnfs_devicelist *devlist);
177extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 184extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
178 struct pnfs_device *dev, 185 struct pnfs_device *dev,
179 struct rpc_cred *cred); 186 struct rpc_cred *cred);
@@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino);
219void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 226void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
220bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 227bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
221void pnfs_set_layoutcommit(struct nfs_pgio_header *); 228void pnfs_set_layoutcommit(struct nfs_pgio_header *);
229void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
222void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 230void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
223int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 231int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
224int _pnfs_return_layout(struct inode *); 232int _pnfs_return_layout(struct inode *);
@@ -255,11 +263,12 @@ struct nfs4_deviceid_node {
255 atomic_t ref; 263 atomic_t ref;
256}; 264};
257 265
258struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 266struct nfs4_deviceid_node *
267nfs4_find_get_deviceid(struct nfs_server *server,
268 const struct nfs4_deviceid *id, struct rpc_cred *cred,
269 gfp_t gfp_mask);
259void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 270void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
260void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 271void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
261 const struct pnfs_layoutdriver_type *,
262 const struct nfs_client *,
263 const struct nfs4_deviceid *); 272 const struct nfs4_deviceid *);
264struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); 273struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
265bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); 274bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
@@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
267bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 276bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
268void nfs4_deviceid_purge_client(const struct nfs_client *); 277void nfs4_deviceid_purge_client(const struct nfs_client *);
269 278
279static inline struct nfs4_deviceid_node *
280nfs4_get_deviceid(struct nfs4_deviceid_node *d)
281{
282 atomic_inc(&d->ref);
283 return d;
284}
285
270static inline struct pnfs_layout_segment * 286static inline struct pnfs_layout_segment *
271pnfs_get_lseg(struct pnfs_layout_segment *lseg) 287pnfs_get_lseg(struct pnfs_layout_segment *lseg)
272{ 288{
@@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
368} 384}
369 385
370static inline bool 386static inline bool
387pnfs_ld_read_whole_page(struct inode *inode)
388{
389 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
390 return false;
391 return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
392}
393
394static inline bool
371pnfs_layoutcommit_outstanding(struct inode *inode) 395pnfs_layoutcommit_outstanding(struct inode *inode)
372{ 396{
373 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
@@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
443} 467}
444 468
445static inline bool 469static inline bool
470pnfs_ld_read_whole_page(struct inode *inode)
471{
472 return false;
473}
474
475static inline bool
446pnfs_roc(struct inode *ino) 476pnfs_roc(struct inode *ino)
447{ 477{
448 return false; 478 return false;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
29 */ 29 */
30 30
31#include <linux/export.h> 31#include <linux/export.h>
32#include <linux/nfs_fs.h>
33#include "nfs4session.h"
34#include "internal.h"
32#include "pnfs.h" 35#include "pnfs.h"
33 36
34#define NFSDBG_FACILITY NFSDBG_PNFS 37#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
89 return NULL; 92 return NULL;
90} 93}
91 94
95static struct nfs4_deviceid_node *
96nfs4_get_device_info(struct nfs_server *server,
97 const struct nfs4_deviceid *dev_id,
98 struct rpc_cred *cred, gfp_t gfp_flags)
99{
100 struct nfs4_deviceid_node *d = NULL;
101 struct pnfs_device *pdev = NULL;
102 struct page **pages = NULL;
103 u32 max_resp_sz;
104 int max_pages;
105 int rc, i;
106
107 /*
108 * Use the session max response size as the basis for setting
109 * GETDEVICEINFO's maxcount
110 */
111 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
112 if (server->pnfs_curr_ld->max_deviceinfo_size &&
113 server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
114 max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
115 max_pages = nfs_page_array_len(0, max_resp_sz);
116 dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
117 __func__, server, max_resp_sz, max_pages);
118
119 pdev = kzalloc(sizeof(*pdev), gfp_flags);
120 if (!pdev)
121 return NULL;
122
123 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
124 if (!pages)
125 goto out_free_pdev;
126
127 for (i = 0; i < max_pages; i++) {
128 pages[i] = alloc_page(gfp_flags);
129 if (!pages[i])
130 goto out_free_pages;
131 }
132
133 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
134 pdev->layout_type = server->pnfs_curr_ld->id;
135 pdev->pages = pages;
136 pdev->pgbase = 0;
137 pdev->pglen = max_resp_sz;
138 pdev->mincount = 0;
139 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
140
141 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
142 dprintk("%s getdevice info returns %d\n", __func__, rc);
143 if (rc)
144 goto out_free_pages;
145
146 /*
147 * Found new device, need to decode it and then add it to the
148 * list of known devices for this mountpoint.
149 */
150 d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
151 gfp_flags);
152
153out_free_pages:
154 for (i = 0; i < max_pages; i++)
155 __free_page(pages[i]);
156 kfree(pages);
157out_free_pdev:
158 kfree(pdev);
159 dprintk("<-- %s d %p\n", __func__, d);
160 return d;
161}
162
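
nfs4_get_device_info() sizes the GETDEVICEINFO reply buffer from the session's maximum response size, lets the layout driver cap it via the new max_deviceinfo_size field (objlayout sets PAGE_SIZE), and allocates that many pages up front. A rough userspace model of the sizing and cleanup — page_array_len() mirrors nfs_page_array_len(), and malloc/free stand in for alloc_page()/__free_page():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE 4096u

    /* Pages needed for 'len' bytes starting at offset 'base'. */
    static unsigned page_array_len(unsigned base, unsigned len)
    {
        return (base + len + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    int main(void)
    {
        unsigned max_resp_sz = 1024 * 1024; /* session fore-channel attr */
        unsigned ld_cap = PAGE_SIZE;        /* driver's max_deviceinfo_size */
        unsigned max_pages, i;
        char **pages;

        if (ld_cap && ld_cap < max_resp_sz)
            max_resp_sz = ld_cap;
        max_pages = page_array_len(0, max_resp_sz);
        printf("max_resp_sz %u max_pages %u\n", max_resp_sz, max_pages);

        pages = calloc(max_pages, sizeof(*pages));
        if (!pages)
            return 1;
        for (i = 0; i < max_pages; i++) {
            pages[i] = malloc(PAGE_SIZE);
            if (!pages[i])
                break;                      /* partial alloc: clean up below */
            memset(pages[i], 0, PAGE_SIZE);
        }
        for (i = 0; i < max_pages && pages[i]; i++)
            free(pages[i]);
        free(pages);
        return 0;
    }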
92/* 163/*
93 * Lookup a deviceid in cache and get a reference count on it if found 164 * Lookup a deviceid in cache and get a reference count on it if found
94 * 165 *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
96 * @id deviceid to look up 167 * @id deviceid to look up
97 */ 168 */
98static struct nfs4_deviceid_node * 169static struct nfs4_deviceid_node *
99_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 170__nfs4_find_get_deviceid(struct nfs_server *server,
100 const struct nfs_client *clp, const struct nfs4_deviceid *id, 171 const struct nfs4_deviceid *id, long hash)
101 long hash)
102{ 172{
103 struct nfs4_deviceid_node *d; 173 struct nfs4_deviceid_node *d;
104 174
105 rcu_read_lock(); 175 rcu_read_lock();
106 d = _lookup_deviceid(ld, clp, id, hash); 176 d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
177 hash);
107 if (d != NULL) 178 if (d != NULL)
108 atomic_inc(&d->ref); 179 atomic_inc(&d->ref);
109 rcu_read_unlock(); 180 rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
111} 182}
112 183
113struct nfs4_deviceid_node * 184struct nfs4_deviceid_node *
114nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 185nfs4_find_get_deviceid(struct nfs_server *server,
115 const struct nfs_client *clp, const struct nfs4_deviceid *id) 186 const struct nfs4_deviceid *id, struct rpc_cred *cred,
187 gfp_t gfp_mask)
116{ 188{
117 return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); 189 long hash = nfs4_deviceid_hash(id);
190 struct nfs4_deviceid_node *d, *new;
191
192 d = __nfs4_find_get_deviceid(server, id, hash);
193 if (d)
194 return d;
195
196 new = nfs4_get_device_info(server, id, cred, gfp_mask);
197 if (!new)
198 return new;
199
200 spin_lock(&nfs4_deviceid_lock);
201 d = __nfs4_find_get_deviceid(server, id, hash);
202 if (d) {
203 spin_unlock(&nfs4_deviceid_lock);
204 server->pnfs_curr_ld->free_deviceid_node(new);
205 return d;
206 }
207 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
208 atomic_inc(&new->ref);
209 spin_unlock(&nfs4_deviceid_lock);
210
211 return new;
118} 212}
119EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 213EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
120 214
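
nfs4_find_get_deviceid() now hides the whole lookup-or-fetch dance from layout drivers: an unlocked cache lookup, an expensive GETDEVICEINFO performed without the lock, then a re-check under nfs4_deviceid_lock that frees the freshly built node if another thread won the race — the job nfs4_insert_deviceid_node() used to leave to each caller. A pthread sketch of the pattern, where a single mutex plays the role of both the RCU read side and the spinlock:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { int id; int refs; struct node *next; };

    static struct node *cache;
    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct node *find(int id)        /* caller holds cache_lock */
    {
        struct node *n;
        for (n = cache; n; n = n->next)
            if (n->id == id) { n->refs++; return n; }
        return NULL;
    }

    static struct node *fetch_from_server(int id) /* GETDEVICEINFO stand-in */
    {
        struct node *n = calloc(1, sizeof(*n));
        if (n) { n->id = id; n->refs = 1; }
        return n;
    }

    static struct node *find_get(int id)
    {
        struct node *n, *new;

        pthread_mutex_lock(&cache_lock);    /* kernel: rcu_read_lock() */
        n = find(id);
        pthread_mutex_unlock(&cache_lock);
        if (n)
            return n;

        new = fetch_from_server(id);        /* slow, done unlocked */
        if (!new)
            return NULL;

        pthread_mutex_lock(&cache_lock);
        n = find(id);                       /* did someone beat us? */
        if (n) {
            pthread_mutex_unlock(&cache_lock);
            free(new);                      /* kernel: free_deviceid_node() */
            return n;
        }
        new->next = cache;                  /* kernel: hlist_add_head_rcu() */
        cache = new;
        new->refs++;                        /* one reference for the cache */
        pthread_mutex_unlock(&cache_lock);
        return new;
    }

    int main(void)
    {
        struct node *a = find_get(7), *b = find_get(7);
        printf("%p %p refs=%d\n", (void *)a, (void *)b,
               a ? a->refs : 0);            /* same node, refs=3 */
        return 0;
    }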
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
151EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 245EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
152 246
153void 247void
154nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, 248nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
155 const struct pnfs_layoutdriver_type *ld,
156 const struct nfs_client *nfs_client,
157 const struct nfs4_deviceid *id) 249 const struct nfs4_deviceid *id)
158{ 250{
159 INIT_HLIST_NODE(&d->node); 251 INIT_HLIST_NODE(&d->node);
160 INIT_HLIST_NODE(&d->tmpnode); 252 INIT_HLIST_NODE(&d->tmpnode);
161 d->ld = ld; 253 d->ld = server->pnfs_curr_ld;
162 d->nfs_client = nfs_client; 254 d->nfs_client = server->nfs_client;
163 d->flags = 0; 255 d->flags = 0;
164 d->deviceid = *id; 256 d->deviceid = *id;
165 atomic_set(&d->ref, 1); 257 atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
167EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); 259EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
168 260
169/* 261/*
170 * Uniquely initialize and insert a deviceid node into cache
171 *
172 * @new new deviceid node
173 * Note that the caller must set up the following members:
174 * new->ld
175 * new->nfs_client
176 * new->deviceid
177 *
178 * @ret the inserted node, if none found, otherwise, the found entry.
179 */
180struct nfs4_deviceid_node *
181nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
182{
183 struct nfs4_deviceid_node *d;
184 long hash;
185
186 spin_lock(&nfs4_deviceid_lock);
187 hash = nfs4_deviceid_hash(&new->deviceid);
188 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
189 if (d) {
190 spin_unlock(&nfs4_deviceid_lock);
191 return d;
192 }
193
194 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
195 spin_unlock(&nfs4_deviceid_lock);
196 atomic_inc(&new->ref);
197
198 return new;
199}
200EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
201
202/*
203 * Dereference a deviceid node and delete it when its reference count drops 262 * Dereference a deviceid node and delete it when its reference count drops
204 * to zero. 263 * to zero.
205 * 264 *
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
299 } 358 }
300 rcu_read_unlock(); 359 rcu_read_unlock();
301} 360}
302
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
2065 return NFS_TEXT_DATA; 2065 return NFS_TEXT_DATA;
2066 } 2066 }
2067 2067
2068#if !IS_ENABLED(CONFIG_NFS_V3)
2069 if (args->version == 3)
2070 goto out_v3_not_compiled;
2071#endif /* !CONFIG_NFS_V3 */
2072
2073 return 0; 2068 return 0;
2074 2069
2075out_no_data: 2070out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
2085 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); 2080 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
2086 return -EINVAL; 2081 return -EINVAL;
2087 2082
2088#if !IS_ENABLED(CONFIG_NFS_V3)
2089out_v3_not_compiled:
2090 dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
2091 return -EPROTONOSUPPORT;
2092#endif /* !CONFIG_NFS_V3 */
2093
2094out_nomem: 2083out_nomem:
2095 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 2084 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
2096 return -ENOMEM; 2085 return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e3b5cf28bdc5..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
49static void nfs_clear_request_commit(struct nfs_page *req); 49static void nfs_clear_request_commit(struct nfs_page *req);
50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, 50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
51 struct inode *inode); 51 struct inode *inode);
52static struct nfs_page *
53nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
54 struct page *page);
52 55
53static struct kmem_cache *nfs_wdata_cachep; 56static struct kmem_cache *nfs_wdata_cachep;
54static mempool_t *nfs_wdata_mempool; 57static mempool_t *nfs_wdata_mempool;
@@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
95} 98}
96 99
97/* 100/*
98 * nfs_page_search_commits_for_head_request_locked
99 *
100 * Search through commit lists on @inode for the head request for @page.
101 * Must be called while holding the inode (which is cinfo) lock.
102 *
103 * Returns the head request if found, or NULL if not found.
104 */
105static struct nfs_page *
106nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
107 struct page *page)
108{
109 struct nfs_page *freq, *t;
110 struct nfs_commit_info cinfo;
111 struct inode *inode = &nfsi->vfs_inode;
112
113 nfs_init_cinfo_from_inode(&cinfo, inode);
114
115 /* search through pnfs commit lists */
116 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
117 if (freq)
118 return freq->wb_head;
119
120 /* Linearly search the commit list for the correct request */
121 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
122 if (freq->wb_page == page)
123 return freq->wb_head;
124 }
125
126 return NULL;
127}
128
129/*
130 * nfs_page_find_head_request_locked - find head request associated with @page 101 * nfs_page_find_head_request_locked - find head request associated with @page
131 * 102 *
132 * must be called while holding the inode lock. 103 * must be called while holding the inode lock.
@@ -241,7 +212,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
241 unsigned int pos = 0; 212 unsigned int pos = 0;
242 unsigned int len = nfs_page_length(req->wb_page); 213 unsigned int len = nfs_page_length(req->wb_page);
243 214
244 nfs_page_group_lock(req, true); 215 nfs_page_group_lock(req, false);
245 216
246 do { 217 do {
247 tmp = nfs_page_group_search_locked(req->wb_head, pos); 218 tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
271 242
272static int wb_priority(struct writeback_control *wbc) 243static int wb_priority(struct writeback_control *wbc)
273{ 244{
245 int ret = 0;
274 if (wbc->for_reclaim) 246 if (wbc->for_reclaim)
275 return FLUSH_HIGHPRI | FLUSH_STABLE; 247 return FLUSH_HIGHPRI | FLUSH_STABLE;
248 if (wbc->sync_mode == WB_SYNC_ALL)
249 ret = FLUSH_COND_STABLE;
276 if (wbc->for_kupdate || wbc->for_background) 250 if (wbc->for_kupdate || wbc->for_background)
277 return FLUSH_LOWPRI | FLUSH_COND_STABLE; 251 ret |= FLUSH_LOWPRI;
278 return FLUSH_COND_STABLE; 252 return ret;
279} 253}
280 254
281/* 255/*
@@ -478,10 +452,23 @@ try_again:
478 return NULL; 452 return NULL;
479 } 453 }
480 454
481 /* lock each request in the page group */ 455 /* holding inode lock, so always make a non-blocking call to try the
482 ret = nfs_page_group_lock(head, false); 456 * page group lock */
483 if (ret < 0) 457 ret = nfs_page_group_lock(head, true);
458 if (ret < 0) {
459 spin_unlock(&inode->i_lock);
460
461 if (!nonblock && ret == -EAGAIN) {
462 nfs_page_group_lock_wait(head);
463 nfs_release_request(head);
464 goto try_again;
465 }
466
467 nfs_release_request(head);
484 return ERR_PTR(ret); 468 return ERR_PTR(ret);
469 }
470
471 /* lock each request in the page group */
485 subreq = head; 472 subreq = head;
486 do { 473 do {
487 /* 474 /*
@@ -718,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
718 if (likely(!PageSwapCache(head->wb_page))) { 705 if (likely(!PageSwapCache(head->wb_page))) {
719 set_page_private(head->wb_page, 0); 706 set_page_private(head->wb_page, 0);
720 ClearPagePrivate(head->wb_page); 707 ClearPagePrivate(head->wb_page);
708 smp_mb__after_atomic();
709 wake_up_page(head->wb_page, PG_private);
721 clear_bit(PG_MAPPED, &head->wb_flags); 710 clear_bit(PG_MAPPED, &head->wb_flags);
722 } 711 }
723 nfsi->npages--; 712 nfsi->npages--;
@@ -736,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
736 __set_page_dirty_nobuffers(req->wb_page); 725 __set_page_dirty_nobuffers(req->wb_page);
737} 726}
738 727
739#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 728/*
729 * nfs_page_search_commits_for_head_request_locked
730 *
731 * Search through commit lists on @inode for the head request for @page.
732 * Must be called while holding the inode (which is cinfo) lock.
733 *
734 * Returns the head request if found, or NULL if not found.
735 */
736static struct nfs_page *
737nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
738 struct page *page)
739{
740 struct nfs_page *freq, *t;
741 struct nfs_commit_info cinfo;
742 struct inode *inode = &nfsi->vfs_inode;
743
744 nfs_init_cinfo_from_inode(&cinfo, inode);
745
746 /* search through pnfs commit lists */
747 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
748 if (freq)
749 return freq->wb_head;
750
751 /* Linearly search the commit list for the correct request */
752 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
753 if (freq->wb_page == page)
754 return freq->wb_head;
755 }
756
757 return NULL;
758}
759
740/** 760/**
741 * nfs_request_add_commit_list - add request to a commit list 761 * nfs_request_add_commit_list - add request to a commit list
742 * @req: pointer to a struct nfs_page 762 * @req: pointer to a struct nfs_page
@@ -854,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
854 return hdr->verf.committed != NFS_FILE_SYNC; 874 return hdr->verf.committed != NFS_FILE_SYNC;
855} 875}
856 876
857#else
858static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
859 struct inode *inode)
860{
861}
862
863void nfs_init_cinfo(struct nfs_commit_info *cinfo,
864 struct inode *inode,
865 struct nfs_direct_req *dreq)
866{
867}
868
869void
870nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
871 struct nfs_commit_info *cinfo)
872{
873}
874
875static void
876nfs_clear_request_commit(struct nfs_page *req)
877{
878}
879
880int nfs_write_need_commit(struct nfs_pgio_header *hdr)
881{
882 return 0;
883}
884
885#endif
886
887static void nfs_write_completion(struct nfs_pgio_header *hdr) 877static void nfs_write_completion(struct nfs_pgio_header *hdr)
888{ 878{
889 struct nfs_commit_info cinfo; 879 struct nfs_commit_info cinfo;
@@ -919,7 +909,6 @@ out:
919 hdr->release(hdr); 909 hdr->release(hdr);
920} 910}
921 911
922#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
923unsigned long 912unsigned long
924nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 913nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
925{ 914{
@@ -976,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
976 return ret; 965 return ret;
977} 966}
978 967
979#else
980unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
981{
982 return 0;
983}
984
985int nfs_scan_commit(struct inode *inode, struct list_head *dst,
986 struct nfs_commit_info *cinfo)
987{
988 return 0;
989}
990#endif
991
992/* 968/*
993 * Search for an existing write request, and attempt to update 969 * Search for an existing write request, and attempt to update
994 * it to reflect a new dirty region on a given page. 970 * it to reflect a new dirty region on a given page.
@@ -1381,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1381 return status; 1357 return status;
1382 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); 1358 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
1383 1359
1384#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1385 if (hdr->res.verf->committed < hdr->args.stable && 1360 if (hdr->res.verf->committed < hdr->args.stable &&
1386 task->tk_status >= 0) { 1361 task->tk_status >= 0) {
1387 /* We tried a write call, but the server did not 1362 /* We tried a write call, but the server did not
@@ -1403,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1403 complain = jiffies + 300 * HZ; 1378 complain = jiffies + 300 * HZ;
1404 } 1379 }
1405 } 1380 }
1406#endif
1407 1381
1408 /* Deal with the suid/sgid bit corner case */ 1382 /* Deal with the suid/sgid bit corner case */
1409 if (nfs_should_remove_suid(inode)) 1383 if (nfs_should_remove_suid(inode))
@@ -1456,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task,
1456} 1430}
1457 1431
1458 1432
1459#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1460static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1433static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1461{ 1434{
1462 int ret; 1435 int ret;
@@ -1525,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1525} 1498}
1526EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1499EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1527 1500
1501static loff_t nfs_get_lwb(struct list_head *head)
1502{
1503 loff_t lwb = 0;
1504 struct nfs_page *req;
1505
1506 list_for_each_entry(req, head, wb_list)
1507 if (lwb < (req_offset(req) + req->wb_bytes))
1508 lwb = req_offset(req) + req->wb_bytes;
1509
1510 return lwb;
1511}
1512
1528/* 1513/*
1529 * Set up the argument/result storage required for the RPC call. 1514 * Set up the argument/result storage required for the RPC call.
1530 */ 1515 */
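nfs_get_lwb() derives the "last write byte" (lwb) for a pnfs commit: the
highest end offset, req_offset() + wb_bytes, over every request on the
commit list. A minimal sketch of the same reduction, using a simplified
request type in place of struct nfs_page (names here are illustrative only):

	/* Sketch: highest end offset across a set of write requests. */
	#include <stddef.h>

	struct req { long long offset; unsigned int bytes; };

	static long long get_lwb(const struct req *reqs, size_t n)
	{
		long long lwb = 0;

		for (size_t i = 0; i < n; i++) {
			long long end = reqs[i].offset + reqs[i].bytes;
			if (end > lwb)
				lwb = end;	/* keep the furthest endpoint */
		}
		return lwb;
	}

For requests covering [0, 4096) and [8192, 12288), lwb is 12288. As the
next hunk notes, nfs_init_commit() fills data->lwb only when a layout
segment is present, since only the pnfs commit path consumes it.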
@@ -1544,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
1544 data->inode = inode; 1529 data->inode = inode;
1545 data->cred = first->wb_context->cred; 1530 data->cred = first->wb_context->cred;
1546 data->lseg = lseg; /* reference transferred */ 1531 data->lseg = lseg; /* reference transferred */
1532 /* only set lwb for pnfs commit */
1533 if (lseg)
1534 data->lwb = nfs_get_lwb(&data->pages);
1547 data->mds_ops = &nfs_commit_ops; 1535 data->mds_ops = &nfs_commit_ops;
1548 data->completion_ops = cinfo->completion_ops; 1536 data->completion_ops = cinfo->completion_ops;
1549 data->dreq = cinfo->dreq; 1537 data->dreq = cinfo->dreq;
@@ -1623,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1623 struct nfs_page *req; 1611 struct nfs_page *req;
1624 int status = data->task.tk_status; 1612 int status = data->task.tk_status;
1625 struct nfs_commit_info cinfo; 1613 struct nfs_commit_info cinfo;
1614 struct nfs_server *nfss;
1626 1615
1627 while (!list_empty(&data->pages)) { 1616 while (!list_empty(&data->pages)) {
1628 req = nfs_list_entry(data->pages.next); 1617 req = nfs_list_entry(data->pages.next);
@@ -1656,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1656 next: 1645 next:
1657 nfs_unlock_and_release_request(req); 1646 nfs_unlock_and_release_request(req);
1658 } 1647 }
1648 nfss = NFS_SERVER(data->inode);
1649 if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
1650 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
1651
1659 nfs_init_cinfo(&cinfo, data->inode, data->dreq); 1652 nfs_init_cinfo(&cinfo, data->inode, data->dreq);
1660 if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) 1653 if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
1661 nfs_commit_clear_lock(NFS_I(data->inode)); 1654 nfs_commit_clear_lock(NFS_I(data->inode));
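The lines added above pair commit completion with writeback congestion
control: once the per-server writeback count falls below
NFS_CONGESTION_OFF_THRESH, the bdi is marked uncongested so the VM resumes
dispatching writes. A rough sketch of the hysteresis this implies (the
ON-side check lives elsewhere in write.c; the helper below is a simplified
stand-in, not kernel code):

	/* Sketch: bdi congestion hysteresis around the two thresholds.
	 * The OFF threshold sits below the ON threshold so the state
	 * does not flap at the boundary. */
	static void update_congestion(struct nfs_server *nfss)
	{
		long wb = atomic_long_read(&nfss->writeback);

		if (wb > NFS_CONGESTION_ON_THRESH)
			set_bdi_congested(&nfss->backing_dev_info,
					  BLK_RW_ASYNC);
		else if (wb < NFS_CONGESTION_OFF_THRESH)
			clear_bdi_congested(&nfss->backing_dev_info,
					    BLK_RW_ASYNC);
	}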
@@ -1765,12 +1758,6 @@ out_mark_dirty:
1765 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1758 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1766 return ret; 1759 return ret;
1767} 1760}
1768#else
1769static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1770{
1771 return 0;
1772}
1773#endif
1774 1761
1775int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1762int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1776{ 1763{
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o 5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
6
7nfs_acl-objs := nfsacl.o 6nfs_acl-objs := nfsacl.o
7
8obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
1/* 1/*
2 * Common code for control of lockd and nfsv4 grace periods. 2 * Common code for control of lockd and nfsv4 grace periods.
3 *
4 * Transplanted from lockd code
3 */ 5 */
4 6
5#include <linux/module.h> 7#include <linux/module.h>
6#include <linux/lockd/bind.h>
7#include <net/net_namespace.h> 8#include <net/net_namespace.h>
9#include <net/netns/generic.h>
10#include <linux/fs.h>
8 11
9#include "netns.h" 12static int grace_net_id;
10
11static DEFINE_SPINLOCK(grace_lock); 13static DEFINE_SPINLOCK(grace_lock);
12 14
13/** 15/**
14 * locks_start_grace 16 * locks_start_grace
17 * @net: net namespace that this lock manager belongs to
15 * @lm: who this grace period is for 18 * @lm: who this grace period is for
16 * 19 *
17 * A grace period is a period during which locks should not be given 20 * A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
21 * 24 *
22 * This function is called to start a grace period. 25 * This function is called to start a grace period.
23 */ 26 */
24void locks_start_grace(struct net *net, struct lock_manager *lm) 27void
28locks_start_grace(struct net *net, struct lock_manager *lm)
25{ 29{
26 struct lockd_net *ln = net_generic(net, lockd_net_id); 30 struct list_head *grace_list = net_generic(net, grace_net_id);
27 31
28 spin_lock(&grace_lock); 32 spin_lock(&grace_lock);
29 list_add(&lm->list, &ln->grace_list); 33 list_add(&lm->list, grace_list);
30 spin_unlock(&grace_lock); 34 spin_unlock(&grace_lock);
31} 35}
32EXPORT_SYMBOL_GPL(locks_start_grace); 36EXPORT_SYMBOL_GPL(locks_start_grace);
33 37
34/** 38/**
35 * locks_end_grace 39 * locks_end_grace
40 * @net: net namespace that this lock manager belongs to
36 * @lm: who this grace period is for 41 * @lm: who this grace period is for
37 * 42 *
38 * Call this function to state that the given lock manager is ready to 43 * Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
41 * Note that callers count on it being safe to call this more than once, 46 * Note that callers count on it being safe to call this more than once,
42 * and the second call should be a no-op. 47 * and the second call should be a no-op.
43 */ 48 */
44void locks_end_grace(struct lock_manager *lm) 49void
50locks_end_grace(struct lock_manager *lm)
45{ 51{
46 spin_lock(&grace_lock); 52 spin_lock(&grace_lock);
47 list_del_init(&lm->list); 53 list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
56 * to answer ordinary lock requests, and when they should accept only 62 * to answer ordinary lock requests, and when they should accept only
57 * lock reclaims. 63 * lock reclaims.
58 */ 64 */
59int locks_in_grace(struct net *net) 65int
66locks_in_grace(struct net *net)
60{ 67{
61 struct lockd_net *ln = net_generic(net, lockd_net_id); 68 struct list_head *grace_list = net_generic(net, grace_net_id);
62 69
63 return !list_empty(&ln->grace_list); 70 return !list_empty(grace_list);
64} 71}
65EXPORT_SYMBOL_GPL(locks_in_grace); 72EXPORT_SYMBOL_GPL(locks_in_grace);
73
74static int __net_init
75grace_init_net(struct net *net)
76{
77 struct list_head *grace_list = net_generic(net, grace_net_id);
78
79 INIT_LIST_HEAD(grace_list);
80 return 0;
81}
82
83static void __net_exit
84grace_exit_net(struct net *net)
85{
86 struct list_head *grace_list = net_generic(net, grace_net_id);
87
88 BUG_ON(!list_empty(grace_list));
89}
90
91static struct pernet_operations grace_net_ops = {
92 .init = grace_init_net,
93 .exit = grace_exit_net,
94 .id = &grace_net_id,
95 .size = sizeof(struct list_head),
96};
97
98static int __init
99init_grace(void)
100{
101 return register_pernet_subsys(&grace_net_ops);
102}
103
104static void __exit
105exit_grace(void)
106{
107 unregister_pernet_subsys(&grace_net_ops);
108}
109
110MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
111MODULE_LICENSE("GPL");
112module_init(init_grace)
113module_exit(exit_grace)
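With grace-period tracking moved out of lockd, any lock manager (lockd or
nfsd's v4 lock manager) registers on a per-net list through the exported
helpers. A minimal sketch of a caller, using only the interfaces defined
above (my_lm and the two functions are hypothetical):

	/* Sketch: a lock manager entering and leaving its grace period. */
	#include <linux/fs.h>

	static struct lock_manager my_lm;

	static void my_startup(struct net *net)
	{
		/* from here on, only lock reclaims should be honored */
		locks_start_grace(net, &my_lm);
	}

	static void my_grace_over(struct net *net)
	{
		locks_end_grace(&my_lm);	/* safe to call twice */
		if (!locks_in_grace(net))
			pr_info("all lock managers out of grace\n");
	}

Note that locks_in_grace() answers for the whole namespace: it stays true
until every manager registered in that net has called locks_end_grace().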
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f3586b645d7d..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select SUNRPC_GSS 72 select SUNRPC_GSS
73 select CRYPTO 73 select CRYPTO
74 select GRACE_PERIOD
74 help 75 help
75 This option enables support in your system's NFS server for 76 This option enables support in your system's NFS server for
76 version 4 of the NFS protocol (RFC 3530). 77 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e0be57b0f79b..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
49 49
50/* Index of predefined Linux callback client operations */ 50/* Index of predefined Linux callback client operations */
51 51
52enum {
53 NFSPROC4_CLNT_CB_NULL = 0,
54 NFSPROC4_CLNT_CB_RECALL,
55 NFSPROC4_CLNT_CB_SEQUENCE,
56};
57
58struct nfs4_cb_compound_hdr { 52struct nfs4_cb_compound_hdr {
59 /* args */ 53 /* args */
60 u32 ident; /* minorversion 0 only */ 54 u32 ident; /* minorversion 0 only */
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
494static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, 488static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
495 const struct nfsd4_callback *cb) 489 const struct nfsd4_callback *cb)
496{ 490{
497 const struct nfs4_delegation *args = cb->cb_op; 491 const struct nfs4_delegation *dp = cb_to_delegation(cb);
498 struct nfs4_cb_compound_hdr hdr = { 492 struct nfs4_cb_compound_hdr hdr = {
499 .ident = cb->cb_clp->cl_cb_ident, 493 .ident = cb->cb_clp->cl_cb_ident,
500 .minorversion = cb->cb_minorversion, 494 .minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
502 496
503 encode_cb_compound4args(xdr, &hdr); 497 encode_cb_compound4args(xdr, &hdr);
504 encode_cb_sequence4args(xdr, cb, &hdr); 498 encode_cb_sequence4args(xdr, cb, &hdr);
505 encode_cb_recall4args(xdr, args, &hdr); 499 encode_cb_recall4args(xdr, dp, &hdr);
506 encode_cb_nops(&hdr); 500 encode_cb_nops(&hdr);
507} 501}
508 502
@@ -746,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
746 740
747static struct workqueue_struct *callback_wq; 741static struct workqueue_struct *callback_wq;
748 742
749static void run_nfsd4_cb(struct nfsd4_callback *cb)
750{
751 queue_work(callback_wq, &cb->cb_work);
752}
753
754static void do_probe_callback(struct nfs4_client *clp)
755{
756 struct nfsd4_callback *cb = &clp->cl_cb_null;
757
758 cb->cb_op = NULL;
759 cb->cb_clp = clp;
760
761 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
762 cb->cb_msg.rpc_argp = NULL;
763 cb->cb_msg.rpc_resp = NULL;
764
765 cb->cb_ops = &nfsd4_cb_probe_ops;
766
767 run_nfsd4_cb(cb);
768}
769
770/* 743/*
771 * Poke the callback thread to process any updates to the callback 744 * Poke the callback thread to process any updates to the callback
772 * parameters, and send a null probe. 745 * parameters, and send a null probe.
@@ -775,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
775{ 748{
776 clp->cl_cb_state = NFSD4_CB_UNKNOWN; 749 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
777 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); 750 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
778 do_probe_callback(clp); 751 nfsd4_run_cb(&clp->cl_cb_null);
779} 752}
780 753
781void nfsd4_probe_callback_sync(struct nfs4_client *clp) 754void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -847,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
847 rpc_wake_up_next(&clp->cl_cb_waitq); 820 rpc_wake_up_next(&clp->cl_cb_waitq);
848 dprintk("%s: freed slot, new seqid=%d\n", __func__, 821 dprintk("%s: freed slot, new seqid=%d\n", __func__,
849 clp->cl_cb_session->se_cb_seq_nr); 822 clp->cl_cb_session->se_cb_seq_nr);
850
851 /* We're done looking into the sequence information */
852 task->tk_msg.rpc_resp = NULL;
853 } 823 }
854}
855
856
857static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
858{
859 struct nfsd4_callback *cb = calldata;
860 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
861 struct nfs4_client *clp = cb->cb_clp;
862 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
863
864 nfsd4_cb_done(task, calldata);
865 824
866 if (current_rpc_client != task->tk_client) { 825 if (clp->cl_cb_client != task->tk_client) {
867 /* We're shutting down or changing cl_cb_client; leave 826 /* We're shutting down or changing cl_cb_client; leave
868 * it to nfsd4_process_cb_update to restart the call if 827 * it to nfsd4_process_cb_update to restart the call if
869 * necessary. */ 828 * necessary. */
@@ -872,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
872 831
873 if (cb->cb_done) 832 if (cb->cb_done)
874 return; 833 return;
875 switch (task->tk_status) { 834
835 switch (cb->cb_ops->done(cb, task)) {
876 case 0: 836 case 0:
877 cb->cb_done = true; 837 task->tk_status = 0;
838 rpc_restart_call_prepare(task);
878 return; 839 return;
879 case -EBADHANDLE: 840 case 1:
880 case -NFS4ERR_BAD_STATEID:
881 /* Race: client probably got cb_recall
882 * before open reply granting delegation */
883 break; 841 break;
884 default: 842 case -1:
885 /* Network partition? */ 843 /* Network partition? */
886 nfsd4_mark_cb_down(clp, task->tk_status); 844 nfsd4_mark_cb_down(clp, task->tk_status);
845 break;
846 default:
847 BUG();
887 } 848 }
888 if (dp->dl_retries--) {
889 rpc_delay(task, 2*HZ);
890 task->tk_status = 0;
891 rpc_restart_call_prepare(task);
892 return;
893 }
894 nfsd4_mark_cb_down(clp, task->tk_status);
895 cb->cb_done = true; 849 cb->cb_done = true;
896} 850}
897 851
898static void nfsd4_cb_recall_release(void *calldata) 852static void nfsd4_cb_release(void *calldata)
899{ 853{
900 struct nfsd4_callback *cb = calldata; 854 struct nfsd4_callback *cb = calldata;
901 struct nfs4_client *clp = cb->cb_clp; 855 struct nfs4_client *clp = cb->cb_clp;
902 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
903 856
904 if (cb->cb_done) { 857 if (cb->cb_done) {
905 spin_lock(&clp->cl_lock); 858 spin_lock(&clp->cl_lock);
906 list_del(&cb->cb_per_client); 859 list_del(&cb->cb_per_client);
907 spin_unlock(&clp->cl_lock); 860 spin_unlock(&clp->cl_lock);
908 nfs4_put_stid(&dp->dl_stid); 861
862 cb->cb_ops->release(cb);
909 } 863 }
910} 864}
911 865
912static const struct rpc_call_ops nfsd4_cb_recall_ops = { 866static const struct rpc_call_ops nfsd4_cb_ops = {
913 .rpc_call_prepare = nfsd4_cb_prepare, 867 .rpc_call_prepare = nfsd4_cb_prepare,
914 .rpc_call_done = nfsd4_cb_recall_done, 868 .rpc_call_done = nfsd4_cb_done,
915 .rpc_release = nfsd4_cb_recall_release, 869 .rpc_release = nfsd4_cb_release,
916}; 870};
917 871
918int nfsd4_create_callback_queue(void) 872int nfsd4_create_callback_queue(void)
@@ -937,16 +891,10 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
937 * instead, nfsd4_run_cb_null() will detect the killed 891 * instead, nfsd4_run_cb_null() will detect the killed
938 * client, destroy the rpc client, and stop: 892 * client, destroy the rpc client, and stop:
939 */ 893 */
940 do_probe_callback(clp); 894 nfsd4_run_cb(&clp->cl_cb_null);
941 flush_workqueue(callback_wq); 895 flush_workqueue(callback_wq);
942} 896}
943 897
944static void nfsd4_release_cb(struct nfsd4_callback *cb)
945{
946 if (cb->cb_ops->rpc_release)
947 cb->cb_ops->rpc_release(cb);
948}
949
950/* requires cl_lock: */ 898/* requires cl_lock: */
951static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) 899static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
952{ 900{
@@ -1009,63 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
1009 } 957 }
1010 /* Yay, the callback channel's back! Restart any callbacks: */ 958 /* Yay, the callback channel's back! Restart any callbacks: */
1011 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) 959 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
1012 run_nfsd4_cb(cb); 960 queue_work(callback_wq, &cb->cb_work);
1013} 961}
1014 962
1015static void 963static void
1016nfsd4_run_callback_rpc(struct nfsd4_callback *cb) 964nfsd4_run_cb_work(struct work_struct *work)
1017{ 965{
966 struct nfsd4_callback *cb =
967 container_of(work, struct nfsd4_callback, cb_work);
1018 struct nfs4_client *clp = cb->cb_clp; 968 struct nfs4_client *clp = cb->cb_clp;
1019 struct rpc_clnt *clnt; 969 struct rpc_clnt *clnt;
1020 970
971 if (cb->cb_ops && cb->cb_ops->prepare)
972 cb->cb_ops->prepare(cb);
973
1021 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) 974 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
1022 nfsd4_process_cb_update(cb); 975 nfsd4_process_cb_update(cb);
1023 976
1024 clnt = clp->cl_cb_client; 977 clnt = clp->cl_cb_client;
1025 if (!clnt) { 978 if (!clnt) {
1026 /* Callback channel broken, or client killed; give up: */ 979 /* Callback channel broken, or client killed; give up: */
1027 nfsd4_release_cb(cb); 980 if (cb->cb_ops && cb->cb_ops->release)
981 cb->cb_ops->release(cb);
1028 return; 982 return;
1029 } 983 }
1030 cb->cb_msg.rpc_cred = clp->cl_cb_cred; 984 cb->cb_msg.rpc_cred = clp->cl_cb_cred;
1031 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 985 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1032 cb->cb_ops, cb); 986 cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
1033} 987}
1034 988
1035void 989void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
1036nfsd4_run_cb_null(struct work_struct *w) 990 struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
1037{ 991{
1038 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1039 cb_work);
1040 nfsd4_run_callback_rpc(cb);
1041}
1042
1043void
1044nfsd4_run_cb_recall(struct work_struct *w)
1045{
1046 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1047 cb_work);
1048
1049 nfsd4_prepare_cb_recall(cb->cb_op);
1050 nfsd4_run_callback_rpc(cb);
1051}
1052
1053void nfsd4_cb_recall(struct nfs4_delegation *dp)
1054{
1055 struct nfsd4_callback *cb = &dp->dl_recall;
1056 struct nfs4_client *clp = dp->dl_stid.sc_client;
1057
1058 dp->dl_retries = 1;
1059 cb->cb_op = dp;
1060 cb->cb_clp = clp; 992 cb->cb_clp = clp;
1061 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 993 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
1062 cb->cb_msg.rpc_argp = cb; 994 cb->cb_msg.rpc_argp = cb;
1063 cb->cb_msg.rpc_resp = cb; 995 cb->cb_msg.rpc_resp = cb;
1064 996 cb->cb_ops = ops;
1065 cb->cb_ops = &nfsd4_cb_recall_ops; 997 INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
1066
1067 INIT_LIST_HEAD(&cb->cb_per_client); 998 INIT_LIST_HEAD(&cb->cb_per_client);
1068 cb->cb_done = true; 999 cb->cb_done = true;
1000}
1069 1001
1070 run_nfsd4_cb(&dp->dl_recall); 1002void nfsd4_run_cb(struct nfsd4_callback *cb)
1003{
1004 queue_work(callback_wq, &cb->cb_work);
1071} 1005}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
215 memset(&ent, 0, sizeof(ent)); 215 memset(&ent, 0, sizeof(ent));
216 216
217 /* Authentication name */ 217 /* Authentication name */
218 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 218 len = qword_get(&buf, buf1, PAGE_SIZE);
219 if (len <= 0 || len >= IDMAP_NAMESZ)
219 goto out; 220 goto out;
220 memcpy(ent.authname, buf1, sizeof(ent.authname)); 221 memcpy(ent.authname, buf1, sizeof(ent.authname));
221 222
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
245 /* Name */ 246 /* Name */
246 error = -EINVAL; 247 error = -EINVAL;
247 len = qword_get(&buf, buf1, PAGE_SIZE); 248 len = qword_get(&buf, buf1, PAGE_SIZE);
248 if (len < 0) 249 if (len < 0 || len >= IDMAP_NAMESZ)
249 goto out; 250 goto out;
250 if (len == 0) 251 if (len == 0)
251 set_bit(CACHE_NEGATIVE, &ent.h.flags); 252 set_bit(CACHE_NEGATIVE, &ent.h.flags);
252 else if (len >= IDMAP_NAMESZ)
253 goto out;
254 else 253 else
255 memcpy(ent.name, buf1, sizeof(ent.name)); 254 memcpy(ent.name, buf1, sizeof(ent.name));
256 error = -ENOMEM; 255 error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
259 goto out; 258 goto out;
260 259
261 cache_put(&res->h, cd); 260 cache_put(&res->h, cd);
262
263 error = 0; 261 error = 0;
264out: 262out:
265 kfree(buf1); 263 kfree(buf1);
266
267 return error; 264 return error;
268} 265}
269 266
270
271static struct ent * 267static struct ent *
272idtoname_lookup(struct cache_detail *cd, struct ent *item) 268idtoname_lookup(struct cache_detail *cd, struct ent *item)
273{ 269{
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
368{ 364{
369 struct ent ent, *res; 365 struct ent ent, *res;
370 char *buf1; 366 char *buf1;
371 int error = -EINVAL; 367 int len, error = -EINVAL;
372 368
373 if (buf[buflen - 1] != '\n') 369 if (buf[buflen - 1] != '\n')
374 return (-EINVAL); 370 return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
381 memset(&ent, 0, sizeof(ent)); 377 memset(&ent, 0, sizeof(ent));
382 378
383 /* Authentication name */ 379 /* Authentication name */
384 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 380 len = qword_get(&buf, buf1, PAGE_SIZE);
381 if (len <= 0 || len >= IDMAP_NAMESZ)
385 goto out; 382 goto out;
386 memcpy(ent.authname, buf1, sizeof(ent.authname)); 383 memcpy(ent.authname, buf1, sizeof(ent.authname));
387 384
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
392 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; 389 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
393 390
394 /* Name */ 391 /* Name */
395 error = qword_get(&buf, buf1, PAGE_SIZE); 392 len = qword_get(&buf, buf1, PAGE_SIZE);
396 if (error <= 0 || error >= IDMAP_NAMESZ) 393 if (len <= 0 || len >= IDMAP_NAMESZ)
397 goto out; 394 goto out;
398 memcpy(ent.name, buf1, sizeof(ent.name)); 395 memcpy(ent.name, buf1, sizeof(ent.name));
399 396
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
421 error = 0; 418 error = 0;
422out: 419out:
423 kfree(buf1); 420 kfree(buf1);
424
425 return (error); 421 return (error);
426} 422}
427 423
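Both parsers now apply the same guard: reject any field whose qword_get()
length is out of range for the fixed IDMAP_NAMESZ destination before the
memcpy() runs. Reduced to its essentials (a sketch; 64 stands in for
IDMAP_NAMESZ, and src is assumed to be a PAGE_SIZE scratch buffer as in
nfsd):

	#include <string.h>

	#define NAMESZ 64

	static int copy_name(char *dst, const char *src, int len)
	{
		/* len < 0: parse error; 0: empty; >= NAMESZ: the NUL at
		 * src[len] would fall outside the destination buffer. */
		if (len <= 0 || len >= NAMESZ)
			return -1;
		memcpy(dst, src, NAMESZ);	/* NUL lands inside dst[] */
		return 0;
	}

Without the upper bound, the fixed-size memcpy() would leave dst without a
terminating NUL whenever the input name filled the buffer exactly or more.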
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5e0dc528a0e8..cdeb3cfd6f32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1013,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1013 return status; 1013 return status;
1014} 1014}
1015 1015
1016static __be32
1017nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1018 struct nfsd4_seek *seek)
1019{
1020 int whence;
1021 __be32 status;
1022 struct file *file;
1023
1024 status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
1025 &seek->seek_stateid,
1026 RD_STATE, &file);
1027 if (status) {
1028 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
1029 return status;
1030 }
1031
1032 switch (seek->seek_whence) {
1033 case NFS4_CONTENT_DATA:
1034 whence = SEEK_DATA;
1035 break;
1036 case NFS4_CONTENT_HOLE:
1037 whence = SEEK_HOLE;
1038 break;
1039 default:
1040 status = nfserr_union_notsupp;
1041 goto out;
1042 }
1043
1044 /*
1045 * Note: This call does change file->f_pos, but nothing in NFSD
1046 * should ever use file->f_pos.
1047 */
1048 seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
1049 if (seek->seek_pos < 0)
1050 status = nfserrno(seek->seek_pos);
1051 else if (seek->seek_pos >= i_size_read(file_inode(file)))
1052 seek->seek_eof = true;
1053
1054out:
1055 fput(file);
1056 return status;
1057}
1058
1016/* This routine never returns NFS_OK! If there are no other errors, it 1059/* This routine never returns NFS_OK! If there are no other errors, it
1017 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the 1060 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
1018 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME 1061 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
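nfsd4_seek() maps the NFSv4.2 SEEK operation directly onto the VFS
SEEK_DATA/SEEK_HOLE machinery, so its results track lseek(2). A userspace
sketch of the equivalent query (error handling trimmed; requires a
filesystem that reports holes):

	#define _GNU_SOURCE	/* for SEEK_DATA / SEEK_HOLE */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd = open(argv[1], O_RDONLY);
		/* next data byte at or after offset 0 */
		off_t data = lseek(fd, 0, SEEK_DATA);
		/* next hole at or after that data */
		off_t hole = lseek(fd, data, SEEK_HOLE);

		printf("data at %lld, hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}

The seek_eof computation is the server-side analogue of ENXIO from
lseek(SEEK_DATA) past the last byte: any result at or beyond i_size is
reported to the client as end-of-file.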
@@ -1881,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1881 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 1924 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1882 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 1925 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1883 }, 1926 },
1927
1928 /* NFSv4.2 operations */
1929 [OP_SEEK] = {
1930 .op_func = (nfsd4op_func)nfsd4_seek,
1931 .op_name = "OP_SEEK",
1932 },
1884}; 1933};
1885 1934
1886int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) 1935int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..ea95a2bc21b5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
58 void (*create)(struct nfs4_client *); 58 void (*create)(struct nfs4_client *);
59 void (*remove)(struct nfs4_client *); 59 void (*remove)(struct nfs4_client *);
60 int (*check)(struct nfs4_client *); 60 int (*check)(struct nfs4_client *);
61 void (*grace_done)(struct nfsd_net *, time_t); 61 void (*grace_done)(struct nfsd_net *);
62}; 62};
63 63
64/* Globals */ 64/* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
188 188
189 status = mnt_want_write_file(nn->rec_file); 189 status = mnt_want_write_file(nn->rec_file);
190 if (status) 190 if (status)
191 return; 191 goto out_creds;
192 192
193 dir = nn->rec_file->f_path.dentry; 193 dir = nn->rec_file->f_path.dentry;
194 /* lock the parent */ 194 /* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
228 user_recovery_dirname); 228 user_recovery_dirname);
229 } 229 }
230 mnt_drop_write_file(nn->rec_file); 230 mnt_drop_write_file(nn->rec_file);
231out_creds:
231 nfs4_reset_creds(original_cred); 232 nfs4_reset_creds(original_cred);
232} 233}
233 234
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
392} 393}
393 394
394static void 395static void
395nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) 396nfsd4_recdir_purge_old(struct nfsd_net *nn)
396{ 397{
397 int status; 398 int status;
398 399
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
479 return status; 480 return status;
480} 481}
481 482
483static void
484nfsd4_shutdown_recdir(struct net *net)
485{
486 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
487
488 if (!nn->rec_file)
489 return;
490 fput(nn->rec_file);
491 nn->rec_file = NULL;
492}
482 493
483static int 494static int
484nfs4_legacy_state_init(struct net *net) 495nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
512 int status; 523 int status;
513 524
514 status = nfsd4_init_recdir(net); 525 status = nfsd4_init_recdir(net);
515 if (!status)
516 status = nfsd4_recdir_load(net);
517 if (status) 526 if (status)
518 printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); 527 return status;
528
529 status = nfsd4_recdir_load(net);
530 if (status)
531 nfsd4_shutdown_recdir(net);
532
519 return status; 533 return status;
520} 534}
521 535
@@ -546,21 +560,12 @@ err:
546} 560}
547 561
548static void 562static void
549nfsd4_shutdown_recdir(struct nfsd_net *nn)
550{
551 if (!nn->rec_file)
552 return;
553 fput(nn->rec_file);
554 nn->rec_file = NULL;
555}
556
557static void
558nfsd4_legacy_tracking_exit(struct net *net) 563nfsd4_legacy_tracking_exit(struct net *net)
559{ 564{
560 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 565 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
561 566
562 nfs4_release_reclaim(nn); 567 nfs4_release_reclaim(nn);
563 nfsd4_shutdown_recdir(nn); 568 nfsd4_shutdown_recdir(net);
564 nfs4_legacy_state_shutdown(net); 569 nfs4_legacy_state_shutdown(net);
565} 570}
566 571
@@ -1016,7 +1021,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
1016} 1021}
1017 1022
1018static void 1023static void
1019nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) 1024nfsd4_cld_grace_done(struct nfsd_net *nn)
1020{ 1025{
1021 int ret; 1026 int ret;
1022 struct cld_upcall *cup; 1027 struct cld_upcall *cup;
@@ -1029,7 +1034,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
1029 } 1034 }
1030 1035
1031 cup->cu_msg.cm_cmd = Cld_GraceDone; 1036 cup->cu_msg.cm_cmd = Cld_GraceDone;
1032 cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; 1037 cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
1033 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); 1038 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
1034 if (!ret) 1039 if (!ret)
1035 ret = cup->cu_msg.cm_status; 1040 ret = cup->cu_msg.cm_status;
@@ -1062,6 +1067,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
1062 1067
1063#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" 1068#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
1064#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" 1069#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
1070#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
1071#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
1065 1072
1066static char * 1073static char *
1067nfsd4_cltrack_legacy_topdir(void) 1074nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1133,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
1126 return result; 1133 return result;
1127} 1134}
1128 1135
1136static char *
1137nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
1138{
1139 int copied;
1140 size_t len;
1141 char *result;
1142
1143 /* prefix + Y/N character + terminating NULL */
1144 len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
1145
1146 result = kmalloc(len, GFP_KERNEL);
1147 if (!result)
1148 return result;
1149
1150 copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
1151 clp->cl_minorversion ? 'Y' : 'N');
1152 if (copied >= len) {
1153 /* just return nothing if output was truncated */
1154 kfree(result);
1155 return NULL;
1156 }
1157
1158 return result;
1159}
1160
1161static char *
1162nfsd4_cltrack_grace_start(time_t grace_start)
1163{
1164 int copied;
1165 size_t len;
1166 char *result;
1167
1168 /* prefix + max width of int64_t string + terminating NULL */
1169 len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
1170
1171 result = kmalloc(len, GFP_KERNEL);
1172 if (!result)
1173 return result;
1174
1175 copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
1176 grace_start);
1177 if (copied >= len) {
1178 /* just return nothing if output was truncated */
1179 kfree(result);
1180 return NULL;
1181 }
1182
1183 return result;
1184}
1185
1129static int 1186static int
1130nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) 1187nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
1131{ 1188{
1132 char *envp[2]; 1189 char *envp[3];
1133 char *argv[4]; 1190 char *argv[4];
1134 int ret; 1191 int ret;
1135 1192
@@ -1140,10 +1197,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
1140 1197
1141 dprintk("%s: cmd: %s\n", __func__, cmd); 1198 dprintk("%s: cmd: %s\n", __func__, cmd);
1142 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); 1199 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
1143 dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); 1200 dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
1201 dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
1144 1202
1145 envp[0] = legacy; 1203 envp[0] = env0;
1146 envp[1] = NULL; 1204 envp[1] = env1;
1205 envp[2] = NULL;
1147 1206
1148 argv[0] = (char *)cltrack_prog; 1207 argv[0] = (char *)cltrack_prog;
1149 argv[1] = cmd; 1208 argv[1] = cmd;
@@ -1187,28 +1246,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
1187} 1246}
1188 1247
1189static int 1248static int
1190nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) 1249nfsd4_umh_cltrack_init(struct net *net)
1191{ 1250{
1251 int ret;
1252 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1253 char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1254
1192 /* XXX: The usermode helper is not working in containers yet. */ 1255
1193 if (net != &init_net) { 1256 if (net != &init_net) {
1194 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " 1257 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
1195 "tracking in a container!\n"); 1258 "tracking in a container!\n");
1196 return -EINVAL; 1259 return -EINVAL;
1197 } 1260 }
1198 return nfsd4_umh_cltrack_upcall("init", NULL, NULL); 1261
1262 ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
1263 kfree(grace_start);
1264 return ret;
1265}
1266
1267static void
1268nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
1269{
1270 wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
1271 TASK_UNINTERRUPTIBLE);
1272}
1273
1274static void
1275nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
1276{
1277 smp_mb__before_atomic();
1278 clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
1279 smp_mb__after_atomic();
1280 wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
1199} 1281}
1200 1282
1201static void 1283static void
1202nfsd4_umh_cltrack_create(struct nfs4_client *clp) 1284nfsd4_umh_cltrack_create(struct nfs4_client *clp)
1203{ 1285{
1204 char *hexid; 1286 char *hexid, *has_session, *grace_start;
1287 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1288
1289 /*
1290 * With v4.0 clients, there's little difference in outcome between a
1291 * create and check operation, and we can end up calling into this
1292 * function multiple times per client (once for each openowner). So,
1293 * for v4.0 clients skip upcalling once the client has been recorded
1294 * on stable storage.
1295 *
1296 * For v4.1+ clients, the outcome of the two operations is different,
1297 * so we must ensure that we upcall for the create operation. v4.1+
1298 * clients call this on RECLAIM_COMPLETE though, so we should only end
1299 * up doing a single create upcall per client.
1300 */
1301 if (clp->cl_minorversion == 0 &&
1302 test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1303 return;
1205 1304
1206 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1305 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1207 if (!hexid) { 1306 if (!hexid) {
1208 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1307 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1209 return; 1308 return;
1210 } 1309 }
1211 nfsd4_umh_cltrack_upcall("create", hexid, NULL); 1310
1311 has_session = nfsd4_cltrack_client_has_session(clp);
1312 grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1313
1314 nfsd4_cltrack_upcall_lock(clp);
1315 if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
1316 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1317 nfsd4_cltrack_upcall_unlock(clp);
1318
1319 kfree(has_session);
1320 kfree(grace_start);
1212 kfree(hexid); 1321 kfree(hexid);
1213} 1322}
1214 1323
@@ -1217,12 +1326,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
1217{ 1326{
1218 char *hexid; 1327 char *hexid;
1219 1328
1329 if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1330 return;
1331
1220 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1332 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1221 if (!hexid) { 1333 if (!hexid) {
1222 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1334 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1223 return; 1335 return;
1224 } 1336 }
1225 nfsd4_umh_cltrack_upcall("remove", hexid, NULL); 1337
1338 nfsd4_cltrack_upcall_lock(clp);
1339 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
1340 nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
1341 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1342 nfsd4_cltrack_upcall_unlock(clp);
1343
1226 kfree(hexid); 1344 kfree(hexid);
1227} 1345}
1228 1346
@@ -1230,30 +1348,45 @@ static int
1230nfsd4_umh_cltrack_check(struct nfs4_client *clp) 1348nfsd4_umh_cltrack_check(struct nfs4_client *clp)
1231{ 1349{
1232 int ret; 1350 int ret;
1233 char *hexid, *legacy; 1351 char *hexid, *has_session, *legacy;
1352
1353 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1354 return 0;
1234 1355
1235 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1356 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1236 if (!hexid) { 1357 if (!hexid) {
1237 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1358 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1238 return -ENOMEM; 1359 return -ENOMEM;
1239 } 1360 }
1361
1362 has_session = nfsd4_cltrack_client_has_session(clp);
1240 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); 1363 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
1241 ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); 1364
1365 nfsd4_cltrack_upcall_lock(clp);
1366 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
1367 ret = 0;
1368 } else {
1369 ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
1370 if (ret == 0)
1371 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1372 }
1373 nfsd4_cltrack_upcall_unlock(clp);
1374 kfree(has_session);
1242 kfree(legacy); 1375 kfree(legacy);
1243 kfree(hexid); 1376 kfree(hexid);
1377
1244 return ret; 1378 return ret;
1245} 1379}
1246 1380
1247static void 1381static void
1248nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, 1382nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
1249 time_t boot_time)
1250{ 1383{
1251 char *legacy; 1384 char *legacy;
1252 char timestr[22]; /* FIXME: better way to determine max size? */ 1385 char timestr[22]; /* FIXME: better way to determine max size? */
1253 1386
1254 sprintf(timestr, "%ld", boot_time); 1387 sprintf(timestr, "%ld", nn->boot_time);
1255 legacy = nfsd4_cltrack_legacy_topdir(); 1388 legacy = nfsd4_cltrack_legacy_topdir();
1256 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); 1389 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
1257 kfree(legacy); 1390 kfree(legacy);
1258} 1391}
1259 1392
@@ -1356,10 +1489,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
1356} 1489}
1357 1490
1358void 1491void
1359nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) 1492nfsd4_record_grace_done(struct nfsd_net *nn)
1360{ 1493{
1361 if (nn->client_tracking_ops) 1494 if (nn->client_tracking_ops)
1362 nn->client_tracking_ops->grace_done(nn, boot_time); 1495 nn->client_tracking_ops->grace_done(nn);
1363} 1496}
1364 1497
1365static int 1498static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d1b851548b7a..e9c3afe4b5d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -96,6 +96,8 @@ static struct kmem_cache *deleg_slab;
96 96
97static void free_session(struct nfsd4_session *); 97static void free_session(struct nfsd4_session *);
98 98
99static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
100
99static bool is_session_dead(struct nfsd4_session *ses) 101static bool is_session_dead(struct nfsd4_session *ses)
100{ 102{
101 return ses->se_flags & NFS4_SESSION_DEAD; 103 return ses->se_flags & NFS4_SESSION_DEAD;
@@ -650,7 +652,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
650 INIT_LIST_HEAD(&dp->dl_perclnt); 652 INIT_LIST_HEAD(&dp->dl_perclnt);
651 INIT_LIST_HEAD(&dp->dl_recall_lru); 653 INIT_LIST_HEAD(&dp->dl_recall_lru);
652 dp->dl_type = NFS4_OPEN_DELEGATE_READ; 654 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
653 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall); 655 dp->dl_retries = 1;
656 nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
657 &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
654 return dp; 658 return dp;
655out_dec: 659out_dec:
656 atomic_long_dec(&num_delegations); 660 atomic_long_dec(&num_delegations);
@@ -1870,7 +1874,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1870 free_client(clp); 1874 free_client(clp);
1871 return NULL; 1875 return NULL;
1872 } 1876 }
1873 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null); 1877 nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
1874 clp->cl_time = get_seconds(); 1878 clp->cl_time = get_seconds();
1875 clear_bit(0, &clp->cl_cb_slot_busy); 1879 clear_bit(0, &clp->cl_cb_slot_busy);
1876 copy_verf(clp, verf); 1880 copy_verf(clp, verf);
@@ -3355,8 +3359,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
3355 return ret; 3359 return ret;
3356} 3360}
3357 3361
3358void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp) 3362static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
3359{ 3363{
3364 struct nfs4_delegation *dp = cb_to_delegation(cb);
3360 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, 3365 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
3361 nfsd_net_id); 3366 nfsd_net_id);
3362 3367
@@ -3377,6 +3382,43 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
3377 spin_unlock(&state_lock); 3382 spin_unlock(&state_lock);
3378} 3383}
3379 3384
3385static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
3386 struct rpc_task *task)
3387{
3388 struct nfs4_delegation *dp = cb_to_delegation(cb);
3389
3390 switch (task->tk_status) {
3391 case 0:
3392 return 1;
3393 case -EBADHANDLE:
3394 case -NFS4ERR_BAD_STATEID:
3395 /*
3396 * Race: client probably got cb_recall before open reply
3397 * granting delegation.
3398 */
3399 if (dp->dl_retries--) {
3400 rpc_delay(task, 2 * HZ);
3401 return 0;
3402 }
3403 /*FALLTHRU*/
3404 default:
3405 return -1;
3406 }
3407}
3408
3409static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
3410{
3411 struct nfs4_delegation *dp = cb_to_delegation(cb);
3412
3413 nfs4_put_stid(&dp->dl_stid);
3414}
3415
3416static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
3417 .prepare = nfsd4_cb_recall_prepare,
3418 .done = nfsd4_cb_recall_done,
3419 .release = nfsd4_cb_recall_release,
3420};
3421
3380static void nfsd_break_one_deleg(struct nfs4_delegation *dp) 3422static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3381{ 3423{
3382 /* 3424 /*
@@ -3387,7 +3429,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3387 * it's safe to take a reference. 3429 * it's safe to take a reference.
3388 */ 3430 */
3389 atomic_inc(&dp->dl_stid.sc_count); 3431 atomic_inc(&dp->dl_stid.sc_count);
3390 nfsd4_cb_recall(dp); 3432 nfsd4_run_cb(&dp->dl_recall);
3391} 3433}
3392 3434
3393/* Called from break_lease() with i_lock held. */ 3435/* Called from break_lease() with i_lock held. */
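The refactor replaces per-callback rpc_call_ops with a small
nfsd4_callback_ops vtable (declared in the state.h hunk below): ->prepare
runs before the RPC is issued, ->done interprets the completed task, and
->release drops references. The return convention for ->done, as decoded
by the switch in nfsd4_cb_done() above, is:

	 0  restart the RPC (typically after rpc_delay())
	 1  callback finished; cb_done is set
	-1  treat the channel as dead; nfsd4_mark_cb_down() is called

A sketch of a conforming ->done method (transient_error() and
retries_left() are placeholders for per-callback policy, like dl_retries
in the recall case):

	static int example_done(struct nfsd4_callback *cb,
				struct rpc_task *task)
	{
		if (task->tk_status == 0)
			return 1;		/* done */
		if (transient_error(task->tk_status) && retries_left(cb)) {
			rpc_delay(task, 2 * HZ);
			return 0;		/* restart the call */
		}
		return -1;			/* give up; mark cb down */
	}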
@@ -4113,7 +4155,7 @@ out:
4113 return status; 4155 return status;
4114} 4156}
4115 4157
4116static void 4158void
4117nfsd4_end_grace(struct nfsd_net *nn) 4159nfsd4_end_grace(struct nfsd_net *nn)
4118{ 4160{
4119 /* do nothing if grace period already ended */ 4161 /* do nothing if grace period already ended */
@@ -4122,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
4122 4164
4123 dprintk("NFSD: end of grace period\n"); 4165 dprintk("NFSD: end of grace period\n");
4124 nn->grace_ended = true; 4166 nn->grace_ended = true;
4125 nfsd4_record_grace_done(nn, nn->boot_time); 4167 /*
4168 * If the server goes down again right now, an NFSv4
4169 * client will still be allowed to reclaim after it comes back up,
4170 * even if it hasn't yet had a chance to reclaim state this time.
4171 *
4172 */
4173 nfsd4_record_grace_done(nn);
4174 /*
4175 * At this point, NFSv4 clients can still reclaim. But if the
4176 * server crashes, any that have not yet reclaimed will be out
4177 * of luck on the next boot.
4178 *
4179 * (NFSv4.1+ clients are considered to have reclaimed once they
4180 * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
4181 * have reclaimed after their first OPEN.)
4182 */
4126 locks_end_grace(&nn->nfsd4_manager); 4183 locks_end_grace(&nn->nfsd4_manager);
4127 /* 4184 /*
4128 * Now that every NFSv4 client has had the chance to recover and 4185 * At this point, and once lockd and/or any other containers
4129 * to see the (possibly new, possibly shorter) lease time, we 4186 * exit their grace period, further reclaims will fail and
4130 * can safely set the next grace time to the current lease time: 4187 * regular locking can resume.
4131 */ 4188 */
4132 nn->nfsd4_grace = nn->nfsd4_lease;
4133} 4189}
4134 4190
4135static time_t 4191static time_t
@@ -5664,6 +5720,9 @@ nfs4_check_open_reclaim(clientid_t *clid,
5664 if (status) 5720 if (status)
5665 return nfserr_reclaim_bad; 5721 return nfserr_reclaim_bad;
5666 5722
5723 if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
5724 return nfserr_no_grace;
5725
5667 if (nfsd4_client_record_check(cstate->clp)) 5726 if (nfsd4_client_record_check(cstate->clp))
5668 return nfserr_reclaim_bad; 5727 return nfserr_reclaim_bad;
5669 5728
@@ -6361,10 +6420,10 @@ nfs4_state_start_net(struct net *net)
6361 ret = nfs4_state_create_net(net); 6420 ret = nfs4_state_create_net(net);
6362 if (ret) 6421 if (ret)
6363 return ret; 6422 return ret;
6364 nfsd4_client_tracking_init(net);
6365 nn->boot_time = get_seconds(); 6423 nn->boot_time = get_seconds();
6366 locks_start_grace(net, &nn->nfsd4_manager);
6367 nn->grace_ended = false; 6424 nn->grace_ended = false;
6425 locks_start_grace(net, &nn->nfsd4_manager);
6426 nfsd4_client_tracking_init(net);
6368 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", 6427 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
6369 nn->nfsd4_grace, net); 6428 nn->nfsd4_grace, net);
6370 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); 6429 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e771a1a7c6f1..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1514,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1514} 1514}
1515 1515
1516static __be32 1516static __be32
1517nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
1518{
1519 DECODE_HEAD;
1520
1521 status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
1522 if (status)
1523 return status;
1524
1525 READ_BUF(8 + 4);
1526 p = xdr_decode_hyper(p, &seek->seek_offset);
1527 seek->seek_whence = be32_to_cpup(p);
1528
1529 DECODE_TAIL;
1530}
1531
1532static __be32
1517nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) 1533nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1518{ 1534{
1519 return nfs_ok; 1535 return nfs_ok;
@@ -1586,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1586 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1602 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1587 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, 1603 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
1588 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, 1604 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1605
1606 /* new operations for NFSv4.2 */
1607 [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1608 [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
1609 [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
1610 [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
1615 [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1616 [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1617 [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
1618 [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1589}; 1619};
1590 1620
1591static inline bool 1621static inline bool
@@ -2658,6 +2688,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2658 struct xdr_stream *xdr = cd->xdr; 2688 struct xdr_stream *xdr = cd->xdr;
2659 int start_offset = xdr->buf->len; 2689 int start_offset = xdr->buf->len;
2660 int cookie_offset; 2690 int cookie_offset;
2691 u32 name_and_cookie;
2661 int entry_bytes; 2692 int entry_bytes;
2662 __be32 nfserr = nfserr_toosmall; 2693 __be32 nfserr = nfserr_toosmall;
2663 __be64 wire_offset; 2694 __be64 wire_offset;
@@ -2719,7 +2750,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2719 cd->rd_maxcount -= entry_bytes; 2750 cd->rd_maxcount -= entry_bytes;
2720 if (!cd->rd_dircount) 2751 if (!cd->rd_dircount)
2721 goto fail; 2752 goto fail;
2722 cd->rd_dircount--; 2753 /*
2754 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2755 * let's always let through the first entry, at least:
2756 */
2757 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
2758 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2759 goto fail;
2760 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2723 cd->cookie_offset = cookie_offset; 2761 cd->cookie_offset = cookie_offset;
2724skip_entry: 2762skip_entry:
2725 cd->common.err = nfs_ok; 2763 cd->common.err = nfs_ok;
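The new accounting charges each entry against rd_dircount the way RFC 3530
defines the dircount hint: the XDR-padded name length plus an 8-byte
cookie. Since XDR_QUADLEN(l) is ((l) + 3) >> 2, a 5-byte name rounds up to
2 quads, for a charge of 4 * 2 + 8 = 16 bytes. The arithmetic, as a
standalone sketch:

	/* Sketch: per-entry charge against the READDIR dircount hint. */
	#define QUADLEN(l)	(((l) + 3) >> 2)  /* XDR pads to 4 bytes */

	static unsigned int dircount_charge(unsigned int namlen)
	{
		return 4 * QUADLEN(namlen) + 8;	/* padded name + cookie */
	}

	/* dircount_charge(5) == 16, dircount_charge(12) == 20 */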
@@ -3097,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read(
3097 3135
3098 buf->page_len = maxcount; 3136 buf->page_len = maxcount;
3099 buf->len += maxcount; 3137 buf->len += maxcount;
3100 xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; 3138 xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
3139 / PAGE_SIZE;
3101 3140
3102 /* Use rest of head for padding and remaining ops: */ 3141 /* Use rest of head for padding and remaining ops: */
3103 buf->tail[0].iov_base = xdr->p; 3142 buf->tail[0].iov_base = xdr->p;
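The one-line fix above matters whenever the reply's page data does not
start on a page boundary: the number of page slots consumed depends on
page_base as well as length. With PAGE_SIZE 4096, page_base 3000, and
maxcount 2000, the data spans two pages ((3000 + 2000 + 4095) / 4096 = 2),
while the old expression charged only one ((2000 + 4095) / 4096 = 1),
leaving xdr->page_ptr pointing into live data.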
@@ -3322,6 +3361,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3322 } 3361 }
3323 maxcount = min_t(int, maxcount-16, bytes_left); 3362 maxcount = min_t(int, maxcount-16, bytes_left);
3324 3363
3364 /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
3365 if (!readdir->rd_dircount)
3366 readdir->rd_dircount = INT_MAX;
3367
3325 readdir->xdr = xdr; 3368 readdir->xdr = xdr;
3326 readdir->rd_maxcount = maxcount; 3369 readdir->rd_maxcount = maxcount;
3327 readdir->common.err = 0; 3370 readdir->common.err = 0;
@@ -3752,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3752} 3795}
3753 3796
3754static __be32 3797static __be32
3798nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3799 struct nfsd4_seek *seek)
3800{
3801 __be32 *p;
3802
3803 if (nfserr)
3804 return nfserr;
3805
3806 p = xdr_reserve_space(&resp->xdr, 4 + 8);
3807 *p++ = cpu_to_be32(seek->seek_eof);
3808 p = xdr_encode_hyper(p, seek->seek_pos);
3809
3810 return nfserr;
3811}
3812
3813static __be32
3755nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3814nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
3756{ 3815{
3757 return nfserr; 3816 return nfserr;
@@ -3823,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3823 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3882 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3824 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, 3883 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3825 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, 3884 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
3885
3886 /* NFSv4.2 operations */
3887 [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3888 [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
3889 [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
3890 [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
3893 [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
3898 [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
3826}; 3899};
3827 3900
3828/* 3901/*
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4e042105fb6e..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -49,6 +49,7 @@ enum {
49 NFSD_Leasetime, 49 NFSD_Leasetime,
50 NFSD_Gracetime, 50 NFSD_Gracetime,
51 NFSD_RecoveryDir, 51 NFSD_RecoveryDir,
52 NFSD_V4EndGrace,
52#endif 53#endif
53}; 54};
54 55
@@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
68static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 69static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
69static ssize_t write_gracetime(struct file *file, char *buf, size_t size); 70static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
70static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 71static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
72static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
71#endif 73#endif
72 74
73static ssize_t (*write_op[])(struct file *, char *, size_t) = { 75static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
84 [NFSD_Leasetime] = write_leasetime, 86 [NFSD_Leasetime] = write_leasetime,
85 [NFSD_Gracetime] = write_gracetime, 87 [NFSD_Gracetime] = write_gracetime,
86 [NFSD_RecoveryDir] = write_recoverydir, 88 [NFSD_RecoveryDir] = write_recoverydir,
89 [NFSD_V4EndGrace] = write_v4_end_grace,
87#endif 90#endif
88}; 91};
89 92
@@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1077 return rv; 1080 return rv;
1078} 1081}
1079 1082
1083/**
1084 * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
1085 *
1086 * Input:
1087 * buf: ignored
1088 * size: zero
1089 * OR
1090 *
1091 * Input:
1092 * buf: any value
1093 * size: non-zero length of C string in @buf
1094 * Output:
1095 * passed-in buffer filled with "Y" or "N" with a newline
1096 * and NULL-terminated C string. This indicates whether
1097 * the grace period has ended in the current net
1098 * namespace. Return code is the size in bytes of the
1099 * string. Writing a string that starts with 'Y', 'y', or
1100 * '1' to the file will end the grace period for nfsd's v4
1101 * lock manager.
1102 */
1103static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
1104{
1105 struct net *net = file->f_dentry->d_sb->s_fs_info;
1106 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1107
1108 if (size > 0) {
1109 switch (buf[0]) {
1110 case 'Y':
1111 case 'y':
1112 case '1':
1113 nfsd4_end_grace(nn);
1114 break;
1115 default:
1116 return -EINVAL;
1117 }
1118 }
1119
1120 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
1121 nn->grace_ended ? 'Y' : 'N');
1122}
1123
1080#endif 1124#endif
1081 1125
1082/*----------------------------------------------------------------------------*/ 1126/*----------------------------------------------------------------------------*/
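With the nfsd filesystem mounted at its conventional /proc/fs/nfsd, the new
transaction file can be driven from userspace: reading reports "Y" or "N",
and writing a string starting with 'Y', 'y', or '1' ends the v4 grace
period early. A sketch of a minimal client (the path is the conventional
mount point, not something this patch mandates):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char state[4] = "";
		int fd = open("/proc/fs/nfsd/v4_end_grace", O_RDWR);

		if (fd < 0)
			return 1;
		if (write(fd, "Y", 1) != 1)	/* 'Y', 'y', or '1' */
			perror("write");
		/* transaction files return the reply on a follow-up read */
		pread(fd, state, sizeof(state) - 1, 0);
		printf("grace ended: %s", state);	/* "Y\n" or "N\n" */
		close(fd);
		return 0;
	}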
@@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1110 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1154 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1111 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1155 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1112 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1156 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1157 [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
1113#endif 1158#endif
1114 /* last one */ {""} 1159 /* last one */ {""}
1115 }; 1160 };
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e883a5868be6..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -209,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	 * fix that case easily.
 	 */
 		struct cred *new = prepare_creds();
-		if (!new)
-			return nfserrno(-ENOMEM);
+		if (!new) {
+			error = nfserrno(-ENOMEM);
+			goto out;
+		}
 		new->cap_effective =
 			cap_raise_nfsd_set(new->cap_effective,
 					   new->cap_permitted);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 64f291a25a8c..2712042a66b1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,16 +62,21 @@ typedef struct {
 	(s)->si_generation
 
 struct nfsd4_callback {
-	void *cb_op;
 	struct nfs4_client *cb_clp;
 	struct list_head cb_per_client;
 	u32 cb_minorversion;
 	struct rpc_message cb_msg;
-	const struct rpc_call_ops *cb_ops;
+	struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
 	bool cb_done;
 };
 
+struct nfsd4_callback_ops {
+	void (*prepare)(struct nfsd4_callback *);
+	int (*done)(struct nfsd4_callback *, struct rpc_task *);
+	void (*release)(struct nfsd4_callback *);
+};
+
 /*
  * A core object that represents a "common" stateid. These are generally
  * embedded within the different (more specific) stateid objects and contain
@@ -127,6 +132,9 @@ struct nfs4_delegation {
 	struct nfsd4_callback dl_recall;
 };
 
+#define cb_to_delegation(cb) \
+	container_of(cb, struct nfs4_delegation, dl_recall)
+
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
@@ -306,6 +314,7 @@ struct nfs4_client {
 #define NFSD4_CLIENT_STABLE		(2)	/* client on stable storage */
 #define NFSD4_CLIENT_RECLAIM_COMPLETE	(3)	/* reclaim_complete done */
 #define NFSD4_CLIENT_CONFIRMED		(4)	/* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK	(5)	/* upcall serialization */
 #define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \
 					 1 << NFSD4_CLIENT_CB_KILL)
 	unsigned long	cl_flags;
@@ -516,6 +525,13 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 #define RD_STATE		0x00000010
 #define WR_STATE		0x00000020
 
+enum nfsd4_cb_op {
+	NFSPROC4_CLNT_CB_NULL = 0,
+	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
 struct nfsd4_compound_state;
 struct nfsd_net;
 
@@ -530,12 +546,12 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
 		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
-void nfsd4_run_cb_null(struct work_struct *w);
-void nfsd4_run_cb_recall(struct work_struct *w);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
@@ -544,13 +560,16 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 		struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
 extern void nfsd4_client_tracking_exit(struct net *net);
 extern void nfsd4_client_record_create(struct nfs4_client *clp);
 extern void nfsd4_client_record_remove(struct nfs4_client *clp);
 extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
 
 /* nfs fault injection functions */
 #ifdef CONFIG_NFSD_FAULT_INJECTION
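
A rough sketch of how a consumer is meant to wire these pieces together, modeled on the delegation-recall case (the op bodies below are illustrative placeholders, not the patch body; the real implementations live in nfsd's callback code, and nfs4_put_stid is assumed as the ref-drop helper):

/* Illustrative sketch of the new callback interface. */
static void my_recall_release(struct nfsd4_callback *cb)
{
	/* recover the containing delegation via the new helper */
	struct nfs4_delegation *dp = cb_to_delegation(cb);

	nfs4_put_stid(&dp->dl_stid);	/* assumed refcount drop */
}

static struct nfsd4_callback_ops my_recall_ops = {
	/* .prepare and .done elided in this sketch */
	.release	= my_recall_release,
};

static void my_start_recall(struct nfs4_delegation *dp, struct nfs4_client *clp)
{
	nfsd4_init_cb(&dp->dl_recall, clp, &my_recall_ops,
		      NFSPROC4_CLNT_CB_RECALL);
	nfsd4_run_cb(&dp->dl_recall);	/* queues cb_work on the callback queue */
}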
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f501a9b5c9df..965cffd17a0c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -445,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		if (err)
 			goto out;
 		size_change = 1;
+
+		/*
+		 * RFC5661, Section 18.30.4:
+		 * Changing the size of a file with SETATTR indirectly
+		 * changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
+		 */
+		if (iap->ia_size != i_size_read(inode))
+			iap->ia_valid |= ATTR_MTIME;
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
@@ -649,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 {
 	struct path	path;
 	struct inode	*inode;
+	struct file	*file;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
 	int		host_err = 0;
@@ -703,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 	}
-	*filp = dentry_open(&path, flags, current_cred());
-	if (IS_ERR(*filp)) {
-		host_err = PTR_ERR(*filp);
-		*filp = NULL;
-	} else {
-		host_err = ima_file_check(*filp, may_flags);
 
-		if (may_flags & NFSD_MAY_64BIT_COOKIE)
-			(*filp)->f_mode |= FMODE_64BITHASH;
-		else
-			(*filp)->f_mode |= FMODE_32BITHASH;
-	}
+	file = dentry_open(&path, flags, current_cred());
+	if (IS_ERR(file)) {
+		host_err = PTR_ERR(file);
+		goto out_nfserr;
+	}
+
+	host_err = ima_file_check(file, may_flags);
+	if (host_err) {
+		nfsd_close(file);
+		goto out_nfserr;
+	}
+
+	if (may_flags & NFSD_MAY_64BIT_COOKIE)
+		file->f_mode |= FMODE_64BITHASH;
+	else
+		file->f_mode |= FMODE_32BITHASH;
+
+	*filp = file;
 out_nfserr:
 	err = nfserrno(host_err);
 out:
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 465e7799742a..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,17 @@ struct nfsd4_reclaim_complete {
 	u32		rca_one_fs;
 };
 
+struct nfsd4_seek {
+	/* request */
+	stateid_t	seek_stateid;
+	loff_t		seek_offset;
+	u32		seek_whence;
+
+	/* response */
+	u32		seek_eof;
+	loff_t		seek_pos;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -473,6 +484,9 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+
+		/* NFSv4.2 */
+		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
 };
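
The new op maps naturally onto the VFS SEEK_HOLE/SEEK_DATA interface. A condensed sketch of how a server-side handler could fill the response half of struct nfsd4_seek (an illustration of the data flow, not the literal nfsd4_seek implementation; stateid checking and full error mapping are elided):

/* Sketch only: filling a struct nfsd4_seek reply via vfs_llseek(). */
static __be32 seek_sketch(struct file *file, struct nfsd4_seek *seek)
{
	int whence = (seek->seek_whence == NFS4_CONTENT_HOLE) ?
			SEEK_HOLE : SEEK_DATA;
	loff_t pos = vfs_llseek(file, seek->seek_offset, whence);

	if (pos < 0)		/* e.g. -ENXIO: no further data or hole */
		return nfserr_nxio;

	seek->seek_pos = pos;
	seek->seek_eof = (pos >= i_size_read(file_inode(file)));
	return nfs_ok;
}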
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b173a465..d071e7f23de2 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
+#include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/aio.h>
 #include "nilfs.h"
@@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static int nilfs_set_page_dirty(struct page *page)
 {
+	struct inode *inode = page->mapping->host;
 	int ret = __set_page_dirty_nobuffers(page);
 
 	if (page_has_buffers(page)) {
-		struct inode *inode = page->mapping->host;
 		unsigned nr_dirty = 0;
 		struct buffer_head *bh, *head;
 
@@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page)
 
 		if (nr_dirty)
 			nilfs_set_file_dirty(inode, nr_dirty);
+	} else if (ret) {
+		unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		nilfs_set_file_dirty(inode, nr_dirty);
 	}
 	return ret;
 }
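
The new else-branch charges a buffer-less page as one full page's worth of filesystem blocks; a quick worked instance of the shift (the concrete sizes below are illustrative):

/* Stand-alone illustration of the nr_dirty computation above. */
#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;	/* 4 KiB page, i.e. PAGE_CACHE_SHIFT */
	unsigned blkbits = 10;		/* 1 KiB filesystem block */
	unsigned nr_dirty = 1u << (page_shift - blkbits);

	printf("%u blocks accounted per dirtied page\n", nr_dirty); /* 4 */
	return 0;
}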
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b13992a41bd9..c991616acca9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	client_fd = get_unused_fd();
+	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
 	if (client_fd < 0)
 		return client_fd;
 
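
The practical effect is that the event_f_flags requested at fanotify_init() time now reach the per-event descriptors; in particular O_CLOEXEC can finally take effect on them. A userspace illustration (assumed usage, not part of the patch):

/* Userspace view: event fds inherit these flags after this change. */
#include <fcntl.h>
#include <sys/fanotify.h>

int make_group(void)
{
	/* the second argument becomes group->fanotify_data.f_flags and is
	 * now passed through to get_unused_fd_flags() for every event fd */
	return fanotify_init(FAN_CLASS_NOTIF, O_RDONLY | O_CLOEXEC);
}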
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 {
 	struct {
 		struct file_handle handle;
-		u8 pad[64];
+		u8 pad[MAX_HANDLE_SZ];
 	} f;
 	int size, ret, i;
 
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	size = f.handle.handle_bytes >> 2;
 
 	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
-	if ((ret == 255) || (ret == -ENOSPC)) {
+	if ((ret == FILEID_INVALID) || (ret < 0)) {
 		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
 		return 0;
 	}
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 85e7d2b431d9..9c0898c4cfe1 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 		struct fsnotify_group *group, struct vfsmount *mnt,
 		int allow_dups);
 
-/* final kfree of a group */
-extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
-
 /* vfsmount specific destruction of a mark */
 extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ad1995980456..d16b62cb2854 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -31,7 +31,7 @@
 /*
  * Final freeing of a group
  */
-void fsnotify_final_destroy_group(struct fsnotify_group *group)
+static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 0f88bc0b4e6c..7d888d77d59a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	atomic_dec(&group->inotify_data.user->inotify_devs);
-	free_uid(group->inotify_data.user);
+	if (group->inotify_data.user) {
+		atomic_dec(&group->inotify_data.user->inotify_devs);
+		free_uid(group->inotify_data.user);
+	}
 }
 
 static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index dd6103cc93c1..825a54e8f490 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb,
 /* If 1, output debug messages, and if 0, don't. */
 int debug_msgs = 0;
 
-void __ntfs_debug (const char *file, int line, const char *function,
+void __ntfs_debug(const char *file, int line, const char *function,
 		   const char *fmt, ...)
 {
 	struct va_format vaf;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f5ec1ce7a532..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
  * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 	BUG_ON(!nr_pages);
 	err = nr = 0;
 	do {
-		pages[nr] = find_lock_page(mapping, index);
+		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+				FGP_ACCESSED);
 		if (!pages[nr]) {
 			if (!*cached_page) {
 				*cached_page = page_cache_alloc(mapping);
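
find_get_page_flags() with FGP_LOCK is the modern spelling of find_lock_page(); adding FGP_ACCESSED also marks the page referenced so reclaim treats it as recently used, something callers previously had to do by hand. The equivalence in miniature (a sketch against the FGP_* pagecache API introduced in v3.16):

/* Sketch of the lookup equivalence; for orientation only. */
#include <linux/pagemap.h>

static struct page *old_way(struct address_space *mapping, pgoff_t index)
{
	return find_lock_page(mapping, index);	/* locked, no accessed mark */
}

static struct page *new_way(struct address_space *mapping, pgoff_t index)
{
	/* FGP_LOCK: return the page locked, like find_lock_page();
	 * FGP_ACCESSED: additionally mark it accessed for reclaim */
	return find_get_page_flags(mapping, index, FGP_LOCK | FGP_ACCESSED);
}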
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
 }
 
 MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..1ef547e49373 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 	handle_t *handle;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, 0, GFP_NOFS);
 	if (!page) {
+		ocfs2_commit_trans(osb, handle);
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 	wc->w_pages[0] = wc->w_target_page = page;
 	wc->w_num_pages = 1;
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
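
The reordering makes ocfs2_write_begin_inline() take the journal handle before it holds a locked page, matching the ocfs2_write_zero_page() change later in this diff; the usual reading is that this keeps a task holding a locked page from blocking on journal start while the committing thread may be waiting to write that same page. The ordering rule in schematic form (an interpretation of the patch, not text from it):

/* Illustrative shape of the corrected ordering (not literal patch code). */
static int ordered_update(struct ocfs2_super *osb, struct address_space *mapping)
{
	handle_t *handle;
	struct page *page;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); /* 1st */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	page = find_or_create_page(mapping, 0, GFP_NOFS);	/* 2nd */
	if (!page) {
		ocfs2_commit_trans(osb, handle);	/* unwind in reverse */
		return -ENOMEM;
	}

	/* ... modify the page under the handle ... */

	unlock_page(page);
	page_cache_release(page);
	ocfs2_commit_trans(osb, handle);
	return 0;
}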
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..d13385448168 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
 }
 EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
 
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long flags;
+
+	spin_lock_irqsave(&o2hb_live_lock, flags);
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	spin_unlock_irqrestore(&o2hb_live_lock, flags);
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
+
 int o2hb_check_node_heartbeating_from_callback(u8 node_num)
 {
 	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
 void o2hb_exit(void);
 int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..27d1242c8383 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = {
 static int nst_fop_open(struct inode *inode, struct file *file)
 {
 	struct o2net_send_tracking *dummy_nst;
-	struct seq_file *seq;
-	int ret;
 
-	dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
-	if (dummy_nst == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	dummy_nst->st_task = NULL;
-
-	ret = seq_open(file, &nst_seq_ops);
-	if (ret)
-		goto out;
-
-	seq = file->private_data;
-	seq->private = dummy_nst;
+	dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
+	if (!dummy_nst)
+		return -ENOMEM;
 	o2net_debug_add_nst(dummy_nst);
 
-	dummy_nst = NULL;
-
-out:
-	kfree(dummy_nst);
-	return ret;
+	return 0;
 }
 
 static int nst_fop_release(struct inode *inode, struct file *file)
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = {
 	.show = sc_seq_show,
 };
 
-static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
+static int sc_common_open(struct file *file, int ctxt)
 {
+	struct o2net_sock_debug *sd;
 	struct o2net_sock_container *dummy_sc;
-	struct seq_file *seq;
-	int ret;
 
-	dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
-	if (dummy_sc == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	dummy_sc->sc_page = NULL;
+	dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
+	if (!dummy_sc)
+		return -ENOMEM;
 
-	ret = seq_open(file, &sc_seq_ops);
-	if (ret)
-		goto out;
+	sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
+	if (!sd) {
+		kfree(dummy_sc);
+		return -ENOMEM;
+	}
 
-	seq = file->private_data;
-	seq->private = sd;
+	sd->dbg_ctxt = ctxt;
 	sd->dbg_sock = dummy_sc;
-	o2net_debug_add_sc(dummy_sc);
 
-	dummy_sc = NULL;
+	o2net_debug_add_sc(dummy_sc);
 
-out:
-	kfree(dummy_sc);
-	return ret;
+	return 0;
 }
 
 static int sc_fop_release(struct inode *inode, struct file *file)
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
 
 static int stats_fop_open(struct inode *inode, struct file *file)
 {
-	struct o2net_sock_debug *sd;
-
-	sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
-	if (sd == NULL)
-		return -ENOMEM;
-
-	sd->dbg_ctxt = SHOW_SOCK_STATS;
-	sd->dbg_sock = NULL;
-
-	return sc_common_open(file, sd);
+	return sc_common_open(file, SHOW_SOCK_STATS);
 }
 
 static const struct file_operations stats_seq_fops = {
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = {
 
 static int sc_fop_open(struct inode *inode, struct file *file)
 {
-	struct o2net_sock_debug *sd;
-
-	sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
-	if (sd == NULL)
-		return -ENOMEM;
-
-	sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
-	sd->dbg_sock = NULL;
-
-	return sc_common_open(file, sd);
+	return sc_common_open(file, SHOW_SOCK_CONTAINERS);
 }
 
 static const struct file_operations sc_seq_fops = {
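
__seq_open_private(), from <linux/seq_file.h>, allocates a zeroed private area of the requested size, performs seq_open(), and stores the allocation in seq->private, returning it to the caller; that single call is what each of the opens above (and the dlmdebug/dlmglue opens later in this diff) collapses into. The idiom in isolation (names prefixed my_ are placeholders):

/* Minimal sketch of the __seq_open_private() idiom. */
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct my_iter {
	int pos;			/* arrives zeroed */
};

static const struct seq_operations my_seq_ops;	/* .start/.next/.stop/.show elided */

static int my_open(struct inode *inode, struct file *file)
{
	struct my_iter *it;

	/* replaces: kzalloc() + seq_open() + seq->private = it */
	it = __seq_open_private(file, &my_seq_ops, sizeof(*it));
	if (!it)
		return -ENOMEM;
	return 0;
}

static int my_release(struct inode *inode, struct file *file)
{
	/* the matching release must free the private blob */
	return seq_release_private(inode, file);
}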
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..62e8ec619b4c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work)
 	}
 
 out:
-	spin_unlock(&qs->qs_lock);
-	if (fence)
+	if (fence) {
+		spin_unlock(&qs->qs_lock);
 		o2quo_fence_self();
+	} else {
+		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+			"connected: %d, lowest: %d (%sreachable)\n",
+			qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+			lowest_reachable ? "" : "un");
+		spin_unlock(&qs->qs_lock);
+
+	}
+
 }
 
 static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 681691bc233a..97de0fbd9f78 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	if (nn->nn_persistent_error || nn->nn_sc_valid)
 		wake_up(&nn->nn_sc_wq);
 
-	if (!was_err && nn->nn_persistent_error) {
+	if (was_valid && !was_err && nn->nn_persistent_error) {
 		o2quo_conn_err(o2net_num_from_nn(nn));
 		queue_delayed_work(o2net_wq, &nn->nn_still_up,
 				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
 	return ret;
 }
 
+static int o2net_set_usertimeout(struct socket *sock)
+{
+	int user_timeout = O2NET_TCP_USER_TIMEOUT;
+
+	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+			(char *)&user_timeout, sizeof(user_timeout));
+}
+
 static void o2net_initialize_handshake(void)
 {
 	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
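
TCP_USER_TIMEOUT (Linux 2.6.37+) caps how long transmitted data may stay unacknowledged before TCP aborts the connection; pinning it to O2NET_TCP_USER_TIMEOUT (0x7fffffff ms, roughly 24.8 days) effectively stops the TCP stack from tearing down an o2net link on its own, leaving that decision to o2net's idle timer below. The same knob from userspace, for comparison (illustrative, not part of the patch):

/* Userspace analogue of o2net_set_usertimeout(). */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int set_user_timeout(int fd, unsigned int ms)
{
	/* data left unacked longer than 'ms' aborts the connection */
	return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &ms, sizeof(ms));
}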
@@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data)
 #endif
 
 	printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
-	       "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
-	       msecs / 1000, msecs % 1000);
+	       "idle for %lu.%lu secs.\n",
+	       SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
 
-	/*
-	 * Initialize the nn_timeout so that the next connection attempt
-	 * will continue in o2net_start_connect.
-	 */
+	/* An idle timeout occurred; don't shut down the connection, but
+	 * make the fencing decision. Maybe the connection can recover
+	 * before the decision is made.
+	 */
 	atomic_set(&nn->nn_timeout, 1);
+	o2quo_conn_err(o2net_num_from_nn(nn));
+	queue_delayed_work(o2net_wq, &nn->nn_still_up,
+			msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+	o2net_sc_reset_idle_timer(sc);
 
-	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
 {
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	/* clear the fencing decision, since the connection recovered
+	 * from the timeout */
+	if (atomic_read(&nn->nn_timeout)) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		cancel_delayed_work(&nn->nn_still_up);
+		atomic_set(&nn->nn_timeout, 0);
+	}
+
 	/* Only push out an existing timer */
 	if (timer_pending(&sc->sc_idle_timeout))
 		o2net_sc_reset_idle_timer(sc);
@@ -1580,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work)
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0, stop;
 	unsigned int timeout;
+	unsigned int noio_flag;
 
+	/*
+	 * sock_create allocates the sock with GFP_KERNEL. We must set
+	 * the per-process flag PF_MEMALLOC_NOIO so that all allocations
+	 * done by this process are done as if GFP_NOIO was specified, so
+	 * we are not reentering the filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
 	/* if we're greater we initiate tx, otherwise we accept */
 	if (o2nm_this_node() <= o2net_num_from_nn(nn))
 		goto out;
@@ -1650,6 +1679,12 @@ static void o2net_start_connect(struct work_struct *work)
 		goto out;
 	}
 
+	ret = o2net_set_usertimeout(sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 
 	spin_lock(&nn->nn_lock);
@@ -1683,6 +1718,7 @@ out:
 	if (mynode)
 		o2nm_node_put(mynode);
 
+	memalloc_noio_restore(noio_flag);
 	return;
 }
 
@@ -1694,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work)
 	spin_lock(&nn->nn_lock);
 	if (!nn->nn_sc_valid) {
 		printk(KERN_NOTICE "o2net: No connection established with "
-		       "node %u after %u.%u seconds, giving up.\n",
+		       "node %u after %u.%u seconds, check network and"
+		       " cluster configuration.\n",
 		     o2net_num_from_nn(nn),
 		     o2net_idle_timeout() / 1000,
 		     o2net_idle_timeout() % 1000);
@@ -1808,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more)
 	struct o2nm_node *local_node = NULL;
 	struct o2net_sock_container *sc = NULL;
 	struct o2net_node *nn;
+	unsigned int noio_flag;
+
+	/*
+	 * sock_create_lite allocates the sock with GFP_KERNEL. We must set
+	 * the per-process flag PF_MEMALLOC_NOIO so that all allocations
+	 * done by this process are done as if GFP_NOIO was specified, so
+	 * we are not reentering the filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
 
 	BUG_ON(sock == NULL);
 	*more = 0;
@@ -1831,6 +1877,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
 		goto out;
 	}
 
+	ret = o2net_set_usertimeout(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
 	slen = sizeof(sin);
 	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
 				     &slen, 1);
@@ -1918,6 +1970,8 @@ out:
 		o2nm_node_put(local_node);
 	if (sc)
 		sc_put(sc);
+
+	memalloc_noio_restore(noio_flag);
 	return ret;
 }
 
@@ -2113,17 +2167,13 @@ int o2net_init(void)
 	o2quo_init();
 
 	if (o2net_debugfs_init())
-		return -ENOMEM;
+		goto out;
 
 	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
 	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
-	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
-		kfree(o2net_hand);
-		kfree(o2net_keep_req);
-		kfree(o2net_keep_resp);
-		return -ENOMEM;
-	}
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+		goto out;
 
 	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
 	o2net_hand->connector_id = cpu_to_be64(1);
@@ -2148,6 +2198,14 @@ int o2net_init(void)
 	}
 
 	return 0;
+
+out:
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+
+	o2quo_exit();
+	return -ENOMEM;
 }
 
 void o2net_exit(void)
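
The consolidated error path works because kfree(NULL) is defined to be a no-op, so a single out: label can free whichever of the three buffers were actually allocated; it also pairs the early o2quo_init() with o2quo_exit(), which the old early returns skipped. The shape of the idiom (a generic sketch, not o2net code):

/* Generic sketch of the single-error-path idiom from o2net_init(). */
#include <linux/slab.h>

static int init_three(void **a, void **b, void **c)
{
	*a = kzalloc(32, GFP_KERNEL);
	*b = kzalloc(32, GFP_KERNEL);
	*c = kzalloc(32, GFP_KERNEL);
	if (!*a || !*b || !*c)
		goto out;	/* partial failure is fine ... */

	return 0;

out:
	kfree(*a);	/* ... because kfree(NULL) is a no-op */
	kfree(*b);
	kfree(*c);
	*a = *b = *c = NULL;
	return -ENOMEM;
}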
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..c571e849fda4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
 #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	2000
 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT		30000
 
+#define O2NET_TCP_USER_TIMEOUT			0x7fffffff
 
 /* TODO: figure this out.... */
 static inline int o2net_link_down(int err, struct socket *sock)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 18f13c2e4a10..149eb556b8c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = {
 static int debug_lockres_open(struct inode *inode, struct file *file)
 {
 	struct dlm_ctxt *dlm = inode->i_private;
-	int ret = -ENOMEM;
-	struct seq_file *seq;
-	struct debug_lockres *dl = NULL;
+	struct debug_lockres *dl;
+	void *buf;
 
-	dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
-	if (!dl) {
-		mlog_errno(ret);
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
 		goto bail;
-	}
 
-	dl->dl_len = PAGE_SIZE;
-	dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
-	if (!dl->dl_buf) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
+	if (!dl)
+		goto bailfree;
 
-	ret = seq_open(file, &debug_lockres_ops);
-	if (ret) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	seq = file->private_data;
-	seq->private = dl;
+	dl->dl_len = PAGE_SIZE;
+	dl->dl_buf = buf;
 
 	dlm_grab(dlm);
 	dl->dl_ctxt = dlm;
 
 	return 0;
+
+bailfree:
+	kfree(buf);
 bail:
-	if (dl)
-		kfree(dl->dl_buf);
-	kfree(dl);
-	return ret;
+	mlog_errno(-ENOMEM);
+	return -ENOMEM;
 }
 
 static int debug_lockres_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3fcf205ee900..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 	 * to back off and try again. This gives heartbeat a chance
 	 * to catch up.
 	 */
-	if (!o2hb_check_node_heartbeating(query->node_idx)) {
+	if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
 		mlog(0, "node %u is not in our live map yet\n",
 		     query->node_idx);
 
@@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
 	if (!dlm) {
-		mlog_errno(-ENOMEM);
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
 	dlm->name = kstrdup(domain, GFP_KERNEL);
 	if (dlm->name == NULL) {
-		mlog_errno(-ENOMEM);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
 	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->lockres_hash) {
-		mlog_errno(-ENOMEM);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
@@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->master_hash = (struct hlist_head **)
 				dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->master_hash) {
-		mlog_errno(-ENOMEM);
-		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
@@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
-	if (ret < 0) {
-		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
-		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+	if (ret < 0)
 		goto leave;
-	}
 
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
@@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		  atomic_read(&dlm->dlm_refs.refcount));
 
 leave:
+	if (ret < 0 && dlm) {
+		if (dlm->master_hash)
+			dlm_free_pagevec((void **)dlm->master_hash,
+					DLM_HASH_PAGES);
+
+		if (dlm->lockres_hash)
+			dlm_free_pagevec((void **)dlm->lockres_hash,
+					DLM_HASH_PAGES);
+
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+	}
 	return dlm;
 }
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3ec906ef5d9a..215e41abf101 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 	return res;
 
 error:
-	if (res && res->lockname.name)
-		kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-
 	if (res)
 		kmem_cache_free(dlm_lockres_cache, res);
 	return NULL;
@@ -655,12 +652,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
 	clear_bit(bit, res->refmap);
 }
 
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 				   struct dlm_lock_resource *res)
 {
-	assert_spin_locked(&res->spinlock);
-
 	res->inflight_locks++;
 
 	mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -668,6 +662,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 	     __builtin_return_address(0));
 }
 
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	__dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 				   struct dlm_lock_resource *res)
 {
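
Splitting a function into a bare __ helper plus a lock-asserting wrapper is a common kernel idiom: paths that already have exclusive access (such as the brand-new, not-yet-published lockres in the next hunk) call the double-underscore variant directly, while everyone else goes through the wrapper that documents and checks the locking contract. In miniature (illustrative names):

/* Miniature of the locked/unlocked split applied to
 * dlm_lockres_grab_inflight_ref() above. */
#include <linux/spinlock.h>

struct counted {
	spinlock_t lock;
	unsigned int refs;
};

/* __ variant: caller guarantees exclusion (lock held, or the object
 * is not yet visible to other CPUs) */
static void __counted_get(struct counted *c)
{
	c->refs++;
}

/* public variant: checks the locking contract before delegating */
static void counted_get(struct counted *c)
{
	assert_spin_locked(&c->lock);
	__counted_get(c);
}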
@@ -894,10 +895,8 @@ lookup:
 	/* finally add the lockres to its hash bucket */
 	__dlm_insert_lockres(dlm, res);
 
-	/* Grab inflight ref to pin the resource */
-	spin_lock(&res->spinlock);
-	dlm_lockres_grab_inflight_ref(dlm, res);
-	spin_unlock(&res->spinlock);
+	/* since this lockres is new it does not require the spinlock */
+	__dlm_lockres_grab_inflight_ref(dlm, res);
 
 	/* get an extra ref on the mle in case this is a BLOCK
 	 * if so, the creator of the BLOCK may try to put the last
@@ -2037,6 +2036,10 @@ kill:
 	     "and killing the other node now! This node is OK and can continue.\n");
 	__dlm_print_one_lock_resource(res);
 	spin_unlock(&res->spinlock);
+	spin_lock(&dlm->master_lock);
+	if (mle)
+		__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 	*ret_data = (void *)res;
 	dlm_put(dlm);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 45067faf5695..3365839d2971 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 			BUG();
 		} else
 			__dlm_lockres_grab_inflight_worker(dlm, res);
-	} else /* put.. in case we are not the master */
+		spin_unlock(&res->spinlock);
+	} else {
+		/* put.. in case we are not the master */
+		spin_unlock(&res->spinlock);
 		dlm_lockres_put(res);
-		spin_unlock(&res->spinlock);
+	}
 	}
 	spin_unlock(&dlm->spinlock);
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 52cfe99ae056..21262f2b1654 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
 
 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	struct ocfs2_dlm_seq_priv *priv;
-	struct seq_file *seq;
 	struct ocfs2_super *osb;
 
-	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
 	if (!priv) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
 	}
+
 	osb = inode->i_private;
 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
 	priv->p_dlm_debug = osb->osb_dlm_debug;
 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
 
-	ret = seq_open(file, &ocfs2_dlm_seq_ops);
-	if (ret) {
-		kfree(priv);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	seq = file->private_data;
-	seq->private = priv;
-
 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
 				   priv->p_dlm_debug);
 
-out:
-	return ret;
+	return 0;
 }
 
 static const struct file_operations ocfs2_dlm_debug_fops = {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..324dc93ac896 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
-	handle_t *handle = NULL;
+	handle_t *handle;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, index, GFP_NOFS);
 	if (!page) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
-		goto out;
+		goto out_commit_trans;
 	}
 
 	/* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 			goto out_unlock;
 		}
 
-		if (!handle) {
-			handle = ocfs2_zero_start_ordered_transaction(inode,
-								      di_bh);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				handle = NULL;
-				break;
-			}
-		}
 
 		/* must not update i_size! */
 		ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		ret = 0;
 	}
 
+	/*
+	 * fs-writeback will release the dirty pages without page lock
+	 * whose offset are over inode size, the release happens at
+	 * block_write_full_page().
+	 */
+	i_size_write(inode, abs_to);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	di->i_mtime_nsec = di->i_ctime_nsec;
 	if (handle) {
-		/*
-		 * fs-writeback will release the dirty pages without page lock
-		 * whose offset are over inode size, the release happens at
-		 * block_write_full_page().
-		 */
-		i_size_write(inode, abs_to);
-		inode->i_blocks = ocfs2_inode_sector_count(inode);
-		di->i_size = cpu_to_le64((u64)i_size_read(inode));
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		di->i_mtime_nsec = di->i_ctime_nsec;
 		ocfs2_journal_dirty(handle, di_bh);
 		ocfs2_update_inode_fsync_trans(handle, inode, 1);
-		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 	}
 
 out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
+out_commit_trans:
+	if (handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
@@ -1253,7 +1252,7 @@ bail:
 	brelse(bh);
 
 	/* Release quota pointers in case we acquired them */
-	for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
 		dqput(transfer_to[qtype]);
 
 	if (!status && attr->ia_valid & ATTR_MODE) {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 {
 	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
 
-	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+	return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
 }
 
 /* Validate that a bh contains a valid inode */
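
The parenthesization is the whole fix: ip_clusters is a 32-bit field, so shifting it first wraps for files past 2 TiB (with 4 KiB clusters, c_to_s_bits is 3) and only then widens; casting to the 64-bit blkcnt_t before shifting performs the shift in 64 bits. A self-contained demonstration of the two orderings:

/* Why the cast must precede the shift (userspace demonstration). */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint32_t ip_clusters = 0x30000000;	/* ~3 TiB of 4 KiB clusters */
	int c_to_s_bits = 3;			/* 4 KiB clusters -> 512 B sectors */

	uint64_t wrong = (uint64_t)(ip_clusters << c_to_s_bits); /* wraps at 32 bits */
	uint64_t right = (uint64_t)ip_clusters << c_to_s_bits;	 /* shifts in 64 bits */

	printf("wrong: %" PRIu64 "\nright: %" PRIu64 "\n", wrong, right);
	return 0;
}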
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 6f66b3751ace..53e6c40ed4c6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -35,9 +35,8 @@
 	copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
 
 /*
- * This call is void because we are already reporting an error that may
- * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
- * just a best-effort to tell userspace that this request caused the error.
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
  */
 static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
 					    struct ocfs2_info_request __user *req)
@@ -146,136 +145,105 @@ bail:
 static int ocfs2_info_handle_blocksize(struct inode *inode,
 				       struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_blocksize oib;
 
 	if (o2info_from_user(oib, req))
-		goto bail;
+		return -EFAULT;
 
 	oib.ib_blocksize = inode->i_sb->s_blocksize;
 
 	o2info_set_request_filled(&oib.ib_req);
 
 	if (o2info_to_user(oib, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oib.ib_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_clustersize(struct inode *inode,
 					 struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_clustersize oic;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oic, req))
-		goto bail;
+		return -EFAULT;
 
 	oic.ic_clustersize = osb->s_clustersize;
 
 	o2info_set_request_filled(&oic.ic_req);
 
 	if (o2info_to_user(oic, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oic.ic_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_maxslots(struct inode *inode,
 				      struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_maxslots oim;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oim, req))
-		goto bail;
+		return -EFAULT;
 
 	oim.im_max_slots = osb->max_slots;
 
 	o2info_set_request_filled(&oim.im_req);
 
 	if (o2info_to_user(oim, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oim.im_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_label(struct inode *inode,
 				   struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_label oil;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oil, req))
-		goto bail;
+		return -EFAULT;
 
 	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
 
 	o2info_set_request_filled(&oil.il_req);
 
 	if (o2info_to_user(oil, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oil.il_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_uuid(struct inode *inode,
 				  struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_uuid oiu;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oiu, req))
-		goto bail;
+		return -EFAULT;
 
 	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
 
 	o2info_set_request_filled(&oiu.iu_req);
 
 	if (o2info_to_user(oiu, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oiu.iu_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_fs_features(struct inode *inode,
 					 struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_fs_features oif;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oif, req))
-		goto bail;
+		return -EFAULT;
 
 	oif.if_compat_features = osb->s_feature_compat;
 	oif.if_incompat_features = osb->s_feature_incompat;
@@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode,
 	o2info_set_request_filled(&oif.if_req);
 
 	if (o2info_to_user(oif, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oif.if_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_journal_size(struct inode *inode,
 					  struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_journal_size oij;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oij, req))
-		goto bail;
+		return -EFAULT;
 
 	oij.ij_journal_size = i_size_read(osb->journal->j_inode);
 
 	o2info_set_request_filled(&oij.ij_req);
 
 	if (o2info_to_user(oij, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oij.ij_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
@@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
373 u32 i; 330 u32 i;
374 u64 blkno = -1; 331 u64 blkno = -1;
375 char namebuf[40]; 332 char namebuf[40];
376 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; 333 int status, type = INODE_ALLOC_SYSTEM_INODE;
377 struct ocfs2_info_freeinode *oifi = NULL; 334 struct ocfs2_info_freeinode *oifi = NULL;
378 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 335 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
379 struct inode *inode_alloc = NULL; 336 struct inode *inode_alloc = NULL;
@@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
385 goto out_err; 342 goto out_err;
386 } 343 }
387 344
388 if (o2info_from_user(*oifi, req)) 345 if (o2info_from_user(*oifi, req)) {
389 goto bail; 346 status = -EFAULT;
347 goto out_free;
348 }
390 349
391 oifi->ifi_slotnum = osb->max_slots; 350 oifi->ifi_slotnum = osb->max_slots;
392 351
@@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
424 383
425 o2info_set_request_filled(&oifi->ifi_req); 384 o2info_set_request_filled(&oifi->ifi_req);
426 385
427 if (o2info_to_user(*oifi, req)) 386 if (o2info_to_user(*oifi, req)) {
428 goto bail; 387 status = -EFAULT;
388 goto out_free;
389 }
429 390
430 status = 0; 391 status = 0;
431bail: 392bail:
432 if (status) 393 if (status)
433 o2info_set_request_error(&oifi->ifi_req, req); 394 o2info_set_request_error(&oifi->ifi_req, req);
434 395out_free:
435 kfree(oifi); 396 kfree(oifi);
436out_err: 397out_err:
437 return status; 398 return status;
@@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
658{ 619{
659 u64 blkno = -1; 620 u64 blkno = -1;
660 char namebuf[40]; 621 char namebuf[40];
661 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; 622 int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
662 623
663 struct ocfs2_info_freefrag *oiff; 624 struct ocfs2_info_freefrag *oiff;
664 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
671 goto out_err; 632 goto out_err;
672 } 633 }
673 634
674 if (o2info_from_user(*oiff, req)) 635 if (o2info_from_user(*oiff, req)) {
675 goto bail; 636 status = -EFAULT;
637 goto out_free;
638 }
676 /* 639 /*
677 * chunksize from userspace should be a power of 2. 640 * chunksize from userspace should be a power of 2.
678 */ 641 */
@@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
711 674
712 if (o2info_to_user(*oiff, req)) { 675 if (o2info_to_user(*oiff, req)) {
713 status = -EFAULT; 676 status = -EFAULT;
714 goto bail; 677 goto out_free;
715 } 678 }
716 679
717 status = 0; 680 status = 0;
718bail: 681bail:
719 if (status) 682 if (status)
720 o2info_set_request_error(&oiff->iff_req, req); 683 o2info_set_request_error(&oiff->iff_req, req);
721 684out_free:
722 kfree(oiff); 685 kfree(oiff);
723out_err: 686out_err:
724 return status; 687 return status;
@@ -727,23 +690,17 @@ out_err:
727static int ocfs2_info_handle_unknown(struct inode *inode, 690static int ocfs2_info_handle_unknown(struct inode *inode,
728 struct ocfs2_info_request __user *req) 691 struct ocfs2_info_request __user *req)
729{ 692{
730 int status = -EFAULT;
731 struct ocfs2_info_request oir; 693 struct ocfs2_info_request oir;
732 694
733 if (o2info_from_user(oir, req)) 695 if (o2info_from_user(oir, req))
734 goto bail; 696 return -EFAULT;
735 697
736 o2info_clear_request_filled(&oir); 698 o2info_clear_request_filled(&oir);
737 699
738 if (o2info_to_user(oir, req)) 700 if (o2info_to_user(oir, req))
739 goto bail; 701 return -EFAULT;
740 702
741 status = 0; 703 return 0;
742bail:
743 if (status)
744 o2info_set_request_error(&oir, req);
745
746 return status;
747} 704}
748 705
749/* 706/*
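Every o2info handler hunk above applies the same transformation: the old code preset status to -EFAULT and funnelled all exits through a shared bail label that also stamped the error into the user's request, while the new code returns -EFAULT directly at each failing copy and drops the label entirely. A minimal user-space sketch of the two shapes, with hypothetical copy_in()/copy_out() standing in for o2info_from_user()/o2info_to_user():

#include <errno.h>
#include <stdio.h>

struct req { int filled; };

static int copy_in(struct req *r)  { (void)r; return 0; } /* 0 on success */
static int copy_out(struct req *r) { (void)r; return 0; }

/* Old shape: one exit label, status preset to the error code. */
static int handle_old(struct req *r)
{
	int status = -EFAULT;

	if (copy_in(r))
		goto bail;
	r->filled = 1;
	if (copy_out(r))
		goto bail;
	status = 0;
bail:
	return status;
}

/* New shape: fail fast at each copy; no label, no preset error. */
static int handle_new(struct req *r)
{
	if (copy_in(r))
		return -EFAULT;
	r->filled = 1;
	if (copy_out(r))
		return -EFAULT;
	return 0;
}

int main(void)
{
	struct req r = { 0 };

	printf("old=%d new=%d\n", handle_old(&r), handle_new(&r));
	return 0;
}

The freeinode and freefrag hunks above keep bail only for the exits that must record an error in the request before freeing, and gain a separate out_free label for the plain -EFAULT paths, which skip the stamping.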
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6219aaadeb08..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
404 * 'vict_blkno' was out of the valid range. 404 * 'vict_blkno' was out of the valid range.
405 */ 405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 407 (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
408 bits_per_unit))) { 408 bits_per_unit))) {
409 ret = -EINVAL; 409 ret = -EINVAL;
410 goto out; 410 goto out;
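The ocfs2_find_victim_alloc_group() fix is a classic width bug: i_total is a 32-bit on-disk count, and shifting it left by bits_per_unit in 32-bit arithmetic can wrap before the comparison against the 64-bit vict_blkno, silently shrinking the accepted range. Casting to u64 widens before the shift. A runnable demonstration with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t total = 0x00200000;	/* bitmap units, illustrative */
	int bits_per_unit = 15;		/* hypothetical shift */

	/* Shift in 32 bits, then widen: the product has already wrapped. */
	uint64_t bad  = (uint64_t)(total << bits_per_unit);
	/* Widen first, then shift: the full product survives. */
	uint64_t good = (uint64_t)total << bits_per_unit;

	printf("bad=%#llx good=%#llx\n",
	       (unsigned long long)bad, (unsigned long long)good);
	return 0;
}

Here bad comes out 0 while good is 0x1000000000, so the unfixed check would treat every vict_blkno as out of range once the shift wraps.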
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index f266d67df3c6..1eae330193a6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,6 +17,9 @@
17 17
18#include "ocfs2.h" 18#include "ocfs2.h"
19 19
20/* Number of quota types we support */
21#define OCFS2_MAXQUOTAS 2
22
20/* 23/*
21 * In-memory structures 24 * In-memory structures
22 */ 25 */
@@ -39,7 +42,7 @@ struct ocfs2_recovery_chunk {
39}; 42};
40 43
41struct ocfs2_quota_recovery { 44struct ocfs2_quota_recovery {
42 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ 45 struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */
43}; 46};
44 47
45/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
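ocfs2 provisions only user and group quota files, so its arrays switch from the VFS-wide MAXQUOTAS to a filesystem-local OCFS2_MAXQUOTAS; if the global constant grows (say, a third quota type is added to the VFS), the two-element initializer lists in these hunks stay correct instead of leaving a zeroed extra slot that every for-each-type loop would still visit. A small runnable illustration, with made-up inode numbers and a hypothetical larger global value:

#include <stdio.h>

#define MAXQUOTAS	3	/* hypothetical future VFS-wide value */
#define OCFS2_MAXQUOTAS	2	/* user + group: all ocfs2 provides */

int main(void)
{
	/* Sized by the local constant, the array matches its two-entry
	 * initializer exactly; sized by MAXQUOTAS it would gain a
	 * third, meaningless slot. */
	unsigned int ino[OCFS2_MAXQUOTAS] = { 101, 102 };
	int type;

	for (type = 0; type < OCFS2_MAXQUOTAS; type++)
		printf("quota type %d -> system inode %u\n", type, ino[type]);
	return 0;
}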
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b990a62cff50..c93d67220887 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -336,8 +336,8 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
336int ocfs2_global_read_info(struct super_block *sb, int type) 336int ocfs2_global_read_info(struct super_block *sb, int type)
337{ 337{
338 struct inode *gqinode = NULL; 338 struct inode *gqinode = NULL;
339 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, 339 unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
340 GROUP_QUOTA_SYSTEM_INODE }; 340 GROUP_QUOTA_SYSTEM_INODE };
341 struct ocfs2_global_disk_dqinfo dinfo; 341 struct ocfs2_global_disk_dqinfo dinfo;
342 struct mem_dqinfo *info = sb_dqinfo(sb, type); 342 struct mem_dqinfo *info = sb_dqinfo(sb, type);
343 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 343 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2001862bf2b1..10b653930ee2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -166,12 +166,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
166/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
167static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
168{ 168{
169 unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; 169 unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
170 unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; 170 unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
171 unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; 171 unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
172 unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; 172 unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
173 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, 173 unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
174 GROUP_QUOTA_SYSTEM_INODE }; 174 GROUP_QUOTA_SYSTEM_INODE };
175 struct buffer_head *bh = NULL; 175 struct buffer_head *bh = NULL;
176 struct inode *linode = sb_dqopt(sb)->files[type]; 176 struct inode *linode = sb_dqopt(sb)->files[type];
177 struct inode *ginode = NULL; 177 struct inode *ginode = NULL;
@@ -336,7 +336,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
336{ 336{
337 int type; 337 int type;
338 338
339 for (type = 0; type < MAXQUOTAS; type++) 339 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
340 free_recovery_list(&(rec->r_list[type])); 340 free_recovery_list(&(rec->r_list[type]));
341 kfree(rec); 341 kfree(rec);
342} 342}
@@ -382,7 +382,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
382 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); 382 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
383 if (!rec) 383 if (!rec)
384 return NULL; 384 return NULL;
385 for (type = 0; type < MAXQUOTAS; type++) 385 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
386 INIT_LIST_HEAD(&(rec->r_list[type])); 386 INIT_LIST_HEAD(&(rec->r_list[type]));
387 return rec; 387 return rec;
388} 388}
@@ -392,10 +392,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
392 struct ocfs2_super *osb, 392 struct ocfs2_super *osb,
393 int slot_num) 393 int slot_num)
394{ 394{
395 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 395 unsigned int feature[OCFS2_MAXQUOTAS] = {
396 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 396 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
397 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 397 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
398 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 398 unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
399 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
399 struct super_block *sb = osb->sb; 400 struct super_block *sb = osb->sb;
400 struct ocfs2_local_disk_dqinfo *ldinfo; 401 struct ocfs2_local_disk_dqinfo *ldinfo;
401 struct inode *lqinode; 402 struct inode *lqinode;
@@ -412,7 +413,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
412 return ERR_PTR(-ENOMEM); 413 return ERR_PTR(-ENOMEM);
413 /* First init... */ 414 /* First init... */
414 415
415 for (type = 0; type < MAXQUOTAS; type++) { 416 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
416 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 417 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
417 continue; 418 continue;
418 /* At this point, journal of the slot is already replayed so 419 /* At this point, journal of the slot is already replayed so
@@ -589,8 +590,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
589 struct ocfs2_quota_recovery *rec, 590 struct ocfs2_quota_recovery *rec,
590 int slot_num) 591 int slot_num)
591{ 592{
592 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 593 unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
593 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 594 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
594 struct super_block *sb = osb->sb; 595 struct super_block *sb = osb->sb;
595 struct ocfs2_local_disk_dqinfo *ldinfo; 596 struct ocfs2_local_disk_dqinfo *ldinfo;
596 struct buffer_head *bh; 597 struct buffer_head *bh;
@@ -604,7 +605,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
604 "slot %u\n", osb->dev_str, slot_num); 605 "slot %u\n", osb->dev_str, slot_num);
605 606
606 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 607 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
607 for (type = 0; type < MAXQUOTAS; type++) { 608 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
608 if (list_empty(&(rec->r_list[type]))) 609 if (list_empty(&(rec->r_list[type])))
609 continue; 610 continue;
610 trace_ocfs2_finish_quota_recovery(slot_num); 611 trace_ocfs2_finish_quota_recovery(slot_num);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
591 */ 591 */
592 ocfs2_control_this_node = -1; 592 ocfs2_control_this_node = -1;
593 running_proto.pv_major = 0; 593 running_proto.pv_major = 0;
594 running_proto.pv_major = 0; 594 running_proto.pv_minor = 0;
595 } 595 }
596 596
597out: 597out:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ddb662b32447..93c85bc745e1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -899,11 +899,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
899{ 899{
900 int type; 900 int type;
901 struct super_block *sb = osb->sb; 901 struct super_block *sb = osb->sb;
902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 902 unsigned int feature[OCFS2_MAXQUOTAS] = {
903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 903 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
904 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
904 int status = 0; 905 int status = 0;
905 906
906 for (type = 0; type < MAXQUOTAS; type++) { 907 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 908 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
908 continue; 909 continue;
909 if (unsuspend) 910 if (unsuspend)
@@ -927,17 +928,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
927 928
928static int ocfs2_enable_quotas(struct ocfs2_super *osb) 929static int ocfs2_enable_quotas(struct ocfs2_super *osb)
929{ 930{
930 struct inode *inode[MAXQUOTAS] = { NULL, NULL }; 931 struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL };
931 struct super_block *sb = osb->sb; 932 struct super_block *sb = osb->sb;
932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 933 unsigned int feature[OCFS2_MAXQUOTAS] = {
933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 934 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 935 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
936 unsigned int ino[OCFS2_MAXQUOTAS] = {
937 LOCAL_USER_QUOTA_SYSTEM_INODE,
935 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 938 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
936 int status; 939 int status;
937 int type; 940 int type;
938 941
939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; 942 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
940 for (type = 0; type < MAXQUOTAS; type++) { 943 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 944 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
942 continue; 945 continue;
943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type], 946 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
@@ -952,12 +955,12 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
952 goto out_quota_off; 955 goto out_quota_off;
953 } 956 }
954 957
955 for (type = 0; type < MAXQUOTAS; type++) 958 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
956 iput(inode[type]); 959 iput(inode[type]);
957 return 0; 960 return 0;
958out_quota_off: 961out_quota_off:
959 ocfs2_disable_quotas(osb); 962 ocfs2_disable_quotas(osb);
960 for (type = 0; type < MAXQUOTAS; type++) 963 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
961 iput(inode[type]); 964 iput(inode[type]);
962 mlog_errno(status); 965 mlog_errno(status);
963 return status; 966 return status;
@@ -972,7 +975,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
972 975
973 /* We mostly ignore errors in this function because there's not much 976 /* We mostly ignore errors in this function because there's not much
974 * we can do when we see them */ 977 * we can do when we see them */
975 for (type = 0; type < MAXQUOTAS; type++) { 978 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
976 if (!sb_has_quota_loaded(sb, type)) 979 if (!sb_has_quota_loaded(sb, type))
977 continue; 980 continue;
978 /* Cancel periodic syncing before we grab dqonoff_mutex */ 981 /* Cancel periodic syncing before we grab dqonoff_mutex */
@@ -993,8 +996,9 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
993/* Handle quota on quotactl */ 996/* Handle quota on quotactl */
994static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) 997static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
995{ 998{
996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 999 unsigned int feature[OCFS2_MAXQUOTAS] = {
997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 1000 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1001 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
998 1002
999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 1003 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1000 return -EINVAL; 1004 return -EINVAL;
@@ -2532,6 +2536,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2532 kfree(osb->journal); 2536 kfree(osb->journal);
2533 kfree(osb->local_alloc_copy); 2537 kfree(osb->local_alloc_copy);
2534 kfree(osb->uuid_str); 2538 kfree(osb->uuid_str);
2539 kfree(osb->vol_label);
2535 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2540 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2536 memset(osb, 0, sizeof(struct ocfs2_super)); 2541 memset(osb, 0, sizeof(struct ocfs2_super));
2537} 2542}
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22c4a30..aae331a5d03b 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt)
381 * other children 381 * other children
382 */ 382 */
383 if (child && list_empty(&child->mnt_mounts)) { 383 if (child && list_empty(&child->mnt_mounts)) {
384 list_del_init(&child->mnt_child);
384 hlist_del_init_rcu(&child->mnt_hash); 385 hlist_del_init_rcu(&child->mnt_hash);
385 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); 386 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
386 } 387 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index baf852b648ad..950100e326a1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -376,37 +376,6 @@ static const struct file_operations proc_lstats_operations = {
376 376
377#endif 377#endif
378 378
379#ifdef CONFIG_CGROUPS
380static int cgroup_open(struct inode *inode, struct file *file)
381{
382 struct pid *pid = PROC_I(inode)->pid;
383 return single_open(file, proc_cgroup_show, pid);
384}
385
386static const struct file_operations proc_cgroup_operations = {
387 .open = cgroup_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = single_release,
391};
392#endif
393
394#ifdef CONFIG_PROC_PID_CPUSET
395
396static int cpuset_open(struct inode *inode, struct file *file)
397{
398 struct pid *pid = PROC_I(inode)->pid;
399 return single_open(file, proc_cpuset_show, pid);
400}
401
402static const struct file_operations proc_cpuset_operations = {
403 .open = cpuset_open,
404 .read = seq_read,
405 .llseek = seq_lseek,
406 .release = single_release,
407};
408#endif
409
410static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, 379static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
411 struct pid *pid, struct task_struct *task) 380 struct pid *pid, struct task_struct *task)
412{ 381{
@@ -632,29 +601,35 @@ static const struct file_operations proc_single_file_operations = {
632 .release = single_release, 601 .release = single_release,
633}; 602};
634 603
635static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) 604
605struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
636{ 606{
637 struct task_struct *task = get_proc_task(file_inode(file)); 607 struct task_struct *task = get_proc_task(inode);
638 struct mm_struct *mm; 608 struct mm_struct *mm = ERR_PTR(-ESRCH);
639 609
640 if (!task) 610 if (task) {
641 return -ESRCH; 611 mm = mm_access(task, mode);
612 put_task_struct(task);
642 613
643 mm = mm_access(task, mode); 614 if (!IS_ERR_OR_NULL(mm)) {
644 put_task_struct(task); 615 /* ensure this mm_struct can't be freed */
616 atomic_inc(&mm->mm_count);
617 /* but do not pin its memory */
618 mmput(mm);
619 }
620 }
621
622 return mm;
623}
624
625static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
626{
627 struct mm_struct *mm = proc_mem_open(inode, mode);
645 628
646 if (IS_ERR(mm)) 629 if (IS_ERR(mm))
647 return PTR_ERR(mm); 630 return PTR_ERR(mm);
648 631
649 if (mm) {
650 /* ensure this mm_struct can't be freed */
651 atomic_inc(&mm->mm_count);
652 /* but do not pin its memory */
653 mmput(mm);
654 }
655
656 file->private_data = mm; 632 file->private_data = mm;
657
658 return 0; 633 return 0;
659} 634}
660 635
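proc_mem_open() centralizes a reference dance the old __mem_open() performed inline: mm_access() hands back a user reference (mm_users), which pins the task's entire address space, and the helper immediately converts it into a bare lifetime reference by bumping mm_count and dropping mm_users with mmput(). An open /proc file therefore keeps only the mm_struct itself alive, not an exited task's memory; readers revive a user reference on demand with atomic_inc_not_zero(&mm->mm_users), and the release path pairs the count with mmdrop(). A user-space model of the two-level count (the helper names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct mm {
	atomic_int users;	/* address space is usable (mm_users) */
	atomic_int count;	/* struct itself is alive (mm_count) */
};

static void mm_drop(struct mm *mm)
{
	if (atomic_fetch_sub(&mm->count, 1) == 1)
		free(mm);			/* last lifetime ref */
}

static void mm_put(struct mm *mm)
{
	if (atomic_fetch_sub(&mm->users, 1) == 1) {
		puts("address space torn down");
		mm_drop(mm);			/* users held one count ref */
	}
}

static int mm_get_unless_zero(struct mm *mm)	/* like inc_not_zero */
{
	int v = atomic_load(&mm->users);

	while (v > 0)
		if (atomic_compare_exchange_weak(&mm->users, &v, v + 1))
			return 1;
	return 0;
}

int main(void)
{
	struct mm *mm = malloc(sizeof(*mm));

	atomic_init(&mm->users, 1);	/* the ref mm_access() returned */
	atomic_init(&mm->count, 1);	/* baseline lifetime ref */

	atomic_fetch_add(&mm->count, 1);	/* open(): keep the struct */
	mm_put(mm);		/* ... but do not pin memory; the "task"
				 * is already gone, so this tears down */

	if (!mm_get_unless_zero(mm))		/* read() after task exit */
		puts("m_start() sees no address space");
	mm_drop(mm);				/* release(): mmdrop() */
	return 0;
}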
@@ -2573,10 +2548,10 @@ static const struct pid_entry tgid_base_stuff[] = {
2573 REG("latency", S_IRUGO, proc_lstats_operations), 2548 REG("latency", S_IRUGO, proc_lstats_operations),
2574#endif 2549#endif
2575#ifdef CONFIG_PROC_PID_CPUSET 2550#ifdef CONFIG_PROC_PID_CPUSET
2576 REG("cpuset", S_IRUGO, proc_cpuset_operations), 2551 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2577#endif 2552#endif
2578#ifdef CONFIG_CGROUPS 2553#ifdef CONFIG_CGROUPS
2579 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2554 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2580#endif 2555#endif
2581 ONE("oom_score", S_IRUGO, proc_oom_score), 2556 ONE("oom_score", S_IRUGO, proc_oom_score),
2582 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2557 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
@@ -2919,10 +2894,10 @@ static const struct pid_entry tid_base_stuff[] = {
2919 REG("latency", S_IRUGO, proc_lstats_operations), 2894 REG("latency", S_IRUGO, proc_lstats_operations),
2920#endif 2895#endif
2921#ifdef CONFIG_PROC_PID_CPUSET 2896#ifdef CONFIG_PROC_PID_CPUSET
2922 REG("cpuset", S_IRUGO, proc_cpuset_operations), 2897 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2923#endif 2898#endif
2924#ifdef CONFIG_CGROUPS 2899#ifdef CONFIG_CGROUPS
2925 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2900 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2926#endif 2901#endif
2927 ONE("oom_score", S_IRUGO, proc_oom_score), 2902 ONE("oom_score", S_IRUGO, proc_oom_score),
2928 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2903 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
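The deleted blocks above are pure boilerplate: each proc file carried its own open() wrapper plus a file_operations struct just to bind one show routine. The ONE() table entries that replace them (the REG()-to-ONE() switches in the two table hunks just above) register the show callback directly, the same way proc_oom_score already appears in the surrounding context. The shape of the saving, modeled in user space with a table of show callbacks (names and output are illustrative):

#include <stdio.h>

struct entry {
	const char *name;
	int (*show)(FILE *);
};

static int cpuset_show(FILE *f) { return fprintf(f, "/\n") < 0; }
static int cgroup_show(FILE *f) { return fprintf(f, "0:cpu:/\n") < 0; }

/* One table row per file replaces an open() wrapper and a
 * file_operations struct per file. */
static const struct entry table[] = {
	{ "cpuset", cpuset_show },
	{ "cgroup", cgroup_show },
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		printf("%s: ", table[i].name);
		table[i].show(stdout);
	}
	return 0;
}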
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
268 * task_[no]mmu.c 268 * task_[no]mmu.c
269 */ 269 */
270struct proc_maps_private { 270struct proc_maps_private {
271 struct pid *pid; 271 struct inode *inode;
272 struct task_struct *task; 272 struct task_struct *task;
273 struct mm_struct *mm;
273#ifdef CONFIG_MMU 274#ifdef CONFIG_MMU
274 struct vm_area_struct *tail_vma; 275 struct vm_area_struct *tail_vma;
275#endif 276#endif
@@ -278,6 +279,8 @@ struct proc_maps_private {
278#endif 279#endif
279}; 280};
280 281
282struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
283
281extern const struct file_operations proc_pid_maps_operations; 284extern const struct file_operations proc_pid_maps_operations;
282extern const struct file_operations proc_tid_maps_operations; 285extern const struct file_operations proc_tid_maps_operations;
283extern const struct file_operations proc_pid_numa_maps_operations; 286extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6df8d0722c97..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
610struct kcore_list kcore_modules; 610struct kcore_list kcore_modules;
611static void __init add_modules_range(void) 611static void __init add_modules_range(void)
612{ 612{
613 kclist_add(&kcore_modules, (void *)MODULES_VADDR, 613 if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
614 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
614 MODULES_END - MODULES_VADDR, KCORE_VMALLOC); 615 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
616 }
615} 617}
616#else 618#else
617static void __init add_modules_range(void) 619static void __init add_modules_range(void)
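The guard matters on architectures whose module area is carved out of the vmalloc arena: there MODULES_VADDR/MODULES_END equal VMALLOC_START/VMALLOC_END, and kclist_add() would register the same range twice under two names. The predicate in isolation, with made-up layout constants:

#include <stdio.h>

int main(void)
{
	unsigned long vmalloc_start = 0xf0000000UL, vmalloc_end = 0xff000000UL;
	unsigned long modules_vaddr = vmalloc_start, modules_end = vmalloc_end;

	if (modules_vaddr != vmalloc_start && modules_end != vmalloc_end)
		puts("register a separate modules range");
	else
		puts("modules share the vmalloc range; skip it");
	return 0;
}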
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
133 if (PageBuddy(page)) 133 if (PageBuddy(page))
134 u |= 1 << KPF_BUDDY; 134 u |= 1 << KPF_BUDDY;
135 135
136 if (PageBalloon(page))
137 u |= 1 << KPF_BALLOON;
138
136 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 139 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
137 140
138 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 141 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
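With PageBalloon() wired into stable_page_flags(), balloon-inflated guest pages become visible to user space via /proc/kpageflags. A small probe that scans the first 1024 frames, assuming KPF_BALLOON is bit 23 as in include/uapi/linux/kernel-page-flags.h of this era (verify against your headers; reading kpageflags needs root):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define KPF_BALLOON 23	/* assumption: matches kernel-page-flags.h */

int main(void)
{
	uint64_t pfn, flags;
	int fd = open("/proc/kpageflags", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/kpageflags");
		return 1;
	}
	for (pfn = 0; pfn < 1024; pfn++) {
		if (pread(fd, &flags, sizeof(flags),
			  (off_t)(pfn * sizeof(flags))) != (ssize_t)sizeof(flags))
			break;
		if (flags & (1ULL << KPF_BALLOON))
			printf("pfn %llu: balloon page\n",
			       (unsigned long long)pfn);
	}
	close(fd);
	return 0;
}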
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfc791c42d64..b7a7dc963a35 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
87 87
88#ifdef CONFIG_NUMA 88#ifdef CONFIG_NUMA
89/* 89/*
90 * These functions are for numa_maps but called in generic **maps seq_file 90 * Save get_task_policy() for show_numa_map().
91 * ->start(), ->stop() ops.
92 *
93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
94 * Each mempolicy object is controlled by reference counting. The problem here
95 * is how to avoid accessing dead mempolicy object.
96 *
97 * Because we're holding mmap_sem while reading seq_file, it's safe to access
98 * each vma's mempolicy; no vma object will drop its ref to a mempolicy.
99 *
100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
103 * gurantee the task never exits under us. But taking task_lock() around
104 * get_vma_plicy() causes lock order problem.
105 *
106 * To access task->mempolicy without lock, we hold a reference count of an
107 * object pointed by task->mempolicy and remember it. This will guarantee
108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
109 */ 91 */
110static void hold_task_mempolicy(struct proc_maps_private *priv) 92static void hold_task_mempolicy(struct proc_maps_private *priv)
111{ 93{
112 struct task_struct *task = priv->task; 94 struct task_struct *task = priv->task;
113 95
114 task_lock(task); 96 task_lock(task);
115 priv->task_mempolicy = task->mempolicy; 97 priv->task_mempolicy = get_task_policy(task);
116 mpol_get(priv->task_mempolicy); 98 mpol_get(priv->task_mempolicy);
117 task_unlock(task); 99 task_unlock(task);
118} 100}
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
129} 111}
130#endif 112#endif
131 113
132static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 114static void vma_stop(struct proc_maps_private *priv)
133{ 115{
134 if (vma && vma != priv->tail_vma) { 116 struct mm_struct *mm = priv->mm;
135 struct mm_struct *mm = vma->vm_mm; 117
136 release_task_mempolicy(priv); 118 release_task_mempolicy(priv);
137 up_read(&mm->mmap_sem); 119 up_read(&mm->mmap_sem);
138 mmput(mm); 120 mmput(mm);
139 } 121}
122
123static struct vm_area_struct *
124m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
125{
126 if (vma == priv->tail_vma)
127 return NULL;
128 return vma->vm_next ?: priv->tail_vma;
129}
130
131static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
132{
133 if (m->count < m->size) /* vma is copied successfully */
134 m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
140} 135}
141 136
142static void *m_start(struct seq_file *m, loff_t *pos) 137static void *m_start(struct seq_file *m, loff_t *ppos)
143{ 138{
144 struct proc_maps_private *priv = m->private; 139 struct proc_maps_private *priv = m->private;
145 unsigned long last_addr = m->version; 140 unsigned long last_addr = m->version;
146 struct mm_struct *mm; 141 struct mm_struct *mm;
147 struct vm_area_struct *vma, *tail_vma = NULL; 142 struct vm_area_struct *vma;
148 loff_t l = *pos; 143 unsigned int pos = *ppos;
149
150 /* Clear the per syscall fields in priv */
151 priv->task = NULL;
152 priv->tail_vma = NULL;
153
154 /*
155 * We remember last_addr rather than next_addr to hit with
156 * vmacache most of the time. We have zero last_addr at
157 * the beginning and also after lseek. We will have -1 last_addr
158 * after the end of the vmas.
159 */
160 144
145 /* See m_cache_vma(). Zero at the start or after lseek. */
161 if (last_addr == -1UL) 146 if (last_addr == -1UL)
162 return NULL; 147 return NULL;
163 148
164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 149 priv->task = get_proc_task(priv->inode);
165 if (!priv->task) 150 if (!priv->task)
166 return ERR_PTR(-ESRCH); 151 return ERR_PTR(-ESRCH);
167 152
168 mm = mm_access(priv->task, PTRACE_MODE_READ); 153 mm = priv->mm;
169 if (!mm || IS_ERR(mm)) 154 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
170 return mm; 155 return NULL;
171 down_read(&mm->mmap_sem);
172 156
173 tail_vma = get_gate_vma(priv->task->mm); 157 down_read(&mm->mmap_sem);
174 priv->tail_vma = tail_vma;
175 hold_task_mempolicy(priv); 158 hold_task_mempolicy(priv);
176 /* Start with last addr hint */ 159 priv->tail_vma = get_gate_vma(mm);
177 vma = find_vma(mm, last_addr); 160
178 if (last_addr && vma) { 161 if (last_addr) {
179 vma = vma->vm_next; 162 vma = find_vma(mm, last_addr);
180 goto out; 163 if (vma && (vma = m_next_vma(priv, vma)))
164 return vma;
181 } 165 }
182 166
183 /* 167 m->version = 0;
184 * Check the vma index is within the range and do 168 if (pos < mm->map_count) {
185 * sequential scan until m_index. 169 for (vma = mm->mmap; pos; pos--) {
186 */ 170 m->version = vma->vm_start;
187 vma = NULL;
188 if ((unsigned long)l < mm->map_count) {
189 vma = mm->mmap;
190 while (l-- && vma)
191 vma = vma->vm_next; 171 vma = vma->vm_next;
192 goto out; 172 }
173 return vma;
193 } 174 }
194 175
195 if (l != mm->map_count) 176 /* we do not bother to update m->version in this case */
196 tail_vma = NULL; /* After gate vma */ 177 if (pos == mm->map_count && priv->tail_vma)
197 178 return priv->tail_vma;
198out:
199 if (vma)
200 return vma;
201 179
202 release_task_mempolicy(priv); 180 vma_stop(priv);
203 /* End of vmas has been reached */ 181 return NULL;
204 m->version = (tail_vma != NULL)? 0: -1UL;
205 up_read(&mm->mmap_sem);
206 mmput(mm);
207 return tail_vma;
208} 182}
209 183
210static void *m_next(struct seq_file *m, void *v, loff_t *pos) 184static void *m_next(struct seq_file *m, void *v, loff_t *pos)
211{ 185{
212 struct proc_maps_private *priv = m->private; 186 struct proc_maps_private *priv = m->private;
213 struct vm_area_struct *vma = v; 187 struct vm_area_struct *next;
214 struct vm_area_struct *tail_vma = priv->tail_vma;
215 188
216 (*pos)++; 189 (*pos)++;
217 if (vma && (vma != tail_vma) && vma->vm_next) 190 next = m_next_vma(priv, v);
218 return vma->vm_next; 191 if (!next)
219 vma_stop(priv, vma); 192 vma_stop(priv);
220 return (vma != tail_vma)? tail_vma: NULL; 193 return next;
221} 194}
222 195
223static void m_stop(struct seq_file *m, void *v) 196static void m_stop(struct seq_file *m, void *v)
224{ 197{
225 struct proc_maps_private *priv = m->private; 198 struct proc_maps_private *priv = m->private;
226 struct vm_area_struct *vma = v;
227 199
228 if (!IS_ERR(vma)) 200 if (!IS_ERR_OR_NULL(v))
229 vma_stop(priv, vma); 201 vma_stop(priv);
230 if (priv->task) 202 if (priv->task) {
231 put_task_struct(priv->task); 203 put_task_struct(priv->task);
204 priv->task = NULL;
205 }
206}
207
208static int proc_maps_open(struct inode *inode, struct file *file,
209 const struct seq_operations *ops, int psize)
210{
211 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
212
213 if (!priv)
214 return -ENOMEM;
215
216 priv->inode = inode;
217 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
218 if (IS_ERR(priv->mm)) {
219 int err = PTR_ERR(priv->mm);
220
221 seq_release_private(inode, file);
222 return err;
223 }
224
225 return 0;
226}
227
228static int proc_map_release(struct inode *inode, struct file *file)
229{
230 struct seq_file *seq = file->private_data;
231 struct proc_maps_private *priv = seq->private;
232
233 if (priv->mm)
234 mmdrop(priv->mm);
235
236 return seq_release_private(inode, file);
232} 237}
233 238
234static int do_maps_open(struct inode *inode, struct file *file, 239static int do_maps_open(struct inode *inode, struct file *file,
235 const struct seq_operations *ops) 240 const struct seq_operations *ops)
236{ 241{
237 struct proc_maps_private *priv; 242 return proc_maps_open(inode, file, ops,
238 int ret = -ENOMEM; 243 sizeof(struct proc_maps_private));
239 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 244}
240 if (priv) { 245
241 priv->pid = proc_pid(inode); 246static pid_t pid_of_stack(struct proc_maps_private *priv,
242 ret = seq_open(file, ops); 247 struct vm_area_struct *vma, bool is_pid)
243 if (!ret) { 248{
244 struct seq_file *m = file->private_data; 249 struct inode *inode = priv->inode;
245 m->private = priv; 250 struct task_struct *task;
246 } else { 251 pid_t ret = 0;
247 kfree(priv); 252
248 } 253 rcu_read_lock();
254 task = pid_task(proc_pid(inode), PIDTYPE_PID);
255 if (task) {
256 task = task_of_stack(task, vma, is_pid);
257 if (task)
258 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
249 } 259 }
260 rcu_read_unlock();
261
250 return ret; 262 return ret;
251} 263}
252 264
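The m_start()/m_next()/m_cache_vma() rewrite above replaces an O(n) replay of *ppos VMAs on every read chunk with a resume hint: m->version caches the start address of the last VMA whose output fully fit in the seq_file buffer (and -1UL once the walk has finished), so the next read can find_vma() straight to the continuation, usually a vmacache hit. A compact user-space model of the resume logic, with a sorted array standing in for the VMA list:

#include <stdio.h>

#define END ((unsigned long)-1)

static unsigned long starts[] = { 0x1000, 0x5000, 0x9000 };
static const int nvma = 3;

int main(void)
{
	unsigned long version = 0;	/* m->version: zero at open/lseek */

	while (version != END) {
		int i = 0;

		while (i < nvma && starts[i] <= version)
			i++;		/* resume just past the cached start */
		if (i >= nvma) {
			version = END;	/* walk already finished */
			break;
		}
		printf("show vma @%#lx\n", starts[i]);
		/* m_cache_vma(): cache this start, or -1 after the last */
		version = (i + 1 < nvma) ? starts[i] : END;
	}
	return 0;
}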
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
256 struct mm_struct *mm = vma->vm_mm; 268 struct mm_struct *mm = vma->vm_mm;
257 struct file *file = vma->vm_file; 269 struct file *file = vma->vm_file;
258 struct proc_maps_private *priv = m->private; 270 struct proc_maps_private *priv = m->private;
259 struct task_struct *task = priv->task;
260 vm_flags_t flags = vma->vm_flags; 271 vm_flags_t flags = vma->vm_flags;
261 unsigned long ino = 0; 272 unsigned long ino = 0;
262 unsigned long long pgoff = 0; 273 unsigned long long pgoff = 0;
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
321 goto done; 332 goto done;
322 } 333 }
323 334
324 tid = vm_is_stack(task, vma, is_pid); 335 tid = pid_of_stack(priv, vma, is_pid);
325
326 if (tid != 0) { 336 if (tid != 0) {
327 /* 337 /*
328 * Thread stack in /proc/PID/task/TID/maps or 338 * Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +359,8 @@ done:
349 359
350static int show_map(struct seq_file *m, void *v, int is_pid) 360static int show_map(struct seq_file *m, void *v, int is_pid)
351{ 361{
352 struct vm_area_struct *vma = v; 362 show_map_vma(m, v, is_pid);
353 struct proc_maps_private *priv = m->private; 363 m_cache_vma(m, v);
354 struct task_struct *task = priv->task;
355
356 show_map_vma(m, vma, is_pid);
357
358 if (m->count < m->size) /* vma is copied successfully */
359 m->version = (vma != get_gate_vma(task->mm))
360 ? vma->vm_start : 0;
361 return 0; 364 return 0;
362} 365}
363 366
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = {
399 .open = pid_maps_open, 402 .open = pid_maps_open,
400 .read = seq_read, 403 .read = seq_read,
401 .llseek = seq_lseek, 404 .llseek = seq_lseek,
402 .release = seq_release_private, 405 .release = proc_map_release,
403}; 406};
404 407
405const struct file_operations proc_tid_maps_operations = { 408const struct file_operations proc_tid_maps_operations = {
406 .open = tid_maps_open, 409 .open = tid_maps_open,
407 .read = seq_read, 410 .read = seq_read,
408 .llseek = seq_lseek, 411 .llseek = seq_lseek,
409 .release = seq_release_private, 412 .release = proc_map_release,
410}; 413};
411 414
412/* 415/*
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
583 586
584static int show_smap(struct seq_file *m, void *v, int is_pid) 587static int show_smap(struct seq_file *m, void *v, int is_pid)
585{ 588{
586 struct proc_maps_private *priv = m->private;
587 struct task_struct *task = priv->task;
588 struct vm_area_struct *vma = v; 589 struct vm_area_struct *vma = v;
589 struct mem_size_stats mss; 590 struct mem_size_stats mss;
590 struct mm_walk smaps_walk = { 591 struct mm_walk smaps_walk = {
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
637 mss.nonlinear >> 10); 638 mss.nonlinear >> 10);
638 639
639 show_smap_vma_flags(m, vma); 640 show_smap_vma_flags(m, vma);
640 641 m_cache_vma(m, vma);
641 if (m->count < m->size) /* vma is copied successfully */
642 m->version = (vma != get_gate_vma(task->mm))
643 ? vma->vm_start : 0;
644 return 0; 642 return 0;
645} 643}
646 644
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = {
682 .open = pid_smaps_open, 680 .open = pid_smaps_open,
683 .read = seq_read, 681 .read = seq_read,
684 .llseek = seq_lseek, 682 .llseek = seq_lseek,
685 .release = seq_release_private, 683 .release = proc_map_release,
686}; 684};
687 685
688const struct file_operations proc_tid_smaps_operations = { 686const struct file_operations proc_tid_smaps_operations = {
689 .open = tid_smaps_open, 687 .open = tid_smaps_open,
690 .read = seq_read, 688 .read = seq_read,
691 .llseek = seq_lseek, 689 .llseek = seq_lseek,
692 .release = seq_release_private, 690 .release = proc_map_release,
693}; 691};
694 692
695/* 693/*
@@ -931,23 +929,32 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
931 while (addr < end) { 929 while (addr < end) {
932 struct vm_area_struct *vma = find_vma(walk->mm, addr); 930 struct vm_area_struct *vma = find_vma(walk->mm, addr);
933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 931 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
934 unsigned long vm_end; 932 /* End of address space hole, which we mark as non-present. */
935 933 unsigned long hole_end;
936 if (!vma) { 934
937 vm_end = end; 935 if (vma)
938 } else { 936 hole_end = min(end, vma->vm_start);
939 vm_end = min(end, vma->vm_end); 937 else
940 if (vma->vm_flags & VM_SOFTDIRTY) 938 hole_end = end;
941 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); 939
940 for (; addr < hole_end; addr += PAGE_SIZE) {
941 err = add_to_pagemap(addr, &pme, pm);
942 if (err)
943 goto out;
942 } 944 }
943 945
944 for (; addr < vm_end; addr += PAGE_SIZE) { 946 if (!vma)
947 break;
948
949 /* Addresses in the VMA. */
950 if (vma->vm_flags & VM_SOFTDIRTY)
951 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
952 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
945 err = add_to_pagemap(addr, &pme, pm); 953 err = add_to_pagemap(addr, &pme, pm);
946 if (err) 954 if (err)
947 goto out; 955 goto out;
948 } 956 }
949 } 957 }
950
951out: 958out:
952 return err; 959 return err;
953} 960}
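The pagemap_pte_hole() rewrite above (and the pagemap_pte_range() rewrite below) make the walk two-phase: first emit not-present entries for the hole up to hole_end, the next VMA's start clamped to the walk end, then emit entries for addresses inside the VMA, then refetch the VMA and repeat. A runnable model of that loop over made-up ranges:

#include <stdio.h>

#define PAGE 0x1000UL

struct vma { unsigned long start, end; };

static struct vma vmas[] = { { 0x2000, 0x4000 }, { 0x7000, 0x8000 } };

static struct vma *find_vma(unsigned long addr)
{
	unsigned int i;

	for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vmas[i].end > addr)	/* first vma ending above addr */
			return &vmas[i];
	return NULL;
}

int main(void)
{
	unsigned long addr = 0, end = 0x9000;
	struct vma *vma = find_vma(addr);

	for (;;) {
		/* Phase 1: the hole before the next vma (if any). */
		unsigned long hole_end =
			(vma && vma->start < end) ? vma->start : end;

		for (; addr < hole_end; addr += PAGE)
			printf("%#lx not present\n", addr);

		if (!vma || vma->start >= end)
			break;

		/* Phase 2: addresses inside the vma. */
		for (; addr < (vma->end < end ? vma->end : end); addr += PAGE)
			printf("%#lx in vma\n", addr);

		if (addr == end)
			break;
		vma = find_vma(addr);	/* refetch, as the new loop does */
	}
	return 0;
}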
@@ -1020,7 +1027,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1020 spinlock_t *ptl; 1027 spinlock_t *ptl;
1021 pte_t *pte; 1028 pte_t *pte;
1022 int err = 0; 1029 int err = 0;
1023 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1024 1030
1025 /* find the first VMA at or above 'addr' */ 1031 /* find the first VMA at or above 'addr' */
1026 vma = find_vma(walk->mm, addr); 1032 vma = find_vma(walk->mm, addr);
@@ -1034,6 +1040,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1034 1040
1035 for (; addr != end; addr += PAGE_SIZE) { 1041 for (; addr != end; addr += PAGE_SIZE) {
1036 unsigned long offset; 1042 unsigned long offset;
1043 pagemap_entry_t pme;
1037 1044
1038 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1045 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1039 PAGE_SHIFT; 1046 PAGE_SHIFT;
@@ -1048,32 +1055,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1048 1055
1049 if (pmd_trans_unstable(pmd)) 1056 if (pmd_trans_unstable(pmd))
1050 return 0; 1057 return 0;
1051 for (; addr != end; addr += PAGE_SIZE) { 1058
1052 int flags2; 1059 while (1) {
1053 1060 /* End of address space hole, which we mark as non-present. */
1054 /* check to see if we've left 'vma' behind 1061 unsigned long hole_end;
1055 * and need a new, higher one */ 1062
1056 if (vma && (addr >= vma->vm_end)) { 1063 if (vma)
1057 vma = find_vma(walk->mm, addr); 1064 hole_end = min(end, vma->vm_start);
1058 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1065 else
1059 flags2 = __PM_SOFT_DIRTY; 1066 hole_end = end;
1060 else 1067
1061 flags2 = 0; 1068 for (; addr < hole_end; addr += PAGE_SIZE) {
1062 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1069 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1070
1071 err = add_to_pagemap(addr, &pme, pm);
1072 if (err)
1073 return err;
1063 } 1074 }
1064 1075
1065 /* check that 'vma' actually covers this address, 1076 if (!vma || vma->vm_start >= end)
1066 * and that it isn't a huge page vma */ 1077 break;
1067 if (vma && (vma->vm_start <= addr) && 1078 /*
1068 !is_vm_hugetlb_page(vma)) { 1079 * We can't possibly be in a hugetlb VMA. In general,
1080 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1081 * the pmd_entry can only be called on addresses in a
1082 * hugetlb if the walk starts in a non-hugetlb VMA and
1083 * spans a hugepage VMA. Since pagemap_read walks are
1084 * PMD-sized and PMD-aligned, this will never be true.
1085 */
1086 BUG_ON(is_vm_hugetlb_page(vma));
1087
1088 /* Addresses in the VMA. */
1089 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1090 pagemap_entry_t pme;
1069 pte = pte_offset_map(pmd, addr); 1091 pte = pte_offset_map(pmd, addr);
1070 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1092 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1071 /* unmap before userspace copy */
1072 pte_unmap(pte); 1093 pte_unmap(pte);
1094 err = add_to_pagemap(addr, &pme, pm);
1095 if (err)
1096 return err;
1073 } 1097 }
1074 err = add_to_pagemap(addr, &pme, pm); 1098
1075 if (err) 1099 if (addr == end)
1076 return err; 1100 break;
1101
1102 vma = find_vma(walk->mm, addr);
1077 } 1103 }
1078 1104
1079 cond_resched(); 1105 cond_resched();
@@ -1406,7 +1432,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1406 struct vm_area_struct *vma = v; 1432 struct vm_area_struct *vma = v;
1407 struct numa_maps *md = &numa_priv->md; 1433 struct numa_maps *md = &numa_priv->md;
1408 struct file *file = vma->vm_file; 1434 struct file *file = vma->vm_file;
1409 struct task_struct *task = proc_priv->task;
1410 struct mm_struct *mm = vma->vm_mm; 1435 struct mm_struct *mm = vma->vm_mm;
1411 struct mm_walk walk = {}; 1436 struct mm_walk walk = {};
1412 struct mempolicy *pol; 1437 struct mempolicy *pol;
@@ -1426,9 +1451,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1426 walk.private = md; 1451 walk.private = md;
1427 walk.mm = mm; 1452 walk.mm = mm;
1428 1453
1429 pol = get_vma_policy(task, vma, vma->vm_start); 1454 pol = __get_vma_policy(vma, vma->vm_start);
1430 mpol_to_str(buffer, sizeof(buffer), pol); 1455 if (pol) {
1431 mpol_cond_put(pol); 1456 mpol_to_str(buffer, sizeof(buffer), pol);
1457 mpol_cond_put(pol);
1458 } else {
1459 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1460 }
1432 1461
1433 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1462 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1434 1463
@@ -1438,7 +1467,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1438 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1467 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1439 seq_puts(m, " heap"); 1468 seq_puts(m, " heap");
1440 } else { 1469 } else {
1441 pid_t tid = vm_is_stack(task, vma, is_pid); 1470 pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
1442 if (tid != 0) { 1471 if (tid != 0) {
1443 /* 1472 /*
1444 * Thread stack in /proc/PID/task/TID/maps or 1473 * Thread stack in /proc/PID/task/TID/maps or
@@ -1486,9 +1515,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1486 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1515 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1487out: 1516out:
1488 seq_putc(m, '\n'); 1517 seq_putc(m, '\n');
1489 1518 m_cache_vma(m, vma);
1490 if (m->count < m->size)
1491 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1492 return 0; 1519 return 0;
1493} 1520}
1494 1521
@@ -1519,20 +1546,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
1519static int numa_maps_open(struct inode *inode, struct file *file, 1546static int numa_maps_open(struct inode *inode, struct file *file,
1520 const struct seq_operations *ops) 1547 const struct seq_operations *ops)
1521{ 1548{
1522 struct numa_maps_private *priv; 1549 return proc_maps_open(inode, file, ops,
1523 int ret = -ENOMEM; 1550 sizeof(struct numa_maps_private));
1524 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1525 if (priv) {
1526 priv->proc_maps.pid = proc_pid(inode);
1527 ret = seq_open(file, ops);
1528 if (!ret) {
1529 struct seq_file *m = file->private_data;
1530 m->private = priv;
1531 } else {
1532 kfree(priv);
1533 }
1534 }
1535 return ret;
1536} 1551}
1537 1552
1538static int pid_numa_maps_open(struct inode *inode, struct file *file) 1553static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1549,13 +1564,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
1549 .open = pid_numa_maps_open, 1564 .open = pid_numa_maps_open,
1550 .read = seq_read, 1565 .read = seq_read,
1551 .llseek = seq_lseek, 1566 .llseek = seq_lseek,
1552 .release = seq_release_private, 1567 .release = proc_map_release,
1553}; 1568};
1554 1569
1555const struct file_operations proc_tid_numa_maps_operations = { 1570const struct file_operations proc_tid_numa_maps_operations = {
1556 .open = tid_numa_maps_open, 1571 .open = tid_numa_maps_open,
1557 .read = seq_read, 1572 .read = seq_read,
1558 .llseek = seq_lseek, 1573 .llseek = seq_lseek,
1559 .release = seq_release_private, 1574 .release = proc_map_release,
1560}; 1575};
1561#endif /* CONFIG_NUMA */ 1576#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..599ec2e20104 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
123 return size; 123 return size;
124} 124}
125 125
126static pid_t pid_of_stack(struct proc_maps_private *priv,
127 struct vm_area_struct *vma, bool is_pid)
128{
129 struct inode *inode = priv->inode;
130 struct task_struct *task;
131 pid_t ret = 0;
132
133 rcu_read_lock();
134 task = pid_task(proc_pid(inode), PIDTYPE_PID);
135 if (task) {
136 task = task_of_stack(task, vma, is_pid);
137 if (task)
138 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
139 }
140 rcu_read_unlock();
141
142 return ret;
143}
144
126/* 145/*
127 * display a single VMA to a sequenced file 146 * display a single VMA to a sequenced file
128 */ 147 */
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
163 seq_pad(m, ' '); 182 seq_pad(m, ' ');
164 seq_path(m, &file->f_path, ""); 183 seq_path(m, &file->f_path, "");
165 } else if (mm) { 184 } else if (mm) {
166 pid_t tid = vm_is_stack(priv->task, vma, is_pid); 185 pid_t tid = pid_of_stack(priv, vma, is_pid);
167 186
168 if (tid != 0) { 187 if (tid != 0) {
169 seq_pad(m, ' '); 188 seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
212 loff_t n = *pos; 231 loff_t n = *pos;
213 232
214 /* pin the task and mm whilst we play with them */ 233 /* pin the task and mm whilst we play with them */
215 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 234 priv->task = get_proc_task(priv->inode);
216 if (!priv->task) 235 if (!priv->task)
217 return ERR_PTR(-ESRCH); 236 return ERR_PTR(-ESRCH);
218 237
219 mm = mm_access(priv->task, PTRACE_MODE_READ); 238 mm = priv->mm;
220 if (!mm || IS_ERR(mm)) { 239 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
221 put_task_struct(priv->task); 240 return NULL;
222 priv->task = NULL;
223 return mm;
224 }
225 down_read(&mm->mmap_sem);
226 241
242 down_read(&mm->mmap_sem);
227 /* start from the Nth VMA */ 243 /* start from the Nth VMA */
228 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) 244 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
229 if (n-- == 0) 245 if (n-- == 0)
230 return p; 246 return p;
247
248 up_read(&mm->mmap_sem);
249 mmput(mm);
231 return NULL; 250 return NULL;
232} 251}
233 252
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
235{ 254{
236 struct proc_maps_private *priv = m->private; 255 struct proc_maps_private *priv = m->private;
237 256
257 if (!IS_ERR_OR_NULL(_vml)) {
258 up_read(&priv->mm->mmap_sem);
259 mmput(priv->mm);
260 }
238 if (priv->task) { 261 if (priv->task) {
239 struct mm_struct *mm = priv->task->mm;
240 up_read(&mm->mmap_sem);
241 mmput(mm);
242 put_task_struct(priv->task); 262 put_task_struct(priv->task);
263 priv->task = NULL;
243 } 264 }
244} 265}
245 266
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file,
269 const struct seq_operations *ops) 290 const struct seq_operations *ops)
270{ 291{
271 struct proc_maps_private *priv; 292 struct proc_maps_private *priv;
272 int ret = -ENOMEM; 293
273 294 priv = __seq_open_private(file, ops, sizeof(*priv));
274 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 295 if (!priv)
275 if (priv) { 296 return -ENOMEM;
276 priv->pid = proc_pid(inode); 297
277 ret = seq_open(file, ops); 298 priv->inode = inode;
278 if (!ret) { 299 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
279 struct seq_file *m = file->private_data; 300 if (IS_ERR(priv->mm)) {
280 m->private = priv; 301 int err = PTR_ERR(priv->mm);
281 } else { 302
282 kfree(priv); 303 seq_release_private(inode, file);
283 } 304 return err;
284 } 305 }
285 return ret; 306
307 return 0;
308}
309
310
311static int map_release(struct inode *inode, struct file *file)
312{
313 struct seq_file *seq = file->private_data;
314 struct proc_maps_private *priv = seq->private;
315
316 if (priv->mm)
317 mmdrop(priv->mm);
318
319 return seq_release_private(inode, file);
286} 320}
287 321
288static int pid_maps_open(struct inode *inode, struct file *file) 322static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = {
299 .open = pid_maps_open, 333 .open = pid_maps_open,
300 .read = seq_read, 334 .read = seq_read,
301 .llseek = seq_lseek, 335 .llseek = seq_lseek,
302 .release = seq_release_private, 336 .release = map_release,
303}; 337};
304 338
305const struct file_operations proc_tid_maps_operations = { 339const struct file_operations proc_tid_maps_operations = {
306 .open = tid_maps_open, 340 .open = tid_maps_open,
307 .read = seq_read, 341 .read = seq_read,
308 .llseek = seq_lseek, 342 .llseek = seq_lseek,
309 .release = seq_release_private, 343 .release = map_release,
310}; 344};
311 345
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f2d0eee9d1f1..8b663b2d9562 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2725,7 +2725,7 @@ static int __init dquot_init(void)
2725 panic("Cannot create dquot hash table"); 2725 panic("Cannot create dquot hash table");
2726 2726
2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) { 2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2728 ret = percpu_counter_init(&dqstats.counter[i], 0); 2728 ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
2729 if (ret) 2729 if (ret)
2730 panic("Cannot create dquot stat counters"); 2730 panic("Cannot create dquot stat counters");
2731 } 2731 }
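percpu_counter_init() grew a gfp_t argument in this series because it allocates per-CPU storage, so each caller now states its allocation context explicitly; dquot_init() here and alloc_super() further down both run in plain process context and pass GFP_KERNEL. The call shape, as a kernel-context sketch rather than a runnable program:

	struct percpu_counter c;
	int err;

	err = percpu_counter_init(&c, 0, GFP_KERNEL);
	if (err)
		return err;
	/* ... use the counter ... */
	percpu_counter_destroy(&c);

A caller that can be reached under filesystem locks would pass GFP_NOFS instead; that is standard gfp discipline rather than anything these hunks show.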
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 735c2c2b4536..1894d96ccb7c 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -506,6 +506,9 @@ typedef struct reiserfs_proc_info_data {
506} reiserfs_proc_info_data_t; 506} reiserfs_proc_info_data_t;
507#endif 507#endif
508 508
509/* Number of quota types we support */
510#define REISERFS_MAXQUOTAS 2
511
509/* reiserfs union of in-core super block data */ 512/* reiserfs union of in-core super block data */
510struct reiserfs_sb_info { 513struct reiserfs_sb_info {
511 /* Buffer containing the super block */ 514 /* Buffer containing the super block */
@@ -615,7 +618,7 @@ struct reiserfs_sb_info {
615 spinlock_t old_work_lock; /* protects old_work and work_queued */ 618 spinlock_t old_work_lock; /* protects old_work and work_queued */
616 619
617#ifdef CONFIG_QUOTA 620#ifdef CONFIG_QUOTA
618 char *s_qf_names[MAXQUOTAS]; 621 char *s_qf_names[REISERFS_MAXQUOTAS];
619 int s_jquota_fmt; 622 int s_jquota_fmt;
620#endif 623#endif
621 char *s_jdev; /* Stored jdev for mount option showing */ 624 char *s_jdev; /* Stored jdev for mount option showing */
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index d46e88a33b02..f1376c92cf74 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -206,7 +206,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
-	int quota_enabled[MAXQUOTAS];
+	int quota_enabled[REISERFS_MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -227,7 +227,7 @@ static int finish_unfinished(struct super_block *s)
 		s->s_flags |= MS_ACTIVE;
 	}
 	/* Turn on quotas so that they are updated correctly */
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
 		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
 			int ret;
@@ -370,7 +370,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	reiserfs_write_unlock(s);
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
 		if (sb_dqopt(s)->files[i] && quota_enabled[i])
 			dquot_quota_off(s, i);
 	}
@@ -1360,7 +1360,7 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
 {
 	int i;
 
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
 		if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
 			kfree(REISERFS_SB(s)->s_qf_names[i]);
 		REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
@@ -1381,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	char *new_opts = kstrdup(arg, GFP_KERNEL);
 	int err;
-	char *qf_names[MAXQUOTAS];
+	char *qf_names[REISERFS_MAXQUOTAS];
 	unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
 	int i;
@@ -1400,7 +1400,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
 	    qf_names, &qfmt)) {
 #ifdef CONFIG_QUOTA
-		for (i = 0; i < MAXQUOTAS; i++)
+		for (i = 0; i < REISERFS_MAXQUOTAS; i++)
 			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
 				kfree(qf_names[i]);
 #endif
@@ -1844,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	char *jdev_name;
 	struct reiserfs_sb_info *sbi;
 	int errval = -EINVAL;
-	char *qf_names[MAXQUOTAS] = {};
+	char *qf_names[REISERFS_MAXQUOTAS] = {};
 	unsigned int qfmt = 0;
 
 	save_mount_options(s, data);
@@ -2169,7 +2169,7 @@ error_unlocked:
 #ifdef CONFIG_QUOTA
 	{
 		int j;
-		for (j = 0; j < MAXQUOTAS; j++)
+		for (j = 0; j < REISERFS_MAXQUOTAS; j++)
 			kfree(qf_names[j]);
 	}
 #endif
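The reiserfs hunks above all make the same substitution: every quota array that was sized by the VFS-wide MAXQUOTAS is now sized by a filesystem-private REISERFS_MAXQUOTAS. A plausible reading (not stated in the diff itself) is that this decouples reiserfs, which only knows user and group quotas, from any future growth of the generic constant. A minimal sketch of the pattern, with hypothetical names:

/* sketch: size private arrays by the quota types this fs supports,
 * not by the global maximum (hypothetical example, not kernel code) */
#define MYFS_MAXQUOTAS 2	/* user + group; project quota unsupported */

struct myfs_sb_info {
	char *qf_names[MYFS_MAXQUOTAS];	/* stays 2 even if VFS MAXQUOTAS grows */
};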
diff --git a/fs/stack.c b/fs/stack.c
index 5b5388250e29..a54e33ed10f1 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
 	 * is called, so take i_lock for that case.
 	 *
-	 * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the
+	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
 	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
 	 * for that case too, and do both at once by combining the tests.
 	 *
diff --git a/fs/super.c b/fs/super.c
index b9a214d2fe98..1b836107acee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		goto fail;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+		if (percpu_counter_init(&s->s_writers.counter[i], 0,
+					GFP_KERNEL) < 0)
 			goto fail;
 		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
 				 &type->s_writers_key[i], 0);
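The alloc_super() hunk reflects an API change rather than a logic change: percpu_counter_init() now takes a gfp_t that is forwarded to its internal per-cpu allocation. A minimal call-site sketch, assuming process context where GFP_KERNEL is safe (the function name here is hypothetical):

#include <linux/percpu_counter.h>

/* sketch: a caller updated for the three-argument form */
static int example_counter_setup(struct percpu_counter *pc)
{
	/* the new gfp_t argument controls how the per-cpu data is allocated */
	return percpu_counter_init(pc, 0, GFP_KERNEL);
}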
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd10e8b..bdc729d80e5e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb)
 		return ret;
 	return __sync_filesystem(sb, 1);
 }
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c350216ea8..b46ffa94372a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		spin_lock_irq(&ctx->wqh.lock);
 		if (!timerfd_canceled(ctx)) {
 			ctx->ticks = ticks;
-			if (ticks)
-				wake_up_locked(&ctx->wqh);
+			wake_up_locked(&ctx->wqh);
 		} else
 			ret = -ECANCELED;
 		spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 86c6743ec1fe..bb15771b92ae 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -223,11 +223,18 @@ out:
 
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
-	if (filp->f_mode & FMODE_WRITE) {
+	if (filp->f_mode & FMODE_WRITE &&
+	    atomic_read(&inode->i_writecount) > 1) {
+		/*
+		 * Grab i_mutex to avoid races with writes changing i_size
+		 * while we are running.
+		 */
+		mutex_lock(&inode->i_mutex);
 		down_write(&UDF_I(inode)->i_data_sem);
 		udf_discard_prealloc(inode);
 		udf_truncate_tail_extent(inode);
 		up_write(&UDF_I(inode)->i_data_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6eaf5edf1ea1..e77db621ec89 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode)
 	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
-struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
@@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	int err;
 
 	inode = new_inode(sb);
 
-	if (!inode) {
-		*err = -ENOMEM;
-		return NULL;
-	}
-	*err = -ENOSPC;
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 
 	iinfo = UDF_I(inode);
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
@@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	}
 	if (!iinfo->i_ext.i_data) {
 		iput(inode);
-		*err = -ENOMEM;
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	err = -ENOSPC;
 	block = udf_new_block(dir->i_sb, NULL,
 			      dinfo->i_location.partitionReferenceNum,
-			      start, err);
-	if (*err) {
+			      start, &err);
+	if (err) {
 		iput(inode);
-		return NULL;
+		return ERR_PTR(err);
 	}
 
 	lvidiu = udf_sb_lvidiu(sb);
 	if (lvidiu) {
 		iinfo->i_unique = lvid_get_unique_id(sb);
+		inode->i_generation = iinfo->i_unique;
 		mutex_lock(&sbi->s_alloc_mutex);
 		if (S_ISDIR(mode))
 			le32_add_cpu(&lvidiu->numDirs, 1);
@@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 		iinfo->i_crtime = current_fs_time(inode->i_sb);
-	insert_inode_hash(inode);
+	if (unlikely(insert_inode_locked(inode) < 0)) {
+		make_bad_inode(inode);
+		iput(inode);
+		return ERR_PTR(-EIO);
+	}
 	mark_inode_dirty(inode);
 
-	*err = 0;
 	return inode;
 }
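With this change udf_new_inode() stops reporting failure through a NULL return plus an int *err out-parameter and instead encodes the errno in the returned pointer. Callers then follow the standard ERR_PTR/IS_ERR/PTR_ERR idiom; a minimal sketch of the caller side, with a hypothetical caller name:

/* sketch: consuming an ERR_PTR-style constructor (hypothetical caller) */
static int example_caller(struct inode *dir, umode_t mode)
{
	struct inode *inode = udf_new_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* recover the negative errno */
	/* ... set up and use the new inode ... */
	return 0;
}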
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 236cd48184c2..c9b4df5810d5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -51,7 +51,6 @@ MODULE_LICENSE("GPL");
 
 static umode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
-static void udf_fill_inode(struct inode *, struct buffer_head *);
 static int udf_sync_inode(struct inode *inode);
 static int udf_alloc_i_data(struct inode *inode, size_t size);
 static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
@@ -1271,12 +1270,33 @@ update_time:
 	return 0;
 }
 
-static void __udf_read_inode(struct inode *inode)
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
+static int udf_read_inode(struct inode *inode, bool hidden_inode)
 {
 	struct buffer_head *bh = NULL;
 	struct fileEntry *fe;
+	struct extendedFileEntry *efe;
 	uint16_t ident;
 	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+	struct kernel_lb_addr *iloc = &iinfo->i_location;
+	unsigned int link_count;
+	unsigned int indirections = 0;
+	int ret = -EIO;
+
+reread:
+	if (iloc->logicalBlockNum >=
+	    sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+		udf_debug("block=%d, partition=%d out of range\n",
+			  iloc->logicalBlockNum, iloc->partitionReferenceNum);
+		return -EIO;
+	}
 
 	/*
 	 * Set defaults, but the inode is still incomplete!
@@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode)
 	 *	i_nlink = 1
 	 *	i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
 	if (!bh) {
 		udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
-		make_bad_inode(inode);
-		return;
+		return -EIO;
 	}
 
 	if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
 	    ident != TAG_IDENT_USE) {
 		udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
 			inode->i_ino, ident);
-		brelse(bh);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 
 	fe = (struct fileEntry *)bh->b_data;
+	efe = (struct extendedFileEntry *)bh->b_data;
 
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
-					&ident);
+		ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
 		if (ident == TAG_IDENT_IE && ibh) {
-			struct buffer_head *nbh = NULL;
 			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
-			if (ie->indirectICB.extLength &&
-			    (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
-						    &ident))) {
-				if (ident == TAG_IDENT_FE ||
-				    ident == TAG_IDENT_EFE) {
-					memcpy(&iinfo->i_location,
-					       &loc,
-					       sizeof(struct kernel_lb_addr));
-					brelse(bh);
-					brelse(ibh);
-					brelse(nbh);
-					__udf_read_inode(inode);
-					return;
+			if (ie->indirectICB.extLength) {
+				brelse(ibh);
+				memcpy(&iinfo->i_location, &loc,
+				       sizeof(struct kernel_lb_addr));
+				if (++indirections > UDF_MAX_ICB_NESTING) {
+					udf_err(inode->i_sb,
+						"too many ICBs in ICB hierarchy"
+						" (max %d supported)\n",
+						UDF_MAX_ICB_NESTING);
+					goto out;
 				}
-				brelse(nbh);
+				brelse(bh);
+				goto reread;
 			}
 		}
 		brelse(ibh);
 	} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
 		udf_err(inode->i_sb, "unsupported strategy type: %d\n",
 			le16_to_cpu(fe->icbTag.strategyType));
-		brelse(bh);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
-	udf_fill_inode(inode, bh);
-
-	brelse(bh);
-}
-
-static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
-{
-	struct fileEntry *fe;
-	struct extendedFileEntry *efe;
-	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
-	struct udf_inode_info *iinfo = UDF_I(inode);
-	unsigned int link_count;
-
-	fe = (struct fileEntry *)bh->b_data;
-	efe = (struct extendedFileEntry *)bh->b_data;
-
 	if (fe->icbTag.strategyType == cpu_to_le16(4))
 		iinfo->i_strat4096 = 0;
 	else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
@@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
 		iinfo->i_efe = 1;
 		iinfo->i_use = 0;
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-					sizeof(struct extendedFileEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+					sizeof(struct extendedFileEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct extendedFileEntry),
 		       inode->i_sb->s_blocksize -
@@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 0;
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-						sizeof(struct fileEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+						sizeof(struct fileEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct fileEntry),
 		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
@@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		iinfo->i_lenAlloc = le32_to_cpu(
 				((struct unallocSpaceEntry *)bh->b_data)->
 				 lengthAllocDescs);
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-					sizeof(struct unallocSpaceEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+					sizeof(struct unallocSpaceEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct unallocSpaceEntry),
 		       inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
-		return;
+		return 0;
 	}
 
+	ret = -EIO;
 	read_lock(&sbi->s_cred_lock);
 	i_uid_write(inode, le32_to_cpu(fe->uid));
 	if (!uid_valid(inode->i_uid) ||
@@ -1441,8 +1435,13 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	read_unlock(&sbi->s_cred_lock);
 
 	link_count = le16_to_cpu(fe->fileLinkCount);
-	if (!link_count)
+	if (!link_count) {
+		if (!hidden_inode) {
+			ret = -ESTALE;
+			goto out;
+		}
 		link_count = 1;
+	}
 	set_nlink(inode, link_count);
 
 	inode->i_size = le64_to_cpu(fe->informationLength);
@@ -1488,6 +1487,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
 		iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
 	}
+	inode->i_generation = iinfo->i_unique;
 
 	switch (fe->icbTag.fileType) {
 	case ICBTAG_FILE_TYPE_DIRECTORY:
@@ -1537,8 +1537,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	default:
 		udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
 			inode->i_ino, fe->icbTag.fileType);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		struct deviceSpec *dsea =
@@ -1549,8 +1548,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 				le32_to_cpu(dsea->minorDeviceIdent)));
 		/* Developer ID ??? */
 	} else
-		make_bad_inode(inode);
+		goto out;
 	}
+	ret = 0;
+out:
+	brelse(bh);
+	return ret;
 }
 
 static int udf_alloc_i_data(struct inode *inode, size_t size)
@@ -1664,7 +1667,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 			FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
 	fe->permissions = cpu_to_le32(udfperms);
 
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
 		fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
 	else
 		fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
@@ -1826,36 +1829,28 @@ out:
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
+struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
+			 bool hidden_inode)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
+	int err;
 
 	if (!inode)
-		return NULL;
-
-	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
-		__udf_read_inode(inode);
-		unlock_new_inode(inode);
-	}
+		return ERR_PTR(-ENOMEM);
 
-	if (is_bad_inode(inode))
-		goto out_iput;
+	if (!(inode->i_state & I_NEW))
+		return inode;
 
-	if (ino->logicalBlockNum >= UDF_SB(sb)->
-			s_partmaps[ino->partitionReferenceNum].s_partition_len) {
-		udf_debug("block=%d, partition=%d out of range\n",
-			  ino->logicalBlockNum, ino->partitionReferenceNum);
-		make_bad_inode(inode);
-		goto out_iput;
+	memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+	err = udf_read_inode(inode, hidden_inode);
+	if (err < 0) {
+		iget_failed(inode);
+		return ERR_PTR(err);
 	}
+	unlock_new_inode(inode);
 
 	return inode;
-
- out_iput:
-	iput(inode);
-	return NULL;
 }
 
 int udf_add_aext(struct inode *inode, struct extent_position *epos,
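The rewrite of __udf_read_inode() into udf_read_inode() replaces unbounded recursion through indirect ICBs with an iterative reread loop capped at UDF_MAX_ICB_NESTING, so corrupted (or maliciously crafted) media can no longer recurse the kernel stack to death. The control flow reduces to a bounded follow-the-link loop; a condensed C sketch with hypothetical helper names (this is not the UDF code itself):

#include <errno.h>

#define MAX_NESTING 1024

struct ctx;				/* hypothetical traversal state */
int load_block(struct ctx *c);		/* read + tag-check current block */
int block_is_indirect(struct ctx *c);	/* is it an indirection entry? */
int parse_block(struct ctx *c);		/* fill in-core inode from entry */
void follow_indirection(struct ctx *c);	/* retarget to the linked block */

int read_with_indirection_limit(struct ctx *c)
{
	unsigned int indirections = 0;

	for (;;) {
		if (!load_block(c))
			return -EIO;		/* I/O or tag check failed */
		if (!block_is_indirect(c))
			return parse_block(c);	/* reached the real entry */
		if (++indirections > MAX_NESTING)
			return -EIO;		/* refuse runaway chains */
		follow_indirection(c);		/* then loop and reread */
	}
}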
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 9737cba1357d..c12e260fd6c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 					NULL, 0),
 		};
 		inode = udf_iget(dir->i_sb, lb);
-		if (!inode) {
-			return ERR_PTR(-EACCES);
-		}
+		if (IS_ERR(inode))
+			return inode;
 	} else
 #endif /* UDF_RECOVERY */
 
@@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 
 		loc = lelb_to_cpu(cfi.icb.extLocation);
 		inode = udf_iget(dir->i_sb, &loc);
-		if (!inode) {
-			return ERR_PTR(-EACCES);
-		}
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
 
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 {
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct inode *dir = dentry->d_parent->d_inode;
 	struct udf_fileident_bh fibh;
-	struct inode *inode;
 	struct fileIdentDesc cfi, *fi;
 	int err;
-	struct udf_inode_info *iinfo;
-
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode) {
-		return err;
-	}
-
-	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		inode->i_data.a_ops = &udf_adinicb_aops;
-	else
-		inode->i_data.a_ops = &udf_aops;
-	inode->i_op = &udf_file_inode_operations;
-	inode->i_fop = &udf_file_operations;
-	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi) {
+	if (unlikely(!fi)) {
 		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		return err;
 	}
@@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 
 	return 0;
 }
 
-static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
 {
-	struct inode *inode;
-	struct udf_inode_info *iinfo;
-	int err;
+	struct inode *inode = udf_new_inode(dir, mode);
 
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode)
-		return err;
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		inode->i_data.a_ops = &udf_adinicb_aops;
 	else
 		inode->i_data.a_ops = &udf_aops;
@@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_fop = &udf_file_operations;
 	mark_inode_dirty(inode);
 
+	return udf_add_nondir(dentry, inode);
+}
+
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = udf_new_inode(dir, mode);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	else
+		inode->i_data.a_ops = &udf_aops;
+	inode->i_op = &udf_file_inode_operations;
+	inode->i_fop = &udf_file_operations;
+	mark_inode_dirty(inode);
 	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 }
 
@@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		     dev_t rdev)
 {
 	struct inode *inode;
-	struct udf_fileident_bh fibh;
-	struct fileIdentDesc cfi, *fi;
-	int err;
-	struct udf_inode_info *iinfo;
 
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	err = -EIO;
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode)
-		goto out;
+	inode = udf_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	iinfo = UDF_I(inode);
 	init_special_inode(inode, mode, rdev);
-	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi) {
-		inode_dec_link_count(inode);
-		iput(inode);
-		return err;
-	}
-	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
-	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-	*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-		cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
-	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		mark_inode_dirty(dir);
-	mark_inode_dirty(inode);
-
-	if (fibh.sbh != fibh.ebh)
-		brelse(fibh.ebh);
-	brelse(fibh.sbh);
-	d_instantiate(dentry, inode);
-	err = 0;
-
-out:
-	return err;
+	return udf_add_nondir(dentry, inode);
 }
 
 static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct udf_inode_info *iinfo;
 
-	err = -EIO;
-	inode = udf_new_inode(dir, S_IFDIR | mode, &err);
-	if (!inode)
-		goto out;
+	inode = udf_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	iinfo = UDF_I(inode);
 	inode->i_op = &udf_dir_inode_operations;
@@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
 	if (!fi) {
 		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out;
 	}
@@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (!fi) {
 		clear_nlink(inode);
 		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out;
 	}
@@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
 	inc_nlink(dir);
 	mark_inode_dirty(dir);
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
@@ -876,14 +850,11 @@ out:
 static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *symname)
 {
-	struct inode *inode;
+	struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	struct pathComponent *pc;
 	const char *compstart;
-	struct udf_fileident_bh fibh;
 	struct extent_position epos = {};
 	int eoffset, elen = 0;
-	struct fileIdentDesc *fi;
-	struct fileIdentDesc cfi;
 	uint8_t *ea;
 	int err;
 	int block;
@@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	struct udf_inode_info *iinfo;
 	struct super_block *sb = dir->i_sb;
 
-	inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
-	if (!inode)
-		goto out;
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	iinfo = UDF_I(inode);
 	down_write(&iinfo->i_data_sem);
@@ -1012,24 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	mark_inode_dirty(inode);
 	up_write(&iinfo->i_data_sem);
 
-	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi)
-		goto out_no_entry;
-	cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
-	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-	if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-		*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-			cpu_to_le32(lvid_get_unique_id(sb));
-	}
-	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		mark_inode_dirty(dir);
-	if (fibh.sbh != fibh.ebh)
-		brelse(fibh.ebh);
-	brelse(fibh.sbh);
-	d_instantiate(dentry, inode);
-	err = 0;
-
+	err = udf_add_nondir(dentry, inode);
 out:
 	kfree(name);
 	return err;
@@ -1037,6 +990,7 @@ out:
 out_no_entry:
 	up_write(&iinfo->i_data_sem);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	goto out;
 }
@@ -1221,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
 	struct udf_fileident_bh fibh;
 
 	if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
-		goto out_unlock;
+		return ERR_PTR(-EACCES);
 
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
@@ -1229,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
 
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
 	inode = udf_iget(child->d_inode->i_sb, &tloc);
-	if (!inode)
-		goto out_unlock;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	return d_obtain_alias(inode);
-out_unlock:
-	return ERR_PTR(-EACCES);
 }
 
 
@@ -1251,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 	loc.partitionReferenceNum = partref;
 	inode = udf_iget(sb, &loc);
 
-	if (inode == NULL)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	if (generation && inode->i_generation != generation) {
 		iput(inode);
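A recurring detail in these namei hunks: once udf_new_inode() hashes the inode with insert_inode_locked(), the inode stays in the I_NEW state until the caller publishes it, so every path, success and error alike, must call unlock_new_inode() before d_instantiate()/d_tmpfile() or iput(). A condensed sketch of that discipline, assuming an ERR_PTR-returning allocator and a hypothetical directory-link helper:

/* sketch: I_NEW lifecycle for a freshly hashed inode (names illustrative) */
static int example_add(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *inode = udf_new_inode(dir, mode);
	int err;

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	err = example_link_into_dir(dir, dentry, inode);	/* hypothetical */
	if (err) {
		inode_dec_link_count(inode);
		unlock_new_inode(inode);	/* clear I_NEW before dropping it */
		iput(inode);
		return err;
	}
	unlock_new_inode(inode);		/* and before exposing it */
	d_instantiate(dentry, inode);
	return 0;
}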
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 813da94d447b..e229315bbf7a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -959,14 +959,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
 	addr.logicalBlockNum = meta_file_loc;
 	addr.partitionReferenceNum = partition_num;
 
-	metadata_fe = udf_iget(sb, &addr);
+	metadata_fe = udf_iget_special(sb, &addr);
 
-	if (metadata_fe == NULL)
+	if (IS_ERR(metadata_fe)) {
 		udf_warn(sb, "metadata inode efe not found\n");
-	else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+		return metadata_fe;
+	}
+	if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
 		udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
 		iput(metadata_fe);
-		metadata_fe = NULL;
+		return ERR_PTR(-EIO);
 	}
 
 	return metadata_fe;
@@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	struct udf_part_map *map;
 	struct udf_meta_data *mdata;
 	struct kernel_lb_addr addr;
+	struct inode *fe;
 
 	map = &sbi->s_partmaps[partition];
 	mdata = &map->s_type_specific.s_metadata;
@@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	udf_debug("Metadata file location: block = %d part = %d\n",
 		  mdata->s_meta_file_loc, map->s_partition_num);
 
-	mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
-			mdata->s_meta_file_loc, map->s_partition_num);
-
-	if (mdata->s_metadata_fe == NULL) {
+	fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
+					 map->s_partition_num);
+	if (IS_ERR(fe)) {
 		/* mirror file entry */
 		udf_debug("Mirror metadata file location: block = %d part = %d\n",
 			  mdata->s_mirror_file_loc, map->s_partition_num);
 
-		mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
-				mdata->s_mirror_file_loc, map->s_partition_num);
+		fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
+						 map->s_partition_num);
 
-		if (mdata->s_mirror_fe == NULL) {
+		if (IS_ERR(fe)) {
 			udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
-			return -EIO;
+			return PTR_ERR(fe);
 		}
-	}
+		mdata->s_mirror_fe = fe;
+	} else
+		mdata->s_metadata_fe = fe;
+
 
 	/*
 	 * bitmap file entry
@@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 		udf_debug("Bitmap file location: block = %d part = %d\n",
 			  addr.logicalBlockNum, addr.partitionReferenceNum);
 
-		mdata->s_bitmap_fe = udf_iget(sb, &addr);
-		if (mdata->s_bitmap_fe == NULL) {
+		fe = udf_iget_special(sb, &addr);
+		if (IS_ERR(fe)) {
 			if (sb->s_flags & MS_RDONLY)
 				udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
 			else {
 				udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
-				return -EIO;
+				return PTR_ERR(fe);
 			}
-		}
+		} else
+			mdata->s_bitmap_fe = fe;
 	}
 
 	udf_debug("udf_load_metadata_files Ok\n");
@@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 					phd->unallocSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
+		struct inode *inode;
 
-		map->s_uspace.s_table = udf_iget(sb, &loc);
-		if (!map->s_uspace.s_table) {
+		inode = udf_iget_special(sb, &loc);
+		if (IS_ERR(inode)) {
 			udf_debug("cannot load unallocSpaceTable (part %d)\n",
 				  p_index);
-			return -EIO;
+			return PTR_ERR(inode);
 		}
+		map->s_uspace.s_table = inode;
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
 		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
 			  p_index, map->s_uspace.s_table->i_ino);
@@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 					phd->freedSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
+		struct inode *inode;
 
-		map->s_fspace.s_table = udf_iget(sb, &loc);
-		if (!map->s_fspace.s_table) {
+		inode = udf_iget_special(sb, &loc);
+		if (IS_ERR(inode)) {
 			udf_debug("cannot load freedSpaceTable (part %d)\n",
 				  p_index);
-			return -EIO;
+			return PTR_ERR(inode);
 		}
-
+		map->s_fspace.s_table = inode;
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
 		udf_debug("freedSpaceTable (part %d) @ %ld\n",
 			  p_index, map->s_fspace.s_table->i_ino);
@@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
 	struct udf_part_map *map = &sbi->s_partmaps[p_index];
 	sector_t vat_block;
 	struct kernel_lb_addr ino;
+	struct inode *inode;
 
 	/*
 	 * VAT file entry is in the last recorded block. Some broken disks have
@@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
 	ino.partitionReferenceNum = type1_index;
 	for (vat_block = start_block;
 	     vat_block >= map->s_partition_root &&
-	     vat_block >= start_block - 3 &&
-	     !sbi->s_vat_inode; vat_block--) {
+	     vat_block >= start_block - 3; vat_block--) {
 		ino.logicalBlockNum = vat_block - map->s_partition_root;
-		sbi->s_vat_inode = udf_iget(sb, &ino);
+		inode = udf_iget_special(sb, &ino);
+		if (!IS_ERR(inode)) {
+			sbi->s_vat_inode = inode;
+			break;
+		}
 	}
 }
 
@@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* assign inodes by physical block number */
 	/* perhaps it's not extensible enough, but for now ... */
 	inode = udf_iget(sb, &rootdir);
-	if (!inode) {
+	if (IS_ERR(inode)) {
 		udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
 			rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
-		ret = -EIO;
+		ret = PTR_ERR(inode);
 		goto error_out;
 	}
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index be7dabbbcb49..1cc3c993ebd0 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -138,12 +138,22 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 /* file.c */
 extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
-extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
+extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *,
+				bool hidden_inode);
+static inline struct inode *udf_iget_special(struct super_block *sb,
+					     struct kernel_lb_addr *ino)
+{
+	return __udf_iget(sb, ino, true);
+}
+static inline struct inode *udf_iget(struct super_block *sb,
+				     struct kernel_lb_addr *ino)
+{
+	return __udf_iget(sb, ino, false);
+}
 extern int udf_expand_file_adinicb(struct inode *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
 extern int udf_setsize(struct inode *, loff_t);
-extern void udf_read_inode(struct inode *);
 extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
@@ -209,7 +219,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
 
 /* ialloc.c */
 extern void udf_free_inode(struct inode *);
-extern struct inode *udf_new_inode(struct inode *, umode_t, int *);
+extern struct inode *udf_new_inode(struct inode *, umode_t);
 
 /* truncate.c */
 extern void udf_truncate_tail_extent(struct inode *);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 1f11483eba6a..77c331f1a770 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -81,8 +81,6 @@ static time_t year_seconds[MAX_YEAR_SECONDS] = {
 /*2038*/ SPY(68, 17, 0)
 };
 
-extern struct timezone sys_tz;
-
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a9cc75ffa925..7caa01652888 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -298,7 +298,10 @@ cg_found:
 	ufsi->i_oeftflag = 0;
 	ufsi->i_dir_start_lookup = 0;
 	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EIO;
+		goto failed;
+	}
 	mark_inode_dirty(inode);
 
 	if (uspi->fs_magic == UFS2_MAGIC) {
@@ -337,6 +340,7 @@ cg_found:
 fail_remove_inode:
 	unlock_ufs(sb);
 	clear_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	UFSD("EXIT (FAILED): err %d\n", err);
 	return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7c580c97990e..be7d42c7d938 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode)
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 
-	if (want_delete) {
-		lock_ufs(inode->i_sb);
-		ufs_free_inode (inode);
-		unlock_ufs(inode->i_sb);
-	}
+	if (want_delete)
+		ufs_free_inode(inode);
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8f8eba..fd65deb4b5f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
 	if (!err) {
+		unlock_new_inode(inode);
 		d_instantiate(dentry, inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -126,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	if (l > sb->s_blocksize)
 		goto out_notlocked;
 
-	lock_ufs(dir->i_sb);
 	inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out;
+		goto out_notlocked;
 
+	lock_ufs(dir->i_sb);
 	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
 		/* slow symlink */
 		inode->i_op = &ufs_symlink_inode_operations;
@@ -155,6 +157,7 @@ out_notlocked:
 
 out_fail:
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	goto out;
 }
@@ -181,13 +184,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 	struct inode * inode;
 	int err;
 
-	lock_ufs(dir->i_sb);
-	inode_inc_link_count(dir);
-
 	inode = ufs_new_inode(dir, S_IFDIR|mode);
-	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out_dir;
+		return PTR_ERR(inode);
 
 	inode->i_op = &ufs_dir_inode_operations;
 	inode->i_fop = &ufs_dir_operations;
@@ -195,6 +194,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 
 	inode_inc_link_count(inode);
 
+	lock_ufs(dir->i_sb);
+	inode_inc_link_count(dir);
+
 	err = ufs_make_empty(inode, dir);
 	if (err)
 		goto out_fail;
@@ -211,8 +213,8 @@ out:
 out_fail:
 	inode_dec_link_count(inode);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput (inode);
-out_dir:
 	inode_dec_link_count(dir);
 	unlock_ufs(dir->i_sb);
 	goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index de2d26d32844..86df952d3e24 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5424,7 +5424,7 @@ xfs_bmap_shift_extents(
 	struct xfs_bmap_free	*flist,
 	int			num_exts)
 {
-	struct xfs_btree_cur		*cur;
+	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
 	struct xfs_bmbt_irec		got;
 	struct xfs_bmbt_irec		left;
@@ -5435,7 +5435,7 @@ xfs_bmap_shift_extents(
 	int				error = 0;
 	int				i;
 	int				whichfork = XFS_DATA_FORK;
-	int				logflags;
+	int				logflags = 0;
 	xfs_filblks_t			blockcount = 0;
 	int				total_extents;
 
@@ -5478,16 +5478,11 @@ xfs_bmap_shift_extents(
 		}
 	}
 
-	/* We are going to change core inode */
-	logflags = XFS_ILOG_CORE;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.flags = 0;
-	} else {
-		cur = NULL;
-		logflags |= XFS_ILOG_DEXT;
 	}
 
 	/*
@@ -5545,11 +5540,14 @@ xfs_bmap_shift_extents(
 			blockcount = left.br_blockcount +
 				got.br_blockcount;
 			xfs_iext_remove(ip, *current_ext, 1, 0);
+			logflags |= XFS_ILOG_CORE;
 			if (cur) {
 				error = xfs_btree_delete(cur, &i);
 				if (error)
 					goto del_cursor;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			} else {
+				logflags |= XFS_ILOG_DEXT;
 			}
 			XFS_IFORK_NEXT_SET(ip, whichfork,
 				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -5575,6 +5573,7 @@ xfs_bmap_shift_extents(
 			got.br_startoff = startoff;
 		}
 
+		logflags |= XFS_ILOG_CORE;
 		if (cur) {
 			error = xfs_bmbt_update(cur, got.br_startoff,
 						got.br_startblock,
@@ -5582,6 +5581,8 @@ xfs_bmap_shift_extents(
 						got.br_state);
 			if (error)
 				goto del_cursor;
+		} else {
+			logflags |= XFS_ILOG_DEXT;
 		}
 
 		(*current_ext)++;
@@ -5597,6 +5598,7 @@ del_cursor:
 		xfs_btree_del_cursor(cur,
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 
-	xfs_trans_log_inode(tp, ip, logflags);
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
 	return error;
 }
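The xfs_bmap_shift_extents() hunks convert unconditional inode logging into lazy accumulation: logflags starts at 0, XFS_ILOG_CORE and XFS_ILOG_DEXT are OR-ed in only when the corresponding state is actually dirtied, and xfs_trans_log_inode() runs only if something was flagged, so paths that end up changing nothing no longer dirty the inode in the log. The shape of the pattern, reduced to a sketch (the function name and use_btree_cursor flag are illustrative, not XFS code):

/* sketch: accumulate log flags, log once at the end */
int shift_extents_sketch(struct xfs_trans *tp, struct xfs_inode *ip,
			 bool use_btree_cursor)
{
	int logflags = 0;
	int error = 0;

	/* ... for each modification actually performed ... */
	logflags |= XFS_ILOG_CORE;		/* core inode fields changed */
	if (!use_btree_cursor)
		logflags |= XFS_ILOG_DEXT;	/* in-core extent list changed */

	if (logflags)
		xfs_trans_log_inode(tp, ip, logflags);
	return error;
}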
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11e9b4caa54f..b984647c24db 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1753,11 +1753,72 @@ xfs_vm_readpages(
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page		*page)
+{
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	loff_t			end_offset;
+	loff_t			offset;
+	int			newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(inode);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << inode->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) {	/* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
+	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d39a35..1707980f9a4b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1470,6 +1470,26 @@ xfs_collapse_file_space(
 	start_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
+	/*
+	 * Writeback the entire file and force remove any post-eof blocks. The
+	 * writeback prevents changes to the extent list via concurrent
+	 * writeback and the eofblocks trim prevents the extent shift algorithm
+	 * from running into a post-eof delalloc extent.
+	 *
+	 * XXX: This is a temporary fix until the extent shift loop below is
+	 * converted to use offsets and lookups within the ILOCK rather than
+	 * carrying around the index into the extent list for the next
+	 * iteration.
+	 */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		return error;
+	if (xfs_can_free_eofblocks(ip, true)) {
+		error = xfs_free_eofblocks(mp, ip, false);
+		if (error)
+			return error;
+	}
+
 	error = xfs_free_file_space(ip, offset, len);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 076b1708d134..de5368c803f9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -291,12 +291,22 @@ xfs_file_read_iter(
 		if (inode->i_mapping->nrpages) {
 			ret = filemap_write_and_wait_range(
 							VFS_I(ip)->i_mapping,
-							pos, -1);
+							pos, pos + size - 1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
-			truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
@@ -632,10 +642,19 @@ xfs_file_dio_aio_write(
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						   pos, -1);
+						   pos, pos + count - 1);
 		if (ret)
 			goto out;
-		truncate_pagecache_range(VFS_I(ip), pos, -1);
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	/*