diff options
Diffstat (limited to 'fs')
105 files changed, 2043 insertions, 1157 deletions
| @@ -141,6 +141,7 @@ struct kioctx { | |||
| 141 | 141 | ||
| 142 | struct { | 142 | struct { |
| 143 | unsigned tail; | 143 | unsigned tail; |
| 144 | unsigned completed_events; | ||
| 144 | spinlock_t completion_lock; | 145 | spinlock_t completion_lock; |
| 145 | } ____cacheline_aligned_in_smp; | 146 | } ____cacheline_aligned_in_smp; |
| 146 | 147 | ||
| @@ -792,6 +793,8 @@ void exit_aio(struct mm_struct *mm) | |||
| 792 | 793 | ||
| 793 | for (i = 0; i < table->nr; ++i) { | 794 | for (i = 0; i < table->nr; ++i) { |
| 794 | struct kioctx *ctx = table->table[i]; | 795 | struct kioctx *ctx = table->table[i]; |
| 796 | struct completion requests_done = | ||
| 797 | COMPLETION_INITIALIZER_ONSTACK(requests_done); | ||
| 795 | 798 | ||
| 796 | if (!ctx) | 799 | if (!ctx) |
| 797 | continue; | 800 | continue; |
| @@ -803,7 +806,10 @@ void exit_aio(struct mm_struct *mm) | |||
| 803 | * that it needs to unmap the area, just set it to 0. | 806 | * that it needs to unmap the area, just set it to 0. |
| 804 | */ | 807 | */ |
| 805 | ctx->mmap_size = 0; | 808 | ctx->mmap_size = 0; |
| 806 | kill_ioctx(mm, ctx, NULL); | 809 | kill_ioctx(mm, ctx, &requests_done); |
| 810 | |||
| 811 | /* Wait until all IO for the context are done. */ | ||
| 812 | wait_for_completion(&requests_done); | ||
| 807 | } | 813 | } |
| 808 | 814 | ||
| 809 | RCU_INIT_POINTER(mm->ioctx_table, NULL); | 815 | RCU_INIT_POINTER(mm->ioctx_table, NULL); |
| @@ -857,6 +863,68 @@ out: | |||
| 857 | return ret; | 863 | return ret; |
| 858 | } | 864 | } |
| 859 | 865 | ||
| 866 | /* refill_reqs_available | ||
| 867 | * Updates the reqs_available reference counts used for tracking the | ||
| 868 | * number of free slots in the completion ring. This can be called | ||
| 869 | * from aio_complete() (to optimistically update reqs_available) or | ||
| 870 | * from aio_get_req() (the we're out of events case). It must be | ||
| 871 | * called holding ctx->completion_lock. | ||
| 872 | */ | ||
| 873 | static void refill_reqs_available(struct kioctx *ctx, unsigned head, | ||
| 874 | unsigned tail) | ||
| 875 | { | ||
| 876 | unsigned events_in_ring, completed; | ||
| 877 | |||
| 878 | /* Clamp head since userland can write to it. */ | ||
| 879 | head %= ctx->nr_events; | ||
| 880 | if (head <= tail) | ||
| 881 | events_in_ring = tail - head; | ||
| 882 | else | ||
| 883 | events_in_ring = ctx->nr_events - (head - tail); | ||
| 884 | |||
| 885 | completed = ctx->completed_events; | ||
| 886 | if (events_in_ring < completed) | ||
| 887 | completed -= events_in_ring; | ||
| 888 | else | ||
| 889 | completed = 0; | ||
| 890 | |||
| 891 | if (!completed) | ||
| 892 | return; | ||
| 893 | |||
| 894 | ctx->completed_events -= completed; | ||
| 895 | put_reqs_available(ctx, completed); | ||
| 896 | } | ||
| 897 | |||
| 898 | /* user_refill_reqs_available | ||
| 899 | * Called to refill reqs_available when aio_get_req() encounters an | ||
| 900 | * out of space in the completion ring. | ||
| 901 | */ | ||
| 902 | static void user_refill_reqs_available(struct kioctx *ctx) | ||
| 903 | { | ||
| 904 | spin_lock_irq(&ctx->completion_lock); | ||
| 905 | if (ctx->completed_events) { | ||
| 906 | struct aio_ring *ring; | ||
| 907 | unsigned head; | ||
| 908 | |||
| 909 | /* Access of ring->head may race with aio_read_events_ring() | ||
| 910 | * here, but that's okay since whether we read the old version | ||
| 911 | * or the new version, and either will be valid. The important | ||
| 912 | * part is that head cannot pass tail since we prevent | ||
| 913 | * aio_complete() from updating tail by holding | ||
| 914 | * ctx->completion_lock. Even if head is invalid, the check | ||
| 915 | * against ctx->completed_events below will make sure we do the | ||
| 916 | * safe/right thing. | ||
| 917 | */ | ||
| 918 | ring = kmap_atomic(ctx->ring_pages[0]); | ||
| 919 | head = ring->head; | ||
| 920 | kunmap_atomic(ring); | ||
| 921 | |||
| 922 | refill_reqs_available(ctx, head, ctx->tail); | ||
| 923 | } | ||
| 924 | |||
| 925 | spin_unlock_irq(&ctx->completion_lock); | ||
| 926 | } | ||
| 927 | |||
| 860 | /* aio_get_req | 928 | /* aio_get_req |
| 861 | * Allocate a slot for an aio request. | 929 | * Allocate a slot for an aio request. |
| 862 | * Returns NULL if no requests are free. | 930 | * Returns NULL if no requests are free. |
| @@ -865,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) | |||
| 865 | { | 933 | { |
| 866 | struct kiocb *req; | 934 | struct kiocb *req; |
| 867 | 935 | ||
| 868 | if (!get_reqs_available(ctx)) | 936 | if (!get_reqs_available(ctx)) { |
| 869 | return NULL; | 937 | user_refill_reqs_available(ctx); |
| 938 | if (!get_reqs_available(ctx)) | ||
| 939 | return NULL; | ||
| 940 | } | ||
| 870 | 941 | ||
| 871 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); | 942 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); |
| 872 | if (unlikely(!req)) | 943 | if (unlikely(!req)) |
| @@ -925,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
| 925 | struct kioctx *ctx = iocb->ki_ctx; | 996 | struct kioctx *ctx = iocb->ki_ctx; |
| 926 | struct aio_ring *ring; | 997 | struct aio_ring *ring; |
| 927 | struct io_event *ev_page, *event; | 998 | struct io_event *ev_page, *event; |
| 999 | unsigned tail, pos, head; | ||
| 928 | unsigned long flags; | 1000 | unsigned long flags; |
| 929 | unsigned tail, pos; | ||
| 930 | 1001 | ||
| 931 | /* | 1002 | /* |
| 932 | * Special case handling for sync iocbs: | 1003 | * Special case handling for sync iocbs: |
| @@ -987,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
| 987 | ctx->tail = tail; | 1058 | ctx->tail = tail; |
| 988 | 1059 | ||
| 989 | ring = kmap_atomic(ctx->ring_pages[0]); | 1060 | ring = kmap_atomic(ctx->ring_pages[0]); |
| 1061 | head = ring->head; | ||
| 990 | ring->tail = tail; | 1062 | ring->tail = tail; |
| 991 | kunmap_atomic(ring); | 1063 | kunmap_atomic(ring); |
| 992 | flush_dcache_page(ctx->ring_pages[0]); | 1064 | flush_dcache_page(ctx->ring_pages[0]); |
| 993 | 1065 | ||
| 1066 | ctx->completed_events++; | ||
| 1067 | if (ctx->completed_events > 1) | ||
| 1068 | refill_reqs_available(ctx, head, tail); | ||
| 994 | spin_unlock_irqrestore(&ctx->completion_lock, flags); | 1069 | spin_unlock_irqrestore(&ctx->completion_lock, flags); |
| 995 | 1070 | ||
| 996 | pr_debug("added to ring %p at [%u]\n", iocb, tail); | 1071 | pr_debug("added to ring %p at [%u]\n", iocb, tail); |
| @@ -1005,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
| 1005 | 1080 | ||
| 1006 | /* everything turned out well, dispose of the aiocb. */ | 1081 | /* everything turned out well, dispose of the aiocb. */ |
| 1007 | kiocb_free(iocb); | 1082 | kiocb_free(iocb); |
| 1008 | put_reqs_available(ctx, 1); | ||
| 1009 | 1083 | ||
| 1010 | /* | 1084 | /* |
| 1011 | * We have to order our ring_info tail store above and test | 1085 | * We have to order our ring_info tail store above and test |
| @@ -1042,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
| 1042 | tail = ring->tail; | 1116 | tail = ring->tail; |
| 1043 | kunmap_atomic(ring); | 1117 | kunmap_atomic(ring); |
| 1044 | 1118 | ||
| 1119 | /* | ||
| 1120 | * Ensure that once we've read the current tail pointer, that | ||
| 1121 | * we also see the events that were stored up to the tail. | ||
| 1122 | */ | ||
| 1123 | smp_rmb(); | ||
| 1124 | |||
| 1045 | pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); | 1125 | pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); |
| 1046 | 1126 | ||
| 1047 | if (head == tail) | 1127 | if (head == tail) |
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5a201d81049c..fbd76ded9a34 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c | |||
| @@ -22,7 +22,6 @@ | |||
| 22 | #include <linux/list.h> | 22 | #include <linux/list.h> |
| 23 | #include <linux/spinlock.h> | 23 | #include <linux/spinlock.h> |
| 24 | #include <linux/freezer.h> | 24 | #include <linux/freezer.h> |
| 25 | #include <linux/workqueue.h> | ||
| 26 | #include "async-thread.h" | 25 | #include "async-thread.h" |
| 27 | #include "ctree.h" | 26 | #include "ctree.h" |
| 28 | 27 | ||
| @@ -55,8 +54,39 @@ struct btrfs_workqueue { | |||
| 55 | struct __btrfs_workqueue *high; | 54 | struct __btrfs_workqueue *high; |
| 56 | }; | 55 | }; |
| 57 | 56 | ||
| 58 | static inline struct __btrfs_workqueue | 57 | static void normal_work_helper(struct btrfs_work *work); |
| 59 | *__btrfs_alloc_workqueue(const char *name, int flags, int max_active, | 58 | |
| 59 | #define BTRFS_WORK_HELPER(name) \ | ||
| 60 | void btrfs_##name(struct work_struct *arg) \ | ||
| 61 | { \ | ||
| 62 | struct btrfs_work *work = container_of(arg, struct btrfs_work, \ | ||
| 63 | normal_work); \ | ||
| 64 | normal_work_helper(work); \ | ||
| 65 | } | ||
| 66 | |||
| 67 | BTRFS_WORK_HELPER(worker_helper); | ||
| 68 | BTRFS_WORK_HELPER(delalloc_helper); | ||
| 69 | BTRFS_WORK_HELPER(flush_delalloc_helper); | ||
| 70 | BTRFS_WORK_HELPER(cache_helper); | ||
| 71 | BTRFS_WORK_HELPER(submit_helper); | ||
| 72 | BTRFS_WORK_HELPER(fixup_helper); | ||
| 73 | BTRFS_WORK_HELPER(endio_helper); | ||
| 74 | BTRFS_WORK_HELPER(endio_meta_helper); | ||
| 75 | BTRFS_WORK_HELPER(endio_meta_write_helper); | ||
| 76 | BTRFS_WORK_HELPER(endio_raid56_helper); | ||
| 77 | BTRFS_WORK_HELPER(rmw_helper); | ||
| 78 | BTRFS_WORK_HELPER(endio_write_helper); | ||
| 79 | BTRFS_WORK_HELPER(freespace_write_helper); | ||
| 80 | BTRFS_WORK_HELPER(delayed_meta_helper); | ||
| 81 | BTRFS_WORK_HELPER(readahead_helper); | ||
| 82 | BTRFS_WORK_HELPER(qgroup_rescan_helper); | ||
| 83 | BTRFS_WORK_HELPER(extent_refs_helper); | ||
| 84 | BTRFS_WORK_HELPER(scrub_helper); | ||
| 85 | BTRFS_WORK_HELPER(scrubwrc_helper); | ||
| 86 | BTRFS_WORK_HELPER(scrubnc_helper); | ||
| 87 | |||
| 88 | static struct __btrfs_workqueue * | ||
| 89 | __btrfs_alloc_workqueue(const char *name, int flags, int max_active, | ||
| 60 | int thresh) | 90 | int thresh) |
| 61 | { | 91 | { |
| 62 | struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); | 92 | struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); |
| @@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) | |||
| 232 | spin_unlock_irqrestore(lock, flags); | 262 | spin_unlock_irqrestore(lock, flags); |
| 233 | } | 263 | } |
| 234 | 264 | ||
| 235 | static void normal_work_helper(struct work_struct *arg) | 265 | static void normal_work_helper(struct btrfs_work *work) |
| 236 | { | 266 | { |
| 237 | struct btrfs_work *work; | ||
| 238 | struct __btrfs_workqueue *wq; | 267 | struct __btrfs_workqueue *wq; |
| 239 | int need_order = 0; | 268 | int need_order = 0; |
| 240 | 269 | ||
| 241 | work = container_of(arg, struct btrfs_work, normal_work); | ||
| 242 | /* | 270 | /* |
| 243 | * We should not touch things inside work in the following cases: | 271 | * We should not touch things inside work in the following cases: |
| 244 | * 1) after work->func() if it has no ordered_free | 272 | * 1) after work->func() if it has no ordered_free |
| @@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg) | |||
| 262 | trace_btrfs_all_work_done(work); | 290 | trace_btrfs_all_work_done(work); |
| 263 | } | 291 | } |
| 264 | 292 | ||
| 265 | void btrfs_init_work(struct btrfs_work *work, | 293 | void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, |
| 266 | btrfs_func_t func, | 294 | btrfs_func_t func, |
| 267 | btrfs_func_t ordered_func, | 295 | btrfs_func_t ordered_func, |
| 268 | btrfs_func_t ordered_free) | 296 | btrfs_func_t ordered_free) |
| @@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work, | |||
| 270 | work->func = func; | 298 | work->func = func; |
| 271 | work->ordered_func = ordered_func; | 299 | work->ordered_func = ordered_func; |
| 272 | work->ordered_free = ordered_free; | 300 | work->ordered_free = ordered_free; |
| 273 | INIT_WORK(&work->normal_work, normal_work_helper); | 301 | INIT_WORK(&work->normal_work, uniq_func); |
| 274 | INIT_LIST_HEAD(&work->ordered_list); | 302 | INIT_LIST_HEAD(&work->ordered_list); |
| 275 | work->flags = 0; | 303 | work->flags = 0; |
| 276 | } | 304 | } |
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 9c6b66d15fb0..e9e31c94758f 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h | |||
| @@ -19,12 +19,14 @@ | |||
| 19 | 19 | ||
| 20 | #ifndef __BTRFS_ASYNC_THREAD_ | 20 | #ifndef __BTRFS_ASYNC_THREAD_ |
| 21 | #define __BTRFS_ASYNC_THREAD_ | 21 | #define __BTRFS_ASYNC_THREAD_ |
| 22 | #include <linux/workqueue.h> | ||
| 22 | 23 | ||
| 23 | struct btrfs_workqueue; | 24 | struct btrfs_workqueue; |
| 24 | /* Internal use only */ | 25 | /* Internal use only */ |
| 25 | struct __btrfs_workqueue; | 26 | struct __btrfs_workqueue; |
| 26 | struct btrfs_work; | 27 | struct btrfs_work; |
| 27 | typedef void (*btrfs_func_t)(struct btrfs_work *arg); | 28 | typedef void (*btrfs_func_t)(struct btrfs_work *arg); |
| 29 | typedef void (*btrfs_work_func_t)(struct work_struct *arg); | ||
| 28 | 30 | ||
| 29 | struct btrfs_work { | 31 | struct btrfs_work { |
| 30 | btrfs_func_t func; | 32 | btrfs_func_t func; |
| @@ -38,11 +40,35 @@ struct btrfs_work { | |||
| 38 | unsigned long flags; | 40 | unsigned long flags; |
| 39 | }; | 41 | }; |
| 40 | 42 | ||
| 43 | #define BTRFS_WORK_HELPER_PROTO(name) \ | ||
| 44 | void btrfs_##name(struct work_struct *arg) | ||
| 45 | |||
| 46 | BTRFS_WORK_HELPER_PROTO(worker_helper); | ||
| 47 | BTRFS_WORK_HELPER_PROTO(delalloc_helper); | ||
| 48 | BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); | ||
| 49 | BTRFS_WORK_HELPER_PROTO(cache_helper); | ||
| 50 | BTRFS_WORK_HELPER_PROTO(submit_helper); | ||
| 51 | BTRFS_WORK_HELPER_PROTO(fixup_helper); | ||
| 52 | BTRFS_WORK_HELPER_PROTO(endio_helper); | ||
| 53 | BTRFS_WORK_HELPER_PROTO(endio_meta_helper); | ||
| 54 | BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); | ||
| 55 | BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); | ||
| 56 | BTRFS_WORK_HELPER_PROTO(rmw_helper); | ||
| 57 | BTRFS_WORK_HELPER_PROTO(endio_write_helper); | ||
| 58 | BTRFS_WORK_HELPER_PROTO(freespace_write_helper); | ||
| 59 | BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); | ||
| 60 | BTRFS_WORK_HELPER_PROTO(readahead_helper); | ||
| 61 | BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); | ||
| 62 | BTRFS_WORK_HELPER_PROTO(extent_refs_helper); | ||
| 63 | BTRFS_WORK_HELPER_PROTO(scrub_helper); | ||
| 64 | BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); | ||
| 65 | BTRFS_WORK_HELPER_PROTO(scrubnc_helper); | ||
| 66 | |||
| 41 | struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, | 67 | struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, |
| 42 | int flags, | 68 | int flags, |
| 43 | int max_active, | 69 | int max_active, |
| 44 | int thresh); | 70 | int thresh); |
| 45 | void btrfs_init_work(struct btrfs_work *work, | 71 | void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, |
| 46 | btrfs_func_t func, | 72 | btrfs_func_t func, |
| 47 | btrfs_func_t ordered_func, | 73 | btrfs_func_t ordered_func, |
| 48 | btrfs_func_t ordered_free); | 74 | btrfs_func_t ordered_free); |
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index da775bfdebc9..a2e90f855d7d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
| @@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, | |||
| 1395 | return -ENOMEM; | 1395 | return -ENOMEM; |
| 1396 | 1396 | ||
| 1397 | async_work->delayed_root = delayed_root; | 1397 | async_work->delayed_root = delayed_root; |
| 1398 | btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, | 1398 | btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, |
| 1399 | NULL, NULL); | 1399 | btrfs_async_run_delayed_root, NULL, NULL); |
| 1400 | async_work->nr = nr; | 1400 | async_work->nr = nr; |
| 1401 | 1401 | ||
| 1402 | btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); | 1402 | btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0ed9e664f7d..a1d36e62179c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
| @@ -39,7 +39,6 @@ | |||
| 39 | #include "btrfs_inode.h" | 39 | #include "btrfs_inode.h" |
| 40 | #include "volumes.h" | 40 | #include "volumes.h" |
| 41 | #include "print-tree.h" | 41 | #include "print-tree.h" |
| 42 | #include "async-thread.h" | ||
| 43 | #include "locking.h" | 42 | #include "locking.h" |
| 44 | #include "tree-log.h" | 43 | #include "tree-log.h" |
| 45 | #include "free-space-cache.h" | 44 | #include "free-space-cache.h" |
| @@ -693,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
| 693 | { | 692 | { |
| 694 | struct end_io_wq *end_io_wq = bio->bi_private; | 693 | struct end_io_wq *end_io_wq = bio->bi_private; |
| 695 | struct btrfs_fs_info *fs_info; | 694 | struct btrfs_fs_info *fs_info; |
| 695 | struct btrfs_workqueue *wq; | ||
| 696 | btrfs_work_func_t func; | ||
| 696 | 697 | ||
| 697 | fs_info = end_io_wq->info; | 698 | fs_info = end_io_wq->info; |
| 698 | end_io_wq->error = err; | 699 | end_io_wq->error = err; |
| 699 | btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); | ||
| 700 | 700 | ||
| 701 | if (bio->bi_rw & REQ_WRITE) { | 701 | if (bio->bi_rw & REQ_WRITE) { |
| 702 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) | 702 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { |
| 703 | btrfs_queue_work(fs_info->endio_meta_write_workers, | 703 | wq = fs_info->endio_meta_write_workers; |
| 704 | &end_io_wq->work); | 704 | func = btrfs_endio_meta_write_helper; |
| 705 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) | 705 | } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { |
| 706 | btrfs_queue_work(fs_info->endio_freespace_worker, | 706 | wq = fs_info->endio_freespace_worker; |
| 707 | &end_io_wq->work); | 707 | func = btrfs_freespace_write_helper; |
| 708 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) | 708 | } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { |
| 709 | btrfs_queue_work(fs_info->endio_raid56_workers, | 709 | wq = fs_info->endio_raid56_workers; |
| 710 | &end_io_wq->work); | 710 | func = btrfs_endio_raid56_helper; |
| 711 | else | 711 | } else { |
| 712 | btrfs_queue_work(fs_info->endio_write_workers, | 712 | wq = fs_info->endio_write_workers; |
| 713 | &end_io_wq->work); | 713 | func = btrfs_endio_write_helper; |
| 714 | } | ||
| 714 | } else { | 715 | } else { |
| 715 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) | 716 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { |
| 716 | btrfs_queue_work(fs_info->endio_raid56_workers, | 717 | wq = fs_info->endio_raid56_workers; |
| 717 | &end_io_wq->work); | 718 | func = btrfs_endio_raid56_helper; |
| 718 | else if (end_io_wq->metadata) | 719 | } else if (end_io_wq->metadata) { |
| 719 | btrfs_queue_work(fs_info->endio_meta_workers, | 720 | wq = fs_info->endio_meta_workers; |
| 720 | &end_io_wq->work); | 721 | func = btrfs_endio_meta_helper; |
| 721 | else | 722 | } else { |
| 722 | btrfs_queue_work(fs_info->endio_workers, | 723 | wq = fs_info->endio_workers; |
| 723 | &end_io_wq->work); | 724 | func = btrfs_endio_helper; |
| 725 | } | ||
| 724 | } | 726 | } |
| 727 | |||
| 728 | btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); | ||
| 729 | btrfs_queue_work(wq, &end_io_wq->work); | ||
| 725 | } | 730 | } |
| 726 | 731 | ||
| 727 | /* | 732 | /* |
| @@ -828,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, | |||
| 828 | async->submit_bio_start = submit_bio_start; | 833 | async->submit_bio_start = submit_bio_start; |
| 829 | async->submit_bio_done = submit_bio_done; | 834 | async->submit_bio_done = submit_bio_done; |
| 830 | 835 | ||
| 831 | btrfs_init_work(&async->work, run_one_async_start, | 836 | btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, |
| 832 | run_one_async_done, run_one_async_free); | 837 | run_one_async_done, run_one_async_free); |
| 833 | 838 | ||
| 834 | async->bio_flags = bio_flags; | 839 | async->bio_flags = bio_flags; |
| @@ -3450,7 +3455,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
| 3450 | btrfs_set_stack_device_generation(dev_item, 0); | 3455 | btrfs_set_stack_device_generation(dev_item, 0); |
| 3451 | btrfs_set_stack_device_type(dev_item, dev->type); | 3456 | btrfs_set_stack_device_type(dev_item, dev->type); |
| 3452 | btrfs_set_stack_device_id(dev_item, dev->devid); | 3457 | btrfs_set_stack_device_id(dev_item, dev->devid); |
| 3453 | btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); | 3458 | btrfs_set_stack_device_total_bytes(dev_item, |
| 3459 | dev->disk_total_bytes); | ||
| 3454 | btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); | 3460 | btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); |
| 3455 | btrfs_set_stack_device_io_align(dev_item, dev->io_align); | 3461 | btrfs_set_stack_device_io_align(dev_item, dev->io_align); |
| 3456 | btrfs_set_stack_device_io_width(dev_item, dev->io_width); | 3462 | btrfs_set_stack_device_io_width(dev_item, dev->io_width); |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 102ed3143976..3efe1c3877bf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
| @@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
| 552 | caching_ctl->block_group = cache; | 552 | caching_ctl->block_group = cache; |
| 553 | caching_ctl->progress = cache->key.objectid; | 553 | caching_ctl->progress = cache->key.objectid; |
| 554 | atomic_set(&caching_ctl->count, 1); | 554 | atomic_set(&caching_ctl->count, 1); |
| 555 | btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); | 555 | btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, |
| 556 | caching_thread, NULL, NULL); | ||
| 556 | 557 | ||
| 557 | spin_lock(&cache->lock); | 558 | spin_lock(&cache->lock); |
| 558 | /* | 559 | /* |
| @@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, | |||
| 2749 | async->sync = 0; | 2750 | async->sync = 0; |
| 2750 | init_completion(&async->wait); | 2751 | init_completion(&async->wait); |
| 2751 | 2752 | ||
| 2752 | btrfs_init_work(&async->work, delayed_ref_async_start, | 2753 | btrfs_init_work(&async->work, btrfs_extent_refs_helper, |
| 2753 | NULL, NULL); | 2754 | delayed_ref_async_start, NULL, NULL); |
| 2754 | 2755 | ||
| 2755 | btrfs_queue_work(root->fs_info->extent_workers, &async->work); | 2756 | btrfs_queue_work(root->fs_info->extent_workers, &async->work); |
| 2756 | 2757 | ||
| @@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) | |||
| 3586 | */ | 3587 | */ |
| 3587 | static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | 3588 | static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) |
| 3588 | { | 3589 | { |
| 3589 | /* | 3590 | u64 num_devices = root->fs_info->fs_devices->rw_devices; |
| 3590 | * we add in the count of missing devices because we want | ||
| 3591 | * to make sure that any RAID levels on a degraded FS | ||
| 3592 | * continue to be honored. | ||
| 3593 | */ | ||
| 3594 | u64 num_devices = root->fs_info->fs_devices->rw_devices + | ||
| 3595 | root->fs_info->fs_devices->missing_devices; | ||
| 3596 | u64 target; | 3591 | u64 target; |
| 3597 | u64 tmp; | 3592 | u64 tmp; |
| 3598 | 3593 | ||
| @@ -8440,13 +8435,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | |||
| 8440 | if (stripped) | 8435 | if (stripped) |
| 8441 | return extended_to_chunk(stripped); | 8436 | return extended_to_chunk(stripped); |
| 8442 | 8437 | ||
| 8443 | /* | 8438 | num_devices = root->fs_info->fs_devices->rw_devices; |
| 8444 | * we add in the count of missing devices because we want | ||
| 8445 | * to make sure that any RAID levels on a degraded FS | ||
| 8446 | * continue to be honored. | ||
| 8447 | */ | ||
| 8448 | num_devices = root->fs_info->fs_devices->rw_devices + | ||
| 8449 | root->fs_info->fs_devices->missing_devices; | ||
| 8450 | 8439 | ||
| 8451 | stripped = BTRFS_BLOCK_GROUP_RAID0 | | 8440 | stripped = BTRFS_BLOCK_GROUP_RAID0 | |
| 8452 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | | 8441 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e11aab9f391..af0359dcf337 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
| @@ -2532,6 +2532,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
| 2532 | test_bit(BIO_UPTODATE, &bio->bi_flags); | 2532 | test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 2533 | if (err) | 2533 | if (err) |
| 2534 | uptodate = 0; | 2534 | uptodate = 0; |
| 2535 | offset += len; | ||
| 2535 | continue; | 2536 | continue; |
| 2536 | } | 2537 | } |
| 2537 | } | 2538 | } |
| @@ -4207,8 +4208,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
| 4207 | return -ENOMEM; | 4208 | return -ENOMEM; |
| 4208 | path->leave_spinning = 1; | 4209 | path->leave_spinning = 1; |
| 4209 | 4210 | ||
| 4210 | start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); | 4211 | start = round_down(start, BTRFS_I(inode)->root->sectorsize); |
| 4211 | len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); | 4212 | len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; |
| 4212 | 4213 | ||
| 4213 | /* | 4214 | /* |
| 4214 | * lookup the last file extent. We're not using i_size here | 4215 | * lookup the last file extent. We're not using i_size here |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d3afac292d67..ff1cc0399b9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
| @@ -1840,7 +1840,15 @@ int btrfs_release_file(struct inode *inode, struct file *filp) | |||
| 1840 | { | 1840 | { |
| 1841 | if (filp->private_data) | 1841 | if (filp->private_data) |
| 1842 | btrfs_ioctl_trans_end(filp); | 1842 | btrfs_ioctl_trans_end(filp); |
| 1843 | filemap_flush(inode->i_mapping); | 1843 | /* |
| 1844 | * ordered_data_close is set by settattr when we are about to truncate | ||
| 1845 | * a file from a non-zero size to a zero size. This tries to | ||
| 1846 | * flush down new bytes that may have been written if the | ||
| 1847 | * application were using truncate to replace a file in place. | ||
| 1848 | */ | ||
| 1849 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, | ||
| 1850 | &BTRFS_I(inode)->runtime_flags)) | ||
| 1851 | filemap_flush(inode->i_mapping); | ||
| 1844 | return 0; | 1852 | return 0; |
| 1845 | } | 1853 | } |
| 1846 | 1854 | ||
| @@ -1958,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1958 | 1966 | ||
| 1959 | btrfs_init_log_ctx(&ctx); | 1967 | btrfs_init_log_ctx(&ctx); |
| 1960 | 1968 | ||
| 1961 | ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); | 1969 | ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); |
| 1962 | if (ret < 0) { | 1970 | if (ret < 0) { |
| 1963 | /* Fallthrough and commit/free transaction. */ | 1971 | /* Fallthrough and commit/free transaction. */ |
| 1964 | ret = 1; | 1972 | ret = 1; |
| @@ -2088,10 +2096,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, | |||
| 2088 | goto out; | 2096 | goto out; |
| 2089 | } | 2097 | } |
| 2090 | 2098 | ||
| 2091 | if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { | 2099 | if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { |
| 2092 | u64 num_bytes; | 2100 | u64 num_bytes; |
| 2093 | 2101 | ||
| 2094 | path->slots[0]++; | ||
| 2095 | key.offset = offset; | 2102 | key.offset = offset; |
| 2096 | btrfs_set_item_key_safe(root, path, &key); | 2103 | btrfs_set_item_key_safe(root, path, &key); |
| 2097 | fi = btrfs_item_ptr(leaf, path->slots[0], | 2104 | fi = btrfs_item_ptr(leaf, path->slots[0], |
| @@ -2216,7 +2223,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
| 2216 | goto out_only_mutex; | 2223 | goto out_only_mutex; |
| 2217 | } | 2224 | } |
| 2218 | 2225 | ||
| 2219 | lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); | 2226 | lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); |
| 2220 | lockend = round_down(offset + len, | 2227 | lockend = round_down(offset + len, |
| 2221 | BTRFS_I(inode)->root->sectorsize) - 1; | 2228 | BTRFS_I(inode)->root->sectorsize) - 1; |
| 2222 | same_page = ((offset >> PAGE_CACHE_SHIFT) == | 2229 | same_page = ((offset >> PAGE_CACHE_SHIFT) == |
| @@ -2277,7 +2284,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
| 2277 | tail_start + tail_len, 0, 1); | 2284 | tail_start + tail_len, 0, 1); |
| 2278 | if (ret) | 2285 | if (ret) |
| 2279 | goto out_only_mutex; | 2286 | goto out_only_mutex; |
| 2280 | } | 2287 | } |
| 2281 | } | 2288 | } |
| 2282 | } | 2289 | } |
| 2283 | 2290 | ||
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 03708ef3deef..016c403bfe7e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -778,8 +778,12 @@ retry: | |||
| 778 | ins.offset, | 778 | ins.offset, |
| 779 | BTRFS_ORDERED_COMPRESSED, | 779 | BTRFS_ORDERED_COMPRESSED, |
| 780 | async_extent->compress_type); | 780 | async_extent->compress_type); |
| 781 | if (ret) | 781 | if (ret) { |
| 782 | btrfs_drop_extent_cache(inode, async_extent->start, | ||
| 783 | async_extent->start + | ||
| 784 | async_extent->ram_size - 1, 0); | ||
| 782 | goto out_free_reserve; | 785 | goto out_free_reserve; |
| 786 | } | ||
| 783 | 787 | ||
| 784 | /* | 788 | /* |
| 785 | * clear dirty, set writeback and unlock the pages. | 789 | * clear dirty, set writeback and unlock the pages. |
| @@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode, | |||
| 971 | ret = btrfs_add_ordered_extent(inode, start, ins.objectid, | 975 | ret = btrfs_add_ordered_extent(inode, start, ins.objectid, |
| 972 | ram_size, cur_alloc_size, 0); | 976 | ram_size, cur_alloc_size, 0); |
| 973 | if (ret) | 977 | if (ret) |
| 974 | goto out_reserve; | 978 | goto out_drop_extent_cache; |
| 975 | 979 | ||
| 976 | if (root->root_key.objectid == | 980 | if (root->root_key.objectid == |
| 977 | BTRFS_DATA_RELOC_TREE_OBJECTID) { | 981 | BTRFS_DATA_RELOC_TREE_OBJECTID) { |
| 978 | ret = btrfs_reloc_clone_csums(inode, start, | 982 | ret = btrfs_reloc_clone_csums(inode, start, |
| 979 | cur_alloc_size); | 983 | cur_alloc_size); |
| 980 | if (ret) | 984 | if (ret) |
| 981 | goto out_reserve; | 985 | goto out_drop_extent_cache; |
| 982 | } | 986 | } |
| 983 | 987 | ||
| 984 | if (disk_num_bytes < cur_alloc_size) | 988 | if (disk_num_bytes < cur_alloc_size) |
| @@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode, | |||
| 1006 | out: | 1010 | out: |
| 1007 | return ret; | 1011 | return ret; |
| 1008 | 1012 | ||
| 1013 | out_drop_extent_cache: | ||
| 1014 | btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); | ||
| 1009 | out_reserve: | 1015 | out_reserve: |
| 1010 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); | 1016 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); |
| 1011 | out_unlock: | 1017 | out_unlock: |
| @@ -1096,8 +1102,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, | |||
| 1096 | async_cow->end = cur_end; | 1102 | async_cow->end = cur_end; |
| 1097 | INIT_LIST_HEAD(&async_cow->extents); | 1103 | INIT_LIST_HEAD(&async_cow->extents); |
| 1098 | 1104 | ||
| 1099 | btrfs_init_work(&async_cow->work, async_cow_start, | 1105 | btrfs_init_work(&async_cow->work, |
| 1100 | async_cow_submit, async_cow_free); | 1106 | btrfs_delalloc_helper, |
| 1107 | async_cow_start, async_cow_submit, | ||
| 1108 | async_cow_free); | ||
| 1101 | 1109 | ||
| 1102 | nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> | 1110 | nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> |
| 1103 | PAGE_CACHE_SHIFT; | 1111 | PAGE_CACHE_SHIFT; |
| @@ -1881,7 +1889,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) | |||
| 1881 | 1889 | ||
| 1882 | SetPageChecked(page); | 1890 | SetPageChecked(page); |
| 1883 | page_cache_get(page); | 1891 | page_cache_get(page); |
| 1884 | btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); | 1892 | btrfs_init_work(&fixup->work, btrfs_fixup_helper, |
| 1893 | btrfs_writepage_fixup_worker, NULL, NULL); | ||
| 1885 | fixup->page = page; | 1894 | fixup->page = page; |
| 1886 | btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); | 1895 | btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); |
| 1887 | return -EBUSY; | 1896 | return -EBUSY; |
| @@ -2822,7 +2831,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
| 2822 | struct inode *inode = page->mapping->host; | 2831 | struct inode *inode = page->mapping->host; |
| 2823 | struct btrfs_root *root = BTRFS_I(inode)->root; | 2832 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 2824 | struct btrfs_ordered_extent *ordered_extent = NULL; | 2833 | struct btrfs_ordered_extent *ordered_extent = NULL; |
| 2825 | struct btrfs_workqueue *workers; | 2834 | struct btrfs_workqueue *wq; |
| 2835 | btrfs_work_func_t func; | ||
| 2826 | 2836 | ||
| 2827 | trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); | 2837 | trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); |
| 2828 | 2838 | ||
| @@ -2831,13 +2841,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
| 2831 | end - start + 1, uptodate)) | 2841 | end - start + 1, uptodate)) |
| 2832 | return 0; | 2842 | return 0; |
| 2833 | 2843 | ||
| 2834 | btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); | 2844 | if (btrfs_is_free_space_inode(inode)) { |
| 2845 | wq = root->fs_info->endio_freespace_worker; | ||
| 2846 | func = btrfs_freespace_write_helper; | ||
| 2847 | } else { | ||
| 2848 | wq = root->fs_info->endio_write_workers; | ||
| 2849 | func = btrfs_endio_write_helper; | ||
| 2850 | } | ||
| 2835 | 2851 | ||
| 2836 | if (btrfs_is_free_space_inode(inode)) | 2852 | btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, |
| 2837 | workers = root->fs_info->endio_freespace_worker; | 2853 | NULL); |
| 2838 | else | 2854 | btrfs_queue_work(wq, &ordered_extent->work); |
| 2839 | workers = root->fs_info->endio_write_workers; | ||
| 2840 | btrfs_queue_work(workers, &ordered_extent->work); | ||
| 2841 | 2855 | ||
| 2842 | return 0; | 2856 | return 0; |
| 2843 | } | 2857 | } |
| @@ -4234,7 +4248,8 @@ out: | |||
| 4234 | btrfs_abort_transaction(trans, root, ret); | 4248 | btrfs_abort_transaction(trans, root, ret); |
| 4235 | } | 4249 | } |
| 4236 | error: | 4250 | error: |
| 4237 | if (last_size != (u64)-1) | 4251 | if (last_size != (u64)-1 && |
| 4252 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
| 4238 | btrfs_ordered_update_i_size(inode, last_size, NULL); | 4253 | btrfs_ordered_update_i_size(inode, last_size, NULL); |
| 4239 | btrfs_free_path(path); | 4254 | btrfs_free_path(path); |
| 4240 | return err; | 4255 | return err; |
| @@ -4674,6 +4689,11 @@ static void evict_inode_truncate_pages(struct inode *inode) | |||
| 4674 | clear_bit(EXTENT_FLAG_LOGGING, &em->flags); | 4689 | clear_bit(EXTENT_FLAG_LOGGING, &em->flags); |
| 4675 | remove_extent_mapping(map_tree, em); | 4690 | remove_extent_mapping(map_tree, em); |
| 4676 | free_extent_map(em); | 4691 | free_extent_map(em); |
| 4692 | if (need_resched()) { | ||
| 4693 | write_unlock(&map_tree->lock); | ||
| 4694 | cond_resched(); | ||
| 4695 | write_lock(&map_tree->lock); | ||
| 4696 | } | ||
| 4677 | } | 4697 | } |
| 4678 | write_unlock(&map_tree->lock); | 4698 | write_unlock(&map_tree->lock); |
| 4679 | 4699 | ||
| @@ -4696,6 +4716,7 @@ static void evict_inode_truncate_pages(struct inode *inode) | |||
| 4696 | &cached_state, GFP_NOFS); | 4716 | &cached_state, GFP_NOFS); |
| 4697 | free_extent_state(state); | 4717 | free_extent_state(state); |
| 4698 | 4718 | ||
| 4719 | cond_resched(); | ||
| 4699 | spin_lock(&io_tree->lock); | 4720 | spin_lock(&io_tree->lock); |
| 4700 | } | 4721 | } |
| 4701 | spin_unlock(&io_tree->lock); | 4722 | spin_unlock(&io_tree->lock); |
| @@ -5181,6 +5202,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
| 5181 | iput(inode); | 5202 | iput(inode); |
| 5182 | inode = ERR_PTR(ret); | 5203 | inode = ERR_PTR(ret); |
| 5183 | } | 5204 | } |
| 5205 | /* | ||
| 5206 | * If orphan cleanup did remove any orphans, it means the tree | ||
| 5207 | * was modified and therefore the commit root is not the same as | ||
| 5208 | * the current root anymore. This is a problem, because send | ||
| 5209 | * uses the commit root and therefore can see inode items that | ||
| 5210 | * don't exist in the current root anymore, and for example make | ||
| 5211 | * calls to btrfs_iget, which will do tree lookups based on the | ||
| 5212 | * current root and not on the commit root. Those lookups will | ||
| 5213 | * fail, returning a -ESTALE error, and making send fail with | ||
| 5214 | * that error. So make sure a send does not see any orphans we | ||
| 5215 | * have just removed, and that it will see the same inodes | ||
| 5216 | * regardless of whether a transaction commit happened before | ||
| 5217 | * it started (meaning that the commit root will be the same as | ||
| 5218 | * the current root) or not. | ||
| 5219 | */ | ||
| 5220 | if (sub_root->node != sub_root->commit_root) { | ||
| 5221 | u64 sub_flags = btrfs_root_flags(&sub_root->root_item); | ||
| 5222 | |||
| 5223 | if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { | ||
| 5224 | struct extent_buffer *eb; | ||
| 5225 | |||
| 5226 | /* | ||
| 5227 | * Assert we can't have races between dentry | ||
| 5228 | * lookup called through the snapshot creation | ||
| 5229 | * ioctl and the VFS. | ||
| 5230 | */ | ||
| 5231 | ASSERT(mutex_is_locked(&dir->i_mutex)); | ||
| 5232 | |||
| 5233 | down_write(&root->fs_info->commit_root_sem); | ||
| 5234 | eb = sub_root->commit_root; | ||
| 5235 | sub_root->commit_root = | ||
| 5236 | btrfs_root_node(sub_root); | ||
| 5237 | up_write(&root->fs_info->commit_root_sem); | ||
| 5238 | free_extent_buffer(eb); | ||
| 5239 | } | ||
| 5240 | } | ||
| 5184 | } | 5241 | } |
| 5185 | 5242 | ||
| 5186 | return inode; | 5243 | return inode; |
| @@ -5577,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) | |||
| 5577 | return ret; | 5634 | return ret; |
| 5578 | } | 5635 | } |
| 5579 | 5636 | ||
| 5637 | static int btrfs_insert_inode_locked(struct inode *inode) | ||
| 5638 | { | ||
| 5639 | struct btrfs_iget_args args; | ||
| 5640 | args.location = &BTRFS_I(inode)->location; | ||
| 5641 | args.root = BTRFS_I(inode)->root; | ||
| 5642 | |||
| 5643 | return insert_inode_locked4(inode, | ||
| 5644 | btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), | ||
| 5645 | btrfs_find_actor, &args); | ||
| 5646 | } | ||
| 5647 | |||
| 5580 | static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | 5648 | static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, |
| 5581 | struct btrfs_root *root, | 5649 | struct btrfs_root *root, |
| 5582 | struct inode *dir, | 5650 | struct inode *dir, |
| @@ -5606,6 +5674,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 5606 | } | 5674 | } |
| 5607 | 5675 | ||
| 5608 | /* | 5676 | /* |
| 5677 | * O_TMPFILE, set link count to 0, so that after this point, | ||
| 5678 | * we fill in an inode item with the correct link count. | ||
| 5679 | */ | ||
| 5680 | if (!name) | ||
| 5681 | set_nlink(inode, 0); | ||
| 5682 | |||
| 5683 | /* | ||
| 5609 | * we have to initialize this early, so we can reclaim the inode | 5684 | * we have to initialize this early, so we can reclaim the inode |
| 5610 | * number if we fail afterwards in this function. | 5685 | * number if we fail afterwards in this function. |
| 5611 | */ | 5686 | */ |
| @@ -5662,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 5662 | sizes[1] = name_len + sizeof(*ref); | 5737 | sizes[1] = name_len + sizeof(*ref); |
| 5663 | } | 5738 | } |
| 5664 | 5739 | ||
| 5740 | location = &BTRFS_I(inode)->location; | ||
| 5741 | location->objectid = objectid; | ||
| 5742 | location->offset = 0; | ||
| 5743 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); | ||
| 5744 | |||
| 5745 | ret = btrfs_insert_inode_locked(inode); | ||
| 5746 | if (ret < 0) | ||
| 5747 | goto fail; | ||
| 5748 | |||
| 5665 | path->leave_spinning = 1; | 5749 | path->leave_spinning = 1; |
| 5666 | ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); | 5750 | ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); |
| 5667 | if (ret != 0) | 5751 | if (ret != 0) |
| 5668 | goto fail; | 5752 | goto fail_unlock; |
| 5669 | 5753 | ||
| 5670 | inode_init_owner(inode, dir, mode); | 5754 | inode_init_owner(inode, dir, mode); |
| 5671 | inode_set_bytes(inode, 0); | 5755 | inode_set_bytes(inode, 0); |
| @@ -5688,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 5688 | btrfs_mark_buffer_dirty(path->nodes[0]); | 5772 | btrfs_mark_buffer_dirty(path->nodes[0]); |
| 5689 | btrfs_free_path(path); | 5773 | btrfs_free_path(path); |
| 5690 | 5774 | ||
| 5691 | location = &BTRFS_I(inode)->location; | ||
| 5692 | location->objectid = objectid; | ||
| 5693 | location->offset = 0; | ||
| 5694 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); | ||
| 5695 | |||
| 5696 | btrfs_inherit_iflags(inode, dir); | 5775 | btrfs_inherit_iflags(inode, dir); |
| 5697 | 5776 | ||
| 5698 | if (S_ISREG(mode)) { | 5777 | if (S_ISREG(mode)) { |
| @@ -5703,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 5703 | BTRFS_INODE_NODATASUM; | 5782 | BTRFS_INODE_NODATASUM; |
| 5704 | } | 5783 | } |
| 5705 | 5784 | ||
| 5706 | btrfs_insert_inode_hash(inode); | ||
| 5707 | inode_tree_add(inode); | 5785 | inode_tree_add(inode); |
| 5708 | 5786 | ||
| 5709 | trace_btrfs_inode_new(inode); | 5787 | trace_btrfs_inode_new(inode); |
| @@ -5718,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 5718 | btrfs_ino(inode), root->root_key.objectid, ret); | 5796 | btrfs_ino(inode), root->root_key.objectid, ret); |
| 5719 | 5797 | ||
| 5720 | return inode; | 5798 | return inode; |
| 5799 | |||
| 5800 | fail_unlock: | ||
| 5801 | unlock_new_inode(inode); | ||
| 5721 | fail: | 5802 | fail: |
| 5722 | if (dir && name) | 5803 | if (dir && name) |
| 5723 | BTRFS_I(dir)->index_cnt--; | 5804 | BTRFS_I(dir)->index_cnt--; |
| @@ -5852,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 5852 | goto out_unlock; | 5933 | goto out_unlock; |
| 5853 | } | 5934 | } |
| 5854 | 5935 | ||
| 5855 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | ||
| 5856 | if (err) { | ||
| 5857 | drop_inode = 1; | ||
| 5858 | goto out_unlock; | ||
| 5859 | } | ||
| 5860 | |||
| 5861 | /* | 5936 | /* |
| 5862 | * If the active LSM wants to access the inode during | 5937 | * If the active LSM wants to access the inode during |
| 5863 | * d_instantiate it needs these. Smack checks to see | 5938 | * d_instantiate it needs these. Smack checks to see |
| 5864 | * if the filesystem supports xattrs by looking at the | 5939 | * if the filesystem supports xattrs by looking at the |
| 5865 | * ops vector. | 5940 | * ops vector. |
| 5866 | */ | 5941 | */ |
| 5867 | |||
| 5868 | inode->i_op = &btrfs_special_inode_operations; | 5942 | inode->i_op = &btrfs_special_inode_operations; |
| 5869 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 5943 | init_special_inode(inode, inode->i_mode, rdev); |
| 5944 | |||
| 5945 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | ||
| 5870 | if (err) | 5946 | if (err) |
| 5871 | drop_inode = 1; | 5947 | goto out_unlock_inode; |
| 5872 | else { | 5948 | |
| 5873 | init_special_inode(inode, inode->i_mode, rdev); | 5949 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
| 5950 | if (err) { | ||
| 5951 | goto out_unlock_inode; | ||
| 5952 | } else { | ||
| 5874 | btrfs_update_inode(trans, root, inode); | 5953 | btrfs_update_inode(trans, root, inode); |
| 5954 | unlock_new_inode(inode); | ||
| 5875 | d_instantiate(dentry, inode); | 5955 | d_instantiate(dentry, inode); |
| 5876 | } | 5956 | } |
| 5957 | |||
| 5877 | out_unlock: | 5958 | out_unlock: |
| 5878 | btrfs_end_transaction(trans, root); | 5959 | btrfs_end_transaction(trans, root); |
| 5879 | btrfs_balance_delayed_items(root); | 5960 | btrfs_balance_delayed_items(root); |
| @@ -5883,6 +5964,12 @@ out_unlock: | |||
| 5883 | iput(inode); | 5964 | iput(inode); |
| 5884 | } | 5965 | } |
| 5885 | return err; | 5966 | return err; |
| 5967 | |||
| 5968 | out_unlock_inode: | ||
| 5969 | drop_inode = 1; | ||
| 5970 | unlock_new_inode(inode); | ||
| 5971 | goto out_unlock; | ||
| 5972 | |||
| 5886 | } | 5973 | } |
| 5887 | 5974 | ||
| 5888 | static int btrfs_create(struct inode *dir, struct dentry *dentry, | 5975 | static int btrfs_create(struct inode *dir, struct dentry *dentry, |
| @@ -5917,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
| 5917 | goto out_unlock; | 6004 | goto out_unlock; |
| 5918 | } | 6005 | } |
| 5919 | drop_inode_on_err = 1; | 6006 | drop_inode_on_err = 1; |
| 5920 | |||
| 5921 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | ||
| 5922 | if (err) | ||
| 5923 | goto out_unlock; | ||
| 5924 | |||
| 5925 | err = btrfs_update_inode(trans, root, inode); | ||
| 5926 | if (err) | ||
| 5927 | goto out_unlock; | ||
| 5928 | |||
| 5929 | /* | 6007 | /* |
| 5930 | * If the active LSM wants to access the inode during | 6008 | * If the active LSM wants to access the inode during |
| 5931 | * d_instantiate it needs these. Smack checks to see | 6009 | * d_instantiate it needs these. Smack checks to see |
| @@ -5934,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
| 5934 | */ | 6012 | */ |
| 5935 | inode->i_fop = &btrfs_file_operations; | 6013 | inode->i_fop = &btrfs_file_operations; |
| 5936 | inode->i_op = &btrfs_file_inode_operations; | 6014 | inode->i_op = &btrfs_file_inode_operations; |
| 6015 | inode->i_mapping->a_ops = &btrfs_aops; | ||
| 6016 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
| 6017 | |||
| 6018 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | ||
| 6019 | if (err) | ||
| 6020 | goto out_unlock_inode; | ||
| 6021 | |||
| 6022 | err = btrfs_update_inode(trans, root, inode); | ||
| 6023 | if (err) | ||
| 6024 | goto out_unlock_inode; | ||
| 5937 | 6025 | ||
| 5938 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 6026 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
| 5939 | if (err) | 6027 | if (err) |
| 5940 | goto out_unlock; | 6028 | goto out_unlock_inode; |
| 5941 | 6029 | ||
| 5942 | inode->i_mapping->a_ops = &btrfs_aops; | ||
| 5943 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
| 5944 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 6030 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
| 6031 | unlock_new_inode(inode); | ||
| 5945 | d_instantiate(dentry, inode); | 6032 | d_instantiate(dentry, inode); |
| 5946 | 6033 | ||
| 5947 | out_unlock: | 6034 | out_unlock: |
| @@ -5953,6 +6040,11 @@ out_unlock: | |||
| 5953 | btrfs_balance_delayed_items(root); | 6040 | btrfs_balance_delayed_items(root); |
| 5954 | btrfs_btree_balance_dirty(root); | 6041 | btrfs_btree_balance_dirty(root); |
| 5955 | return err; | 6042 | return err; |
| 6043 | |||
| 6044 | out_unlock_inode: | ||
| 6045 | unlock_new_inode(inode); | ||
| 6046 | goto out_unlock; | ||
| 6047 | |||
| 5956 | } | 6048 | } |
| 5957 | 6049 | ||
| 5958 | static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | 6050 | static int btrfs_link(struct dentry *old_dentry, struct inode *dir, |
| @@ -6060,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 6060 | } | 6152 | } |
| 6061 | 6153 | ||
| 6062 | drop_on_err = 1; | 6154 | drop_on_err = 1; |
| 6155 | /* these must be set before we unlock the inode */ | ||
| 6156 | inode->i_op = &btrfs_dir_inode_operations; | ||
| 6157 | inode->i_fop = &btrfs_dir_file_operations; | ||
| 6063 | 6158 | ||
| 6064 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | 6159 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); |
| 6065 | if (err) | 6160 | if (err) |
| 6066 | goto out_fail; | 6161 | goto out_fail_inode; |
| 6067 | |||
| 6068 | inode->i_op = &btrfs_dir_inode_operations; | ||
| 6069 | inode->i_fop = &btrfs_dir_file_operations; | ||
| 6070 | 6162 | ||
| 6071 | btrfs_i_size_write(inode, 0); | 6163 | btrfs_i_size_write(inode, 0); |
| 6072 | err = btrfs_update_inode(trans, root, inode); | 6164 | err = btrfs_update_inode(trans, root, inode); |
| 6073 | if (err) | 6165 | if (err) |
| 6074 | goto out_fail; | 6166 | goto out_fail_inode; |
| 6075 | 6167 | ||
| 6076 | err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, | 6168 | err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, |
| 6077 | dentry->d_name.len, 0, index); | 6169 | dentry->d_name.len, 0, index); |
| 6078 | if (err) | 6170 | if (err) |
| 6079 | goto out_fail; | 6171 | goto out_fail_inode; |
| 6080 | 6172 | ||
| 6081 | d_instantiate(dentry, inode); | 6173 | d_instantiate(dentry, inode); |
| 6174 | /* | ||
| 6175 | * mkdir is special. We're unlocking after we call d_instantiate | ||
| 6176 | * to avoid a race with nfsd calling d_instantiate. | ||
| 6177 | */ | ||
| 6178 | unlock_new_inode(inode); | ||
| 6082 | drop_on_err = 0; | 6179 | drop_on_err = 0; |
| 6083 | 6180 | ||
| 6084 | out_fail: | 6181 | out_fail: |
| @@ -6088,6 +6185,10 @@ out_fail: | |||
| 6088 | btrfs_balance_delayed_items(root); | 6185 | btrfs_balance_delayed_items(root); |
| 6089 | btrfs_btree_balance_dirty(root); | 6186 | btrfs_btree_balance_dirty(root); |
| 6090 | return err; | 6187 | return err; |
| 6188 | |||
| 6189 | out_fail_inode: | ||
| 6190 | unlock_new_inode(inode); | ||
| 6191 | goto out_fail; | ||
| 6091 | } | 6192 | } |
| 6092 | 6193 | ||
| 6093 | /* helper for btfs_get_extent. Given an existing extent in the tree, | 6194 | /* helper for btfs_get_extent. Given an existing extent in the tree, |
| @@ -6097,14 +6198,14 @@ out_fail: | |||
| 6097 | static int merge_extent_mapping(struct extent_map_tree *em_tree, | 6198 | static int merge_extent_mapping(struct extent_map_tree *em_tree, |
| 6098 | struct extent_map *existing, | 6199 | struct extent_map *existing, |
| 6099 | struct extent_map *em, | 6200 | struct extent_map *em, |
| 6100 | u64 map_start, u64 map_len) | 6201 | u64 map_start) |
| 6101 | { | 6202 | { |
| 6102 | u64 start_diff; | 6203 | u64 start_diff; |
| 6103 | 6204 | ||
| 6104 | BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); | 6205 | BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); |
| 6105 | start_diff = map_start - em->start; | 6206 | start_diff = map_start - em->start; |
| 6106 | em->start = map_start; | 6207 | em->start = map_start; |
| 6107 | em->len = map_len; | 6208 | em->len = existing->start - em->start; |
| 6108 | if (em->block_start < EXTENT_MAP_LAST_BYTE && | 6209 | if (em->block_start < EXTENT_MAP_LAST_BYTE && |
| 6109 | !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { | 6210 | !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { |
| 6110 | em->block_start += start_diff; | 6211 | em->block_start += start_diff; |
| @@ -6275,6 +6376,8 @@ next: | |||
| 6275 | goto not_found; | 6376 | goto not_found; |
| 6276 | if (start + len <= found_key.offset) | 6377 | if (start + len <= found_key.offset) |
| 6277 | goto not_found; | 6378 | goto not_found; |
| 6379 | if (start > found_key.offset) | ||
| 6380 | goto next; | ||
| 6278 | em->start = start; | 6381 | em->start = start; |
| 6279 | em->orig_start = start; | 6382 | em->orig_start = start; |
| 6280 | em->len = found_key.offset - start; | 6383 | em->len = found_key.offset - start; |
| @@ -6390,8 +6493,7 @@ insert: | |||
| 6390 | em->len); | 6493 | em->len); |
| 6391 | if (existing) { | 6494 | if (existing) { |
| 6392 | err = merge_extent_mapping(em_tree, existing, | 6495 | err = merge_extent_mapping(em_tree, existing, |
| 6393 | em, start, | 6496 | em, start); |
| 6394 | root->sectorsize); | ||
| 6395 | free_extent_map(existing); | 6497 | free_extent_map(existing); |
| 6396 | if (err) { | 6498 | if (err) { |
| 6397 | free_extent_map(em); | 6499 | free_extent_map(em); |
| @@ -7158,7 +7260,8 @@ again: | |||
| 7158 | if (!ret) | 7260 | if (!ret) |
| 7159 | goto out_test; | 7261 | goto out_test; |
| 7160 | 7262 | ||
| 7161 | btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); | 7263 | btrfs_init_work(&ordered->work, btrfs_endio_write_helper, |
| 7264 | finish_ordered_fn, NULL, NULL); | ||
| 7162 | btrfs_queue_work(root->fs_info->endio_write_workers, | 7265 | btrfs_queue_work(root->fs_info->endio_write_workers, |
| 7163 | &ordered->work); | 7266 | &ordered->work); |
| 7164 | out_test: | 7267 | out_test: |
| @@ -7306,10 +7409,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
| 7306 | map_length = orig_bio->bi_iter.bi_size; | 7409 | map_length = orig_bio->bi_iter.bi_size; |
| 7307 | ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, | 7410 | ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, |
| 7308 | &map_length, NULL, 0); | 7411 | &map_length, NULL, 0); |
| 7309 | if (ret) { | 7412 | if (ret) |
| 7310 | bio_put(orig_bio); | ||
| 7311 | return -EIO; | 7413 | return -EIO; |
| 7312 | } | ||
| 7313 | 7414 | ||
| 7314 | if (map_length >= orig_bio->bi_iter.bi_size) { | 7415 | if (map_length >= orig_bio->bi_iter.bi_size) { |
| 7315 | bio = orig_bio; | 7416 | bio = orig_bio; |
| @@ -7326,6 +7427,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
| 7326 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); | 7427 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); |
| 7327 | if (!bio) | 7428 | if (!bio) |
| 7328 | return -ENOMEM; | 7429 | return -ENOMEM; |
| 7430 | |||
| 7329 | bio->bi_private = dip; | 7431 | bio->bi_private = dip; |
| 7330 | bio->bi_end_io = btrfs_end_dio_bio; | 7432 | bio->bi_end_io = btrfs_end_dio_bio; |
| 7331 | atomic_inc(&dip->pending_bios); | 7433 | atomic_inc(&dip->pending_bios); |
| @@ -7534,7 +7636,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, | |||
| 7534 | count = iov_iter_count(iter); | 7636 | count = iov_iter_count(iter); |
| 7535 | if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | 7637 | if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, |
| 7536 | &BTRFS_I(inode)->runtime_flags)) | 7638 | &BTRFS_I(inode)->runtime_flags)) |
| 7537 | filemap_fdatawrite_range(inode->i_mapping, offset, count); | 7639 | filemap_fdatawrite_range(inode->i_mapping, offset, |
| 7640 | offset + count - 1); | ||
| 7538 | 7641 | ||
| 7539 | if (rw & WRITE) { | 7642 | if (rw & WRITE) { |
| 7540 | /* | 7643 | /* |
| @@ -8041,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, | |||
| 8041 | 8144 | ||
| 8042 | set_nlink(inode, 1); | 8145 | set_nlink(inode, 1); |
| 8043 | btrfs_i_size_write(inode, 0); | 8146 | btrfs_i_size_write(inode, 0); |
| 8147 | unlock_new_inode(inode); | ||
| 8044 | 8148 | ||
| 8045 | err = btrfs_subvol_inherit_props(trans, new_root, parent_root); | 8149 | err = btrfs_subvol_inherit_props(trans, new_root, parent_root); |
| 8046 | if (err) | 8150 | if (err) |
| @@ -8495,7 +8599,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, | |||
| 8495 | work->inode = inode; | 8599 | work->inode = inode; |
| 8496 | work->wait = wait; | 8600 | work->wait = wait; |
| 8497 | work->delay_iput = delay_iput; | 8601 | work->delay_iput = delay_iput; |
| 8498 | btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); | 8602 | WARN_ON_ONCE(!inode); |
| 8603 | btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, | ||
| 8604 | btrfs_run_delalloc_work, NULL, NULL); | ||
| 8499 | 8605 | ||
| 8500 | return work; | 8606 | return work; |
| 8501 | } | 8607 | } |
| @@ -8699,12 +8805,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 8699 | goto out_unlock; | 8805 | goto out_unlock; |
| 8700 | } | 8806 | } |
| 8701 | 8807 | ||
| 8702 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | ||
| 8703 | if (err) { | ||
| 8704 | drop_inode = 1; | ||
| 8705 | goto out_unlock; | ||
| 8706 | } | ||
| 8707 | |||
| 8708 | /* | 8808 | /* |
| 8709 | * If the active LSM wants to access the inode during | 8809 | * If the active LSM wants to access the inode during |
| 8710 | * d_instantiate it needs these. Smack checks to see | 8810 | * d_instantiate it needs these. Smack checks to see |
| @@ -8713,23 +8813,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 8713 | */ | 8813 | */ |
| 8714 | inode->i_fop = &btrfs_file_operations; | 8814 | inode->i_fop = &btrfs_file_operations; |
| 8715 | inode->i_op = &btrfs_file_inode_operations; | 8815 | inode->i_op = &btrfs_file_inode_operations; |
| 8816 | inode->i_mapping->a_ops = &btrfs_aops; | ||
| 8817 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
| 8818 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | ||
| 8819 | |||
| 8820 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | ||
| 8821 | if (err) | ||
| 8822 | goto out_unlock_inode; | ||
| 8716 | 8823 | ||
| 8717 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 8824 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
| 8718 | if (err) | 8825 | if (err) |
| 8719 | drop_inode = 1; | 8826 | goto out_unlock_inode; |
| 8720 | else { | ||
| 8721 | inode->i_mapping->a_ops = &btrfs_aops; | ||
| 8722 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
| 8723 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | ||
| 8724 | } | ||
| 8725 | if (drop_inode) | ||
| 8726 | goto out_unlock; | ||
| 8727 | 8827 | ||
| 8728 | path = btrfs_alloc_path(); | 8828 | path = btrfs_alloc_path(); |
| 8729 | if (!path) { | 8829 | if (!path) { |
| 8730 | err = -ENOMEM; | 8830 | err = -ENOMEM; |
| 8731 | drop_inode = 1; | 8831 | goto out_unlock_inode; |
| 8732 | goto out_unlock; | ||
| 8733 | } | 8832 | } |
| 8734 | key.objectid = btrfs_ino(inode); | 8833 | key.objectid = btrfs_ino(inode); |
| 8735 | key.offset = 0; | 8834 | key.offset = 0; |
| @@ -8738,9 +8837,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 8738 | err = btrfs_insert_empty_item(trans, root, path, &key, | 8837 | err = btrfs_insert_empty_item(trans, root, path, &key, |
| 8739 | datasize); | 8838 | datasize); |
| 8740 | if (err) { | 8839 | if (err) { |
| 8741 | drop_inode = 1; | ||
| 8742 | btrfs_free_path(path); | 8840 | btrfs_free_path(path); |
| 8743 | goto out_unlock; | 8841 | goto out_unlock_inode; |
| 8744 | } | 8842 | } |
| 8745 | leaf = path->nodes[0]; | 8843 | leaf = path->nodes[0]; |
| 8746 | ei = btrfs_item_ptr(leaf, path->slots[0], | 8844 | ei = btrfs_item_ptr(leaf, path->slots[0], |
| @@ -8764,12 +8862,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 8764 | inode_set_bytes(inode, name_len); | 8862 | inode_set_bytes(inode, name_len); |
| 8765 | btrfs_i_size_write(inode, name_len); | 8863 | btrfs_i_size_write(inode, name_len); |
| 8766 | err = btrfs_update_inode(trans, root, inode); | 8864 | err = btrfs_update_inode(trans, root, inode); |
| 8767 | if (err) | 8865 | if (err) { |
| 8768 | drop_inode = 1; | 8866 | drop_inode = 1; |
| 8867 | goto out_unlock_inode; | ||
| 8868 | } | ||
| 8869 | |||
| 8870 | unlock_new_inode(inode); | ||
| 8871 | d_instantiate(dentry, inode); | ||
| 8769 | 8872 | ||
| 8770 | out_unlock: | 8873 | out_unlock: |
| 8771 | if (!err) | ||
| 8772 | d_instantiate(dentry, inode); | ||
| 8773 | btrfs_end_transaction(trans, root); | 8874 | btrfs_end_transaction(trans, root); |
| 8774 | if (drop_inode) { | 8875 | if (drop_inode) { |
| 8775 | inode_dec_link_count(inode); | 8876 | inode_dec_link_count(inode); |
| @@ -8777,6 +8878,11 @@ out_unlock: | |||
| 8777 | } | 8878 | } |
| 8778 | btrfs_btree_balance_dirty(root); | 8879 | btrfs_btree_balance_dirty(root); |
| 8779 | return err; | 8880 | return err; |
| 8881 | |||
| 8882 | out_unlock_inode: | ||
| 8883 | drop_inode = 1; | ||
| 8884 | unlock_new_inode(inode); | ||
| 8885 | goto out_unlock; | ||
| 8780 | } | 8886 | } |
| 8781 | 8887 | ||
| 8782 | static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | 8888 | static int __btrfs_prealloc_file_range(struct inode *inode, int mode, |
| @@ -8960,14 +9066,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 8960 | goto out; | 9066 | goto out; |
| 8961 | } | 9067 | } |
| 8962 | 9068 | ||
| 8963 | ret = btrfs_init_inode_security(trans, inode, dir, NULL); | ||
| 8964 | if (ret) | ||
| 8965 | goto out; | ||
| 8966 | |||
| 8967 | ret = btrfs_update_inode(trans, root, inode); | ||
| 8968 | if (ret) | ||
| 8969 | goto out; | ||
| 8970 | |||
| 8971 | inode->i_fop = &btrfs_file_operations; | 9069 | inode->i_fop = &btrfs_file_operations; |
| 8972 | inode->i_op = &btrfs_file_inode_operations; | 9070 | inode->i_op = &btrfs_file_inode_operations; |
| 8973 | 9071 | ||
| @@ -8975,10 +9073,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 8975 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | 9073 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; |
| 8976 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 9074 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
| 8977 | 9075 | ||
| 9076 | ret = btrfs_init_inode_security(trans, inode, dir, NULL); | ||
| 9077 | if (ret) | ||
| 9078 | goto out_inode; | ||
| 9079 | |||
| 9080 | ret = btrfs_update_inode(trans, root, inode); | ||
| 9081 | if (ret) | ||
| 9082 | goto out_inode; | ||
| 8978 | ret = btrfs_orphan_add(trans, inode); | 9083 | ret = btrfs_orphan_add(trans, inode); |
| 8979 | if (ret) | 9084 | if (ret) |
| 8980 | goto out; | 9085 | goto out_inode; |
| 8981 | 9086 | ||
| 9087 | /* | ||
| 9088 | * We set number of links to 0 in btrfs_new_inode(), and here we set | ||
| 9089 | * it to 1 because d_tmpfile() will issue a warning if the count is 0, | ||
| 9090 | * through: | ||
| 9091 | * | ||
| 9092 | * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() | ||
| 9093 | */ | ||
| 9094 | set_nlink(inode, 1); | ||
| 9095 | unlock_new_inode(inode); | ||
| 8982 | d_tmpfile(dentry, inode); | 9096 | d_tmpfile(dentry, inode); |
| 8983 | mark_inode_dirty(inode); | 9097 | mark_inode_dirty(inode); |
| 8984 | 9098 | ||
| @@ -8988,8 +9102,12 @@ out: | |||
| 8988 | iput(inode); | 9102 | iput(inode); |
| 8989 | btrfs_balance_delayed_items(root); | 9103 | btrfs_balance_delayed_items(root); |
| 8990 | btrfs_btree_balance_dirty(root); | 9104 | btrfs_btree_balance_dirty(root); |
| 8991 | |||
| 8992 | return ret; | 9105 | return ret; |
| 9106 | |||
| 9107 | out_inode: | ||
| 9108 | unlock_new_inode(inode); | ||
| 9109 | goto out; | ||
| 9110 | |||
| 8993 | } | 9111 | } |
| 8994 | 9112 | ||
| 8995 | static const struct inode_operations btrfs_dir_inode_operations = { | 9113 | static const struct inode_operations btrfs_dir_inode_operations = { |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 47aceb494d1d..8a8e29878c34 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
| @@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, | |||
| 711 | if (ret) | 711 | if (ret) |
| 712 | goto fail; | 712 | goto fail; |
| 713 | 713 | ||
| 714 | ret = btrfs_orphan_cleanup(pending_snapshot->snap); | ||
| 715 | if (ret) | ||
| 716 | goto fail; | ||
| 717 | |||
| 718 | /* | ||
| 719 | * If orphan cleanup did remove any orphans, it means the tree was | ||
| 720 | * modified and therefore the commit root is not the same as the | ||
| 721 | * current root anymore. This is a problem, because send uses the | ||
| 722 | * commit root and therefore can see inode items that don't exist | ||
| 723 | * in the current root anymore, and for example make calls to | ||
| 724 | * btrfs_iget, which will do tree lookups based on the current root | ||
| 725 | * and not on the commit root. Those lookups will fail, returning a | ||
| 726 | * -ESTALE error, and making send fail with that error. So make sure | ||
| 727 | * a send does not see any orphans we have just removed, and that it | ||
| 728 | * will see the same inodes regardless of whether a transaction | ||
| 729 | * commit happened before it started (meaning that the commit root | ||
| 730 | * will be the same as the current root) or not. | ||
| 731 | */ | ||
| 732 | if (readonly && pending_snapshot->snap->node != | ||
| 733 | pending_snapshot->snap->commit_root) { | ||
| 734 | trans = btrfs_join_transaction(pending_snapshot->snap); | ||
| 735 | if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { | ||
| 736 | ret = PTR_ERR(trans); | ||
| 737 | goto fail; | ||
| 738 | } | ||
| 739 | if (!IS_ERR(trans)) { | ||
| 740 | ret = btrfs_commit_transaction(trans, | ||
| 741 | pending_snapshot->snap); | ||
| 742 | if (ret) | ||
| 743 | goto fail; | ||
| 744 | } | ||
| 745 | } | ||
| 746 | |||
| 747 | inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); | 714 | inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); |
| 748 | if (IS_ERR(inode)) { | 715 | if (IS_ERR(inode)) { |
| 749 | ret = PTR_ERR(inode); | 716 | ret = PTR_ERR(inode); |
| @@ -1052,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) | |||
| 1052 | return false; | 1019 | return false; |
| 1053 | 1020 | ||
| 1054 | next = defrag_lookup_extent(inode, em->start + em->len); | 1021 | next = defrag_lookup_extent(inode, em->start + em->len); |
| 1055 | if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || | 1022 | if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) |
| 1056 | (em->block_start + em->block_len == next->block_start)) | 1023 | ret = false; |
| 1024 | else if ((em->block_start + em->block_len == next->block_start) && | ||
| 1025 | (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) | ||
| 1057 | ret = false; | 1026 | ret = false; |
| 1058 | 1027 | ||
| 1059 | free_extent_map(next); | 1028 | free_extent_map(next); |
| @@ -1088,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh, | |||
| 1088 | } | 1057 | } |
| 1089 | 1058 | ||
| 1090 | next_mergeable = defrag_check_next_extent(inode, em); | 1059 | next_mergeable = defrag_check_next_extent(inode, em); |
| 1091 | |||
| 1092 | /* | 1060 | /* |
| 1093 | * we hit a real extent, if it is big or the next extent is not a | 1061 | * we hit a real extent, if it is big or the next extent is not a |
| 1094 | * real extent, don't bother defragging it | 1062 | * real extent, don't bother defragging it |
| @@ -1735,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, | |||
| 1735 | ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | | 1703 | ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | |
| 1736 | BTRFS_SUBVOL_QGROUP_INHERIT)) { | 1704 | BTRFS_SUBVOL_QGROUP_INHERIT)) { |
| 1737 | ret = -EOPNOTSUPP; | 1705 | ret = -EOPNOTSUPP; |
| 1738 | goto out; | 1706 | goto free_args; |
| 1739 | } | 1707 | } |
| 1740 | 1708 | ||
| 1741 | if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) | 1709 | if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) |
| @@ -1745,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, | |||
| 1745 | if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { | 1713 | if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { |
| 1746 | if (vol_args->size > PAGE_CACHE_SIZE) { | 1714 | if (vol_args->size > PAGE_CACHE_SIZE) { |
| 1747 | ret = -EINVAL; | 1715 | ret = -EINVAL; |
| 1748 | goto out; | 1716 | goto free_args; |
| 1749 | } | 1717 | } |
| 1750 | inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); | 1718 | inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); |
| 1751 | if (IS_ERR(inherit)) { | 1719 | if (IS_ERR(inherit)) { |
| 1752 | ret = PTR_ERR(inherit); | 1720 | ret = PTR_ERR(inherit); |
| 1753 | goto out; | 1721 | goto free_args; |
| 1754 | } | 1722 | } |
| 1755 | } | 1723 | } |
| 1756 | 1724 | ||
| 1757 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, | 1725 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, |
| 1758 | vol_args->fd, subvol, ptr, | 1726 | vol_args->fd, subvol, ptr, |
| 1759 | readonly, inherit); | 1727 | readonly, inherit); |
| 1728 | if (ret) | ||
| 1729 | goto free_inherit; | ||
| 1760 | 1730 | ||
| 1761 | if (ret == 0 && ptr && | 1731 | if (ptr && copy_to_user(arg + |
| 1762 | copy_to_user(arg + | 1732 | offsetof(struct btrfs_ioctl_vol_args_v2, |
| 1763 | offsetof(struct btrfs_ioctl_vol_args_v2, | 1733 | transid), |
| 1764 | transid), ptr, sizeof(*ptr))) | 1734 | ptr, sizeof(*ptr))) |
| 1765 | ret = -EFAULT; | 1735 | ret = -EFAULT; |
| 1766 | out: | 1736 | |
| 1767 | kfree(vol_args); | 1737 | free_inherit: |
| 1768 | kfree(inherit); | 1738 | kfree(inherit); |
| 1739 | free_args: | ||
| 1740 | kfree(vol_args); | ||
| 1769 | return ret; | 1741 | return ret; |
| 1770 | } | 1742 | } |
| 1771 | 1743 | ||
| @@ -2685,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) | |||
| 2685 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 2657 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
| 2686 | if (IS_ERR(vol_args)) { | 2658 | if (IS_ERR(vol_args)) { |
| 2687 | ret = PTR_ERR(vol_args); | 2659 | ret = PTR_ERR(vol_args); |
| 2688 | goto out; | 2660 | goto err_drop; |
| 2689 | } | 2661 | } |
| 2690 | 2662 | ||
| 2691 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | 2663 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; |
| @@ -2703,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) | |||
| 2703 | 2675 | ||
| 2704 | out: | 2676 | out: |
| 2705 | kfree(vol_args); | 2677 | kfree(vol_args); |
| 2678 | err_drop: | ||
| 2706 | mnt_drop_write_file(file); | 2679 | mnt_drop_write_file(file); |
| 2707 | return ret; | 2680 | return ret; |
| 2708 | } | 2681 | } |
| @@ -3527,7 +3500,8 @@ process_slot: | |||
| 3527 | btrfs_mark_buffer_dirty(leaf); | 3500 | btrfs_mark_buffer_dirty(leaf); |
| 3528 | btrfs_release_path(path); | 3501 | btrfs_release_path(path); |
| 3529 | 3502 | ||
| 3530 | last_dest_end = new_key.offset + datal; | 3503 | last_dest_end = ALIGN(new_key.offset + datal, |
| 3504 | root->sectorsize); | ||
| 3531 | ret = clone_finish_inode_update(trans, inode, | 3505 | ret = clone_finish_inode_update(trans, inode, |
| 3532 | last_dest_end, | 3506 | last_dest_end, |
| 3533 | destoff, olen); | 3507 | destoff, olen); |
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 963895c1f801..ac734ec4cc20 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
| @@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) | |||
| 615 | spin_unlock(&root->ordered_extent_lock); | 615 | spin_unlock(&root->ordered_extent_lock); |
| 616 | 616 | ||
| 617 | btrfs_init_work(&ordered->flush_work, | 617 | btrfs_init_work(&ordered->flush_work, |
| 618 | btrfs_flush_delalloc_helper, | ||
| 618 | btrfs_run_ordered_extent_work, NULL, NULL); | 619 | btrfs_run_ordered_extent_work, NULL, NULL); |
| 619 | list_add_tail(&ordered->work_list, &works); | 620 | list_add_tail(&ordered->work_list, &works); |
| 620 | btrfs_queue_work(root->fs_info->flush_workers, | 621 | btrfs_queue_work(root->fs_info->flush_workers, |
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b497498484be..ded5c601d916 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c | |||
| @@ -1973,7 +1973,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, | |||
| 1973 | elem.seq, &roots); | 1973 | elem.seq, &roots); |
| 1974 | btrfs_put_tree_mod_seq(fs_info, &elem); | 1974 | btrfs_put_tree_mod_seq(fs_info, &elem); |
| 1975 | if (ret < 0) | 1975 | if (ret < 0) |
| 1976 | return ret; | 1976 | goto out; |
| 1977 | 1977 | ||
| 1978 | if (roots->nnodes != 1) | 1978 | if (roots->nnodes != 1) |
| 1979 | goto out; | 1979 | goto out; |
| @@ -2720,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, | |||
| 2720 | memset(&fs_info->qgroup_rescan_work, 0, | 2720 | memset(&fs_info->qgroup_rescan_work, 0, |
| 2721 | sizeof(fs_info->qgroup_rescan_work)); | 2721 | sizeof(fs_info->qgroup_rescan_work)); |
| 2722 | btrfs_init_work(&fs_info->qgroup_rescan_work, | 2722 | btrfs_init_work(&fs_info->qgroup_rescan_work, |
| 2723 | btrfs_qgroup_rescan_helper, | ||
| 2723 | btrfs_qgroup_rescan_worker, NULL, NULL); | 2724 | btrfs_qgroup_rescan_worker, NULL, NULL); |
| 2724 | 2725 | ||
| 2725 | if (ret) { | 2726 | if (ret) { |
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4a88f073fdd7..0a6b6e4bcbb9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c | |||
| @@ -1416,7 +1416,8 @@ cleanup: | |||
| 1416 | 1416 | ||
| 1417 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | 1417 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) |
| 1418 | { | 1418 | { |
| 1419 | btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); | 1419 | btrfs_init_work(&rbio->work, btrfs_rmw_helper, |
| 1420 | rmw_work, NULL, NULL); | ||
| 1420 | 1421 | ||
| 1421 | btrfs_queue_work(rbio->fs_info->rmw_workers, | 1422 | btrfs_queue_work(rbio->fs_info->rmw_workers, |
| 1422 | &rbio->work); | 1423 | &rbio->work); |
| @@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | |||
| 1424 | 1425 | ||
| 1425 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) | 1426 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) |
| 1426 | { | 1427 | { |
| 1427 | btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); | 1428 | btrfs_init_work(&rbio->work, btrfs_rmw_helper, |
| 1429 | read_rebuild_work, NULL, NULL); | ||
| 1428 | 1430 | ||
| 1429 | btrfs_queue_work(rbio->fs_info->rmw_workers, | 1431 | btrfs_queue_work(rbio->fs_info->rmw_workers, |
| 1430 | &rbio->work); | 1432 | &rbio->work); |
| @@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1665 | plug = container_of(cb, struct btrfs_plug_cb, cb); | 1667 | plug = container_of(cb, struct btrfs_plug_cb, cb); |
| 1666 | 1668 | ||
| 1667 | if (from_schedule) { | 1669 | if (from_schedule) { |
| 1668 | btrfs_init_work(&plug->work, unplug_work, NULL, NULL); | 1670 | btrfs_init_work(&plug->work, btrfs_rmw_helper, |
| 1671 | unplug_work, NULL, NULL); | ||
| 1669 | btrfs_queue_work(plug->info->rmw_workers, | 1672 | btrfs_queue_work(plug->info->rmw_workers, |
| 1670 | &plug->work); | 1673 | &plug->work); |
| 1671 | return; | 1674 | return; |
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 09230cf3a244..20408c6b665a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c | |||
| @@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) | |||
| 798 | /* FIXME we cannot handle this properly right now */ | 798 | /* FIXME we cannot handle this properly right now */ |
| 799 | BUG(); | 799 | BUG(); |
| 800 | } | 800 | } |
| 801 | btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); | 801 | btrfs_init_work(&rmw->work, btrfs_readahead_helper, |
| 802 | reada_start_machine_worker, NULL, NULL); | ||
| 802 | rmw->fs_info = fs_info; | 803 | rmw->fs_info = fs_info; |
| 803 | 804 | ||
| 804 | btrfs_queue_work(fs_info->readahead_workers, &rmw->work); | 805 | btrfs_queue_work(fs_info->readahead_workers, &rmw->work); |
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b6d198f5181e..f4a41f37be22 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
| @@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) | |||
| 428 | sbio->index = i; | 428 | sbio->index = i; |
| 429 | sbio->sctx = sctx; | 429 | sbio->sctx = sctx; |
| 430 | sbio->page_count = 0; | 430 | sbio->page_count = 0; |
| 431 | btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, | 431 | btrfs_init_work(&sbio->work, btrfs_scrub_helper, |
| 432 | NULL, NULL); | 432 | scrub_bio_end_io_worker, NULL, NULL); |
| 433 | 433 | ||
| 434 | if (i != SCRUB_BIOS_PER_SCTX - 1) | 434 | if (i != SCRUB_BIOS_PER_SCTX - 1) |
| 435 | sctx->bios[i]->next_free = i + 1; | 435 | sctx->bios[i]->next_free = i + 1; |
| @@ -999,8 +999,8 @@ nodatasum_case: | |||
| 999 | fixup_nodatasum->root = fs_info->extent_root; | 999 | fixup_nodatasum->root = fs_info->extent_root; |
| 1000 | fixup_nodatasum->mirror_num = failed_mirror_index + 1; | 1000 | fixup_nodatasum->mirror_num = failed_mirror_index + 1; |
| 1001 | scrub_pending_trans_workers_inc(sctx); | 1001 | scrub_pending_trans_workers_inc(sctx); |
| 1002 | btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, | 1002 | btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, |
| 1003 | NULL, NULL); | 1003 | scrub_fixup_nodatasum, NULL, NULL); |
| 1004 | btrfs_queue_work(fs_info->scrub_workers, | 1004 | btrfs_queue_work(fs_info->scrub_workers, |
| 1005 | &fixup_nodatasum->work); | 1005 | &fixup_nodatasum->work); |
| 1006 | goto out; | 1006 | goto out; |
| @@ -1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) | |||
| 1616 | sbio->err = err; | 1616 | sbio->err = err; |
| 1617 | sbio->bio = bio; | 1617 | sbio->bio = bio; |
| 1618 | 1618 | ||
| 1619 | btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); | 1619 | btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, |
| 1620 | scrub_wr_bio_end_io_worker, NULL, NULL); | ||
| 1620 | btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); | 1621 | btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); |
| 1621 | } | 1622 | } |
| 1622 | 1623 | ||
| @@ -2904,6 +2905,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, | |||
| 2904 | struct scrub_ctx *sctx; | 2905 | struct scrub_ctx *sctx; |
| 2905 | int ret; | 2906 | int ret; |
| 2906 | struct btrfs_device *dev; | 2907 | struct btrfs_device *dev; |
| 2908 | struct rcu_string *name; | ||
| 2907 | 2909 | ||
| 2908 | if (btrfs_fs_closing(fs_info)) | 2910 | if (btrfs_fs_closing(fs_info)) |
| 2909 | return -EINVAL; | 2911 | return -EINVAL; |
| @@ -2965,6 +2967,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, | |||
| 2965 | return -ENODEV; | 2967 | return -ENODEV; |
| 2966 | } | 2968 | } |
| 2967 | 2969 | ||
| 2970 | if (!is_dev_replace && !readonly && !dev->writeable) { | ||
| 2971 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); | ||
| 2972 | rcu_read_lock(); | ||
| 2973 | name = rcu_dereference(dev->name); | ||
| 2974 | btrfs_err(fs_info, "scrub: device %s is not writable", | ||
| 2975 | name->str); | ||
| 2976 | rcu_read_unlock(); | ||
| 2977 | return -EROFS; | ||
| 2978 | } | ||
| 2979 | |||
| 2968 | mutex_lock(&fs_info->scrub_lock); | 2980 | mutex_lock(&fs_info->scrub_lock); |
| 2969 | if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { | 2981 | if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { |
| 2970 | mutex_unlock(&fs_info->scrub_lock); | 2982 | mutex_unlock(&fs_info->scrub_lock); |
| @@ -3203,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, | |||
| 3203 | nocow_ctx->len = len; | 3215 | nocow_ctx->len = len; |
| 3204 | nocow_ctx->mirror_num = mirror_num; | 3216 | nocow_ctx->mirror_num = mirror_num; |
| 3205 | nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; | 3217 | nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; |
| 3206 | btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); | 3218 | btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, |
| 3219 | copy_nocow_pages_worker, NULL, NULL); | ||
| 3207 | INIT_LIST_HEAD(&nocow_ctx->inodes); | 3220 | INIT_LIST_HEAD(&nocow_ctx->inodes); |
| 3208 | btrfs_queue_work(fs_info->scrub_nocow_workers, | 3221 | btrfs_queue_work(fs_info->scrub_nocow_workers, |
| 3209 | &nocow_ctx->work); | 3222 | &nocow_ctx->work); |
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78699364f537..12e53556e214 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c | |||
| @@ -614,7 +614,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, | |||
| 614 | if (!fs_info->device_dir_kobj) | 614 | if (!fs_info->device_dir_kobj) |
| 615 | return -EINVAL; | 615 | return -EINVAL; |
| 616 | 616 | ||
| 617 | if (one_device) { | 617 | if (one_device && one_device->bdev) { |
| 618 | disk = one_device->bdev->bd_part; | 618 | disk = one_device->bdev->bd_part; |
| 619 | disk_kobj = &part_to_dev(disk)->kobj; | 619 | disk_kobj = &part_to_dev(disk)->kobj; |
| 620 | 620 | ||
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9e1f2cd5e67a..d296efe2d3e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
| @@ -94,8 +94,10 @@ | |||
| 94 | #define LOG_WALK_REPLAY_ALL 3 | 94 | #define LOG_WALK_REPLAY_ALL 3 |
| 95 | 95 | ||
| 96 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, | 96 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, |
| 97 | struct btrfs_root *root, struct inode *inode, | 97 | struct btrfs_root *root, struct inode *inode, |
| 98 | int inode_only); | 98 | int inode_only, |
| 99 | const loff_t start, | ||
| 100 | const loff_t end); | ||
| 99 | static int link_to_fixup_dir(struct btrfs_trans_handle *trans, | 101 | static int link_to_fixup_dir(struct btrfs_trans_handle *trans, |
| 100 | struct btrfs_root *root, | 102 | struct btrfs_root *root, |
| 101 | struct btrfs_path *path, u64 objectid); | 103 | struct btrfs_path *path, u64 objectid); |
| @@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
| 3298 | struct list_head ordered_sums; | 3300 | struct list_head ordered_sums; |
| 3299 | int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; | 3301 | int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
| 3300 | bool has_extents = false; | 3302 | bool has_extents = false; |
| 3301 | bool need_find_last_extent = (*last_extent == 0); | 3303 | bool need_find_last_extent = true; |
| 3302 | bool done = false; | 3304 | bool done = false; |
| 3303 | 3305 | ||
| 3304 | INIT_LIST_HEAD(&ordered_sums); | 3306 | INIT_LIST_HEAD(&ordered_sums); |
| @@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
| 3352 | */ | 3354 | */ |
| 3353 | if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { | 3355 | if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { |
| 3354 | has_extents = true; | 3356 | has_extents = true; |
| 3355 | if (need_find_last_extent && | 3357 | if (first_key.objectid == (u64)-1) |
| 3356 | first_key.objectid == (u64)-1) | ||
| 3357 | first_key = ins_keys[i]; | 3358 | first_key = ins_keys[i]; |
| 3358 | } else { | 3359 | } else { |
| 3359 | need_find_last_extent = false; | 3360 | need_find_last_extent = false; |
| @@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
| 3427 | if (!has_extents) | 3428 | if (!has_extents) |
| 3428 | return ret; | 3429 | return ret; |
| 3429 | 3430 | ||
| 3431 | if (need_find_last_extent && *last_extent == first_key.offset) { | ||
| 3432 | /* | ||
| 3433 | * We don't have any leafs between our current one and the one | ||
| 3434 | * we processed before that can have file extent items for our | ||
| 3435 | * inode (and have a generation number smaller than our current | ||
| 3436 | * transaction id). | ||
| 3437 | */ | ||
| 3438 | need_find_last_extent = false; | ||
| 3439 | } | ||
| 3440 | |||
| 3430 | /* | 3441 | /* |
| 3431 | * Because we use btrfs_search_forward we could skip leaves that were | 3442 | * Because we use btrfs_search_forward we could skip leaves that were |
| 3432 | * not modified and then assume *last_extent is valid when it really | 3443 | * not modified and then assume *last_extent is valid when it really |
| @@ -3537,7 +3548,7 @@ fill_holes: | |||
| 3537 | 0, 0); | 3548 | 0, 0); |
| 3538 | if (ret) | 3549 | if (ret) |
| 3539 | break; | 3550 | break; |
| 3540 | *last_extent = offset + len; | 3551 | *last_extent = extent_end; |
| 3541 | } | 3552 | } |
| 3542 | /* | 3553 | /* |
| 3543 | * Need to let the callers know we dropped the path so they should | 3554 | * Need to let the callers know we dropped the path so they should |
| @@ -3849,8 +3860,10 @@ process: | |||
| 3849 | * This handles both files and directories. | 3860 | * This handles both files and directories. |
| 3850 | */ | 3861 | */ |
| 3851 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, | 3862 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, |
| 3852 | struct btrfs_root *root, struct inode *inode, | 3863 | struct btrfs_root *root, struct inode *inode, |
| 3853 | int inode_only) | 3864 | int inode_only, |
| 3865 | const loff_t start, | ||
| 3866 | const loff_t end) | ||
| 3854 | { | 3867 | { |
| 3855 | struct btrfs_path *path; | 3868 | struct btrfs_path *path; |
| 3856 | struct btrfs_path *dst_path; | 3869 | struct btrfs_path *dst_path; |
| @@ -3867,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
| 3867 | int ins_nr; | 3880 | int ins_nr; |
| 3868 | bool fast_search = false; | 3881 | bool fast_search = false; |
| 3869 | u64 ino = btrfs_ino(inode); | 3882 | u64 ino = btrfs_ino(inode); |
| 3883 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
| 3870 | 3884 | ||
| 3871 | path = btrfs_alloc_path(); | 3885 | path = btrfs_alloc_path(); |
| 3872 | if (!path) | 3886 | if (!path) |
| @@ -4040,13 +4054,35 @@ log_extents: | |||
| 4040 | goto out_unlock; | 4054 | goto out_unlock; |
| 4041 | } | 4055 | } |
| 4042 | } else if (inode_only == LOG_INODE_ALL) { | 4056 | } else if (inode_only == LOG_INODE_ALL) { |
| 4043 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | ||
| 4044 | struct extent_map *em, *n; | 4057 | struct extent_map *em, *n; |
| 4045 | 4058 | ||
| 4046 | write_lock(&tree->lock); | 4059 | write_lock(&em_tree->lock); |
| 4047 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) | 4060 | /* |
| 4048 | list_del_init(&em->list); | 4061 | * We can't just remove every em if we're called for a ranged |
| 4049 | write_unlock(&tree->lock); | 4062 | * fsync - that is, one that doesn't cover the whole possible |
| 4063 | * file range (0 to LLONG_MAX). This is because we can have | ||
| 4064 | * em's that fall outside the range we're logging and therefore | ||
| 4065 | * their ordered operations haven't completed yet | ||
| 4066 | * (btrfs_finish_ordered_io() not invoked yet). This means we | ||
| 4067 | * didn't get their respective file extent item in the fs/subvol | ||
| 4068 | * tree yet, and need to let the next fast fsync (one which | ||
| 4069 | * consults the list of modified extent maps) find the em so | ||
| 4070 | * that it logs a matching file extent item and waits for the | ||
| 4071 | * respective ordered operation to complete (if it's still | ||
| 4072 | * running). | ||
| 4073 | * | ||
| 4074 | * Removing every em outside the range we're logging would make | ||
| 4075 | * the next fast fsync not log their matching file extent items, | ||
| 4076 | * therefore making us lose data after a log replay. | ||
| 4077 | */ | ||
| 4078 | list_for_each_entry_safe(em, n, &em_tree->modified_extents, | ||
| 4079 | list) { | ||
| 4080 | const u64 mod_end = em->mod_start + em->mod_len - 1; | ||
| 4081 | |||
| 4082 | if (em->mod_start >= start && mod_end <= end) | ||
| 4083 | list_del_init(&em->list); | ||
| 4084 | } | ||
| 4085 | write_unlock(&em_tree->lock); | ||
| 4050 | } | 4086 | } |
| 4051 | 4087 | ||
| 4052 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { | 4088 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { |
| @@ -4056,8 +4092,19 @@ log_extents: | |||
| 4056 | goto out_unlock; | 4092 | goto out_unlock; |
| 4057 | } | 4093 | } |
| 4058 | } | 4094 | } |
| 4059 | BTRFS_I(inode)->logged_trans = trans->transid; | 4095 | |
| 4060 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; | 4096 | write_lock(&em_tree->lock); |
| 4097 | /* | ||
| 4098 | * If we're doing a ranged fsync and there are still modified extents | ||
| 4099 | * in the list, we must run on the next fsync call as it might cover | ||
| 4100 | * those extents (a full fsync or an fsync for other range). | ||
| 4101 | */ | ||
| 4102 | if (list_empty(&em_tree->modified_extents)) { | ||
| 4103 | BTRFS_I(inode)->logged_trans = trans->transid; | ||
| 4104 | BTRFS_I(inode)->last_log_commit = | ||
| 4105 | BTRFS_I(inode)->last_sub_trans; | ||
| 4106 | } | ||
| 4107 | write_unlock(&em_tree->lock); | ||
| 4061 | out_unlock: | 4108 | out_unlock: |
| 4062 | if (unlikely(err)) | 4109 | if (unlikely(err)) |
| 4063 | btrfs_put_logged_extents(&logged_list); | 4110 | btrfs_put_logged_extents(&logged_list); |
| @@ -4152,7 +4199,10 @@ out: | |||
| 4152 | */ | 4199 | */ |
| 4153 | static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | 4200 | static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, |
| 4154 | struct btrfs_root *root, struct inode *inode, | 4201 | struct btrfs_root *root, struct inode *inode, |
| 4155 | struct dentry *parent, int exists_only, | 4202 | struct dentry *parent, |
| 4203 | const loff_t start, | ||
| 4204 | const loff_t end, | ||
| 4205 | int exists_only, | ||
| 4156 | struct btrfs_log_ctx *ctx) | 4206 | struct btrfs_log_ctx *ctx) |
| 4157 | { | 4207 | { |
| 4158 | int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; | 4208 | int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; |
| @@ -4198,7 +4248,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
| 4198 | if (ret) | 4248 | if (ret) |
| 4199 | goto end_no_trans; | 4249 | goto end_no_trans; |
| 4200 | 4250 | ||
| 4201 | ret = btrfs_log_inode(trans, root, inode, inode_only); | 4251 | ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); |
| 4202 | if (ret) | 4252 | if (ret) |
| 4203 | goto end_trans; | 4253 | goto end_trans; |
| 4204 | 4254 | ||
| @@ -4226,7 +4276,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
| 4226 | 4276 | ||
| 4227 | if (BTRFS_I(inode)->generation > | 4277 | if (BTRFS_I(inode)->generation > |
| 4228 | root->fs_info->last_trans_committed) { | 4278 | root->fs_info->last_trans_committed) { |
| 4229 | ret = btrfs_log_inode(trans, root, inode, inode_only); | 4279 | ret = btrfs_log_inode(trans, root, inode, inode_only, |
| 4280 | 0, LLONG_MAX); | ||
| 4230 | if (ret) | 4281 | if (ret) |
| 4231 | goto end_trans; | 4282 | goto end_trans; |
| 4232 | } | 4283 | } |
| @@ -4260,13 +4311,15 @@ end_no_trans: | |||
| 4260 | */ | 4311 | */ |
| 4261 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, | 4312 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, |
| 4262 | struct btrfs_root *root, struct dentry *dentry, | 4313 | struct btrfs_root *root, struct dentry *dentry, |
| 4314 | const loff_t start, | ||
| 4315 | const loff_t end, | ||
| 4263 | struct btrfs_log_ctx *ctx) | 4316 | struct btrfs_log_ctx *ctx) |
| 4264 | { | 4317 | { |
| 4265 | struct dentry *parent = dget_parent(dentry); | 4318 | struct dentry *parent = dget_parent(dentry); |
| 4266 | int ret; | 4319 | int ret; |
| 4267 | 4320 | ||
| 4268 | ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, | 4321 | ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, |
| 4269 | 0, ctx); | 4322 | start, end, 0, ctx); |
| 4270 | dput(parent); | 4323 | dput(parent); |
| 4271 | 4324 | ||
| 4272 | return ret; | 4325 | return ret; |
| @@ -4503,6 +4556,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, | |||
| 4503 | root->fs_info->last_trans_committed)) | 4556 | root->fs_info->last_trans_committed)) |
| 4504 | return 0; | 4557 | return 0; |
| 4505 | 4558 | ||
| 4506 | return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); | 4559 | return btrfs_log_inode_parent(trans, root, inode, parent, 0, |
| 4560 | LLONG_MAX, 1, NULL); | ||
| 4507 | } | 4561 | } |
| 4508 | 4562 | ||
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7f5b41bd5373..e2e798ae7cd7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h | |||
| @@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, | |||
| 59 | int btrfs_recover_log_trees(struct btrfs_root *tree_root); | 59 | int btrfs_recover_log_trees(struct btrfs_root *tree_root); |
| 60 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, | 60 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, |
| 61 | struct btrfs_root *root, struct dentry *dentry, | 61 | struct btrfs_root *root, struct dentry *dentry, |
| 62 | const loff_t start, | ||
| 63 | const loff_t end, | ||
| 62 | struct btrfs_log_ctx *ctx); | 64 | struct btrfs_log_ctx *ctx); |
| 63 | int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, | 65 | int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, |
| 64 | struct btrfs_root *root, | 66 | struct btrfs_root *root, |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cb82f62cb7c..340a92d08e84 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
| @@ -508,6 +508,44 @@ static noinline int device_list_add(const char *path, | |||
| 508 | ret = 1; | 508 | ret = 1; |
| 509 | device->fs_devices = fs_devices; | 509 | device->fs_devices = fs_devices; |
| 510 | } else if (!device->name || strcmp(device->name->str, path)) { | 510 | } else if (!device->name || strcmp(device->name->str, path)) { |
| 511 | /* | ||
| 512 | * When FS is already mounted. | ||
| 513 | * 1. If you are here and if the device->name is NULL that | ||
| 514 | * means this device was missing at time of FS mount. | ||
| 515 | * 2. If you are here and if the device->name is different | ||
| 516 | * from 'path' that means either | ||
| 517 | * a. The same device disappeared and reappeared with | ||
| 518 | * different name. or | ||
| 519 | * b. The missing-disk-which-was-replaced, has | ||
| 520 | * reappeared now. | ||
| 521 | * | ||
| 522 | * We must allow 1 and 2a above. But 2b would be a spurious | ||
| 523 | * and unintentional. | ||
| 524 | * | ||
| 525 | * Further in case of 1 and 2a above, the disk at 'path' | ||
| 526 | * would have missed some transaction when it was away and | ||
| 527 | * in case of 2a the stale bdev has to be updated as well. | ||
| 528 | * 2b must not be allowed at all time. | ||
| 529 | */ | ||
| 530 | |||
| 531 | /* | ||
| 532 | * As of now don't allow update to btrfs_fs_device through | ||
| 533 | * the btrfs dev scan cli, after FS has been mounted. | ||
| 534 | */ | ||
| 535 | if (fs_devices->opened) { | ||
| 536 | return -EBUSY; | ||
| 537 | } else { | ||
| 538 | /* | ||
| 539 | * That is if the FS is _not_ mounted and if you | ||
| 540 | * are here, that means there is more than one | ||
| 541 | * disk with same uuid and devid.We keep the one | ||
| 542 | * with larger generation number or the last-in if | ||
| 543 | * generation are equal. | ||
| 544 | */ | ||
| 545 | if (found_transid < device->generation) | ||
| 546 | return -EEXIST; | ||
| 547 | } | ||
| 548 | |||
| 511 | name = rcu_string_strdup(path, GFP_NOFS); | 549 | name = rcu_string_strdup(path, GFP_NOFS); |
| 512 | if (!name) | 550 | if (!name) |
| 513 | return -ENOMEM; | 551 | return -ENOMEM; |
| @@ -519,6 +557,15 @@ static noinline int device_list_add(const char *path, | |||
| 519 | } | 557 | } |
| 520 | } | 558 | } |
| 521 | 559 | ||
| 560 | /* | ||
| 561 | * Unmount does not free the btrfs_device struct but would zero | ||
| 562 | * generation along with most of the other members. So just update | ||
| 563 | * it back. We need it to pick the disk with largest generation | ||
| 564 | * (as above). | ||
| 565 | */ | ||
| 566 | if (!fs_devices->opened) | ||
| 567 | device->generation = found_transid; | ||
| 568 | |||
| 522 | if (found_transid > fs_devices->latest_trans) { | 569 | if (found_transid > fs_devices->latest_trans) { |
| 523 | fs_devices->latest_devid = devid; | 570 | fs_devices->latest_devid = devid; |
| 524 | fs_devices->latest_trans = found_transid; | 571 | fs_devices->latest_trans = found_transid; |
| @@ -1436,7 +1483,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, | |||
| 1436 | btrfs_set_device_io_align(leaf, dev_item, device->io_align); | 1483 | btrfs_set_device_io_align(leaf, dev_item, device->io_align); |
| 1437 | btrfs_set_device_io_width(leaf, dev_item, device->io_width); | 1484 | btrfs_set_device_io_width(leaf, dev_item, device->io_width); |
| 1438 | btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); | 1485 | btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); |
| 1439 | btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); | 1486 | btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); |
| 1440 | btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); | 1487 | btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); |
| 1441 | btrfs_set_device_group(leaf, dev_item, 0); | 1488 | btrfs_set_device_group(leaf, dev_item, 0); |
| 1442 | btrfs_set_device_seek_speed(leaf, dev_item, 0); | 1489 | btrfs_set_device_seek_speed(leaf, dev_item, 0); |
| @@ -1671,7 +1718,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1671 | device->fs_devices->total_devices--; | 1718 | device->fs_devices->total_devices--; |
| 1672 | 1719 | ||
| 1673 | if (device->missing) | 1720 | if (device->missing) |
| 1674 | root->fs_info->fs_devices->missing_devices--; | 1721 | device->fs_devices->missing_devices--; |
| 1675 | 1722 | ||
| 1676 | next_device = list_entry(root->fs_info->fs_devices->devices.next, | 1723 | next_device = list_entry(root->fs_info->fs_devices->devices.next, |
| 1677 | struct btrfs_device, dev_list); | 1724 | struct btrfs_device, dev_list); |
| @@ -1801,8 +1848,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, | |||
| 1801 | if (srcdev->bdev) { | 1848 | if (srcdev->bdev) { |
| 1802 | fs_info->fs_devices->open_devices--; | 1849 | fs_info->fs_devices->open_devices--; |
| 1803 | 1850 | ||
| 1804 | /* zero out the old super */ | 1851 | /* |
| 1805 | btrfs_scratch_superblock(srcdev); | 1852 | * zero out the old super if it is not writable |
| 1853 | * (e.g. seed device) | ||
| 1854 | */ | ||
| 1855 | if (srcdev->writeable) | ||
| 1856 | btrfs_scratch_superblock(srcdev); | ||
| 1806 | } | 1857 | } |
| 1807 | 1858 | ||
| 1808 | call_rcu(&srcdev->rcu, free_device); | 1859 | call_rcu(&srcdev->rcu, free_device); |
| @@ -1941,6 +1992,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) | |||
| 1941 | fs_devices->seeding = 0; | 1992 | fs_devices->seeding = 0; |
| 1942 | fs_devices->num_devices = 0; | 1993 | fs_devices->num_devices = 0; |
| 1943 | fs_devices->open_devices = 0; | 1994 | fs_devices->open_devices = 0; |
| 1995 | fs_devices->missing_devices = 0; | ||
| 1996 | fs_devices->num_can_discard = 0; | ||
| 1997 | fs_devices->rotating = 0; | ||
| 1944 | fs_devices->seed = seed_devices; | 1998 | fs_devices->seed = seed_devices; |
| 1945 | 1999 | ||
| 1946 | generate_random_uuid(fs_devices->fsid); | 2000 | generate_random_uuid(fs_devices->fsid); |
| @@ -5800,7 +5854,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, | |||
| 5800 | else | 5854 | else |
| 5801 | generate_random_uuid(dev->uuid); | 5855 | generate_random_uuid(dev->uuid); |
| 5802 | 5856 | ||
| 5803 | btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); | 5857 | btrfs_init_work(&dev->work, btrfs_submit_helper, |
| 5858 | pending_bios_fn, NULL, NULL); | ||
| 5804 | 5859 | ||
| 5805 | return dev; | 5860 | return dev; |
| 5806 | } | 5861 | } |
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 603f18a65c12..a2172f3f69e3 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig | |||
| @@ -22,6 +22,11 @@ config CIFS | |||
| 22 | support for OS/2 and Windows ME and similar servers is provided as | 22 | support for OS/2 and Windows ME and similar servers is provided as |
| 23 | well. | 23 | well. |
| 24 | 24 | ||
| 25 | The module also provides optional support for the followon | ||
| 26 | protocols for CIFS including SMB3, which enables | ||
| 27 | useful performance and security features (see the description | ||
| 28 | of CONFIG_CIFS_SMB2). | ||
| 29 | |||
| 25 | The cifs module provides an advanced network file system | 30 | The cifs module provides an advanced network file system |
| 26 | client for mounting to CIFS compliant servers. It includes | 31 | client for mounting to CIFS compliant servers. It includes |
| 27 | support for DFS (hierarchical name space), secure per-user | 32 | support for DFS (hierarchical name space), secure per-user |
| @@ -121,7 +126,8 @@ config CIFS_ACL | |||
| 121 | depends on CIFS_XATTR && KEYS | 126 | depends on CIFS_XATTR && KEYS |
| 122 | help | 127 | help |
| 123 | Allows fetching CIFS/NTFS ACL from the server. The DACL blob | 128 | Allows fetching CIFS/NTFS ACL from the server. The DACL blob |
| 124 | is handed over to the application/caller. | 129 | is handed over to the application/caller. See the man |
| 130 | page for getcifsacl for more information. | ||
| 125 | 131 | ||
| 126 | config CIFS_DEBUG | 132 | config CIFS_DEBUG |
| 127 | bool "Enable CIFS debugging routines" | 133 | bool "Enable CIFS debugging routines" |
| @@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT | |||
| 162 | Allows NFS server to export a CIFS mounted share (nfsd over cifs) | 168 | Allows NFS server to export a CIFS mounted share (nfsd over cifs) |
| 163 | 169 | ||
| 164 | config CIFS_SMB2 | 170 | config CIFS_SMB2 |
| 165 | bool "SMB2 network file system support" | 171 | bool "SMB2 and SMB3 network file system support" |
| 166 | depends on CIFS && INET | 172 | depends on CIFS && INET |
| 167 | select NLS | 173 | select NLS |
| 168 | select KEYS | 174 | select KEYS |
| @@ -170,16 +176,21 @@ config CIFS_SMB2 | |||
| 170 | select DNS_RESOLVER | 176 | select DNS_RESOLVER |
| 171 | 177 | ||
| 172 | help | 178 | help |
| 173 | This enables experimental support for the SMB2 (Server Message Block | 179 | This enables support for the Server Message Block version 2 |
| 174 | version 2) protocol. The SMB2 protocol is the successor to the | 180 | family of protocols, including SMB3. SMB3 support is |
| 175 | popular CIFS and SMB network file sharing protocols. SMB2 is the | 181 | enabled on mount by specifying "vers=3.0" in the mount |
| 176 | native file sharing mechanism for recent versions of Windows | 182 | options. These protocols are the successors to the popular |
| 177 | operating systems (since Vista). SMB2 enablement will eventually | 183 | CIFS and SMB network file sharing protocols. SMB3 is the |
| 178 | allow users better performance, security and features, than would be | 184 | native file sharing mechanism for the more recent |
| 179 | possible with cifs. Note that smb2 mount options also are simpler | 185 | versions of Windows (Windows 8 and Windows 2012 and |
| 180 | (compared to cifs) due to protocol improvements. | 186 | later) and Samba server and many others support SMB3 well. |
| 181 | 187 | In general SMB3 enables better performance, security | |
| 182 | Unless you are a developer or tester, say N. | 188 | and features, than would be possible with CIFS (Note that |
| 189 | when mounting to Samba, due to the CIFS POSIX extensions, | ||
| 190 | CIFS mounts can provide slightly better POSIX compatibility | ||
| 191 | than SMB3 mounts do though). Note that SMB2/SMB3 mount | ||
| 192 | options are also slightly simpler (compared to CIFS) due | ||
| 193 | to protocol improvements. | ||
| 183 | 194 | ||
| 184 | config CIFS_FSCACHE | 195 | config CIFS_FSCACHE |
| 185 | bool "Provide CIFS client caching support" | 196 | bool "Provide CIFS client caching support" |
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ac4f260155c8..889b98455750 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
| @@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 207 | return 0; | 207 | return 0; |
| 208 | } | 208 | } |
| 209 | 209 | ||
| 210 | static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) | ||
| 211 | { | ||
| 212 | struct super_block *sb = file->f_path.dentry->d_sb; | ||
| 213 | struct cifs_sb_info *cifs_sb = CIFS_SB(sb); | ||
| 214 | struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); | ||
| 215 | struct TCP_Server_Info *server = tcon->ses->server; | ||
| 216 | |||
| 217 | if (server->ops->fallocate) | ||
| 218 | return server->ops->fallocate(file, tcon, mode, off, len); | ||
| 219 | |||
| 220 | return -EOPNOTSUPP; | ||
| 221 | } | ||
| 222 | |||
| 210 | static int cifs_permission(struct inode *inode, int mask) | 223 | static int cifs_permission(struct inode *inode, int mask) |
| 211 | { | 224 | { |
| 212 | struct cifs_sb_info *cifs_sb; | 225 | struct cifs_sb_info *cifs_sb; |
| @@ -812,8 +825,9 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) | |||
| 812 | if (!(S_ISREG(inode->i_mode))) | 825 | if (!(S_ISREG(inode->i_mode))) |
| 813 | return -EINVAL; | 826 | return -EINVAL; |
| 814 | 827 | ||
| 815 | /* check if file is oplocked */ | 828 | /* Check if file is oplocked if this is request for new lease */ |
| 816 | if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || | 829 | if (arg == F_UNLCK || |
| 830 | ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || | ||
| 817 | ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) | 831 | ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) |
| 818 | return generic_setlease(file, arg, lease); | 832 | return generic_setlease(file, arg, lease); |
| 819 | else if (tlink_tcon(cfile->tlink)->local_lease && | 833 | else if (tlink_tcon(cfile->tlink)->local_lease && |
| @@ -908,6 +922,7 @@ const struct file_operations cifs_file_ops = { | |||
| 908 | .unlocked_ioctl = cifs_ioctl, | 922 | .unlocked_ioctl = cifs_ioctl, |
| 909 | #endif /* CONFIG_CIFS_POSIX */ | 923 | #endif /* CONFIG_CIFS_POSIX */ |
| 910 | .setlease = cifs_setlease, | 924 | .setlease = cifs_setlease, |
| 925 | .fallocate = cifs_fallocate, | ||
| 911 | }; | 926 | }; |
| 912 | 927 | ||
| 913 | const struct file_operations cifs_file_strict_ops = { | 928 | const struct file_operations cifs_file_strict_ops = { |
| @@ -927,6 +942,7 @@ const struct file_operations cifs_file_strict_ops = { | |||
| 927 | .unlocked_ioctl = cifs_ioctl, | 942 | .unlocked_ioctl = cifs_ioctl, |
| 928 | #endif /* CONFIG_CIFS_POSIX */ | 943 | #endif /* CONFIG_CIFS_POSIX */ |
| 929 | .setlease = cifs_setlease, | 944 | .setlease = cifs_setlease, |
| 945 | .fallocate = cifs_fallocate, | ||
| 930 | }; | 946 | }; |
| 931 | 947 | ||
| 932 | const struct file_operations cifs_file_direct_ops = { | 948 | const struct file_operations cifs_file_direct_ops = { |
| @@ -947,6 +963,7 @@ const struct file_operations cifs_file_direct_ops = { | |||
| 947 | #endif /* CONFIG_CIFS_POSIX */ | 963 | #endif /* CONFIG_CIFS_POSIX */ |
| 948 | .llseek = cifs_llseek, | 964 | .llseek = cifs_llseek, |
| 949 | .setlease = cifs_setlease, | 965 | .setlease = cifs_setlease, |
| 966 | .fallocate = cifs_fallocate, | ||
| 950 | }; | 967 | }; |
| 951 | 968 | ||
| 952 | const struct file_operations cifs_file_nobrl_ops = { | 969 | const struct file_operations cifs_file_nobrl_ops = { |
| @@ -965,6 +982,7 @@ const struct file_operations cifs_file_nobrl_ops = { | |||
| 965 | .unlocked_ioctl = cifs_ioctl, | 982 | .unlocked_ioctl = cifs_ioctl, |
| 966 | #endif /* CONFIG_CIFS_POSIX */ | 983 | #endif /* CONFIG_CIFS_POSIX */ |
| 967 | .setlease = cifs_setlease, | 984 | .setlease = cifs_setlease, |
| 985 | .fallocate = cifs_fallocate, | ||
| 968 | }; | 986 | }; |
| 969 | 987 | ||
| 970 | const struct file_operations cifs_file_strict_nobrl_ops = { | 988 | const struct file_operations cifs_file_strict_nobrl_ops = { |
| @@ -983,6 +1001,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { | |||
| 983 | .unlocked_ioctl = cifs_ioctl, | 1001 | .unlocked_ioctl = cifs_ioctl, |
| 984 | #endif /* CONFIG_CIFS_POSIX */ | 1002 | #endif /* CONFIG_CIFS_POSIX */ |
| 985 | .setlease = cifs_setlease, | 1003 | .setlease = cifs_setlease, |
| 1004 | .fallocate = cifs_fallocate, | ||
| 986 | }; | 1005 | }; |
| 987 | 1006 | ||
| 988 | const struct file_operations cifs_file_direct_nobrl_ops = { | 1007 | const struct file_operations cifs_file_direct_nobrl_ops = { |
| @@ -1002,6 +1021,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { | |||
| 1002 | #endif /* CONFIG_CIFS_POSIX */ | 1021 | #endif /* CONFIG_CIFS_POSIX */ |
| 1003 | .llseek = cifs_llseek, | 1022 | .llseek = cifs_llseek, |
| 1004 | .setlease = cifs_setlease, | 1023 | .setlease = cifs_setlease, |
| 1024 | .fallocate = cifs_fallocate, | ||
| 1005 | }; | 1025 | }; |
| 1006 | 1026 | ||
| 1007 | const struct file_operations cifs_dir_ops = { | 1027 | const struct file_operations cifs_dir_ops = { |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0012e1e291d4..25b8392bfdd2 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
| @@ -70,11 +70,6 @@ | |||
| 70 | #define SERVER_NAME_LENGTH 40 | 70 | #define SERVER_NAME_LENGTH 40 |
| 71 | #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) | 71 | #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) |
| 72 | 72 | ||
| 73 | /* used to define string lengths for reversing unicode strings */ | ||
| 74 | /* (256+1)*2 = 514 */ | ||
| 75 | /* (max path length + 1 for null) * 2 for unicode */ | ||
| 76 | #define MAX_NAME 514 | ||
| 77 | |||
| 78 | /* SMB echo "timeout" -- FIXME: tunable? */ | 73 | /* SMB echo "timeout" -- FIXME: tunable? */ |
| 79 | #define SMB_ECHO_INTERVAL (60 * HZ) | 74 | #define SMB_ECHO_INTERVAL (60 * HZ) |
| 80 | 75 | ||
| @@ -409,6 +404,10 @@ struct smb_version_operations { | |||
| 409 | /* get mtu credits */ | 404 | /* get mtu credits */ |
| 410 | int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, | 405 | int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, |
| 411 | unsigned int *, unsigned int *); | 406 | unsigned int *, unsigned int *); |
| 407 | /* check if we need to issue closedir */ | ||
| 408 | bool (*dir_needs_close)(struct cifsFileInfo *); | ||
| 409 | long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, | ||
| 410 | loff_t); | ||
| 412 | }; | 411 | }; |
| 413 | 412 | ||
| 414 | struct smb_version_values { | 413 | struct smb_version_values { |
| @@ -883,6 +882,7 @@ struct cifs_tcon { | |||
| 883 | for this mount even if server would support */ | 882 | for this mount even if server would support */ |
| 884 | bool local_lease:1; /* check leases (only) on local system not remote */ | 883 | bool local_lease:1; /* check leases (only) on local system not remote */ |
| 885 | bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ | 884 | bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ |
| 885 | bool broken_sparse_sup; /* if server or share does not support sparse */ | ||
| 886 | bool need_reconnect:1; /* connection reset, tid now invalid */ | 886 | bool need_reconnect:1; /* connection reset, tid now invalid */ |
| 887 | #ifdef CONFIG_CIFS_SMB2 | 887 | #ifdef CONFIG_CIFS_SMB2 |
| 888 | bool print:1; /* set if connection to printer share */ | 888 | bool print:1; /* set if connection to printer share */ |
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 33df36ef9d52..5f9822ac0245 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h | |||
| @@ -2253,6 +2253,29 @@ typedef struct { | |||
| 2253 | /* minimum includes first three fields, and empty FS Name */ | 2253 | /* minimum includes first three fields, and empty FS Name */ |
| 2254 | #define MIN_FS_ATTR_INFO_SIZE 12 | 2254 | #define MIN_FS_ATTR_INFO_SIZE 12 |
| 2255 | 2255 | ||
| 2256 | |||
| 2257 | /* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ | ||
| 2258 | #define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 | ||
| 2259 | #define FILE_SUPPORTS_USN_JOURNAL 0x02000000 | ||
| 2260 | #define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 | ||
| 2261 | #define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000 | ||
| 2262 | #define FILE_SUPPORTS_HARD_LINKS 0x00400000 | ||
| 2263 | #define FILE_SUPPORTS_TRANSACTIONS 0x00200000 | ||
| 2264 | #define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000 | ||
| 2265 | #define FILE_READ_ONLY_VOLUME 0x00080000 | ||
| 2266 | #define FILE_NAMED_STREAMS 0x00040000 | ||
| 2267 | #define FILE_SUPPORTS_ENCRYPTION 0x00020000 | ||
| 2268 | #define FILE_SUPPORTS_OBJECT_IDS 0x00010000 | ||
| 2269 | #define FILE_VOLUME_IS_COMPRESSED 0x00008000 | ||
| 2270 | #define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 | ||
| 2271 | #define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 | ||
| 2272 | #define FILE_SUPPORTS_SPARSE_FILES 0x00000040 | ||
| 2273 | #define FILE_VOLUME_QUOTAS 0x00000020 | ||
| 2274 | #define FILE_FILE_COMPRESSION 0x00000010 | ||
| 2275 | #define FILE_PERSISTENT_ACLS 0x00000008 | ||
| 2276 | #define FILE_UNICODE_ON_DISK 0x00000004 | ||
| 2277 | #define FILE_CASE_PRESERVED_NAMES 0x00000002 | ||
| 2278 | #define FILE_CASE_SENSITIVE_SEARCH 0x00000001 | ||
| 2256 | typedef struct { | 2279 | typedef struct { |
| 2257 | __le32 Attributes; | 2280 | __le32 Attributes; |
| 2258 | __le32 MaxPathNameComponentLength; | 2281 | __le32 MaxPathNameComponentLength; |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 03ed8a09581c..8a9fded7c135 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
| @@ -837,7 +837,6 @@ cifs_demultiplex_thread(void *p) | |||
| 837 | struct TCP_Server_Info *server = p; | 837 | struct TCP_Server_Info *server = p; |
| 838 | unsigned int pdu_length; | 838 | unsigned int pdu_length; |
| 839 | char *buf = NULL; | 839 | char *buf = NULL; |
| 840 | struct task_struct *task_to_wake = NULL; | ||
| 841 | struct mid_q_entry *mid_entry; | 840 | struct mid_q_entry *mid_entry; |
| 842 | 841 | ||
| 843 | current->flags |= PF_MEMALLOC; | 842 | current->flags |= PF_MEMALLOC; |
| @@ -928,19 +927,7 @@ cifs_demultiplex_thread(void *p) | |||
| 928 | if (server->smallbuf) /* no sense logging a debug message if NULL */ | 927 | if (server->smallbuf) /* no sense logging a debug message if NULL */ |
| 929 | cifs_small_buf_release(server->smallbuf); | 928 | cifs_small_buf_release(server->smallbuf); |
| 930 | 929 | ||
| 931 | task_to_wake = xchg(&server->tsk, NULL); | ||
| 932 | clean_demultiplex_info(server); | 930 | clean_demultiplex_info(server); |
| 933 | |||
| 934 | /* if server->tsk was NULL then wait for a signal before exiting */ | ||
| 935 | if (!task_to_wake) { | ||
| 936 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 937 | while (!signal_pending(current)) { | ||
| 938 | schedule(); | ||
| 939 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 940 | } | ||
| 941 | set_current_state(TASK_RUNNING); | ||
| 942 | } | ||
| 943 | |||
| 944 | module_put_and_exit(0); | 931 | module_put_and_exit(0); |
| 945 | } | 932 | } |
| 946 | 933 | ||
| @@ -1600,6 +1587,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
| 1600 | tmp_end++; | 1587 | tmp_end++; |
| 1601 | if (!(tmp_end < end && tmp_end[1] == delim)) { | 1588 | if (!(tmp_end < end && tmp_end[1] == delim)) { |
| 1602 | /* No it is not. Set the password to NULL */ | 1589 | /* No it is not. Set the password to NULL */ |
| 1590 | kfree(vol->password); | ||
| 1603 | vol->password = NULL; | 1591 | vol->password = NULL; |
| 1604 | break; | 1592 | break; |
| 1605 | } | 1593 | } |
| @@ -1637,6 +1625,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
| 1637 | options = end; | 1625 | options = end; |
| 1638 | } | 1626 | } |
| 1639 | 1627 | ||
| 1628 | kfree(vol->password); | ||
| 1640 | /* Now build new password string */ | 1629 | /* Now build new password string */ |
| 1641 | temp_len = strlen(value); | 1630 | temp_len = strlen(value); |
| 1642 | vol->password = kzalloc(temp_len+1, GFP_KERNEL); | 1631 | vol->password = kzalloc(temp_len+1, GFP_KERNEL); |
| @@ -2061,8 +2050,6 @@ cifs_find_tcp_session(struct smb_vol *vol) | |||
| 2061 | static void | 2050 | static void |
| 2062 | cifs_put_tcp_session(struct TCP_Server_Info *server) | 2051 | cifs_put_tcp_session(struct TCP_Server_Info *server) |
| 2063 | { | 2052 | { |
| 2064 | struct task_struct *task; | ||
| 2065 | |||
| 2066 | spin_lock(&cifs_tcp_ses_lock); | 2053 | spin_lock(&cifs_tcp_ses_lock); |
| 2067 | if (--server->srv_count > 0) { | 2054 | if (--server->srv_count > 0) { |
| 2068 | spin_unlock(&cifs_tcp_ses_lock); | 2055 | spin_unlock(&cifs_tcp_ses_lock); |
| @@ -2086,10 +2073,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) | |||
| 2086 | kfree(server->session_key.response); | 2073 | kfree(server->session_key.response); |
| 2087 | server->session_key.response = NULL; | 2074 | server->session_key.response = NULL; |
| 2088 | server->session_key.len = 0; | 2075 | server->session_key.len = 0; |
| 2089 | |||
| 2090 | task = xchg(&server->tsk, NULL); | ||
| 2091 | if (task) | ||
| 2092 | force_sig(SIGKILL, task); | ||
| 2093 | } | 2076 | } |
| 2094 | 2077 | ||
| 2095 | static struct TCP_Server_Info * | 2078 | static struct TCP_Server_Info * |
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3db0c5fd9a11..6cbd9c688cfe 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c | |||
| @@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, | |||
| 497 | goto out; | 497 | goto out; |
| 498 | } | 498 | } |
| 499 | 499 | ||
| 500 | if (file->f_flags & O_DIRECT && | ||
| 501 | CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { | ||
| 502 | if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) | ||
| 503 | file->f_op = &cifs_file_direct_nobrl_ops; | ||
| 504 | else | ||
| 505 | file->f_op = &cifs_file_direct_ops; | ||
| 506 | } | ||
| 507 | |||
| 500 | file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); | 508 | file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); |
| 501 | if (file_info == NULL) { | 509 | if (file_info == NULL) { |
| 502 | if (server->ops->close) | 510 | if (server->ops->close) |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4ab2f79ffa7a..7c018a1c52f7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
| @@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file) | |||
| 467 | cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", | 467 | cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", |
| 468 | inode, file->f_flags, full_path); | 468 | inode, file->f_flags, full_path); |
| 469 | 469 | ||
| 470 | if (file->f_flags & O_DIRECT && | ||
| 471 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { | ||
| 472 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) | ||
| 473 | file->f_op = &cifs_file_direct_nobrl_ops; | ||
| 474 | else | ||
| 475 | file->f_op = &cifs_file_direct_ops; | ||
| 476 | } | ||
| 477 | |||
| 470 | if (server->oplocks) | 478 | if (server->oplocks) |
| 471 | oplock = REQ_OPLOCK; | 479 | oplock = REQ_OPLOCK; |
| 472 | else | 480 | else |
| @@ -762,7 +770,7 @@ int cifs_closedir(struct inode *inode, struct file *file) | |||
| 762 | 770 | ||
| 763 | cifs_dbg(FYI, "Freeing private data in close dir\n"); | 771 | cifs_dbg(FYI, "Freeing private data in close dir\n"); |
| 764 | spin_lock(&cifs_file_list_lock); | 772 | spin_lock(&cifs_file_list_lock); |
| 765 | if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { | 773 | if (server->ops->dir_needs_close(cfile)) { |
| 766 | cfile->invalidHandle = true; | 774 | cfile->invalidHandle = true; |
| 767 | spin_unlock(&cifs_file_list_lock); | 775 | spin_unlock(&cifs_file_list_lock); |
| 768 | if (server->ops->close_dir) | 776 | if (server->ops->close_dir) |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 426d6c6ad8bf..7899a40465b3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
| @@ -1720,13 +1720,22 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, | |||
| 1720 | unlink_target: | 1720 | unlink_target: |
| 1721 | /* Try unlinking the target dentry if it's not negative */ | 1721 | /* Try unlinking the target dentry if it's not negative */ |
| 1722 | if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { | 1722 | if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { |
| 1723 | tmprc = cifs_unlink(target_dir, target_dentry); | 1723 | if (d_is_dir(target_dentry)) |
| 1724 | tmprc = cifs_rmdir(target_dir, target_dentry); | ||
| 1725 | else | ||
| 1726 | tmprc = cifs_unlink(target_dir, target_dentry); | ||
| 1724 | if (tmprc) | 1727 | if (tmprc) |
| 1725 | goto cifs_rename_exit; | 1728 | goto cifs_rename_exit; |
| 1726 | rc = cifs_do_rename(xid, source_dentry, from_name, | 1729 | rc = cifs_do_rename(xid, source_dentry, from_name, |
| 1727 | target_dentry, to_name); | 1730 | target_dentry, to_name); |
| 1728 | } | 1731 | } |
| 1729 | 1732 | ||
| 1733 | /* force revalidate to go get info when needed */ | ||
| 1734 | CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; | ||
| 1735 | |||
| 1736 | source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = | ||
| 1737 | target_dir->i_mtime = current_fs_time(source_dir->i_sb); | ||
| 1738 | |||
| 1730 | cifs_rename_exit: | 1739 | cifs_rename_exit: |
| 1731 | kfree(info_buf_source); | 1740 | kfree(info_buf_source); |
| 1732 | kfree(from_name); | 1741 | kfree(from_name); |
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 81340c6253eb..b7415d596dbd 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c | |||
| @@ -574,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) | |||
| 574 | cinode->oplock = 0; | 574 | cinode->oplock = 0; |
| 575 | } | 575 | } |
| 576 | 576 | ||
| 577 | static int | ||
| 578 | cifs_oplock_break_wait(void *unused) | ||
| 579 | { | ||
| 580 | schedule(); | ||
| 581 | return signal_pending(current) ? -ERESTARTSYS : 0; | ||
| 582 | } | ||
| 583 | |||
| 584 | /* | 577 | /* |
| 585 | * We wait for oplock breaks to be processed before we attempt to perform | 578 | * We wait for oplock breaks to be processed before we attempt to perform |
| 586 | * writes. | 579 | * writes. |
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b15862e0f68c..b334a89d6a66 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
| @@ -593,11 +593,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, | |||
| 593 | /* close and restart search */ | 593 | /* close and restart search */ |
| 594 | cifs_dbg(FYI, "search backing up - close and restart search\n"); | 594 | cifs_dbg(FYI, "search backing up - close and restart search\n"); |
| 595 | spin_lock(&cifs_file_list_lock); | 595 | spin_lock(&cifs_file_list_lock); |
| 596 | if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { | 596 | if (server->ops->dir_needs_close(cfile)) { |
| 597 | cfile->invalidHandle = true; | 597 | cfile->invalidHandle = true; |
| 598 | spin_unlock(&cifs_file_list_lock); | 598 | spin_unlock(&cifs_file_list_lock); |
| 599 | if (server->ops->close) | 599 | if (server->ops->close_dir) |
| 600 | server->ops->close(xid, tcon, &cfile->fid); | 600 | server->ops->close_dir(xid, tcon, &cfile->fid); |
| 601 | } else | 601 | } else |
| 602 | spin_unlock(&cifs_file_list_lock); | 602 | spin_unlock(&cifs_file_list_lock); |
| 603 | if (cfile->srch_inf.ntwrk_buf_start) { | 603 | if (cfile->srch_inf.ntwrk_buf_start) { |
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 39ee32688eac..3a5e83317683 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c | |||
| @@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, | |||
| 243 | kfree(ses->serverOS); | 243 | kfree(ses->serverOS); |
| 244 | 244 | ||
| 245 | ses->serverOS = kzalloc(len + 1, GFP_KERNEL); | 245 | ses->serverOS = kzalloc(len + 1, GFP_KERNEL); |
| 246 | if (ses->serverOS) | 246 | if (ses->serverOS) { |
| 247 | strncpy(ses->serverOS, bcc_ptr, len); | 247 | strncpy(ses->serverOS, bcc_ptr, len); |
| 248 | if (strncmp(ses->serverOS, "OS/2", 4) == 0) | 248 | if (strncmp(ses->serverOS, "OS/2", 4) == 0) |
| 249 | cifs_dbg(FYI, "OS/2 server\n"); | 249 | cifs_dbg(FYI, "OS/2 server\n"); |
| 250 | } | ||
| 250 | 251 | ||
| 251 | bcc_ptr += len + 1; | 252 | bcc_ptr += len + 1; |
| 252 | bleft -= len + 1; | 253 | bleft -= len + 1; |
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 5e8c22d6c7b9..1a6df4b03f67 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c | |||
| @@ -1015,6 +1015,12 @@ cifs_wp_retry_size(struct inode *inode) | |||
| 1015 | return CIFS_SB(inode->i_sb)->wsize; | 1015 | return CIFS_SB(inode->i_sb)->wsize; |
| 1016 | } | 1016 | } |
| 1017 | 1017 | ||
| 1018 | static bool | ||
| 1019 | cifs_dir_needs_close(struct cifsFileInfo *cfile) | ||
| 1020 | { | ||
| 1021 | return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; | ||
| 1022 | } | ||
| 1023 | |||
| 1018 | struct smb_version_operations smb1_operations = { | 1024 | struct smb_version_operations smb1_operations = { |
| 1019 | .send_cancel = send_nt_cancel, | 1025 | .send_cancel = send_nt_cancel, |
| 1020 | .compare_fids = cifs_compare_fids, | 1026 | .compare_fids = cifs_compare_fids, |
| @@ -1086,6 +1092,7 @@ struct smb_version_operations smb1_operations = { | |||
| 1086 | .create_mf_symlink = cifs_create_mf_symlink, | 1092 | .create_mf_symlink = cifs_create_mf_symlink, |
| 1087 | .is_read_op = cifs_is_read_op, | 1093 | .is_read_op = cifs_is_read_op, |
| 1088 | .wp_retry_size = cifs_wp_retry_size, | 1094 | .wp_retry_size = cifs_wp_retry_size, |
| 1095 | .dir_needs_close = cifs_dir_needs_close, | ||
| 1089 | #ifdef CONFIG_CIFS_XATTR | 1096 | #ifdef CONFIG_CIFS_XATTR |
| 1090 | .query_all_EAs = CIFSSMBQAllEAs, | 1097 | .query_all_EAs = CIFSSMBQAllEAs, |
| 1091 | .set_EA = CIFSSMBSetEA, | 1098 | .set_EA = CIFSSMBSetEA, |
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 3f17b4550831..45992944e238 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c | |||
| @@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, | |||
| 50 | goto out; | 50 | goto out; |
| 51 | } | 51 | } |
| 52 | 52 | ||
| 53 | smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, | 53 | smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, |
| 54 | GFP_KERNEL); | 54 | GFP_KERNEL); |
| 55 | if (smb2_data == NULL) { | 55 | if (smb2_data == NULL) { |
| 56 | rc = -ENOMEM; | 56 | rc = -ENOMEM; |
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 0150182a4494..899bbc86f73e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c | |||
| @@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 131 | *adjust_tz = false; | 131 | *adjust_tz = false; |
| 132 | *symlink = false; | 132 | *symlink = false; |
| 133 | 133 | ||
| 134 | smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, | 134 | smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, |
| 135 | GFP_KERNEL); | 135 | GFP_KERNEL); |
| 136 | if (smb2_data == NULL) | 136 | if (smb2_data == NULL) |
| 137 | return -ENOMEM; | 137 | return -ENOMEM; |
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index e31a9dfdcd39..af59d03db492 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c | |||
| @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { | |||
| 214 | {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, | 214 | {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, |
| 215 | {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, | 215 | {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, |
| 216 | {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, | 216 | {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, |
| 217 | {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, | 217 | {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, |
| 218 | {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, | 218 | {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, |
| 219 | {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, | 219 | {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, |
| 220 | {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, | 220 | {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, |
| @@ -298,7 +298,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { | |||
| 298 | {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, | 298 | {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, |
| 299 | {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, | 299 | {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, |
| 300 | {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, | 300 | {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, |
| 301 | {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"}, | 301 | {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"}, |
| 302 | {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, | 302 | {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, |
| 303 | {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, | 303 | {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, |
| 304 | {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, | 304 | {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, |
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index f2e6ac29a8d6..4aa7a0f07d6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c | |||
| @@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length) | |||
| 178 | /* Windows 7 server returns 24 bytes more */ | 178 | /* Windows 7 server returns 24 bytes more */ |
| 179 | if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) | 179 | if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) |
| 180 | return 0; | 180 | return 0; |
| 181 | /* server can return one byte more */ | 181 | /* server can return one byte more due to implied bcc[0] */ |
| 182 | if (clc_len == 4 + len + 1) | 182 | if (clc_len == 4 + len + 1) |
| 183 | return 0; | 183 | return 0; |
| 184 | |||
| 185 | /* | ||
| 186 | * MacOS server pads after SMB2.1 write response with 3 bytes | ||
| 187 | * of junk. Other servers match RFC1001 len to actual | ||
| 188 | * SMB2/SMB3 frame length (header + smb2 response specific data) | ||
| 189 | * Log the server error (once), but allow it and continue | ||
| 190 | * since the frame is parseable. | ||
| 191 | */ | ||
| 192 | if (clc_len < 4 /* RFC1001 header size */ + len) { | ||
| 193 | printk_once(KERN_WARNING | ||
| 194 | "SMB2 server sent bad RFC1001 len %d not %d\n", | ||
| 195 | len, clc_len - 4); | ||
| 196 | return 0; | ||
| 197 | } | ||
| 198 | |||
| 184 | return 1; | 199 | return 1; |
| 185 | } | 200 | } |
| 186 | return 0; | 201 | return 0; |
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 77f8aeb9c2fc..f522193b7184 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c | |||
| @@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 389 | int rc; | 389 | int rc; |
| 390 | struct smb2_file_all_info *smb2_data; | 390 | struct smb2_file_all_info *smb2_data; |
| 391 | 391 | ||
| 392 | smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, | 392 | smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, |
| 393 | GFP_KERNEL); | 393 | GFP_KERNEL); |
| 394 | if (smb2_data == NULL) | 394 | if (smb2_data == NULL) |
| 395 | return -ENOMEM; | 395 | return -ENOMEM; |
| @@ -731,11 +731,72 @@ smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, | |||
| 731 | return SMB2_write(xid, parms, written, iov, nr_segs); | 731 | return SMB2_write(xid, parms, written, iov, nr_segs); |
| 732 | } | 732 | } |
| 733 | 733 | ||
| 734 | /* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */ | ||
| 735 | static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, | ||
| 736 | struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse) | ||
| 737 | { | ||
| 738 | struct cifsInodeInfo *cifsi; | ||
| 739 | int rc; | ||
| 740 | |||
| 741 | cifsi = CIFS_I(inode); | ||
| 742 | |||
| 743 | /* if file already sparse don't bother setting sparse again */ | ||
| 744 | if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse) | ||
| 745 | return true; /* already sparse */ | ||
| 746 | |||
| 747 | if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse) | ||
| 748 | return true; /* already not sparse */ | ||
| 749 | |||
| 750 | /* | ||
| 751 | * Can't check for sparse support on share the usual way via the | ||
| 752 | * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share | ||
| 753 | * since Samba server doesn't set the flag on the share, yet | ||
| 754 | * supports the set sparse FSCTL and returns sparse correctly | ||
| 755 | * in the file attributes. If we fail setting sparse though we | ||
| 756 | * mark that server does not support sparse files for this share | ||
| 757 | * to avoid repeatedly sending the unsupported fsctl to server | ||
| 758 | * if the file is repeatedly extended. | ||
| 759 | */ | ||
| 760 | if (tcon->broken_sparse_sup) | ||
| 761 | return false; | ||
| 762 | |||
| 763 | rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, | ||
| 764 | cfile->fid.volatile_fid, FSCTL_SET_SPARSE, | ||
| 765 | true /* is_fctl */, &setsparse, 1, NULL, NULL); | ||
| 766 | if (rc) { | ||
| 767 | tcon->broken_sparse_sup = true; | ||
| 768 | cifs_dbg(FYI, "set sparse rc = %d\n", rc); | ||
| 769 | return false; | ||
| 770 | } | ||
| 771 | |||
| 772 | if (setsparse) | ||
| 773 | cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE; | ||
| 774 | else | ||
| 775 | cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE); | ||
| 776 | |||
| 777 | return true; | ||
| 778 | } | ||
| 779 | |||
| 734 | static int | 780 | static int |
| 735 | smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, | 781 | smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, |
| 736 | struct cifsFileInfo *cfile, __u64 size, bool set_alloc) | 782 | struct cifsFileInfo *cfile, __u64 size, bool set_alloc) |
| 737 | { | 783 | { |
| 738 | __le64 eof = cpu_to_le64(size); | 784 | __le64 eof = cpu_to_le64(size); |
| 785 | struct inode *inode; | ||
| 786 | |||
| 787 | /* | ||
| 788 | * If extending file more than one page make sparse. Many Linux fs | ||
| 789 | * make files sparse by default when extending via ftruncate | ||
| 790 | */ | ||
| 791 | inode = cfile->dentry->d_inode; | ||
| 792 | |||
| 793 | if (!set_alloc && (size > inode->i_size + 8192)) { | ||
| 794 | __u8 set_sparse = 1; | ||
| 795 | |||
| 796 | /* whether set sparse succeeds or not, extend the file */ | ||
| 797 | smb2_set_sparse(xid, tcon, cfile, inode, set_sparse); | ||
| 798 | } | ||
| 799 | |||
| 739 | return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, | 800 | return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, |
| 740 | cfile->fid.volatile_fid, cfile->pid, &eof, false); | 801 | cfile->fid.volatile_fid, cfile->pid, &eof, false); |
| 741 | } | 802 | } |
| @@ -954,6 +1015,105 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 954 | return rc; | 1015 | return rc; |
| 955 | } | 1016 | } |
| 956 | 1017 | ||
| 1018 | static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, | ||
| 1019 | loff_t offset, loff_t len, bool keep_size) | ||
| 1020 | { | ||
| 1021 | struct inode *inode; | ||
| 1022 | struct cifsInodeInfo *cifsi; | ||
| 1023 | struct cifsFileInfo *cfile = file->private_data; | ||
| 1024 | struct file_zero_data_information fsctl_buf; | ||
| 1025 | long rc; | ||
| 1026 | unsigned int xid; | ||
| 1027 | |||
| 1028 | xid = get_xid(); | ||
| 1029 | |||
| 1030 | inode = cfile->dentry->d_inode; | ||
| 1031 | cifsi = CIFS_I(inode); | ||
| 1032 | |||
| 1033 | /* if file not oplocked can't be sure whether asking to extend size */ | ||
| 1034 | if (!CIFS_CACHE_READ(cifsi)) | ||
| 1035 | if (keep_size == false) | ||
| 1036 | return -EOPNOTSUPP; | ||
| 1037 | |||
| 1038 | /* | ||
| 1039 | * Must check if file sparse since fallocate -z (zero range) assumes | ||
| 1040 | * non-sparse allocation | ||
| 1041 | */ | ||
| 1042 | if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) | ||
| 1043 | return -EOPNOTSUPP; | ||
| 1044 | |||
| 1045 | /* | ||
| 1046 | * need to make sure we are not asked to extend the file since the SMB3 | ||
| 1047 | * fsctl does not change the file size. In the future we could change | ||
| 1048 | * this to zero the first part of the range then set the file size | ||
| 1049 | * which for a non sparse file would zero the newly extended range | ||
| 1050 | */ | ||
| 1051 | if (keep_size == false) | ||
| 1052 | if (i_size_read(inode) < offset + len) | ||
| 1053 | return -EOPNOTSUPP; | ||
| 1054 | |||
| 1055 | cifs_dbg(FYI, "offset %lld len %lld", offset, len); | ||
| 1056 | |||
| 1057 | fsctl_buf.FileOffset = cpu_to_le64(offset); | ||
| 1058 | fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); | ||
| 1059 | |||
| 1060 | rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, | ||
| 1061 | cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, | ||
| 1062 | true /* is_fctl */, (char *)&fsctl_buf, | ||
| 1063 | sizeof(struct file_zero_data_information), NULL, NULL); | ||
| 1064 | free_xid(xid); | ||
| 1065 | return rc; | ||
| 1066 | } | ||
| 1067 | |||
| 1068 | static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, | ||
| 1069 | loff_t offset, loff_t len) | ||
| 1070 | { | ||
| 1071 | struct inode *inode; | ||
| 1072 | struct cifsInodeInfo *cifsi; | ||
| 1073 | struct cifsFileInfo *cfile = file->private_data; | ||
| 1074 | struct file_zero_data_information fsctl_buf; | ||
| 1075 | long rc; | ||
| 1076 | unsigned int xid; | ||
| 1077 | __u8 set_sparse = 1; | ||
| 1078 | |||
| 1079 | xid = get_xid(); | ||
| 1080 | |||
| 1081 | inode = cfile->dentry->d_inode; | ||
| 1082 | cifsi = CIFS_I(inode); | ||
| 1083 | |||
| 1084 | /* Need to make file sparse, if not already, before freeing range. */ | ||
| 1085 | /* Consider adding equivalent for compressed since it could also work */ | ||
| 1086 | if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) | ||
| 1087 | return -EOPNOTSUPP; | ||
| 1088 | |||
| 1089 | cifs_dbg(FYI, "offset %lld len %lld", offset, len); | ||
| 1090 | |||
| 1091 | fsctl_buf.FileOffset = cpu_to_le64(offset); | ||
| 1092 | fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); | ||
| 1093 | |||
| 1094 | rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, | ||
| 1095 | cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, | ||
| 1096 | true /* is_fctl */, (char *)&fsctl_buf, | ||
| 1097 | sizeof(struct file_zero_data_information), NULL, NULL); | ||
| 1098 | free_xid(xid); | ||
| 1099 | return rc; | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, | ||
| 1103 | loff_t off, loff_t len) | ||
| 1104 | { | ||
| 1105 | /* KEEP_SIZE already checked for by do_fallocate */ | ||
| 1106 | if (mode & FALLOC_FL_PUNCH_HOLE) | ||
| 1107 | return smb3_punch_hole(file, tcon, off, len); | ||
| 1108 | else if (mode & FALLOC_FL_ZERO_RANGE) { | ||
| 1109 | if (mode & FALLOC_FL_KEEP_SIZE) | ||
| 1110 | return smb3_zero_range(file, tcon, off, len, true); | ||
| 1111 | return smb3_zero_range(file, tcon, off, len, false); | ||
| 1112 | } | ||
| 1113 | |||
| 1114 | return -EOPNOTSUPP; | ||
| 1115 | } | ||
| 1116 | |||
| 957 | static void | 1117 | static void |
| 958 | smb2_downgrade_oplock(struct TCP_Server_Info *server, | 1118 | smb2_downgrade_oplock(struct TCP_Server_Info *server, |
| 959 | struct cifsInodeInfo *cinode, bool set_level2) | 1119 | struct cifsInodeInfo *cinode, bool set_level2) |
| @@ -1161,6 +1321,12 @@ smb2_wp_retry_size(struct inode *inode) | |||
| 1161 | SMB2_MAX_BUFFER_SIZE); | 1321 | SMB2_MAX_BUFFER_SIZE); |
| 1162 | } | 1322 | } |
| 1163 | 1323 | ||
| 1324 | static bool | ||
| 1325 | smb2_dir_needs_close(struct cifsFileInfo *cfile) | ||
| 1326 | { | ||
| 1327 | return !cfile->invalidHandle; | ||
| 1328 | } | ||
| 1329 | |||
| 1164 | struct smb_version_operations smb20_operations = { | 1330 | struct smb_version_operations smb20_operations = { |
| 1165 | .compare_fids = smb2_compare_fids, | 1331 | .compare_fids = smb2_compare_fids, |
| 1166 | .setup_request = smb2_setup_request, | 1332 | .setup_request = smb2_setup_request, |
| @@ -1236,6 +1402,7 @@ struct smb_version_operations smb20_operations = { | |||
| 1236 | .parse_lease_buf = smb2_parse_lease_buf, | 1402 | .parse_lease_buf = smb2_parse_lease_buf, |
| 1237 | .clone_range = smb2_clone_range, | 1403 | .clone_range = smb2_clone_range, |
| 1238 | .wp_retry_size = smb2_wp_retry_size, | 1404 | .wp_retry_size = smb2_wp_retry_size, |
| 1405 | .dir_needs_close = smb2_dir_needs_close, | ||
| 1239 | }; | 1406 | }; |
| 1240 | 1407 | ||
| 1241 | struct smb_version_operations smb21_operations = { | 1408 | struct smb_version_operations smb21_operations = { |
| @@ -1313,6 +1480,7 @@ struct smb_version_operations smb21_operations = { | |||
| 1313 | .parse_lease_buf = smb2_parse_lease_buf, | 1480 | .parse_lease_buf = smb2_parse_lease_buf, |
| 1314 | .clone_range = smb2_clone_range, | 1481 | .clone_range = smb2_clone_range, |
| 1315 | .wp_retry_size = smb2_wp_retry_size, | 1482 | .wp_retry_size = smb2_wp_retry_size, |
| 1483 | .dir_needs_close = smb2_dir_needs_close, | ||
| 1316 | }; | 1484 | }; |
| 1317 | 1485 | ||
| 1318 | struct smb_version_operations smb30_operations = { | 1486 | struct smb_version_operations smb30_operations = { |
| @@ -1393,6 +1561,8 @@ struct smb_version_operations smb30_operations = { | |||
| 1393 | .clone_range = smb2_clone_range, | 1561 | .clone_range = smb2_clone_range, |
| 1394 | .validate_negotiate = smb3_validate_negotiate, | 1562 | .validate_negotiate = smb3_validate_negotiate, |
| 1395 | .wp_retry_size = smb2_wp_retry_size, | 1563 | .wp_retry_size = smb2_wp_retry_size, |
| 1564 | .dir_needs_close = smb2_dir_needs_close, | ||
| 1565 | .fallocate = smb3_fallocate, | ||
| 1396 | }; | 1566 | }; |
| 1397 | 1567 | ||
| 1398 | struct smb_version_values smb20_values = { | 1568 | struct smb_version_values smb20_values = { |
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 42ebc1a8be6c..74b3a6684383 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c | |||
| @@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, | |||
| 530 | struct smb2_sess_setup_rsp *rsp = NULL; | 530 | struct smb2_sess_setup_rsp *rsp = NULL; |
| 531 | struct kvec iov[2]; | 531 | struct kvec iov[2]; |
| 532 | int rc = 0; | 532 | int rc = 0; |
| 533 | int resp_buftype; | 533 | int resp_buftype = CIFS_NO_BUFFER; |
| 534 | __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ | 534 | __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ |
| 535 | struct TCP_Server_Info *server = ses->server; | 535 | struct TCP_Server_Info *server = ses->server; |
| 536 | u16 blob_length = 0; | 536 | u16 blob_length = 0; |
| @@ -907,7 +907,8 @@ tcon_exit: | |||
| 907 | tcon_error_exit: | 907 | tcon_error_exit: |
| 908 | if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { | 908 | if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { |
| 909 | cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); | 909 | cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); |
| 910 | tcon->bad_network_name = true; | 910 | if (tcon) |
| 911 | tcon->bad_network_name = true; | ||
| 911 | } | 912 | } |
| 912 | goto tcon_exit; | 913 | goto tcon_exit; |
| 913 | } | 914 | } |
| @@ -1224,7 +1225,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, | |||
| 1224 | 1225 | ||
| 1225 | cifs_dbg(FYI, "SMB2 IOCTL\n"); | 1226 | cifs_dbg(FYI, "SMB2 IOCTL\n"); |
| 1226 | 1227 | ||
| 1227 | *out_data = NULL; | 1228 | if (out_data != NULL) |
| 1229 | *out_data = NULL; | ||
| 1230 | |||
| 1228 | /* zero out returned data len, in case of error */ | 1231 | /* zero out returned data len, in case of error */ |
| 1229 | if (plen) | 1232 | if (plen) |
| 1230 | *plen = 0; | 1233 | *plen = 0; |
| @@ -1400,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 1400 | rsp = (struct smb2_close_rsp *)iov[0].iov_base; | 1403 | rsp = (struct smb2_close_rsp *)iov[0].iov_base; |
| 1401 | 1404 | ||
| 1402 | if (rc != 0) { | 1405 | if (rc != 0) { |
| 1403 | if (tcon) | 1406 | cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); |
| 1404 | cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); | ||
| 1405 | goto close_exit; | 1407 | goto close_exit; |
| 1406 | } | 1408 | } |
| 1407 | 1409 | ||
| @@ -1530,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 1530 | { | 1532 | { |
| 1531 | return query_info(xid, tcon, persistent_fid, volatile_fid, | 1533 | return query_info(xid, tcon, persistent_fid, volatile_fid, |
| 1532 | FILE_ALL_INFORMATION, | 1534 | FILE_ALL_INFORMATION, |
| 1533 | sizeof(struct smb2_file_all_info) + MAX_NAME * 2, | 1535 | sizeof(struct smb2_file_all_info) + PATH_MAX * 2, |
| 1534 | sizeof(struct smb2_file_all_info), data); | 1536 | sizeof(struct smb2_file_all_info), data); |
| 1535 | } | 1537 | } |
| 1536 | 1538 | ||
| @@ -2177,6 +2179,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 2177 | rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; | 2179 | rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; |
| 2178 | 2180 | ||
| 2179 | if (rc) { | 2181 | if (rc) { |
| 2182 | if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { | ||
| 2183 | srch_inf->endOfSearch = true; | ||
| 2184 | rc = 0; | ||
| 2185 | } | ||
| 2180 | cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); | 2186 | cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); |
| 2181 | goto qdir_exit; | 2187 | goto qdir_exit; |
| 2182 | } | 2188 | } |
| @@ -2214,11 +2220,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, | |||
| 2214 | else | 2220 | else |
| 2215 | cifs_dbg(VFS, "illegal search buffer type\n"); | 2221 | cifs_dbg(VFS, "illegal search buffer type\n"); |
| 2216 | 2222 | ||
| 2217 | if (rsp->hdr.Status == STATUS_NO_MORE_FILES) | ||
| 2218 | srch_inf->endOfSearch = 1; | ||
| 2219 | else | ||
| 2220 | srch_inf->endOfSearch = 0; | ||
| 2221 | |||
| 2222 | return rc; | 2223 | return rc; |
| 2223 | 2224 | ||
| 2224 | qdir_exit: | 2225 | qdir_exit: |
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 69f3595d3952..fbe486c285a9 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h | |||
| @@ -573,6 +573,12 @@ struct copychunk_ioctl { | |||
| 573 | __u32 Reserved2; | 573 | __u32 Reserved2; |
| 574 | } __packed; | 574 | } __packed; |
| 575 | 575 | ||
| 576 | /* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ | ||
| 577 | struct file_zero_data_information { | ||
| 578 | __le64 FileOffset; | ||
| 579 | __le64 BeyondFinalZero; | ||
| 580 | } __packed; | ||
| 581 | |||
| 576 | struct copychunk_ioctl_rsp { | 582 | struct copychunk_ioctl_rsp { |
| 577 | __le32 ChunksWritten; | 583 | __le32 ChunksWritten; |
| 578 | __le32 ChunkBytesWritten; | 584 | __le32 ChunkBytesWritten; |
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 0e538b5c9622..83efa59535be 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h | |||
| @@ -63,7 +63,7 @@ | |||
| 63 | #define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ | 63 | #define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ |
| 64 | #define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ | 64 | #define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ |
| 65 | #define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ | 65 | #define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ |
| 66 | #define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ | 66 | #define FSCTL_SET_ZERO_DATA 0x000980C8 |
| 67 | #define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ | 67 | #define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ |
| 68 | #define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ | 68 | #define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ |
| 69 | #define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ | 69 | #define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ |
diff --git a/fs/dcache.c b/fs/dcache.c index d30ce699ae4b..7a5b51440afa 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
| @@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, | |||
| 106 | unsigned int hash) | 106 | unsigned int hash) |
| 107 | { | 107 | { |
| 108 | hash += (unsigned long) parent / L1_CACHE_BYTES; | 108 | hash += (unsigned long) parent / L1_CACHE_BYTES; |
| 109 | hash = hash + (hash >> d_hash_shift); | 109 | return dentry_hashtable + hash_32(hash, d_hash_shift); |
| 110 | return dentry_hashtable + (hash & d_hash_mask); | ||
| 111 | } | 110 | } |
| 112 | 111 | ||
| 113 | /* Statistics gathering. */ | 112 | /* Statistics gathering. */ |
| @@ -2656,6 +2655,12 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) | |||
| 2656 | dentry->d_parent = dentry; | 2655 | dentry->d_parent = dentry; |
| 2657 | list_del_init(&dentry->d_u.d_child); | 2656 | list_del_init(&dentry->d_u.d_child); |
| 2658 | anon->d_parent = dparent; | 2657 | anon->d_parent = dparent; |
| 2658 | if (likely(!d_unhashed(anon))) { | ||
| 2659 | hlist_bl_lock(&anon->d_sb->s_anon); | ||
| 2660 | __hlist_bl_del(&anon->d_hash); | ||
| 2661 | anon->d_hash.pprev = NULL; | ||
| 2662 | hlist_bl_unlock(&anon->d_sb->s_anon); | ||
| 2663 | } | ||
| 2659 | list_move(&anon->d_u.d_child, &dparent->d_subdirs); | 2664 | list_move(&anon->d_u.d_child, &dparent->d_subdirs); |
| 2660 | 2665 | ||
| 2661 | write_seqcount_end(&dentry->d_seq); | 2666 | write_seqcount_end(&dentry->d_seq); |
| @@ -2714,7 +2719,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) | |||
| 2714 | write_seqlock(&rename_lock); | 2719 | write_seqlock(&rename_lock); |
| 2715 | __d_materialise_dentry(dentry, new); | 2720 | __d_materialise_dentry(dentry, new); |
| 2716 | write_sequnlock(&rename_lock); | 2721 | write_sequnlock(&rename_lock); |
| 2717 | __d_drop(new); | ||
| 2718 | _d_rehash(new); | 2722 | _d_rehash(new); |
| 2719 | spin_unlock(&new->d_lock); | 2723 | spin_unlock(&new->d_lock); |
| 2720 | spin_unlock(&inode->i_lock); | 2724 | spin_unlock(&inode->i_lock); |
| @@ -2778,7 +2782,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) | |||
| 2778 | * could splice into our tree? */ | 2782 | * could splice into our tree? */ |
| 2779 | __d_materialise_dentry(dentry, alias); | 2783 | __d_materialise_dentry(dentry, alias); |
| 2780 | write_sequnlock(&rename_lock); | 2784 | write_sequnlock(&rename_lock); |
| 2781 | __d_drop(alias); | ||
| 2782 | goto found; | 2785 | goto found; |
| 2783 | } else { | 2786 | } else { |
| 2784 | /* Nope, but we must(!) avoid directory | 2787 | /* Nope, but we must(!) avoid directory |
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b10b48c2a7af..7bcfff900f05 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
| @@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
| 1852 | goto error_tgt_fput; | 1852 | goto error_tgt_fput; |
| 1853 | 1853 | ||
| 1854 | /* Check if EPOLLWAKEUP is allowed */ | 1854 | /* Check if EPOLLWAKEUP is allowed */ |
| 1855 | ep_take_care_of_epollwakeup(&epds); | 1855 | if (ep_op_has_event(op)) |
| 1856 | ep_take_care_of_epollwakeup(&epds); | ||
| 1856 | 1857 | ||
| 1857 | /* | 1858 | /* |
| 1858 | * We have to check that the file structure underneath the file descriptor | 1859 | * We have to check that the file structure underneath the file descriptor |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 08cdfe5461e3..622e88249024 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -2828,8 +2828,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) | |||
| 2828 | */ | 2828 | */ |
| 2829 | overhead += ngroups * (2 + sbi->s_itb_per_group); | 2829 | overhead += ngroups * (2 + sbi->s_itb_per_group); |
| 2830 | 2830 | ||
| 2831 | /* Add the journal blocks as well */ | 2831 | /* Add the internal journal blocks as well */ |
| 2832 | overhead += sbi->s_journal->j_maxlen; | 2832 | if (sbi->s_journal && !sbi->journal_bdev) |
| 2833 | overhead += sbi->s_journal->j_maxlen; | ||
| 2833 | 2834 | ||
| 2834 | sbi->s_overhead_last = overhead; | 2835 | sbi->s_overhead_last = overhead; |
| 2835 | smp_wmb(); | 2836 | smp_wmb(); |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5b19760b1de5..b0c225cdb52c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) | |||
| 1825 | /* | 1825 | /* |
| 1826 | * Special error return code only used by dx_probe() and its callers. | 1826 | * Special error return code only used by dx_probe() and its callers. |
| 1827 | */ | 1827 | */ |
| 1828 | #define ERR_BAD_DX_DIR -75000 | 1828 | #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) |
| 1829 | 1829 | ||
| 1830 | /* | 1830 | /* |
| 1831 | * Timeout and state flag for lazy initialization inode thread. | 1831 | * Timeout and state flag for lazy initialization inode thread. |
| @@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) | |||
| 2454 | up_write(&EXT4_I(inode)->i_data_sem); | 2454 | up_write(&EXT4_I(inode)->i_data_sem); |
| 2455 | } | 2455 | } |
| 2456 | 2456 | ||
| 2457 | /* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ | ||
| 2458 | static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) | ||
| 2459 | { | ||
| 2460 | int changed = 0; | ||
| 2461 | |||
| 2462 | if (newsize > inode->i_size) { | ||
| 2463 | i_size_write(inode, newsize); | ||
| 2464 | changed = 1; | ||
| 2465 | } | ||
| 2466 | if (newsize > EXT4_I(inode)->i_disksize) { | ||
| 2467 | ext4_update_i_disksize(inode, newsize); | ||
| 2468 | changed |= 2; | ||
| 2469 | } | ||
| 2470 | return changed; | ||
| 2471 | } | ||
| 2472 | |||
| 2457 | struct ext4_group_info { | 2473 | struct ext4_group_info { |
| 2458 | unsigned long bb_state; | 2474 | unsigned long bb_state; |
| 2459 | struct rb_root bb_free_root; | 2475 | struct rb_root bb_free_root; |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 76c2df382b7d..74292a71b384 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -4665,7 +4665,8 @@ retry: | |||
| 4665 | } | 4665 | } |
| 4666 | 4666 | ||
| 4667 | static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, | 4667 | static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, |
| 4668 | ext4_lblk_t len, int flags, int mode) | 4668 | ext4_lblk_t len, loff_t new_size, |
| 4669 | int flags, int mode) | ||
| 4669 | { | 4670 | { |
| 4670 | struct inode *inode = file_inode(file); | 4671 | struct inode *inode = file_inode(file); |
| 4671 | handle_t *handle; | 4672 | handle_t *handle; |
| @@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, | |||
| 4674 | int retries = 0; | 4675 | int retries = 0; |
| 4675 | struct ext4_map_blocks map; | 4676 | struct ext4_map_blocks map; |
| 4676 | unsigned int credits; | 4677 | unsigned int credits; |
| 4678 | loff_t epos; | ||
| 4677 | 4679 | ||
| 4678 | map.m_lblk = offset; | 4680 | map.m_lblk = offset; |
| 4681 | map.m_len = len; | ||
| 4679 | /* | 4682 | /* |
| 4680 | * Don't normalize the request if it can fit in one extent so | 4683 | * Don't normalize the request if it can fit in one extent so |
| 4681 | * that it doesn't get unnecessarily split into multiple | 4684 | * that it doesn't get unnecessarily split into multiple |
| @@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, | |||
| 4690 | credits = ext4_chunk_trans_blocks(inode, len); | 4693 | credits = ext4_chunk_trans_blocks(inode, len); |
| 4691 | 4694 | ||
| 4692 | retry: | 4695 | retry: |
| 4693 | while (ret >= 0 && ret < len) { | 4696 | while (ret >= 0 && len) { |
| 4694 | map.m_lblk = map.m_lblk + ret; | ||
| 4695 | map.m_len = len = len - ret; | ||
| 4696 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, | 4697 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, |
| 4697 | credits); | 4698 | credits); |
| 4698 | if (IS_ERR(handle)) { | 4699 | if (IS_ERR(handle)) { |
| @@ -4709,6 +4710,21 @@ retry: | |||
| 4709 | ret2 = ext4_journal_stop(handle); | 4710 | ret2 = ext4_journal_stop(handle); |
| 4710 | break; | 4711 | break; |
| 4711 | } | 4712 | } |
| 4713 | map.m_lblk += ret; | ||
| 4714 | map.m_len = len = len - ret; | ||
| 4715 | epos = (loff_t)map.m_lblk << inode->i_blkbits; | ||
| 4716 | inode->i_ctime = ext4_current_time(inode); | ||
| 4717 | if (new_size) { | ||
| 4718 | if (epos > new_size) | ||
| 4719 | epos = new_size; | ||
| 4720 | if (ext4_update_inode_size(inode, epos) & 0x1) | ||
| 4721 | inode->i_mtime = inode->i_ctime; | ||
| 4722 | } else { | ||
| 4723 | if (epos > inode->i_size) | ||
| 4724 | ext4_set_inode_flag(inode, | ||
| 4725 | EXT4_INODE_EOFBLOCKS); | ||
| 4726 | } | ||
| 4727 | ext4_mark_inode_dirty(handle, inode); | ||
| 4712 | ret2 = ext4_journal_stop(handle); | 4728 | ret2 = ext4_journal_stop(handle); |
| 4713 | if (ret2) | 4729 | if (ret2) |
| 4714 | break; | 4730 | break; |
| @@ -4731,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4731 | loff_t new_size = 0; | 4747 | loff_t new_size = 0; |
| 4732 | int ret = 0; | 4748 | int ret = 0; |
| 4733 | int flags; | 4749 | int flags; |
| 4734 | int partial; | 4750 | int credits; |
| 4751 | int partial_begin, partial_end; | ||
| 4735 | loff_t start, end; | 4752 | loff_t start, end; |
| 4736 | ext4_lblk_t lblk; | 4753 | ext4_lblk_t lblk; |
| 4737 | struct address_space *mapping = inode->i_mapping; | 4754 | struct address_space *mapping = inode->i_mapping; |
| @@ -4771,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4771 | 4788 | ||
| 4772 | if (start < offset || end > offset + len) | 4789 | if (start < offset || end > offset + len) |
| 4773 | return -EINVAL; | 4790 | return -EINVAL; |
| 4774 | partial = (offset + len) & ((1 << blkbits) - 1); | 4791 | partial_begin = offset & ((1 << blkbits) - 1); |
| 4792 | partial_end = (offset + len) & ((1 << blkbits) - 1); | ||
| 4775 | 4793 | ||
| 4776 | lblk = start >> blkbits; | 4794 | lblk = start >> blkbits; |
| 4777 | max_blocks = (end >> blkbits); | 4795 | max_blocks = (end >> blkbits); |
| @@ -4805,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4805 | * If we have a partial block after EOF we have to allocate | 4823 | * If we have a partial block after EOF we have to allocate |
| 4806 | * the entire block. | 4824 | * the entire block. |
| 4807 | */ | 4825 | */ |
| 4808 | if (partial) | 4826 | if (partial_end) |
| 4809 | max_blocks += 1; | 4827 | max_blocks += 1; |
| 4810 | } | 4828 | } |
| 4811 | 4829 | ||
| @@ -4813,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4813 | 4831 | ||
| 4814 | /* Now release the pages and zero block aligned part of pages*/ | 4832 | /* Now release the pages and zero block aligned part of pages*/ |
| 4815 | truncate_pagecache_range(inode, start, end - 1); | 4833 | truncate_pagecache_range(inode, start, end - 1); |
| 4834 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
| 4816 | 4835 | ||
| 4817 | /* Wait all existing dio workers, newcomers will block on i_mutex */ | 4836 | /* Wait all existing dio workers, newcomers will block on i_mutex */ |
| 4818 | ext4_inode_block_unlocked_dio(inode); | 4837 | ext4_inode_block_unlocked_dio(inode); |
| @@ -4825,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4825 | if (ret) | 4844 | if (ret) |
| 4826 | goto out_dio; | 4845 | goto out_dio; |
| 4827 | 4846 | ||
| 4828 | ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, | 4847 | ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, |
| 4829 | mode); | 4848 | flags, mode); |
| 4830 | if (ret) | 4849 | if (ret) |
| 4831 | goto out_dio; | 4850 | goto out_dio; |
| 4832 | } | 4851 | } |
| 4852 | if (!partial_begin && !partial_end) | ||
| 4853 | goto out_dio; | ||
| 4833 | 4854 | ||
| 4834 | handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); | 4855 | /* |
| 4856 | * In worst case we have to writeout two nonadjacent unwritten | ||
| 4857 | * blocks and update the inode | ||
| 4858 | */ | ||
| 4859 | credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; | ||
| 4860 | if (ext4_should_journal_data(inode)) | ||
| 4861 | credits += 2; | ||
| 4862 | handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); | ||
| 4835 | if (IS_ERR(handle)) { | 4863 | if (IS_ERR(handle)) { |
| 4836 | ret = PTR_ERR(handle); | 4864 | ret = PTR_ERR(handle); |
| 4837 | ext4_std_error(inode->i_sb, ret); | 4865 | ext4_std_error(inode->i_sb, ret); |
| @@ -4839,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4839 | } | 4867 | } |
| 4840 | 4868 | ||
| 4841 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 4869 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
| 4842 | |||
| 4843 | if (new_size) { | 4870 | if (new_size) { |
| 4844 | if (new_size > i_size_read(inode)) | 4871 | ext4_update_inode_size(inode, new_size); |
| 4845 | i_size_write(inode, new_size); | ||
| 4846 | if (new_size > EXT4_I(inode)->i_disksize) | ||
| 4847 | ext4_update_i_disksize(inode, new_size); | ||
| 4848 | } else { | 4872 | } else { |
| 4849 | /* | 4873 | /* |
| 4850 | * Mark that we allocate beyond EOF so the subsequent truncate | 4874 | * Mark that we allocate beyond EOF so the subsequent truncate |
| @@ -4853,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4853 | if ((offset + len) > i_size_read(inode)) | 4877 | if ((offset + len) > i_size_read(inode)) |
| 4854 | ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); | 4878 | ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); |
| 4855 | } | 4879 | } |
| 4856 | |||
| 4857 | ext4_mark_inode_dirty(handle, inode); | 4880 | ext4_mark_inode_dirty(handle, inode); |
| 4858 | 4881 | ||
| 4859 | /* Zero out partial block at the edges of the range */ | 4882 | /* Zero out partial block at the edges of the range */ |
| @@ -4880,13 +4903,11 @@ out_mutex: | |||
| 4880 | long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) | 4903 | long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) |
| 4881 | { | 4904 | { |
| 4882 | struct inode *inode = file_inode(file); | 4905 | struct inode *inode = file_inode(file); |
| 4883 | handle_t *handle; | ||
| 4884 | loff_t new_size = 0; | 4906 | loff_t new_size = 0; |
| 4885 | unsigned int max_blocks; | 4907 | unsigned int max_blocks; |
| 4886 | int ret = 0; | 4908 | int ret = 0; |
| 4887 | int flags; | 4909 | int flags; |
| 4888 | ext4_lblk_t lblk; | 4910 | ext4_lblk_t lblk; |
| 4889 | struct timespec tv; | ||
| 4890 | unsigned int blkbits = inode->i_blkbits; | 4911 | unsigned int blkbits = inode->i_blkbits; |
| 4891 | 4912 | ||
| 4892 | /* Return error if mode is not supported */ | 4913 | /* Return error if mode is not supported */ |
| @@ -4937,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) | |||
| 4937 | goto out; | 4958 | goto out; |
| 4938 | } | 4959 | } |
| 4939 | 4960 | ||
| 4940 | ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); | 4961 | ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, |
| 4962 | flags, mode); | ||
| 4941 | if (ret) | 4963 | if (ret) |
| 4942 | goto out; | 4964 | goto out; |
| 4943 | 4965 | ||
| 4944 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | 4966 | if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { |
| 4945 | if (IS_ERR(handle)) | 4967 | ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, |
| 4946 | goto out; | 4968 | EXT4_I(inode)->i_sync_tid); |
| 4947 | |||
| 4948 | tv = inode->i_ctime = ext4_current_time(inode); | ||
| 4949 | |||
| 4950 | if (new_size) { | ||
| 4951 | if (new_size > i_size_read(inode)) { | ||
| 4952 | i_size_write(inode, new_size); | ||
| 4953 | inode->i_mtime = tv; | ||
| 4954 | } | ||
| 4955 | if (new_size > EXT4_I(inode)->i_disksize) | ||
| 4956 | ext4_update_i_disksize(inode, new_size); | ||
| 4957 | } else { | ||
| 4958 | /* | ||
| 4959 | * Mark that we allocate beyond EOF so the subsequent truncate | ||
| 4960 | * can proceed even if the new size is the same as i_size. | ||
| 4961 | */ | ||
| 4962 | if ((offset + len) > i_size_read(inode)) | ||
| 4963 | ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); | ||
| 4964 | } | 4969 | } |
| 4965 | ext4_mark_inode_dirty(handle, inode); | ||
| 4966 | if (file->f_flags & O_SYNC) | ||
| 4967 | ext4_handle_sync(handle); | ||
| 4968 | |||
| 4969 | ext4_journal_stop(handle); | ||
| 4970 | out: | 4970 | out: |
| 4971 | mutex_unlock(&inode->i_mutex); | 4971 | mutex_unlock(&inode->i_mutex); |
| 4972 | trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); | 4972 | trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 367a60c07cf0..3aa26e9117c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file, | |||
| 1055 | } else | 1055 | } else |
| 1056 | copied = block_write_end(file, mapping, pos, | 1056 | copied = block_write_end(file, mapping, pos, |
| 1057 | len, copied, page, fsdata); | 1057 | len, copied, page, fsdata); |
| 1058 | |||
| 1059 | /* | 1058 | /* |
| 1060 | * No need to use i_size_read() here, the i_size | 1059 | * it's important to update i_size while still holding page lock: |
| 1061 | * cannot change under us because we hole i_mutex. | ||
| 1062 | * | ||
| 1063 | * But it's important to update i_size while still holding page lock: | ||
| 1064 | * page writeout could otherwise come in and zero beyond i_size. | 1060 | * page writeout could otherwise come in and zero beyond i_size. |
| 1065 | */ | 1061 | */ |
| 1066 | if (pos + copied > inode->i_size) { | 1062 | i_size_changed = ext4_update_inode_size(inode, pos + copied); |
| 1067 | i_size_write(inode, pos + copied); | ||
| 1068 | i_size_changed = 1; | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | if (pos + copied > EXT4_I(inode)->i_disksize) { | ||
| 1072 | /* We need to mark inode dirty even if | ||
| 1073 | * new_i_size is less that inode->i_size | ||
| 1074 | * but greater than i_disksize. (hint delalloc) | ||
| 1075 | */ | ||
| 1076 | ext4_update_i_disksize(inode, (pos + copied)); | ||
| 1077 | i_size_changed = 1; | ||
| 1078 | } | ||
| 1079 | unlock_page(page); | 1063 | unlock_page(page); |
| 1080 | page_cache_release(page); | 1064 | page_cache_release(page); |
| 1081 | 1065 | ||
| @@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file, | |||
| 1123 | int ret = 0, ret2; | 1107 | int ret = 0, ret2; |
| 1124 | int partial = 0; | 1108 | int partial = 0; |
| 1125 | unsigned from, to; | 1109 | unsigned from, to; |
| 1126 | loff_t new_i_size; | 1110 | int size_changed = 0; |
| 1127 | 1111 | ||
| 1128 | trace_ext4_journalled_write_end(inode, pos, len, copied); | 1112 | trace_ext4_journalled_write_end(inode, pos, len, copied); |
| 1129 | from = pos & (PAGE_CACHE_SIZE - 1); | 1113 | from = pos & (PAGE_CACHE_SIZE - 1); |
| @@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file, | |||
| 1146 | if (!partial) | 1130 | if (!partial) |
| 1147 | SetPageUptodate(page); | 1131 | SetPageUptodate(page); |
| 1148 | } | 1132 | } |
| 1149 | new_i_size = pos + copied; | 1133 | size_changed = ext4_update_inode_size(inode, pos + copied); |
| 1150 | if (new_i_size > inode->i_size) | ||
| 1151 | i_size_write(inode, pos+copied); | ||
| 1152 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 1134 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
| 1153 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | 1135 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; |
| 1154 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 1136 | unlock_page(page); |
| 1155 | ext4_update_i_disksize(inode, new_i_size); | 1137 | page_cache_release(page); |
| 1138 | |||
| 1139 | if (size_changed) { | ||
| 1156 | ret2 = ext4_mark_inode_dirty(handle, inode); | 1140 | ret2 = ext4_mark_inode_dirty(handle, inode); |
| 1157 | if (!ret) | 1141 | if (!ret) |
| 1158 | ret = ret2; | 1142 | ret = ret2; |
| 1159 | } | 1143 | } |
| 1160 | 1144 | ||
| 1161 | unlock_page(page); | ||
| 1162 | page_cache_release(page); | ||
| 1163 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1145 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
| 1164 | /* if we have allocated more blocks and copied | 1146 | /* if we have allocated more blocks and copied |
| 1165 | * less. We will have blocks allocated outside | 1147 | * less. We will have blocks allocated outside |
| @@ -2095,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, | |||
| 2095 | struct ext4_map_blocks *map = &mpd->map; | 2077 | struct ext4_map_blocks *map = &mpd->map; |
| 2096 | int err; | 2078 | int err; |
| 2097 | loff_t disksize; | 2079 | loff_t disksize; |
| 2080 | int progress = 0; | ||
| 2098 | 2081 | ||
| 2099 | mpd->io_submit.io_end->offset = | 2082 | mpd->io_submit.io_end->offset = |
| 2100 | ((loff_t)map->m_lblk) << inode->i_blkbits; | 2083 | ((loff_t)map->m_lblk) << inode->i_blkbits; |
| @@ -2111,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle, | |||
| 2111 | * is non-zero, a commit should free up blocks. | 2094 | * is non-zero, a commit should free up blocks. |
| 2112 | */ | 2095 | */ |
| 2113 | if ((err == -ENOMEM) || | 2096 | if ((err == -ENOMEM) || |
| 2114 | (err == -ENOSPC && ext4_count_free_clusters(sb))) | 2097 | (err == -ENOSPC && ext4_count_free_clusters(sb))) { |
| 2098 | if (progress) | ||
| 2099 | goto update_disksize; | ||
| 2115 | return err; | 2100 | return err; |
| 2101 | } | ||
| 2116 | ext4_msg(sb, KERN_CRIT, | 2102 | ext4_msg(sb, KERN_CRIT, |
| 2117 | "Delayed block allocation failed for " | 2103 | "Delayed block allocation failed for " |
| 2118 | "inode %lu at logical offset %llu with" | 2104 | "inode %lu at logical offset %llu with" |
| @@ -2129,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle, | |||
| 2129 | *give_up_on_write = true; | 2115 | *give_up_on_write = true; |
| 2130 | return err; | 2116 | return err; |
| 2131 | } | 2117 | } |
| 2118 | progress = 1; | ||
| 2132 | /* | 2119 | /* |
| 2133 | * Update buffer state, submit mapped pages, and get us new | 2120 | * Update buffer state, submit mapped pages, and get us new |
| 2134 | * extent to map | 2121 | * extent to map |
| 2135 | */ | 2122 | */ |
| 2136 | err = mpage_map_and_submit_buffers(mpd); | 2123 | err = mpage_map_and_submit_buffers(mpd); |
| 2137 | if (err < 0) | 2124 | if (err < 0) |
| 2138 | return err; | 2125 | goto update_disksize; |
| 2139 | } while (map->m_len); | 2126 | } while (map->m_len); |
| 2140 | 2127 | ||
| 2128 | update_disksize: | ||
| 2141 | /* | 2129 | /* |
| 2142 | * Update on-disk size after IO is submitted. Races with | 2130 | * Update on-disk size after IO is submitted. Races with |
| 2143 | * truncate are avoided by checking i_size under i_data_sem. | 2131 | * truncate are avoided by checking i_size under i_data_sem. |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 956027711faf..8b0f9ef517d6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
| @@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
| 1412 | int last = first + count - 1; | 1412 | int last = first + count - 1; |
| 1413 | struct super_block *sb = e4b->bd_sb; | 1413 | struct super_block *sb = e4b->bd_sb; |
| 1414 | 1414 | ||
| 1415 | if (WARN_ON(count == 0)) | ||
| 1416 | return; | ||
| 1415 | BUG_ON(last >= (sb->s_blocksize << 3)); | 1417 | BUG_ON(last >= (sb->s_blocksize << 3)); |
| 1416 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); | 1418 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); |
| 1417 | /* Don't bother if the block group is corrupt. */ | 1419 | /* Don't bother if the block group is corrupt. */ |
| @@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) | |||
| 3221 | int err; | 3223 | int err; |
| 3222 | 3224 | ||
| 3223 | if (pa == NULL) { | 3225 | if (pa == NULL) { |
| 3226 | if (ac->ac_f_ex.fe_len == 0) | ||
| 3227 | return; | ||
| 3224 | err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); | 3228 | err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); |
| 3225 | if (err) { | 3229 | if (err) { |
| 3226 | /* | 3230 | /* |
| @@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) | |||
| 3235 | mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, | 3239 | mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, |
| 3236 | ac->ac_f_ex.fe_len); | 3240 | ac->ac_f_ex.fe_len); |
| 3237 | ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); | 3241 | ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); |
| 3242 | ext4_mb_unload_buddy(&e4b); | ||
| 3238 | return; | 3243 | return; |
| 3239 | } | 3244 | } |
| 3240 | if (pa->pa_type == MB_INODE_PA) | 3245 | if (pa->pa_type == MB_INODE_PA) |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b147a67baa0d..603e4ebbd0ac 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, | |||
| 1227 | buffer */ | 1227 | buffer */ |
| 1228 | int num = 0; | 1228 | int num = 0; |
| 1229 | ext4_lblk_t nblocks; | 1229 | ext4_lblk_t nblocks; |
| 1230 | int i, err; | 1230 | int i, err = 0; |
| 1231 | int namelen; | 1231 | int namelen; |
| 1232 | 1232 | ||
| 1233 | *res_dir = NULL; | 1233 | *res_dir = NULL; |
| @@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, | |||
| 1264 | * return. Otherwise, fall back to doing a search the | 1264 | * return. Otherwise, fall back to doing a search the |
| 1265 | * old fashioned way. | 1265 | * old fashioned way. |
| 1266 | */ | 1266 | */ |
| 1267 | if (bh || (err != ERR_BAD_DX_DIR)) | 1267 | if (err == -ENOENT) |
| 1268 | return NULL; | ||
| 1269 | if (err && err != ERR_BAD_DX_DIR) | ||
| 1270 | return ERR_PTR(err); | ||
| 1271 | if (bh) | ||
| 1268 | return bh; | 1272 | return bh; |
| 1269 | dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " | 1273 | dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " |
| 1270 | "falling back\n")); | 1274 | "falling back\n")); |
| @@ -1295,6 +1299,11 @@ restart: | |||
| 1295 | } | 1299 | } |
| 1296 | num++; | 1300 | num++; |
| 1297 | bh = ext4_getblk(NULL, dir, b++, 0, &err); | 1301 | bh = ext4_getblk(NULL, dir, b++, 0, &err); |
| 1302 | if (unlikely(err)) { | ||
| 1303 | if (ra_max == 0) | ||
| 1304 | return ERR_PTR(err); | ||
| 1305 | break; | ||
| 1306 | } | ||
| 1298 | bh_use[ra_max] = bh; | 1307 | bh_use[ra_max] = bh; |
| 1299 | if (bh) | 1308 | if (bh) |
| 1300 | ll_rw_block(READ | REQ_META | REQ_PRIO, | 1309 | ll_rw_block(READ | REQ_META | REQ_PRIO, |
| @@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi | |||
| 1417 | return ERR_PTR(-ENAMETOOLONG); | 1426 | return ERR_PTR(-ENAMETOOLONG); |
| 1418 | 1427 | ||
| 1419 | bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); | 1428 | bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); |
| 1429 | if (IS_ERR(bh)) | ||
| 1430 | return (struct dentry *) bh; | ||
| 1420 | inode = NULL; | 1431 | inode = NULL; |
| 1421 | if (bh) { | 1432 | if (bh) { |
| 1422 | __u32 ino = le32_to_cpu(de->inode); | 1433 | __u32 ino = le32_to_cpu(de->inode); |
| @@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child) | |||
| 1450 | struct buffer_head *bh; | 1461 | struct buffer_head *bh; |
| 1451 | 1462 | ||
| 1452 | bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); | 1463 | bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); |
| 1464 | if (IS_ERR(bh)) | ||
| 1465 | return (struct dentry *) bh; | ||
| 1453 | if (!bh) | 1466 | if (!bh) |
| 1454 | return ERR_PTR(-ENOENT); | 1467 | return ERR_PTR(-ENOENT); |
| 1455 | ino = le32_to_cpu(de->inode); | 1468 | ino = le32_to_cpu(de->inode); |
| @@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 2727 | 2740 | ||
| 2728 | retval = -ENOENT; | 2741 | retval = -ENOENT; |
| 2729 | bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); | 2742 | bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); |
| 2743 | if (IS_ERR(bh)) | ||
| 2744 | return PTR_ERR(bh); | ||
| 2730 | if (!bh) | 2745 | if (!bh) |
| 2731 | goto end_rmdir; | 2746 | goto end_rmdir; |
| 2732 | 2747 | ||
| @@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) | |||
| 2794 | 2809 | ||
| 2795 | retval = -ENOENT; | 2810 | retval = -ENOENT; |
| 2796 | bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); | 2811 | bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); |
| 2812 | if (IS_ERR(bh)) | ||
| 2813 | return PTR_ERR(bh); | ||
| 2797 | if (!bh) | 2814 | if (!bh) |
| 2798 | goto end_unlink; | 2815 | goto end_unlink; |
| 2799 | 2816 | ||
| @@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, | |||
| 3121 | struct ext4_dir_entry_2 *de; | 3138 | struct ext4_dir_entry_2 *de; |
| 3122 | 3139 | ||
| 3123 | bh = ext4_find_entry(dir, d_name, &de, NULL); | 3140 | bh = ext4_find_entry(dir, d_name, &de, NULL); |
| 3141 | if (IS_ERR(bh)) | ||
| 3142 | return PTR_ERR(bh); | ||
| 3124 | if (bh) { | 3143 | if (bh) { |
| 3125 | retval = ext4_delete_entry(handle, dir, de, bh); | 3144 | retval = ext4_delete_entry(handle, dir, de, bh); |
| 3126 | brelse(bh); | 3145 | brelse(bh); |
| @@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, | |||
| 3128 | return retval; | 3147 | return retval; |
| 3129 | } | 3148 | } |
| 3130 | 3149 | ||
| 3131 | static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) | 3150 | static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, |
| 3151 | int force_reread) | ||
| 3132 | { | 3152 | { |
| 3133 | int retval; | 3153 | int retval; |
| 3134 | /* | 3154 | /* |
| @@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) | |||
| 3140 | if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || | 3160 | if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || |
| 3141 | ent->de->name_len != ent->dentry->d_name.len || | 3161 | ent->de->name_len != ent->dentry->d_name.len || |
| 3142 | strncmp(ent->de->name, ent->dentry->d_name.name, | 3162 | strncmp(ent->de->name, ent->dentry->d_name.name, |
| 3143 | ent->de->name_len)) { | 3163 | ent->de->name_len) || |
| 3164 | force_reread) { | ||
| 3144 | retval = ext4_find_delete_entry(handle, ent->dir, | 3165 | retval = ext4_find_delete_entry(handle, ent->dir, |
| 3145 | &ent->dentry->d_name); | 3166 | &ent->dentry->d_name); |
| 3146 | } else { | 3167 | } else { |
| @@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3191 | .dentry = new_dentry, | 3212 | .dentry = new_dentry, |
| 3192 | .inode = new_dentry->d_inode, | 3213 | .inode = new_dentry->d_inode, |
| 3193 | }; | 3214 | }; |
| 3215 | int force_reread; | ||
| 3194 | int retval; | 3216 | int retval; |
| 3195 | 3217 | ||
| 3196 | dquot_initialize(old.dir); | 3218 | dquot_initialize(old.dir); |
| @@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3202 | dquot_initialize(new.inode); | 3224 | dquot_initialize(new.inode); |
| 3203 | 3225 | ||
| 3204 | old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); | 3226 | old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); |
| 3227 | if (IS_ERR(old.bh)) | ||
| 3228 | return PTR_ERR(old.bh); | ||
| 3205 | /* | 3229 | /* |
| 3206 | * Check for inode number is _not_ due to possible IO errors. | 3230 | * Check for inode number is _not_ due to possible IO errors. |
| 3207 | * We might rmdir the source, keep it as pwd of some process | 3231 | * We might rmdir the source, keep it as pwd of some process |
| @@ -3214,6 +3238,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3214 | 3238 | ||
| 3215 | new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, | 3239 | new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, |
| 3216 | &new.de, &new.inlined); | 3240 | &new.de, &new.inlined); |
| 3241 | if (IS_ERR(new.bh)) { | ||
| 3242 | retval = PTR_ERR(new.bh); | ||
| 3243 | new.bh = NULL; | ||
| 3244 | goto end_rename; | ||
| 3245 | } | ||
| 3217 | if (new.bh) { | 3246 | if (new.bh) { |
| 3218 | if (!new.inode) { | 3247 | if (!new.inode) { |
| 3219 | brelse(new.bh); | 3248 | brelse(new.bh); |
| @@ -3246,6 +3275,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3246 | if (retval) | 3275 | if (retval) |
| 3247 | goto end_rename; | 3276 | goto end_rename; |
| 3248 | } | 3277 | } |
| 3278 | /* | ||
| 3279 | * If we're renaming a file within an inline_data dir and adding or | ||
| 3280 | * setting the new dirent causes a conversion from inline_data to | ||
| 3281 | * extents/blockmap, we need to force the dirent delete code to | ||
| 3282 | * re-read the directory, or else we end up trying to delete a dirent | ||
| 3283 | * from what is now the extent tree root (or a block map). | ||
| 3284 | */ | ||
| 3285 | force_reread = (new.dir->i_ino == old.dir->i_ino && | ||
| 3286 | ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); | ||
| 3249 | if (!new.bh) { | 3287 | if (!new.bh) { |
| 3250 | retval = ext4_add_entry(handle, new.dentry, old.inode); | 3288 | retval = ext4_add_entry(handle, new.dentry, old.inode); |
| 3251 | if (retval) | 3289 | if (retval) |
| @@ -3256,6 +3294,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3256 | if (retval) | 3294 | if (retval) |
| 3257 | goto end_rename; | 3295 | goto end_rename; |
| 3258 | } | 3296 | } |
| 3297 | if (force_reread) | ||
| 3298 | force_reread = !ext4_test_inode_flag(new.dir, | ||
| 3299 | EXT4_INODE_INLINE_DATA); | ||
| 3259 | 3300 | ||
| 3260 | /* | 3301 | /* |
| 3261 | * Like most other Unix systems, set the ctime for inodes on a | 3302 | * Like most other Unix systems, set the ctime for inodes on a |
| @@ -3267,7 +3308,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3267 | /* | 3308 | /* |
| 3268 | * ok, that's it | 3309 | * ok, that's it |
| 3269 | */ | 3310 | */ |
| 3270 | ext4_rename_delete(handle, &old); | 3311 | ext4_rename_delete(handle, &old, force_reread); |
| 3271 | 3312 | ||
| 3272 | if (new.inode) { | 3313 | if (new.inode) { |
| 3273 | ext4_dec_count(handle, new.inode); | 3314 | ext4_dec_count(handle, new.inode); |
| @@ -3330,6 +3371,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3330 | 3371 | ||
| 3331 | old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, | 3372 | old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, |
| 3332 | &old.de, &old.inlined); | 3373 | &old.de, &old.inlined); |
| 3374 | if (IS_ERR(old.bh)) | ||
| 3375 | return PTR_ERR(old.bh); | ||
| 3333 | /* | 3376 | /* |
| 3334 | * Check for inode number is _not_ due to possible IO errors. | 3377 | * Check for inode number is _not_ due to possible IO errors. |
| 3335 | * We might rmdir the source, keep it as pwd of some process | 3378 | * We might rmdir the source, keep it as pwd of some process |
| @@ -3342,6 +3385,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3342 | 3385 | ||
| 3343 | new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, | 3386 | new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, |
| 3344 | &new.de, &new.inlined); | 3387 | &new.de, &new.inlined); |
| 3388 | if (IS_ERR(new.bh)) { | ||
| 3389 | retval = PTR_ERR(new.bh); | ||
| 3390 | new.bh = NULL; | ||
| 3391 | goto end_rename; | ||
| 3392 | } | ||
| 3345 | 3393 | ||
| 3346 | /* RENAME_EXCHANGE case: old *and* new must both exist */ | 3394 | /* RENAME_EXCHANGE case: old *and* new must both exist */ |
| 3347 | if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) | 3395 | if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bb0e80f03e2e..1e43b905ff98 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
| @@ -575,6 +575,7 @@ handle_bb: | |||
| 575 | bh = bclean(handle, sb, block); | 575 | bh = bclean(handle, sb, block); |
| 576 | if (IS_ERR(bh)) { | 576 | if (IS_ERR(bh)) { |
| 577 | err = PTR_ERR(bh); | 577 | err = PTR_ERR(bh); |
| 578 | bh = NULL; | ||
| 578 | goto out; | 579 | goto out; |
| 579 | } | 580 | } |
| 580 | overhead = ext4_group_overhead_blocks(sb, group); | 581 | overhead = ext4_group_overhead_blocks(sb, group); |
| @@ -603,6 +604,7 @@ handle_ib: | |||
| 603 | bh = bclean(handle, sb, block); | 604 | bh = bclean(handle, sb, block); |
| 604 | if (IS_ERR(bh)) { | 605 | if (IS_ERR(bh)) { |
| 605 | err = PTR_ERR(bh); | 606 | err = PTR_ERR(bh); |
| 607 | bh = NULL; | ||
| 606 | goto out; | 608 | goto out; |
| 607 | } | 609 | } |
| 608 | 610 | ||
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 32b43ad154b9..0b28b36e7915 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb) | |||
| 3181 | 3181 | ||
| 3182 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3182 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
| 3183 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | 3183 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { |
| 3184 | /* journal checksum v2 */ | 3184 | /* journal checksum v3 */ |
| 3185 | compat = 0; | 3185 | compat = 0; |
| 3186 | incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; | 3186 | incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; |
| 3187 | } else { | 3187 | } else { |
| 3188 | /* journal checksum v1 */ | 3188 | /* journal checksum v1 */ |
| 3189 | compat = JBD2_FEATURE_COMPAT_CHECKSUM; | 3189 | compat = JBD2_FEATURE_COMPAT_CHECKSUM; |
| @@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) | |||
| 3205 | jbd2_journal_clear_features(sbi->s_journal, | 3205 | jbd2_journal_clear_features(sbi->s_journal, |
| 3206 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | 3206 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, |
| 3207 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | | 3207 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | |
| 3208 | JBD2_FEATURE_INCOMPAT_CSUM_V3 | | ||
| 3208 | JBD2_FEATURE_INCOMPAT_CSUM_V2); | 3209 | JBD2_FEATURE_INCOMPAT_CSUM_V2); |
| 3209 | } | 3210 | } |
| 3210 | 3211 | ||
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 214fe1054fce..736a348509f7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig | |||
| @@ -23,7 +23,7 @@ config F2FS_STAT_FS | |||
| 23 | mounted as f2fs. Each file shows the whole f2fs information. | 23 | mounted as f2fs. Each file shows the whole f2fs information. |
| 24 | 24 | ||
| 25 | /sys/kernel/debug/f2fs/status includes: | 25 | /sys/kernel/debug/f2fs/status includes: |
| 26 | - major file system information managed by f2fs currently | 26 | - major filesystem information managed by f2fs currently |
| 27 | - average SIT information about whole segments | 27 | - average SIT information about whole segments |
| 28 | - current memory footprint consumed by f2fs. | 28 | - current memory footprint consumed by f2fs. |
| 29 | 29 | ||
| @@ -68,6 +68,6 @@ config F2FS_CHECK_FS | |||
| 68 | bool "F2FS consistency checking feature" | 68 | bool "F2FS consistency checking feature" |
| 69 | depends on F2FS_FS | 69 | depends on F2FS_FS |
| 70 | help | 70 | help |
| 71 | Enables BUG_ONs which check the file system consistency in runtime. | 71 | Enables BUG_ONs which check the filesystem consistency in runtime. |
| 72 | 72 | ||
| 73 | If you want to improve the performance, say N. | 73 | If you want to improve the performance, say N. |
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6aeed5bada52..ec3b7a5381fa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c | |||
| @@ -160,14 +160,11 @@ static int f2fs_write_meta_page(struct page *page, | |||
| 160 | goto redirty_out; | 160 | goto redirty_out; |
| 161 | if (wbc->for_reclaim) | 161 | if (wbc->for_reclaim) |
| 162 | goto redirty_out; | 162 | goto redirty_out; |
| 163 | 163 | if (unlikely(f2fs_cp_error(sbi))) | |
| 164 | /* Should not write any meta pages, if any IO error was occurred */ | 164 | goto redirty_out; |
| 165 | if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) | ||
| 166 | goto no_write; | ||
| 167 | 165 | ||
| 168 | f2fs_wait_on_page_writeback(page, META); | 166 | f2fs_wait_on_page_writeback(page, META); |
| 169 | write_meta_page(sbi, page); | 167 | write_meta_page(sbi, page); |
| 170 | no_write: | ||
| 171 | dec_page_count(sbi, F2FS_DIRTY_META); | 168 | dec_page_count(sbi, F2FS_DIRTY_META); |
| 172 | unlock_page(page); | 169 | unlock_page(page); |
| 173 | return 0; | 170 | return 0; |
| @@ -348,7 +345,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) | |||
| 348 | return e ? true : false; | 345 | return e ? true : false; |
| 349 | } | 346 | } |
| 350 | 347 | ||
| 351 | static void release_dirty_inode(struct f2fs_sb_info *sbi) | 348 | void release_dirty_inode(struct f2fs_sb_info *sbi) |
| 352 | { | 349 | { |
| 353 | struct ino_entry *e, *tmp; | 350 | struct ino_entry *e, *tmp; |
| 354 | int i; | 351 | int i; |
| @@ -446,8 +443,8 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) | |||
| 446 | struct f2fs_orphan_block *orphan_blk = NULL; | 443 | struct f2fs_orphan_block *orphan_blk = NULL; |
| 447 | unsigned int nentries = 0; | 444 | unsigned int nentries = 0; |
| 448 | unsigned short index; | 445 | unsigned short index; |
| 449 | unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + | 446 | unsigned short orphan_blocks = |
| 450 | (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); | 447 | (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans); |
| 451 | struct page *page = NULL; | 448 | struct page *page = NULL; |
| 452 | struct ino_entry *orphan = NULL; | 449 | struct ino_entry *orphan = NULL; |
| 453 | 450 | ||
| @@ -737,7 +734,7 @@ retry: | |||
| 737 | /* | 734 | /* |
| 738 | * Freeze all the FS-operations for checkpoint. | 735 | * Freeze all the FS-operations for checkpoint. |
| 739 | */ | 736 | */ |
| 740 | static void block_operations(struct f2fs_sb_info *sbi) | 737 | static int block_operations(struct f2fs_sb_info *sbi) |
| 741 | { | 738 | { |
| 742 | struct writeback_control wbc = { | 739 | struct writeback_control wbc = { |
| 743 | .sync_mode = WB_SYNC_ALL, | 740 | .sync_mode = WB_SYNC_ALL, |
| @@ -745,6 +742,7 @@ static void block_operations(struct f2fs_sb_info *sbi) | |||
| 745 | .for_reclaim = 0, | 742 | .for_reclaim = 0, |
| 746 | }; | 743 | }; |
| 747 | struct blk_plug plug; | 744 | struct blk_plug plug; |
| 745 | int err = 0; | ||
| 748 | 746 | ||
| 749 | blk_start_plug(&plug); | 747 | blk_start_plug(&plug); |
| 750 | 748 | ||
| @@ -754,11 +752,15 @@ retry_flush_dents: | |||
| 754 | if (get_pages(sbi, F2FS_DIRTY_DENTS)) { | 752 | if (get_pages(sbi, F2FS_DIRTY_DENTS)) { |
| 755 | f2fs_unlock_all(sbi); | 753 | f2fs_unlock_all(sbi); |
| 756 | sync_dirty_dir_inodes(sbi); | 754 | sync_dirty_dir_inodes(sbi); |
| 755 | if (unlikely(f2fs_cp_error(sbi))) { | ||
| 756 | err = -EIO; | ||
| 757 | goto out; | ||
| 758 | } | ||
| 757 | goto retry_flush_dents; | 759 | goto retry_flush_dents; |
| 758 | } | 760 | } |
| 759 | 761 | ||
| 760 | /* | 762 | /* |
| 761 | * POR: we should ensure that there is no dirty node pages | 763 | * POR: we should ensure that there are no dirty node pages |
| 762 | * until finishing nat/sit flush. | 764 | * until finishing nat/sit flush. |
| 763 | */ | 765 | */ |
| 764 | retry_flush_nodes: | 766 | retry_flush_nodes: |
| @@ -767,9 +769,16 @@ retry_flush_nodes: | |||
| 767 | if (get_pages(sbi, F2FS_DIRTY_NODES)) { | 769 | if (get_pages(sbi, F2FS_DIRTY_NODES)) { |
| 768 | up_write(&sbi->node_write); | 770 | up_write(&sbi->node_write); |
| 769 | sync_node_pages(sbi, 0, &wbc); | 771 | sync_node_pages(sbi, 0, &wbc); |
| 772 | if (unlikely(f2fs_cp_error(sbi))) { | ||
| 773 | f2fs_unlock_all(sbi); | ||
| 774 | err = -EIO; | ||
| 775 | goto out; | ||
| 776 | } | ||
| 770 | goto retry_flush_nodes; | 777 | goto retry_flush_nodes; |
| 771 | } | 778 | } |
| 779 | out: | ||
| 772 | blk_finish_plug(&plug); | 780 | blk_finish_plug(&plug); |
| 781 | return err; | ||
| 773 | } | 782 | } |
| 774 | 783 | ||
| 775 | static void unblock_operations(struct f2fs_sb_info *sbi) | 784 | static void unblock_operations(struct f2fs_sb_info *sbi) |
| @@ -813,8 +822,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 813 | discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); | 822 | discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); |
| 814 | 823 | ||
| 815 | /* Flush all the NAT/SIT pages */ | 824 | /* Flush all the NAT/SIT pages */ |
| 816 | while (get_pages(sbi, F2FS_DIRTY_META)) | 825 | while (get_pages(sbi, F2FS_DIRTY_META)) { |
| 817 | sync_meta_pages(sbi, META, LONG_MAX); | 826 | sync_meta_pages(sbi, META, LONG_MAX); |
| 827 | if (unlikely(f2fs_cp_error(sbi))) | ||
| 828 | return; | ||
| 829 | } | ||
| 818 | 830 | ||
| 819 | next_free_nid(sbi, &last_nid); | 831 | next_free_nid(sbi, &last_nid); |
| 820 | 832 | ||
| @@ -825,7 +837,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 825 | ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); | 837 | ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); |
| 826 | ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); | 838 | ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); |
| 827 | ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); | 839 | ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); |
| 828 | for (i = 0; i < 3; i++) { | 840 | for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { |
| 829 | ckpt->cur_node_segno[i] = | 841 | ckpt->cur_node_segno[i] = |
| 830 | cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); | 842 | cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); |
| 831 | ckpt->cur_node_blkoff[i] = | 843 | ckpt->cur_node_blkoff[i] = |
| @@ -833,7 +845,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 833 | ckpt->alloc_type[i + CURSEG_HOT_NODE] = | 845 | ckpt->alloc_type[i + CURSEG_HOT_NODE] = |
| 834 | curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); | 846 | curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); |
| 835 | } | 847 | } |
| 836 | for (i = 0; i < 3; i++) { | 848 | for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { |
| 837 | ckpt->cur_data_segno[i] = | 849 | ckpt->cur_data_segno[i] = |
| 838 | cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); | 850 | cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); |
| 839 | ckpt->cur_data_blkoff[i] = | 851 | ckpt->cur_data_blkoff[i] = |
| @@ -848,24 +860,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 848 | 860 | ||
| 849 | /* 2 cp + n data seg summary + orphan inode blocks */ | 861 | /* 2 cp + n data seg summary + orphan inode blocks */ |
| 850 | data_sum_blocks = npages_for_summary_flush(sbi); | 862 | data_sum_blocks = npages_for_summary_flush(sbi); |
| 851 | if (data_sum_blocks < 3) | 863 | if (data_sum_blocks < NR_CURSEG_DATA_TYPE) |
| 852 | set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); | 864 | set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); |
| 853 | else | 865 | else |
| 854 | clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); | 866 | clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); |
| 855 | 867 | ||
| 856 | orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) | 868 | orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans); |
| 857 | / F2FS_ORPHANS_PER_BLOCK; | ||
| 858 | ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + | 869 | ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + |
| 859 | orphan_blocks); | 870 | orphan_blocks); |
| 860 | 871 | ||
| 861 | if (is_umount) { | 872 | if (is_umount) { |
| 862 | set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); | 873 | set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); |
| 863 | ckpt->cp_pack_total_block_count = cpu_to_le32(2 + | 874 | ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + |
| 864 | cp_payload_blks + data_sum_blocks + | 875 | cp_payload_blks + data_sum_blocks + |
| 865 | orphan_blocks + NR_CURSEG_NODE_TYPE); | 876 | orphan_blocks + NR_CURSEG_NODE_TYPE); |
| 866 | } else { | 877 | } else { |
| 867 | clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); | 878 | clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); |
| 868 | ckpt->cp_pack_total_block_count = cpu_to_le32(2 + | 879 | ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + |
| 869 | cp_payload_blks + data_sum_blocks + | 880 | cp_payload_blks + data_sum_blocks + |
| 870 | orphan_blocks); | 881 | orphan_blocks); |
| 871 | } | 882 | } |
| @@ -924,6 +935,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 924 | /* wait for previous submitted node/meta pages writeback */ | 935 | /* wait for previous submitted node/meta pages writeback */ |
| 925 | wait_on_all_pages_writeback(sbi); | 936 | wait_on_all_pages_writeback(sbi); |
| 926 | 937 | ||
| 938 | if (unlikely(f2fs_cp_error(sbi))) | ||
| 939 | return; | ||
| 940 | |||
| 927 | filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); | 941 | filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); |
| 928 | filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); | 942 | filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); |
| 929 | 943 | ||
| @@ -934,15 +948,17 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 934 | /* Here, we only have one bio having CP pack */ | 948 | /* Here, we only have one bio having CP pack */ |
| 935 | sync_meta_pages(sbi, META_FLUSH, LONG_MAX); | 949 | sync_meta_pages(sbi, META_FLUSH, LONG_MAX); |
| 936 | 950 | ||
| 937 | if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { | 951 | release_dirty_inode(sbi); |
| 938 | clear_prefree_segments(sbi); | 952 | |
| 939 | release_dirty_inode(sbi); | 953 | if (unlikely(f2fs_cp_error(sbi))) |
| 940 | F2FS_RESET_SB_DIRT(sbi); | 954 | return; |
| 941 | } | 955 | |
| 956 | clear_prefree_segments(sbi); | ||
| 957 | F2FS_RESET_SB_DIRT(sbi); | ||
| 942 | } | 958 | } |
| 943 | 959 | ||
| 944 | /* | 960 | /* |
| 945 | * We guarantee that this checkpoint procedure should not fail. | 961 | * We guarantee that this checkpoint procedure will not fail. |
| 946 | */ | 962 | */ |
| 947 | void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | 963 | void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) |
| 948 | { | 964 | { |
| @@ -952,7 +968,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 952 | trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); | 968 | trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); |
| 953 | 969 | ||
| 954 | mutex_lock(&sbi->cp_mutex); | 970 | mutex_lock(&sbi->cp_mutex); |
| 955 | block_operations(sbi); | 971 | |
| 972 | if (!sbi->s_dirty) | ||
| 973 | goto out; | ||
| 974 | if (unlikely(f2fs_cp_error(sbi))) | ||
| 975 | goto out; | ||
| 976 | if (block_operations(sbi)) | ||
| 977 | goto out; | ||
| 956 | 978 | ||
| 957 | trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); | 979 | trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); |
| 958 | 980 | ||
| @@ -976,9 +998,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) | |||
| 976 | do_checkpoint(sbi, is_umount); | 998 | do_checkpoint(sbi, is_umount); |
| 977 | 999 | ||
| 978 | unblock_operations(sbi); | 1000 | unblock_operations(sbi); |
| 979 | mutex_unlock(&sbi->cp_mutex); | ||
| 980 | |||
| 981 | stat_inc_cp_count(sbi->stat_info); | 1001 | stat_inc_cp_count(sbi->stat_info); |
| 1002 | out: | ||
| 1003 | mutex_unlock(&sbi->cp_mutex); | ||
| 982 | trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); | 1004 | trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); |
| 983 | } | 1005 | } |
| 984 | 1006 | ||
| @@ -999,8 +1021,8 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) | |||
| 999 | * for cp pack we can have max 1020*504 orphan entries | 1021 | * for cp pack we can have max 1020*504 orphan entries |
| 1000 | */ | 1022 | */ |
| 1001 | sbi->n_orphans = 0; | 1023 | sbi->n_orphans = 0; |
| 1002 | sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) | 1024 | sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - |
| 1003 | * F2FS_ORPHANS_PER_BLOCK; | 1025 | NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; |
| 1004 | } | 1026 | } |
| 1005 | 1027 | ||
| 1006 | int __init create_checkpoint_caches(void) | 1028 | int __init create_checkpoint_caches(void) |
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 03313099c51c..76de83e25a89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
| @@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) | |||
| 53 | struct page *page = bvec->bv_page; | 53 | struct page *page = bvec->bv_page; |
| 54 | 54 | ||
| 55 | if (unlikely(err)) { | 55 | if (unlikely(err)) { |
| 56 | SetPageError(page); | 56 | set_page_dirty(page); |
| 57 | set_bit(AS_EIO, &page->mapping->flags); | 57 | set_bit(AS_EIO, &page->mapping->flags); |
| 58 | f2fs_stop_checkpoint(sbi); | 58 | f2fs_stop_checkpoint(sbi); |
| 59 | } | 59 | } |
| @@ -691,7 +691,7 @@ get_next: | |||
| 691 | allocated = true; | 691 | allocated = true; |
| 692 | blkaddr = dn.data_blkaddr; | 692 | blkaddr = dn.data_blkaddr; |
| 693 | } | 693 | } |
| 694 | /* Give more consecutive addresses for the read ahead */ | 694 | /* Give more consecutive addresses for the readahead */ |
| 695 | if (blkaddr == (bh_result->b_blocknr + ofs)) { | 695 | if (blkaddr == (bh_result->b_blocknr + ofs)) { |
| 696 | ofs++; | 696 | ofs++; |
| 697 | dn.ofs_in_node++; | 697 | dn.ofs_in_node++; |
| @@ -739,7 +739,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) | |||
| 739 | 739 | ||
| 740 | trace_f2fs_readpage(page, DATA); | 740 | trace_f2fs_readpage(page, DATA); |
| 741 | 741 | ||
| 742 | /* If the file has inline data, try to read it directlly */ | 742 | /* If the file has inline data, try to read it directly */ |
| 743 | if (f2fs_has_inline_data(inode)) | 743 | if (f2fs_has_inline_data(inode)) |
| 744 | ret = f2fs_read_inline_data(inode, page); | 744 | ret = f2fs_read_inline_data(inode, page); |
| 745 | else | 745 | else |
| @@ -836,10 +836,19 @@ write: | |||
| 836 | 836 | ||
| 837 | /* Dentry blocks are controlled by checkpoint */ | 837 | /* Dentry blocks are controlled by checkpoint */ |
| 838 | if (S_ISDIR(inode->i_mode)) { | 838 | if (S_ISDIR(inode->i_mode)) { |
| 839 | if (unlikely(f2fs_cp_error(sbi))) | ||
| 840 | goto redirty_out; | ||
| 839 | err = do_write_data_page(page, &fio); | 841 | err = do_write_data_page(page, &fio); |
| 840 | goto done; | 842 | goto done; |
| 841 | } | 843 | } |
| 842 | 844 | ||
| 845 | /* we should bypass data pages to proceed the kworker jobs */ | ||
| 846 | if (unlikely(f2fs_cp_error(sbi))) { | ||
| 847 | SetPageError(page); | ||
| 848 | unlock_page(page); | ||
| 849 | return 0; | ||
| 850 | } | ||
| 851 | |||
| 843 | if (!wbc->for_reclaim) | 852 | if (!wbc->for_reclaim) |
| 844 | need_balance_fs = true; | 853 | need_balance_fs = true; |
| 845 | else if (has_not_enough_free_secs(sbi, 0)) | 854 | else if (has_not_enough_free_secs(sbi, 0)) |
| @@ -927,7 +936,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) | |||
| 927 | 936 | ||
| 928 | if (to > inode->i_size) { | 937 | if (to > inode->i_size) { |
| 929 | truncate_pagecache(inode, inode->i_size); | 938 | truncate_pagecache(inode, inode->i_size); |
| 930 | truncate_blocks(inode, inode->i_size); | 939 | truncate_blocks(inode, inode->i_size, true); |
| 931 | } | 940 | } |
| 932 | } | 941 | } |
| 933 | 942 | ||
| @@ -946,7 +955,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, | |||
| 946 | 955 | ||
| 947 | f2fs_balance_fs(sbi); | 956 | f2fs_balance_fs(sbi); |
| 948 | repeat: | 957 | repeat: |
| 949 | err = f2fs_convert_inline_data(inode, pos + len); | 958 | err = f2fs_convert_inline_data(inode, pos + len, NULL); |
| 950 | if (err) | 959 | if (err) |
| 951 | goto fail; | 960 | goto fail; |
| 952 | 961 | ||
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a441ba33be11..fecebdbfd781 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c | |||
| @@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) | |||
| 32 | struct f2fs_stat_info *si = F2FS_STAT(sbi); | 32 | struct f2fs_stat_info *si = F2FS_STAT(sbi); |
| 33 | int i; | 33 | int i; |
| 34 | 34 | ||
| 35 | /* valid check of the segment numbers */ | 35 | /* validation check of the segment numbers */ |
| 36 | si->hit_ext = sbi->read_hit_ext; | 36 | si->hit_ext = sbi->read_hit_ext; |
| 37 | si->total_ext = sbi->total_hit_ext; | 37 | si->total_ext = sbi->total_hit_ext; |
| 38 | si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); | 38 | si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); |
| @@ -152,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) | |||
| 152 | si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); | 152 | si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); |
| 153 | si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); | 153 | si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); |
| 154 | 154 | ||
| 155 | /* buld nm */ | 155 | /* build nm */ |
| 156 | si->base_mem += sizeof(struct f2fs_nm_info); | 156 | si->base_mem += sizeof(struct f2fs_nm_info); |
| 157 | si->base_mem += __bitmap_size(sbi, NAT_BITMAP); | 157 | si->base_mem += __bitmap_size(sbi, NAT_BITMAP); |
| 158 | 158 | ||
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bcf893c3d903..155fb056b7f1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c | |||
| @@ -124,7 +124,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, | |||
| 124 | 124 | ||
| 125 | /* | 125 | /* |
| 126 | * For the most part, it should be a bug when name_len is zero. | 126 | * For the most part, it should be a bug when name_len is zero. |
| 127 | * We stop here for figuring out where the bugs are occurred. | 127 | * We stop here for figuring out where the bugs have occurred. |
| 128 | */ | 128 | */ |
| 129 | f2fs_bug_on(!de->name_len); | 129 | f2fs_bug_on(!de->name_len); |
| 130 | 130 | ||
| @@ -391,7 +391,7 @@ put_error: | |||
| 391 | error: | 391 | error: |
| 392 | /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ | 392 | /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ |
| 393 | truncate_inode_pages(&inode->i_data, 0); | 393 | truncate_inode_pages(&inode->i_data, 0); |
| 394 | truncate_blocks(inode, 0); | 394 | truncate_blocks(inode, 0, false); |
| 395 | remove_dirty_dir_inode(inode); | 395 | remove_dirty_dir_inode(inode); |
| 396 | remove_inode_page(inode); | 396 | remove_inode_page(inode); |
| 397 | return ERR_PTR(err); | 397 | return ERR_PTR(err); |
| @@ -563,7 +563,7 @@ fail: | |||
| 563 | } | 563 | } |
| 564 | 564 | ||
| 565 | /* | 565 | /* |
| 566 | * It only removes the dentry from the dentry page,corresponding name | 566 | * It only removes the dentry from the dentry page, corresponding name |
| 567 | * entry in name page does not need to be touched during deletion. | 567 | * entry in name page does not need to be touched during deletion. |
| 568 | */ | 568 | */ |
| 569 | void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, | 569 | void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, |
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4dab5338a97a..e921242186f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | #define f2fs_bug_on(condition) BUG_ON(condition) | 24 | #define f2fs_bug_on(condition) BUG_ON(condition) |
| 25 | #define f2fs_down_write(x, y) down_write_nest_lock(x, y) | 25 | #define f2fs_down_write(x, y) down_write_nest_lock(x, y) |
| 26 | #else | 26 | #else |
| 27 | #define f2fs_bug_on(condition) | 27 | #define f2fs_bug_on(condition) WARN_ON(condition) |
| 28 | #define f2fs_down_write(x, y) down_write(x) | 28 | #define f2fs_down_write(x, y) down_write(x) |
| 29 | #endif | 29 | #endif |
| 30 | 30 | ||
| @@ -395,7 +395,7 @@ enum count_type { | |||
| 395 | }; | 395 | }; |
| 396 | 396 | ||
| 397 | /* | 397 | /* |
| 398 | * The below are the page types of bios used in submti_bio(). | 398 | * The below are the page types of bios used in submit_bio(). |
| 399 | * The available types are: | 399 | * The available types are: |
| 400 | * DATA User data pages. It operates as async mode. | 400 | * DATA User data pages. It operates as async mode. |
| 401 | * NODE Node pages. It operates as async mode. | 401 | * NODE Node pages. It operates as async mode. |
| @@ -470,7 +470,7 @@ struct f2fs_sb_info { | |||
| 470 | struct list_head dir_inode_list; /* dir inode list */ | 470 | struct list_head dir_inode_list; /* dir inode list */ |
| 471 | spinlock_t dir_inode_lock; /* for dir inode list lock */ | 471 | spinlock_t dir_inode_lock; /* for dir inode list lock */ |
| 472 | 472 | ||
| 473 | /* basic file system units */ | 473 | /* basic filesystem units */ |
| 474 | unsigned int log_sectors_per_block; /* log2 sectors per block */ | 474 | unsigned int log_sectors_per_block; /* log2 sectors per block */ |
| 475 | unsigned int log_blocksize; /* log2 block size */ | 475 | unsigned int log_blocksize; /* log2 block size */ |
| 476 | unsigned int blocksize; /* block size */ | 476 | unsigned int blocksize; /* block size */ |
| @@ -799,7 +799,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) | |||
| 799 | 799 | ||
| 800 | /* | 800 | /* |
| 801 | * odd numbered checkpoint should be at cp segment 0 | 801 | * odd numbered checkpoint should be at cp segment 0 |
| 802 | * and even segent must be at cp segment 1 | 802 | * and even segment must be at cp segment 1 |
| 803 | */ | 803 | */ |
| 804 | if (!(ckpt_version & 1)) | 804 | if (!(ckpt_version & 1)) |
| 805 | start_addr += sbi->blocks_per_seg; | 805 | start_addr += sbi->blocks_per_seg; |
| @@ -1096,6 +1096,11 @@ static inline int f2fs_readonly(struct super_block *sb) | |||
| 1096 | return sb->s_flags & MS_RDONLY; | 1096 | return sb->s_flags & MS_RDONLY; |
| 1097 | } | 1097 | } |
| 1098 | 1098 | ||
| 1099 | static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) | ||
| 1100 | { | ||
| 1101 | return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); | ||
| 1102 | } | ||
| 1103 | |||
| 1099 | static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) | 1104 | static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) |
| 1100 | { | 1105 | { |
| 1101 | set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); | 1106 | set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); |
| @@ -1117,7 +1122,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) | |||
| 1117 | */ | 1122 | */ |
| 1118 | int f2fs_sync_file(struct file *, loff_t, loff_t, int); | 1123 | int f2fs_sync_file(struct file *, loff_t, loff_t, int); |
| 1119 | void truncate_data_blocks(struct dnode_of_data *); | 1124 | void truncate_data_blocks(struct dnode_of_data *); |
| 1120 | int truncate_blocks(struct inode *, u64); | 1125 | int truncate_blocks(struct inode *, u64, bool); |
| 1121 | void f2fs_truncate(struct inode *); | 1126 | void f2fs_truncate(struct inode *); |
| 1122 | int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 1127 | int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
| 1123 | int f2fs_setattr(struct dentry *, struct iattr *); | 1128 | int f2fs_setattr(struct dentry *, struct iattr *); |
| @@ -1202,10 +1207,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); | |||
| 1202 | bool alloc_nid(struct f2fs_sb_info *, nid_t *); | 1207 | bool alloc_nid(struct f2fs_sb_info *, nid_t *); |
| 1203 | void alloc_nid_done(struct f2fs_sb_info *, nid_t); | 1208 | void alloc_nid_done(struct f2fs_sb_info *, nid_t); |
| 1204 | void alloc_nid_failed(struct f2fs_sb_info *, nid_t); | 1209 | void alloc_nid_failed(struct f2fs_sb_info *, nid_t); |
| 1205 | void recover_node_page(struct f2fs_sb_info *, struct page *, | ||
| 1206 | struct f2fs_summary *, struct node_info *, block_t); | ||
| 1207 | void recover_inline_xattr(struct inode *, struct page *); | 1210 | void recover_inline_xattr(struct inode *, struct page *); |
| 1208 | bool recover_xattr_data(struct inode *, struct page *, block_t); | 1211 | void recover_xattr_data(struct inode *, struct page *, block_t); |
| 1209 | int recover_inode_page(struct f2fs_sb_info *, struct page *); | 1212 | int recover_inode_page(struct f2fs_sb_info *, struct page *); |
| 1210 | int restore_node_summary(struct f2fs_sb_info *, unsigned int, | 1213 | int restore_node_summary(struct f2fs_sb_info *, unsigned int, |
| 1211 | struct f2fs_summary_block *); | 1214 | struct f2fs_summary_block *); |
| @@ -1238,8 +1241,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *, | |||
| 1238 | void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); | 1241 | void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); |
| 1239 | void recover_data_page(struct f2fs_sb_info *, struct page *, | 1242 | void recover_data_page(struct f2fs_sb_info *, struct page *, |
| 1240 | struct f2fs_summary *, block_t, block_t); | 1243 | struct f2fs_summary *, block_t, block_t); |
| 1241 | void rewrite_node_page(struct f2fs_sb_info *, struct page *, | ||
| 1242 | struct f2fs_summary *, block_t, block_t); | ||
| 1243 | void allocate_data_block(struct f2fs_sb_info *, struct page *, | 1244 | void allocate_data_block(struct f2fs_sb_info *, struct page *, |
| 1244 | block_t, block_t *, struct f2fs_summary *, int); | 1245 | block_t, block_t *, struct f2fs_summary *, int); |
| 1245 | void f2fs_wait_on_page_writeback(struct page *, enum page_type); | 1246 | void f2fs_wait_on_page_writeback(struct page *, enum page_type); |
| @@ -1262,6 +1263,7 @@ int ra_meta_pages(struct f2fs_sb_info *, int, int, int); | |||
| 1262 | long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); | 1263 | long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); |
| 1263 | void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); | 1264 | void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); |
| 1264 | void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); | 1265 | void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); |
| 1266 | void release_dirty_inode(struct f2fs_sb_info *); | ||
| 1265 | bool exist_written_data(struct f2fs_sb_info *, nid_t, int); | 1267 | bool exist_written_data(struct f2fs_sb_info *, nid_t, int); |
| 1266 | int acquire_orphan_inode(struct f2fs_sb_info *); | 1268 | int acquire_orphan_inode(struct f2fs_sb_info *); |
| 1267 | void release_orphan_inode(struct f2fs_sb_info *); | 1269 | void release_orphan_inode(struct f2fs_sb_info *); |
| @@ -1439,8 +1441,8 @@ extern const struct inode_operations f2fs_special_inode_operations; | |||
| 1439 | */ | 1441 | */ |
| 1440 | bool f2fs_may_inline(struct inode *); | 1442 | bool f2fs_may_inline(struct inode *); |
| 1441 | int f2fs_read_inline_data(struct inode *, struct page *); | 1443 | int f2fs_read_inline_data(struct inode *, struct page *); |
| 1442 | int f2fs_convert_inline_data(struct inode *, pgoff_t); | 1444 | int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *); |
| 1443 | int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); | 1445 | int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); |
| 1444 | void truncate_inline_data(struct inode *, u64); | 1446 | void truncate_inline_data(struct inode *, u64); |
| 1445 | int recover_inline_data(struct inode *, struct page *); | 1447 | bool recover_inline_data(struct inode *, struct page *); |
| 1446 | #endif | 1448 | #endif |
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 208f1a9bd569..060aee65aee8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c | |||
| @@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, | |||
| 41 | 41 | ||
| 42 | sb_start_pagefault(inode->i_sb); | 42 | sb_start_pagefault(inode->i_sb); |
| 43 | 43 | ||
| 44 | /* force to convert with normal data indices */ | ||
| 45 | err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page); | ||
| 46 | if (err) | ||
| 47 | goto out; | ||
| 48 | |||
| 44 | /* block allocation */ | 49 | /* block allocation */ |
| 45 | f2fs_lock_op(sbi); | 50 | f2fs_lock_op(sbi); |
| 46 | set_new_dnode(&dn, inode, NULL, NULL, 0); | 51 | set_new_dnode(&dn, inode, NULL, NULL, 0); |
| @@ -110,6 +115,25 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) | |||
| 110 | return 1; | 115 | return 1; |
| 111 | } | 116 | } |
| 112 | 117 | ||
| 118 | static inline bool need_do_checkpoint(struct inode *inode) | ||
| 119 | { | ||
| 120 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | ||
| 121 | bool need_cp = false; | ||
| 122 | |||
| 123 | if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) | ||
| 124 | need_cp = true; | ||
| 125 | else if (file_wrong_pino(inode)) | ||
| 126 | need_cp = true; | ||
| 127 | else if (!space_for_roll_forward(sbi)) | ||
| 128 | need_cp = true; | ||
| 129 | else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) | ||
| 130 | need_cp = true; | ||
| 131 | else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) | ||
| 132 | need_cp = true; | ||
| 133 | |||
| 134 | return need_cp; | ||
| 135 | } | ||
| 136 | |||
| 113 | int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | 137 | int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) |
| 114 | { | 138 | { |
| 115 | struct inode *inode = file->f_mapping->host; | 139 | struct inode *inode = file->f_mapping->host; |
| @@ -154,23 +178,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 154 | /* guarantee free sections for fsync */ | 178 | /* guarantee free sections for fsync */ |
| 155 | f2fs_balance_fs(sbi); | 179 | f2fs_balance_fs(sbi); |
| 156 | 180 | ||
| 157 | down_read(&fi->i_sem); | ||
| 158 | |||
| 159 | /* | 181 | /* |
| 160 | * Both of fdatasync() and fsync() are able to be recovered from | 182 | * Both of fdatasync() and fsync() are able to be recovered from |
| 161 | * sudden-power-off. | 183 | * sudden-power-off. |
| 162 | */ | 184 | */ |
| 163 | if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) | 185 | down_read(&fi->i_sem); |
| 164 | need_cp = true; | 186 | need_cp = need_do_checkpoint(inode); |
| 165 | else if (file_wrong_pino(inode)) | ||
| 166 | need_cp = true; | ||
| 167 | else if (!space_for_roll_forward(sbi)) | ||
| 168 | need_cp = true; | ||
| 169 | else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) | ||
| 170 | need_cp = true; | ||
| 171 | else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) | ||
| 172 | need_cp = true; | ||
| 173 | |||
| 174 | up_read(&fi->i_sem); | 187 | up_read(&fi->i_sem); |
| 175 | 188 | ||
| 176 | if (need_cp) { | 189 | if (need_cp) { |
| @@ -288,7 +301,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) | |||
| 288 | if (err && err != -ENOENT) { | 301 | if (err && err != -ENOENT) { |
| 289 | goto fail; | 302 | goto fail; |
| 290 | } else if (err == -ENOENT) { | 303 | } else if (err == -ENOENT) { |
| 291 | /* direct node is not exist */ | 304 | /* direct node does not exist */ |
| 292 | if (whence == SEEK_DATA) { | 305 | if (whence == SEEK_DATA) { |
| 293 | pgofs = PGOFS_OF_NEXT_DNODE(pgofs, | 306 | pgofs = PGOFS_OF_NEXT_DNODE(pgofs, |
| 294 | F2FS_I(inode)); | 307 | F2FS_I(inode)); |
| @@ -417,7 +430,7 @@ out: | |||
| 417 | f2fs_put_page(page, 1); | 430 | f2fs_put_page(page, 1); |
| 418 | } | 431 | } |
| 419 | 432 | ||
| 420 | int truncate_blocks(struct inode *inode, u64 from) | 433 | int truncate_blocks(struct inode *inode, u64 from, bool lock) |
| 421 | { | 434 | { |
| 422 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 435 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 423 | unsigned int blocksize = inode->i_sb->s_blocksize; | 436 | unsigned int blocksize = inode->i_sb->s_blocksize; |
| @@ -433,14 +446,16 @@ int truncate_blocks(struct inode *inode, u64 from) | |||
| 433 | free_from = (pgoff_t) | 446 | free_from = (pgoff_t) |
| 434 | ((from + blocksize - 1) >> (sbi->log_blocksize)); | 447 | ((from + blocksize - 1) >> (sbi->log_blocksize)); |
| 435 | 448 | ||
| 436 | f2fs_lock_op(sbi); | 449 | if (lock) |
| 450 | f2fs_lock_op(sbi); | ||
| 437 | 451 | ||
| 438 | set_new_dnode(&dn, inode, NULL, NULL, 0); | 452 | set_new_dnode(&dn, inode, NULL, NULL, 0); |
| 439 | err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); | 453 | err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); |
| 440 | if (err) { | 454 | if (err) { |
| 441 | if (err == -ENOENT) | 455 | if (err == -ENOENT) |
| 442 | goto free_next; | 456 | goto free_next; |
| 443 | f2fs_unlock_op(sbi); | 457 | if (lock) |
| 458 | f2fs_unlock_op(sbi); | ||
| 444 | trace_f2fs_truncate_blocks_exit(inode, err); | 459 | trace_f2fs_truncate_blocks_exit(inode, err); |
| 445 | return err; | 460 | return err; |
| 446 | } | 461 | } |
| @@ -458,7 +473,8 @@ int truncate_blocks(struct inode *inode, u64 from) | |||
| 458 | f2fs_put_dnode(&dn); | 473 | f2fs_put_dnode(&dn); |
| 459 | free_next: | 474 | free_next: |
| 460 | err = truncate_inode_blocks(inode, free_from); | 475 | err = truncate_inode_blocks(inode, free_from); |
| 461 | f2fs_unlock_op(sbi); | 476 | if (lock) |
| 477 | f2fs_unlock_op(sbi); | ||
| 462 | done: | 478 | done: |
| 463 | /* lastly zero out the first data page */ | 479 | /* lastly zero out the first data page */ |
| 464 | truncate_partial_data_page(inode, from); | 480 | truncate_partial_data_page(inode, from); |
| @@ -475,7 +491,7 @@ void f2fs_truncate(struct inode *inode) | |||
| 475 | 491 | ||
| 476 | trace_f2fs_truncate(inode); | 492 | trace_f2fs_truncate(inode); |
| 477 | 493 | ||
| 478 | if (!truncate_blocks(inode, i_size_read(inode))) { | 494 | if (!truncate_blocks(inode, i_size_read(inode), true)) { |
| 479 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 495 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 480 | mark_inode_dirty(inode); | 496 | mark_inode_dirty(inode); |
| 481 | } | 497 | } |
| @@ -533,7 +549,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 533 | 549 | ||
| 534 | if ((attr->ia_valid & ATTR_SIZE) && | 550 | if ((attr->ia_valid & ATTR_SIZE) && |
| 535 | attr->ia_size != i_size_read(inode)) { | 551 | attr->ia_size != i_size_read(inode)) { |
| 536 | err = f2fs_convert_inline_data(inode, attr->ia_size); | 552 | err = f2fs_convert_inline_data(inode, attr->ia_size, NULL); |
| 537 | if (err) | 553 | if (err) |
| 538 | return err; | 554 | return err; |
| 539 | 555 | ||
| @@ -622,7 +638,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
| 622 | loff_t off_start, off_end; | 638 | loff_t off_start, off_end; |
| 623 | int ret = 0; | 639 | int ret = 0; |
| 624 | 640 | ||
| 625 | ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); | 641 | ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); |
| 626 | if (ret) | 642 | if (ret) |
| 627 | return ret; | 643 | return ret; |
| 628 | 644 | ||
| @@ -678,7 +694,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, | |||
| 678 | if (ret) | 694 | if (ret) |
| 679 | return ret; | 695 | return ret; |
| 680 | 696 | ||
| 681 | ret = f2fs_convert_inline_data(inode, offset + len); | 697 | ret = f2fs_convert_inline_data(inode, offset + len, NULL); |
| 682 | if (ret) | 698 | if (ret) |
| 683 | return ret; | 699 | return ret; |
| 684 | 700 | ||
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d7947d90ccc3..943a31db7cc3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c | |||
| @@ -58,7 +58,7 @@ static int gc_thread_func(void *data) | |||
| 58 | * 3. IO subsystem is idle by checking the # of requests in | 58 | * 3. IO subsystem is idle by checking the # of requests in |
| 59 | * bdev's request list. | 59 | * bdev's request list. |
| 60 | * | 60 | * |
| 61 | * Note) We have to avoid triggering GCs too much frequently. | 61 | * Note) We have to avoid triggering GCs frequently. |
| 62 | * Because it is possible that some segments can be | 62 | * Because it is possible that some segments can be |
| 63 | * invalidated soon after by user update or deletion. | 63 | * invalidated soon after by user update or deletion. |
| 64 | * So, I'd like to wait some time to collect dirty segments. | 64 | * So, I'd like to wait some time to collect dirty segments. |
| @@ -222,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) | |||
| 222 | 222 | ||
| 223 | u = (vblocks * 100) >> sbi->log_blocks_per_seg; | 223 | u = (vblocks * 100) >> sbi->log_blocks_per_seg; |
| 224 | 224 | ||
| 225 | /* Handle if the system time is changed by user */ | 225 | /* Handle if the system time has changed by the user */ |
| 226 | if (mtime < sit_i->min_mtime) | 226 | if (mtime < sit_i->min_mtime) |
| 227 | sit_i->min_mtime = mtime; | 227 | sit_i->min_mtime = mtime; |
| 228 | if (mtime > sit_i->max_mtime) | 228 | if (mtime > sit_i->max_mtime) |
| @@ -593,7 +593,7 @@ next_step: | |||
| 593 | 593 | ||
| 594 | if (phase == 2) { | 594 | if (phase == 2) { |
| 595 | inode = f2fs_iget(sb, dni.ino); | 595 | inode = f2fs_iget(sb, dni.ino); |
| 596 | if (IS_ERR(inode)) | 596 | if (IS_ERR(inode) || is_bad_inode(inode)) |
| 597 | continue; | 597 | continue; |
| 598 | 598 | ||
| 599 | start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); | 599 | start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); |
| @@ -693,7 +693,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) | |||
| 693 | gc_more: | 693 | gc_more: |
| 694 | if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) | 694 | if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) |
| 695 | goto stop; | 695 | goto stop; |
| 696 | if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) | 696 | if (unlikely(f2fs_cp_error(sbi))) |
| 697 | goto stop; | 697 | goto stop; |
| 698 | 698 | ||
| 699 | if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { | 699 | if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { |
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5d5eb6047bf4..16f0b2b22999 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h | |||
| @@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) | |||
| 91 | block_t invalid_user_blocks = sbi->user_block_count - | 91 | block_t invalid_user_blocks = sbi->user_block_count - |
| 92 | written_block_count(sbi); | 92 | written_block_count(sbi); |
| 93 | /* | 93 | /* |
| 94 | * Background GC is triggered with the following condition. | 94 | * Background GC is triggered with the following conditions. |
| 95 | * 1. There are a number of invalid blocks. | 95 | * 1. There are a number of invalid blocks. |
| 96 | * 2. There is not enough free space. | 96 | * 2. There is not enough free space. |
| 97 | */ | 97 | */ |
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 948d17bf7281..a844fcfb9a8d 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c | |||
| @@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) | |||
| 42 | buf[1] += b1; | 42 | buf[1] += b1; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) | 45 | static void str2hashbuf(const unsigned char *msg, size_t len, |
| 46 | unsigned int *buf, int num) | ||
| 46 | { | 47 | { |
| 47 | unsigned pad, val; | 48 | unsigned pad, val; |
| 48 | int i; | 49 | int i; |
| @@ -73,9 +74,9 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) | |||
| 73 | { | 74 | { |
| 74 | __u32 hash; | 75 | __u32 hash; |
| 75 | f2fs_hash_t f2fs_hash; | 76 | f2fs_hash_t f2fs_hash; |
| 76 | const char *p; | 77 | const unsigned char *p; |
| 77 | __u32 in[8], buf[4]; | 78 | __u32 in[8], buf[4]; |
| 78 | const char *name = name_info->name; | 79 | const unsigned char *name = name_info->name; |
| 79 | size_t len = name_info->len; | 80 | size_t len = name_info->len; |
| 80 | 81 | ||
| 81 | if ((len <= 2) && (name[0] == '.') && | 82 | if ((len <= 2) && (name[0] == '.') && |
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5beeccef9ae1..3e8ecdf3742b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c | |||
| @@ -68,7 +68,7 @@ out: | |||
| 68 | 68 | ||
| 69 | static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) | 69 | static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) |
| 70 | { | 70 | { |
| 71 | int err; | 71 | int err = 0; |
| 72 | struct page *ipage; | 72 | struct page *ipage; |
| 73 | struct dnode_of_data dn; | 73 | struct dnode_of_data dn; |
| 74 | void *src_addr, *dst_addr; | 74 | void *src_addr, *dst_addr; |
| @@ -86,6 +86,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) | |||
| 86 | goto out; | 86 | goto out; |
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | /* someone else converted inline_data already */ | ||
| 90 | if (!f2fs_has_inline_data(inode)) | ||
| 91 | goto out; | ||
| 92 | |||
| 89 | /* | 93 | /* |
| 90 | * i_addr[0] is not used for inline data, | 94 | * i_addr[0] is not used for inline data, |
| 91 | * so reserving new block will not destroy inline data | 95 | * so reserving new block will not destroy inline data |
| @@ -124,9 +128,10 @@ out: | |||
| 124 | return err; | 128 | return err; |
| 125 | } | 129 | } |
| 126 | 130 | ||
| 127 | int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) | 131 | int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size, |
| 132 | struct page *page) | ||
| 128 | { | 133 | { |
| 129 | struct page *page; | 134 | struct page *new_page = page; |
| 130 | int err; | 135 | int err; |
| 131 | 136 | ||
| 132 | if (!f2fs_has_inline_data(inode)) | 137 | if (!f2fs_has_inline_data(inode)) |
| @@ -134,17 +139,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) | |||
| 134 | else if (to_size <= MAX_INLINE_DATA) | 139 | else if (to_size <= MAX_INLINE_DATA) |
| 135 | return 0; | 140 | return 0; |
| 136 | 141 | ||
| 137 | page = grab_cache_page(inode->i_mapping, 0); | 142 | if (!page || page->index != 0) { |
| 138 | if (!page) | 143 | new_page = grab_cache_page(inode->i_mapping, 0); |
| 139 | return -ENOMEM; | 144 | if (!new_page) |
| 145 | return -ENOMEM; | ||
| 146 | } | ||
| 140 | 147 | ||
| 141 | err = __f2fs_convert_inline_data(inode, page); | 148 | err = __f2fs_convert_inline_data(inode, new_page); |
| 142 | f2fs_put_page(page, 1); | 149 | if (!page || page->index != 0) |
| 150 | f2fs_put_page(new_page, 1); | ||
| 143 | return err; | 151 | return err; |
| 144 | } | 152 | } |
| 145 | 153 | ||
| 146 | int f2fs_write_inline_data(struct inode *inode, | 154 | int f2fs_write_inline_data(struct inode *inode, |
| 147 | struct page *page, unsigned size) | 155 | struct page *page, unsigned size) |
| 148 | { | 156 | { |
| 149 | void *src_addr, *dst_addr; | 157 | void *src_addr, *dst_addr; |
| 150 | struct page *ipage; | 158 | struct page *ipage; |
| @@ -199,7 +207,7 @@ void truncate_inline_data(struct inode *inode, u64 from) | |||
| 199 | f2fs_put_page(ipage, 1); | 207 | f2fs_put_page(ipage, 1); |
| 200 | } | 208 | } |
| 201 | 209 | ||
| 202 | int recover_inline_data(struct inode *inode, struct page *npage) | 210 | bool recover_inline_data(struct inode *inode, struct page *npage) |
| 203 | { | 211 | { |
| 204 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 212 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 205 | struct f2fs_inode *ri = NULL; | 213 | struct f2fs_inode *ri = NULL; |
| @@ -218,7 +226,7 @@ int recover_inline_data(struct inode *inode, struct page *npage) | |||
| 218 | ri = F2FS_INODE(npage); | 226 | ri = F2FS_INODE(npage); |
| 219 | 227 | ||
| 220 | if (f2fs_has_inline_data(inode) && | 228 | if (f2fs_has_inline_data(inode) && |
| 221 | ri && ri->i_inline & F2FS_INLINE_DATA) { | 229 | ri && (ri->i_inline & F2FS_INLINE_DATA)) { |
| 222 | process_inline: | 230 | process_inline: |
| 223 | ipage = get_node_page(sbi, inode->i_ino); | 231 | ipage = get_node_page(sbi, inode->i_ino); |
| 224 | f2fs_bug_on(IS_ERR(ipage)); | 232 | f2fs_bug_on(IS_ERR(ipage)); |
| @@ -230,7 +238,7 @@ process_inline: | |||
| 230 | memcpy(dst_addr, src_addr, MAX_INLINE_DATA); | 238 | memcpy(dst_addr, src_addr, MAX_INLINE_DATA); |
| 231 | update_inode(inode, ipage); | 239 | update_inode(inode, ipage); |
| 232 | f2fs_put_page(ipage, 1); | 240 | f2fs_put_page(ipage, 1); |
| 233 | return -1; | 241 | return true; |
| 234 | } | 242 | } |
| 235 | 243 | ||
| 236 | if (f2fs_has_inline_data(inode)) { | 244 | if (f2fs_has_inline_data(inode)) { |
| @@ -242,10 +250,10 @@ process_inline: | |||
| 242 | clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); | 250 | clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); |
| 243 | update_inode(inode, ipage); | 251 | update_inode(inode, ipage); |
| 244 | f2fs_put_page(ipage, 1); | 252 | f2fs_put_page(ipage, 1); |
| 245 | } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { | 253 | } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { |
| 246 | truncate_blocks(inode, 0); | 254 | truncate_blocks(inode, 0, false); |
| 247 | set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); | 255 | set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); |
| 248 | goto process_inline; | 256 | goto process_inline; |
| 249 | } | 257 | } |
| 250 | return 0; | 258 | return false; |
| 251 | } | 259 | } |
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 27b03776ffd2..ee103fd7283c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c | |||
| @@ -134,9 +134,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 134 | return 0; | 134 | return 0; |
| 135 | out: | 135 | out: |
| 136 | clear_nlink(inode); | 136 | clear_nlink(inode); |
| 137 | unlock_new_inode(inode); | 137 | iget_failed(inode); |
| 138 | make_bad_inode(inode); | ||
| 139 | iput(inode); | ||
| 140 | alloc_nid_failed(sbi, ino); | 138 | alloc_nid_failed(sbi, ino); |
| 141 | return err; | 139 | return err; |
| 142 | } | 140 | } |
| @@ -229,7 +227,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 229 | f2fs_delete_entry(de, page, inode); | 227 | f2fs_delete_entry(de, page, inode); |
| 230 | f2fs_unlock_op(sbi); | 228 | f2fs_unlock_op(sbi); |
| 231 | 229 | ||
| 232 | /* In order to evict this inode, we set it dirty */ | 230 | /* In order to evict this inode, we set it dirty */ |
| 233 | mark_inode_dirty(inode); | 231 | mark_inode_dirty(inode); |
| 234 | fail: | 232 | fail: |
| 235 | trace_f2fs_unlink_exit(inode, err); | 233 | trace_f2fs_unlink_exit(inode, err); |
| @@ -267,9 +265,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 267 | return err; | 265 | return err; |
| 268 | out: | 266 | out: |
| 269 | clear_nlink(inode); | 267 | clear_nlink(inode); |
| 270 | unlock_new_inode(inode); | 268 | iget_failed(inode); |
| 271 | make_bad_inode(inode); | ||
| 272 | iput(inode); | ||
| 273 | alloc_nid_failed(sbi, inode->i_ino); | 269 | alloc_nid_failed(sbi, inode->i_ino); |
| 274 | return err; | 270 | return err; |
| 275 | } | 271 | } |
| @@ -308,9 +304,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 308 | out_fail: | 304 | out_fail: |
| 309 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); | 305 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); |
| 310 | clear_nlink(inode); | 306 | clear_nlink(inode); |
| 311 | unlock_new_inode(inode); | 307 | iget_failed(inode); |
| 312 | make_bad_inode(inode); | ||
| 313 | iput(inode); | ||
| 314 | alloc_nid_failed(sbi, inode->i_ino); | 308 | alloc_nid_failed(sbi, inode->i_ino); |
| 315 | return err; | 309 | return err; |
| 316 | } | 310 | } |
| @@ -354,9 +348,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 354 | return 0; | 348 | return 0; |
| 355 | out: | 349 | out: |
| 356 | clear_nlink(inode); | 350 | clear_nlink(inode); |
| 357 | unlock_new_inode(inode); | 351 | iget_failed(inode); |
| 358 | make_bad_inode(inode); | ||
| 359 | iput(inode); | ||
| 360 | alloc_nid_failed(sbi, inode->i_ino); | 352 | alloc_nid_failed(sbi, inode->i_ino); |
| 361 | return err; | 353 | return err; |
| 362 | } | 354 | } |
| @@ -688,9 +680,7 @@ release_out: | |||
| 688 | out: | 680 | out: |
| 689 | f2fs_unlock_op(sbi); | 681 | f2fs_unlock_op(sbi); |
| 690 | clear_nlink(inode); | 682 | clear_nlink(inode); |
| 691 | unlock_new_inode(inode); | 683 | iget_failed(inode); |
| 692 | make_bad_inode(inode); | ||
| 693 | iput(inode); | ||
| 694 | alloc_nid_failed(sbi, inode->i_ino); | 684 | alloc_nid_failed(sbi, inode->i_ino); |
| 695 | return err; | 685 | return err; |
| 696 | } | 686 | } |
| @@ -704,7 +694,6 @@ const struct inode_operations f2fs_dir_inode_operations = { | |||
| 704 | .mkdir = f2fs_mkdir, | 694 | .mkdir = f2fs_mkdir, |
| 705 | .rmdir = f2fs_rmdir, | 695 | .rmdir = f2fs_rmdir, |
| 706 | .mknod = f2fs_mknod, | 696 | .mknod = f2fs_mknod, |
| 707 | .rename = f2fs_rename, | ||
| 708 | .rename2 = f2fs_rename2, | 697 | .rename2 = f2fs_rename2, |
| 709 | .tmpfile = f2fs_tmpfile, | 698 | .tmpfile = f2fs_tmpfile, |
| 710 | .getattr = f2fs_getattr, | 699 | .getattr = f2fs_getattr, |
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d3d90d284631..45378196e19a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c | |||
| @@ -237,7 +237,7 @@ retry: | |||
| 237 | nat_get_blkaddr(e) != NULL_ADDR && | 237 | nat_get_blkaddr(e) != NULL_ADDR && |
| 238 | new_blkaddr == NEW_ADDR); | 238 | new_blkaddr == NEW_ADDR); |
| 239 | 239 | ||
| 240 | /* increament version no as node is removed */ | 240 | /* increment version no as node is removed */ |
| 241 | if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { | 241 | if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { |
| 242 | unsigned char version = nat_get_version(e); | 242 | unsigned char version = nat_get_version(e); |
| 243 | nat_set_version(e, inc_node_version(version)); | 243 | nat_set_version(e, inc_node_version(version)); |
| @@ -274,7 +274,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) | |||
| 274 | } | 274 | } |
| 275 | 275 | ||
| 276 | /* | 276 | /* |
| 277 | * This function returns always success | 277 | * This function always returns success |
| 278 | */ | 278 | */ |
| 279 | void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) | 279 | void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) |
| 280 | { | 280 | { |
| @@ -650,7 +650,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, | |||
| 650 | 650 | ||
| 651 | /* get indirect nodes in the path */ | 651 | /* get indirect nodes in the path */ |
| 652 | for (i = 0; i < idx + 1; i++) { | 652 | for (i = 0; i < idx + 1; i++) { |
| 653 | /* refernece count'll be increased */ | 653 | /* reference count'll be increased */ |
| 654 | pages[i] = get_node_page(sbi, nid[i]); | 654 | pages[i] = get_node_page(sbi, nid[i]); |
| 655 | if (IS_ERR(pages[i])) { | 655 | if (IS_ERR(pages[i])) { |
| 656 | err = PTR_ERR(pages[i]); | 656 | err = PTR_ERR(pages[i]); |
| @@ -823,22 +823,26 @@ int truncate_xattr_node(struct inode *inode, struct page *page) | |||
| 823 | */ | 823 | */ |
| 824 | void remove_inode_page(struct inode *inode) | 824 | void remove_inode_page(struct inode *inode) |
| 825 | { | 825 | { |
| 826 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | ||
| 827 | struct page *page; | ||
| 828 | nid_t ino = inode->i_ino; | ||
| 829 | struct dnode_of_data dn; | 826 | struct dnode_of_data dn; |
| 830 | 827 | ||
| 831 | page = get_node_page(sbi, ino); | 828 | set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); |
| 832 | if (IS_ERR(page)) | 829 | if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) |
| 833 | return; | 830 | return; |
| 834 | 831 | ||
| 835 | if (truncate_xattr_node(inode, page)) { | 832 | if (truncate_xattr_node(inode, dn.inode_page)) { |
| 836 | f2fs_put_page(page, 1); | 833 | f2fs_put_dnode(&dn); |
| 837 | return; | 834 | return; |
| 838 | } | 835 | } |
| 839 | /* 0 is possible, after f2fs_new_inode() is failed */ | 836 | |
| 837 | /* remove potential inline_data blocks */ | ||
| 838 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | ||
| 839 | S_ISLNK(inode->i_mode)) | ||
| 840 | truncate_data_blocks_range(&dn, 1); | ||
| 841 | |||
| 842 | /* 0 is possible, after f2fs_new_inode() has failed */ | ||
| 840 | f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); | 843 | f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); |
| 841 | set_new_dnode(&dn, inode, page, page, ino); | 844 | |
| 845 | /* will put inode & node pages */ | ||
| 842 | truncate_node(&dn); | 846 | truncate_node(&dn); |
| 843 | } | 847 | } |
| 844 | 848 | ||
| @@ -1129,8 +1133,11 @@ continue_unlock: | |||
| 1129 | set_fsync_mark(page, 0); | 1133 | set_fsync_mark(page, 0); |
| 1130 | set_dentry_mark(page, 0); | 1134 | set_dentry_mark(page, 0); |
| 1131 | } | 1135 | } |
| 1132 | NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); | 1136 | |
| 1133 | wrote++; | 1137 | if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) |
| 1138 | unlock_page(page); | ||
| 1139 | else | ||
| 1140 | wrote++; | ||
| 1134 | 1141 | ||
| 1135 | if (--wbc->nr_to_write == 0) | 1142 | if (--wbc->nr_to_write == 0) |
| 1136 | break; | 1143 | break; |
| @@ -1212,6 +1219,8 @@ static int f2fs_write_node_page(struct page *page, | |||
| 1212 | 1219 | ||
| 1213 | if (unlikely(sbi->por_doing)) | 1220 | if (unlikely(sbi->por_doing)) |
| 1214 | goto redirty_out; | 1221 | goto redirty_out; |
| 1222 | if (unlikely(f2fs_cp_error(sbi))) | ||
| 1223 | goto redirty_out; | ||
| 1215 | 1224 | ||
| 1216 | f2fs_wait_on_page_writeback(page, NODE); | 1225 | f2fs_wait_on_page_writeback(page, NODE); |
| 1217 | 1226 | ||
| @@ -1540,15 +1549,6 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) | |||
| 1540 | kmem_cache_free(free_nid_slab, i); | 1549 | kmem_cache_free(free_nid_slab, i); |
| 1541 | } | 1550 | } |
| 1542 | 1551 | ||
| 1543 | void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, | ||
| 1544 | struct f2fs_summary *sum, struct node_info *ni, | ||
| 1545 | block_t new_blkaddr) | ||
| 1546 | { | ||
| 1547 | rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); | ||
| 1548 | set_node_addr(sbi, ni, new_blkaddr, false); | ||
| 1549 | clear_node_page_dirty(page); | ||
| 1550 | } | ||
| 1551 | |||
| 1552 | void recover_inline_xattr(struct inode *inode, struct page *page) | 1552 | void recover_inline_xattr(struct inode *inode, struct page *page) |
| 1553 | { | 1553 | { |
| 1554 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 1554 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| @@ -1557,40 +1557,33 @@ void recover_inline_xattr(struct inode *inode, struct page *page) | |||
| 1557 | struct page *ipage; | 1557 | struct page *ipage; |
| 1558 | struct f2fs_inode *ri; | 1558 | struct f2fs_inode *ri; |
| 1559 | 1559 | ||
| 1560 | if (!f2fs_has_inline_xattr(inode)) | ||
| 1561 | return; | ||
| 1562 | |||
| 1563 | if (!IS_INODE(page)) | ||
| 1564 | return; | ||
| 1565 | |||
| 1566 | ri = F2FS_INODE(page); | ||
| 1567 | if (!(ri->i_inline & F2FS_INLINE_XATTR)) | ||
| 1568 | return; | ||
| 1569 | |||
| 1570 | ipage = get_node_page(sbi, inode->i_ino); | 1560 | ipage = get_node_page(sbi, inode->i_ino); |
| 1571 | f2fs_bug_on(IS_ERR(ipage)); | 1561 | f2fs_bug_on(IS_ERR(ipage)); |
| 1572 | 1562 | ||
| 1563 | ri = F2FS_INODE(page); | ||
| 1564 | if (!(ri->i_inline & F2FS_INLINE_XATTR)) { | ||
| 1565 | clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); | ||
| 1566 | goto update_inode; | ||
| 1567 | } | ||
| 1568 | |||
| 1573 | dst_addr = inline_xattr_addr(ipage); | 1569 | dst_addr = inline_xattr_addr(ipage); |
| 1574 | src_addr = inline_xattr_addr(page); | 1570 | src_addr = inline_xattr_addr(page); |
| 1575 | inline_size = inline_xattr_size(inode); | 1571 | inline_size = inline_xattr_size(inode); |
| 1576 | 1572 | ||
| 1577 | f2fs_wait_on_page_writeback(ipage, NODE); | 1573 | f2fs_wait_on_page_writeback(ipage, NODE); |
| 1578 | memcpy(dst_addr, src_addr, inline_size); | 1574 | memcpy(dst_addr, src_addr, inline_size); |
| 1579 | 1575 | update_inode: | |
| 1580 | update_inode(inode, ipage); | 1576 | update_inode(inode, ipage); |
| 1581 | f2fs_put_page(ipage, 1); | 1577 | f2fs_put_page(ipage, 1); |
| 1582 | } | 1578 | } |
| 1583 | 1579 | ||
| 1584 | bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) | 1580 | void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) |
| 1585 | { | 1581 | { |
| 1586 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 1582 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 1587 | nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; | 1583 | nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; |
| 1588 | nid_t new_xnid = nid_of_node(page); | 1584 | nid_t new_xnid = nid_of_node(page); |
| 1589 | struct node_info ni; | 1585 | struct node_info ni; |
| 1590 | 1586 | ||
| 1591 | if (!f2fs_has_xattr_block(ofs_of_node(page))) | ||
| 1592 | return false; | ||
| 1593 | |||
| 1594 | /* 1: invalidate the previous xattr nid */ | 1587 | /* 1: invalidate the previous xattr nid */ |
| 1595 | if (!prev_xnid) | 1588 | if (!prev_xnid) |
| 1596 | goto recover_xnid; | 1589 | goto recover_xnid; |
| @@ -1618,7 +1611,6 @@ recover_xnid: | |||
| 1618 | set_node_addr(sbi, &ni, blkaddr, false); | 1611 | set_node_addr(sbi, &ni, blkaddr, false); |
| 1619 | 1612 | ||
| 1620 | update_inode_page(inode); | 1613 | update_inode_page(inode); |
| 1621 | return true; | ||
| 1622 | } | 1614 | } |
| 1623 | 1615 | ||
| 1624 | int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) | 1616 | int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) |
| @@ -1637,7 +1629,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) | |||
| 1637 | if (!ipage) | 1629 | if (!ipage) |
| 1638 | return -ENOMEM; | 1630 | return -ENOMEM; |
| 1639 | 1631 | ||
| 1640 | /* Should not use this inode from free nid list */ | 1632 | /* Should not use this inode from free nid list */ |
| 1641 | remove_free_nid(NM_I(sbi), ino); | 1633 | remove_free_nid(NM_I(sbi), ino); |
| 1642 | 1634 | ||
| 1643 | SetPageUptodate(ipage); | 1635 | SetPageUptodate(ipage); |
| @@ -1651,6 +1643,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) | |||
| 1651 | dst->i_blocks = cpu_to_le64(1); | 1643 | dst->i_blocks = cpu_to_le64(1); |
| 1652 | dst->i_links = cpu_to_le32(1); | 1644 | dst->i_links = cpu_to_le32(1); |
| 1653 | dst->i_xattr_nid = 0; | 1645 | dst->i_xattr_nid = 0; |
| 1646 | dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; | ||
| 1654 | 1647 | ||
| 1655 | new_ni = old_ni; | 1648 | new_ni = old_ni; |
| 1656 | new_ni.ino = ino; | 1649 | new_ni.ino = ino; |
| @@ -1659,13 +1652,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) | |||
| 1659 | WARN_ON(1); | 1652 | WARN_ON(1); |
| 1660 | set_node_addr(sbi, &new_ni, NEW_ADDR, false); | 1653 | set_node_addr(sbi, &new_ni, NEW_ADDR, false); |
| 1661 | inc_valid_inode_count(sbi); | 1654 | inc_valid_inode_count(sbi); |
| 1655 | set_page_dirty(ipage); | ||
| 1662 | f2fs_put_page(ipage, 1); | 1656 | f2fs_put_page(ipage, 1); |
| 1663 | return 0; | 1657 | return 0; |
| 1664 | } | 1658 | } |
| 1665 | 1659 | ||
| 1666 | /* | 1660 | /* |
| 1667 | * ra_sum_pages() merge contiguous pages into one bio and submit. | 1661 | * ra_sum_pages() merge contiguous pages into one bio and submit. |
| 1668 | * these pre-readed pages are alloced in bd_inode's mapping tree. | 1662 | * these pre-read pages are allocated in bd_inode's mapping tree. |
| 1669 | */ | 1663 | */ |
| 1670 | static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, | 1664 | static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, |
| 1671 | int start, int nrpages) | 1665 | int start, int nrpages) |
| @@ -1709,7 +1703,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, | |||
| 1709 | for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { | 1703 | for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { |
| 1710 | nrpages = min(last_offset - i, bio_blocks); | 1704 | nrpages = min(last_offset - i, bio_blocks); |
| 1711 | 1705 | ||
| 1712 | /* read ahead node pages */ | 1706 | /* readahead node pages */ |
| 1713 | nrpages = ra_sum_pages(sbi, pages, addr, nrpages); | 1707 | nrpages = ra_sum_pages(sbi, pages, addr, nrpages); |
| 1714 | if (!nrpages) | 1708 | if (!nrpages) |
| 1715 | return -ENOMEM; | 1709 | return -ENOMEM; |
| @@ -1967,7 +1961,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) | |||
| 1967 | nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; | 1961 | nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; |
| 1968 | 1962 | ||
| 1969 | /* not used nids: 0, node, meta, (and root counted as valid node) */ | 1963 | /* not used nids: 0, node, meta, (and root counted as valid node) */ |
| 1970 | nm_i->available_nids = nm_i->max_nid - 3; | 1964 | nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; |
| 1971 | nm_i->fcnt = 0; | 1965 | nm_i->fcnt = 0; |
| 1972 | nm_i->nat_cnt = 0; | 1966 | nm_i->nat_cnt = 0; |
| 1973 | nm_i->ram_thresh = DEF_RAM_THRESHOLD; | 1967 | nm_i->ram_thresh = DEF_RAM_THRESHOLD; |
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fe1c6d921ba2..756c41cd2582 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c | |||
| @@ -62,8 +62,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode) | |||
| 62 | } | 62 | } |
| 63 | retry: | 63 | retry: |
| 64 | de = f2fs_find_entry(dir, &name, &page); | 64 | de = f2fs_find_entry(dir, &name, &page); |
| 65 | if (de && inode->i_ino == le32_to_cpu(de->ino)) | 65 | if (de && inode->i_ino == le32_to_cpu(de->ino)) { |
| 66 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); | ||
| 66 | goto out_unmap_put; | 67 | goto out_unmap_put; |
| 68 | } | ||
| 67 | if (de) { | 69 | if (de) { |
| 68 | einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); | 70 | einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); |
| 69 | if (IS_ERR(einode)) { | 71 | if (IS_ERR(einode)) { |
| @@ -300,14 +302,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 300 | struct node_info ni; | 302 | struct node_info ni; |
| 301 | int err = 0, recovered = 0; | 303 | int err = 0, recovered = 0; |
| 302 | 304 | ||
| 303 | recover_inline_xattr(inode, page); | 305 | /* step 1: recover xattr */ |
| 304 | 306 | if (IS_INODE(page)) { | |
| 305 | if (recover_inline_data(inode, page)) | 307 | recover_inline_xattr(inode, page); |
| 308 | } else if (f2fs_has_xattr_block(ofs_of_node(page))) { | ||
| 309 | recover_xattr_data(inode, page, blkaddr); | ||
| 306 | goto out; | 310 | goto out; |
| 311 | } | ||
| 307 | 312 | ||
| 308 | if (recover_xattr_data(inode, page, blkaddr)) | 313 | /* step 2: recover inline data */ |
| 314 | if (recover_inline_data(inode, page)) | ||
| 309 | goto out; | 315 | goto out; |
| 310 | 316 | ||
| 317 | /* step 3: recover data indices */ | ||
| 311 | start = start_bidx_of_node(ofs_of_node(page), fi); | 318 | start = start_bidx_of_node(ofs_of_node(page), fi); |
| 312 | end = start + ADDRS_PER_PAGE(page, fi); | 319 | end = start + ADDRS_PER_PAGE(page, fi); |
| 313 | 320 | ||
| @@ -364,8 +371,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 364 | fill_node_footer(dn.node_page, dn.nid, ni.ino, | 371 | fill_node_footer(dn.node_page, dn.nid, ni.ino, |
| 365 | ofs_of_node(page), false); | 372 | ofs_of_node(page), false); |
| 366 | set_page_dirty(dn.node_page); | 373 | set_page_dirty(dn.node_page); |
| 367 | |||
| 368 | recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); | ||
| 369 | err: | 374 | err: |
| 370 | f2fs_put_dnode(&dn); | 375 | f2fs_put_dnode(&dn); |
| 371 | f2fs_unlock_op(sbi); | 376 | f2fs_unlock_op(sbi); |
| @@ -452,6 +457,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) | |||
| 452 | /* step #1: find fsynced inode numbers */ | 457 | /* step #1: find fsynced inode numbers */ |
| 453 | sbi->por_doing = true; | 458 | sbi->por_doing = true; |
| 454 | 459 | ||
| 460 | /* prevent checkpoint */ | ||
| 461 | mutex_lock(&sbi->cp_mutex); | ||
| 462 | |||
| 455 | blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); | 463 | blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); |
| 456 | 464 | ||
| 457 | err = find_fsync_dnodes(sbi, &inode_list); | 465 | err = find_fsync_dnodes(sbi, &inode_list); |
| @@ -465,7 +473,8 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) | |||
| 465 | 473 | ||
| 466 | /* step #2: recover data */ | 474 | /* step #2: recover data */ |
| 467 | err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); | 475 | err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); |
| 468 | f2fs_bug_on(!list_empty(&inode_list)); | 476 | if (!err) |
| 477 | f2fs_bug_on(!list_empty(&inode_list)); | ||
| 469 | out: | 478 | out: |
| 470 | destroy_fsync_dnodes(&inode_list); | 479 | destroy_fsync_dnodes(&inode_list); |
| 471 | kmem_cache_destroy(fsync_entry_slab); | 480 | kmem_cache_destroy(fsync_entry_slab); |
| @@ -482,8 +491,13 @@ out: | |||
| 482 | /* Flush all the NAT/SIT pages */ | 491 | /* Flush all the NAT/SIT pages */ |
| 483 | while (get_pages(sbi, F2FS_DIRTY_META)) | 492 | while (get_pages(sbi, F2FS_DIRTY_META)) |
| 484 | sync_meta_pages(sbi, META, LONG_MAX); | 493 | sync_meta_pages(sbi, META, LONG_MAX); |
| 494 | set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); | ||
| 495 | mutex_unlock(&sbi->cp_mutex); | ||
| 485 | } else if (need_writecp) { | 496 | } else if (need_writecp) { |
| 497 | mutex_unlock(&sbi->cp_mutex); | ||
| 486 | write_checkpoint(sbi, false); | 498 | write_checkpoint(sbi, false); |
| 499 | } else { | ||
| 500 | mutex_unlock(&sbi->cp_mutex); | ||
| 487 | } | 501 | } |
| 488 | return err; | 502 | return err; |
| 489 | } | 503 | } |
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0dfeebae2a50..0aa337cd5bba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c | |||
| @@ -62,7 +62,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) | |||
| 62 | } | 62 | } |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue | 65 | * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because |
| 66 | * f2fs_set_bit makes MSB and LSB reversed in a byte. | 66 | * f2fs_set_bit makes MSB and LSB reversed in a byte. |
| 67 | * Example: | 67 | * Example: |
| 68 | * LSB <--> MSB | 68 | * LSB <--> MSB |
| @@ -808,7 +808,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, | |||
| 808 | } | 808 | } |
| 809 | 809 | ||
| 810 | /* | 810 | /* |
| 811 | * This function always allocates a used segment (from dirty seglist) by SSR | 811 | * This function always allocates a used segment(from dirty seglist) by SSR |
| 812 | * manner, so it should recover the existing segment information of valid blocks | 812 | * manner, so it should recover the existing segment information of valid blocks |
| 813 | */ | 813 | */ |
| 814 | static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) | 814 | static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) |
| @@ -1103,55 +1103,6 @@ void recover_data_page(struct f2fs_sb_info *sbi, | |||
| 1103 | mutex_unlock(&curseg->curseg_mutex); | 1103 | mutex_unlock(&curseg->curseg_mutex); |
| 1104 | } | 1104 | } |
| 1105 | 1105 | ||
| 1106 | void rewrite_node_page(struct f2fs_sb_info *sbi, | ||
| 1107 | struct page *page, struct f2fs_summary *sum, | ||
| 1108 | block_t old_blkaddr, block_t new_blkaddr) | ||
| 1109 | { | ||
| 1110 | struct sit_info *sit_i = SIT_I(sbi); | ||
| 1111 | int type = CURSEG_WARM_NODE; | ||
| 1112 | struct curseg_info *curseg; | ||
| 1113 | unsigned int segno, old_cursegno; | ||
| 1114 | block_t next_blkaddr = next_blkaddr_of_node(page); | ||
| 1115 | unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); | ||
| 1116 | struct f2fs_io_info fio = { | ||
| 1117 | .type = NODE, | ||
| 1118 | .rw = WRITE_SYNC, | ||
| 1119 | }; | ||
| 1120 | |||
| 1121 | curseg = CURSEG_I(sbi, type); | ||
| 1122 | |||
| 1123 | mutex_lock(&curseg->curseg_mutex); | ||
| 1124 | mutex_lock(&sit_i->sentry_lock); | ||
| 1125 | |||
| 1126 | segno = GET_SEGNO(sbi, new_blkaddr); | ||
| 1127 | old_cursegno = curseg->segno; | ||
| 1128 | |||
| 1129 | /* change the current segment */ | ||
| 1130 | if (segno != curseg->segno) { | ||
| 1131 | curseg->next_segno = segno; | ||
| 1132 | change_curseg(sbi, type, true); | ||
| 1133 | } | ||
| 1134 | curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); | ||
| 1135 | __add_sum_entry(sbi, type, sum); | ||
| 1136 | |||
| 1137 | /* change the current log to the next block addr in advance */ | ||
| 1138 | if (next_segno != segno) { | ||
| 1139 | curseg->next_segno = next_segno; | ||
| 1140 | change_curseg(sbi, type, true); | ||
| 1141 | } | ||
| 1142 | curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); | ||
| 1143 | |||
| 1144 | /* rewrite node page */ | ||
| 1145 | set_page_writeback(page); | ||
| 1146 | f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); | ||
| 1147 | f2fs_submit_merged_bio(sbi, NODE, WRITE); | ||
| 1148 | refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); | ||
| 1149 | locate_dirty_segment(sbi, old_cursegno); | ||
| 1150 | |||
| 1151 | mutex_unlock(&sit_i->sentry_lock); | ||
| 1152 | mutex_unlock(&curseg->curseg_mutex); | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | static inline bool is_merged_page(struct f2fs_sb_info *sbi, | 1106 | static inline bool is_merged_page(struct f2fs_sb_info *sbi, |
| 1156 | struct page *page, enum page_type type) | 1107 | struct page *page, enum page_type type) |
| 1157 | { | 1108 | { |
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 55973f7b0330..ff483257283b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h | |||
| @@ -549,7 +549,7 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) | |||
| 549 | } | 549 | } |
| 550 | 550 | ||
| 551 | /* | 551 | /* |
| 552 | * Summary block is always treated as invalid block | 552 | * Summary block is always treated as an invalid block |
| 553 | */ | 553 | */ |
| 554 | static inline void check_block_count(struct f2fs_sb_info *sbi, | 554 | static inline void check_block_count(struct f2fs_sb_info *sbi, |
| 555 | int segno, struct f2fs_sit_entry *raw_sit) | 555 | int segno, struct f2fs_sit_entry *raw_sit) |
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 657582fc7601..41bdf511003d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c | |||
| @@ -432,9 +432,15 @@ static void f2fs_put_super(struct super_block *sb) | |||
| 432 | stop_gc_thread(sbi); | 432 | stop_gc_thread(sbi); |
| 433 | 433 | ||
| 434 | /* We don't need to do checkpoint when it's clean */ | 434 | /* We don't need to do checkpoint when it's clean */ |
| 435 | if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) | 435 | if (sbi->s_dirty) |
| 436 | write_checkpoint(sbi, true); | 436 | write_checkpoint(sbi, true); |
| 437 | 437 | ||
| 438 | /* | ||
| 439 | * normally superblock is clean, so we need to release this. | ||
| 440 | * In addition, EIO will skip do checkpoint, we need this as well. | ||
| 441 | */ | ||
| 442 | release_dirty_inode(sbi); | ||
| 443 | |||
| 438 | iput(sbi->node_inode); | 444 | iput(sbi->node_inode); |
| 439 | iput(sbi->meta_inode); | 445 | iput(sbi->meta_inode); |
| 440 | 446 | ||
| @@ -457,9 +463,6 @@ int f2fs_sync_fs(struct super_block *sb, int sync) | |||
| 457 | 463 | ||
| 458 | trace_f2fs_sync_fs(sb, sync); | 464 | trace_f2fs_sync_fs(sb, sync); |
| 459 | 465 | ||
| 460 | if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) | ||
| 461 | return 0; | ||
| 462 | |||
| 463 | if (sync) { | 466 | if (sync) { |
| 464 | mutex_lock(&sbi->gc_mutex); | 467 | mutex_lock(&sbi->gc_mutex); |
| 465 | write_checkpoint(sbi, false); | 468 | write_checkpoint(sbi, false); |
| @@ -505,8 +508,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 505 | buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; | 508 | buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; |
| 506 | buf->f_bavail = user_block_count - valid_user_blocks(sbi); | 509 | buf->f_bavail = user_block_count - valid_user_blocks(sbi); |
| 507 | 510 | ||
| 508 | buf->f_files = sbi->total_node_count; | 511 | buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; |
| 509 | buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); | 512 | buf->f_ffree = buf->f_files - valid_inode_count(sbi); |
| 510 | 513 | ||
| 511 | buf->f_namelen = F2FS_NAME_LEN; | 514 | buf->f_namelen = F2FS_NAME_LEN; |
| 512 | buf->f_fsid.val[0] = (u32)id; | 515 | buf->f_fsid.val[0] = (u32)id; |
| @@ -663,7 +666,7 @@ restore_gc: | |||
| 663 | if (need_restart_gc) { | 666 | if (need_restart_gc) { |
| 664 | if (start_gc_thread(sbi)) | 667 | if (start_gc_thread(sbi)) |
| 665 | f2fs_msg(sbi->sb, KERN_WARNING, | 668 | f2fs_msg(sbi->sb, KERN_WARNING, |
| 666 | "background gc thread is stop"); | 669 | "background gc thread has stopped"); |
| 667 | } else if (need_stop_gc) { | 670 | } else if (need_stop_gc) { |
| 668 | stop_gc_thread(sbi); | 671 | stop_gc_thread(sbi); |
| 669 | } | 672 | } |
| @@ -812,7 +815,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) | |||
| 812 | if (unlikely(fsmeta >= total)) | 815 | if (unlikely(fsmeta >= total)) |
| 813 | return 1; | 816 | return 1; |
| 814 | 817 | ||
| 815 | if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { | 818 | if (unlikely(f2fs_cp_error(sbi))) { |
| 816 | f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); | 819 | f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); |
| 817 | return 1; | 820 | return 1; |
| 818 | } | 821 | } |
| @@ -899,8 +902,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 899 | struct buffer_head *raw_super_buf; | 902 | struct buffer_head *raw_super_buf; |
| 900 | struct inode *root; | 903 | struct inode *root; |
| 901 | long err = -EINVAL; | 904 | long err = -EINVAL; |
| 905 | bool retry = true; | ||
| 902 | int i; | 906 | int i; |
| 903 | 907 | ||
| 908 | try_onemore: | ||
| 904 | /* allocate memory for f2fs-specific super block info */ | 909 | /* allocate memory for f2fs-specific super block info */ |
| 905 | sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); | 910 | sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); |
| 906 | if (!sbi) | 911 | if (!sbi) |
| @@ -1080,9 +1085,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 1080 | /* recover fsynced data */ | 1085 | /* recover fsynced data */ |
| 1081 | if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { | 1086 | if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { |
| 1082 | err = recover_fsync_data(sbi); | 1087 | err = recover_fsync_data(sbi); |
| 1083 | if (err) | 1088 | if (err) { |
| 1084 | f2fs_msg(sb, KERN_ERR, | 1089 | f2fs_msg(sb, KERN_ERR, |
| 1085 | "Cannot recover all fsync data errno=%ld", err); | 1090 | "Cannot recover all fsync data errno=%ld", err); |
| 1091 | goto free_kobj; | ||
| 1092 | } | ||
| 1086 | } | 1093 | } |
| 1087 | 1094 | ||
| 1088 | /* | 1095 | /* |
| @@ -1123,6 +1130,13 @@ free_sb_buf: | |||
| 1123 | brelse(raw_super_buf); | 1130 | brelse(raw_super_buf); |
| 1124 | free_sbi: | 1131 | free_sbi: |
| 1125 | kfree(sbi); | 1132 | kfree(sbi); |
| 1133 | |||
| 1134 | /* give only one another chance */ | ||
| 1135 | if (retry) { | ||
| 1136 | retry = 0; | ||
| 1137 | shrink_dcache_sb(sb); | ||
| 1138 | goto try_onemore; | ||
| 1139 | } | ||
| 1126 | return err; | 1140 | return err; |
| 1127 | } | 1141 | } |
| 1128 | 1142 | ||
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8bea941ee309..728a5dc3dc16 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c | |||
| @@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, | |||
| 528 | int free; | 528 | int free; |
| 529 | /* | 529 | /* |
| 530 | * If value is NULL, it is remove operation. | 530 | * If value is NULL, it is remove operation. |
| 531 | * In case of update operation, we caculate free. | 531 | * In case of update operation, we calculate free. |
| 532 | */ | 532 | */ |
| 533 | free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); | 533 | free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); |
| 534 | if (found) | 534 | if (found) |
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4556ce1af5b0..5ddaf8625d3b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c | |||
| @@ -61,7 +61,7 @@ static void isofs_put_super(struct super_block *sb) | |||
| 61 | return; | 61 | return; |
| 62 | } | 62 | } |
| 63 | 63 | ||
| 64 | static int isofs_read_inode(struct inode *); | 64 | static int isofs_read_inode(struct inode *, int relocated); |
| 65 | static int isofs_statfs (struct dentry *, struct kstatfs *); | 65 | static int isofs_statfs (struct dentry *, struct kstatfs *); |
| 66 | 66 | ||
| 67 | static struct kmem_cache *isofs_inode_cachep; | 67 | static struct kmem_cache *isofs_inode_cachep; |
| @@ -1259,7 +1259,7 @@ out_toomany: | |||
| 1259 | goto out; | 1259 | goto out; |
| 1260 | } | 1260 | } |
| 1261 | 1261 | ||
| 1262 | static int isofs_read_inode(struct inode *inode) | 1262 | static int isofs_read_inode(struct inode *inode, int relocated) |
| 1263 | { | 1263 | { |
| 1264 | struct super_block *sb = inode->i_sb; | 1264 | struct super_block *sb = inode->i_sb; |
| 1265 | struct isofs_sb_info *sbi = ISOFS_SB(sb); | 1265 | struct isofs_sb_info *sbi = ISOFS_SB(sb); |
| @@ -1404,7 +1404,7 @@ static int isofs_read_inode(struct inode *inode) | |||
| 1404 | */ | 1404 | */ |
| 1405 | 1405 | ||
| 1406 | if (!high_sierra) { | 1406 | if (!high_sierra) { |
| 1407 | parse_rock_ridge_inode(de, inode); | 1407 | parse_rock_ridge_inode(de, inode, relocated); |
| 1408 | /* if we want uid/gid set, override the rock ridge setting */ | 1408 | /* if we want uid/gid set, override the rock ridge setting */ |
| 1409 | if (sbi->s_uid_set) | 1409 | if (sbi->s_uid_set) |
| 1410 | inode->i_uid = sbi->s_uid; | 1410 | inode->i_uid = sbi->s_uid; |
| @@ -1483,9 +1483,10 @@ static int isofs_iget5_set(struct inode *ino, void *data) | |||
| 1483 | * offset that point to the underlying meta-data for the inode. The | 1483 | * offset that point to the underlying meta-data for the inode. The |
| 1484 | * code below is otherwise similar to the iget() code in | 1484 | * code below is otherwise similar to the iget() code in |
| 1485 | * include/linux/fs.h */ | 1485 | * include/linux/fs.h */ |
| 1486 | struct inode *isofs_iget(struct super_block *sb, | 1486 | struct inode *__isofs_iget(struct super_block *sb, |
| 1487 | unsigned long block, | 1487 | unsigned long block, |
| 1488 | unsigned long offset) | 1488 | unsigned long offset, |
| 1489 | int relocated) | ||
| 1489 | { | 1490 | { |
| 1490 | unsigned long hashval; | 1491 | unsigned long hashval; |
| 1491 | struct inode *inode; | 1492 | struct inode *inode; |
| @@ -1507,7 +1508,7 @@ struct inode *isofs_iget(struct super_block *sb, | |||
| 1507 | return ERR_PTR(-ENOMEM); | 1508 | return ERR_PTR(-ENOMEM); |
| 1508 | 1509 | ||
| 1509 | if (inode->i_state & I_NEW) { | 1510 | if (inode->i_state & I_NEW) { |
| 1510 | ret = isofs_read_inode(inode); | 1511 | ret = isofs_read_inode(inode, relocated); |
| 1511 | if (ret < 0) { | 1512 | if (ret < 0) { |
| 1512 | iget_failed(inode); | 1513 | iget_failed(inode); |
| 1513 | inode = ERR_PTR(ret); | 1514 | inode = ERR_PTR(ret); |
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 99167238518d..0ac4c1f73fbd 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h | |||
| @@ -107,7 +107,7 @@ extern int iso_date(char *, int); | |||
| 107 | 107 | ||
| 108 | struct inode; /* To make gcc happy */ | 108 | struct inode; /* To make gcc happy */ |
| 109 | 109 | ||
| 110 | extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); | 110 | extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated); |
| 111 | extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); | 111 | extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); |
| 112 | extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); | 112 | extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); |
| 113 | 113 | ||
| @@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int | |||
| 118 | extern struct buffer_head *isofs_bread(struct inode *, sector_t); | 118 | extern struct buffer_head *isofs_bread(struct inode *, sector_t); |
| 119 | extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); | 119 | extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); |
| 120 | 120 | ||
| 121 | extern struct inode *isofs_iget(struct super_block *sb, | 121 | struct inode *__isofs_iget(struct super_block *sb, |
| 122 | unsigned long block, | 122 | unsigned long block, |
| 123 | unsigned long offset); | 123 | unsigned long offset, |
| 124 | int relocated); | ||
| 125 | |||
| 126 | static inline struct inode *isofs_iget(struct super_block *sb, | ||
| 127 | unsigned long block, | ||
| 128 | unsigned long offset) | ||
| 129 | { | ||
| 130 | return __isofs_iget(sb, block, offset, 0); | ||
| 131 | } | ||
| 132 | |||
| 133 | static inline struct inode *isofs_iget_reloc(struct super_block *sb, | ||
| 134 | unsigned long block, | ||
| 135 | unsigned long offset) | ||
| 136 | { | ||
| 137 | return __isofs_iget(sb, block, offset, 1); | ||
| 138 | } | ||
| 124 | 139 | ||
| 125 | /* Because the inode number is no longer relevant to finding the | 140 | /* Because the inode number is no longer relevant to finding the |
| 126 | * underlying meta-data for an inode, we are free to choose a more | 141 | * underlying meta-data for an inode, we are free to choose a more |
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index c0bf42472e40..f488bbae541a 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c | |||
| @@ -288,12 +288,16 @@ eio: | |||
| 288 | goto out; | 288 | goto out; |
| 289 | } | 289 | } |
| 290 | 290 | ||
| 291 | #define RR_REGARD_XA 1 | ||
| 292 | #define RR_RELOC_DE 2 | ||
| 293 | |||
| 291 | static int | 294 | static int |
| 292 | parse_rock_ridge_inode_internal(struct iso_directory_record *de, | 295 | parse_rock_ridge_inode_internal(struct iso_directory_record *de, |
| 293 | struct inode *inode, int regard_xa) | 296 | struct inode *inode, int flags) |
| 294 | { | 297 | { |
| 295 | int symlink_len = 0; | 298 | int symlink_len = 0; |
| 296 | int cnt, sig; | 299 | int cnt, sig; |
| 300 | unsigned int reloc_block; | ||
| 297 | struct inode *reloc; | 301 | struct inode *reloc; |
| 298 | struct rock_ridge *rr; | 302 | struct rock_ridge *rr; |
| 299 | int rootflag; | 303 | int rootflag; |
| @@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, | |||
| 305 | 309 | ||
| 306 | init_rock_state(&rs, inode); | 310 | init_rock_state(&rs, inode); |
| 307 | setup_rock_ridge(de, inode, &rs); | 311 | setup_rock_ridge(de, inode, &rs); |
| 308 | if (regard_xa) { | 312 | if (flags & RR_REGARD_XA) { |
| 309 | rs.chr += 14; | 313 | rs.chr += 14; |
| 310 | rs.len -= 14; | 314 | rs.len -= 14; |
| 311 | if (rs.len < 0) | 315 | if (rs.len < 0) |
| @@ -485,12 +489,22 @@ repeat: | |||
| 485 | "relocated directory\n"); | 489 | "relocated directory\n"); |
| 486 | goto out; | 490 | goto out; |
| 487 | case SIG('C', 'L'): | 491 | case SIG('C', 'L'): |
| 488 | ISOFS_I(inode)->i_first_extent = | 492 | if (flags & RR_RELOC_DE) { |
| 489 | isonum_733(rr->u.CL.location); | 493 | printk(KERN_ERR |
| 490 | reloc = | 494 | "ISOFS: Recursive directory relocation " |
| 491 | isofs_iget(inode->i_sb, | 495 | "is not supported\n"); |
| 492 | ISOFS_I(inode)->i_first_extent, | 496 | goto eio; |
| 493 | 0); | 497 | } |
| 498 | reloc_block = isonum_733(rr->u.CL.location); | ||
| 499 | if (reloc_block == ISOFS_I(inode)->i_iget5_block && | ||
| 500 | ISOFS_I(inode)->i_iget5_offset == 0) { | ||
| 501 | printk(KERN_ERR | ||
| 502 | "ISOFS: Directory relocation points to " | ||
| 503 | "itself\n"); | ||
| 504 | goto eio; | ||
| 505 | } | ||
| 506 | ISOFS_I(inode)->i_first_extent = reloc_block; | ||
| 507 | reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0); | ||
| 494 | if (IS_ERR(reloc)) { | 508 | if (IS_ERR(reloc)) { |
| 495 | ret = PTR_ERR(reloc); | 509 | ret = PTR_ERR(reloc); |
| 496 | goto out; | 510 | goto out; |
| @@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) | |||
| 637 | return rpnt; | 651 | return rpnt; |
| 638 | } | 652 | } |
| 639 | 653 | ||
| 640 | int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) | 654 | int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode, |
| 655 | int relocated) | ||
| 641 | { | 656 | { |
| 642 | int result = parse_rock_ridge_inode_internal(de, inode, 0); | 657 | int flags = relocated ? RR_RELOC_DE : 0; |
| 658 | int result = parse_rock_ridge_inode_internal(de, inode, flags); | ||
| 643 | 659 | ||
| 644 | /* | 660 | /* |
| 645 | * if rockridge flag was reset and we didn't look for attributes | 661 | * if rockridge flag was reset and we didn't look for attributes |
| @@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) | |||
| 647 | */ | 663 | */ |
| 648 | if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) | 664 | if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) |
| 649 | && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { | 665 | && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { |
| 650 | result = parse_rock_ridge_inode_internal(de, inode, 14); | 666 | result = parse_rock_ridge_inode_internal(de, inode, |
| 667 | flags | RR_REGARD_XA); | ||
| 651 | } | 668 | } |
| 652 | return result; | 669 | return result; |
| 653 | } | 670 | } |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6fac74349856..b73e0215baa7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
| @@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) | |||
| 97 | struct commit_header *h; | 97 | struct commit_header *h; |
| 98 | __u32 csum; | 98 | __u32 csum; |
| 99 | 99 | ||
| 100 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 100 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 101 | return; | 101 | return; |
| 102 | 102 | ||
| 103 | h = (struct commit_header *)(bh->b_data); | 103 | h = (struct commit_header *)(bh->b_data); |
| @@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | |||
| 313 | return checksum; | 313 | return checksum; |
| 314 | } | 314 | } |
| 315 | 315 | ||
| 316 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | 316 | static void write_tag_block(journal_t *j, journal_block_tag_t *tag, |
| 317 | unsigned long long block) | 317 | unsigned long long block) |
| 318 | { | 318 | { |
| 319 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); | 319 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); |
| 320 | if (tag_bytes > JBD2_TAG_SIZE32) | 320 | if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) |
| 321 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); | 321 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); |
| 322 | } | 322 | } |
| 323 | 323 | ||
| @@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j, | |||
| 327 | struct jbd2_journal_block_tail *tail; | 327 | struct jbd2_journal_block_tail *tail; |
| 328 | __u32 csum; | 328 | __u32 csum; |
| 329 | 329 | ||
| 330 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 330 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 331 | return; | 331 | return; |
| 332 | 332 | ||
| 333 | tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - | 333 | tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - |
| @@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j, | |||
| 340 | static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, | 340 | static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, |
| 341 | struct buffer_head *bh, __u32 sequence) | 341 | struct buffer_head *bh, __u32 sequence) |
| 342 | { | 342 | { |
| 343 | journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; | ||
| 343 | struct page *page = bh->b_page; | 344 | struct page *page = bh->b_page; |
| 344 | __u8 *addr; | 345 | __u8 *addr; |
| 345 | __u32 csum32; | 346 | __u32 csum32; |
| 346 | __be32 seq; | 347 | __be32 seq; |
| 347 | 348 | ||
| 348 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 349 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 349 | return; | 350 | return; |
| 350 | 351 | ||
| 351 | seq = cpu_to_be32(sequence); | 352 | seq = cpu_to_be32(sequence); |
| @@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, | |||
| 355 | bh->b_size); | 356 | bh->b_size); |
| 356 | kunmap_atomic(addr); | 357 | kunmap_atomic(addr); |
| 357 | 358 | ||
| 358 | /* We only have space to store the lower 16 bits of the crc32c. */ | 359 | if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) |
| 359 | tag->t_checksum = cpu_to_be16(csum32); | 360 | tag3->t_checksum = cpu_to_be32(csum32); |
| 361 | else | ||
| 362 | tag->t_checksum = cpu_to_be16(csum32); | ||
| 360 | } | 363 | } |
| 361 | /* | 364 | /* |
| 362 | * jbd2_journal_commit_transaction | 365 | * jbd2_journal_commit_transaction |
| @@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 396 | LIST_HEAD(io_bufs); | 399 | LIST_HEAD(io_bufs); |
| 397 | LIST_HEAD(log_bufs); | 400 | LIST_HEAD(log_bufs); |
| 398 | 401 | ||
| 399 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 402 | if (jbd2_journal_has_csum_v2or3(journal)) |
| 400 | csum_size = sizeof(struct jbd2_journal_block_tail); | 403 | csum_size = sizeof(struct jbd2_journal_block_tail); |
| 401 | 404 | ||
| 402 | /* | 405 | /* |
| @@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 690 | tag_flag |= JBD2_FLAG_SAME_UUID; | 693 | tag_flag |= JBD2_FLAG_SAME_UUID; |
| 691 | 694 | ||
| 692 | tag = (journal_block_tag_t *) tagp; | 695 | tag = (journal_block_tag_t *) tagp; |
| 693 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); | 696 | write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); |
| 694 | tag->t_flags = cpu_to_be16(tag_flag); | 697 | tag->t_flags = cpu_to_be16(tag_flag); |
| 695 | jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], | 698 | jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], |
| 696 | commit_transaction->t_tid); | 699 | commit_transaction->t_tid); |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 67b8e303946c..19d74d86d99c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
| @@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug); | |||
| 124 | /* Checksumming functions */ | 124 | /* Checksumming functions */ |
| 125 | static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) | 125 | static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) |
| 126 | { | 126 | { |
| 127 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 127 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 128 | return 1; | 128 | return 1; |
| 129 | 129 | ||
| 130 | return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; | 130 | return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; |
| @@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) | |||
| 145 | 145 | ||
| 146 | static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) | 146 | static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) |
| 147 | { | 147 | { |
| 148 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 148 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 149 | return 1; | 149 | return 1; |
| 150 | 150 | ||
| 151 | return sb->s_checksum == jbd2_superblock_csum(j, sb); | 151 | return sb->s_checksum == jbd2_superblock_csum(j, sb); |
| @@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) | |||
| 153 | 153 | ||
| 154 | static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) | 154 | static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) |
| 155 | { | 155 | { |
| 156 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 156 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 157 | return; | 157 | return; |
| 158 | 158 | ||
| 159 | sb->s_checksum = jbd2_superblock_csum(j, sb); | 159 | sb->s_checksum = jbd2_superblock_csum(j, sb); |
| @@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal) | |||
| 1522 | goto out; | 1522 | goto out; |
| 1523 | } | 1523 | } |
| 1524 | 1524 | ||
| 1525 | if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && | 1525 | if (jbd2_journal_has_csum_v2or3(journal) && |
| 1526 | JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { | 1526 | JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { |
| 1527 | /* Can't have checksum v1 and v2 on at the same time! */ | 1527 | /* Can't have checksum v1 and v2 on at the same time! */ |
| 1528 | printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " | 1528 | printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " |
| 1529 | "at the same time!\n"); | 1529 | "at the same time!\n"); |
| 1530 | goto out; | 1530 | goto out; |
| 1531 | } | 1531 | } |
| 1532 | 1532 | ||
| 1533 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && | ||
| 1534 | JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { | ||
| 1535 | /* Can't have checksum v2 and v3 at the same time! */ | ||
| 1536 | printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " | ||
| 1537 | "at the same time!\n"); | ||
| 1538 | goto out; | ||
| 1539 | } | ||
| 1540 | |||
| 1533 | if (!jbd2_verify_csum_type(journal, sb)) { | 1541 | if (!jbd2_verify_csum_type(journal, sb)) { |
| 1534 | printk(KERN_ERR "JBD2: Unknown checksum type\n"); | 1542 | printk(KERN_ERR "JBD2: Unknown checksum type\n"); |
| 1535 | goto out; | 1543 | goto out; |
| 1536 | } | 1544 | } |
| 1537 | 1545 | ||
| 1538 | /* Load the checksum driver */ | 1546 | /* Load the checksum driver */ |
| 1539 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { | 1547 | if (jbd2_journal_has_csum_v2or3(journal)) { |
| 1540 | journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); | 1548 | journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); |
| 1541 | if (IS_ERR(journal->j_chksum_driver)) { | 1549 | if (IS_ERR(journal->j_chksum_driver)) { |
| 1542 | printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); | 1550 | printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); |
| @@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal) | |||
| 1553 | } | 1561 | } |
| 1554 | 1562 | ||
| 1555 | /* Precompute checksum seed for all metadata */ | 1563 | /* Precompute checksum seed for all metadata */ |
| 1556 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 1564 | if (jbd2_journal_has_csum_v2or3(journal)) |
| 1557 | journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, | 1565 | journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, |
| 1558 | sizeof(sb->s_uuid)); | 1566 | sizeof(sb->s_uuid)); |
| 1559 | 1567 | ||
| @@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
| 1813 | if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) | 1821 | if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) |
| 1814 | return 0; | 1822 | return 0; |
| 1815 | 1823 | ||
| 1816 | /* Asking for checksumming v2 and v1? Only give them v2. */ | 1824 | /* If enabling v2 checksums, turn on v3 instead */ |
| 1817 | if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && | 1825 | if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { |
| 1826 | incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; | ||
| 1827 | incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; | ||
| 1828 | } | ||
| 1829 | |||
| 1830 | /* Asking for checksumming v3 and v1? Only give them v3. */ | ||
| 1831 | if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && | ||
| 1818 | compat & JBD2_FEATURE_COMPAT_CHECKSUM) | 1832 | compat & JBD2_FEATURE_COMPAT_CHECKSUM) |
| 1819 | compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; | 1833 | compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; |
| 1820 | 1834 | ||
| @@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
| 1823 | 1837 | ||
| 1824 | sb = journal->j_superblock; | 1838 | sb = journal->j_superblock; |
| 1825 | 1839 | ||
| 1826 | /* If enabling v2 checksums, update superblock */ | 1840 | /* If enabling v3 checksums, update superblock */ |
| 1827 | if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { | 1841 | if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { |
| 1828 | sb->s_checksum_type = JBD2_CRC32C_CHKSUM; | 1842 | sb->s_checksum_type = JBD2_CRC32C_CHKSUM; |
| 1829 | sb->s_feature_compat &= | 1843 | sb->s_feature_compat &= |
| 1830 | ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); | 1844 | ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); |
| @@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
| 1842 | } | 1856 | } |
| 1843 | 1857 | ||
| 1844 | /* Precompute checksum seed for all metadata */ | 1858 | /* Precompute checksum seed for all metadata */ |
| 1845 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 1859 | if (jbd2_journal_has_csum_v2or3(journal)) |
| 1846 | JBD2_FEATURE_INCOMPAT_CSUM_V2)) | ||
| 1847 | journal->j_csum_seed = jbd2_chksum(journal, ~0, | 1860 | journal->j_csum_seed = jbd2_chksum(journal, ~0, |
| 1848 | sb->s_uuid, | 1861 | sb->s_uuid, |
| 1849 | sizeof(sb->s_uuid)); | 1862 | sizeof(sb->s_uuid)); |
| @@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
| 1852 | /* If enabling v1 checksums, downgrade superblock */ | 1865 | /* If enabling v1 checksums, downgrade superblock */ |
| 1853 | if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) | 1866 | if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) |
| 1854 | sb->s_feature_incompat &= | 1867 | sb->s_feature_incompat &= |
| 1855 | ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); | 1868 | ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | |
| 1869 | JBD2_FEATURE_INCOMPAT_CSUM_V3); | ||
| 1856 | 1870 | ||
| 1857 | sb->s_feature_compat |= cpu_to_be32(compat); | 1871 | sb->s_feature_compat |= cpu_to_be32(compat); |
| 1858 | sb->s_feature_ro_compat |= cpu_to_be32(ro); | 1872 | sb->s_feature_ro_compat |= cpu_to_be32(ro); |
| @@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode) | |||
| 2165 | */ | 2179 | */ |
| 2166 | size_t journal_tag_bytes(journal_t *journal) | 2180 | size_t journal_tag_bytes(journal_t *journal) |
| 2167 | { | 2181 | { |
| 2168 | journal_block_tag_t tag; | 2182 | size_t sz; |
| 2169 | size_t x = 0; | 2183 | |
| 2184 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) | ||
| 2185 | return sizeof(journal_block_tag3_t); | ||
| 2186 | |||
| 2187 | sz = sizeof(journal_block_tag_t); | ||
| 2170 | 2188 | ||
| 2171 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 2189 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) |
| 2172 | x += sizeof(tag.t_checksum); | 2190 | sz += sizeof(__u16); |
| 2173 | 2191 | ||
| 2174 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) | 2192 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) |
| 2175 | return x + JBD2_TAG_SIZE64; | 2193 | return sz; |
| 2176 | else | 2194 | else |
| 2177 | return x + JBD2_TAG_SIZE32; | 2195 | return sz - sizeof(__u32); |
| 2178 | } | 2196 | } |
| 2179 | 2197 | ||
| 2180 | /* | 2198 | /* |
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3b6bb19d60b1..9b329b55ffe3 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
| @@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j, | |||
| 181 | __be32 provided; | 181 | __be32 provided; |
| 182 | __u32 calculated; | 182 | __u32 calculated; |
| 183 | 183 | ||
| 184 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 184 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 185 | return 1; | 185 | return 1; |
| 186 | 186 | ||
| 187 | tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - | 187 | tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - |
| @@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) | |||
| 205 | int nr = 0, size = journal->j_blocksize; | 205 | int nr = 0, size = journal->j_blocksize; |
| 206 | int tag_bytes = journal_tag_bytes(journal); | 206 | int tag_bytes = journal_tag_bytes(journal); |
| 207 | 207 | ||
| 208 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 208 | if (jbd2_journal_has_csum_v2or3(journal)) |
| 209 | size -= sizeof(struct jbd2_journal_block_tail); | 209 | size -= sizeof(struct jbd2_journal_block_tail); |
| 210 | 210 | ||
| 211 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 211 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
| @@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal) | |||
| 338 | return err; | 338 | return err; |
| 339 | } | 339 | } |
| 340 | 340 | ||
| 341 | static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) | 341 | static inline unsigned long long read_tag_block(journal_t *journal, |
| 342 | journal_block_tag_t *tag) | ||
| 342 | { | 343 | { |
| 343 | unsigned long long block = be32_to_cpu(tag->t_blocknr); | 344 | unsigned long long block = be32_to_cpu(tag->t_blocknr); |
| 344 | if (tag_bytes > JBD2_TAG_SIZE32) | 345 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) |
| 345 | block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; | 346 | block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; |
| 346 | return block; | 347 | return block; |
| 347 | } | 348 | } |
| @@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) | |||
| 384 | __be32 provided; | 385 | __be32 provided; |
| 385 | __u32 calculated; | 386 | __u32 calculated; |
| 386 | 387 | ||
| 387 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 388 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 388 | return 1; | 389 | return 1; |
| 389 | 390 | ||
| 390 | h = buf; | 391 | h = buf; |
| @@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) | |||
| 399 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, | 400 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, |
| 400 | void *buf, __u32 sequence) | 401 | void *buf, __u32 sequence) |
| 401 | { | 402 | { |
| 403 | journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; | ||
| 402 | __u32 csum32; | 404 | __u32 csum32; |
| 403 | __be32 seq; | 405 | __be32 seq; |
| 404 | 406 | ||
| 405 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 407 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 406 | return 1; | 408 | return 1; |
| 407 | 409 | ||
| 408 | seq = cpu_to_be32(sequence); | 410 | seq = cpu_to_be32(sequence); |
| 409 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); | 411 | csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); |
| 410 | csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); | 412 | csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); |
| 411 | 413 | ||
| 412 | return tag->t_checksum == cpu_to_be16(csum32); | 414 | if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) |
| 415 | return tag3->t_checksum == cpu_to_be32(csum32); | ||
| 416 | else | ||
| 417 | return tag->t_checksum == cpu_to_be16(csum32); | ||
| 413 | } | 418 | } |
| 414 | 419 | ||
| 415 | static int do_one_pass(journal_t *journal, | 420 | static int do_one_pass(journal_t *journal, |
| @@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal, | |||
| 426 | int tag_bytes = journal_tag_bytes(journal); | 431 | int tag_bytes = journal_tag_bytes(journal); |
| 427 | __u32 crc32_sum = ~0; /* Transactional Checksums */ | 432 | __u32 crc32_sum = ~0; /* Transactional Checksums */ |
| 428 | int descr_csum_size = 0; | 433 | int descr_csum_size = 0; |
| 434 | int block_error = 0; | ||
| 429 | 435 | ||
| 430 | /* | 436 | /* |
| 431 | * First thing is to establish what we expect to find in the log | 437 | * First thing is to establish what we expect to find in the log |
| @@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal, | |||
| 512 | switch(blocktype) { | 518 | switch(blocktype) { |
| 513 | case JBD2_DESCRIPTOR_BLOCK: | 519 | case JBD2_DESCRIPTOR_BLOCK: |
| 514 | /* Verify checksum first */ | 520 | /* Verify checksum first */ |
| 515 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 521 | if (jbd2_journal_has_csum_v2or3(journal)) |
| 516 | JBD2_FEATURE_INCOMPAT_CSUM_V2)) | ||
| 517 | descr_csum_size = | 522 | descr_csum_size = |
| 518 | sizeof(struct jbd2_journal_block_tail); | 523 | sizeof(struct jbd2_journal_block_tail); |
| 519 | if (descr_csum_size > 0 && | 524 | if (descr_csum_size > 0 && |
| @@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal, | |||
| 574 | unsigned long long blocknr; | 579 | unsigned long long blocknr; |
| 575 | 580 | ||
| 576 | J_ASSERT(obh != NULL); | 581 | J_ASSERT(obh != NULL); |
| 577 | blocknr = read_tag_block(tag_bytes, | 582 | blocknr = read_tag_block(journal, |
| 578 | tag); | 583 | tag); |
| 579 | 584 | ||
| 580 | /* If the block has been | 585 | /* If the block has been |
| @@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal, | |||
| 598 | "checksum recovering " | 603 | "checksum recovering " |
| 599 | "block %llu in log\n", | 604 | "block %llu in log\n", |
| 600 | blocknr); | 605 | blocknr); |
| 601 | continue; | 606 | block_error = 1; |
| 607 | goto skip_write; | ||
| 602 | } | 608 | } |
| 603 | 609 | ||
| 604 | /* Find a buffer for the new | 610 | /* Find a buffer for the new |
| @@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal, | |||
| 797 | success = -EIO; | 803 | success = -EIO; |
| 798 | } | 804 | } |
| 799 | } | 805 | } |
| 800 | 806 | if (block_error && success == 0) | |
| 807 | success = -EIO; | ||
| 801 | return success; | 808 | return success; |
| 802 | 809 | ||
| 803 | failed: | 810 | failed: |
| @@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, | |||
| 811 | __be32 provided; | 818 | __be32 provided; |
| 812 | __u32 calculated; | 819 | __u32 calculated; |
| 813 | 820 | ||
| 814 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 821 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 815 | return 1; | 822 | return 1; |
| 816 | 823 | ||
| 817 | tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - | 824 | tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - |
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 198c9c10276d..d5e95a175c92 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
| @@ -91,8 +91,8 @@ | |||
| 91 | #include <linux/list.h> | 91 | #include <linux/list.h> |
| 92 | #include <linux/init.h> | 92 | #include <linux/init.h> |
| 93 | #include <linux/bio.h> | 93 | #include <linux/bio.h> |
| 94 | #endif | ||
| 95 | #include <linux/log2.h> | 94 | #include <linux/log2.h> |
| 95 | #endif | ||
| 96 | 96 | ||
| 97 | static struct kmem_cache *jbd2_revoke_record_cache; | 97 | static struct kmem_cache *jbd2_revoke_record_cache; |
| 98 | static struct kmem_cache *jbd2_revoke_table_cache; | 98 | static struct kmem_cache *jbd2_revoke_table_cache; |
| @@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal, | |||
| 597 | offset = *offsetp; | 597 | offset = *offsetp; |
| 598 | 598 | ||
| 599 | /* Do we need to leave space at the end for a checksum? */ | 599 | /* Do we need to leave space at the end for a checksum? */ |
| 600 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 600 | if (jbd2_journal_has_csum_v2or3(journal)) |
| 601 | csum_size = sizeof(struct jbd2_journal_revoke_tail); | 601 | csum_size = sizeof(struct jbd2_journal_revoke_tail); |
| 602 | 602 | ||
| 603 | /* Make sure we have a descriptor with space left for the record */ | 603 | /* Make sure we have a descriptor with space left for the record */ |
| @@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) | |||
| 644 | struct jbd2_journal_revoke_tail *tail; | 644 | struct jbd2_journal_revoke_tail *tail; |
| 645 | __u32 csum; | 645 | __u32 csum; |
| 646 | 646 | ||
| 647 | if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) | 647 | if (!jbd2_journal_has_csum_v2or3(j)) |
| 648 | return; | 648 | return; |
| 649 | 649 | ||
| 650 | tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - | 650 | tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - |
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 8f27c93f8d2e..ec9e082f9ecd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c | |||
| @@ -253,13 +253,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) | |||
| 253 | 253 | ||
| 254 | error = make_socks(serv, net); | 254 | error = make_socks(serv, net); |
| 255 | if (error < 0) | 255 | if (error < 0) |
| 256 | goto err_socks; | 256 | goto err_bind; |
| 257 | set_grace_period(net); | 257 | set_grace_period(net); |
| 258 | dprintk("lockd_up_net: per-net data created; net=%p\n", net); | 258 | dprintk("lockd_up_net: per-net data created; net=%p\n", net); |
| 259 | return 0; | 259 | return 0; |
| 260 | 260 | ||
| 261 | err_socks: | ||
| 262 | svc_rpcb_cleanup(serv, net); | ||
| 263 | err_bind: | 261 | err_bind: |
| 264 | ln->nlmsvc_users--; | 262 | ln->nlmsvc_users--; |
| 265 | return error; | 263 | return error; |
diff --git a/fs/locks.c b/fs/locks.c index cb66fb05ad4a..bb08857f90b5 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
| @@ -1619,7 +1619,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp | |||
| 1619 | smp_mb(); | 1619 | smp_mb(); |
| 1620 | error = check_conflicting_open(dentry, arg); | 1620 | error = check_conflicting_open(dentry, arg); |
| 1621 | if (error) | 1621 | if (error) |
| 1622 | locks_unlink_lock(flp); | 1622 | locks_unlink_lock(before); |
| 1623 | out: | 1623 | out: |
| 1624 | if (is_deleg) | 1624 | if (is_deleg) |
| 1625 | mutex_unlock(&inode->i_mutex); | 1625 | mutex_unlock(&inode->i_mutex); |
diff --git a/fs/namei.c b/fs/namei.c index a996bb48dfab..215e44254c53 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -34,6 +34,7 @@ | |||
| 34 | #include <linux/device_cgroup.h> | 34 | #include <linux/device_cgroup.h> |
| 35 | #include <linux/fs_struct.h> | 35 | #include <linux/fs_struct.h> |
| 36 | #include <linux/posix_acl.h> | 36 | #include <linux/posix_acl.h> |
| 37 | #include <linux/hash.h> | ||
| 37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
| 38 | 39 | ||
| 39 | #include "internal.h" | 40 | #include "internal.h" |
| @@ -643,24 +644,22 @@ static int complete_walk(struct nameidata *nd) | |||
| 643 | 644 | ||
| 644 | static __always_inline void set_root(struct nameidata *nd) | 645 | static __always_inline void set_root(struct nameidata *nd) |
| 645 | { | 646 | { |
| 646 | if (!nd->root.mnt) | 647 | get_fs_root(current->fs, &nd->root); |
| 647 | get_fs_root(current->fs, &nd->root); | ||
| 648 | } | 648 | } |
| 649 | 649 | ||
| 650 | static int link_path_walk(const char *, struct nameidata *); | 650 | static int link_path_walk(const char *, struct nameidata *); |
| 651 | 651 | ||
| 652 | static __always_inline void set_root_rcu(struct nameidata *nd) | 652 | static __always_inline unsigned set_root_rcu(struct nameidata *nd) |
| 653 | { | 653 | { |
| 654 | if (!nd->root.mnt) { | 654 | struct fs_struct *fs = current->fs; |
| 655 | struct fs_struct *fs = current->fs; | 655 | unsigned seq, res; |
| 656 | unsigned seq; | ||
| 657 | 656 | ||
| 658 | do { | 657 | do { |
| 659 | seq = read_seqcount_begin(&fs->seq); | 658 | seq = read_seqcount_begin(&fs->seq); |
| 660 | nd->root = fs->root; | 659 | nd->root = fs->root; |
| 661 | nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); | 660 | res = __read_seqcount_begin(&nd->root.dentry->d_seq); |
| 662 | } while (read_seqcount_retry(&fs->seq, seq)); | 661 | } while (read_seqcount_retry(&fs->seq, seq)); |
| 663 | } | 662 | return res; |
| 664 | } | 663 | } |
| 665 | 664 | ||
| 666 | static void path_put_conditional(struct path *path, struct nameidata *nd) | 665 | static void path_put_conditional(struct path *path, struct nameidata *nd) |
| @@ -860,7 +859,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p) | |||
| 860 | return PTR_ERR(s); | 859 | return PTR_ERR(s); |
| 861 | } | 860 | } |
| 862 | if (*s == '/') { | 861 | if (*s == '/') { |
| 863 | set_root(nd); | 862 | if (!nd->root.mnt) |
| 863 | set_root(nd); | ||
| 864 | path_put(&nd->path); | 864 | path_put(&nd->path); |
| 865 | nd->path = nd->root; | 865 | nd->path = nd->root; |
| 866 | path_get(&nd->root); | 866 | path_get(&nd->root); |
| @@ -1137,13 +1137,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, | |||
| 1137 | */ | 1137 | */ |
| 1138 | *inode = path->dentry->d_inode; | 1138 | *inode = path->dentry->d_inode; |
| 1139 | } | 1139 | } |
| 1140 | return read_seqretry(&mount_lock, nd->m_seq) && | 1140 | return !read_seqretry(&mount_lock, nd->m_seq) && |
| 1141 | !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); | 1141 | !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); |
| 1142 | } | 1142 | } |
| 1143 | 1143 | ||
| 1144 | static int follow_dotdot_rcu(struct nameidata *nd) | 1144 | static int follow_dotdot_rcu(struct nameidata *nd) |
| 1145 | { | 1145 | { |
| 1146 | set_root_rcu(nd); | 1146 | struct inode *inode = nd->inode; |
| 1147 | if (!nd->root.mnt) | ||
| 1148 | set_root_rcu(nd); | ||
| 1147 | 1149 | ||
| 1148 | while (1) { | 1150 | while (1) { |
| 1149 | if (nd->path.dentry == nd->root.dentry && | 1151 | if (nd->path.dentry == nd->root.dentry && |
| @@ -1155,6 +1157,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) | |||
| 1155 | struct dentry *parent = old->d_parent; | 1157 | struct dentry *parent = old->d_parent; |
| 1156 | unsigned seq; | 1158 | unsigned seq; |
| 1157 | 1159 | ||
| 1160 | inode = parent->d_inode; | ||
| 1158 | seq = read_seqcount_begin(&parent->d_seq); | 1161 | seq = read_seqcount_begin(&parent->d_seq); |
| 1159 | if (read_seqcount_retry(&old->d_seq, nd->seq)) | 1162 | if (read_seqcount_retry(&old->d_seq, nd->seq)) |
| 1160 | goto failed; | 1163 | goto failed; |
| @@ -1164,6 +1167,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) | |||
| 1164 | } | 1167 | } |
| 1165 | if (!follow_up_rcu(&nd->path)) | 1168 | if (!follow_up_rcu(&nd->path)) |
| 1166 | break; | 1169 | break; |
| 1170 | inode = nd->path.dentry->d_inode; | ||
| 1167 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | 1171 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); |
| 1168 | } | 1172 | } |
| 1169 | while (d_mountpoint(nd->path.dentry)) { | 1173 | while (d_mountpoint(nd->path.dentry)) { |
| @@ -1173,11 +1177,12 @@ static int follow_dotdot_rcu(struct nameidata *nd) | |||
| 1173 | break; | 1177 | break; |
| 1174 | nd->path.mnt = &mounted->mnt; | 1178 | nd->path.mnt = &mounted->mnt; |
| 1175 | nd->path.dentry = mounted->mnt.mnt_root; | 1179 | nd->path.dentry = mounted->mnt.mnt_root; |
| 1180 | inode = nd->path.dentry->d_inode; | ||
| 1176 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | 1181 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); |
| 1177 | if (!read_seqretry(&mount_lock, nd->m_seq)) | 1182 | if (read_seqretry(&mount_lock, nd->m_seq)) |
| 1178 | goto failed; | 1183 | goto failed; |
| 1179 | } | 1184 | } |
| 1180 | nd->inode = nd->path.dentry->d_inode; | 1185 | nd->inode = inode; |
| 1181 | return 0; | 1186 | return 0; |
| 1182 | 1187 | ||
| 1183 | failed: | 1188 | failed: |
| @@ -1256,7 +1261,8 @@ static void follow_mount(struct path *path) | |||
| 1256 | 1261 | ||
| 1257 | static void follow_dotdot(struct nameidata *nd) | 1262 | static void follow_dotdot(struct nameidata *nd) |
| 1258 | { | 1263 | { |
| 1259 | set_root(nd); | 1264 | if (!nd->root.mnt) |
| 1265 | set_root(nd); | ||
| 1260 | 1266 | ||
| 1261 | while(1) { | 1267 | while(1) { |
| 1262 | struct dentry *old = nd->path.dentry; | 1268 | struct dentry *old = nd->path.dentry; |
| @@ -1634,8 +1640,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) | |||
| 1634 | 1640 | ||
| 1635 | static inline unsigned int fold_hash(unsigned long hash) | 1641 | static inline unsigned int fold_hash(unsigned long hash) |
| 1636 | { | 1642 | { |
| 1637 | hash += hash >> (8*sizeof(int)); | 1643 | return hash_64(hash, 32); |
| 1638 | return hash; | ||
| 1639 | } | 1644 | } |
| 1640 | 1645 | ||
| 1641 | #else /* 32-bit case */ | 1646 | #else /* 32-bit case */ |
| @@ -1669,13 +1674,14 @@ EXPORT_SYMBOL(full_name_hash); | |||
| 1669 | 1674 | ||
| 1670 | /* | 1675 | /* |
| 1671 | * Calculate the length and hash of the path component, and | 1676 | * Calculate the length and hash of the path component, and |
| 1672 | * return the length of the component; | 1677 | * fill in the qstr. return the "len" as the result. |
| 1673 | */ | 1678 | */ |
| 1674 | static inline unsigned long hash_name(const char *name, unsigned int *hashp) | 1679 | static inline unsigned long hash_name(const char *name, struct qstr *res) |
| 1675 | { | 1680 | { |
| 1676 | unsigned long a, b, adata, bdata, mask, hash, len; | 1681 | unsigned long a, b, adata, bdata, mask, hash, len; |
| 1677 | const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; | 1682 | const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; |
| 1678 | 1683 | ||
| 1684 | res->name = name; | ||
| 1679 | hash = a = 0; | 1685 | hash = a = 0; |
| 1680 | len = -sizeof(unsigned long); | 1686 | len = -sizeof(unsigned long); |
| 1681 | do { | 1687 | do { |
| @@ -1691,9 +1697,10 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) | |||
| 1691 | mask = create_zero_mask(adata | bdata); | 1697 | mask = create_zero_mask(adata | bdata); |
| 1692 | 1698 | ||
| 1693 | hash += a & zero_bytemask(mask); | 1699 | hash += a & zero_bytemask(mask); |
| 1694 | *hashp = fold_hash(hash); | 1700 | len += find_zero(mask); |
| 1701 | res->hash_len = hashlen_create(fold_hash(hash), len); | ||
| 1695 | 1702 | ||
| 1696 | return len + find_zero(mask); | 1703 | return len; |
| 1697 | } | 1704 | } |
| 1698 | 1705 | ||
| 1699 | #else | 1706 | #else |
| @@ -1711,18 +1718,19 @@ EXPORT_SYMBOL(full_name_hash); | |||
| 1711 | * We know there's a real path component here of at least | 1718 | * We know there's a real path component here of at least |
| 1712 | * one character. | 1719 | * one character. |
| 1713 | */ | 1720 | */ |
| 1714 | static inline unsigned long hash_name(const char *name, unsigned int *hashp) | 1721 | static inline long hash_name(const char *name, struct qstr *res) |
| 1715 | { | 1722 | { |
| 1716 | unsigned long hash = init_name_hash(); | 1723 | unsigned long hash = init_name_hash(); |
| 1717 | unsigned long len = 0, c; | 1724 | unsigned long len = 0, c; |
| 1718 | 1725 | ||
| 1726 | res->name = name; | ||
| 1719 | c = (unsigned char)*name; | 1727 | c = (unsigned char)*name; |
| 1720 | do { | 1728 | do { |
| 1721 | len++; | 1729 | len++; |
| 1722 | hash = partial_name_hash(c, hash); | 1730 | hash = partial_name_hash(c, hash); |
| 1723 | c = (unsigned char)name[len]; | 1731 | c = (unsigned char)name[len]; |
| 1724 | } while (c && c != '/'); | 1732 | } while (c && c != '/'); |
| 1725 | *hashp = end_name_hash(hash); | 1733 | res->hash_len = hashlen_create(end_name_hash(hash), len); |
| 1726 | return len; | 1734 | return len; |
| 1727 | } | 1735 | } |
| 1728 | 1736 | ||
| @@ -1756,9 +1764,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) | |||
| 1756 | if (err) | 1764 | if (err) |
| 1757 | break; | 1765 | break; |
| 1758 | 1766 | ||
| 1759 | len = hash_name(name, &this.hash); | 1767 | len = hash_name(name, &this); |
| 1760 | this.name = name; | ||
| 1761 | this.len = len; | ||
| 1762 | 1768 | ||
| 1763 | type = LAST_NORM; | 1769 | type = LAST_NORM; |
| 1764 | if (name[0] == '.') switch (len) { | 1770 | if (name[0] == '.') switch (len) { |
| @@ -1852,7 +1858,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, | |||
| 1852 | if (*name=='/') { | 1858 | if (*name=='/') { |
| 1853 | if (flags & LOOKUP_RCU) { | 1859 | if (flags & LOOKUP_RCU) { |
| 1854 | rcu_read_lock(); | 1860 | rcu_read_lock(); |
| 1855 | set_root_rcu(nd); | 1861 | nd->seq = set_root_rcu(nd); |
| 1856 | } else { | 1862 | } else { |
| 1857 | set_root(nd); | 1863 | set_root(nd); |
| 1858 | path_get(&nd->root); | 1864 | path_get(&nd->root); |
| @@ -1903,7 +1909,14 @@ static int path_init(int dfd, const char *name, unsigned int flags, | |||
| 1903 | } | 1909 | } |
| 1904 | 1910 | ||
| 1905 | nd->inode = nd->path.dentry->d_inode; | 1911 | nd->inode = nd->path.dentry->d_inode; |
| 1906 | return 0; | 1912 | if (!(flags & LOOKUP_RCU)) |
| 1913 | return 0; | ||
| 1914 | if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) | ||
| 1915 | return 0; | ||
| 1916 | if (!(nd->flags & LOOKUP_ROOT)) | ||
| 1917 | nd->root.mnt = NULL; | ||
| 1918 | rcu_read_unlock(); | ||
| 1919 | return -ECHILD; | ||
| 1907 | } | 1920 | } |
| 1908 | 1921 | ||
| 1909 | static inline int lookup_last(struct nameidata *nd, struct path *path) | 1922 | static inline int lookup_last(struct nameidata *nd, struct path *path) |
diff --git a/fs/namespace.c b/fs/namespace.c index a01c7730e9af..ef42d9bee212 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
| @@ -1217,6 +1217,11 @@ static void namespace_unlock(void) | |||
| 1217 | head.first->pprev = &head.first; | 1217 | head.first->pprev = &head.first; |
| 1218 | INIT_HLIST_HEAD(&unmounted); | 1218 | INIT_HLIST_HEAD(&unmounted); |
| 1219 | 1219 | ||
| 1220 | /* undo decrements we'd done in umount_tree() */ | ||
| 1221 | hlist_for_each_entry(mnt, &head, mnt_hash) | ||
| 1222 | if (mnt->mnt_ex_mountpoint.mnt) | ||
| 1223 | mntget(mnt->mnt_ex_mountpoint.mnt); | ||
| 1224 | |||
| 1220 | up_write(&namespace_sem); | 1225 | up_write(&namespace_sem); |
| 1221 | 1226 | ||
| 1222 | synchronize_rcu(); | 1227 | synchronize_rcu(); |
| @@ -1253,6 +1258,9 @@ void umount_tree(struct mount *mnt, int how) | |||
| 1253 | hlist_add_head(&p->mnt_hash, &tmp_list); | 1258 | hlist_add_head(&p->mnt_hash, &tmp_list); |
| 1254 | } | 1259 | } |
| 1255 | 1260 | ||
| 1261 | hlist_for_each_entry(p, &tmp_list, mnt_hash) | ||
| 1262 | list_del_init(&p->mnt_child); | ||
| 1263 | |||
| 1256 | if (how) | 1264 | if (how) |
| 1257 | propagate_umount(&tmp_list); | 1265 | propagate_umount(&tmp_list); |
| 1258 | 1266 | ||
| @@ -1263,9 +1271,9 @@ void umount_tree(struct mount *mnt, int how) | |||
| 1263 | p->mnt_ns = NULL; | 1271 | p->mnt_ns = NULL; |
| 1264 | if (how < 2) | 1272 | if (how < 2) |
| 1265 | p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; | 1273 | p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; |
| 1266 | list_del_init(&p->mnt_child); | ||
| 1267 | if (mnt_has_parent(p)) { | 1274 | if (mnt_has_parent(p)) { |
| 1268 | put_mountpoint(p->mnt_mp); | 1275 | put_mountpoint(p->mnt_mp); |
| 1276 | mnt_add_count(p->mnt_parent, -1); | ||
| 1269 | /* move the reference to mountpoint into ->mnt_ex_mountpoint */ | 1277 | /* move the reference to mountpoint into ->mnt_ex_mountpoint */ |
| 1270 | p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; | 1278 | p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; |
| 1271 | p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; | 1279 | p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; |
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1c5ff6d58385..6a4f3666e273 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c | |||
| @@ -1412,24 +1412,18 @@ int nfs_fs_proc_net_init(struct net *net) | |||
| 1412 | p = proc_create("volumes", S_IFREG|S_IRUGO, | 1412 | p = proc_create("volumes", S_IFREG|S_IRUGO, |
| 1413 | nn->proc_nfsfs, &nfs_volume_list_fops); | 1413 | nn->proc_nfsfs, &nfs_volume_list_fops); |
| 1414 | if (!p) | 1414 | if (!p) |
| 1415 | goto error_2; | 1415 | goto error_1; |
| 1416 | return 0; | 1416 | return 0; |
| 1417 | 1417 | ||
| 1418 | error_2: | ||
| 1419 | remove_proc_entry("servers", nn->proc_nfsfs); | ||
| 1420 | error_1: | 1418 | error_1: |
| 1421 | remove_proc_entry("fs/nfsfs", NULL); | 1419 | remove_proc_subtree("nfsfs", net->proc_net); |
| 1422 | error_0: | 1420 | error_0: |
| 1423 | return -ENOMEM; | 1421 | return -ENOMEM; |
| 1424 | } | 1422 | } |
| 1425 | 1423 | ||
| 1426 | void nfs_fs_proc_net_exit(struct net *net) | 1424 | void nfs_fs_proc_net_exit(struct net *net) |
| 1427 | { | 1425 | { |
| 1428 | struct nfs_net *nn = net_generic(net, nfs_net_id); | 1426 | remove_proc_subtree("nfsfs", net->proc_net); |
| 1429 | |||
| 1430 | remove_proc_entry("volumes", nn->proc_nfsfs); | ||
| 1431 | remove_proc_entry("servers", nn->proc_nfsfs); | ||
| 1432 | remove_proc_entry("fs/nfsfs", NULL); | ||
| 1433 | } | 1427 | } |
| 1434 | 1428 | ||
| 1435 | /* | 1429 | /* |
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 1359c4a27393..90978075f730 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c | |||
| @@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) | |||
| 1269 | static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) | 1269 | static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) |
| 1270 | { | 1270 | { |
| 1271 | struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; | 1271 | struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; |
| 1272 | struct pnfs_commit_bucket *bucket = fl_cinfo->buckets; | 1272 | struct pnfs_commit_bucket *bucket; |
| 1273 | struct pnfs_layout_segment *freeme; | 1273 | struct pnfs_layout_segment *freeme; |
| 1274 | int i; | 1274 | int i; |
| 1275 | 1275 | ||
| 1276 | for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) { | 1276 | for (i = idx; i < fl_cinfo->nbuckets; i++) { |
| 1277 | bucket = &fl_cinfo->buckets[i]; | ||
| 1277 | if (list_empty(&bucket->committing)) | 1278 | if (list_empty(&bucket->committing)) |
| 1278 | continue; | 1279 | continue; |
| 1279 | nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); | 1280 | nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); |
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index d0fec260132a..24c6898159cc 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c | |||
| @@ -129,7 +129,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, | |||
| 129 | .rpc_argp = &args, | 129 | .rpc_argp = &args, |
| 130 | .rpc_resp = &fattr, | 130 | .rpc_resp = &fattr, |
| 131 | }; | 131 | }; |
| 132 | int status; | 132 | int status = 0; |
| 133 | |||
| 134 | if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL)) | ||
| 135 | goto out; | ||
| 133 | 136 | ||
| 134 | status = -EOPNOTSUPP; | 137 | status = -EOPNOTSUPP; |
| 135 | if (!nfs_server_capable(inode, NFS_CAP_ACLS)) | 138 | if (!nfs_server_capable(inode, NFS_CAP_ACLS)) |
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 92193eddb41d..a8b855ab4e22 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h | |||
| @@ -130,16 +130,15 @@ enum { | |||
| 130 | */ | 130 | */ |
| 131 | 131 | ||
| 132 | struct nfs4_lock_state { | 132 | struct nfs4_lock_state { |
| 133 | struct list_head ls_locks; /* Other lock stateids */ | 133 | struct list_head ls_locks; /* Other lock stateids */ |
| 134 | struct nfs4_state * ls_state; /* Pointer to open state */ | 134 | struct nfs4_state * ls_state; /* Pointer to open state */ |
| 135 | #define NFS_LOCK_INITIALIZED 0 | 135 | #define NFS_LOCK_INITIALIZED 0 |
| 136 | #define NFS_LOCK_LOST 1 | 136 | #define NFS_LOCK_LOST 1 |
| 137 | unsigned long ls_flags; | 137 | unsigned long ls_flags; |
| 138 | struct nfs_seqid_counter ls_seqid; | 138 | struct nfs_seqid_counter ls_seqid; |
| 139 | nfs4_stateid ls_stateid; | 139 | nfs4_stateid ls_stateid; |
| 140 | atomic_t ls_count; | 140 | atomic_t ls_count; |
| 141 | fl_owner_t ls_owner; | 141 | fl_owner_t ls_owner; |
| 142 | struct work_struct ls_release; | ||
| 143 | }; | 142 | }; |
| 144 | 143 | ||
| 145 | /* bits for nfs4_state->flags */ | 144 | /* bits for nfs4_state->flags */ |
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75ae8d22f067..7dd8aca31c29 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
| @@ -2560,6 +2560,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) | |||
| 2560 | struct nfs4_closedata *calldata = data; | 2560 | struct nfs4_closedata *calldata = data; |
| 2561 | struct nfs4_state *state = calldata->state; | 2561 | struct nfs4_state *state = calldata->state; |
| 2562 | struct nfs_server *server = NFS_SERVER(calldata->inode); | 2562 | struct nfs_server *server = NFS_SERVER(calldata->inode); |
| 2563 | nfs4_stateid *res_stateid = NULL; | ||
| 2563 | 2564 | ||
| 2564 | dprintk("%s: begin!\n", __func__); | 2565 | dprintk("%s: begin!\n", __func__); |
| 2565 | if (!nfs4_sequence_done(task, &calldata->res.seq_res)) | 2566 | if (!nfs4_sequence_done(task, &calldata->res.seq_res)) |
| @@ -2570,12 +2571,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) | |||
| 2570 | */ | 2571 | */ |
| 2571 | switch (task->tk_status) { | 2572 | switch (task->tk_status) { |
| 2572 | case 0: | 2573 | case 0: |
| 2573 | if (calldata->roc) | 2574 | res_stateid = &calldata->res.stateid; |
| 2575 | if (calldata->arg.fmode == 0 && calldata->roc) | ||
| 2574 | pnfs_roc_set_barrier(state->inode, | 2576 | pnfs_roc_set_barrier(state->inode, |
| 2575 | calldata->roc_barrier); | 2577 | calldata->roc_barrier); |
| 2576 | nfs_clear_open_stateid(state, &calldata->res.stateid, 0); | ||
| 2577 | renew_lease(server, calldata->timestamp); | 2578 | renew_lease(server, calldata->timestamp); |
| 2578 | goto out_release; | 2579 | break; |
| 2579 | case -NFS4ERR_ADMIN_REVOKED: | 2580 | case -NFS4ERR_ADMIN_REVOKED: |
| 2580 | case -NFS4ERR_STALE_STATEID: | 2581 | case -NFS4ERR_STALE_STATEID: |
| 2581 | case -NFS4ERR_OLD_STATEID: | 2582 | case -NFS4ERR_OLD_STATEID: |
| @@ -2589,7 +2590,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) | |||
| 2589 | goto out_release; | 2590 | goto out_release; |
| 2590 | } | 2591 | } |
| 2591 | } | 2592 | } |
| 2592 | nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); | 2593 | nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); |
| 2593 | out_release: | 2594 | out_release: |
| 2594 | nfs_release_seqid(calldata->arg.seqid); | 2595 | nfs_release_seqid(calldata->arg.seqid); |
| 2595 | nfs_refresh_inode(calldata->inode, calldata->res.fattr); | 2596 | nfs_refresh_inode(calldata->inode, calldata->res.fattr); |
| @@ -2601,6 +2602,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) | |||
| 2601 | struct nfs4_closedata *calldata = data; | 2602 | struct nfs4_closedata *calldata = data; |
| 2602 | struct nfs4_state *state = calldata->state; | 2603 | struct nfs4_state *state = calldata->state; |
| 2603 | struct inode *inode = calldata->inode; | 2604 | struct inode *inode = calldata->inode; |
| 2605 | bool is_rdonly, is_wronly, is_rdwr; | ||
| 2604 | int call_close = 0; | 2606 | int call_close = 0; |
| 2605 | 2607 | ||
| 2606 | dprintk("%s: begin!\n", __func__); | 2608 | dprintk("%s: begin!\n", __func__); |
| @@ -2608,18 +2610,24 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) | |||
| 2608 | goto out_wait; | 2610 | goto out_wait; |
| 2609 | 2611 | ||
| 2610 | task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; | 2612 | task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; |
| 2611 | calldata->arg.fmode = FMODE_READ|FMODE_WRITE; | ||
| 2612 | spin_lock(&state->owner->so_lock); | 2613 | spin_lock(&state->owner->so_lock); |
| 2614 | is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); | ||
| 2615 | is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); | ||
| 2616 | is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); | ||
| 2617 | /* Calculate the current open share mode */ | ||
| 2618 | calldata->arg.fmode = 0; | ||
| 2619 | if (is_rdonly || is_rdwr) | ||
| 2620 | calldata->arg.fmode |= FMODE_READ; | ||
| 2621 | if (is_wronly || is_rdwr) | ||
| 2622 | calldata->arg.fmode |= FMODE_WRITE; | ||
| 2613 | /* Calculate the change in open mode */ | 2623 | /* Calculate the change in open mode */ |
| 2614 | if (state->n_rdwr == 0) { | 2624 | if (state->n_rdwr == 0) { |
| 2615 | if (state->n_rdonly == 0) { | 2625 | if (state->n_rdonly == 0) { |
| 2616 | call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); | 2626 | call_close |= is_rdonly || is_rdwr; |
| 2617 | call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); | ||
| 2618 | calldata->arg.fmode &= ~FMODE_READ; | 2627 | calldata->arg.fmode &= ~FMODE_READ; |
| 2619 | } | 2628 | } |
| 2620 | if (state->n_wronly == 0) { | 2629 | if (state->n_wronly == 0) { |
| 2621 | call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); | 2630 | call_close |= is_wronly || is_rdwr; |
| 2622 | call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); | ||
| 2623 | calldata->arg.fmode &= ~FMODE_WRITE; | 2631 | calldata->arg.fmode &= ~FMODE_WRITE; |
| 2624 | } | 2632 | } |
| 2625 | } | 2633 | } |
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a043f618cd5a..22fe35104c0c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c | |||
| @@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) | |||
| 799 | return NULL; | 799 | return NULL; |
| 800 | } | 800 | } |
| 801 | 801 | ||
| 802 | static void | ||
| 803 | free_lock_state_work(struct work_struct *work) | ||
| 804 | { | ||
| 805 | struct nfs4_lock_state *lsp = container_of(work, | ||
| 806 | struct nfs4_lock_state, ls_release); | ||
| 807 | struct nfs4_state *state = lsp->ls_state; | ||
| 808 | struct nfs_server *server = state->owner->so_server; | ||
| 809 | struct nfs_client *clp = server->nfs_client; | ||
| 810 | |||
| 811 | clp->cl_mvops->free_lock_state(server, lsp); | ||
| 812 | } | ||
| 813 | |||
| 814 | /* | 802 | /* |
| 815 | * Return a compatible lock_state. If no initialized lock_state structure | 803 | * Return a compatible lock_state. If no initialized lock_state structure |
| 816 | * exists, return an uninitialized one. | 804 | * exists, return an uninitialized one. |
| @@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f | |||
| 832 | if (lsp->ls_seqid.owner_id < 0) | 820 | if (lsp->ls_seqid.owner_id < 0) |
| 833 | goto out_free; | 821 | goto out_free; |
| 834 | INIT_LIST_HEAD(&lsp->ls_locks); | 822 | INIT_LIST_HEAD(&lsp->ls_locks); |
| 835 | INIT_WORK(&lsp->ls_release, free_lock_state_work); | ||
| 836 | return lsp; | 823 | return lsp; |
| 837 | out_free: | 824 | out_free: |
| 838 | kfree(lsp); | 825 | kfree(lsp); |
| @@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) | |||
| 896 | if (list_empty(&state->lock_states)) | 883 | if (list_empty(&state->lock_states)) |
| 897 | clear_bit(LK_STATE_IN_USE, &state->flags); | 884 | clear_bit(LK_STATE_IN_USE, &state->flags); |
| 898 | spin_unlock(&state->state_lock); | 885 | spin_unlock(&state->state_lock); |
| 899 | if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) | 886 | server = state->owner->so_server; |
| 900 | queue_work(nfsiod_workqueue, &lsp->ls_release); | 887 | if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { |
| 901 | else { | 888 | struct nfs_client *clp = server->nfs_client; |
| 902 | server = state->owner->so_server; | 889 | |
| 890 | clp->cl_mvops->free_lock_state(server, lsp); | ||
| 891 | } else | ||
| 903 | nfs4_free_lock_state(server, lsp); | 892 | nfs4_free_lock_state(server, lsp); |
| 904 | } | ||
| 905 | } | 893 | } |
| 906 | 894 | ||
| 907 | static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) | 895 | static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) |
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ba491926df5f..be7cbce6e4c7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c | |||
| @@ -116,7 +116,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) | |||
| 116 | if (atomic_read(&c->io_count) == 0) | 116 | if (atomic_read(&c->io_count) == 0) |
| 117 | break; | 117 | break; |
| 118 | ret = nfs_wait_bit_killable(&q.key); | 118 | ret = nfs_wait_bit_killable(&q.key); |
| 119 | } while (atomic_read(&c->io_count) != 0); | 119 | } while (atomic_read(&c->io_count) != 0 && !ret); |
| 120 | finish_wait(wq, &q.wait); | 120 | finish_wait(wq, &q.wait); |
| 121 | return ret; | 121 | return ret; |
| 122 | } | 122 | } |
| @@ -139,26 +139,49 @@ nfs_iocounter_wait(struct nfs_io_counter *c) | |||
| 139 | /* | 139 | /* |
| 140 | * nfs_page_group_lock - lock the head of the page group | 140 | * nfs_page_group_lock - lock the head of the page group |
| 141 | * @req - request in group that is to be locked | 141 | * @req - request in group that is to be locked |
| 142 | * @nonblock - if true don't block waiting for lock | ||
| 142 | * | 143 | * |
| 143 | * this lock must be held if modifying the page group list | 144 | * this lock must be held if modifying the page group list |
| 144 | * | 145 | * |
| 145 | * returns result from wait_on_bit_lock: 0 on success, < 0 on error | 146 | * return 0 on success, < 0 on error: -EDELAY if nonblocking or the |
| 147 | * result from wait_on_bit_lock | ||
| 148 | * | ||
| 149 | * NOTE: calling with nonblock=false should always have set the | ||
| 150 | * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock | ||
| 151 | * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. | ||
| 146 | */ | 152 | */ |
| 147 | int | 153 | int |
| 148 | nfs_page_group_lock(struct nfs_page *req, bool wait) | 154 | nfs_page_group_lock(struct nfs_page *req, bool nonblock) |
| 149 | { | 155 | { |
| 150 | struct nfs_page *head = req->wb_head; | 156 | struct nfs_page *head = req->wb_head; |
| 151 | int ret; | ||
| 152 | 157 | ||
| 153 | WARN_ON_ONCE(head != head->wb_head); | 158 | WARN_ON_ONCE(head != head->wb_head); |
| 154 | 159 | ||
| 155 | do { | 160 | if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) |
| 156 | ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, | 161 | return 0; |
| 157 | TASK_UNINTERRUPTIBLE); | ||
| 158 | } while (wait && ret != 0); | ||
| 159 | 162 | ||
| 160 | WARN_ON_ONCE(ret > 0); | 163 | if (!nonblock) |
| 161 | return ret; | 164 | return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, |
| 165 | TASK_UNINTERRUPTIBLE); | ||
| 166 | |||
| 167 | return -EAGAIN; | ||
| 168 | } | ||
| 169 | |||
| 170 | /* | ||
| 171 | * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it | ||
| 172 | * @req - a request in the group | ||
| 173 | * | ||
| 174 | * This is a blocking call to wait for the group lock to be cleared. | ||
| 175 | */ | ||
| 176 | void | ||
| 177 | nfs_page_group_lock_wait(struct nfs_page *req) | ||
| 178 | { | ||
| 179 | struct nfs_page *head = req->wb_head; | ||
| 180 | |||
| 181 | WARN_ON_ONCE(head != head->wb_head); | ||
| 182 | |||
| 183 | wait_on_bit(&head->wb_flags, PG_HEADLOCK, | ||
| 184 | TASK_UNINTERRUPTIBLE); | ||
| 162 | } | 185 | } |
| 163 | 186 | ||
| 164 | /* | 187 | /* |
| @@ -219,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) | |||
| 219 | { | 242 | { |
| 220 | bool ret; | 243 | bool ret; |
| 221 | 244 | ||
| 222 | nfs_page_group_lock(req, true); | 245 | nfs_page_group_lock(req, false); |
| 223 | ret = nfs_page_group_sync_on_bit_locked(req, bit); | 246 | ret = nfs_page_group_sync_on_bit_locked(req, bit); |
| 224 | nfs_page_group_unlock(req); | 247 | nfs_page_group_unlock(req); |
| 225 | 248 | ||
| @@ -701,10 +724,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, | |||
| 701 | struct nfs_pgio_header *hdr) | 724 | struct nfs_pgio_header *hdr) |
| 702 | { | 725 | { |
| 703 | struct nfs_page *req; | 726 | struct nfs_page *req; |
| 704 | struct page **pages; | 727 | struct page **pages, |
| 728 | *last_page; | ||
| 705 | struct list_head *head = &desc->pg_list; | 729 | struct list_head *head = &desc->pg_list; |
| 706 | struct nfs_commit_info cinfo; | 730 | struct nfs_commit_info cinfo; |
| 707 | unsigned int pagecount; | 731 | unsigned int pagecount, pageused; |
| 708 | 732 | ||
| 709 | pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); | 733 | pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); |
| 710 | if (!nfs_pgarray_set(&hdr->page_array, pagecount)) | 734 | if (!nfs_pgarray_set(&hdr->page_array, pagecount)) |
| @@ -712,12 +736,23 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, | |||
| 712 | 736 | ||
| 713 | nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); | 737 | nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); |
| 714 | pages = hdr->page_array.pagevec; | 738 | pages = hdr->page_array.pagevec; |
| 739 | last_page = NULL; | ||
| 740 | pageused = 0; | ||
| 715 | while (!list_empty(head)) { | 741 | while (!list_empty(head)) { |
| 716 | req = nfs_list_entry(head->next); | 742 | req = nfs_list_entry(head->next); |
| 717 | nfs_list_remove_request(req); | 743 | nfs_list_remove_request(req); |
| 718 | nfs_list_add_request(req, &hdr->pages); | 744 | nfs_list_add_request(req, &hdr->pages); |
| 719 | *pages++ = req->wb_page; | 745 | |
| 746 | if (WARN_ON_ONCE(pageused >= pagecount)) | ||
| 747 | return nfs_pgio_error(desc, hdr); | ||
| 748 | |||
| 749 | if (!last_page || last_page != req->wb_page) { | ||
| 750 | *pages++ = last_page = req->wb_page; | ||
| 751 | pageused++; | ||
| 752 | } | ||
| 720 | } | 753 | } |
| 754 | if (WARN_ON_ONCE(pageused != pagecount)) | ||
| 755 | return nfs_pgio_error(desc, hdr); | ||
| 721 | 756 | ||
| 722 | if ((desc->pg_ioflags & FLUSH_COND_STABLE) && | 757 | if ((desc->pg_ioflags & FLUSH_COND_STABLE) && |
| 723 | (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) | 758 | (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) |
| @@ -788,6 +823,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, | |||
| 788 | return false; | 823 | return false; |
| 789 | if (req_offset(req) != req_offset(prev) + prev->wb_bytes) | 824 | if (req_offset(req) != req_offset(prev) + prev->wb_bytes) |
| 790 | return false; | 825 | return false; |
| 826 | if (req->wb_page == prev->wb_page) { | ||
| 827 | if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) | ||
| 828 | return false; | ||
| 829 | } else { | ||
| 830 | if (req->wb_pgbase != 0 || | ||
| 831 | prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) | ||
| 832 | return false; | ||
| 833 | } | ||
| 791 | } | 834 | } |
| 792 | size = pgio->pg_ops->pg_test(pgio, prev, req); | 835 | size = pgio->pg_ops->pg_test(pgio, prev, req); |
| 793 | WARN_ON_ONCE(size > req->wb_bytes); | 836 | WARN_ON_ONCE(size > req->wb_bytes); |
| @@ -858,13 +901,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, | |||
| 858 | struct nfs_page *subreq; | 901 | struct nfs_page *subreq; |
| 859 | unsigned int bytes_left = 0; | 902 | unsigned int bytes_left = 0; |
| 860 | unsigned int offset, pgbase; | 903 | unsigned int offset, pgbase; |
| 861 | int ret; | ||
| 862 | 904 | ||
| 863 | ret = nfs_page_group_lock(req, false); | 905 | nfs_page_group_lock(req, false); |
| 864 | if (ret < 0) { | ||
| 865 | desc->pg_error = ret; | ||
| 866 | return 0; | ||
| 867 | } | ||
| 868 | 906 | ||
| 869 | subreq = req; | 907 | subreq = req; |
| 870 | bytes_left = subreq->wb_bytes; | 908 | bytes_left = subreq->wb_bytes; |
| @@ -886,11 +924,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, | |||
| 886 | if (desc->pg_recoalesce) | 924 | if (desc->pg_recoalesce) |
| 887 | return 0; | 925 | return 0; |
| 888 | /* retry add_request for this subreq */ | 926 | /* retry add_request for this subreq */ |
| 889 | ret = nfs_page_group_lock(req, false); | 927 | nfs_page_group_lock(req, false); |
| 890 | if (ret < 0) { | ||
| 891 | desc->pg_error = ret; | ||
| 892 | return 0; | ||
| 893 | } | ||
| 894 | continue; | 928 | continue; |
| 895 | } | 929 | } |
| 896 | 930 | ||
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e3b5cf28bdc5..175d5d073ccf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
| @@ -241,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) | |||
| 241 | unsigned int pos = 0; | 241 | unsigned int pos = 0; |
| 242 | unsigned int len = nfs_page_length(req->wb_page); | 242 | unsigned int len = nfs_page_length(req->wb_page); |
| 243 | 243 | ||
| 244 | nfs_page_group_lock(req, true); | 244 | nfs_page_group_lock(req, false); |
| 245 | 245 | ||
| 246 | do { | 246 | do { |
| 247 | tmp = nfs_page_group_search_locked(req->wb_head, pos); | 247 | tmp = nfs_page_group_search_locked(req->wb_head, pos); |
| @@ -478,10 +478,23 @@ try_again: | |||
| 478 | return NULL; | 478 | return NULL; |
| 479 | } | 479 | } |
| 480 | 480 | ||
| 481 | /* lock each request in the page group */ | 481 | /* holding inode lock, so always make a non-blocking call to try the |
| 482 | ret = nfs_page_group_lock(head, false); | 482 | * page group lock */ |
| 483 | if (ret < 0) | 483 | ret = nfs_page_group_lock(head, true); |
| 484 | if (ret < 0) { | ||
| 485 | spin_unlock(&inode->i_lock); | ||
| 486 | |||
| 487 | if (!nonblock && ret == -EAGAIN) { | ||
| 488 | nfs_page_group_lock_wait(head); | ||
| 489 | nfs_release_request(head); | ||
| 490 | goto try_again; | ||
| 491 | } | ||
| 492 | |||
| 493 | nfs_release_request(head); | ||
| 484 | return ERR_PTR(ret); | 494 | return ERR_PTR(ret); |
| 495 | } | ||
| 496 | |||
| 497 | /* lock each request in the page group */ | ||
| 485 | subreq = head; | 498 | subreq = head; |
| 486 | do { | 499 | do { |
| 487 | /* | 500 | /* |
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f9821ce6658a..e94457c33ad6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c | |||
| @@ -2657,6 +2657,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, | |||
| 2657 | struct xdr_stream *xdr = cd->xdr; | 2657 | struct xdr_stream *xdr = cd->xdr; |
| 2658 | int start_offset = xdr->buf->len; | 2658 | int start_offset = xdr->buf->len; |
| 2659 | int cookie_offset; | 2659 | int cookie_offset; |
| 2660 | u32 name_and_cookie; | ||
| 2660 | int entry_bytes; | 2661 | int entry_bytes; |
| 2661 | __be32 nfserr = nfserr_toosmall; | 2662 | __be32 nfserr = nfserr_toosmall; |
| 2662 | __be64 wire_offset; | 2663 | __be64 wire_offset; |
| @@ -2718,7 +2719,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, | |||
| 2718 | cd->rd_maxcount -= entry_bytes; | 2719 | cd->rd_maxcount -= entry_bytes; |
| 2719 | if (!cd->rd_dircount) | 2720 | if (!cd->rd_dircount) |
| 2720 | goto fail; | 2721 | goto fail; |
| 2721 | cd->rd_dircount--; | 2722 | /* |
| 2723 | * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so | ||
| 2724 | * let's always let through the first entry, at least: | ||
| 2725 | */ | ||
| 2726 | name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; | ||
| 2727 | if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) | ||
| 2728 | goto fail; | ||
| 2729 | cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); | ||
| 2722 | cd->cookie_offset = cookie_offset; | 2730 | cd->cookie_offset = cookie_offset; |
| 2723 | skip_entry: | 2731 | skip_entry: |
| 2724 | cd->common.err = nfs_ok; | 2732 | cd->common.err = nfs_ok; |
| @@ -3321,6 +3329,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 | |||
| 3321 | } | 3329 | } |
| 3322 | maxcount = min_t(int, maxcount-16, bytes_left); | 3330 | maxcount = min_t(int, maxcount-16, bytes_left); |
| 3323 | 3331 | ||
| 3332 | /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ | ||
| 3333 | if (!readdir->rd_dircount) | ||
| 3334 | readdir->rd_dircount = INT_MAX; | ||
| 3335 | |||
| 3324 | readdir->xdr = xdr; | 3336 | readdir->xdr = xdr; |
| 3325 | readdir->rd_maxcount = maxcount; | 3337 | readdir->rd_maxcount = maxcount; |
| 3326 | readdir->common.err = 0; | 3338 | readdir->common.err = 0; |
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 238a5930cb3c..9d7e2b9659cb 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c | |||
| @@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) | |||
| 42 | { | 42 | { |
| 43 | struct { | 43 | struct { |
| 44 | struct file_handle handle; | 44 | struct file_handle handle; |
| 45 | u8 pad[64]; | 45 | u8 pad[MAX_HANDLE_SZ]; |
| 46 | } f; | 46 | } f; |
| 47 | int size, ret, i; | 47 | int size, ret, i; |
| 48 | 48 | ||
| @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) | |||
| 50 | size = f.handle.handle_bytes >> 2; | 50 | size = f.handle.handle_bytes >> 2; |
| 51 | 51 | ||
| 52 | ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); | 52 | ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); |
| 53 | if ((ret == 255) || (ret == -ENOSPC)) { | 53 | if ((ret == FILEID_INVALID) || (ret < 0)) { |
| 54 | WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); | 54 | WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); |
| 55 | return 0; | 55 | return 0; |
| 56 | } | 56 | } |
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..62e8ec619b4c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c | |||
| @@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work) | |||
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | out: | 162 | out: |
| 163 | spin_unlock(&qs->qs_lock); | 163 | if (fence) { |
| 164 | if (fence) | 164 | spin_unlock(&qs->qs_lock); |
| 165 | o2quo_fence_self(); | 165 | o2quo_fence_self(); |
| 166 | } else { | ||
| 167 | mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " | ||
| 168 | "connected: %d, lowest: %d (%sreachable)\n", | ||
| 169 | qs->qs_heartbeating, qs->qs_connected, lowest_hb, | ||
| 170 | lowest_reachable ? "" : "un"); | ||
| 171 | spin_unlock(&qs->qs_lock); | ||
| 172 | |||
| 173 | } | ||
| 174 | |||
| 166 | } | 175 | } |
| 167 | 176 | ||
| 168 | static void o2quo_set_hold(struct o2quo_state *qs, u8 node) | 177 | static void o2quo_set_hold(struct o2quo_state *qs, u8 node) |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 681691bc233a..ea34952f9496 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
| @@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock) | |||
| 1480 | return ret; | 1480 | return ret; |
| 1481 | } | 1481 | } |
| 1482 | 1482 | ||
| 1483 | static int o2net_set_usertimeout(struct socket *sock) | ||
| 1484 | { | ||
| 1485 | int user_timeout = O2NET_TCP_USER_TIMEOUT; | ||
| 1486 | |||
| 1487 | return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, | ||
| 1488 | (char *)&user_timeout, sizeof(user_timeout)); | ||
| 1489 | } | ||
| 1490 | |||
| 1483 | static void o2net_initialize_handshake(void) | 1491 | static void o2net_initialize_handshake(void) |
| 1484 | { | 1492 | { |
| 1485 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( | 1493 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( |
| @@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data) | |||
| 1536 | #endif | 1544 | #endif |
| 1537 | 1545 | ||
| 1538 | printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " | 1546 | printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " |
| 1539 | "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), | 1547 | "idle for %lu.%lu secs.\n", |
| 1540 | msecs / 1000, msecs % 1000); | 1548 | SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); |
| 1541 | 1549 | ||
| 1542 | /* | 1550 | /* idle timerout happen, don't shutdown the connection, but |
| 1543 | * Initialize the nn_timeout so that the next connection attempt | 1551 | * make fence decision. Maybe the connection can recover before |
| 1544 | * will continue in o2net_start_connect. | 1552 | * the decision is made. |
| 1545 | */ | 1553 | */ |
| 1546 | atomic_set(&nn->nn_timeout, 1); | 1554 | atomic_set(&nn->nn_timeout, 1); |
| 1555 | o2quo_conn_err(o2net_num_from_nn(nn)); | ||
| 1556 | queue_delayed_work(o2net_wq, &nn->nn_still_up, | ||
| 1557 | msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); | ||
| 1558 | |||
| 1559 | o2net_sc_reset_idle_timer(sc); | ||
| 1547 | 1560 | ||
| 1548 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | ||
| 1549 | } | 1561 | } |
| 1550 | 1562 | ||
| 1551 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) | 1563 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) |
| @@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) | |||
| 1560 | 1572 | ||
| 1561 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) | 1573 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) |
| 1562 | { | 1574 | { |
| 1575 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
| 1576 | |||
| 1577 | /* clear fence decision since the connection recover from timeout*/ | ||
| 1578 | if (atomic_read(&nn->nn_timeout)) { | ||
| 1579 | o2quo_conn_up(o2net_num_from_nn(nn)); | ||
| 1580 | cancel_delayed_work(&nn->nn_still_up); | ||
| 1581 | atomic_set(&nn->nn_timeout, 0); | ||
| 1582 | } | ||
| 1583 | |||
| 1563 | /* Only push out an existing timer */ | 1584 | /* Only push out an existing timer */ |
| 1564 | if (timer_pending(&sc->sc_idle_timeout)) | 1585 | if (timer_pending(&sc->sc_idle_timeout)) |
| 1565 | o2net_sc_reset_idle_timer(sc); | 1586 | o2net_sc_reset_idle_timer(sc); |
| @@ -1650,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work) | |||
| 1650 | goto out; | 1671 | goto out; |
| 1651 | } | 1672 | } |
| 1652 | 1673 | ||
| 1674 | ret = o2net_set_usertimeout(sock); | ||
| 1675 | if (ret) { | ||
| 1676 | mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); | ||
| 1677 | goto out; | ||
| 1678 | } | ||
| 1679 | |||
| 1653 | o2net_register_callbacks(sc->sc_sock->sk, sc); | 1680 | o2net_register_callbacks(sc->sc_sock->sk, sc); |
| 1654 | 1681 | ||
| 1655 | spin_lock(&nn->nn_lock); | 1682 | spin_lock(&nn->nn_lock); |
| @@ -1831,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more) | |||
| 1831 | goto out; | 1858 | goto out; |
| 1832 | } | 1859 | } |
| 1833 | 1860 | ||
| 1861 | ret = o2net_set_usertimeout(new_sock); | ||
| 1862 | if (ret) { | ||
| 1863 | mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); | ||
| 1864 | goto out; | ||
| 1865 | } | ||
| 1866 | |||
| 1834 | slen = sizeof(sin); | 1867 | slen = sizeof(sin); |
| 1835 | ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, | 1868 | ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, |
| 1836 | &slen, 1); | 1869 | &slen, 1); |
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..c571e849fda4 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h | |||
| @@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, | |||
| 63 | #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 | 63 | #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 |
| 64 | #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 | 64 | #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 |
| 65 | 65 | ||
| 66 | #define O2NET_TCP_USER_TIMEOUT 0x7fffffff | ||
| 66 | 67 | ||
| 67 | /* TODO: figure this out.... */ | 68 | /* TODO: figure this out.... */ |
| 68 | static inline int o2net_link_down(int err, struct socket *sock) | 69 | static inline int o2net_link_down(int err, struct socket *sock) |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 6f66b3751ace..53e6c40ed4c6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
| @@ -35,9 +35,8 @@ | |||
| 35 | copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) | 35 | copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) |
| 36 | 36 | ||
| 37 | /* | 37 | /* |
| 38 | * This call is void because we are already reporting an error that may | 38 | * This is just a best-effort to tell userspace that this request |
| 39 | * be -EFAULT. The error will be returned from the ioctl(2) call. It's | 39 | * caused the error. |
| 40 | * just a best-effort to tell userspace that this request caused the error. | ||
| 41 | */ | 40 | */ |
| 42 | static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, | 41 | static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, |
| 43 | struct ocfs2_info_request __user *req) | 42 | struct ocfs2_info_request __user *req) |
| @@ -146,136 +145,105 @@ bail: | |||
| 146 | static int ocfs2_info_handle_blocksize(struct inode *inode, | 145 | static int ocfs2_info_handle_blocksize(struct inode *inode, |
| 147 | struct ocfs2_info_request __user *req) | 146 | struct ocfs2_info_request __user *req) |
| 148 | { | 147 | { |
| 149 | int status = -EFAULT; | ||
| 150 | struct ocfs2_info_blocksize oib; | 148 | struct ocfs2_info_blocksize oib; |
| 151 | 149 | ||
| 152 | if (o2info_from_user(oib, req)) | 150 | if (o2info_from_user(oib, req)) |
| 153 | goto bail; | 151 | return -EFAULT; |
| 154 | 152 | ||
| 155 | oib.ib_blocksize = inode->i_sb->s_blocksize; | 153 | oib.ib_blocksize = inode->i_sb->s_blocksize; |
| 156 | 154 | ||
| 157 | o2info_set_request_filled(&oib.ib_req); | 155 | o2info_set_request_filled(&oib.ib_req); |
| 158 | 156 | ||
| 159 | if (o2info_to_user(oib, req)) | 157 | if (o2info_to_user(oib, req)) |
| 160 | goto bail; | 158 | return -EFAULT; |
| 161 | |||
| 162 | status = 0; | ||
| 163 | bail: | ||
| 164 | if (status) | ||
| 165 | o2info_set_request_error(&oib.ib_req, req); | ||
| 166 | 159 | ||
| 167 | return status; | 160 | return 0; |
| 168 | } | 161 | } |
| 169 | 162 | ||
| 170 | static int ocfs2_info_handle_clustersize(struct inode *inode, | 163 | static int ocfs2_info_handle_clustersize(struct inode *inode, |
| 171 | struct ocfs2_info_request __user *req) | 164 | struct ocfs2_info_request __user *req) |
| 172 | { | 165 | { |
| 173 | int status = -EFAULT; | ||
| 174 | struct ocfs2_info_clustersize oic; | 166 | struct ocfs2_info_clustersize oic; |
| 175 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 167 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 176 | 168 | ||
| 177 | if (o2info_from_user(oic, req)) | 169 | if (o2info_from_user(oic, req)) |
| 178 | goto bail; | 170 | return -EFAULT; |
| 179 | 171 | ||
| 180 | oic.ic_clustersize = osb->s_clustersize; | 172 | oic.ic_clustersize = osb->s_clustersize; |
| 181 | 173 | ||
| 182 | o2info_set_request_filled(&oic.ic_req); | 174 | o2info_set_request_filled(&oic.ic_req); |
| 183 | 175 | ||
| 184 | if (o2info_to_user(oic, req)) | 176 | if (o2info_to_user(oic, req)) |
| 185 | goto bail; | 177 | return -EFAULT; |
| 186 | |||
| 187 | status = 0; | ||
| 188 | bail: | ||
| 189 | if (status) | ||
| 190 | o2info_set_request_error(&oic.ic_req, req); | ||
| 191 | 178 | ||
| 192 | return status; | 179 | return 0; |
| 193 | } | 180 | } |
| 194 | 181 | ||
| 195 | static int ocfs2_info_handle_maxslots(struct inode *inode, | 182 | static int ocfs2_info_handle_maxslots(struct inode *inode, |
| 196 | struct ocfs2_info_request __user *req) | 183 | struct ocfs2_info_request __user *req) |
| 197 | { | 184 | { |
| 198 | int status = -EFAULT; | ||
| 199 | struct ocfs2_info_maxslots oim; | 185 | struct ocfs2_info_maxslots oim; |
| 200 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 186 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 201 | 187 | ||
| 202 | if (o2info_from_user(oim, req)) | 188 | if (o2info_from_user(oim, req)) |
| 203 | goto bail; | 189 | return -EFAULT; |
| 204 | 190 | ||
| 205 | oim.im_max_slots = osb->max_slots; | 191 | oim.im_max_slots = osb->max_slots; |
| 206 | 192 | ||
| 207 | o2info_set_request_filled(&oim.im_req); | 193 | o2info_set_request_filled(&oim.im_req); |
| 208 | 194 | ||
| 209 | if (o2info_to_user(oim, req)) | 195 | if (o2info_to_user(oim, req)) |
| 210 | goto bail; | 196 | return -EFAULT; |
| 211 | 197 | ||
| 212 | status = 0; | 198 | return 0; |
| 213 | bail: | ||
| 214 | if (status) | ||
| 215 | o2info_set_request_error(&oim.im_req, req); | ||
| 216 | |||
| 217 | return status; | ||
| 218 | } | 199 | } |
| 219 | 200 | ||
| 220 | static int ocfs2_info_handle_label(struct inode *inode, | 201 | static int ocfs2_info_handle_label(struct inode *inode, |
| 221 | struct ocfs2_info_request __user *req) | 202 | struct ocfs2_info_request __user *req) |
| 222 | { | 203 | { |
| 223 | int status = -EFAULT; | ||
| 224 | struct ocfs2_info_label oil; | 204 | struct ocfs2_info_label oil; |
| 225 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 205 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 226 | 206 | ||
| 227 | if (o2info_from_user(oil, req)) | 207 | if (o2info_from_user(oil, req)) |
| 228 | goto bail; | 208 | return -EFAULT; |
| 229 | 209 | ||
| 230 | memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); | 210 | memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); |
| 231 | 211 | ||
| 232 | o2info_set_request_filled(&oil.il_req); | 212 | o2info_set_request_filled(&oil.il_req); |
| 233 | 213 | ||
| 234 | if (o2info_to_user(oil, req)) | 214 | if (o2info_to_user(oil, req)) |
| 235 | goto bail; | 215 | return -EFAULT; |
| 236 | 216 | ||
| 237 | status = 0; | 217 | return 0; |
| 238 | bail: | ||
| 239 | if (status) | ||
| 240 | o2info_set_request_error(&oil.il_req, req); | ||
| 241 | |||
| 242 | return status; | ||
| 243 | } | 218 | } |
| 244 | 219 | ||
| 245 | static int ocfs2_info_handle_uuid(struct inode *inode, | 220 | static int ocfs2_info_handle_uuid(struct inode *inode, |
| 246 | struct ocfs2_info_request __user *req) | 221 | struct ocfs2_info_request __user *req) |
| 247 | { | 222 | { |
| 248 | int status = -EFAULT; | ||
| 249 | struct ocfs2_info_uuid oiu; | 223 | struct ocfs2_info_uuid oiu; |
| 250 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 224 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 251 | 225 | ||
| 252 | if (o2info_from_user(oiu, req)) | 226 | if (o2info_from_user(oiu, req)) |
| 253 | goto bail; | 227 | return -EFAULT; |
| 254 | 228 | ||
| 255 | memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); | 229 | memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); |
| 256 | 230 | ||
| 257 | o2info_set_request_filled(&oiu.iu_req); | 231 | o2info_set_request_filled(&oiu.iu_req); |
| 258 | 232 | ||
| 259 | if (o2info_to_user(oiu, req)) | 233 | if (o2info_to_user(oiu, req)) |
| 260 | goto bail; | 234 | return -EFAULT; |
| 261 | |||
| 262 | status = 0; | ||
| 263 | bail: | ||
| 264 | if (status) | ||
| 265 | o2info_set_request_error(&oiu.iu_req, req); | ||
| 266 | 235 | ||
| 267 | return status; | 236 | return 0; |
| 268 | } | 237 | } |
| 269 | 238 | ||
| 270 | static int ocfs2_info_handle_fs_features(struct inode *inode, | 239 | static int ocfs2_info_handle_fs_features(struct inode *inode, |
| 271 | struct ocfs2_info_request __user *req) | 240 | struct ocfs2_info_request __user *req) |
| 272 | { | 241 | { |
| 273 | int status = -EFAULT; | ||
| 274 | struct ocfs2_info_fs_features oif; | 242 | struct ocfs2_info_fs_features oif; |
| 275 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 243 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 276 | 244 | ||
| 277 | if (o2info_from_user(oif, req)) | 245 | if (o2info_from_user(oif, req)) |
| 278 | goto bail; | 246 | return -EFAULT; |
| 279 | 247 | ||
| 280 | oif.if_compat_features = osb->s_feature_compat; | 248 | oif.if_compat_features = osb->s_feature_compat; |
| 281 | oif.if_incompat_features = osb->s_feature_incompat; | 249 | oif.if_incompat_features = osb->s_feature_incompat; |
| @@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode, | |||
| 284 | o2info_set_request_filled(&oif.if_req); | 252 | o2info_set_request_filled(&oif.if_req); |
| 285 | 253 | ||
| 286 | if (o2info_to_user(oif, req)) | 254 | if (o2info_to_user(oif, req)) |
| 287 | goto bail; | 255 | return -EFAULT; |
| 288 | 256 | ||
| 289 | status = 0; | 257 | return 0; |
| 290 | bail: | ||
| 291 | if (status) | ||
| 292 | o2info_set_request_error(&oif.if_req, req); | ||
| 293 | |||
| 294 | return status; | ||
| 295 | } | 258 | } |
| 296 | 259 | ||
| 297 | static int ocfs2_info_handle_journal_size(struct inode *inode, | 260 | static int ocfs2_info_handle_journal_size(struct inode *inode, |
| 298 | struct ocfs2_info_request __user *req) | 261 | struct ocfs2_info_request __user *req) |
| 299 | { | 262 | { |
| 300 | int status = -EFAULT; | ||
| 301 | struct ocfs2_info_journal_size oij; | 263 | struct ocfs2_info_journal_size oij; |
| 302 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 264 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 303 | 265 | ||
| 304 | if (o2info_from_user(oij, req)) | 266 | if (o2info_from_user(oij, req)) |
| 305 | goto bail; | 267 | return -EFAULT; |
| 306 | 268 | ||
| 307 | oij.ij_journal_size = i_size_read(osb->journal->j_inode); | 269 | oij.ij_journal_size = i_size_read(osb->journal->j_inode); |
| 308 | 270 | ||
| 309 | o2info_set_request_filled(&oij.ij_req); | 271 | o2info_set_request_filled(&oij.ij_req); |
| 310 | 272 | ||
| 311 | if (o2info_to_user(oij, req)) | 273 | if (o2info_to_user(oij, req)) |
| 312 | goto bail; | 274 | return -EFAULT; |
| 313 | 275 | ||
| 314 | status = 0; | 276 | return 0; |
| 315 | bail: | ||
| 316 | if (status) | ||
| 317 | o2info_set_request_error(&oij.ij_req, req); | ||
| 318 | |||
| 319 | return status; | ||
| 320 | } | 277 | } |
| 321 | 278 | ||
| 322 | static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, | 279 | static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, |
| @@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, | |||
| 373 | u32 i; | 330 | u32 i; |
| 374 | u64 blkno = -1; | 331 | u64 blkno = -1; |
| 375 | char namebuf[40]; | 332 | char namebuf[40]; |
| 376 | int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; | 333 | int status, type = INODE_ALLOC_SYSTEM_INODE; |
| 377 | struct ocfs2_info_freeinode *oifi = NULL; | 334 | struct ocfs2_info_freeinode *oifi = NULL; |
| 378 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 335 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 379 | struct inode *inode_alloc = NULL; | 336 | struct inode *inode_alloc = NULL; |
| @@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, | |||
| 385 | goto out_err; | 342 | goto out_err; |
| 386 | } | 343 | } |
| 387 | 344 | ||
| 388 | if (o2info_from_user(*oifi, req)) | 345 | if (o2info_from_user(*oifi, req)) { |
| 389 | goto bail; | 346 | status = -EFAULT; |
| 347 | goto out_free; | ||
| 348 | } | ||
| 390 | 349 | ||
| 391 | oifi->ifi_slotnum = osb->max_slots; | 350 | oifi->ifi_slotnum = osb->max_slots; |
| 392 | 351 | ||
| @@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, | |||
| 424 | 383 | ||
| 425 | o2info_set_request_filled(&oifi->ifi_req); | 384 | o2info_set_request_filled(&oifi->ifi_req); |
| 426 | 385 | ||
| 427 | if (o2info_to_user(*oifi, req)) | 386 | if (o2info_to_user(*oifi, req)) { |
| 428 | goto bail; | 387 | status = -EFAULT; |
| 388 | goto out_free; | ||
| 389 | } | ||
| 429 | 390 | ||
| 430 | status = 0; | 391 | status = 0; |
| 431 | bail: | 392 | bail: |
| 432 | if (status) | 393 | if (status) |
| 433 | o2info_set_request_error(&oifi->ifi_req, req); | 394 | o2info_set_request_error(&oifi->ifi_req, req); |
| 434 | 395 | out_free: | |
| 435 | kfree(oifi); | 396 | kfree(oifi); |
| 436 | out_err: | 397 | out_err: |
| 437 | return status; | 398 | return status; |
| @@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, | |||
| 658 | { | 619 | { |
| 659 | u64 blkno = -1; | 620 | u64 blkno = -1; |
| 660 | char namebuf[40]; | 621 | char namebuf[40]; |
| 661 | int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; | 622 | int status, type = GLOBAL_BITMAP_SYSTEM_INODE; |
| 662 | 623 | ||
| 663 | struct ocfs2_info_freefrag *oiff; | 624 | struct ocfs2_info_freefrag *oiff; |
| 664 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 625 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| @@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, | |||
| 671 | goto out_err; | 632 | goto out_err; |
| 672 | } | 633 | } |
| 673 | 634 | ||
| 674 | if (o2info_from_user(*oiff, req)) | 635 | if (o2info_from_user(*oiff, req)) { |
| 675 | goto bail; | 636 | status = -EFAULT; |
| 637 | goto out_free; | ||
| 638 | } | ||
| 676 | /* | 639 | /* |
| 677 | * chunksize from userspace should be power of 2. | 640 | * chunksize from userspace should be power of 2. |
| 678 | */ | 641 | */ |
| @@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, | |||
| 711 | 674 | ||
| 712 | if (o2info_to_user(*oiff, req)) { | 675 | if (o2info_to_user(*oiff, req)) { |
| 713 | status = -EFAULT; | 676 | status = -EFAULT; |
| 714 | goto bail; | 677 | goto out_free; |
| 715 | } | 678 | } |
| 716 | 679 | ||
| 717 | status = 0; | 680 | status = 0; |
| 718 | bail: | 681 | bail: |
| 719 | if (status) | 682 | if (status) |
| 720 | o2info_set_request_error(&oiff->iff_req, req); | 683 | o2info_set_request_error(&oiff->iff_req, req); |
| 721 | 684 | out_free: | |
| 722 | kfree(oiff); | 685 | kfree(oiff); |
| 723 | out_err: | 686 | out_err: |
| 724 | return status; | 687 | return status; |
| @@ -727,23 +690,17 @@ out_err: | |||
| 727 | static int ocfs2_info_handle_unknown(struct inode *inode, | 690 | static int ocfs2_info_handle_unknown(struct inode *inode, |
| 728 | struct ocfs2_info_request __user *req) | 691 | struct ocfs2_info_request __user *req) |
| 729 | { | 692 | { |
| 730 | int status = -EFAULT; | ||
| 731 | struct ocfs2_info_request oir; | 693 | struct ocfs2_info_request oir; |
| 732 | 694 | ||
| 733 | if (o2info_from_user(oir, req)) | 695 | if (o2info_from_user(oir, req)) |
| 734 | goto bail; | 696 | return -EFAULT; |
| 735 | 697 | ||
| 736 | o2info_clear_request_filled(&oir); | 698 | o2info_clear_request_filled(&oir); |
| 737 | 699 | ||
| 738 | if (o2info_to_user(oir, req)) | 700 | if (o2info_to_user(oir, req)) |
| 739 | goto bail; | 701 | return -EFAULT; |
| 740 | 702 | ||
| 741 | status = 0; | 703 | return 0; |
| 742 | bail: | ||
| 743 | if (status) | ||
| 744 | o2info_set_request_error(&oir, req); | ||
| 745 | |||
| 746 | return status; | ||
| 747 | } | 704 | } |
| 748 | 705 | ||
| 749 | /* | 706 | /* |
diff --git a/fs/pnode.c b/fs/pnode.c index 302bf22c4a30..aae331a5d03b 100644 --- a/fs/pnode.c +++ b/fs/pnode.c | |||
| @@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt) | |||
| 381 | * other children | 381 | * other children |
| 382 | */ | 382 | */ |
| 383 | if (child && list_empty(&child->mnt_mounts)) { | 383 | if (child && list_empty(&child->mnt_mounts)) { |
| 384 | list_del_init(&child->mnt_child); | ||
| 384 | hlist_del_init_rcu(&child->mnt_hash); | 385 | hlist_del_init_rcu(&child->mnt_hash); |
| 385 | hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); | 386 | hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); |
| 386 | } | 387 | } |
| @@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb) | |||
| 65 | return ret; | 65 | return ret; |
| 66 | return __sync_filesystem(sb, 1); | 66 | return __sync_filesystem(sb, 1); |
| 67 | } | 67 | } |
| 68 | EXPORT_SYMBOL_GPL(sync_filesystem); | 68 | EXPORT_SYMBOL(sync_filesystem); |
| 69 | 69 | ||
| 70 | static void sync_inodes_one_sb(struct super_block *sb, void *arg) | 70 | static void sync_inodes_one_sb(struct super_block *sb, void *arg) |
| 71 | { | 71 | { |
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6eaf5edf1ea1..e77db621ec89 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c | |||
| @@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode) | |||
| 45 | udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); | 45 | udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) | 48 | struct inode *udf_new_inode(struct inode *dir, umode_t mode) |
| 49 | { | 49 | { |
| 50 | struct super_block *sb = dir->i_sb; | 50 | struct super_block *sb = dir->i_sb; |
| 51 | struct udf_sb_info *sbi = UDF_SB(sb); | 51 | struct udf_sb_info *sbi = UDF_SB(sb); |
| @@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) | |||
| 55 | struct udf_inode_info *iinfo; | 55 | struct udf_inode_info *iinfo; |
| 56 | struct udf_inode_info *dinfo = UDF_I(dir); | 56 | struct udf_inode_info *dinfo = UDF_I(dir); |
| 57 | struct logicalVolIntegrityDescImpUse *lvidiu; | 57 | struct logicalVolIntegrityDescImpUse *lvidiu; |
| 58 | int err; | ||
| 58 | 59 | ||
| 59 | inode = new_inode(sb); | 60 | inode = new_inode(sb); |
| 60 | 61 | ||
| 61 | if (!inode) { | 62 | if (!inode) |
| 62 | *err = -ENOMEM; | 63 | return ERR_PTR(-ENOMEM); |
| 63 | return NULL; | ||
| 64 | } | ||
| 65 | *err = -ENOSPC; | ||
| 66 | 64 | ||
| 67 | iinfo = UDF_I(inode); | 65 | iinfo = UDF_I(inode); |
| 68 | if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { | 66 | if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { |
| @@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) | |||
| 80 | } | 78 | } |
| 81 | if (!iinfo->i_ext.i_data) { | 79 | if (!iinfo->i_ext.i_data) { |
| 82 | iput(inode); | 80 | iput(inode); |
| 83 | *err = -ENOMEM; | 81 | return ERR_PTR(-ENOMEM); |
| 84 | return NULL; | ||
| 85 | } | 82 | } |
| 86 | 83 | ||
| 84 | err = -ENOSPC; | ||
| 87 | block = udf_new_block(dir->i_sb, NULL, | 85 | block = udf_new_block(dir->i_sb, NULL, |
| 88 | dinfo->i_location.partitionReferenceNum, | 86 | dinfo->i_location.partitionReferenceNum, |
| 89 | start, err); | 87 | start, &err); |
| 90 | if (*err) { | 88 | if (err) { |
| 91 | iput(inode); | 89 | iput(inode); |
| 92 | return NULL; | 90 | return ERR_PTR(err); |
| 93 | } | 91 | } |
| 94 | 92 | ||
| 95 | lvidiu = udf_sb_lvidiu(sb); | 93 | lvidiu = udf_sb_lvidiu(sb); |
| 96 | if (lvidiu) { | 94 | if (lvidiu) { |
| 97 | iinfo->i_unique = lvid_get_unique_id(sb); | 95 | iinfo->i_unique = lvid_get_unique_id(sb); |
| 96 | inode->i_generation = iinfo->i_unique; | ||
| 98 | mutex_lock(&sbi->s_alloc_mutex); | 97 | mutex_lock(&sbi->s_alloc_mutex); |
| 99 | if (S_ISDIR(mode)) | 98 | if (S_ISDIR(mode)) |
| 100 | le32_add_cpu(&lvidiu->numDirs, 1); | 99 | le32_add_cpu(&lvidiu->numDirs, 1); |
| @@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) | |||
| 123 | iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; | 122 | iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; |
| 124 | inode->i_mtime = inode->i_atime = inode->i_ctime = | 123 | inode->i_mtime = inode->i_atime = inode->i_ctime = |
| 125 | iinfo->i_crtime = current_fs_time(inode->i_sb); | 124 | iinfo->i_crtime = current_fs_time(inode->i_sb); |
| 126 | insert_inode_hash(inode); | 125 | if (unlikely(insert_inode_locked(inode) < 0)) { |
| 126 | make_bad_inode(inode); | ||
| 127 | iput(inode); | ||
| 128 | return ERR_PTR(-EIO); | ||
| 129 | } | ||
| 127 | mark_inode_dirty(inode); | 130 | mark_inode_dirty(inode); |
| 128 | 131 | ||
| 129 | *err = 0; | ||
| 130 | return inode; | 132 | return inode; |
| 131 | } | 133 | } |
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 236cd48184c2..08598843288f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c | |||
| @@ -51,7 +51,6 @@ MODULE_LICENSE("GPL"); | |||
| 51 | 51 | ||
| 52 | static umode_t udf_convert_permissions(struct fileEntry *); | 52 | static umode_t udf_convert_permissions(struct fileEntry *); |
| 53 | static int udf_update_inode(struct inode *, int); | 53 | static int udf_update_inode(struct inode *, int); |
| 54 | static void udf_fill_inode(struct inode *, struct buffer_head *); | ||
| 55 | static int udf_sync_inode(struct inode *inode); | 54 | static int udf_sync_inode(struct inode *inode); |
| 56 | static int udf_alloc_i_data(struct inode *inode, size_t size); | 55 | static int udf_alloc_i_data(struct inode *inode, size_t size); |
| 57 | static sector_t inode_getblk(struct inode *, sector_t, int *, int *); | 56 | static sector_t inode_getblk(struct inode *, sector_t, int *, int *); |
| @@ -1271,12 +1270,33 @@ update_time: | |||
| 1271 | return 0; | 1270 | return 0; |
| 1272 | } | 1271 | } |
| 1273 | 1272 | ||
| 1274 | static void __udf_read_inode(struct inode *inode) | 1273 | /* |
| 1274 | * Maximum length of linked list formed by ICB hierarchy. The chosen number is | ||
| 1275 | * arbitrary - just that we hopefully don't limit any real use of rewritten | ||
| 1276 | * inode on write-once media but avoid looping for too long on corrupted media. | ||
| 1277 | */ | ||
| 1278 | #define UDF_MAX_ICB_NESTING 1024 | ||
| 1279 | |||
| 1280 | static int udf_read_inode(struct inode *inode) | ||
| 1275 | { | 1281 | { |
| 1276 | struct buffer_head *bh = NULL; | 1282 | struct buffer_head *bh = NULL; |
| 1277 | struct fileEntry *fe; | 1283 | struct fileEntry *fe; |
| 1284 | struct extendedFileEntry *efe; | ||
| 1278 | uint16_t ident; | 1285 | uint16_t ident; |
| 1279 | struct udf_inode_info *iinfo = UDF_I(inode); | 1286 | struct udf_inode_info *iinfo = UDF_I(inode); |
| 1287 | struct udf_sb_info *sbi = UDF_SB(inode->i_sb); | ||
| 1288 | struct kernel_lb_addr *iloc = &iinfo->i_location; | ||
| 1289 | unsigned int link_count; | ||
| 1290 | unsigned int indirections = 0; | ||
| 1291 | int ret = -EIO; | ||
| 1292 | |||
| 1293 | reread: | ||
| 1294 | if (iloc->logicalBlockNum >= | ||
| 1295 | sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { | ||
| 1296 | udf_debug("block=%d, partition=%d out of range\n", | ||
| 1297 | iloc->logicalBlockNum, iloc->partitionReferenceNum); | ||
| 1298 | return -EIO; | ||
| 1299 | } | ||
| 1280 | 1300 | ||
| 1281 | /* | 1301 | /* |
| 1282 | * Set defaults, but the inode is still incomplete! | 1302 | * Set defaults, but the inode is still incomplete! |
| @@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode) | |||
| 1290 | * i_nlink = 1 | 1310 | * i_nlink = 1 |
| 1291 | * i_op = NULL; | 1311 | * i_op = NULL; |
| 1292 | */ | 1312 | */ |
| 1293 | bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); | 1313 | bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); |
| 1294 | if (!bh) { | 1314 | if (!bh) { |
| 1295 | udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); | 1315 | udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); |
| 1296 | make_bad_inode(inode); | 1316 | return -EIO; |
| 1297 | return; | ||
| 1298 | } | 1317 | } |
| 1299 | 1318 | ||
| 1300 | if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && | 1319 | if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && |
| 1301 | ident != TAG_IDENT_USE) { | 1320 | ident != TAG_IDENT_USE) { |
| 1302 | udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", | 1321 | udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", |
| 1303 | inode->i_ino, ident); | 1322 | inode->i_ino, ident); |
| 1304 | brelse(bh); | 1323 | goto out; |
| 1305 | make_bad_inode(inode); | ||
| 1306 | return; | ||
| 1307 | } | 1324 | } |
| 1308 | 1325 | ||
| 1309 | fe = (struct fileEntry *)bh->b_data; | 1326 | fe = (struct fileEntry *)bh->b_data; |
| 1327 | efe = (struct extendedFileEntry *)bh->b_data; | ||
| 1310 | 1328 | ||
| 1311 | if (fe->icbTag.strategyType == cpu_to_le16(4096)) { | 1329 | if (fe->icbTag.strategyType == cpu_to_le16(4096)) { |
| 1312 | struct buffer_head *ibh; | 1330 | struct buffer_head *ibh; |
| 1313 | 1331 | ||
| 1314 | ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, | 1332 | ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); |
| 1315 | &ident); | ||
| 1316 | if (ident == TAG_IDENT_IE && ibh) { | 1333 | if (ident == TAG_IDENT_IE && ibh) { |
| 1317 | struct buffer_head *nbh = NULL; | ||
| 1318 | struct kernel_lb_addr loc; | 1334 | struct kernel_lb_addr loc; |
| 1319 | struct indirectEntry *ie; | 1335 | struct indirectEntry *ie; |
| 1320 | 1336 | ||
| 1321 | ie = (struct indirectEntry *)ibh->b_data; | 1337 | ie = (struct indirectEntry *)ibh->b_data; |
| 1322 | loc = lelb_to_cpu(ie->indirectICB.extLocation); | 1338 | loc = lelb_to_cpu(ie->indirectICB.extLocation); |
| 1323 | 1339 | ||
| 1324 | if (ie->indirectICB.extLength && | 1340 | if (ie->indirectICB.extLength) { |
| 1325 | (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, | 1341 | brelse(ibh); |
| 1326 | &ident))) { | 1342 | memcpy(&iinfo->i_location, &loc, |
| 1327 | if (ident == TAG_IDENT_FE || | 1343 | sizeof(struct kernel_lb_addr)); |
| 1328 | ident == TAG_IDENT_EFE) { | 1344 | if (++indirections > UDF_MAX_ICB_NESTING) { |
| 1329 | memcpy(&iinfo->i_location, | 1345 | udf_err(inode->i_sb, |
| 1330 | &loc, | 1346 | "too many ICBs in ICB hierarchy" |
| 1331 | sizeof(struct kernel_lb_addr)); | 1347 | " (max %d supported)\n", |
| 1332 | brelse(bh); | 1348 | UDF_MAX_ICB_NESTING); |
| 1333 | brelse(ibh); | 1349 | goto out; |
| 1334 | brelse(nbh); | ||
| 1335 | __udf_read_inode(inode); | ||
| 1336 | return; | ||
| 1337 | } | 1350 | } |
| 1338 | brelse(nbh); | 1351 | brelse(bh); |
| 1352 | goto reread; | ||
| 1339 | } | 1353 | } |
| 1340 | } | 1354 | } |
| 1341 | brelse(ibh); | 1355 | brelse(ibh); |
| 1342 | } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { | 1356 | } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { |
| 1343 | udf_err(inode->i_sb, "unsupported strategy type: %d\n", | 1357 | udf_err(inode->i_sb, "unsupported strategy type: %d\n", |
| 1344 | le16_to_cpu(fe->icbTag.strategyType)); | 1358 | le16_to_cpu(fe->icbTag.strategyType)); |
| 1345 | brelse(bh); | 1359 | goto out; |
| 1346 | make_bad_inode(inode); | ||
| 1347 | return; | ||
| 1348 | } | 1360 | } |
| 1349 | udf_fill_inode(inode, bh); | ||
| 1350 | |||
| 1351 | brelse(bh); | ||
| 1352 | } | ||
| 1353 | |||
| 1354 | static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | ||
| 1355 | { | ||
| 1356 | struct fileEntry *fe; | ||
| 1357 | struct extendedFileEntry *efe; | ||
| 1358 | struct udf_sb_info *sbi = UDF_SB(inode->i_sb); | ||
| 1359 | struct udf_inode_info *iinfo = UDF_I(inode); | ||
| 1360 | unsigned int link_count; | ||
| 1361 | |||
| 1362 | fe = (struct fileEntry *)bh->b_data; | ||
| 1363 | efe = (struct extendedFileEntry *)bh->b_data; | ||
| 1364 | |||
| 1365 | if (fe->icbTag.strategyType == cpu_to_le16(4)) | 1361 | if (fe->icbTag.strategyType == cpu_to_le16(4)) |
| 1366 | iinfo->i_strat4096 = 0; | 1362 | iinfo->i_strat4096 = 0; |
| 1367 | else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ | 1363 | else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ |
| @@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1378 | if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { | 1374 | if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { |
| 1379 | iinfo->i_efe = 1; | 1375 | iinfo->i_efe = 1; |
| 1380 | iinfo->i_use = 0; | 1376 | iinfo->i_use = 0; |
| 1381 | if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - | 1377 | ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - |
| 1382 | sizeof(struct extendedFileEntry))) { | 1378 | sizeof(struct extendedFileEntry)); |
| 1383 | make_bad_inode(inode); | 1379 | if (ret) |
| 1384 | return; | 1380 | goto out; |
| 1385 | } | ||
| 1386 | memcpy(iinfo->i_ext.i_data, | 1381 | memcpy(iinfo->i_ext.i_data, |
| 1387 | bh->b_data + sizeof(struct extendedFileEntry), | 1382 | bh->b_data + sizeof(struct extendedFileEntry), |
| 1388 | inode->i_sb->s_blocksize - | 1383 | inode->i_sb->s_blocksize - |
| @@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1390 | } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { | 1385 | } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { |
| 1391 | iinfo->i_efe = 0; | 1386 | iinfo->i_efe = 0; |
| 1392 | iinfo->i_use = 0; | 1387 | iinfo->i_use = 0; |
| 1393 | if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - | 1388 | ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - |
| 1394 | sizeof(struct fileEntry))) { | 1389 | sizeof(struct fileEntry)); |
| 1395 | make_bad_inode(inode); | 1390 | if (ret) |
| 1396 | return; | 1391 | goto out; |
| 1397 | } | ||
| 1398 | memcpy(iinfo->i_ext.i_data, | 1392 | memcpy(iinfo->i_ext.i_data, |
| 1399 | bh->b_data + sizeof(struct fileEntry), | 1393 | bh->b_data + sizeof(struct fileEntry), |
| 1400 | inode->i_sb->s_blocksize - sizeof(struct fileEntry)); | 1394 | inode->i_sb->s_blocksize - sizeof(struct fileEntry)); |
| @@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1404 | iinfo->i_lenAlloc = le32_to_cpu( | 1398 | iinfo->i_lenAlloc = le32_to_cpu( |
| 1405 | ((struct unallocSpaceEntry *)bh->b_data)-> | 1399 | ((struct unallocSpaceEntry *)bh->b_data)-> |
| 1406 | lengthAllocDescs); | 1400 | lengthAllocDescs); |
| 1407 | if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - | 1401 | ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - |
| 1408 | sizeof(struct unallocSpaceEntry))) { | 1402 | sizeof(struct unallocSpaceEntry)); |
| 1409 | make_bad_inode(inode); | 1403 | if (ret) |
| 1410 | return; | 1404 | goto out; |
| 1411 | } | ||
| 1412 | memcpy(iinfo->i_ext.i_data, | 1405 | memcpy(iinfo->i_ext.i_data, |
| 1413 | bh->b_data + sizeof(struct unallocSpaceEntry), | 1406 | bh->b_data + sizeof(struct unallocSpaceEntry), |
| 1414 | inode->i_sb->s_blocksize - | 1407 | inode->i_sb->s_blocksize - |
| 1415 | sizeof(struct unallocSpaceEntry)); | 1408 | sizeof(struct unallocSpaceEntry)); |
| 1416 | return; | 1409 | return 0; |
| 1417 | } | 1410 | } |
| 1418 | 1411 | ||
| 1412 | ret = -EIO; | ||
| 1419 | read_lock(&sbi->s_cred_lock); | 1413 | read_lock(&sbi->s_cred_lock); |
| 1420 | i_uid_write(inode, le32_to_cpu(fe->uid)); | 1414 | i_uid_write(inode, le32_to_cpu(fe->uid)); |
| 1421 | if (!uid_valid(inode->i_uid) || | 1415 | if (!uid_valid(inode->i_uid) || |
| @@ -1441,8 +1435,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1441 | read_unlock(&sbi->s_cred_lock); | 1435 | read_unlock(&sbi->s_cred_lock); |
| 1442 | 1436 | ||
| 1443 | link_count = le16_to_cpu(fe->fileLinkCount); | 1437 | link_count = le16_to_cpu(fe->fileLinkCount); |
| 1444 | if (!link_count) | 1438 | if (!link_count) { |
| 1445 | link_count = 1; | 1439 | ret = -ESTALE; |
| 1440 | goto out; | ||
| 1441 | } | ||
| 1446 | set_nlink(inode, link_count); | 1442 | set_nlink(inode, link_count); |
| 1447 | 1443 | ||
| 1448 | inode->i_size = le64_to_cpu(fe->informationLength); | 1444 | inode->i_size = le64_to_cpu(fe->informationLength); |
| @@ -1488,6 +1484,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1488 | iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); | 1484 | iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); |
| 1489 | iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); | 1485 | iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); |
| 1490 | } | 1486 | } |
| 1487 | inode->i_generation = iinfo->i_unique; | ||
| 1491 | 1488 | ||
| 1492 | switch (fe->icbTag.fileType) { | 1489 | switch (fe->icbTag.fileType) { |
| 1493 | case ICBTAG_FILE_TYPE_DIRECTORY: | 1490 | case ICBTAG_FILE_TYPE_DIRECTORY: |
| @@ -1537,8 +1534,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1537 | default: | 1534 | default: |
| 1538 | udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", | 1535 | udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", |
| 1539 | inode->i_ino, fe->icbTag.fileType); | 1536 | inode->i_ino, fe->icbTag.fileType); |
| 1540 | make_bad_inode(inode); | 1537 | goto out; |
| 1541 | return; | ||
| 1542 | } | 1538 | } |
| 1543 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { | 1539 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { |
| 1544 | struct deviceSpec *dsea = | 1540 | struct deviceSpec *dsea = |
| @@ -1549,8 +1545,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) | |||
| 1549 | le32_to_cpu(dsea->minorDeviceIdent))); | 1545 | le32_to_cpu(dsea->minorDeviceIdent))); |
| 1550 | /* Developer ID ??? */ | 1546 | /* Developer ID ??? */ |
| 1551 | } else | 1547 | } else |
| 1552 | make_bad_inode(inode); | 1548 | goto out; |
| 1553 | } | 1549 | } |
| 1550 | ret = 0; | ||
| 1551 | out: | ||
| 1552 | brelse(bh); | ||
| 1553 | return ret; | ||
| 1554 | } | 1554 | } |
| 1555 | 1555 | ||
| 1556 | static int udf_alloc_i_data(struct inode *inode, size_t size) | 1556 | static int udf_alloc_i_data(struct inode *inode, size_t size) |
| @@ -1664,7 +1664,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) | |||
| 1664 | FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); | 1664 | FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); |
| 1665 | fe->permissions = cpu_to_le32(udfperms); | 1665 | fe->permissions = cpu_to_le32(udfperms); |
| 1666 | 1666 | ||
| 1667 | if (S_ISDIR(inode->i_mode)) | 1667 | if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) |
| 1668 | fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); | 1668 | fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); |
| 1669 | else | 1669 | else |
| 1670 | fe->fileLinkCount = cpu_to_le16(inode->i_nlink); | 1670 | fe->fileLinkCount = cpu_to_le16(inode->i_nlink); |
| @@ -1830,32 +1830,23 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) | |||
| 1830 | { | 1830 | { |
| 1831 | unsigned long block = udf_get_lb_pblock(sb, ino, 0); | 1831 | unsigned long block = udf_get_lb_pblock(sb, ino, 0); |
| 1832 | struct inode *inode = iget_locked(sb, block); | 1832 | struct inode *inode = iget_locked(sb, block); |
| 1833 | int err; | ||
| 1833 | 1834 | ||
| 1834 | if (!inode) | 1835 | if (!inode) |
| 1835 | return NULL; | 1836 | return ERR_PTR(-ENOMEM); |
| 1836 | |||
| 1837 | if (inode->i_state & I_NEW) { | ||
| 1838 | memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); | ||
| 1839 | __udf_read_inode(inode); | ||
| 1840 | unlock_new_inode(inode); | ||
| 1841 | } | ||
| 1842 | 1837 | ||
| 1843 | if (is_bad_inode(inode)) | 1838 | if (!(inode->i_state & I_NEW)) |
| 1844 | goto out_iput; | 1839 | return inode; |
| 1845 | 1840 | ||
| 1846 | if (ino->logicalBlockNum >= UDF_SB(sb)-> | 1841 | memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); |
| 1847 | s_partmaps[ino->partitionReferenceNum].s_partition_len) { | 1842 | err = udf_read_inode(inode); |
| 1848 | udf_debug("block=%d, partition=%d out of range\n", | 1843 | if (err < 0) { |
| 1849 | ino->logicalBlockNum, ino->partitionReferenceNum); | 1844 | iget_failed(inode); |
| 1850 | make_bad_inode(inode); | 1845 | return ERR_PTR(err); |
| 1851 | goto out_iput; | ||
| 1852 | } | 1846 | } |
| 1847 | unlock_new_inode(inode); | ||
| 1853 | 1848 | ||
| 1854 | return inode; | 1849 | return inode; |
| 1855 | |||
| 1856 | out_iput: | ||
| 1857 | iput(inode); | ||
| 1858 | return NULL; | ||
| 1859 | } | 1850 | } |
| 1860 | 1851 | ||
| 1861 | int udf_add_aext(struct inode *inode, struct extent_position *epos, | 1852 | int udf_add_aext(struct inode *inode, struct extent_position *epos, |
diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 9737cba1357d..c12e260fd6c4 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c | |||
| @@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, | |||
| 270 | NULL, 0), | 270 | NULL, 0), |
| 271 | }; | 271 | }; |
| 272 | inode = udf_iget(dir->i_sb, lb); | 272 | inode = udf_iget(dir->i_sb, lb); |
| 273 | if (!inode) { | 273 | if (IS_ERR(inode)) |
| 274 | return ERR_PTR(-EACCES); | 274 | return inode; |
| 275 | } | ||
| 276 | } else | 275 | } else |
| 277 | #endif /* UDF_RECOVERY */ | 276 | #endif /* UDF_RECOVERY */ |
| 278 | 277 | ||
| @@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, | |||
| 285 | 284 | ||
| 286 | loc = lelb_to_cpu(cfi.icb.extLocation); | 285 | loc = lelb_to_cpu(cfi.icb.extLocation); |
| 287 | inode = udf_iget(dir->i_sb, &loc); | 286 | inode = udf_iget(dir->i_sb, &loc); |
| 288 | if (!inode) { | 287 | if (IS_ERR(inode)) |
| 289 | return ERR_PTR(-EACCES); | 288 | return ERR_CAST(inode); |
| 290 | } | ||
| 291 | } | 289 | } |
| 292 | 290 | ||
| 293 | return d_splice_alias(inode, dentry); | 291 | return d_splice_alias(inode, dentry); |
| @@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, | |||
| 550 | return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); | 548 | return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); |
| 551 | } | 549 | } |
| 552 | 550 | ||
| 553 | static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, | 551 | static int udf_add_nondir(struct dentry *dentry, struct inode *inode) |
| 554 | bool excl) | ||
| 555 | { | 552 | { |
| 553 | struct udf_inode_info *iinfo = UDF_I(inode); | ||
| 554 | struct inode *dir = dentry->d_parent->d_inode; | ||
| 556 | struct udf_fileident_bh fibh; | 555 | struct udf_fileident_bh fibh; |
| 557 | struct inode *inode; | ||
| 558 | struct fileIdentDesc cfi, *fi; | 556 | struct fileIdentDesc cfi, *fi; |
| 559 | int err; | 557 | int err; |
| 560 | struct udf_inode_info *iinfo; | ||
| 561 | |||
| 562 | inode = udf_new_inode(dir, mode, &err); | ||
| 563 | if (!inode) { | ||
| 564 | return err; | ||
| 565 | } | ||
| 566 | |||
| 567 | iinfo = UDF_I(inode); | ||
| 568 | if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) | ||
| 569 | inode->i_data.a_ops = &udf_adinicb_aops; | ||
| 570 | else | ||
| 571 | inode->i_data.a_ops = &udf_aops; | ||
| 572 | inode->i_op = &udf_file_inode_operations; | ||
| 573 | inode->i_fop = &udf_file_operations; | ||
| 574 | mark_inode_dirty(inode); | ||
| 575 | 558 | ||
| 576 | fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); | 559 | fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); |
| 577 | if (!fi) { | 560 | if (unlikely(!fi)) { |
| 578 | inode_dec_link_count(inode); | 561 | inode_dec_link_count(inode); |
| 562 | unlock_new_inode(inode); | ||
| 579 | iput(inode); | 563 | iput(inode); |
| 580 | return err; | 564 | return err; |
| 581 | } | 565 | } |
| @@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 589 | if (fibh.sbh != fibh.ebh) | 573 | if (fibh.sbh != fibh.ebh) |
| 590 | brelse(fibh.ebh); | 574 | brelse(fibh.ebh); |
| 591 | brelse(fibh.sbh); | 575 | brelse(fibh.sbh); |
| 576 | unlock_new_inode(inode); | ||
| 592 | d_instantiate(dentry, inode); | 577 | d_instantiate(dentry, inode); |
| 593 | 578 | ||
| 594 | return 0; | 579 | return 0; |
| 595 | } | 580 | } |
| 596 | 581 | ||
| 597 | static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | 582 | static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, |
| 583 | bool excl) | ||
| 598 | { | 584 | { |
| 599 | struct inode *inode; | 585 | struct inode *inode = udf_new_inode(dir, mode); |
| 600 | struct udf_inode_info *iinfo; | ||
| 601 | int err; | ||
| 602 | 586 | ||
| 603 | inode = udf_new_inode(dir, mode, &err); | 587 | if (IS_ERR(inode)) |
| 604 | if (!inode) | 588 | return PTR_ERR(inode); |
| 605 | return err; | ||
| 606 | 589 | ||
| 607 | iinfo = UDF_I(inode); | 590 | if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) |
| 608 | if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) | ||
| 609 | inode->i_data.a_ops = &udf_adinicb_aops; | 591 | inode->i_data.a_ops = &udf_adinicb_aops; |
| 610 | else | 592 | else |
| 611 | inode->i_data.a_ops = &udf_aops; | 593 | inode->i_data.a_ops = &udf_aops; |
| @@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 613 | inode->i_fop = &udf_file_operations; | 595 | inode->i_fop = &udf_file_operations; |
| 614 | mark_inode_dirty(inode); | 596 | mark_inode_dirty(inode); |
| 615 | 597 | ||
| 598 | return udf_add_nondir(dentry, inode); | ||
| 599 | } | ||
| 600 | |||
| 601 | static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | ||
| 602 | { | ||
| 603 | struct inode *inode = udf_new_inode(dir, mode); | ||
| 604 | |||
| 605 | if (IS_ERR(inode)) | ||
| 606 | return PTR_ERR(inode); | ||
| 607 | |||
| 608 | if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) | ||
| 609 | inode->i_data.a_ops = &udf_adinicb_aops; | ||
| 610 | else | ||
| 611 | inode->i_data.a_ops = &udf_aops; | ||
| 612 | inode->i_op = &udf_file_inode_operations; | ||
| 613 | inode->i_fop = &udf_file_operations; | ||
| 614 | mark_inode_dirty(inode); | ||
| 616 | d_tmpfile(dentry, inode); | 615 | d_tmpfile(dentry, inode); |
| 616 | unlock_new_inode(inode); | ||
| 617 | return 0; | 617 | return 0; |
| 618 | } | 618 | } |
| 619 | 619 | ||
| @@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 621 | dev_t rdev) | 621 | dev_t rdev) |
| 622 | { | 622 | { |
| 623 | struct inode *inode; | 623 | struct inode *inode; |
| 624 | struct udf_fileident_bh fibh; | ||
| 625 | struct fileIdentDesc cfi, *fi; | ||
| 626 | int err; | ||
| 627 | struct udf_inode_info *iinfo; | ||
| 628 | 624 | ||
| 629 | if (!old_valid_dev(rdev)) | 625 | if (!old_valid_dev(rdev)) |
| 630 | return -EINVAL; | 626 | return -EINVAL; |
| 631 | 627 | ||
| 632 | err = -EIO; | 628 | inode = udf_new_inode(dir, mode); |
| 633 | inode = udf_new_inode(dir, mode, &err); | 629 | if (IS_ERR(inode)) |
| 634 | if (!inode) | 630 | return PTR_ERR(inode); |
| 635 | goto out; | ||
| 636 | 631 | ||
| 637 | iinfo = UDF_I(inode); | ||
| 638 | init_special_inode(inode, mode, rdev); | 632 | init_special_inode(inode, mode, rdev); |
| 639 | fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); | 633 | return udf_add_nondir(dentry, inode); |
| 640 | if (!fi) { | ||
| 641 | inode_dec_link_count(inode); | ||
| 642 | iput(inode); | ||
| 643 | return err; | ||
| 644 | } | ||
| 645 | cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); | ||
| 646 | cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); | ||
| 647 | *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = | ||
| 648 | cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); | ||
| 649 | udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); | ||
| 650 | if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) | ||
| 651 | mark_inode_dirty(dir); | ||
| 652 | mark_inode_dirty(inode); | ||
| 653 | |||
| 654 | if (fibh.sbh != fibh.ebh) | ||
| 655 | brelse(fibh.ebh); | ||
| 656 | brelse(fibh.sbh); | ||
| 657 | d_instantiate(dentry, inode); | ||
| 658 | err = 0; | ||
| 659 | |||
| 660 | out: | ||
| 661 | return err; | ||
| 662 | } | 634 | } |
| 663 | 635 | ||
| 664 | static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 636 | static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
| @@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 670 | struct udf_inode_info *dinfo = UDF_I(dir); | 642 | struct udf_inode_info *dinfo = UDF_I(dir); |
| 671 | struct udf_inode_info *iinfo; | 643 | struct udf_inode_info *iinfo; |
| 672 | 644 | ||
| 673 | err = -EIO; | 645 | inode = udf_new_inode(dir, S_IFDIR | mode); |
| 674 | inode = udf_new_inode(dir, S_IFDIR | mode, &err); | 646 | if (IS_ERR(inode)) |
| 675 | if (!inode) | 647 | return PTR_ERR(inode); |
| 676 | goto out; | ||
| 677 | 648 | ||
| 678 | iinfo = UDF_I(inode); | 649 | iinfo = UDF_I(inode); |
| 679 | inode->i_op = &udf_dir_inode_operations; | 650 | inode->i_op = &udf_dir_inode_operations; |
| @@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 681 | fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); | 652 | fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); |
| 682 | if (!fi) { | 653 | if (!fi) { |
| 683 | inode_dec_link_count(inode); | 654 | inode_dec_link_count(inode); |
| 655 | unlock_new_inode(inode); | ||
| 684 | iput(inode); | 656 | iput(inode); |
| 685 | goto out; | 657 | goto out; |
| 686 | } | 658 | } |
| @@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 699 | if (!fi) { | 671 | if (!fi) { |
| 700 | clear_nlink(inode); | 672 | clear_nlink(inode); |
| 701 | mark_inode_dirty(inode); | 673 | mark_inode_dirty(inode); |
| 674 | unlock_new_inode(inode); | ||
| 702 | iput(inode); | 675 | iput(inode); |
| 703 | goto out; | 676 | goto out; |
| 704 | } | 677 | } |
| @@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 710 | udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); | 683 | udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); |
| 711 | inc_nlink(dir); | 684 | inc_nlink(dir); |
| 712 | mark_inode_dirty(dir); | 685 | mark_inode_dirty(dir); |
| 686 | unlock_new_inode(inode); | ||
| 713 | d_instantiate(dentry, inode); | 687 | d_instantiate(dentry, inode); |
| 714 | if (fibh.sbh != fibh.ebh) | 688 | if (fibh.sbh != fibh.ebh) |
| 715 | brelse(fibh.ebh); | 689 | brelse(fibh.ebh); |
| @@ -876,14 +850,11 @@ out: | |||
| 876 | static int udf_symlink(struct inode *dir, struct dentry *dentry, | 850 | static int udf_symlink(struct inode *dir, struct dentry *dentry, |
| 877 | const char *symname) | 851 | const char *symname) |
| 878 | { | 852 | { |
| 879 | struct inode *inode; | 853 | struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); |
| 880 | struct pathComponent *pc; | 854 | struct pathComponent *pc; |
| 881 | const char *compstart; | 855 | const char *compstart; |
| 882 | struct udf_fileident_bh fibh; | ||
| 883 | struct extent_position epos = {}; | 856 | struct extent_position epos = {}; |
| 884 | int eoffset, elen = 0; | 857 | int eoffset, elen = 0; |
| 885 | struct fileIdentDesc *fi; | ||
| 886 | struct fileIdentDesc cfi; | ||
| 887 | uint8_t *ea; | 858 | uint8_t *ea; |
| 888 | int err; | 859 | int err; |
| 889 | int block; | 860 | int block; |
| @@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, | |||
| 892 | struct udf_inode_info *iinfo; | 863 | struct udf_inode_info *iinfo; |
| 893 | struct super_block *sb = dir->i_sb; | 864 | struct super_block *sb = dir->i_sb; |
| 894 | 865 | ||
| 895 | inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); | 866 | if (IS_ERR(inode)) |
| 896 | if (!inode) | 867 | return PTR_ERR(inode); |
| 897 | goto out; | ||
| 898 | 868 | ||
| 899 | iinfo = UDF_I(inode); | 869 | iinfo = UDF_I(inode); |
| 900 | down_write(&iinfo->i_data_sem); | 870 | down_write(&iinfo->i_data_sem); |
| @@ -1012,24 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, | |||
| 1012 | mark_inode_dirty(inode); | 982 | mark_inode_dirty(inode); |
| 1013 | up_write(&iinfo->i_data_sem); | 983 | up_write(&iinfo->i_data_sem); |
| 1014 | 984 | ||
| 1015 | fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); | 985 | err = udf_add_nondir(dentry, inode); |
| 1016 | if (!fi) | ||
| 1017 | goto out_no_entry; | ||
| 1018 | cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); | ||
| 1019 | cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); | ||
| 1020 | if (UDF_SB(inode->i_sb)->s_lvid_bh) { | ||
| 1021 | *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = | ||
| 1022 | cpu_to_le32(lvid_get_unique_id(sb)); | ||
| 1023 | } | ||
| 1024 | udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); | ||
| 1025 | if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) | ||
| 1026 | mark_inode_dirty(dir); | ||
| 1027 | if (fibh.sbh != fibh.ebh) | ||
| 1028 | brelse(fibh.ebh); | ||
| 1029 | brelse(fibh.sbh); | ||
| 1030 | d_instantiate(dentry, inode); | ||
| 1031 | err = 0; | ||
| 1032 | |||
| 1033 | out: | 986 | out: |
| 1034 | kfree(name); | 987 | kfree(name); |
| 1035 | return err; | 988 | return err; |
| @@ -1037,6 +990,7 @@ out: | |||
| 1037 | out_no_entry: | 990 | out_no_entry: |
| 1038 | up_write(&iinfo->i_data_sem); | 991 | up_write(&iinfo->i_data_sem); |
| 1039 | inode_dec_link_count(inode); | 992 | inode_dec_link_count(inode); |
| 993 | unlock_new_inode(inode); | ||
| 1040 | iput(inode); | 994 | iput(inode); |
| 1041 | goto out; | 995 | goto out; |
| 1042 | } | 996 | } |
| @@ -1221,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child) | |||
| 1221 | struct udf_fileident_bh fibh; | 1175 | struct udf_fileident_bh fibh; |
| 1222 | 1176 | ||
| 1223 | if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) | 1177 | if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) |
| 1224 | goto out_unlock; | 1178 | return ERR_PTR(-EACCES); |
| 1225 | 1179 | ||
| 1226 | if (fibh.sbh != fibh.ebh) | 1180 | if (fibh.sbh != fibh.ebh) |
| 1227 | brelse(fibh.ebh); | 1181 | brelse(fibh.ebh); |
| @@ -1229,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child) | |||
| 1229 | 1183 | ||
| 1230 | tloc = lelb_to_cpu(cfi.icb.extLocation); | 1184 | tloc = lelb_to_cpu(cfi.icb.extLocation); |
| 1231 | inode = udf_iget(child->d_inode->i_sb, &tloc); | 1185 | inode = udf_iget(child->d_inode->i_sb, &tloc); |
| 1232 | if (!inode) | 1186 | if (IS_ERR(inode)) |
| 1233 | goto out_unlock; | 1187 | return ERR_CAST(inode); |
| 1234 | 1188 | ||
| 1235 | return d_obtain_alias(inode); | 1189 | return d_obtain_alias(inode); |
| 1236 | out_unlock: | ||
| 1237 | return ERR_PTR(-EACCES); | ||
| 1238 | } | 1190 | } |
| 1239 | 1191 | ||
| 1240 | 1192 | ||
| @@ -1251,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, | |||
| 1251 | loc.partitionReferenceNum = partref; | 1203 | loc.partitionReferenceNum = partref; |
| 1252 | inode = udf_iget(sb, &loc); | 1204 | inode = udf_iget(sb, &loc); |
| 1253 | 1205 | ||
| 1254 | if (inode == NULL) | 1206 | if (IS_ERR(inode)) |
| 1255 | return ERR_PTR(-ENOMEM); | 1207 | return ERR_CAST(inode); |
| 1256 | 1208 | ||
| 1257 | if (generation && inode->i_generation != generation) { | 1209 | if (generation && inode->i_generation != generation) { |
| 1258 | iput(inode); | 1210 | iput(inode); |
diff --git a/fs/udf/super.c b/fs/udf/super.c index 813da94d447b..5401fc33f5cc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
| @@ -961,12 +961,14 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, | |||
| 961 | 961 | ||
| 962 | metadata_fe = udf_iget(sb, &addr); | 962 | metadata_fe = udf_iget(sb, &addr); |
| 963 | 963 | ||
| 964 | if (metadata_fe == NULL) | 964 | if (IS_ERR(metadata_fe)) { |
| 965 | udf_warn(sb, "metadata inode efe not found\n"); | 965 | udf_warn(sb, "metadata inode efe not found\n"); |
| 966 | else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { | 966 | return metadata_fe; |
| 967 | } | ||
| 968 | if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { | ||
| 967 | udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); | 969 | udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); |
| 968 | iput(metadata_fe); | 970 | iput(metadata_fe); |
| 969 | metadata_fe = NULL; | 971 | return ERR_PTR(-EIO); |
| 970 | } | 972 | } |
| 971 | 973 | ||
| 972 | return metadata_fe; | 974 | return metadata_fe; |
| @@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) | |||
| 978 | struct udf_part_map *map; | 980 | struct udf_part_map *map; |
| 979 | struct udf_meta_data *mdata; | 981 | struct udf_meta_data *mdata; |
| 980 | struct kernel_lb_addr addr; | 982 | struct kernel_lb_addr addr; |
| 983 | struct inode *fe; | ||
| 981 | 984 | ||
| 982 | map = &sbi->s_partmaps[partition]; | 985 | map = &sbi->s_partmaps[partition]; |
| 983 | mdata = &map->s_type_specific.s_metadata; | 986 | mdata = &map->s_type_specific.s_metadata; |
| @@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) | |||
| 986 | udf_debug("Metadata file location: block = %d part = %d\n", | 989 | udf_debug("Metadata file location: block = %d part = %d\n", |
| 987 | mdata->s_meta_file_loc, map->s_partition_num); | 990 | mdata->s_meta_file_loc, map->s_partition_num); |
| 988 | 991 | ||
| 989 | mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, | 992 | fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, |
| 990 | mdata->s_meta_file_loc, map->s_partition_num); | 993 | map->s_partition_num); |
| 991 | 994 | if (IS_ERR(fe)) { | |
| 992 | if (mdata->s_metadata_fe == NULL) { | ||
| 993 | /* mirror file entry */ | 995 | /* mirror file entry */ |
| 994 | udf_debug("Mirror metadata file location: block = %d part = %d\n", | 996 | udf_debug("Mirror metadata file location: block = %d part = %d\n", |
| 995 | mdata->s_mirror_file_loc, map->s_partition_num); | 997 | mdata->s_mirror_file_loc, map->s_partition_num); |
| 996 | 998 | ||
| 997 | mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, | 999 | fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, |
| 998 | mdata->s_mirror_file_loc, map->s_partition_num); | 1000 | map->s_partition_num); |
| 999 | 1001 | ||
| 1000 | if (mdata->s_mirror_fe == NULL) { | 1002 | if (IS_ERR(fe)) { |
| 1001 | udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); | 1003 | udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); |
| 1002 | return -EIO; | 1004 | return PTR_ERR(fe); |
| 1003 | } | 1005 | } |
| 1004 | } | 1006 | mdata->s_mirror_fe = fe; |
| 1007 | } else | ||
| 1008 | mdata->s_metadata_fe = fe; | ||
| 1009 | |||
| 1005 | 1010 | ||
| 1006 | /* | 1011 | /* |
| 1007 | * bitmap file entry | 1012 | * bitmap file entry |
| @@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) | |||
| 1015 | udf_debug("Bitmap file location: block = %d part = %d\n", | 1020 | udf_debug("Bitmap file location: block = %d part = %d\n", |
| 1016 | addr.logicalBlockNum, addr.partitionReferenceNum); | 1021 | addr.logicalBlockNum, addr.partitionReferenceNum); |
| 1017 | 1022 | ||
| 1018 | mdata->s_bitmap_fe = udf_iget(sb, &addr); | 1023 | fe = udf_iget(sb, &addr); |
| 1019 | if (mdata->s_bitmap_fe == NULL) { | 1024 | if (IS_ERR(fe)) { |
| 1020 | if (sb->s_flags & MS_RDONLY) | 1025 | if (sb->s_flags & MS_RDONLY) |
| 1021 | udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); | 1026 | udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); |
| 1022 | else { | 1027 | else { |
| 1023 | udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); | 1028 | udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); |
| 1024 | return -EIO; | 1029 | return PTR_ERR(fe); |
| 1025 | } | 1030 | } |
| 1026 | } | 1031 | } else |
| 1032 | mdata->s_bitmap_fe = fe; | ||
| 1027 | } | 1033 | } |
| 1028 | 1034 | ||
| 1029 | udf_debug("udf_load_metadata_files Ok\n"); | 1035 | udf_debug("udf_load_metadata_files Ok\n"); |
| @@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, | |||
| 1111 | phd->unallocSpaceTable.extPosition), | 1117 | phd->unallocSpaceTable.extPosition), |
| 1112 | .partitionReferenceNum = p_index, | 1118 | .partitionReferenceNum = p_index, |
| 1113 | }; | 1119 | }; |
| 1120 | struct inode *inode; | ||
| 1114 | 1121 | ||
| 1115 | map->s_uspace.s_table = udf_iget(sb, &loc); | 1122 | inode = udf_iget(sb, &loc); |
| 1116 | if (!map->s_uspace.s_table) { | 1123 | if (IS_ERR(inode)) { |
| 1117 | udf_debug("cannot load unallocSpaceTable (part %d)\n", | 1124 | udf_debug("cannot load unallocSpaceTable (part %d)\n", |
| 1118 | p_index); | 1125 | p_index); |
| 1119 | return -EIO; | 1126 | return PTR_ERR(inode); |
| 1120 | } | 1127 | } |
| 1128 | map->s_uspace.s_table = inode; | ||
| 1121 | map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; | 1129 | map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; |
| 1122 | udf_debug("unallocSpaceTable (part %d) @ %ld\n", | 1130 | udf_debug("unallocSpaceTable (part %d) @ %ld\n", |
| 1123 | p_index, map->s_uspace.s_table->i_ino); | 1131 | p_index, map->s_uspace.s_table->i_ino); |
| @@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, | |||
| 1144 | phd->freedSpaceTable.extPosition), | 1152 | phd->freedSpaceTable.extPosition), |
| 1145 | .partitionReferenceNum = p_index, | 1153 | .partitionReferenceNum = p_index, |
| 1146 | }; | 1154 | }; |
| 1155 | struct inode *inode; | ||
| 1147 | 1156 | ||
| 1148 | map->s_fspace.s_table = udf_iget(sb, &loc); | 1157 | inode = udf_iget(sb, &loc); |
| 1149 | if (!map->s_fspace.s_table) { | 1158 | if (IS_ERR(inode)) { |
| 1150 | udf_debug("cannot load freedSpaceTable (part %d)\n", | 1159 | udf_debug("cannot load freedSpaceTable (part %d)\n", |
| 1151 | p_index); | 1160 | p_index); |
| 1152 | return -EIO; | 1161 | return PTR_ERR(inode); |
| 1153 | } | 1162 | } |
| 1154 | 1163 | map->s_fspace.s_table = inode; | |
| 1155 | map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; | 1164 | map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; |
| 1156 | udf_debug("freedSpaceTable (part %d) @ %ld\n", | 1165 | udf_debug("freedSpaceTable (part %d) @ %ld\n", |
| 1157 | p_index, map->s_fspace.s_table->i_ino); | 1166 | p_index, map->s_fspace.s_table->i_ino); |
| @@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, | |||
| 1178 | struct udf_part_map *map = &sbi->s_partmaps[p_index]; | 1187 | struct udf_part_map *map = &sbi->s_partmaps[p_index]; |
| 1179 | sector_t vat_block; | 1188 | sector_t vat_block; |
| 1180 | struct kernel_lb_addr ino; | 1189 | struct kernel_lb_addr ino; |
| 1190 | struct inode *inode; | ||
| 1181 | 1191 | ||
| 1182 | /* | 1192 | /* |
| 1183 | * VAT file entry is in the last recorded block. Some broken disks have | 1193 | * VAT file entry is in the last recorded block. Some broken disks have |
| @@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, | |||
| 1186 | ino.partitionReferenceNum = type1_index; | 1196 | ino.partitionReferenceNum = type1_index; |
| 1187 | for (vat_block = start_block; | 1197 | for (vat_block = start_block; |
| 1188 | vat_block >= map->s_partition_root && | 1198 | vat_block >= map->s_partition_root && |
| 1189 | vat_block >= start_block - 3 && | 1199 | vat_block >= start_block - 3; vat_block--) { |
| 1190 | !sbi->s_vat_inode; vat_block--) { | ||
| 1191 | ino.logicalBlockNum = vat_block - map->s_partition_root; | 1200 | ino.logicalBlockNum = vat_block - map->s_partition_root; |
| 1192 | sbi->s_vat_inode = udf_iget(sb, &ino); | 1201 | inode = udf_iget(sb, &ino); |
| 1202 | if (!IS_ERR(inode)) { | ||
| 1203 | sbi->s_vat_inode = inode; | ||
| 1204 | break; | ||
| 1205 | } | ||
| 1193 | } | 1206 | } |
| 1194 | } | 1207 | } |
| 1195 | 1208 | ||
| @@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) | |||
| 2205 | /* assign inodes by physical block number */ | 2218 | /* assign inodes by physical block number */ |
| 2206 | /* perhaps it's not extensible enough, but for now ... */ | 2219 | /* perhaps it's not extensible enough, but for now ... */ |
| 2207 | inode = udf_iget(sb, &rootdir); | 2220 | inode = udf_iget(sb, &rootdir); |
| 2208 | if (!inode) { | 2221 | if (IS_ERR(inode)) { |
| 2209 | udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", | 2222 | udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", |
| 2210 | rootdir.logicalBlockNum, rootdir.partitionReferenceNum); | 2223 | rootdir.logicalBlockNum, rootdir.partitionReferenceNum); |
| 2211 | ret = -EIO; | 2224 | ret = PTR_ERR(inode); |
| 2212 | goto error_out; | 2225 | goto error_out; |
| 2213 | } | 2226 | } |
| 2214 | 2227 | ||
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index be7dabbbcb49..742557be9936 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h | |||
| @@ -143,7 +143,6 @@ extern int udf_expand_file_adinicb(struct inode *); | |||
| 143 | extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); | 143 | extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); |
| 144 | extern struct buffer_head *udf_bread(struct inode *, int, int, int *); | 144 | extern struct buffer_head *udf_bread(struct inode *, int, int, int *); |
| 145 | extern int udf_setsize(struct inode *, loff_t); | 145 | extern int udf_setsize(struct inode *, loff_t); |
| 146 | extern void udf_read_inode(struct inode *); | ||
| 147 | extern void udf_evict_inode(struct inode *); | 146 | extern void udf_evict_inode(struct inode *); |
| 148 | extern int udf_write_inode(struct inode *, struct writeback_control *wbc); | 147 | extern int udf_write_inode(struct inode *, struct writeback_control *wbc); |
| 149 | extern long udf_block_map(struct inode *, sector_t); | 148 | extern long udf_block_map(struct inode *, sector_t); |
| @@ -209,7 +208,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); | |||
| 209 | 208 | ||
| 210 | /* ialloc.c */ | 209 | /* ialloc.c */ |
| 211 | extern void udf_free_inode(struct inode *); | 210 | extern void udf_free_inode(struct inode *); |
| 212 | extern struct inode *udf_new_inode(struct inode *, umode_t, int *); | 211 | extern struct inode *udf_new_inode(struct inode *, umode_t); |
| 213 | 212 | ||
| 214 | /* truncate.c */ | 213 | /* truncate.c */ |
| 215 | extern void udf_truncate_tail_extent(struct inode *); | 214 | extern void udf_truncate_tail_extent(struct inode *); |
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7c580c97990e..be7d42c7d938 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c | |||
| @@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode) | |||
| 902 | invalidate_inode_buffers(inode); | 902 | invalidate_inode_buffers(inode); |
| 903 | clear_inode(inode); | 903 | clear_inode(inode); |
| 904 | 904 | ||
| 905 | if (want_delete) { | 905 | if (want_delete) |
| 906 | lock_ufs(inode->i_sb); | 906 | ufs_free_inode(inode); |
| 907 | ufs_free_inode (inode); | ||
| 908 | unlock_ufs(inode->i_sb); | ||
| 909 | } | ||
| 910 | } | 907 | } |
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 90d74b8f8eba..2df62a73f20c 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c | |||
| @@ -126,12 +126,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, | |||
| 126 | if (l > sb->s_blocksize) | 126 | if (l > sb->s_blocksize) |
| 127 | goto out_notlocked; | 127 | goto out_notlocked; |
| 128 | 128 | ||
| 129 | lock_ufs(dir->i_sb); | ||
| 130 | inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); | 129 | inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); |
| 131 | err = PTR_ERR(inode); | 130 | err = PTR_ERR(inode); |
| 132 | if (IS_ERR(inode)) | 131 | if (IS_ERR(inode)) |
| 133 | goto out; | 132 | goto out_notlocked; |
| 134 | 133 | ||
| 134 | lock_ufs(dir->i_sb); | ||
| 135 | if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { | 135 | if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { |
| 136 | /* slow symlink */ | 136 | /* slow symlink */ |
| 137 | inode->i_op = &ufs_symlink_inode_operations; | 137 | inode->i_op = &ufs_symlink_inode_operations; |
| @@ -181,13 +181,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) | |||
| 181 | struct inode * inode; | 181 | struct inode * inode; |
| 182 | int err; | 182 | int err; |
| 183 | 183 | ||
| 184 | lock_ufs(dir->i_sb); | ||
| 185 | inode_inc_link_count(dir); | ||
| 186 | |||
| 187 | inode = ufs_new_inode(dir, S_IFDIR|mode); | 184 | inode = ufs_new_inode(dir, S_IFDIR|mode); |
| 188 | err = PTR_ERR(inode); | ||
| 189 | if (IS_ERR(inode)) | 185 | if (IS_ERR(inode)) |
| 190 | goto out_dir; | 186 | return PTR_ERR(inode); |
| 191 | 187 | ||
| 192 | inode->i_op = &ufs_dir_inode_operations; | 188 | inode->i_op = &ufs_dir_inode_operations; |
| 193 | inode->i_fop = &ufs_dir_operations; | 189 | inode->i_fop = &ufs_dir_operations; |
| @@ -195,6 +191,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) | |||
| 195 | 191 | ||
| 196 | inode_inc_link_count(inode); | 192 | inode_inc_link_count(inode); |
| 197 | 193 | ||
| 194 | lock_ufs(dir->i_sb); | ||
| 195 | inode_inc_link_count(dir); | ||
| 196 | |||
| 198 | err = ufs_make_empty(inode, dir); | 197 | err = ufs_make_empty(inode, dir); |
| 199 | if (err) | 198 | if (err) |
| 200 | goto out_fail; | 199 | goto out_fail; |
| @@ -212,7 +211,6 @@ out_fail: | |||
| 212 | inode_dec_link_count(inode); | 211 | inode_dec_link_count(inode); |
| 213 | inode_dec_link_count(inode); | 212 | inode_dec_link_count(inode); |
| 214 | iput (inode); | 213 | iput (inode); |
| 215 | out_dir: | ||
| 216 | inode_dec_link_count(dir); | 214 | inode_dec_link_count(dir); |
| 217 | unlock_ufs(dir->i_sb); | 215 | unlock_ufs(dir->i_sb); |
| 218 | goto out; | 216 | goto out; |
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index de2d26d32844..86df952d3e24 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c | |||
| @@ -5424,7 +5424,7 @@ xfs_bmap_shift_extents( | |||
| 5424 | struct xfs_bmap_free *flist, | 5424 | struct xfs_bmap_free *flist, |
| 5425 | int num_exts) | 5425 | int num_exts) |
| 5426 | { | 5426 | { |
| 5427 | struct xfs_btree_cur *cur; | 5427 | struct xfs_btree_cur *cur = NULL; |
| 5428 | struct xfs_bmbt_rec_host *gotp; | 5428 | struct xfs_bmbt_rec_host *gotp; |
| 5429 | struct xfs_bmbt_irec got; | 5429 | struct xfs_bmbt_irec got; |
| 5430 | struct xfs_bmbt_irec left; | 5430 | struct xfs_bmbt_irec left; |
| @@ -5435,7 +5435,7 @@ xfs_bmap_shift_extents( | |||
| 5435 | int error = 0; | 5435 | int error = 0; |
| 5436 | int i; | 5436 | int i; |
| 5437 | int whichfork = XFS_DATA_FORK; | 5437 | int whichfork = XFS_DATA_FORK; |
| 5438 | int logflags; | 5438 | int logflags = 0; |
| 5439 | xfs_filblks_t blockcount = 0; | 5439 | xfs_filblks_t blockcount = 0; |
| 5440 | int total_extents; | 5440 | int total_extents; |
| 5441 | 5441 | ||
| @@ -5478,16 +5478,11 @@ xfs_bmap_shift_extents( | |||
| 5478 | } | 5478 | } |
| 5479 | } | 5479 | } |
| 5480 | 5480 | ||
| 5481 | /* We are going to change core inode */ | ||
| 5482 | logflags = XFS_ILOG_CORE; | ||
| 5483 | if (ifp->if_flags & XFS_IFBROOT) { | 5481 | if (ifp->if_flags & XFS_IFBROOT) { |
| 5484 | cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); | 5482 | cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); |
| 5485 | cur->bc_private.b.firstblock = *firstblock; | 5483 | cur->bc_private.b.firstblock = *firstblock; |
| 5486 | cur->bc_private.b.flist = flist; | 5484 | cur->bc_private.b.flist = flist; |
| 5487 | cur->bc_private.b.flags = 0; | 5485 | cur->bc_private.b.flags = 0; |
| 5488 | } else { | ||
| 5489 | cur = NULL; | ||
| 5490 | logflags |= XFS_ILOG_DEXT; | ||
| 5491 | } | 5486 | } |
| 5492 | 5487 | ||
| 5493 | /* | 5488 | /* |
| @@ -5545,11 +5540,14 @@ xfs_bmap_shift_extents( | |||
| 5545 | blockcount = left.br_blockcount + | 5540 | blockcount = left.br_blockcount + |
| 5546 | got.br_blockcount; | 5541 | got.br_blockcount; |
| 5547 | xfs_iext_remove(ip, *current_ext, 1, 0); | 5542 | xfs_iext_remove(ip, *current_ext, 1, 0); |
| 5543 | logflags |= XFS_ILOG_CORE; | ||
| 5548 | if (cur) { | 5544 | if (cur) { |
| 5549 | error = xfs_btree_delete(cur, &i); | 5545 | error = xfs_btree_delete(cur, &i); |
| 5550 | if (error) | 5546 | if (error) |
| 5551 | goto del_cursor; | 5547 | goto del_cursor; |
| 5552 | XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); | 5548 | XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); |
| 5549 | } else { | ||
| 5550 | logflags |= XFS_ILOG_DEXT; | ||
| 5553 | } | 5551 | } |
| 5554 | XFS_IFORK_NEXT_SET(ip, whichfork, | 5552 | XFS_IFORK_NEXT_SET(ip, whichfork, |
| 5555 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); | 5553 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); |
| @@ -5575,6 +5573,7 @@ xfs_bmap_shift_extents( | |||
| 5575 | got.br_startoff = startoff; | 5573 | got.br_startoff = startoff; |
| 5576 | } | 5574 | } |
| 5577 | 5575 | ||
| 5576 | logflags |= XFS_ILOG_CORE; | ||
| 5578 | if (cur) { | 5577 | if (cur) { |
| 5579 | error = xfs_bmbt_update(cur, got.br_startoff, | 5578 | error = xfs_bmbt_update(cur, got.br_startoff, |
| 5580 | got.br_startblock, | 5579 | got.br_startblock, |
| @@ -5582,6 +5581,8 @@ xfs_bmap_shift_extents( | |||
| 5582 | got.br_state); | 5581 | got.br_state); |
| 5583 | if (error) | 5582 | if (error) |
| 5584 | goto del_cursor; | 5583 | goto del_cursor; |
| 5584 | } else { | ||
| 5585 | logflags |= XFS_ILOG_DEXT; | ||
| 5585 | } | 5586 | } |
| 5586 | 5587 | ||
| 5587 | (*current_ext)++; | 5588 | (*current_ext)++; |
| @@ -5597,6 +5598,7 @@ del_cursor: | |||
| 5597 | xfs_btree_del_cursor(cur, | 5598 | xfs_btree_del_cursor(cur, |
| 5598 | error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | 5599 | error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); |
| 5599 | 5600 | ||
| 5600 | xfs_trans_log_inode(tp, ip, logflags); | 5601 | if (logflags) |
| 5602 | xfs_trans_log_inode(tp, ip, logflags); | ||
| 5601 | return error; | 5603 | return error; |
| 5602 | } | 5604 | } |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 11e9b4caa54f..b984647c24db 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
| @@ -1753,11 +1753,72 @@ xfs_vm_readpages( | |||
| 1753 | return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); | 1753 | return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); |
| 1754 | } | 1754 | } |
| 1755 | 1755 | ||
| 1756 | /* | ||
| 1757 | * This is basically a copy of __set_page_dirty_buffers() with one | ||
| 1758 | * small tweak: buffers beyond EOF do not get marked dirty. If we mark them | ||
| 1759 | * dirty, we'll never be able to clean them because we don't write buffers | ||
| 1760 | * beyond EOF, and that means we can't invalidate pages that span EOF | ||
| 1761 | * that have been marked dirty. Further, the dirty state can leak into | ||
| 1762 | * the file interior if the file is extended, resulting in all sorts of | ||
| 1763 | * bad things happening as the state does not match the underlying data. | ||
| 1764 | * | ||
| 1765 | * XXX: this really indicates that bufferheads in XFS need to die. Warts like | ||
| 1766 | * this only exist because of bufferheads and how the generic code manages them. | ||
| 1767 | */ | ||
/*
 * Mark a page (and its buffers) dirty, but skip buffers that lie entirely
 * beyond EOF.  Variant of __set_page_dirty_buffers(); see the comment block
 * above for why XFS cannot let post-EOF buffers become dirty.
 *
 * Returns nonzero if the page transitioned from clean to dirty here.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;	/* current EOF */
	loff_t			offset;		/* file offset of the buffer */
	int			newly_dirty;

	/* Anonymous page (no mapping): just set the page flag directly. */
	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	/* private_lock serialises against buffer attach/detach on the page. */
	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		/*
		 * Walk the circular buffer list; only buffers that start
		 * inside EOF get marked dirty.  Buffers beyond EOF are left
		 * clean so they can be invalidated later.
		 */
		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Set the page dirty flag while still holding private_lock so the
	 * buffer state and page state cannot be observed inconsistently.
	 */
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			/* Account the dirtying and tag the radix tree so
			 * writeback can find the page. */
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		/* Tell the VFS the inode now has dirty pagecache. */
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	}
	return newly_dirty;
}
| 1815 | |||
| 1756 | const struct address_space_operations xfs_address_space_operations = { | 1816 | const struct address_space_operations xfs_address_space_operations = { |
| 1757 | .readpage = xfs_vm_readpage, | 1817 | .readpage = xfs_vm_readpage, |
| 1758 | .readpages = xfs_vm_readpages, | 1818 | .readpages = xfs_vm_readpages, |
| 1759 | .writepage = xfs_vm_writepage, | 1819 | .writepage = xfs_vm_writepage, |
| 1760 | .writepages = xfs_vm_writepages, | 1820 | .writepages = xfs_vm_writepages, |
| 1821 | .set_page_dirty = xfs_vm_set_page_dirty, | ||
| 1761 | .releasepage = xfs_vm_releasepage, | 1822 | .releasepage = xfs_vm_releasepage, |
| 1762 | .invalidatepage = xfs_vm_invalidatepage, | 1823 | .invalidatepage = xfs_vm_invalidatepage, |
| 1763 | .write_begin = xfs_vm_write_begin, | 1824 | .write_begin = xfs_vm_write_begin, |
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2f1e30d39a35..1707980f9a4b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c | |||
| @@ -1470,6 +1470,26 @@ xfs_collapse_file_space( | |||
| 1470 | start_fsb = XFS_B_TO_FSB(mp, offset + len); | 1470 | start_fsb = XFS_B_TO_FSB(mp, offset + len); |
| 1471 | shift_fsb = XFS_B_TO_FSB(mp, len); | 1471 | shift_fsb = XFS_B_TO_FSB(mp, len); |
| 1472 | 1472 | ||
| 1473 | /* | ||
| 1474 | * Write back the entire file and force remove any post-eof blocks. The | ||
| 1475 | * writeback prevents changes to the extent list via concurrent | ||
| 1476 | * writeback and the eofblocks trim prevents the extent shift algorithm | ||
| 1477 | * from running into a post-eof delalloc extent. | ||
| 1478 | * | ||
| 1479 | * XXX: This is a temporary fix until the extent shift loop below is | ||
| 1480 | * converted to use offsets and lookups within the ILOCK rather than | ||
| 1481 | * carrying around the index into the extent list for the next | ||
| 1482 | * iteration. | ||
| 1483 | */ | ||
| 1484 | error = filemap_write_and_wait(VFS_I(ip)->i_mapping); | ||
| 1485 | if (error) | ||
| 1486 | return error; | ||
| 1487 | if (xfs_can_free_eofblocks(ip, true)) { | ||
| 1488 | error = xfs_free_eofblocks(mp, ip, false); | ||
| 1489 | if (error) | ||
| 1490 | return error; | ||
| 1491 | } | ||
| 1492 | |||
| 1473 | error = xfs_free_file_space(ip, offset, len); | 1493 | error = xfs_free_file_space(ip, offset, len); |
| 1474 | if (error) | 1494 | if (error) |
| 1475 | return error; | 1495 | return error; |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 076b1708d134..de5368c803f9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
| @@ -291,12 +291,22 @@ xfs_file_read_iter( | |||
| 291 | if (inode->i_mapping->nrpages) { | 291 | if (inode->i_mapping->nrpages) { |
| 292 | ret = filemap_write_and_wait_range( | 292 | ret = filemap_write_and_wait_range( |
| 293 | VFS_I(ip)->i_mapping, | 293 | VFS_I(ip)->i_mapping, |
| 294 | pos, -1); | 294 | pos, pos + size - 1); |
| 295 | if (ret) { | 295 | if (ret) { |
| 296 | xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); | 296 | xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); |
| 297 | return ret; | 297 | return ret; |
| 298 | } | 298 | } |
| 299 | truncate_pagecache_range(VFS_I(ip), pos, -1); | 299 | |
| 300 | /* | ||
| 301 | * Invalidate whole pages. This can return an error if | ||
| 302 | * we fail to invalidate a page, but this should never | ||
| 303 | * happen on XFS. Warn if it does fail. | ||
| 304 | */ | ||
| 305 | ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, | ||
| 306 | pos >> PAGE_CACHE_SHIFT, | ||
| 307 | (pos + size - 1) >> PAGE_CACHE_SHIFT); | ||
| 308 | WARN_ON_ONCE(ret); | ||
| 309 | ret = 0; | ||
| 300 | } | 310 | } |
| 301 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); | 311 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
| 302 | } | 312 | } |
| @@ -632,10 +642,19 @@ xfs_file_dio_aio_write( | |||
| 632 | 642 | ||
| 633 | if (mapping->nrpages) { | 643 | if (mapping->nrpages) { |
| 634 | ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, | 644 | ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, |
| 635 | pos, -1); | 645 | pos, pos + count - 1); |
| 636 | if (ret) | 646 | if (ret) |
| 637 | goto out; | 647 | goto out; |
| 638 | truncate_pagecache_range(VFS_I(ip), pos, -1); | 648 | /* |
| 649 | * Invalidate whole pages. This can return an error if | ||
| 650 | * we fail to invalidate a page, but this should never | ||
| 651 | * happen on XFS. Warn if it does fail. | ||
| 652 | */ | ||
| 653 | ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, | ||
| 654 | pos >> PAGE_CACHE_SHIFT, | ||
| 655 | (pos + count - 1) >> PAGE_CACHE_SHIFT); | ||
| 656 | WARN_ON_ONCE(ret); | ||
| 657 | ret = 0; | ||
| 639 | } | 658 | } |
| 640 | 659 | ||
| 641 | /* | 660 | /* |
