diff options
Diffstat (limited to 'fs')
111 files changed, 5488 insertions, 2455 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index db5dc1598716..664991afe0c0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
| @@ -67,6 +67,7 @@ source "fs/quota/Kconfig" | |||
| 67 | 67 | ||
| 68 | source "fs/autofs4/Kconfig" | 68 | source "fs/autofs4/Kconfig" |
| 69 | source "fs/fuse/Kconfig" | 69 | source "fs/fuse/Kconfig" |
| 70 | source "fs/overlayfs/Kconfig" | ||
| 70 | 71 | ||
| 71 | menu "Caches" | 72 | menu "Caches" |
| 72 | 73 | ||
diff --git a/fs/Makefile b/fs/Makefile index 90c88529892b..da0bbb456d3f 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
| @@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ | |||
| 104 | obj-$(CONFIG_AUTOFS4_FS) += autofs4/ | 104 | obj-$(CONFIG_AUTOFS4_FS) += autofs4/ |
| 105 | obj-$(CONFIG_ADFS_FS) += adfs/ | 105 | obj-$(CONFIG_ADFS_FS) += adfs/ |
| 106 | obj-$(CONFIG_FUSE_FS) += fuse/ | 106 | obj-$(CONFIG_FUSE_FS) += fuse/ |
| 107 | obj-$(CONFIG_OVERLAY_FS) += overlayfs/ | ||
| 107 | obj-$(CONFIG_UDF_FS) += udf/ | 108 | obj-$(CONFIG_UDF_FS) += udf/ |
| 108 | obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ | 109 | obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ |
| 109 | obj-$(CONFIG_OMFS_FS) += omfs/ | 110 | obj-$(CONFIG_OMFS_FS) += omfs/ |
| @@ -165,6 +165,15 @@ static struct vfsmount *aio_mnt; | |||
| 165 | static const struct file_operations aio_ring_fops; | 165 | static const struct file_operations aio_ring_fops; |
| 166 | static const struct address_space_operations aio_ctx_aops; | 166 | static const struct address_space_operations aio_ctx_aops; |
| 167 | 167 | ||
| 168 | /* Backing dev info for aio fs. | ||
| 169 | * -no dirty page accounting or writeback happens | ||
| 170 | */ | ||
| 171 | static struct backing_dev_info aio_fs_backing_dev_info = { | ||
| 172 | .name = "aiofs", | ||
| 173 | .state = 0, | ||
| 174 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY, | ||
| 175 | }; | ||
| 176 | |||
| 168 | static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) | 177 | static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) |
| 169 | { | 178 | { |
| 170 | struct qstr this = QSTR_INIT("[aio]", 5); | 179 | struct qstr this = QSTR_INIT("[aio]", 5); |
| @@ -176,6 +185,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) | |||
| 176 | 185 | ||
| 177 | inode->i_mapping->a_ops = &aio_ctx_aops; | 186 | inode->i_mapping->a_ops = &aio_ctx_aops; |
| 178 | inode->i_mapping->private_data = ctx; | 187 | inode->i_mapping->private_data = ctx; |
| 188 | inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info; | ||
| 179 | inode->i_size = PAGE_SIZE * nr_pages; | 189 | inode->i_size = PAGE_SIZE * nr_pages; |
| 180 | 190 | ||
| 181 | path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); | 191 | path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); |
| @@ -220,6 +230,9 @@ static int __init aio_setup(void) | |||
| 220 | if (IS_ERR(aio_mnt)) | 230 | if (IS_ERR(aio_mnt)) |
| 221 | panic("Failed to create aio fs mount."); | 231 | panic("Failed to create aio fs mount."); |
| 222 | 232 | ||
| 233 | if (bdi_init(&aio_fs_backing_dev_info)) | ||
| 234 | panic("Failed to init aio fs backing dev info."); | ||
| 235 | |||
| 223 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 236 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
| 224 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 237 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
| 225 | 238 | ||
| @@ -281,11 +294,6 @@ static const struct file_operations aio_ring_fops = { | |||
| 281 | .mmap = aio_ring_mmap, | 294 | .mmap = aio_ring_mmap, |
| 282 | }; | 295 | }; |
| 283 | 296 | ||
| 284 | static int aio_set_page_dirty(struct page *page) | ||
| 285 | { | ||
| 286 | return 0; | ||
| 287 | } | ||
| 288 | |||
| 289 | #if IS_ENABLED(CONFIG_MIGRATION) | 297 | #if IS_ENABLED(CONFIG_MIGRATION) |
| 290 | static int aio_migratepage(struct address_space *mapping, struct page *new, | 298 | static int aio_migratepage(struct address_space *mapping, struct page *new, |
| 291 | struct page *old, enum migrate_mode mode) | 299 | struct page *old, enum migrate_mode mode) |
| @@ -357,7 +365,7 @@ out: | |||
| 357 | #endif | 365 | #endif |
| 358 | 366 | ||
| 359 | static const struct address_space_operations aio_ctx_aops = { | 367 | static const struct address_space_operations aio_ctx_aops = { |
| 360 | .set_page_dirty = aio_set_page_dirty, | 368 | .set_page_dirty = __set_page_dirty_no_writeback, |
| 361 | #if IS_ENABLED(CONFIG_MIGRATION) | 369 | #if IS_ENABLED(CONFIG_MIGRATION) |
| 362 | .migratepage = aio_migratepage, | 370 | .migratepage = aio_migratepage, |
| 363 | #endif | 371 | #endif |
| @@ -412,7 +420,6 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
| 412 | pr_debug("pid(%d) page[%d]->count=%d\n", | 420 | pr_debug("pid(%d) page[%d]->count=%d\n", |
| 413 | current->pid, i, page_count(page)); | 421 | current->pid, i, page_count(page)); |
| 414 | SetPageUptodate(page); | 422 | SetPageUptodate(page); |
| 415 | SetPageDirty(page); | ||
| 416 | unlock_page(page); | 423 | unlock_page(page); |
| 417 | 424 | ||
| 418 | ctx->ring_pages[i] = page; | 425 | ctx->ring_pages[i] = page; |
diff --git a/fs/block_dev.c b/fs/block_dev.c index cc9d4114cda0..1d9c9f3754f8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -1585,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 1585 | } | 1585 | } |
| 1586 | EXPORT_SYMBOL_GPL(blkdev_write_iter); | 1586 | EXPORT_SYMBOL_GPL(blkdev_write_iter); |
| 1587 | 1587 | ||
| 1588 | static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) | 1588 | ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) |
| 1589 | { | 1589 | { |
| 1590 | struct file *file = iocb->ki_filp; | 1590 | struct file *file = iocb->ki_filp; |
| 1591 | struct inode *bd_inode = file->f_mapping->host; | 1591 | struct inode *bd_inode = file->f_mapping->host; |
| @@ -1599,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) | |||
| 1599 | iov_iter_truncate(to, size); | 1599 | iov_iter_truncate(to, size); |
| 1600 | return generic_file_read_iter(iocb, to); | 1600 | return generic_file_read_iter(iocb, to); |
| 1601 | } | 1601 | } |
| 1602 | EXPORT_SYMBOL_GPL(blkdev_read_iter); | ||
| 1602 | 1603 | ||
| 1603 | /* | 1604 | /* |
| 1604 | * Try to release a page associated with block device when the system | 1605 | * Try to release a page associated with block device when the system |
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d3220d31d3cb..dcd9be32ac57 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c | |||
| @@ -1011,8 +1011,6 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, | |||
| 1011 | bytes = min(bytes, working_bytes); | 1011 | bytes = min(bytes, working_bytes); |
| 1012 | kaddr = kmap_atomic(page_out); | 1012 | kaddr = kmap_atomic(page_out); |
| 1013 | memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); | 1013 | memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); |
| 1014 | if (*pg_index == (vcnt - 1) && *pg_offset == 0) | ||
| 1015 | memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); | ||
| 1016 | kunmap_atomic(kaddr); | 1014 | kunmap_atomic(kaddr); |
| 1017 | flush_dcache_page(page_out); | 1015 | flush_dcache_page(page_out); |
| 1018 | 1016 | ||
| @@ -1054,3 +1052,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, | |||
| 1054 | 1052 | ||
| 1055 | return 1; | 1053 | return 1; |
| 1056 | } | 1054 | } |
| 1055 | |||
| 1056 | /* | ||
| 1057 | * When uncompressing data, we need to make sure and zero any parts of | ||
| 1058 | * the biovec that were not filled in by the decompression code. pg_index | ||
| 1059 | * and pg_offset indicate the last page and the last offset of that page | ||
| 1060 | * that have been filled in. This will zero everything remaining in the | ||
| 1061 | * biovec. | ||
| 1062 | */ | ||
| 1063 | void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, | ||
| 1064 | unsigned long pg_index, | ||
| 1065 | unsigned long pg_offset) | ||
| 1066 | { | ||
| 1067 | while (pg_index < vcnt) { | ||
| 1068 | struct page *page = bvec[pg_index].bv_page; | ||
| 1069 | unsigned long off = bvec[pg_index].bv_offset; | ||
| 1070 | unsigned long len = bvec[pg_index].bv_len; | ||
| 1071 | |||
| 1072 | if (pg_offset < off) | ||
| 1073 | pg_offset = off; | ||
| 1074 | if (pg_offset < off + len) { | ||
| 1075 | unsigned long bytes = off + len - pg_offset; | ||
| 1076 | char *kaddr; | ||
| 1077 | |||
| 1078 | kaddr = kmap_atomic(page); | ||
| 1079 | memset(kaddr + pg_offset, 0, bytes); | ||
| 1080 | kunmap_atomic(kaddr); | ||
| 1081 | } | ||
| 1082 | pg_index++; | ||
| 1083 | pg_offset = 0; | ||
| 1084 | } | ||
| 1085 | } | ||
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0c803b4fbf93..d181f70caae0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h | |||
| @@ -45,7 +45,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, | |||
| 45 | unsigned long nr_pages); | 45 | unsigned long nr_pages); |
| 46 | int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | 46 | int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, |
| 47 | int mirror_num, unsigned long bio_flags); | 47 | int mirror_num, unsigned long bio_flags); |
| 48 | 48 | void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, | |
| 49 | unsigned long pg_index, | ||
| 50 | unsigned long pg_offset); | ||
| 49 | struct btrfs_compress_op { | 51 | struct btrfs_compress_op { |
| 50 | struct list_head *(*alloc_workspace)(void); | 52 | struct list_head *(*alloc_workspace)(void); |
| 51 | 53 | ||
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 19bc6162fb8e..150822ee0a0b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
| @@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, | |||
| 80 | { | 80 | { |
| 81 | int i; | 81 | int i; |
| 82 | 82 | ||
| 83 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 84 | /* lockdep really cares that we take all of these spinlocks | ||
| 85 | * in the right order. If any of the locks in the path are not | ||
| 86 | * currently blocking, it is going to complain. So, make really | ||
| 87 | * really sure by forcing the path to blocking before we clear | ||
| 88 | * the path blocking. | ||
| 89 | */ | ||
| 90 | if (held) { | 83 | if (held) { |
| 91 | btrfs_set_lock_blocking_rw(held, held_rw); | 84 | btrfs_set_lock_blocking_rw(held, held_rw); |
| 92 | if (held_rw == BTRFS_WRITE_LOCK) | 85 | if (held_rw == BTRFS_WRITE_LOCK) |
| @@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, | |||
| 95 | held_rw = BTRFS_READ_LOCK_BLOCKING; | 88 | held_rw = BTRFS_READ_LOCK_BLOCKING; |
| 96 | } | 89 | } |
| 97 | btrfs_set_path_blocking(p); | 90 | btrfs_set_path_blocking(p); |
| 98 | #endif | ||
| 99 | 91 | ||
| 100 | for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { | 92 | for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { |
| 101 | if (p->nodes[i] && p->locks[i]) { | 93 | if (p->nodes[i] && p->locks[i]) { |
| @@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, | |||
| 107 | } | 99 | } |
| 108 | } | 100 | } |
| 109 | 101 | ||
| 110 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 111 | if (held) | 102 | if (held) |
| 112 | btrfs_clear_lock_blocking_rw(held, held_rw); | 103 | btrfs_clear_lock_blocking_rw(held, held_rw); |
| 113 | #endif | ||
| 114 | } | 104 | } |
| 115 | 105 | ||
| 116 | /* this also releases the path */ | 106 | /* this also releases the path */ |
| @@ -2893,7 +2883,7 @@ cow_done: | |||
| 2893 | } | 2883 | } |
| 2894 | p->locks[level] = BTRFS_WRITE_LOCK; | 2884 | p->locks[level] = BTRFS_WRITE_LOCK; |
| 2895 | } else { | 2885 | } else { |
| 2896 | err = btrfs_try_tree_read_lock(b); | 2886 | err = btrfs_tree_read_lock_atomic(b); |
| 2897 | if (!err) { | 2887 | if (!err) { |
| 2898 | btrfs_set_path_blocking(p); | 2888 | btrfs_set_path_blocking(p); |
| 2899 | btrfs_tree_read_lock(b); | 2889 | btrfs_tree_read_lock(b); |
| @@ -3025,7 +3015,7 @@ again: | |||
| 3025 | } | 3015 | } |
| 3026 | 3016 | ||
| 3027 | level = btrfs_header_level(b); | 3017 | level = btrfs_header_level(b); |
| 3028 | err = btrfs_try_tree_read_lock(b); | 3018 | err = btrfs_tree_read_lock_atomic(b); |
| 3029 | if (!err) { | 3019 | if (!err) { |
| 3030 | btrfs_set_path_blocking(p); | 3020 | btrfs_set_path_blocking(p); |
| 3031 | btrfs_tree_read_lock(b); | 3021 | btrfs_tree_read_lock(b); |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d557264ee974..fe69edda11fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
| @@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | |||
| 3276 | struct btrfs_root *root, unsigned long count); | 3276 | struct btrfs_root *root, unsigned long count); |
| 3277 | int btrfs_async_run_delayed_refs(struct btrfs_root *root, | 3277 | int btrfs_async_run_delayed_refs(struct btrfs_root *root, |
| 3278 | unsigned long count, int wait); | 3278 | unsigned long count, int wait); |
| 3279 | int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); | 3279 | int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); |
| 3280 | int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, | 3280 | int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, |
| 3281 | struct btrfs_root *root, u64 bytenr, | 3281 | struct btrfs_root *root, u64 bytenr, |
| 3282 | u64 offset, int metadata, u64 *refs, u64 *flags); | 3282 | u64 offset, int metadata, u64 *refs, u64 *flags); |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ad0f47ac850..1bf9f897065d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
| @@ -3817,19 +3817,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | |||
| 3817 | struct btrfs_super_block *sb = fs_info->super_copy; | 3817 | struct btrfs_super_block *sb = fs_info->super_copy; |
| 3818 | int ret = 0; | 3818 | int ret = 0; |
| 3819 | 3819 | ||
| 3820 | if (sb->root_level > BTRFS_MAX_LEVEL) { | 3820 | if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { |
| 3821 | printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n", | 3821 | printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", |
| 3822 | sb->root_level, BTRFS_MAX_LEVEL); | 3822 | btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); |
| 3823 | ret = -EINVAL; | 3823 | ret = -EINVAL; |
| 3824 | } | 3824 | } |
| 3825 | if (sb->chunk_root_level > BTRFS_MAX_LEVEL) { | 3825 | if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { |
| 3826 | printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n", | 3826 | printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n", |
| 3827 | sb->chunk_root_level, BTRFS_MAX_LEVEL); | 3827 | btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); |
| 3828 | ret = -EINVAL; | 3828 | ret = -EINVAL; |
| 3829 | } | 3829 | } |
| 3830 | if (sb->log_root_level > BTRFS_MAX_LEVEL) { | 3830 | if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { |
| 3831 | printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n", | 3831 | printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n", |
| 3832 | sb->log_root_level, BTRFS_MAX_LEVEL); | 3832 | btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); |
| 3833 | ret = -EINVAL; | 3833 | ret = -EINVAL; |
| 3834 | } | 3834 | } |
| 3835 | 3835 | ||
| @@ -3837,15 +3837,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | |||
| 3837 | * The common minimum, we don't know if we can trust the nodesize/sectorsize | 3837 | * The common minimum, we don't know if we can trust the nodesize/sectorsize |
| 3838 | * items yet, they'll be verified later. Issue just a warning. | 3838 | * items yet, they'll be verified later. Issue just a warning. |
| 3839 | */ | 3839 | */ |
| 3840 | if (!IS_ALIGNED(sb->root, 4096)) | 3840 | if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) |
| 3841 | printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", | 3841 | printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", |
| 3842 | sb->root); | 3842 | sb->root); |
| 3843 | if (!IS_ALIGNED(sb->chunk_root, 4096)) | 3843 | if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) |
| 3844 | printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", | 3844 | printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", |
| 3845 | sb->chunk_root); | 3845 | sb->chunk_root); |
| 3846 | if (!IS_ALIGNED(sb->log_root, 4096)) | 3846 | if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) |
| 3847 | printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", | 3847 | printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", |
| 3848 | sb->log_root); | 3848 | btrfs_super_log_root(sb)); |
| 3849 | 3849 | ||
| 3850 | if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { | 3850 | if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { |
| 3851 | printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", | 3851 | printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", |
| @@ -3857,13 +3857,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | |||
| 3857 | * Hint to catch really bogus numbers, bitflips or so, more exact checks are | 3857 | * Hint to catch really bogus numbers, bitflips or so, more exact checks are |
| 3858 | * done later | 3858 | * done later |
| 3859 | */ | 3859 | */ |
| 3860 | if (sb->num_devices > (1UL << 31)) | 3860 | if (btrfs_super_num_devices(sb) > (1UL << 31)) |
| 3861 | printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", | 3861 | printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", |
| 3862 | sb->num_devices); | 3862 | btrfs_super_num_devices(sb)); |
| 3863 | 3863 | ||
| 3864 | if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) { | 3864 | if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { |
| 3865 | printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", | 3865 | printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", |
| 3866 | sb->bytenr, BTRFS_SUPER_INFO_OFFSET); | 3866 | btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); |
| 3867 | ret = -EINVAL; | 3867 | ret = -EINVAL; |
| 3868 | } | 3868 | } |
| 3869 | 3869 | ||
| @@ -3871,14 +3871,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | |||
| 3871 | * The generation is a global counter, we'll trust it more than the others | 3871 | * The generation is a global counter, we'll trust it more than the others |
| 3872 | * but it's still possible that it's the one that's wrong. | 3872 | * but it's still possible that it's the one that's wrong. |
| 3873 | */ | 3873 | */ |
| 3874 | if (sb->generation < sb->chunk_root_generation) | 3874 | if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) |
| 3875 | printk(KERN_WARNING | 3875 | printk(KERN_WARNING |
| 3876 | "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", | 3876 | "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", |
| 3877 | sb->generation, sb->chunk_root_generation); | 3877 | btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); |
| 3878 | if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1) | 3878 | if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) |
| 3879 | && btrfs_super_cache_generation(sb) != (u64)-1) | ||
| 3879 | printk(KERN_WARNING | 3880 | printk(KERN_WARNING |
| 3880 | "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", | 3881 | "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", |
| 3881 | sb->generation, sb->cache_generation); | 3882 | btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); |
| 3882 | 3883 | ||
| 3883 | return ret; | 3884 | return ret; |
| 3884 | } | 3885 | } |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d56589571012..47c1ba141082 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
| @@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) | |||
| 710 | rcu_read_unlock(); | 710 | rcu_read_unlock(); |
| 711 | } | 711 | } |
| 712 | 712 | ||
| 713 | /* simple helper to search for an existing extent at a given offset */ | 713 | /* simple helper to search for an existing data extent at a given offset */ |
| 714 | int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) | 714 | int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) |
| 715 | { | 715 | { |
| 716 | int ret; | 716 | int ret; |
| 717 | struct btrfs_key key; | 717 | struct btrfs_key key; |
| @@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) | |||
| 726 | key.type = BTRFS_EXTENT_ITEM_KEY; | 726 | key.type = BTRFS_EXTENT_ITEM_KEY; |
| 727 | ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, | 727 | ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, |
| 728 | 0, 0); | 728 | 0, 0); |
| 729 | if (ret > 0) { | ||
| 730 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
| 731 | if (key.objectid == start && | ||
| 732 | key.type == BTRFS_METADATA_ITEM_KEY) | ||
| 733 | ret = 0; | ||
| 734 | } | ||
| 735 | btrfs_free_path(path); | 729 | btrfs_free_path(path); |
| 736 | return ret; | 730 | return ret; |
| 737 | } | 731 | } |
| @@ -786,7 +780,6 @@ search_again: | |||
| 786 | else | 780 | else |
| 787 | key.type = BTRFS_EXTENT_ITEM_KEY; | 781 | key.type = BTRFS_EXTENT_ITEM_KEY; |
| 788 | 782 | ||
| 789 | again: | ||
| 790 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, | 783 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, |
| 791 | &key, path, 0, 0); | 784 | &key, path, 0, 0); |
| 792 | if (ret < 0) | 785 | if (ret < 0) |
| @@ -802,13 +795,6 @@ again: | |||
| 802 | key.offset == root->nodesize) | 795 | key.offset == root->nodesize) |
| 803 | ret = 0; | 796 | ret = 0; |
| 804 | } | 797 | } |
| 805 | if (ret) { | ||
| 806 | key.objectid = bytenr; | ||
| 807 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
| 808 | key.offset = root->nodesize; | ||
| 809 | btrfs_release_path(path); | ||
| 810 | goto again; | ||
| 811 | } | ||
| 812 | } | 798 | } |
| 813 | 799 | ||
| 814 | if (ret == 0) { | 800 | if (ret == 0) { |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 783a94355efd..84a2d1868271 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
| @@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, | |||
| 413 | ret = 0; | 413 | ret = 0; |
| 414 | fail: | 414 | fail: |
| 415 | while (ret < 0 && !list_empty(&tmplist)) { | 415 | while (ret < 0 && !list_empty(&tmplist)) { |
| 416 | sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); | 416 | sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); |
| 417 | list_del(&sums->list); | 417 | list_del(&sums->list); |
| 418 | kfree(sums); | 418 | kfree(sums); |
| 419 | } | 419 | } |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d2b76e29d3b..4399f0c3a4ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
| @@ -765,23 +765,6 @@ out: | |||
| 765 | return ret; | 765 | return ret; |
| 766 | } | 766 | } |
| 767 | 767 | ||
| 768 | /* copy of check_sticky in fs/namei.c() | ||
| 769 | * It's inline, so penalty for filesystems that don't use sticky bit is | ||
| 770 | * minimal. | ||
| 771 | */ | ||
| 772 | static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) | ||
| 773 | { | ||
| 774 | kuid_t fsuid = current_fsuid(); | ||
| 775 | |||
| 776 | if (!(dir->i_mode & S_ISVTX)) | ||
| 777 | return 0; | ||
| 778 | if (uid_eq(inode->i_uid, fsuid)) | ||
| 779 | return 0; | ||
| 780 | if (uid_eq(dir->i_uid, fsuid)) | ||
| 781 | return 0; | ||
| 782 | return !capable(CAP_FOWNER); | ||
| 783 | } | ||
| 784 | |||
| 785 | /* copy of may_delete in fs/namei.c() | 768 | /* copy of may_delete in fs/namei.c() |
| 786 | * Check whether we can remove a link victim from directory dir, check | 769 | * Check whether we can remove a link victim from directory dir, check |
| 787 | * whether the type of victim is right. | 770 | * whether the type of victim is right. |
| @@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) | |||
| 817 | return error; | 800 | return error; |
| 818 | if (IS_APPEND(dir)) | 801 | if (IS_APPEND(dir)) |
| 819 | return -EPERM; | 802 | return -EPERM; |
| 820 | if (btrfs_check_sticky(dir, victim->d_inode)|| | 803 | if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || |
| 821 | IS_APPEND(victim->d_inode)|| | ||
| 822 | IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) | 804 | IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) |
| 823 | return -EPERM; | 805 | return -EPERM; |
| 824 | if (isdir) { | 806 | if (isdir) { |
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5665d2149249..f8229ef1b46d 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
| @@ -128,6 +128,26 @@ again: | |||
| 128 | } | 128 | } |
| 129 | 129 | ||
| 130 | /* | 130 | /* |
| 131 | * take a spinning read lock. | ||
| 132 | * returns 1 if we get the read lock and 0 if we don't | ||
| 133 | * this won't wait for blocking writers | ||
| 134 | */ | ||
| 135 | int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) | ||
| 136 | { | ||
| 137 | if (atomic_read(&eb->blocking_writers)) | ||
| 138 | return 0; | ||
| 139 | |||
| 140 | read_lock(&eb->lock); | ||
| 141 | if (atomic_read(&eb->blocking_writers)) { | ||
| 142 | read_unlock(&eb->lock); | ||
| 143 | return 0; | ||
| 144 | } | ||
| 145 | atomic_inc(&eb->read_locks); | ||
| 146 | atomic_inc(&eb->spinning_readers); | ||
| 147 | return 1; | ||
| 148 | } | ||
| 149 | |||
| 150 | /* | ||
| 131 | * returns 1 if we get the read lock and 0 if we don't | 151 | * returns 1 if we get the read lock and 0 if we don't |
| 132 | * this won't wait for blocking writers | 152 | * this won't wait for blocking writers |
| 133 | */ | 153 | */ |
| @@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) | |||
| 158 | atomic_read(&eb->blocking_readers)) | 178 | atomic_read(&eb->blocking_readers)) |
| 159 | return 0; | 179 | return 0; |
| 160 | 180 | ||
| 161 | if (!write_trylock(&eb->lock)) | 181 | write_lock(&eb->lock); |
| 162 | return 0; | ||
| 163 | |||
| 164 | if (atomic_read(&eb->blocking_writers) || | 182 | if (atomic_read(&eb->blocking_writers) || |
| 165 | atomic_read(&eb->blocking_readers)) { | 183 | atomic_read(&eb->blocking_readers)) { |
| 166 | write_unlock(&eb->lock); | 184 | write_unlock(&eb->lock); |
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index b81e0e9a4894..c44a9d5f5362 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h | |||
| @@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); | |||
| 35 | void btrfs_assert_tree_locked(struct extent_buffer *eb); | 35 | void btrfs_assert_tree_locked(struct extent_buffer *eb); |
| 36 | int btrfs_try_tree_read_lock(struct extent_buffer *eb); | 36 | int btrfs_try_tree_read_lock(struct extent_buffer *eb); |
| 37 | int btrfs_try_tree_write_lock(struct extent_buffer *eb); | 37 | int btrfs_try_tree_write_lock(struct extent_buffer *eb); |
| 38 | int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); | ||
| 39 | |||
| 38 | 40 | ||
| 39 | static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) | 41 | static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) |
| 40 | { | 42 | { |
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 78285f30909e..617553cdb7d3 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c | |||
| @@ -373,6 +373,8 @@ cont: | |||
| 373 | } | 373 | } |
| 374 | done: | 374 | done: |
| 375 | kunmap(pages_in[page_in_index]); | 375 | kunmap(pages_in[page_in_index]); |
| 376 | if (!ret) | ||
| 377 | btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); | ||
| 376 | return ret; | 378 | return ret; |
| 377 | } | 379 | } |
| 378 | 380 | ||
| @@ -410,10 +412,23 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, | |||
| 410 | goto out; | 412 | goto out; |
| 411 | } | 413 | } |
| 412 | 414 | ||
| 415 | /* | ||
| 416 | * the caller is already checking against PAGE_SIZE, but lets | ||
| 417 | * move this check closer to the memcpy/memset | ||
| 418 | */ | ||
| 419 | destlen = min_t(unsigned long, destlen, PAGE_SIZE); | ||
| 413 | bytes = min_t(unsigned long, destlen, out_len - start_byte); | 420 | bytes = min_t(unsigned long, destlen, out_len - start_byte); |
| 414 | 421 | ||
| 415 | kaddr = kmap_atomic(dest_page); | 422 | kaddr = kmap_atomic(dest_page); |
| 416 | memcpy(kaddr, workspace->buf + start_byte, bytes); | 423 | memcpy(kaddr, workspace->buf + start_byte, bytes); |
| 424 | |||
| 425 | /* | ||
| 426 | * btrfs_getblock is doing a zero on the tail of the page too, | ||
| 427 | * but this will cover anything missing from the decompressed | ||
| 428 | * data. | ||
| 429 | */ | ||
| 430 | if (bytes < destlen) | ||
| 431 | memset(kaddr+bytes, 0, destlen-bytes); | ||
| 417 | kunmap_atomic(kaddr); | 432 | kunmap_atomic(kaddr); |
| 418 | out: | 433 | out: |
| 419 | return ret; | 434 | return ret; |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a2b97ef10317..54bd91ece35b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
| @@ -2151,6 +2151,7 @@ static void __exit exit_btrfs_fs(void) | |||
| 2151 | extent_map_exit(); | 2151 | extent_map_exit(); |
| 2152 | extent_io_exit(); | 2152 | extent_io_exit(); |
| 2153 | btrfs_interface_exit(); | 2153 | btrfs_interface_exit(); |
| 2154 | btrfs_end_io_wq_exit(); | ||
| 2154 | unregister_filesystem(&btrfs_fs_type); | 2155 | unregister_filesystem(&btrfs_fs_type); |
| 2155 | btrfs_exit_sysfs(); | 2156 | btrfs_exit_sysfs(); |
| 2156 | btrfs_cleanup_fs_uuids(); | 2157 | btrfs_cleanup_fs_uuids(); |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1475979e5718..286213cec861 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
| @@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 672 | * is this extent already allocated in the extent | 672 | * is this extent already allocated in the extent |
| 673 | * allocation tree? If so, just add a reference | 673 | * allocation tree? If so, just add a reference |
| 674 | */ | 674 | */ |
| 675 | ret = btrfs_lookup_extent(root, ins.objectid, | 675 | ret = btrfs_lookup_data_extent(root, ins.objectid, |
| 676 | ins.offset); | 676 | ins.offset); |
| 677 | if (ret == 0) { | 677 | if (ret == 0) { |
| 678 | ret = btrfs_inc_extent_ref(trans, root, | 678 | ret = btrfs_inc_extent_ref(trans, root, |
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 759fa4e2de8f..fb22fd8d8fb8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c | |||
| @@ -299,6 +299,8 @@ done: | |||
| 299 | zlib_inflateEnd(&workspace->strm); | 299 | zlib_inflateEnd(&workspace->strm); |
| 300 | if (data_in) | 300 | if (data_in) |
| 301 | kunmap(pages_in[page_in_index]); | 301 | kunmap(pages_in[page_in_index]); |
| 302 | if (!ret) | ||
| 303 | btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); | ||
| 302 | return ret; | 304 | return ret; |
| 303 | } | 305 | } |
| 304 | 306 | ||
| @@ -310,10 +312,14 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, | |||
| 310 | struct workspace *workspace = list_entry(ws, struct workspace, list); | 312 | struct workspace *workspace = list_entry(ws, struct workspace, list); |
| 311 | int ret = 0; | 313 | int ret = 0; |
| 312 | int wbits = MAX_WBITS; | 314 | int wbits = MAX_WBITS; |
| 313 | unsigned long bytes_left = destlen; | 315 | unsigned long bytes_left; |
| 314 | unsigned long total_out = 0; | 316 | unsigned long total_out = 0; |
| 317 | unsigned long pg_offset = 0; | ||
| 315 | char *kaddr; | 318 | char *kaddr; |
| 316 | 319 | ||
| 320 | destlen = min_t(unsigned long, destlen, PAGE_SIZE); | ||
| 321 | bytes_left = destlen; | ||
| 322 | |||
| 317 | workspace->strm.next_in = data_in; | 323 | workspace->strm.next_in = data_in; |
| 318 | workspace->strm.avail_in = srclen; | 324 | workspace->strm.avail_in = srclen; |
| 319 | workspace->strm.total_in = 0; | 325 | workspace->strm.total_in = 0; |
| @@ -341,7 +347,6 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, | |||
| 341 | unsigned long buf_start; | 347 | unsigned long buf_start; |
| 342 | unsigned long buf_offset; | 348 | unsigned long buf_offset; |
| 343 | unsigned long bytes; | 349 | unsigned long bytes; |
| 344 | unsigned long pg_offset = 0; | ||
| 345 | 350 | ||
| 346 | ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); | 351 | ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); |
| 347 | if (ret != Z_OK && ret != Z_STREAM_END) | 352 | if (ret != Z_OK && ret != Z_STREAM_END) |
| @@ -384,6 +389,17 @@ next: | |||
| 384 | ret = 0; | 389 | ret = 0; |
| 385 | 390 | ||
| 386 | zlib_inflateEnd(&workspace->strm); | 391 | zlib_inflateEnd(&workspace->strm); |
| 392 | |||
| 393 | /* | ||
| 394 | * this should only happen if zlib returned fewer bytes than we | ||
| 395 | * expected. btrfs_get_block is responsible for zeroing from the | ||
| 396 | * end of the inline extent (destlen) to the end of the page | ||
| 397 | */ | ||
| 398 | if (pg_offset < destlen) { | ||
| 399 | kaddr = kmap_atomic(dest_page); | ||
| 400 | memset(kaddr + pg_offset, 0, destlen - pg_offset); | ||
| 401 | kunmap_atomic(kaddr); | ||
| 402 | } | ||
| 387 | return ret; | 403 | return ret; |
| 388 | } | 404 | } |
| 389 | 405 | ||
diff --git a/fs/buffer.c b/fs/buffer.c index 9614adc7e754..20805db2c987 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -128,21 +128,15 @@ __clear_page_buffers(struct page *page) | |||
| 128 | page_cache_release(page); | 128 | page_cache_release(page); |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | 131 | static void buffer_io_error(struct buffer_head *bh, char *msg) | |
| 132 | static int quiet_error(struct buffer_head *bh) | ||
| 133 | { | ||
| 134 | if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) | ||
| 135 | return 0; | ||
| 136 | return 1; | ||
| 137 | } | ||
| 138 | |||
| 139 | |||
| 140 | static void buffer_io_error(struct buffer_head *bh) | ||
| 141 | { | 132 | { |
| 142 | char b[BDEVNAME_SIZE]; | 133 | char b[BDEVNAME_SIZE]; |
| 143 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", | 134 | |
| 135 | if (!test_bit(BH_Quiet, &bh->b_state)) | ||
| 136 | printk_ratelimited(KERN_ERR | ||
| 137 | "Buffer I/O error on dev %s, logical block %llu%s\n", | ||
| 144 | bdevname(bh->b_bdev, b), | 138 | bdevname(bh->b_bdev, b), |
| 145 | (unsigned long long)bh->b_blocknr); | 139 | (unsigned long long)bh->b_blocknr, msg); |
| 146 | } | 140 | } |
| 147 | 141 | ||
| 148 | /* | 142 | /* |
| @@ -177,17 +171,10 @@ EXPORT_SYMBOL(end_buffer_read_sync); | |||
| 177 | 171 | ||
| 178 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) | 172 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) |
| 179 | { | 173 | { |
| 180 | char b[BDEVNAME_SIZE]; | ||
| 181 | |||
| 182 | if (uptodate) { | 174 | if (uptodate) { |
| 183 | set_buffer_uptodate(bh); | 175 | set_buffer_uptodate(bh); |
| 184 | } else { | 176 | } else { |
| 185 | if (!quiet_error(bh)) { | 177 | buffer_io_error(bh, ", lost sync page write"); |
| 186 | buffer_io_error(bh); | ||
| 187 | printk(KERN_WARNING "lost page write due to " | ||
| 188 | "I/O error on %s\n", | ||
| 189 | bdevname(bh->b_bdev, b)); | ||
| 190 | } | ||
| 191 | set_buffer_write_io_error(bh); | 178 | set_buffer_write_io_error(bh); |
| 192 | clear_buffer_uptodate(bh); | 179 | clear_buffer_uptodate(bh); |
| 193 | } | 180 | } |
| @@ -304,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) | |||
| 304 | set_buffer_uptodate(bh); | 291 | set_buffer_uptodate(bh); |
| 305 | } else { | 292 | } else { |
| 306 | clear_buffer_uptodate(bh); | 293 | clear_buffer_uptodate(bh); |
| 307 | if (!quiet_error(bh)) | 294 | buffer_io_error(bh, ", async page read"); |
| 308 | buffer_io_error(bh); | ||
| 309 | SetPageError(page); | 295 | SetPageError(page); |
| 310 | } | 296 | } |
| 311 | 297 | ||
| @@ -353,7 +339,6 @@ still_busy: | |||
| 353 | */ | 339 | */ |
| 354 | void end_buffer_async_write(struct buffer_head *bh, int uptodate) | 340 | void end_buffer_async_write(struct buffer_head *bh, int uptodate) |
| 355 | { | 341 | { |
| 356 | char b[BDEVNAME_SIZE]; | ||
| 357 | unsigned long flags; | 342 | unsigned long flags; |
| 358 | struct buffer_head *first; | 343 | struct buffer_head *first; |
| 359 | struct buffer_head *tmp; | 344 | struct buffer_head *tmp; |
| @@ -365,12 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) | |||
| 365 | if (uptodate) { | 350 | if (uptodate) { |
| 366 | set_buffer_uptodate(bh); | 351 | set_buffer_uptodate(bh); |
| 367 | } else { | 352 | } else { |
| 368 | if (!quiet_error(bh)) { | 353 | buffer_io_error(bh, ", lost async page write"); |
| 369 | buffer_io_error(bh); | ||
| 370 | printk(KERN_WARNING "lost page write due to " | ||
| 371 | "I/O error on %s\n", | ||
| 372 | bdevname(bh->b_bdev, b)); | ||
| 373 | } | ||
| 374 | set_bit(AS_EIO, &page->mapping->flags); | 354 | set_bit(AS_EIO, &page->mapping->flags); |
| 375 | set_buffer_write_io_error(bh); | 355 | set_buffer_write_io_error(bh); |
| 376 | clear_buffer_uptodate(bh); | 356 | clear_buffer_uptodate(bh); |
| @@ -993,7 +973,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, | |||
| 993 | */ | 973 | */ |
| 994 | static int | 974 | static int |
| 995 | grow_dev_page(struct block_device *bdev, sector_t block, | 975 | grow_dev_page(struct block_device *bdev, sector_t block, |
| 996 | pgoff_t index, int size, int sizebits) | 976 | pgoff_t index, int size, int sizebits, gfp_t gfp) |
| 997 | { | 977 | { |
| 998 | struct inode *inode = bdev->bd_inode; | 978 | struct inode *inode = bdev->bd_inode; |
| 999 | struct page *page; | 979 | struct page *page; |
| @@ -1002,8 +982,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |||
| 1002 | int ret = 0; /* Will call free_more_memory() */ | 982 | int ret = 0; /* Will call free_more_memory() */ |
| 1003 | gfp_t gfp_mask; | 983 | gfp_t gfp_mask; |
| 1004 | 984 | ||
| 1005 | gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; | 985 | gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; |
| 1006 | gfp_mask |= __GFP_MOVABLE; | 986 | |
| 1007 | /* | 987 | /* |
| 1008 | * XXX: __getblk_slow() can not really deal with failure and | 988 | * XXX: __getblk_slow() can not really deal with failure and |
| 1009 | * will endlessly loop on improvised global reclaim. Prefer | 989 | * will endlessly loop on improvised global reclaim. Prefer |
| @@ -1060,7 +1040,7 @@ failed: | |||
| 1060 | * that page was dirty, the buffers are set dirty also. | 1040 | * that page was dirty, the buffers are set dirty also. |
| 1061 | */ | 1041 | */ |
| 1062 | static int | 1042 | static int |
| 1063 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 1043 | grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) |
| 1064 | { | 1044 | { |
| 1065 | pgoff_t index; | 1045 | pgoff_t index; |
| 1066 | int sizebits; | 1046 | int sizebits; |
| @@ -1087,11 +1067,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) | |||
| 1087 | } | 1067 | } |
| 1088 | 1068 | ||
| 1089 | /* Create a page with the proper size buffers.. */ | 1069 | /* Create a page with the proper size buffers.. */ |
| 1090 | return grow_dev_page(bdev, block, index, size, sizebits); | 1070 | return grow_dev_page(bdev, block, index, size, sizebits, gfp); |
| 1091 | } | 1071 | } |
| 1092 | 1072 | ||
| 1093 | static struct buffer_head * | 1073 | struct buffer_head * |
| 1094 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1074 | __getblk_slow(struct block_device *bdev, sector_t block, |
| 1075 | unsigned size, gfp_t gfp) | ||
| 1095 | { | 1076 | { |
| 1096 | /* Size must be multiple of hard sectorsize */ | 1077 | /* Size must be multiple of hard sectorsize */ |
| 1097 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || | 1078 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || |
| @@ -1113,13 +1094,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) | |||
| 1113 | if (bh) | 1094 | if (bh) |
| 1114 | return bh; | 1095 | return bh; |
| 1115 | 1096 | ||
| 1116 | ret = grow_buffers(bdev, block, size); | 1097 | ret = grow_buffers(bdev, block, size, gfp); |
| 1117 | if (ret < 0) | 1098 | if (ret < 0) |
| 1118 | return NULL; | 1099 | return NULL; |
| 1119 | if (ret == 0) | 1100 | if (ret == 0) |
| 1120 | free_more_memory(); | 1101 | free_more_memory(); |
| 1121 | } | 1102 | } |
| 1122 | } | 1103 | } |
| 1104 | EXPORT_SYMBOL(__getblk_slow); | ||
| 1123 | 1105 | ||
| 1124 | /* | 1106 | /* |
| 1125 | * The relationship between dirty buffers and dirty pages: | 1107 | * The relationship between dirty buffers and dirty pages: |
| @@ -1373,24 +1355,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) | |||
| 1373 | EXPORT_SYMBOL(__find_get_block); | 1355 | EXPORT_SYMBOL(__find_get_block); |
| 1374 | 1356 | ||
| 1375 | /* | 1357 | /* |
| 1376 | * __getblk will locate (and, if necessary, create) the buffer_head | 1358 | * __getblk_gfp() will locate (and, if necessary, create) the buffer_head |
| 1377 | * which corresponds to the passed block_device, block and size. The | 1359 | * which corresponds to the passed block_device, block and size. The |
| 1378 | * returned buffer has its reference count incremented. | 1360 | * returned buffer has its reference count incremented. |
| 1379 | * | 1361 | * |
| 1380 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 1362 | * __getblk_gfp() will lock up the machine if grow_dev_page's |
| 1381 | * attempt is failing. FIXME, perhaps? | 1363 | * try_to_free_buffers() attempt is failing. FIXME, perhaps? |
| 1382 | */ | 1364 | */ |
| 1383 | struct buffer_head * | 1365 | struct buffer_head * |
| 1384 | __getblk(struct block_device *bdev, sector_t block, unsigned size) | 1366 | __getblk_gfp(struct block_device *bdev, sector_t block, |
| 1367 | unsigned size, gfp_t gfp) | ||
| 1385 | { | 1368 | { |
| 1386 | struct buffer_head *bh = __find_get_block(bdev, block, size); | 1369 | struct buffer_head *bh = __find_get_block(bdev, block, size); |
| 1387 | 1370 | ||
| 1388 | might_sleep(); | 1371 | might_sleep(); |
| 1389 | if (bh == NULL) | 1372 | if (bh == NULL) |
| 1390 | bh = __getblk_slow(bdev, block, size); | 1373 | bh = __getblk_slow(bdev, block, size, gfp); |
| 1391 | return bh; | 1374 | return bh; |
| 1392 | } | 1375 | } |
| 1393 | EXPORT_SYMBOL(__getblk); | 1376 | EXPORT_SYMBOL(__getblk_gfp); |
| 1394 | 1377 | ||
| 1395 | /* | 1378 | /* |
| 1396 | * Do async read-ahead on a buffer.. | 1379 | * Do async read-ahead on a buffer.. |
| @@ -1406,24 +1389,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) | |||
| 1406 | EXPORT_SYMBOL(__breadahead); | 1389 | EXPORT_SYMBOL(__breadahead); |
| 1407 | 1390 | ||
| 1408 | /** | 1391 | /** |
| 1409 | * __bread() - reads a specified block and returns the bh | 1392 | * __bread_gfp() - reads a specified block and returns the bh |
| 1410 | * @bdev: the block_device to read from | 1393 | * @bdev: the block_device to read from |
| 1411 | * @block: number of block | 1394 | * @block: number of block |
| 1412 | * @size: size (in bytes) to read | 1395 | * @size: size (in bytes) to read |
| 1413 | * | 1396 | * @gfp: page allocation flag |
| 1397 | * | ||
| 1414 | * Reads a specified block, and returns buffer head that contains it. | 1398 | * Reads a specified block, and returns buffer head that contains it. |
| 1399 | * The page cache can be allocated from non-movable area | ||
| 1400 | * not to prevent page migration if you set gfp to zero. | ||
| 1415 | * It returns NULL if the block was unreadable. | 1401 | * It returns NULL if the block was unreadable. |
| 1416 | */ | 1402 | */ |
| 1417 | struct buffer_head * | 1403 | struct buffer_head * |
| 1418 | __bread(struct block_device *bdev, sector_t block, unsigned size) | 1404 | __bread_gfp(struct block_device *bdev, sector_t block, |
| 1405 | unsigned size, gfp_t gfp) | ||
| 1419 | { | 1406 | { |
| 1420 | struct buffer_head *bh = __getblk(bdev, block, size); | 1407 | struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); |
| 1421 | 1408 | ||
| 1422 | if (likely(bh) && !buffer_uptodate(bh)) | 1409 | if (likely(bh) && !buffer_uptodate(bh)) |
| 1423 | bh = __bread_slow(bh); | 1410 | bh = __bread_slow(bh); |
| 1424 | return bh; | 1411 | return bh; |
| 1425 | } | 1412 | } |
| 1426 | EXPORT_SYMBOL(__bread); | 1413 | EXPORT_SYMBOL(__bread_gfp); |
| 1427 | 1414 | ||
| 1428 | /* | 1415 | /* |
| 1429 | * invalidate_bh_lrus() is called rarely - but not only at unmount. | 1416 | * invalidate_bh_lrus() is called rarely - but not only at unmount. |
| @@ -2082,6 +2069,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
| 2082 | struct page *page, void *fsdata) | 2069 | struct page *page, void *fsdata) |
| 2083 | { | 2070 | { |
| 2084 | struct inode *inode = mapping->host; | 2071 | struct inode *inode = mapping->host; |
| 2072 | loff_t old_size = inode->i_size; | ||
| 2085 | int i_size_changed = 0; | 2073 | int i_size_changed = 0; |
| 2086 | 2074 | ||
| 2087 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 2075 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
| @@ -2101,6 +2089,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
| 2101 | unlock_page(page); | 2089 | unlock_page(page); |
| 2102 | page_cache_release(page); | 2090 | page_cache_release(page); |
| 2103 | 2091 | ||
| 2092 | if (old_size < pos) | ||
| 2093 | pagecache_isize_extended(inode, old_size, pos); | ||
| 2104 | /* | 2094 | /* |
| 2105 | * Don't mark the inode dirty under page lock. First, it unnecessarily | 2095 | * Don't mark the inode dirty under page lock. First, it unnecessarily |
| 2106 | * makes the holding time of page lock longer. Second, it forces lock | 2096 | * makes the holding time of page lock longer. Second, it forces lock |
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 659f2ea9e6f7..cefca661464b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
| @@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
| 2638 | 2638 | ||
| 2639 | for (i = 0; i < CEPH_CAP_BITS; i++) | 2639 | for (i = 0; i < CEPH_CAP_BITS; i++) |
| 2640 | if ((dirty & (1 << i)) && | 2640 | if ((dirty & (1 << i)) && |
| 2641 | flush_tid == ci->i_cap_flush_tid[i]) | 2641 | (u16)flush_tid == ci->i_cap_flush_tid[i]) |
| 2642 | cleaned |= 1 << i; | 2642 | cleaned |= 1 << i; |
| 2643 | 2643 | ||
| 2644 | dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," | 2644 | dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," |
diff --git a/fs/dcache.c b/fs/dcache.c index d5a23fd0da90..5bc72b07fde2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
| @@ -778,6 +778,7 @@ restart: | |||
| 778 | struct dentry *parent = lock_parent(dentry); | 778 | struct dentry *parent = lock_parent(dentry); |
| 779 | if (likely(!dentry->d_lockref.count)) { | 779 | if (likely(!dentry->d_lockref.count)) { |
| 780 | __dentry_kill(dentry); | 780 | __dentry_kill(dentry); |
| 781 | dput(parent); | ||
| 781 | goto restart; | 782 | goto restart; |
| 782 | } | 783 | } |
| 783 | if (parent) | 784 | if (parent) |
| @@ -2673,11 +2674,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) | |||
| 2673 | if (!IS_ROOT(new)) { | 2674 | if (!IS_ROOT(new)) { |
| 2674 | spin_unlock(&inode->i_lock); | 2675 | spin_unlock(&inode->i_lock); |
| 2675 | dput(new); | 2676 | dput(new); |
| 2677 | iput(inode); | ||
| 2676 | return ERR_PTR(-EIO); | 2678 | return ERR_PTR(-EIO); |
| 2677 | } | 2679 | } |
| 2678 | if (d_ancestor(new, dentry)) { | 2680 | if (d_ancestor(new, dentry)) { |
| 2679 | spin_unlock(&inode->i_lock); | 2681 | spin_unlock(&inode->i_lock); |
| 2680 | dput(new); | 2682 | dput(new); |
| 2683 | iput(inode); | ||
| 2681 | return ERR_PTR(-EIO); | 2684 | return ERR_PTR(-EIO); |
| 2682 | } | 2685 | } |
| 2683 | write_seqlock(&rename_lock); | 2686 | write_seqlock(&rename_lock); |
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1b119d3bf924..c4cd1fd86cc2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c | |||
| @@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags | |||
| 566 | s->s_maxbytes = path.dentry->d_sb->s_maxbytes; | 566 | s->s_maxbytes = path.dentry->d_sb->s_maxbytes; |
| 567 | s->s_blocksize = path.dentry->d_sb->s_blocksize; | 567 | s->s_blocksize = path.dentry->d_sb->s_blocksize; |
| 568 | s->s_magic = ECRYPTFS_SUPER_MAGIC; | 568 | s->s_magic = ECRYPTFS_SUPER_MAGIC; |
| 569 | s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; | ||
| 570 | |||
| 571 | rc = -EINVAL; | ||
| 572 | if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { | ||
| 573 | pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); | ||
| 574 | goto out_free; | ||
| 575 | } | ||
| 569 | 576 | ||
| 570 | inode = ecryptfs_get_inode(path.dentry->d_inode, s); | 577 | inode = ecryptfs_get_inode(path.dentry->d_inode, s); |
| 571 | rc = PTR_ERR(inode); | 578 | rc = PTR_ERR(inode); |
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 389ba8312d5d..b47c7b8dc275 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | # Copyright (C) 2008 Panasas Inc. All rights reserved. | 4 | # Copyright (C) 2008 Panasas Inc. All rights reserved. |
| 5 | # | 5 | # |
| 6 | # Authors: | 6 | # Authors: |
| 7 | # Boaz Harrosh <bharrosh@panasas.com> | 7 | # Boaz Harrosh <ooo@electrozaur.com> |
| 8 | # | 8 | # |
| 9 | # This program is free software; you can redistribute it and/or modify | 9 | # This program is free software; you can redistribute it and/or modify |
| 10 | # it under the terms of the GNU General Public License version 2 | 10 | # it under the terms of the GNU General Public License version 2 |
diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 3bbd46956d77..7d88ef566213 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | * Copyright (C) 2005, 2006 | 4 | * Copyright (C) 2005, 2006 |
| 5 | * Avishay Traeger (avishay@gmail.com) | 5 | * Avishay Traeger (avishay@gmail.com) |
| 6 | * Copyright (C) 2008, 2009 | 6 | * Copyright (C) 2008, 2009 |
| 7 | * Boaz Harrosh <bharrosh@panasas.com> | 7 | * Boaz Harrosh <ooo@electrozaur.com> |
| 8 | * | 8 | * |
| 9 | * Copyrights for code taken from ext2: | 9 | * Copyrights for code taken from ext2: |
| 10 | * Copyright (C) 1992, 1993, 1994, 1995 | 10 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 49f51ab4caac..d7defd557601 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index fffe86fd7a42..ad9cac670a47 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 71bf8e4fb5d4..1a376b42d305 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3f9cafd73931..f1d3d4eb8c4f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4731fd991efe..28907460e8fa 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index cfc0205d62c4..7bd8ac8dfb28 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * This file is part of exofs. | 7 | * This file is part of exofs. |
| 8 | * | 8 | * |
| @@ -29,7 +29,7 @@ | |||
| 29 | 29 | ||
| 30 | #include "ore_raid.h" | 30 | #include "ore_raid.h" |
| 31 | 31 | ||
| 32 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); | 32 | MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); |
| 33 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | 33 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); |
| 34 | MODULE_LICENSE("GPL"); | 34 | MODULE_LICENSE("GPL"); |
| 35 | 35 | ||
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 84529b8a331b..27cbdb697649 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) 2011 | 2 | * Copyright (C) 2011 |
| 3 | * Boaz Harrosh <bharrosh@panasas.com> | 3 | * Boaz Harrosh <ooo@electrozaur.com> |
| 4 | * | 4 | * |
| 5 | * This file is part of the objects raid engine (ore). | 5 | * This file is part of the objects raid engine (ore). |
| 6 | * | 6 | * |
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index cf6375d82129..a6e746775570 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) from 2011 | 2 | * Copyright (C) from 2011 |
| 3 | * Boaz Harrosh <bharrosh@panasas.com> | 3 | * Boaz Harrosh <ooo@electrozaur.com> |
| 4 | * | 4 | * |
| 5 | * This file is part of the objects raid engine (ore). | 5 | * This file is part of the objects raid engine (ore). |
| 6 | * | 6 | * |
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index ed73ed8ebbee..95965503afcb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 4dd687c3e747..832e2624b80b 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | * Copyright (C) 2005, 2006 | 2 | * Copyright (C) 2005, 2006 |
| 3 | * Avishay Traeger (avishay@gmail.com) | 3 | * Avishay Traeger (avishay@gmail.com) |
| 4 | * Copyright (C) 2008, 2009 | 4 | * Copyright (C) 2008, 2009 |
| 5 | * Boaz Harrosh <bharrosh@panasas.com> | 5 | * Boaz Harrosh <ooo@electrozaur.com> |
| 6 | * | 6 | * |
| 7 | * Copyrights for code taken from ext2: | 7 | * Copyrights for code taken from ext2: |
| 8 | * Copyright (C) 1992, 1993, 1994, 1995 | 8 | * Copyright (C) 1992, 1993, 1994, 1995 |
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 1b4f2f95fc37..5e6a2c0a1f0b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) 2012 | 2 | * Copyright (C) 2012 |
| 3 | * Sachin Bhamare <sbhamare@panasas.com> | 3 | * Sachin Bhamare <sbhamare@panasas.com> |
| 4 | * Boaz Harrosh <bharrosh@panasas.com> | 4 | * Boaz Harrosh <ooo@electrozaur.com> |
| 5 | * | 5 | * |
| 6 | * This file is part of exofs. | 6 | * This file is part of exofs. |
| 7 | * | 7 | * |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7015db0bafd1..eb742d0e67ff 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -1354,13 +1354,6 @@ set_qf_format: | |||
| 1354 | "not specified."); | 1354 | "not specified."); |
| 1355 | return 0; | 1355 | return 0; |
| 1356 | } | 1356 | } |
| 1357 | } else { | ||
| 1358 | if (sbi->s_jquota_fmt) { | ||
| 1359 | ext3_msg(sb, KERN_ERR, "error: journaled quota format " | ||
| 1360 | "specified with no journaling " | ||
| 1361 | "enabled."); | ||
| 1362 | return 0; | ||
| 1363 | } | ||
| 1364 | } | 1357 | } |
| 1365 | #endif | 1358 | #endif |
| 1366 | return 1; | 1359 | return 1; |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 581ef40fbe90..83a6f497c4e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
| @@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb, | |||
| 176 | } | 176 | } |
| 177 | 177 | ||
| 178 | /* Initializes an uninitialized block bitmap */ | 178 | /* Initializes an uninitialized block bitmap */ |
| 179 | static void ext4_init_block_bitmap(struct super_block *sb, | 179 | static int ext4_init_block_bitmap(struct super_block *sb, |
| 180 | struct buffer_head *bh, | 180 | struct buffer_head *bh, |
| 181 | ext4_group_t block_group, | 181 | ext4_group_t block_group, |
| 182 | struct ext4_group_desc *gdp) | 182 | struct ext4_group_desc *gdp) |
| @@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb, | |||
| 192 | /* If checksum is bad mark all blocks used to prevent allocation | 192 | /* If checksum is bad mark all blocks used to prevent allocation |
| 193 | * essentially implementing a per-group read-only flag. */ | 193 | * essentially implementing a per-group read-only flag. */ |
| 194 | if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { | 194 | if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { |
| 195 | ext4_error(sb, "Checksum bad for group %u", block_group); | ||
| 196 | grp = ext4_get_group_info(sb, block_group); | 195 | grp = ext4_get_group_info(sb, block_group); |
| 197 | if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) | 196 | if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) |
| 198 | percpu_counter_sub(&sbi->s_freeclusters_counter, | 197 | percpu_counter_sub(&sbi->s_freeclusters_counter, |
| @@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, | |||
| 205 | count); | 204 | count); |
| 206 | } | 205 | } |
| 207 | set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); | 206 | set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); |
| 208 | return; | 207 | return -EIO; |
| 209 | } | 208 | } |
| 210 | memset(bh->b_data, 0, sb->s_blocksize); | 209 | memset(bh->b_data, 0, sb->s_blocksize); |
| 211 | 210 | ||
| @@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, | |||
| 243 | sb->s_blocksize * 8, bh->b_data); | 242 | sb->s_blocksize * 8, bh->b_data); |
| 244 | ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); | 243 | ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); |
| 245 | ext4_group_desc_csum_set(sb, block_group, gdp); | 244 | ext4_group_desc_csum_set(sb, block_group, gdp); |
| 245 | return 0; | ||
| 246 | } | 246 | } |
| 247 | 247 | ||
| 248 | /* Return the number of free blocks in a block group. It is used when | 248 | /* Return the number of free blocks in a block group. It is used when |
| @@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) | |||
| 438 | } | 438 | } |
| 439 | ext4_lock_group(sb, block_group); | 439 | ext4_lock_group(sb, block_group); |
| 440 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 440 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 441 | ext4_init_block_bitmap(sb, bh, block_group, desc); | 441 | int err; |
| 442 | |||
| 443 | err = ext4_init_block_bitmap(sb, bh, block_group, desc); | ||
| 442 | set_bitmap_uptodate(bh); | 444 | set_bitmap_uptodate(bh); |
| 443 | set_buffer_uptodate(bh); | 445 | set_buffer_uptodate(bh); |
| 444 | ext4_unlock_group(sb, block_group); | 446 | ext4_unlock_group(sb, block_group); |
| 445 | unlock_buffer(bh); | 447 | unlock_buffer(bh); |
| 448 | if (err) | ||
| 449 | ext4_error(sb, "Checksum bad for grp %u", block_group); | ||
| 446 | return bh; | 450 | return bh; |
| 447 | } | 451 | } |
| 448 | ext4_unlock_group(sb, block_group); | 452 | ext4_unlock_group(sb, block_group); |
| @@ -636,8 +640,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | |||
| 636 | * Account for the allocated meta blocks. We will never | 640 | * Account for the allocated meta blocks. We will never |
| 637 | * fail EDQUOT for metdata, but we do account for it. | 641 | * fail EDQUOT for metdata, but we do account for it. |
| 638 | */ | 642 | */ |
| 639 | if (!(*errp) && | 643 | if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { |
| 640 | ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { | ||
| 641 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 644 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
| 642 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 645 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
| 643 | dquot_alloc_block_nofail(inode, | 646 | dquot_alloc_block_nofail(inode, |
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 3285aa5a706a..b610779a958c 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c | |||
| @@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, | |||
| 24 | __u32 provided, calculated; | 24 | __u32 provided, calculated; |
| 25 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 25 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 26 | 26 | ||
| 27 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 27 | if (!ext4_has_metadata_csum(sb)) |
| 28 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 29 | return 1; | 28 | return 1; |
| 30 | 29 | ||
| 31 | provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); | 30 | provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); |
| @@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, | |||
| 46 | __u32 csum; | 45 | __u32 csum; |
| 47 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 46 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 48 | 47 | ||
| 49 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 48 | if (!ext4_has_metadata_csum(sb)) |
| 50 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 51 | return; | 49 | return; |
| 52 | 50 | ||
| 53 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); | 51 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); |
| @@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, | |||
| 65 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 63 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 66 | int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; | 64 | int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; |
| 67 | 65 | ||
| 68 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 66 | if (!ext4_has_metadata_csum(sb)) |
| 69 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 70 | return 1; | 67 | return 1; |
| 71 | 68 | ||
| 72 | provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); | 69 | provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); |
| @@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, | |||
| 91 | __u32 csum; | 88 | __u32 csum; |
| 92 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 89 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 93 | 90 | ||
| 94 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 91 | if (!ext4_has_metadata_csum(sb)) |
| 95 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 96 | return; | 92 | return; |
| 97 | 93 | ||
| 98 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); | 94 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); |
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 0bb3f9ea0832..c24143ea9c08 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
| @@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) | |||
| 151 | &file->f_ra, file, | 151 | &file->f_ra, file, |
| 152 | index, 1); | 152 | index, 1); |
| 153 | file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; | 153 | file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; |
| 154 | bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); | 154 | bh = ext4_bread(NULL, inode, map.m_lblk, 0); |
| 155 | if (IS_ERR(bh)) | ||
| 156 | return PTR_ERR(bh); | ||
| 155 | } | 157 | } |
| 156 | 158 | ||
| 157 | /* | ||
| 158 | * We ignore I/O errors on directories so users have a chance | ||
| 159 | * of recovering data when there's a bad sector | ||
| 160 | */ | ||
| 161 | if (!bh) { | 159 | if (!bh) { |
| 162 | if (!dir_has_error) { | 160 | if (!dir_has_error) { |
| 163 | EXT4_ERROR_FILE(file, 0, | 161 | EXT4_ERROR_FILE(file, 0, |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0c225cdb52c..c55a1faaed58 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -572,15 +572,15 @@ enum { | |||
| 572 | 572 | ||
| 573 | /* | 573 | /* |
| 574 | * The bit position of these flags must not overlap with any of the | 574 | * The bit position of these flags must not overlap with any of the |
| 575 | * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), | 575 | * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), |
| 576 | * read_extent_tree_block(), ext4_split_extent_at(), | 576 | * read_extent_tree_block(), ext4_split_extent_at(), |
| 577 | * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). | 577 | * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). |
| 578 | * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be | 578 | * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be |
| 579 | * caching the extents when reading from the extent tree while a | 579 | * caching the extents when reading from the extent tree while a |
| 580 | * truncate or punch hole operation is in progress. | 580 | * truncate or punch hole operation is in progress. |
| 581 | */ | 581 | */ |
| 582 | #define EXT4_EX_NOCACHE 0x0400 | 582 | #define EXT4_EX_NOCACHE 0x40000000 |
| 583 | #define EXT4_EX_FORCE_CACHE 0x0800 | 583 | #define EXT4_EX_FORCE_CACHE 0x20000000 |
| 584 | 584 | ||
| 585 | /* | 585 | /* |
| 586 | * Flags used by ext4_free_blocks | 586 | * Flags used by ext4_free_blocks |
| @@ -890,6 +890,7 @@ struct ext4_inode_info { | |||
| 890 | struct ext4_es_tree i_es_tree; | 890 | struct ext4_es_tree i_es_tree; |
| 891 | rwlock_t i_es_lock; | 891 | rwlock_t i_es_lock; |
| 892 | struct list_head i_es_lru; | 892 | struct list_head i_es_lru; |
| 893 | unsigned int i_es_all_nr; /* protected by i_es_lock */ | ||
| 893 | unsigned int i_es_lru_nr; /* protected by i_es_lock */ | 894 | unsigned int i_es_lru_nr; /* protected by i_es_lock */ |
| 894 | unsigned long i_touch_when; /* jiffies of last accessing */ | 895 | unsigned long i_touch_when; /* jiffies of last accessing */ |
| 895 | 896 | ||
| @@ -1174,6 +1175,9 @@ struct ext4_super_block { | |||
| 1174 | #define EXT4_MF_MNTDIR_SAMPLED 0x0001 | 1175 | #define EXT4_MF_MNTDIR_SAMPLED 0x0001 |
| 1175 | #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ | 1176 | #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ |
| 1176 | 1177 | ||
| 1178 | /* Number of quota types we support */ | ||
| 1179 | #define EXT4_MAXQUOTAS 2 | ||
| 1180 | |||
| 1177 | /* | 1181 | /* |
| 1178 | * fourth extended-fs super-block data in memory | 1182 | * fourth extended-fs super-block data in memory |
| 1179 | */ | 1183 | */ |
| @@ -1237,7 +1241,7 @@ struct ext4_sb_info { | |||
| 1237 | u32 s_min_batch_time; | 1241 | u32 s_min_batch_time; |
| 1238 | struct block_device *journal_bdev; | 1242 | struct block_device *journal_bdev; |
| 1239 | #ifdef CONFIG_QUOTA | 1243 | #ifdef CONFIG_QUOTA |
| 1240 | char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ | 1244 | char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ |
| 1241 | int s_jquota_fmt; /* Format of quota to use */ | 1245 | int s_jquota_fmt; /* Format of quota to use */ |
| 1242 | #endif | 1246 | #endif |
| 1243 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ | 1247 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ |
| @@ -1330,8 +1334,7 @@ struct ext4_sb_info { | |||
| 1330 | /* Reclaim extents from extent status tree */ | 1334 | /* Reclaim extents from extent status tree */ |
| 1331 | struct shrinker s_es_shrinker; | 1335 | struct shrinker s_es_shrinker; |
| 1332 | struct list_head s_es_lru; | 1336 | struct list_head s_es_lru; |
| 1333 | unsigned long s_es_last_sorted; | 1337 | struct ext4_es_stats s_es_stats; |
| 1334 | struct percpu_counter s_extent_cache_cnt; | ||
| 1335 | struct mb_cache *s_mb_cache; | 1338 | struct mb_cache *s_mb_cache; |
| 1336 | spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; | 1339 | spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; |
| 1337 | 1340 | ||
| @@ -1399,7 +1402,6 @@ enum { | |||
| 1399 | EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ | 1402 | EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ |
| 1400 | EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ | 1403 | EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ |
| 1401 | EXT4_STATE_NEWENTRY, /* File just added to dir */ | 1404 | EXT4_STATE_NEWENTRY, /* File just added to dir */ |
| 1402 | EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ | ||
| 1403 | EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read | 1405 | EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read |
| 1404 | nolocking */ | 1406 | nolocking */ |
| 1405 | EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ | 1407 | EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ |
| @@ -2086,10 +2088,8 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, | |||
| 2086 | extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); | 2088 | extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); |
| 2087 | 2089 | ||
| 2088 | /* inode.c */ | 2090 | /* inode.c */ |
| 2089 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, | 2091 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); |
| 2090 | ext4_lblk_t, int, int *); | 2092 | struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); |
| 2091 | struct buffer_head *ext4_bread(handle_t *, struct inode *, | ||
| 2092 | ext4_lblk_t, int, int *); | ||
| 2093 | int ext4_get_block_write(struct inode *inode, sector_t iblock, | 2093 | int ext4_get_block_write(struct inode *inode, sector_t iblock, |
| 2094 | struct buffer_head *bh_result, int create); | 2094 | struct buffer_head *bh_result, int create); |
| 2095 | int ext4_get_block(struct inode *inode, sector_t iblock, | 2095 | int ext4_get_block(struct inode *inode, sector_t iblock, |
| @@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle, | |||
| 2109 | #define CONVERT_INLINE_DATA 2 | 2109 | #define CONVERT_INLINE_DATA 2 |
| 2110 | 2110 | ||
| 2111 | extern struct inode *ext4_iget(struct super_block *, unsigned long); | 2111 | extern struct inode *ext4_iget(struct super_block *, unsigned long); |
| 2112 | extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); | ||
| 2112 | extern int ext4_write_inode(struct inode *, struct writeback_control *); | 2113 | extern int ext4_write_inode(struct inode *, struct writeback_control *); |
| 2113 | extern int ext4_setattr(struct dentry *, struct iattr *); | 2114 | extern int ext4_setattr(struct dentry *, struct iattr *); |
| 2114 | extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | 2115 | extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, |
| @@ -2332,10 +2333,18 @@ extern int ext4_register_li_request(struct super_block *sb, | |||
| 2332 | static inline int ext4_has_group_desc_csum(struct super_block *sb) | 2333 | static inline int ext4_has_group_desc_csum(struct super_block *sb) |
| 2333 | { | 2334 | { |
| 2334 | return EXT4_HAS_RO_COMPAT_FEATURE(sb, | 2335 | return EXT4_HAS_RO_COMPAT_FEATURE(sb, |
| 2335 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM | | 2336 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || |
| 2336 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); | 2337 | (EXT4_SB(sb)->s_chksum_driver != NULL); |
| 2337 | } | 2338 | } |
| 2338 | 2339 | ||
| 2340 | static inline int ext4_has_metadata_csum(struct super_block *sb) | ||
| 2341 | { | ||
| 2342 | WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
| 2343 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && | ||
| 2344 | !EXT4_SB(sb)->s_chksum_driver); | ||
| 2345 | |||
| 2346 | return (EXT4_SB(sb)->s_chksum_driver != NULL); | ||
| 2347 | } | ||
| 2339 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) | 2348 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) |
| 2340 | { | 2349 | { |
| 2341 | return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | | 2350 | return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | |
| @@ -2731,21 +2740,26 @@ extern int ext4_can_extents_be_merged(struct inode *inode, | |||
| 2731 | struct ext4_extent *ex1, | 2740 | struct ext4_extent *ex1, |
| 2732 | struct ext4_extent *ex2); | 2741 | struct ext4_extent *ex2); |
| 2733 | extern int ext4_ext_insert_extent(handle_t *, struct inode *, | 2742 | extern int ext4_ext_insert_extent(handle_t *, struct inode *, |
| 2734 | struct ext4_ext_path *, | 2743 | struct ext4_ext_path **, |
| 2735 | struct ext4_extent *, int); | 2744 | struct ext4_extent *, int); |
| 2736 | extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, | 2745 | extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, |
| 2737 | struct ext4_ext_path *, | 2746 | struct ext4_ext_path **, |
| 2738 | int flags); | 2747 | int flags); |
| 2739 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); | 2748 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); |
| 2740 | extern int ext4_ext_check_inode(struct inode *inode); | 2749 | extern int ext4_ext_check_inode(struct inode *inode); |
| 2741 | extern int ext4_find_delalloc_range(struct inode *inode, | 2750 | extern int ext4_find_delalloc_range(struct inode *inode, |
| 2742 | ext4_lblk_t lblk_start, | 2751 | ext4_lblk_t lblk_start, |
| 2743 | ext4_lblk_t lblk_end); | 2752 | ext4_lblk_t lblk_end); |
| 2744 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); | 2753 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); |
| 2754 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); | ||
| 2745 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 2755 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
| 2746 | __u64 start, __u64 len); | 2756 | __u64 start, __u64 len); |
| 2747 | extern int ext4_ext_precache(struct inode *inode); | 2757 | extern int ext4_ext_precache(struct inode *inode); |
| 2748 | extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); | 2758 | extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); |
| 2759 | extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, | ||
| 2760 | struct inode *inode2, ext4_lblk_t lblk1, | ||
| 2761 | ext4_lblk_t lblk2, ext4_lblk_t count, | ||
| 2762 | int mark_unwritten,int *err); | ||
| 2749 | 2763 | ||
| 2750 | /* move_extent.c */ | 2764 | /* move_extent.c */ |
| 2751 | extern void ext4_double_down_write_data_sem(struct inode *first, | 2765 | extern void ext4_double_down_write_data_sem(struct inode *first, |
| @@ -2755,8 +2769,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, | |||
| 2755 | extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, | 2769 | extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, |
| 2756 | __u64 start_orig, __u64 start_donor, | 2770 | __u64 start_orig, __u64 start_donor, |
| 2757 | __u64 len, __u64 *moved_len); | 2771 | __u64 len, __u64 *moved_len); |
| 2758 | extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | ||
| 2759 | struct ext4_extent **extent); | ||
| 2760 | 2772 | ||
| 2761 | /* page-io.c */ | 2773 | /* page-io.c */ |
| 2762 | extern int __init ext4_init_pageio(void); | 2774 | extern int __init ext4_init_pageio(void); |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a867f5ca9991..3c9381547094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
| @@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh) | |||
| 123 | struct ext4_ext_path { | 123 | struct ext4_ext_path { |
| 124 | ext4_fsblk_t p_block; | 124 | ext4_fsblk_t p_block; |
| 125 | __u16 p_depth; | 125 | __u16 p_depth; |
| 126 | __u16 p_maxdepth; | ||
| 126 | struct ext4_extent *p_ext; | 127 | struct ext4_extent *p_ext; |
| 127 | struct ext4_extent_idx *p_idx; | 128 | struct ext4_extent_idx *p_idx; |
| 128 | struct ext4_extent_header *p_hdr; | 129 | struct ext4_extent_header *p_hdr; |
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0074e0d23d6e..3445035c7e01 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c | |||
| @@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, | |||
| 256 | set_buffer_prio(bh); | 256 | set_buffer_prio(bh); |
| 257 | if (ext4_handle_valid(handle)) { | 257 | if (ext4_handle_valid(handle)) { |
| 258 | err = jbd2_journal_dirty_metadata(handle, bh); | 258 | err = jbd2_journal_dirty_metadata(handle, bh); |
| 259 | /* Errors can only happen if there is a bug */ | 259 | /* Errors can only happen due to aborted journal or a nasty bug */ |
| 260 | if (WARN_ON_ONCE(err)) { | 260 | if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { |
| 261 | ext4_journal_abort_handle(where, line, __func__, bh, | 261 | ext4_journal_abort_handle(where, line, __func__, bh, |
| 262 | handle, err); | 262 | handle, err); |
| 263 | if (inode == NULL) { | 263 | if (inode == NULL) { |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 17c00ff202f2..9c5b49fb281e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
| @@ -102,9 +102,9 @@ | |||
| 102 | #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 | 102 | #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 |
| 103 | #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 | 103 | #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 |
| 104 | #endif | 104 | #endif |
| 105 | #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) | 105 | #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) |
| 106 | #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) | 106 | #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) |
| 107 | #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) | 107 | #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) |
| 108 | 108 | ||
| 109 | static inline int ext4_jbd2_credits_xattr(struct inode *inode) | 109 | static inline int ext4_jbd2_credits_xattr(struct inode *inode) |
| 110 | { | 110 | { |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74292a71b384..0b16fb4c06d3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, | |||
| 73 | { | 73 | { |
| 74 | struct ext4_extent_tail *et; | 74 | struct ext4_extent_tail *et; |
| 75 | 75 | ||
| 76 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 76 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 77 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 78 | return 1; | 77 | return 1; |
| 79 | 78 | ||
| 80 | et = find_ext4_extent_tail(eh); | 79 | et = find_ext4_extent_tail(eh); |
| @@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, | |||
| 88 | { | 87 | { |
| 89 | struct ext4_extent_tail *et; | 88 | struct ext4_extent_tail *et; |
| 90 | 89 | ||
| 91 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 90 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 92 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 93 | return; | 91 | return; |
| 94 | 92 | ||
| 95 | et = find_ext4_extent_tail(eh); | 93 | et = find_ext4_extent_tail(eh); |
| @@ -98,14 +96,14 @@ static void ext4_extent_block_csum_set(struct inode *inode, | |||
| 98 | 96 | ||
| 99 | static int ext4_split_extent(handle_t *handle, | 97 | static int ext4_split_extent(handle_t *handle, |
| 100 | struct inode *inode, | 98 | struct inode *inode, |
| 101 | struct ext4_ext_path *path, | 99 | struct ext4_ext_path **ppath, |
| 102 | struct ext4_map_blocks *map, | 100 | struct ext4_map_blocks *map, |
| 103 | int split_flag, | 101 | int split_flag, |
| 104 | int flags); | 102 | int flags); |
| 105 | 103 | ||
| 106 | static int ext4_split_extent_at(handle_t *handle, | 104 | static int ext4_split_extent_at(handle_t *handle, |
| 107 | struct inode *inode, | 105 | struct inode *inode, |
| 108 | struct ext4_ext_path *path, | 106 | struct ext4_ext_path **ppath, |
| 109 | ext4_lblk_t split, | 107 | ext4_lblk_t split, |
| 110 | int split_flag, | 108 | int split_flag, |
| 111 | int flags); | 109 | int flags); |
| @@ -291,6 +289,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) | |||
| 291 | return size; | 289 | return size; |
| 292 | } | 290 | } |
| 293 | 291 | ||
| 292 | static inline int | ||
| 293 | ext4_force_split_extent_at(handle_t *handle, struct inode *inode, | ||
| 294 | struct ext4_ext_path **ppath, ext4_lblk_t lblk, | ||
| 295 | int nofail) | ||
| 296 | { | ||
| 297 | struct ext4_ext_path *path = *ppath; | ||
| 298 | int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); | ||
| 299 | |||
| 300 | return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? | ||
| 301 | EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, | ||
| 302 | EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | | ||
| 303 | (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); | ||
| 304 | } | ||
| 305 | |||
| 294 | /* | 306 | /* |
| 295 | * Calculate the number of metadata blocks needed | 307 | * Calculate the number of metadata blocks needed |
| 296 | * to allocate @blocks | 308 | * to allocate @blocks |
| @@ -695,9 +707,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, | |||
| 695 | 707 | ||
| 696 | void ext4_ext_drop_refs(struct ext4_ext_path *path) | 708 | void ext4_ext_drop_refs(struct ext4_ext_path *path) |
| 697 | { | 709 | { |
| 698 | int depth = path->p_depth; | 710 | int depth, i; |
| 699 | int i; | ||
| 700 | 711 | ||
| 712 | if (!path) | ||
| 713 | return; | ||
| 714 | depth = path->p_depth; | ||
| 701 | for (i = 0; i <= depth; i++, path++) | 715 | for (i = 0; i <= depth; i++, path++) |
| 702 | if (path->p_bh) { | 716 | if (path->p_bh) { |
| 703 | brelse(path->p_bh); | 717 | brelse(path->p_bh); |
| @@ -841,24 +855,32 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) | |||
| 841 | } | 855 | } |
| 842 | 856 | ||
| 843 | struct ext4_ext_path * | 857 | struct ext4_ext_path * |
| 844 | ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | 858 | ext4_find_extent(struct inode *inode, ext4_lblk_t block, |
| 845 | struct ext4_ext_path *path, int flags) | 859 | struct ext4_ext_path **orig_path, int flags) |
| 846 | { | 860 | { |
| 847 | struct ext4_extent_header *eh; | 861 | struct ext4_extent_header *eh; |
| 848 | struct buffer_head *bh; | 862 | struct buffer_head *bh; |
| 849 | short int depth, i, ppos = 0, alloc = 0; | 863 | struct ext4_ext_path *path = orig_path ? *orig_path : NULL; |
| 864 | short int depth, i, ppos = 0; | ||
| 850 | int ret; | 865 | int ret; |
| 851 | 866 | ||
| 852 | eh = ext_inode_hdr(inode); | 867 | eh = ext_inode_hdr(inode); |
| 853 | depth = ext_depth(inode); | 868 | depth = ext_depth(inode); |
| 854 | 869 | ||
| 855 | /* account possible depth increase */ | 870 | if (path) { |
| 871 | ext4_ext_drop_refs(path); | ||
| 872 | if (depth > path[0].p_maxdepth) { | ||
| 873 | kfree(path); | ||
| 874 | *orig_path = path = NULL; | ||
| 875 | } | ||
| 876 | } | ||
| 856 | if (!path) { | 877 | if (!path) { |
| 878 | /* account possible depth increase */ | ||
| 857 | path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), | 879 | path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), |
| 858 | GFP_NOFS); | 880 | GFP_NOFS); |
| 859 | if (!path) | 881 | if (unlikely(!path)) |
| 860 | return ERR_PTR(-ENOMEM); | 882 | return ERR_PTR(-ENOMEM); |
| 861 | alloc = 1; | 883 | path[0].p_maxdepth = depth + 1; |
| 862 | } | 884 | } |
| 863 | path[0].p_hdr = eh; | 885 | path[0].p_hdr = eh; |
| 864 | path[0].p_bh = NULL; | 886 | path[0].p_bh = NULL; |
| @@ -876,7 +898,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |||
| 876 | 898 | ||
| 877 | bh = read_extent_tree_block(inode, path[ppos].p_block, --i, | 899 | bh = read_extent_tree_block(inode, path[ppos].p_block, --i, |
| 878 | flags); | 900 | flags); |
| 879 | if (IS_ERR(bh)) { | 901 | if (unlikely(IS_ERR(bh))) { |
| 880 | ret = PTR_ERR(bh); | 902 | ret = PTR_ERR(bh); |
| 881 | goto err; | 903 | goto err; |
| 882 | } | 904 | } |
| @@ -910,8 +932,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |||
| 910 | 932 | ||
| 911 | err: | 933 | err: |
| 912 | ext4_ext_drop_refs(path); | 934 | ext4_ext_drop_refs(path); |
| 913 | if (alloc) | 935 | kfree(path); |
| 914 | kfree(path); | 936 | if (orig_path) |
| 937 | *orig_path = NULL; | ||
| 915 | return ERR_PTR(ret); | 938 | return ERR_PTR(ret); |
| 916 | } | 939 | } |
| 917 | 940 | ||
| @@ -1238,16 +1261,24 @@ cleanup: | |||
| 1238 | * just created block | 1261 | * just created block |
| 1239 | */ | 1262 | */ |
| 1240 | static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | 1263 | static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, |
| 1241 | unsigned int flags, | 1264 | unsigned int flags) |
| 1242 | struct ext4_extent *newext) | ||
| 1243 | { | 1265 | { |
| 1244 | struct ext4_extent_header *neh; | 1266 | struct ext4_extent_header *neh; |
| 1245 | struct buffer_head *bh; | 1267 | struct buffer_head *bh; |
| 1246 | ext4_fsblk_t newblock; | 1268 | ext4_fsblk_t newblock, goal = 0; |
| 1269 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
| 1247 | int err = 0; | 1270 | int err = 0; |
| 1248 | 1271 | ||
| 1249 | newblock = ext4_ext_new_meta_block(handle, inode, NULL, | 1272 | /* Try to prepend new index to old one */ |
| 1250 | newext, &err, flags); | 1273 | if (ext_depth(inode)) |
| 1274 | goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); | ||
| 1275 | if (goal > le32_to_cpu(es->s_first_data_block)) { | ||
| 1276 | flags |= EXT4_MB_HINT_TRY_GOAL; | ||
| 1277 | goal--; | ||
| 1278 | } else | ||
| 1279 | goal = ext4_inode_to_goal_block(inode); | ||
| 1280 | newblock = ext4_new_meta_blocks(handle, inode, goal, flags, | ||
| 1281 | NULL, &err); | ||
| 1251 | if (newblock == 0) | 1282 | if (newblock == 0) |
| 1252 | return err; | 1283 | return err; |
| 1253 | 1284 | ||
| @@ -1314,9 +1345,10 @@ out: | |||
| 1314 | static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, | 1345 | static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, |
| 1315 | unsigned int mb_flags, | 1346 | unsigned int mb_flags, |
| 1316 | unsigned int gb_flags, | 1347 | unsigned int gb_flags, |
| 1317 | struct ext4_ext_path *path, | 1348 | struct ext4_ext_path **ppath, |
| 1318 | struct ext4_extent *newext) | 1349 | struct ext4_extent *newext) |
| 1319 | { | 1350 | { |
| 1351 | struct ext4_ext_path *path = *ppath; | ||
| 1320 | struct ext4_ext_path *curp; | 1352 | struct ext4_ext_path *curp; |
| 1321 | int depth, i, err = 0; | 1353 | int depth, i, err = 0; |
| 1322 | 1354 | ||
| @@ -1340,23 +1372,21 @@ repeat: | |||
| 1340 | goto out; | 1372 | goto out; |
| 1341 | 1373 | ||
| 1342 | /* refill path */ | 1374 | /* refill path */ |
| 1343 | ext4_ext_drop_refs(path); | 1375 | path = ext4_find_extent(inode, |
| 1344 | path = ext4_ext_find_extent(inode, | ||
| 1345 | (ext4_lblk_t)le32_to_cpu(newext->ee_block), | 1376 | (ext4_lblk_t)le32_to_cpu(newext->ee_block), |
| 1346 | path, gb_flags); | 1377 | ppath, gb_flags); |
| 1347 | if (IS_ERR(path)) | 1378 | if (IS_ERR(path)) |
| 1348 | err = PTR_ERR(path); | 1379 | err = PTR_ERR(path); |
| 1349 | } else { | 1380 | } else { |
| 1350 | /* tree is full, time to grow in depth */ | 1381 | /* tree is full, time to grow in depth */ |
| 1351 | err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); | 1382 | err = ext4_ext_grow_indepth(handle, inode, mb_flags); |
| 1352 | if (err) | 1383 | if (err) |
| 1353 | goto out; | 1384 | goto out; |
| 1354 | 1385 | ||
| 1355 | /* refill path */ | 1386 | /* refill path */ |
| 1356 | ext4_ext_drop_refs(path); | 1387 | path = ext4_find_extent(inode, |
| 1357 | path = ext4_ext_find_extent(inode, | ||
| 1358 | (ext4_lblk_t)le32_to_cpu(newext->ee_block), | 1388 | (ext4_lblk_t)le32_to_cpu(newext->ee_block), |
| 1359 | path, gb_flags); | 1389 | ppath, gb_flags); |
| 1360 | if (IS_ERR(path)) { | 1390 | if (IS_ERR(path)) { |
| 1361 | err = PTR_ERR(path); | 1391 | err = PTR_ERR(path); |
| 1362 | goto out; | 1392 | goto out; |
| @@ -1559,7 +1589,7 @@ found_extent: | |||
| 1559 | * allocated block. Thus, index entries have to be consistent | 1589 | * allocated block. Thus, index entries have to be consistent |
| 1560 | * with leaves. | 1590 | * with leaves. |
| 1561 | */ | 1591 | */ |
| 1562 | static ext4_lblk_t | 1592 | ext4_lblk_t |
| 1563 | ext4_ext_next_allocated_block(struct ext4_ext_path *path) | 1593 | ext4_ext_next_allocated_block(struct ext4_ext_path *path) |
| 1564 | { | 1594 | { |
| 1565 | int depth; | 1595 | int depth; |
| @@ -1802,6 +1832,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, | |||
| 1802 | sizeof(struct ext4_extent_idx); | 1832 | sizeof(struct ext4_extent_idx); |
| 1803 | s += sizeof(struct ext4_extent_header); | 1833 | s += sizeof(struct ext4_extent_header); |
| 1804 | 1834 | ||
| 1835 | path[1].p_maxdepth = path[0].p_maxdepth; | ||
| 1805 | memcpy(path[0].p_hdr, path[1].p_hdr, s); | 1836 | memcpy(path[0].p_hdr, path[1].p_hdr, s); |
| 1806 | path[0].p_depth = 0; | 1837 | path[0].p_depth = 0; |
| 1807 | path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + | 1838 | path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + |
| @@ -1896,9 +1927,10 @@ out: | |||
| 1896 | * creating new leaf in the no-space case. | 1927 | * creating new leaf in the no-space case. |
| 1897 | */ | 1928 | */ |
| 1898 | int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | 1929 | int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, |
| 1899 | struct ext4_ext_path *path, | 1930 | struct ext4_ext_path **ppath, |
| 1900 | struct ext4_extent *newext, int gb_flags) | 1931 | struct ext4_extent *newext, int gb_flags) |
| 1901 | { | 1932 | { |
| 1933 | struct ext4_ext_path *path = *ppath; | ||
| 1902 | struct ext4_extent_header *eh; | 1934 | struct ext4_extent_header *eh; |
| 1903 | struct ext4_extent *ex, *fex; | 1935 | struct ext4_extent *ex, *fex; |
| 1904 | struct ext4_extent *nearex; /* nearest extent */ | 1936 | struct ext4_extent *nearex; /* nearest extent */ |
| @@ -1907,6 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | |||
| 1907 | ext4_lblk_t next; | 1939 | ext4_lblk_t next; |
| 1908 | int mb_flags = 0, unwritten; | 1940 | int mb_flags = 0, unwritten; |
| 1909 | 1941 | ||
| 1942 | if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | ||
| 1943 | mb_flags |= EXT4_MB_DELALLOC_RESERVED; | ||
| 1910 | if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { | 1944 | if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { |
| 1911 | EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); | 1945 | EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); |
| 1912 | return -EIO; | 1946 | return -EIO; |
| @@ -1925,7 +1959,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | |||
| 1925 | /* | 1959 | /* |
| 1926 | * Try to see whether we should rather test the extent on | 1960 | * Try to see whether we should rather test the extent on |
| 1927 | * right from ex, or from the left of ex. This is because | 1961 | * right from ex, or from the left of ex. This is because |
| 1928 | * ext4_ext_find_extent() can return either extent on the | 1962 | * ext4_find_extent() can return either extent on the |
| 1929 | * left, or on the right from the searched position. This | 1963 | * left, or on the right from the searched position. This |
| 1930 | * will make merging more effective. | 1964 | * will make merging more effective. |
| 1931 | */ | 1965 | */ |
| @@ -2008,7 +2042,7 @@ prepend: | |||
| 2008 | if (next != EXT_MAX_BLOCKS) { | 2042 | if (next != EXT_MAX_BLOCKS) { |
| 2009 | ext_debug("next leaf block - %u\n", next); | 2043 | ext_debug("next leaf block - %u\n", next); |
| 2010 | BUG_ON(npath != NULL); | 2044 | BUG_ON(npath != NULL); |
| 2011 | npath = ext4_ext_find_extent(inode, next, NULL, 0); | 2045 | npath = ext4_find_extent(inode, next, NULL, 0); |
| 2012 | if (IS_ERR(npath)) | 2046 | if (IS_ERR(npath)) |
| 2013 | return PTR_ERR(npath); | 2047 | return PTR_ERR(npath); |
| 2014 | BUG_ON(npath->p_depth != path->p_depth); | 2048 | BUG_ON(npath->p_depth != path->p_depth); |
| @@ -2028,9 +2062,9 @@ prepend: | |||
| 2028 | * We're gonna add a new leaf in the tree. | 2062 | * We're gonna add a new leaf in the tree. |
| 2029 | */ | 2063 | */ |
| 2030 | if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) | 2064 | if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) |
| 2031 | mb_flags = EXT4_MB_USE_RESERVED; | 2065 | mb_flags |= EXT4_MB_USE_RESERVED; |
| 2032 | err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, | 2066 | err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, |
| 2033 | path, newext); | 2067 | ppath, newext); |
| 2034 | if (err) | 2068 | if (err) |
| 2035 | goto cleanup; | 2069 | goto cleanup; |
| 2036 | depth = ext_depth(inode); | 2070 | depth = ext_depth(inode); |
| @@ -2108,10 +2142,8 @@ merge: | |||
| 2108 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); | 2142 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
| 2109 | 2143 | ||
| 2110 | cleanup: | 2144 | cleanup: |
| 2111 | if (npath) { | 2145 | ext4_ext_drop_refs(npath); |
| 2112 | ext4_ext_drop_refs(npath); | 2146 | kfree(npath); |
| 2113 | kfree(npath); | ||
| 2114 | } | ||
| 2115 | return err; | 2147 | return err; |
| 2116 | } | 2148 | } |
| 2117 | 2149 | ||
| @@ -2133,13 +2165,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, | |||
| 2133 | /* find extent for this block */ | 2165 | /* find extent for this block */ |
| 2134 | down_read(&EXT4_I(inode)->i_data_sem); | 2166 | down_read(&EXT4_I(inode)->i_data_sem); |
| 2135 | 2167 | ||
| 2136 | if (path && ext_depth(inode) != depth) { | 2168 | path = ext4_find_extent(inode, block, &path, 0); |
| 2137 | /* depth was changed. we have to realloc path */ | ||
| 2138 | kfree(path); | ||
| 2139 | path = NULL; | ||
| 2140 | } | ||
| 2141 | |||
| 2142 | path = ext4_ext_find_extent(inode, block, path, 0); | ||
| 2143 | if (IS_ERR(path)) { | 2169 | if (IS_ERR(path)) { |
| 2144 | up_read(&EXT4_I(inode)->i_data_sem); | 2170 | up_read(&EXT4_I(inode)->i_data_sem); |
| 2145 | err = PTR_ERR(path); | 2171 | err = PTR_ERR(path); |
| @@ -2156,7 +2182,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, | |||
| 2156 | } | 2182 | } |
| 2157 | ex = path[depth].p_ext; | 2183 | ex = path[depth].p_ext; |
| 2158 | next = ext4_ext_next_allocated_block(path); | 2184 | next = ext4_ext_next_allocated_block(path); |
| 2159 | ext4_ext_drop_refs(path); | ||
| 2160 | 2185 | ||
| 2161 | flags = 0; | 2186 | flags = 0; |
| 2162 | exists = 0; | 2187 | exists = 0; |
| @@ -2266,11 +2291,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, | |||
| 2266 | block = es.es_lblk + es.es_len; | 2291 | block = es.es_lblk + es.es_len; |
| 2267 | } | 2292 | } |
| 2268 | 2293 | ||
| 2269 | if (path) { | 2294 | ext4_ext_drop_refs(path); |
| 2270 | ext4_ext_drop_refs(path); | 2295 | kfree(path); |
| 2271 | kfree(path); | ||
| 2272 | } | ||
| 2273 | |||
| 2274 | return err; | 2296 | return err; |
| 2275 | } | 2297 | } |
| 2276 | 2298 | ||
| @@ -2826,7 +2848,7 @@ again: | |||
| 2826 | ext4_lblk_t ee_block; | 2848 | ext4_lblk_t ee_block; |
| 2827 | 2849 | ||
| 2828 | /* find extent for this block */ | 2850 | /* find extent for this block */ |
| 2829 | path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); | 2851 | path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); |
| 2830 | if (IS_ERR(path)) { | 2852 | if (IS_ERR(path)) { |
| 2831 | ext4_journal_stop(handle); | 2853 | ext4_journal_stop(handle); |
| 2832 | return PTR_ERR(path); | 2854 | return PTR_ERR(path); |
| @@ -2854,24 +2876,14 @@ again: | |||
| 2854 | */ | 2876 | */ |
| 2855 | if (end >= ee_block && | 2877 | if (end >= ee_block && |
| 2856 | end < ee_block + ext4_ext_get_actual_len(ex) - 1) { | 2878 | end < ee_block + ext4_ext_get_actual_len(ex) - 1) { |
| 2857 | int split_flag = 0; | ||
| 2858 | |||
| 2859 | if (ext4_ext_is_unwritten(ex)) | ||
| 2860 | split_flag = EXT4_EXT_MARK_UNWRIT1 | | ||
| 2861 | EXT4_EXT_MARK_UNWRIT2; | ||
| 2862 | |||
| 2863 | /* | 2879 | /* |
| 2864 | * Split the extent in two so that 'end' is the last | 2880 | * Split the extent in two so that 'end' is the last |
| 2865 | * block in the first new extent. Also we should not | 2881 | * block in the first new extent. Also we should not |
| 2866 | * fail removing space due to ENOSPC so try to use | 2882 | * fail removing space due to ENOSPC so try to use |
| 2867 | * reserved block if that happens. | 2883 | * reserved block if that happens. |
| 2868 | */ | 2884 | */ |
| 2869 | err = ext4_split_extent_at(handle, inode, path, | 2885 | err = ext4_force_split_extent_at(handle, inode, &path, |
| 2870 | end + 1, split_flag, | 2886 | end + 1, 1); |
| 2871 | EXT4_EX_NOCACHE | | ||
| 2872 | EXT4_GET_BLOCKS_PRE_IO | | ||
| 2873 | EXT4_GET_BLOCKS_METADATA_NOFAIL); | ||
| 2874 | |||
| 2875 | if (err < 0) | 2887 | if (err < 0) |
| 2876 | goto out; | 2888 | goto out; |
| 2877 | } | 2889 | } |
| @@ -2893,7 +2905,7 @@ again: | |||
| 2893 | ext4_journal_stop(handle); | 2905 | ext4_journal_stop(handle); |
| 2894 | return -ENOMEM; | 2906 | return -ENOMEM; |
| 2895 | } | 2907 | } |
| 2896 | path[0].p_depth = depth; | 2908 | path[0].p_maxdepth = path[0].p_depth = depth; |
| 2897 | path[0].p_hdr = ext_inode_hdr(inode); | 2909 | path[0].p_hdr = ext_inode_hdr(inode); |
| 2898 | i = 0; | 2910 | i = 0; |
| 2899 | 2911 | ||
| @@ -3013,10 +3025,9 @@ again: | |||
| 3013 | out: | 3025 | out: |
| 3014 | ext4_ext_drop_refs(path); | 3026 | ext4_ext_drop_refs(path); |
| 3015 | kfree(path); | 3027 | kfree(path); |
| 3016 | if (err == -EAGAIN) { | 3028 | path = NULL; |
| 3017 | path = NULL; | 3029 | if (err == -EAGAIN) |
| 3018 | goto again; | 3030 | goto again; |
| 3019 | } | ||
| 3020 | ext4_journal_stop(handle); | 3031 | ext4_journal_stop(handle); |
| 3021 | 3032 | ||
| 3022 | return err; | 3033 | return err; |
| @@ -3130,11 +3141,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) | |||
| 3130 | */ | 3141 | */ |
| 3131 | static int ext4_split_extent_at(handle_t *handle, | 3142 | static int ext4_split_extent_at(handle_t *handle, |
| 3132 | struct inode *inode, | 3143 | struct inode *inode, |
| 3133 | struct ext4_ext_path *path, | 3144 | struct ext4_ext_path **ppath, |
| 3134 | ext4_lblk_t split, | 3145 | ext4_lblk_t split, |
| 3135 | int split_flag, | 3146 | int split_flag, |
| 3136 | int flags) | 3147 | int flags) |
| 3137 | { | 3148 | { |
| 3149 | struct ext4_ext_path *path = *ppath; | ||
| 3138 | ext4_fsblk_t newblock; | 3150 | ext4_fsblk_t newblock; |
| 3139 | ext4_lblk_t ee_block; | 3151 | ext4_lblk_t ee_block; |
| 3140 | struct ext4_extent *ex, newex, orig_ex, zero_ex; | 3152 | struct ext4_extent *ex, newex, orig_ex, zero_ex; |
| @@ -3205,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle, | |||
| 3205 | if (split_flag & EXT4_EXT_MARK_UNWRIT2) | 3217 | if (split_flag & EXT4_EXT_MARK_UNWRIT2) |
| 3206 | ext4_ext_mark_unwritten(ex2); | 3218 | ext4_ext_mark_unwritten(ex2); |
| 3207 | 3219 | ||
| 3208 | err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); | 3220 | err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); |
| 3209 | if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { | 3221 | if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { |
| 3210 | if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { | 3222 | if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { |
| 3211 | if (split_flag & EXT4_EXT_DATA_VALID1) { | 3223 | if (split_flag & EXT4_EXT_DATA_VALID1) { |
| @@ -3271,11 +3283,12 @@ fix_extent_len: | |||
| 3271 | */ | 3283 | */ |
| 3272 | static int ext4_split_extent(handle_t *handle, | 3284 | static int ext4_split_extent(handle_t *handle, |
| 3273 | struct inode *inode, | 3285 | struct inode *inode, |
| 3274 | struct ext4_ext_path *path, | 3286 | struct ext4_ext_path **ppath, |
| 3275 | struct ext4_map_blocks *map, | 3287 | struct ext4_map_blocks *map, |
| 3276 | int split_flag, | 3288 | int split_flag, |
| 3277 | int flags) | 3289 | int flags) |
| 3278 | { | 3290 | { |
| 3291 | struct ext4_ext_path *path = *ppath; | ||
| 3279 | ext4_lblk_t ee_block; | 3292 | ext4_lblk_t ee_block; |
| 3280 | struct ext4_extent *ex; | 3293 | struct ext4_extent *ex; |
| 3281 | unsigned int ee_len, depth; | 3294 | unsigned int ee_len, depth; |
| @@ -3298,7 +3311,7 @@ static int ext4_split_extent(handle_t *handle, | |||
| 3298 | EXT4_EXT_MARK_UNWRIT2; | 3311 | EXT4_EXT_MARK_UNWRIT2; |
| 3299 | if (split_flag & EXT4_EXT_DATA_VALID2) | 3312 | if (split_flag & EXT4_EXT_DATA_VALID2) |
| 3300 | split_flag1 |= EXT4_EXT_DATA_VALID1; | 3313 | split_flag1 |= EXT4_EXT_DATA_VALID1; |
| 3301 | err = ext4_split_extent_at(handle, inode, path, | 3314 | err = ext4_split_extent_at(handle, inode, ppath, |
| 3302 | map->m_lblk + map->m_len, split_flag1, flags1); | 3315 | map->m_lblk + map->m_len, split_flag1, flags1); |
| 3303 | if (err) | 3316 | if (err) |
| 3304 | goto out; | 3317 | goto out; |
| @@ -3309,8 +3322,7 @@ static int ext4_split_extent(handle_t *handle, | |||
| 3309 | * Update path is required because previous ext4_split_extent_at() may | 3322 | * Update path is required because previous ext4_split_extent_at() may |
| 3310 | * result in split of original leaf or extent zeroout. | 3323 | * result in split of original leaf or extent zeroout. |
| 3311 | */ | 3324 | */ |
| 3312 | ext4_ext_drop_refs(path); | 3325 | path = ext4_find_extent(inode, map->m_lblk, ppath, 0); |
| 3313 | path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); | ||
| 3314 | if (IS_ERR(path)) | 3326 | if (IS_ERR(path)) |
| 3315 | return PTR_ERR(path); | 3327 | return PTR_ERR(path); |
| 3316 | depth = ext_depth(inode); | 3328 | depth = ext_depth(inode); |
| @@ -3330,7 +3342,7 @@ static int ext4_split_extent(handle_t *handle, | |||
| 3330 | split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | | 3342 | split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | |
| 3331 | EXT4_EXT_MARK_UNWRIT2); | 3343 | EXT4_EXT_MARK_UNWRIT2); |
| 3332 | } | 3344 | } |
| 3333 | err = ext4_split_extent_at(handle, inode, path, | 3345 | err = ext4_split_extent_at(handle, inode, ppath, |
| 3334 | map->m_lblk, split_flag1, flags); | 3346 | map->m_lblk, split_flag1, flags); |
| 3335 | if (err) | 3347 | if (err) |
| 3336 | goto out; | 3348 | goto out; |
| @@ -3364,9 +3376,10 @@ out: | |||
| 3364 | static int ext4_ext_convert_to_initialized(handle_t *handle, | 3376 | static int ext4_ext_convert_to_initialized(handle_t *handle, |
| 3365 | struct inode *inode, | 3377 | struct inode *inode, |
| 3366 | struct ext4_map_blocks *map, | 3378 | struct ext4_map_blocks *map, |
| 3367 | struct ext4_ext_path *path, | 3379 | struct ext4_ext_path **ppath, |
| 3368 | int flags) | 3380 | int flags) |
| 3369 | { | 3381 | { |
| 3382 | struct ext4_ext_path *path = *ppath; | ||
| 3370 | struct ext4_sb_info *sbi; | 3383 | struct ext4_sb_info *sbi; |
| 3371 | struct ext4_extent_header *eh; | 3384 | struct ext4_extent_header *eh; |
| 3372 | struct ext4_map_blocks split_map; | 3385 | struct ext4_map_blocks split_map; |
| @@ -3590,11 +3603,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 3590 | } | 3603 | } |
| 3591 | } | 3604 | } |
| 3592 | 3605 | ||
| 3593 | allocated = ext4_split_extent(handle, inode, path, | 3606 | err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, |
| 3594 | &split_map, split_flag, flags); | 3607 | flags); |
| 3595 | if (allocated < 0) | 3608 | if (err > 0) |
| 3596 | err = allocated; | 3609 | err = 0; |
| 3597 | |||
| 3598 | out: | 3610 | out: |
| 3599 | /* If we have gotten a failure, don't zero out status tree */ | 3611 | /* If we have gotten a failure, don't zero out status tree */ |
| 3600 | if (!err) | 3612 | if (!err) |
| @@ -3629,9 +3641,10 @@ out: | |||
| 3629 | static int ext4_split_convert_extents(handle_t *handle, | 3641 | static int ext4_split_convert_extents(handle_t *handle, |
| 3630 | struct inode *inode, | 3642 | struct inode *inode, |
| 3631 | struct ext4_map_blocks *map, | 3643 | struct ext4_map_blocks *map, |
| 3632 | struct ext4_ext_path *path, | 3644 | struct ext4_ext_path **ppath, |
| 3633 | int flags) | 3645 | int flags) |
| 3634 | { | 3646 | { |
| 3647 | struct ext4_ext_path *path = *ppath; | ||
| 3635 | ext4_lblk_t eof_block; | 3648 | ext4_lblk_t eof_block; |
| 3636 | ext4_lblk_t ee_block; | 3649 | ext4_lblk_t ee_block; |
| 3637 | struct ext4_extent *ex; | 3650 | struct ext4_extent *ex; |
| @@ -3665,74 +3678,15 @@ static int ext4_split_convert_extents(handle_t *handle, | |||
| 3665 | split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); | 3678 | split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); |
| 3666 | } | 3679 | } |
| 3667 | flags |= EXT4_GET_BLOCKS_PRE_IO; | 3680 | flags |= EXT4_GET_BLOCKS_PRE_IO; |
| 3668 | return ext4_split_extent(handle, inode, path, map, split_flag, flags); | 3681 | return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); |
| 3669 | } | 3682 | } |
| 3670 | 3683 | ||
| 3671 | static int ext4_convert_initialized_extents(handle_t *handle, | ||
| 3672 | struct inode *inode, | ||
| 3673 | struct ext4_map_blocks *map, | ||
| 3674 | struct ext4_ext_path *path) | ||
| 3675 | { | ||
| 3676 | struct ext4_extent *ex; | ||
| 3677 | ext4_lblk_t ee_block; | ||
| 3678 | unsigned int ee_len; | ||
| 3679 | int depth; | ||
| 3680 | int err = 0; | ||
| 3681 | |||
| 3682 | depth = ext_depth(inode); | ||
| 3683 | ex = path[depth].p_ext; | ||
| 3684 | ee_block = le32_to_cpu(ex->ee_block); | ||
| 3685 | ee_len = ext4_ext_get_actual_len(ex); | ||
| 3686 | |||
| 3687 | ext_debug("%s: inode %lu, logical" | ||
| 3688 | "block %llu, max_blocks %u\n", __func__, inode->i_ino, | ||
| 3689 | (unsigned long long)ee_block, ee_len); | ||
| 3690 | |||
| 3691 | if (ee_block != map->m_lblk || ee_len > map->m_len) { | ||
| 3692 | err = ext4_split_convert_extents(handle, inode, map, path, | ||
| 3693 | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); | ||
| 3694 | if (err < 0) | ||
| 3695 | goto out; | ||
| 3696 | ext4_ext_drop_refs(path); | ||
| 3697 | path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); | ||
| 3698 | if (IS_ERR(path)) { | ||
| 3699 | err = PTR_ERR(path); | ||
| 3700 | goto out; | ||
| 3701 | } | ||
| 3702 | depth = ext_depth(inode); | ||
| 3703 | ex = path[depth].p_ext; | ||
| 3704 | if (!ex) { | ||
| 3705 | EXT4_ERROR_INODE(inode, "unexpected hole at %lu", | ||
| 3706 | (unsigned long) map->m_lblk); | ||
| 3707 | err = -EIO; | ||
| 3708 | goto out; | ||
| 3709 | } | ||
| 3710 | } | ||
| 3711 | |||
| 3712 | err = ext4_ext_get_access(handle, inode, path + depth); | ||
| 3713 | if (err) | ||
| 3714 | goto out; | ||
| 3715 | /* first mark the extent as unwritten */ | ||
| 3716 | ext4_ext_mark_unwritten(ex); | ||
| 3717 | |||
| 3718 | /* note: ext4_ext_correct_indexes() isn't needed here because | ||
| 3719 | * borders are not changed | ||
| 3720 | */ | ||
| 3721 | ext4_ext_try_to_merge(handle, inode, path, ex); | ||
| 3722 | |||
| 3723 | /* Mark modified extent as dirty */ | ||
| 3724 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); | ||
| 3725 | out: | ||
| 3726 | ext4_ext_show_leaf(inode, path); | ||
| 3727 | return err; | ||
| 3728 | } | ||
| 3729 | |||
| 3730 | |||
| 3731 | static int ext4_convert_unwritten_extents_endio(handle_t *handle, | 3684 | static int ext4_convert_unwritten_extents_endio(handle_t *handle, |
| 3732 | struct inode *inode, | 3685 | struct inode *inode, |
| 3733 | struct ext4_map_blocks *map, | 3686 | struct ext4_map_blocks *map, |
| 3734 | struct ext4_ext_path *path) | 3687 | struct ext4_ext_path **ppath) |
| 3735 | { | 3688 | { |
| 3689 | struct ext4_ext_path *path = *ppath; | ||
| 3736 | struct ext4_extent *ex; | 3690 | struct ext4_extent *ex; |
| 3737 | ext4_lblk_t ee_block; | 3691 | ext4_lblk_t ee_block; |
| 3738 | unsigned int ee_len; | 3692 | unsigned int ee_len; |
| @@ -3761,16 +3715,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, | |||
| 3761 | inode->i_ino, (unsigned long long)ee_block, ee_len, | 3715 | inode->i_ino, (unsigned long long)ee_block, ee_len, |
| 3762 | (unsigned long long)map->m_lblk, map->m_len); | 3716 | (unsigned long long)map->m_lblk, map->m_len); |
| 3763 | #endif | 3717 | #endif |
| 3764 | err = ext4_split_convert_extents(handle, inode, map, path, | 3718 | err = ext4_split_convert_extents(handle, inode, map, ppath, |
| 3765 | EXT4_GET_BLOCKS_CONVERT); | 3719 | EXT4_GET_BLOCKS_CONVERT); |
| 3766 | if (err < 0) | 3720 | if (err < 0) |
| 3767 | goto out; | 3721 | return err; |
| 3768 | ext4_ext_drop_refs(path); | 3722 | path = ext4_find_extent(inode, map->m_lblk, ppath, 0); |
| 3769 | path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); | 3723 | if (IS_ERR(path)) |
| 3770 | if (IS_ERR(path)) { | 3724 | return PTR_ERR(path); |
| 3771 | err = PTR_ERR(path); | ||
| 3772 | goto out; | ||
| 3773 | } | ||
| 3774 | depth = ext_depth(inode); | 3725 | depth = ext_depth(inode); |
| 3775 | ex = path[depth].p_ext; | 3726 | ex = path[depth].p_ext; |
| 3776 | } | 3727 | } |
| @@ -3963,12 +3914,16 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, | |||
| 3963 | } | 3914 | } |
| 3964 | 3915 | ||
| 3965 | static int | 3916 | static int |
| 3966 | ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, | 3917 | convert_initialized_extent(handle_t *handle, struct inode *inode, |
| 3967 | struct ext4_map_blocks *map, | 3918 | struct ext4_map_blocks *map, |
| 3968 | struct ext4_ext_path *path, int flags, | 3919 | struct ext4_ext_path **ppath, int flags, |
| 3969 | unsigned int allocated, ext4_fsblk_t newblock) | 3920 | unsigned int allocated, ext4_fsblk_t newblock) |
| 3970 | { | 3921 | { |
| 3971 | int ret = 0; | 3922 | struct ext4_ext_path *path = *ppath; |
| 3923 | struct ext4_extent *ex; | ||
| 3924 | ext4_lblk_t ee_block; | ||
| 3925 | unsigned int ee_len; | ||
| 3926 | int depth; | ||
| 3972 | int err = 0; | 3927 | int err = 0; |
| 3973 | 3928 | ||
| 3974 | /* | 3929 | /* |
| @@ -3978,28 +3933,67 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, | |||
| 3978 | if (map->m_len > EXT_UNWRITTEN_MAX_LEN) | 3933 | if (map->m_len > EXT_UNWRITTEN_MAX_LEN) |
| 3979 | map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; | 3934 | map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; |
| 3980 | 3935 | ||
| 3981 | ret = ext4_convert_initialized_extents(handle, inode, map, | 3936 | depth = ext_depth(inode); |
| 3982 | path); | 3937 | ex = path[depth].p_ext; |
| 3983 | if (ret >= 0) { | 3938 | ee_block = le32_to_cpu(ex->ee_block); |
| 3984 | ext4_update_inode_fsync_trans(handle, inode, 1); | 3939 | ee_len = ext4_ext_get_actual_len(ex); |
| 3985 | err = check_eofblocks_fl(handle, inode, map->m_lblk, | 3940 | |
| 3986 | path, map->m_len); | 3941 | ext_debug("%s: inode %lu, logical" |
| 3987 | } else | 3942 | "block %llu, max_blocks %u\n", __func__, inode->i_ino, |
| 3988 | err = ret; | 3943 | (unsigned long long)ee_block, ee_len); |
| 3944 | |||
| 3945 | if (ee_block != map->m_lblk || ee_len > map->m_len) { | ||
| 3946 | err = ext4_split_convert_extents(handle, inode, map, ppath, | ||
| 3947 | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); | ||
| 3948 | if (err < 0) | ||
| 3949 | return err; | ||
| 3950 | path = ext4_find_extent(inode, map->m_lblk, ppath, 0); | ||
| 3951 | if (IS_ERR(path)) | ||
| 3952 | return PTR_ERR(path); | ||
| 3953 | depth = ext_depth(inode); | ||
| 3954 | ex = path[depth].p_ext; | ||
| 3955 | if (!ex) { | ||
| 3956 | EXT4_ERROR_INODE(inode, "unexpected hole at %lu", | ||
| 3957 | (unsigned long) map->m_lblk); | ||
| 3958 | return -EIO; | ||
| 3959 | } | ||
| 3960 | } | ||
| 3961 | |||
| 3962 | err = ext4_ext_get_access(handle, inode, path + depth); | ||
| 3963 | if (err) | ||
| 3964 | return err; | ||
| 3965 | /* first mark the extent as unwritten */ | ||
| 3966 | ext4_ext_mark_unwritten(ex); | ||
| 3967 | |||
| 3968 | /* note: ext4_ext_correct_indexes() isn't needed here because | ||
| 3969 | * borders are not changed | ||
| 3970 | */ | ||
| 3971 | ext4_ext_try_to_merge(handle, inode, path, ex); | ||
| 3972 | |||
| 3973 | /* Mark modified extent as dirty */ | ||
| 3974 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); | ||
| 3975 | if (err) | ||
| 3976 | return err; | ||
| 3977 | ext4_ext_show_leaf(inode, path); | ||
| 3978 | |||
| 3979 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
| 3980 | err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); | ||
| 3981 | if (err) | ||
| 3982 | return err; | ||
| 3989 | map->m_flags |= EXT4_MAP_UNWRITTEN; | 3983 | map->m_flags |= EXT4_MAP_UNWRITTEN; |
| 3990 | if (allocated > map->m_len) | 3984 | if (allocated > map->m_len) |
| 3991 | allocated = map->m_len; | 3985 | allocated = map->m_len; |
| 3992 | map->m_len = allocated; | 3986 | map->m_len = allocated; |
| 3993 | 3987 | return allocated; | |
| 3994 | return err ? err : allocated; | ||
| 3995 | } | 3988 | } |
| 3996 | 3989 | ||
| 3997 | static int | 3990 | static int |
| 3998 | ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, | 3991 | ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, |
| 3999 | struct ext4_map_blocks *map, | 3992 | struct ext4_map_blocks *map, |
| 4000 | struct ext4_ext_path *path, int flags, | 3993 | struct ext4_ext_path **ppath, int flags, |
| 4001 | unsigned int allocated, ext4_fsblk_t newblock) | 3994 | unsigned int allocated, ext4_fsblk_t newblock) |
| 4002 | { | 3995 | { |
| 3996 | struct ext4_ext_path *path = *ppath; | ||
| 4003 | int ret = 0; | 3997 | int ret = 0; |
| 4004 | int err = 0; | 3998 | int err = 0; |
| 4005 | ext4_io_end_t *io = ext4_inode_aio(inode); | 3999 | ext4_io_end_t *io = ext4_inode_aio(inode); |
| @@ -4021,8 +4015,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, | |||
| 4021 | 4015 | ||
| 4022 | /* get_block() before submit the IO, split the extent */ | 4016 | /* get_block() before submit the IO, split the extent */ |
| 4023 | if (flags & EXT4_GET_BLOCKS_PRE_IO) { | 4017 | if (flags & EXT4_GET_BLOCKS_PRE_IO) { |
| 4024 | ret = ext4_split_convert_extents(handle, inode, map, | 4018 | ret = ext4_split_convert_extents(handle, inode, map, ppath, |
| 4025 | path, flags | EXT4_GET_BLOCKS_CONVERT); | 4019 | flags | EXT4_GET_BLOCKS_CONVERT); |
| 4026 | if (ret <= 0) | 4020 | if (ret <= 0) |
| 4027 | goto out; | 4021 | goto out; |
| 4028 | /* | 4022 | /* |
| @@ -4040,7 +4034,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, | |||
| 4040 | /* IO end_io complete, convert the filled extent to written */ | 4034 | /* IO end_io complete, convert the filled extent to written */ |
| 4041 | if (flags & EXT4_GET_BLOCKS_CONVERT) { | 4035 | if (flags & EXT4_GET_BLOCKS_CONVERT) { |
| 4042 | ret = ext4_convert_unwritten_extents_endio(handle, inode, map, | 4036 | ret = ext4_convert_unwritten_extents_endio(handle, inode, map, |
| 4043 | path); | 4037 | ppath); |
| 4044 | if (ret >= 0) { | 4038 | if (ret >= 0) { |
| 4045 | ext4_update_inode_fsync_trans(handle, inode, 1); | 4039 | ext4_update_inode_fsync_trans(handle, inode, 1); |
| 4046 | err = check_eofblocks_fl(handle, inode, map->m_lblk, | 4040 | err = check_eofblocks_fl(handle, inode, map->m_lblk, |
| @@ -4078,7 +4072,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, | |||
| 4078 | } | 4072 | } |
| 4079 | 4073 | ||
| 4080 | /* buffered write, writepage time, convert*/ | 4074 | /* buffered write, writepage time, convert*/ |
| 4081 | ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); | 4075 | ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); |
| 4082 | if (ret >= 0) | 4076 | if (ret >= 0) |
| 4083 | ext4_update_inode_fsync_trans(handle, inode, 1); | 4077 | ext4_update_inode_fsync_trans(handle, inode, 1); |
| 4084 | out: | 4078 | out: |
| @@ -4279,7 +4273,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
| 4279 | trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | 4273 | trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); |
| 4280 | 4274 | ||
| 4281 | /* find extent for this block */ | 4275 | /* find extent for this block */ |
| 4282 | path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); | 4276 | path = ext4_find_extent(inode, map->m_lblk, NULL, 0); |
| 4283 | if (IS_ERR(path)) { | 4277 | if (IS_ERR(path)) { |
| 4284 | err = PTR_ERR(path); | 4278 | err = PTR_ERR(path); |
| 4285 | path = NULL; | 4279 | path = NULL; |
| @@ -4291,7 +4285,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
| 4291 | /* | 4285 | /* |
| 4292 | * consistent leaf must not be empty; | 4286 | * consistent leaf must not be empty; |
| 4293 | * this situation is possible, though, _during_ tree modification; | 4287 | * this situation is possible, though, _during_ tree modification; |
| 4294 | * this is why assert can't be put in ext4_ext_find_extent() | 4288 | * this is why assert can't be put in ext4_find_extent() |
| 4295 | */ | 4289 | */ |
| 4296 | if (unlikely(path[depth].p_ext == NULL && depth != 0)) { | 4290 | if (unlikely(path[depth].p_ext == NULL && depth != 0)) { |
| 4297 | EXT4_ERROR_INODE(inode, "bad extent address " | 4291 | EXT4_ERROR_INODE(inode, "bad extent address " |
| @@ -4331,15 +4325,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
| 4331 | */ | 4325 | */ |
| 4332 | if ((!ext4_ext_is_unwritten(ex)) && | 4326 | if ((!ext4_ext_is_unwritten(ex)) && |
| 4333 | (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { | 4327 | (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { |
| 4334 | allocated = ext4_ext_convert_initialized_extent( | 4328 | allocated = convert_initialized_extent( |
| 4335 | handle, inode, map, path, flags, | 4329 | handle, inode, map, &path, |
| 4336 | allocated, newblock); | 4330 | flags, allocated, newblock); |
| 4337 | goto out2; | 4331 | goto out2; |
| 4338 | } else if (!ext4_ext_is_unwritten(ex)) | 4332 | } else if (!ext4_ext_is_unwritten(ex)) |
| 4339 | goto out; | 4333 | goto out; |
| 4340 | 4334 | ||
| 4341 | ret = ext4_ext_handle_unwritten_extents( | 4335 | ret = ext4_ext_handle_unwritten_extents( |
| 4342 | handle, inode, map, path, flags, | 4336 | handle, inode, map, &path, flags, |
| 4343 | allocated, newblock); | 4337 | allocated, newblock); |
| 4344 | if (ret < 0) | 4338 | if (ret < 0) |
| 4345 | err = ret; | 4339 | err = ret; |
| @@ -4376,7 +4370,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
| 4376 | 4370 | ||
| 4377 | /* | 4371 | /* |
| 4378 | * If we are doing bigalloc, check to see if the extent returned | 4372 | * If we are doing bigalloc, check to see if the extent returned |
| 4379 | * by ext4_ext_find_extent() implies a cluster we can use. | 4373 | * by ext4_find_extent() implies a cluster we can use. |
| 4380 | */ | 4374 | */ |
| 4381 | if (cluster_offset && ex && | 4375 | if (cluster_offset && ex && |
| 4382 | get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { | 4376 | get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { |
| @@ -4451,6 +4445,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
| 4451 | ar.flags = 0; | 4445 | ar.flags = 0; |
| 4452 | if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) | 4446 | if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) |
| 4453 | ar.flags |= EXT4_MB_HINT_NOPREALLOC; | 4447 | ar.flags |= EXT4_MB_HINT_NOPREALLOC; |
| 4448 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | ||
| 4449 | ar.flags |= EXT4_MB_DELALLOC_RESERVED; | ||
| 4454 | newblock = ext4_mb_new_blocks(handle, &ar, &err); | 4450 | newblock = ext4_mb_new_blocks(handle, &ar, &err); |
| 4455 | if (!newblock) | 4451 | if (!newblock) |
| 4456 | goto out2; | 4452 | goto out2; |
| @@ -4486,7 +4482,7 @@ got_allocated_blocks: | |||
| 4486 | err = check_eofblocks_fl(handle, inode, map->m_lblk, | 4482 | err = check_eofblocks_fl(handle, inode, map->m_lblk, |
| 4487 | path, ar.len); | 4483 | path, ar.len); |
| 4488 | if (!err) | 4484 | if (!err) |
| 4489 | err = ext4_ext_insert_extent(handle, inode, path, | 4485 | err = ext4_ext_insert_extent(handle, inode, &path, |
| 4490 | &newex, flags); | 4486 | &newex, flags); |
| 4491 | 4487 | ||
| 4492 | if (!err && set_unwritten) { | 4488 | if (!err && set_unwritten) { |
| @@ -4619,10 +4615,8 @@ out: | |||
| 4619 | map->m_pblk = newblock; | 4615 | map->m_pblk = newblock; |
| 4620 | map->m_len = allocated; | 4616 | map->m_len = allocated; |
| 4621 | out2: | 4617 | out2: |
| 4622 | if (path) { | 4618 | ext4_ext_drop_refs(path); |
| 4623 | ext4_ext_drop_refs(path); | 4619 | kfree(path); |
| 4624 | kfree(path); | ||
| 4625 | } | ||
| 4626 | 4620 | ||
| 4627 | trace_ext4_ext_map_blocks_exit(inode, flags, map, | 4621 | trace_ext4_ext_map_blocks_exit(inode, flags, map, |
| 4628 | err ? err : allocated); | 4622 | err ? err : allocated); |
| @@ -4799,7 +4793,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4799 | max_blocks -= lblk; | 4793 | max_blocks -= lblk; |
| 4800 | 4794 | ||
| 4801 | flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | | 4795 | flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | |
| 4802 | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; | 4796 | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | |
| 4797 | EXT4_EX_NOCACHE; | ||
| 4803 | if (mode & FALLOC_FL_KEEP_SIZE) | 4798 | if (mode & FALLOC_FL_KEEP_SIZE) |
| 4804 | flags |= EXT4_GET_BLOCKS_KEEP_SIZE; | 4799 | flags |= EXT4_GET_BLOCKS_KEEP_SIZE; |
| 4805 | 4800 | ||
| @@ -4837,15 +4832,21 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 4837 | ext4_inode_block_unlocked_dio(inode); | 4832 | ext4_inode_block_unlocked_dio(inode); |
| 4838 | inode_dio_wait(inode); | 4833 | inode_dio_wait(inode); |
| 4839 | 4834 | ||
| 4835 | ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, | ||
| 4836 | flags, mode); | ||
| 4837 | if (ret) | ||
| 4838 | goto out_dio; | ||
| 4840 | /* | 4839 | /* |
| 4841 | * Remove entire range from the extent status tree. | 4840 | * Remove entire range from the extent status tree. |
| 4841 | * | ||
| 4842 | * ext4_es_remove_extent(inode, lblk, max_blocks) is | ||
| 4843 | * NOT sufficient. I'm not sure why this is the case, | ||
| 4844 | * but let's be conservative and remove the extent | ||
| 4845 | * status tree for the entire inode. There should be | ||
| 4846 | * no outstanding delalloc extents thanks to the | ||
| 4847 | * filemap_write_and_wait_range() call above. | ||
| 4842 | */ | 4848 | */ |
| 4843 | ret = ext4_es_remove_extent(inode, lblk, max_blocks); | 4849 | ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); |
| 4844 | if (ret) | ||
| 4845 | goto out_dio; | ||
| 4846 | |||
| 4847 | ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, | ||
| 4848 | flags, mode); | ||
| 4849 | if (ret) | 4850 | if (ret) |
| 4850 | goto out_dio; | 4851 | goto out_dio; |
| 4851 | } | 4852 | } |
| @@ -5304,36 +5305,31 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, | |||
| 5304 | struct ext4_ext_path *path; | 5305 | struct ext4_ext_path *path; |
| 5305 | int ret = 0, depth; | 5306 | int ret = 0, depth; |
| 5306 | struct ext4_extent *extent; | 5307 | struct ext4_extent *extent; |
| 5307 | ext4_lblk_t stop_block, current_block; | 5308 | ext4_lblk_t stop_block; |
| 5308 | ext4_lblk_t ex_start, ex_end; | 5309 | ext4_lblk_t ex_start, ex_end; |
| 5309 | 5310 | ||
| 5310 | /* Let path point to the last extent */ | 5311 | /* Let path point to the last extent */ |
| 5311 | path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); | 5312 | path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); |
| 5312 | if (IS_ERR(path)) | 5313 | if (IS_ERR(path)) |
| 5313 | return PTR_ERR(path); | 5314 | return PTR_ERR(path); |
| 5314 | 5315 | ||
| 5315 | depth = path->p_depth; | 5316 | depth = path->p_depth; |
| 5316 | extent = path[depth].p_ext; | 5317 | extent = path[depth].p_ext; |
| 5317 | if (!extent) { | 5318 | if (!extent) |
| 5318 | ext4_ext_drop_refs(path); | 5319 | goto out; |
| 5319 | kfree(path); | ||
| 5320 | return ret; | ||
| 5321 | } | ||
| 5322 | 5320 | ||
| 5323 | stop_block = le32_to_cpu(extent->ee_block) + | 5321 | stop_block = le32_to_cpu(extent->ee_block) + |
| 5324 | ext4_ext_get_actual_len(extent); | 5322 | ext4_ext_get_actual_len(extent); |
| 5325 | ext4_ext_drop_refs(path); | ||
| 5326 | kfree(path); | ||
| 5327 | 5323 | ||
| 5328 | /* Nothing to shift, if hole is at the end of file */ | 5324 | /* Nothing to shift, if hole is at the end of file */ |
| 5329 | if (start >= stop_block) | 5325 | if (start >= stop_block) |
| 5330 | return ret; | 5326 | goto out; |
| 5331 | 5327 | ||
| 5332 | /* | 5328 | /* |
| 5333 | * Don't start shifting extents until we make sure the hole is big | 5329 | * Don't start shifting extents until we make sure the hole is big |
| 5334 | * enough to accomodate the shift. | 5330 | * enough to accomodate the shift. |
| 5335 | */ | 5331 | */ |
| 5336 | path = ext4_ext_find_extent(inode, start - 1, NULL, 0); | 5332 | path = ext4_find_extent(inode, start - 1, &path, 0); |
| 5337 | if (IS_ERR(path)) | 5333 | if (IS_ERR(path)) |
| 5338 | return PTR_ERR(path); | 5334 | return PTR_ERR(path); |
| 5339 | depth = path->p_depth; | 5335 | depth = path->p_depth; |
| @@ -5346,8 +5342,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, | |||
| 5346 | ex_start = 0; | 5342 | ex_start = 0; |
| 5347 | ex_end = 0; | 5343 | ex_end = 0; |
| 5348 | } | 5344 | } |
| 5349 | ext4_ext_drop_refs(path); | ||
| 5350 | kfree(path); | ||
| 5351 | 5345 | ||
| 5352 | if ((start == ex_start && shift > ex_start) || | 5346 | if ((start == ex_start && shift > ex_start) || |
| 5353 | (shift > start - ex_end)) | 5347 | (shift > start - ex_end)) |
| @@ -5355,7 +5349,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, | |||
| 5355 | 5349 | ||
| 5356 | /* Its safe to start updating extents */ | 5350 | /* Its safe to start updating extents */ |
| 5357 | while (start < stop_block) { | 5351 | while (start < stop_block) { |
| 5358 | path = ext4_ext_find_extent(inode, start, NULL, 0); | 5352 | path = ext4_find_extent(inode, start, &path, 0); |
| 5359 | if (IS_ERR(path)) | 5353 | if (IS_ERR(path)) |
| 5360 | return PTR_ERR(path); | 5354 | return PTR_ERR(path); |
| 5361 | depth = path->p_depth; | 5355 | depth = path->p_depth; |
| @@ -5365,27 +5359,23 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, | |||
| 5365 | (unsigned long) start); | 5359 | (unsigned long) start); |
| 5366 | return -EIO; | 5360 | return -EIO; |
| 5367 | } | 5361 | } |
| 5368 | 5362 | if (start > le32_to_cpu(extent->ee_block)) { | |
| 5369 | current_block = le32_to_cpu(extent->ee_block); | ||
| 5370 | if (start > current_block) { | ||
| 5371 | /* Hole, move to the next extent */ | 5363 | /* Hole, move to the next extent */ |
| 5372 | ret = mext_next_extent(inode, path, &extent); | 5364 | if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { |
| 5373 | if (ret != 0) { | 5365 | path[depth].p_ext++; |
| 5374 | ext4_ext_drop_refs(path); | 5366 | } else { |
| 5375 | kfree(path); | 5367 | start = ext4_ext_next_allocated_block(path); |
| 5376 | if (ret == 1) | 5368 | continue; |
| 5377 | ret = 0; | ||
| 5378 | break; | ||
| 5379 | } | 5369 | } |
| 5380 | } | 5370 | } |
| 5381 | ret = ext4_ext_shift_path_extents(path, shift, inode, | 5371 | ret = ext4_ext_shift_path_extents(path, shift, inode, |
| 5382 | handle, &start); | 5372 | handle, &start); |
| 5383 | ext4_ext_drop_refs(path); | ||
| 5384 | kfree(path); | ||
| 5385 | if (ret) | 5373 | if (ret) |
| 5386 | break; | 5374 | break; |
| 5387 | } | 5375 | } |
| 5388 | 5376 | out: | |
| 5377 | ext4_ext_drop_refs(path); | ||
| 5378 | kfree(path); | ||
| 5389 | return ret; | 5379 | return ret; |
| 5390 | } | 5380 | } |
| 5391 | 5381 | ||
| @@ -5508,3 +5498,199 @@ out_mutex: | |||
| 5508 | mutex_unlock(&inode->i_mutex); | 5498 | mutex_unlock(&inode->i_mutex); |
| 5509 | return ret; | 5499 | return ret; |
| 5510 | } | 5500 | } |
| 5501 | |||
| 5502 | /** | ||
| 5503 | * ext4_swap_extents - Swap extents between two inodes | ||
| 5504 | * | ||
| 5505 | * @inode1: First inode | ||
| 5506 | * @inode2: Second inode | ||
| 5507 | * @lblk1: Start block for first inode | ||
| 5508 | * @lblk2: Start block for second inode | ||
| 5509 | * @count: Number of blocks to swap | ||
| 5510 | * @mark_unwritten: Mark second inode's extents as unwritten after swap | ||
| 5511 | * @erp: Pointer to save error value | ||
| 5512 | * | ||
| 5513 | * This helper routine does exactly what is promise "swap extents". All other | ||
| 5514 | * stuff such as page-cache locking consistency, bh mapping consistency or | ||
| 5515 | * extent's data copying must be performed by caller. | ||
| 5516 | * Locking: | ||
| 5517 | * i_mutex is held for both inodes | ||
| 5518 | * i_data_sem is locked for write for both inodes | ||
| 5519 | * Assumptions: | ||
| 5520 | * All pages from requested range are locked for both inodes | ||
| 5521 | */ | ||
| 5522 | int | ||
| 5523 | ext4_swap_extents(handle_t *handle, struct inode *inode1, | ||
| 5524 | struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, | ||
| 5525 | ext4_lblk_t count, int unwritten, int *erp) | ||
| 5526 | { | ||
| 5527 | struct ext4_ext_path *path1 = NULL; | ||
| 5528 | struct ext4_ext_path *path2 = NULL; | ||
| 5529 | int replaced_count = 0; | ||
| 5530 | |||
| 5531 | BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); | ||
| 5532 | BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); | ||
| 5533 | BUG_ON(!mutex_is_locked(&inode1->i_mutex)); | ||
| 5534 | BUG_ON(!mutex_is_locked(&inode1->i_mutex)); | ||
| 5535 | |||
| 5536 | *erp = ext4_es_remove_extent(inode1, lblk1, count); | ||
| 5537 | if (unlikely(*erp)) | ||
| 5538 | return 0; | ||
| 5539 | *erp = ext4_es_remove_extent(inode2, lblk2, count); | ||
| 5540 | if (unlikely(*erp)) | ||
| 5541 | return 0; | ||
| 5542 | |||
| 5543 | while (count) { | ||
| 5544 | struct ext4_extent *ex1, *ex2, tmp_ex; | ||
| 5545 | ext4_lblk_t e1_blk, e2_blk; | ||
| 5546 | int e1_len, e2_len, len; | ||
| 5547 | int split = 0; | ||
| 5548 | |||
| 5549 | path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); | ||
| 5550 | if (unlikely(IS_ERR(path1))) { | ||
| 5551 | *erp = PTR_ERR(path1); | ||
| 5552 | path1 = NULL; | ||
| 5553 | finish: | ||
| 5554 | count = 0; | ||
| 5555 | goto repeat; | ||
| 5556 | } | ||
| 5557 | path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); | ||
| 5558 | if (unlikely(IS_ERR(path2))) { | ||
| 5559 | *erp = PTR_ERR(path2); | ||
| 5560 | path2 = NULL; | ||
| 5561 | goto finish; | ||
| 5562 | } | ||
| 5563 | ex1 = path1[path1->p_depth].p_ext; | ||
| 5564 | ex2 = path2[path2->p_depth].p_ext; | ||
| 5565 | /* Do we have somthing to swap ? */ | ||
| 5566 | if (unlikely(!ex2 || !ex1)) | ||
| 5567 | goto finish; | ||
| 5568 | |||
| 5569 | e1_blk = le32_to_cpu(ex1->ee_block); | ||
| 5570 | e2_blk = le32_to_cpu(ex2->ee_block); | ||
| 5571 | e1_len = ext4_ext_get_actual_len(ex1); | ||
| 5572 | e2_len = ext4_ext_get_actual_len(ex2); | ||
| 5573 | |||
| 5574 | /* Hole handling */ | ||
| 5575 | if (!in_range(lblk1, e1_blk, e1_len) || | ||
| 5576 | !in_range(lblk2, e2_blk, e2_len)) { | ||
| 5577 | ext4_lblk_t next1, next2; | ||
| 5578 | |||
| 5579 | /* if hole after extent, then go to next extent */ | ||
| 5580 | next1 = ext4_ext_next_allocated_block(path1); | ||
| 5581 | next2 = ext4_ext_next_allocated_block(path2); | ||
| 5582 | /* If hole before extent, then shift to that extent */ | ||
| 5583 | if (e1_blk > lblk1) | ||
| 5584 | next1 = e1_blk; | ||
| 5585 | if (e2_blk > lblk2) | ||
| 5586 | next2 = e1_blk; | ||
| 5587 | /* Do we have something to swap */ | ||
| 5588 | if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) | ||
| 5589 | goto finish; | ||
| 5590 | /* Move to the rightest boundary */ | ||
| 5591 | len = next1 - lblk1; | ||
| 5592 | if (len < next2 - lblk2) | ||
| 5593 | len = next2 - lblk2; | ||
| 5594 | if (len > count) | ||
| 5595 | len = count; | ||
| 5596 | lblk1 += len; | ||
| 5597 | lblk2 += len; | ||
| 5598 | count -= len; | ||
| 5599 | goto repeat; | ||
| 5600 | } | ||
| 5601 | |||
| 5602 | /* Prepare left boundary */ | ||
| 5603 | if (e1_blk < lblk1) { | ||
| 5604 | split = 1; | ||
| 5605 | *erp = ext4_force_split_extent_at(handle, inode1, | ||
| 5606 | &path1, lblk1, 0); | ||
| 5607 | if (unlikely(*erp)) | ||
| 5608 | goto finish; | ||
| 5609 | } | ||
| 5610 | if (e2_blk < lblk2) { | ||
| 5611 | split = 1; | ||
| 5612 | *erp = ext4_force_split_extent_at(handle, inode2, | ||
| 5613 | &path2, lblk2, 0); | ||
| 5614 | if (unlikely(*erp)) | ||
| 5615 | goto finish; | ||
| 5616 | } | ||
| 5617 | /* ext4_split_extent_at() may result in leaf extent split, | ||
| 5618 | * path must to be revalidated. */ | ||
| 5619 | if (split) | ||
| 5620 | goto repeat; | ||
| 5621 | |||
| 5622 | /* Prepare right boundary */ | ||
| 5623 | len = count; | ||
| 5624 | if (len > e1_blk + e1_len - lblk1) | ||
| 5625 | len = e1_blk + e1_len - lblk1; | ||
| 5626 | if (len > e2_blk + e2_len - lblk2) | ||
| 5627 | len = e2_blk + e2_len - lblk2; | ||
| 5628 | |||
| 5629 | if (len != e1_len) { | ||
| 5630 | split = 1; | ||
| 5631 | *erp = ext4_force_split_extent_at(handle, inode1, | ||
| 5632 | &path1, lblk1 + len, 0); | ||
| 5633 | if (unlikely(*erp)) | ||
| 5634 | goto finish; | ||
| 5635 | } | ||
| 5636 | if (len != e2_len) { | ||
| 5637 | split = 1; | ||
| 5638 | *erp = ext4_force_split_extent_at(handle, inode2, | ||
| 5639 | &path2, lblk2 + len, 0); | ||
| 5640 | if (*erp) | ||
| 5641 | goto finish; | ||
| 5642 | } | ||
| 5643 | /* ext4_split_extent_at() may result in leaf extent split, | ||
| 5644 | * path must to be revalidated. */ | ||
| 5645 | if (split) | ||
| 5646 | goto repeat; | ||
| 5647 | |||
| 5648 | BUG_ON(e2_len != e1_len); | ||
| 5649 | *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); | ||
| 5650 | if (unlikely(*erp)) | ||
| 5651 | goto finish; | ||
| 5652 | *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); | ||
| 5653 | if (unlikely(*erp)) | ||
| 5654 | goto finish; | ||
| 5655 | |||
| 5656 | /* Both extents are fully inside boundaries. Swap it now */ | ||
| 5657 | tmp_ex = *ex1; | ||
| 5658 | ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); | ||
| 5659 | ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); | ||
| 5660 | ex1->ee_len = cpu_to_le16(e2_len); | ||
| 5661 | ex2->ee_len = cpu_to_le16(e1_len); | ||
| 5662 | if (unwritten) | ||
| 5663 | ext4_ext_mark_unwritten(ex2); | ||
| 5664 | if (ext4_ext_is_unwritten(&tmp_ex)) | ||
| 5665 | ext4_ext_mark_unwritten(ex1); | ||
| 5666 | |||
| 5667 | ext4_ext_try_to_merge(handle, inode2, path2, ex2); | ||
| 5668 | ext4_ext_try_to_merge(handle, inode1, path1, ex1); | ||
| 5669 | *erp = ext4_ext_dirty(handle, inode2, path2 + | ||
| 5670 | path2->p_depth); | ||
| 5671 | if (unlikely(*erp)) | ||
| 5672 | goto finish; | ||
| 5673 | *erp = ext4_ext_dirty(handle, inode1, path1 + | ||
| 5674 | path1->p_depth); | ||
| 5675 | /* | ||
| 5676 | * Looks scarry ah..? second inode already points to new blocks, | ||
| 5677 | * and it was successfully dirtied. But luckily error may happen | ||
| 5678 | * only due to journal error, so full transaction will be | ||
| 5679 | * aborted anyway. | ||
| 5680 | */ | ||
| 5681 | if (unlikely(*erp)) | ||
| 5682 | goto finish; | ||
| 5683 | lblk1 += len; | ||
| 5684 | lblk2 += len; | ||
| 5685 | replaced_count += len; | ||
| 5686 | count -= len; | ||
| 5687 | |||
| 5688 | repeat: | ||
| 5689 | ext4_ext_drop_refs(path1); | ||
| 5690 | kfree(path1); | ||
| 5691 | ext4_ext_drop_refs(path2); | ||
| 5692 | kfree(path2); | ||
| 5693 | path1 = path2 = NULL; | ||
| 5694 | } | ||
| 5695 | return replaced_count; | ||
| 5696 | } | ||
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0b7e28e7eaa4..94e7855ae71b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c | |||
| @@ -11,6 +11,8 @@ | |||
| 11 | */ | 11 | */ |
| 12 | #include <linux/rbtree.h> | 12 | #include <linux/rbtree.h> |
| 13 | #include <linux/list_sort.h> | 13 | #include <linux/list_sort.h> |
| 14 | #include <linux/proc_fs.h> | ||
| 15 | #include <linux/seq_file.h> | ||
| 14 | #include "ext4.h" | 16 | #include "ext4.h" |
| 15 | #include "extents_status.h" | 17 | #include "extents_status.h" |
| 16 | 18 | ||
| @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, | |||
| 313 | */ | 315 | */ |
| 314 | if (!ext4_es_is_delayed(es)) { | 316 | if (!ext4_es_is_delayed(es)) { |
| 315 | EXT4_I(inode)->i_es_lru_nr++; | 317 | EXT4_I(inode)->i_es_lru_nr++; |
| 316 | percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); | 318 | percpu_counter_inc(&EXT4_SB(inode->i_sb)-> |
| 319 | s_es_stats.es_stats_lru_cnt); | ||
| 317 | } | 320 | } |
| 318 | 321 | ||
| 322 | EXT4_I(inode)->i_es_all_nr++; | ||
| 323 | percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); | ||
| 324 | |||
| 319 | return es; | 325 | return es; |
| 320 | } | 326 | } |
| 321 | 327 | ||
| 322 | static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) | 328 | static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) |
| 323 | { | 329 | { |
| 330 | EXT4_I(inode)->i_es_all_nr--; | ||
| 331 | percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); | ||
| 332 | |||
| 324 | /* Decrease the lru counter when this es is not delayed */ | 333 | /* Decrease the lru counter when this es is not delayed */ |
| 325 | if (!ext4_es_is_delayed(es)) { | 334 | if (!ext4_es_is_delayed(es)) { |
| 326 | BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); | 335 | BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); |
| 327 | EXT4_I(inode)->i_es_lru_nr--; | 336 | EXT4_I(inode)->i_es_lru_nr--; |
| 328 | percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); | 337 | percpu_counter_dec(&EXT4_SB(inode->i_sb)-> |
| 338 | s_es_stats.es_stats_lru_cnt); | ||
| 329 | } | 339 | } |
| 330 | 340 | ||
| 331 | kmem_cache_free(ext4_es_cachep, es); | 341 | kmem_cache_free(ext4_es_cachep, es); |
| @@ -426,7 +436,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, | |||
| 426 | unsigned short ee_len; | 436 | unsigned short ee_len; |
| 427 | int depth, ee_status, es_status; | 437 | int depth, ee_status, es_status; |
| 428 | 438 | ||
| 429 | path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); | 439 | path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); |
| 430 | if (IS_ERR(path)) | 440 | if (IS_ERR(path)) |
| 431 | return; | 441 | return; |
| 432 | 442 | ||
| @@ -499,10 +509,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, | |||
| 499 | } | 509 | } |
| 500 | } | 510 | } |
| 501 | out: | 511 | out: |
| 502 | if (path) { | 512 | ext4_ext_drop_refs(path); |
| 503 | ext4_ext_drop_refs(path); | 513 | kfree(path); |
| 504 | kfree(path); | ||
| 505 | } | ||
| 506 | } | 514 | } |
| 507 | 515 | ||
| 508 | static void ext4_es_insert_extent_ind_check(struct inode *inode, | 516 | static void ext4_es_insert_extent_ind_check(struct inode *inode, |
| @@ -731,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, | |||
| 731 | struct extent_status *es) | 739 | struct extent_status *es) |
| 732 | { | 740 | { |
| 733 | struct ext4_es_tree *tree; | 741 | struct ext4_es_tree *tree; |
| 742 | struct ext4_es_stats *stats; | ||
| 734 | struct extent_status *es1 = NULL; | 743 | struct extent_status *es1 = NULL; |
| 735 | struct rb_node *node; | 744 | struct rb_node *node; |
| 736 | int found = 0; | 745 | int found = 0; |
| @@ -767,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, | |||
| 767 | } | 776 | } |
| 768 | 777 | ||
| 769 | out: | 778 | out: |
| 779 | stats = &EXT4_SB(inode->i_sb)->s_es_stats; | ||
| 770 | if (found) { | 780 | if (found) { |
| 771 | BUG_ON(!es1); | 781 | BUG_ON(!es1); |
| 772 | es->es_lblk = es1->es_lblk; | 782 | es->es_lblk = es1->es_lblk; |
| 773 | es->es_len = es1->es_len; | 783 | es->es_len = es1->es_len; |
| 774 | es->es_pblk = es1->es_pblk; | 784 | es->es_pblk = es1->es_pblk; |
| 785 | stats->es_stats_cache_hits++; | ||
| 786 | } else { | ||
| 787 | stats->es_stats_cache_misses++; | ||
| 775 | } | 788 | } |
| 776 | 789 | ||
| 777 | read_unlock(&EXT4_I(inode)->i_es_lock); | 790 | read_unlock(&EXT4_I(inode)->i_es_lock); |
| @@ -933,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, | |||
| 933 | struct ext4_inode_info *locked_ei) | 946 | struct ext4_inode_info *locked_ei) |
| 934 | { | 947 | { |
| 935 | struct ext4_inode_info *ei; | 948 | struct ext4_inode_info *ei; |
| 949 | struct ext4_es_stats *es_stats; | ||
| 936 | struct list_head *cur, *tmp; | 950 | struct list_head *cur, *tmp; |
| 937 | LIST_HEAD(skipped); | 951 | LIST_HEAD(skipped); |
| 952 | ktime_t start_time; | ||
| 953 | u64 scan_time; | ||
| 938 | int nr_shrunk = 0; | 954 | int nr_shrunk = 0; |
| 939 | int retried = 0, skip_precached = 1, nr_skipped = 0; | 955 | int retried = 0, skip_precached = 1, nr_skipped = 0; |
| 940 | 956 | ||
| 957 | es_stats = &sbi->s_es_stats; | ||
| 958 | start_time = ktime_get(); | ||
| 941 | spin_lock(&sbi->s_es_lru_lock); | 959 | spin_lock(&sbi->s_es_lru_lock); |
| 942 | 960 | ||
| 943 | retry: | 961 | retry: |
| @@ -948,7 +966,8 @@ retry: | |||
| 948 | * If we have already reclaimed all extents from extent | 966 | * If we have already reclaimed all extents from extent |
| 949 | * status tree, just stop the loop immediately. | 967 | * status tree, just stop the loop immediately. |
| 950 | */ | 968 | */ |
| 951 | if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) | 969 | if (percpu_counter_read_positive( |
| 970 | &es_stats->es_stats_lru_cnt) == 0) | ||
| 952 | break; | 971 | break; |
| 953 | 972 | ||
| 954 | ei = list_entry(cur, struct ext4_inode_info, i_es_lru); | 973 | ei = list_entry(cur, struct ext4_inode_info, i_es_lru); |
| @@ -958,7 +977,7 @@ retry: | |||
| 958 | * time. Normally we try hard to avoid shrinking | 977 | * time. Normally we try hard to avoid shrinking |
| 959 | * precached inodes, but we will as a last resort. | 978 | * precached inodes, but we will as a last resort. |
| 960 | */ | 979 | */ |
| 961 | if ((sbi->s_es_last_sorted < ei->i_touch_when) || | 980 | if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || |
| 962 | (skip_precached && ext4_test_inode_state(&ei->vfs_inode, | 981 | (skip_precached && ext4_test_inode_state(&ei->vfs_inode, |
| 963 | EXT4_STATE_EXT_PRECACHED))) { | 982 | EXT4_STATE_EXT_PRECACHED))) { |
| 964 | nr_skipped++; | 983 | nr_skipped++; |
| @@ -992,7 +1011,7 @@ retry: | |||
| 992 | if ((nr_shrunk == 0) && nr_skipped && !retried) { | 1011 | if ((nr_shrunk == 0) && nr_skipped && !retried) { |
| 993 | retried++; | 1012 | retried++; |
| 994 | list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); | 1013 | list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); |
| 995 | sbi->s_es_last_sorted = jiffies; | 1014 | es_stats->es_stats_last_sorted = jiffies; |
| 996 | ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, | 1015 | ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, |
| 997 | i_es_lru); | 1016 | i_es_lru); |
| 998 | /* | 1017 | /* |
| @@ -1010,6 +1029,22 @@ retry: | |||
| 1010 | if (locked_ei && nr_shrunk == 0) | 1029 | if (locked_ei && nr_shrunk == 0) |
| 1011 | nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); | 1030 | nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); |
| 1012 | 1031 | ||
| 1032 | scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | ||
| 1033 | if (likely(es_stats->es_stats_scan_time)) | ||
| 1034 | es_stats->es_stats_scan_time = (scan_time + | ||
| 1035 | es_stats->es_stats_scan_time*3) / 4; | ||
| 1036 | else | ||
| 1037 | es_stats->es_stats_scan_time = scan_time; | ||
| 1038 | if (scan_time > es_stats->es_stats_max_scan_time) | ||
| 1039 | es_stats->es_stats_max_scan_time = scan_time; | ||
| 1040 | if (likely(es_stats->es_stats_shrunk)) | ||
| 1041 | es_stats->es_stats_shrunk = (nr_shrunk + | ||
| 1042 | es_stats->es_stats_shrunk*3) / 4; | ||
| 1043 | else | ||
| 1044 | es_stats->es_stats_shrunk = nr_shrunk; | ||
| 1045 | |||
| 1046 | trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, | ||
| 1047 | nr_skipped, retried); | ||
| 1013 | return nr_shrunk; | 1048 | return nr_shrunk; |
| 1014 | } | 1049 | } |
| 1015 | 1050 | ||
| @@ -1020,8 +1055,8 @@ static unsigned long ext4_es_count(struct shrinker *shrink, | |||
| 1020 | struct ext4_sb_info *sbi; | 1055 | struct ext4_sb_info *sbi; |
| 1021 | 1056 | ||
| 1022 | sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); | 1057 | sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); |
| 1023 | nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); | 1058 | nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); |
| 1024 | trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); | 1059 | trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); |
| 1025 | return nr; | 1060 | return nr; |
| 1026 | } | 1061 | } |
| 1027 | 1062 | ||
| @@ -1033,31 +1068,160 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, | |||
| 1033 | int nr_to_scan = sc->nr_to_scan; | 1068 | int nr_to_scan = sc->nr_to_scan; |
| 1034 | int ret, nr_shrunk; | 1069 | int ret, nr_shrunk; |
| 1035 | 1070 | ||
| 1036 | ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); | 1071 | ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); |
| 1037 | trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); | 1072 | trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); |
| 1038 | 1073 | ||
| 1039 | if (!nr_to_scan) | 1074 | if (!nr_to_scan) |
| 1040 | return ret; | 1075 | return ret; |
| 1041 | 1076 | ||
| 1042 | nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); | 1077 | nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); |
| 1043 | 1078 | ||
| 1044 | trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); | 1079 | trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); |
| 1045 | return nr_shrunk; | 1080 | return nr_shrunk; |
| 1046 | } | 1081 | } |
| 1047 | 1082 | ||
| 1048 | void ext4_es_register_shrinker(struct ext4_sb_info *sbi) | 1083 | static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) |
| 1049 | { | 1084 | { |
| 1085 | return *pos ? NULL : SEQ_START_TOKEN; | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | static void * | ||
| 1089 | ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) | ||
| 1090 | { | ||
| 1091 | return NULL; | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) | ||
| 1095 | { | ||
| 1096 | struct ext4_sb_info *sbi = seq->private; | ||
| 1097 | struct ext4_es_stats *es_stats = &sbi->s_es_stats; | ||
| 1098 | struct ext4_inode_info *ei, *max = NULL; | ||
| 1099 | unsigned int inode_cnt = 0; | ||
| 1100 | |||
| 1101 | if (v != SEQ_START_TOKEN) | ||
| 1102 | return 0; | ||
| 1103 | |||
| 1104 | /* here we just find an inode that has the max nr. of objects */ | ||
| 1105 | spin_lock(&sbi->s_es_lru_lock); | ||
| 1106 | list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { | ||
| 1107 | inode_cnt++; | ||
| 1108 | if (max && max->i_es_all_nr < ei->i_es_all_nr) | ||
| 1109 | max = ei; | ||
| 1110 | else if (!max) | ||
| 1111 | max = ei; | ||
| 1112 | } | ||
| 1113 | spin_unlock(&sbi->s_es_lru_lock); | ||
| 1114 | |||
| 1115 | seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", | ||
| 1116 | percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), | ||
| 1117 | percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); | ||
| 1118 | seq_printf(seq, " %lu/%lu cache hits/misses\n", | ||
| 1119 | es_stats->es_stats_cache_hits, | ||
| 1120 | es_stats->es_stats_cache_misses); | ||
| 1121 | if (es_stats->es_stats_last_sorted != 0) | ||
| 1122 | seq_printf(seq, " %u ms last sorted interval\n", | ||
| 1123 | jiffies_to_msecs(jiffies - | ||
| 1124 | es_stats->es_stats_last_sorted)); | ||
| 1125 | if (inode_cnt) | ||
| 1126 | seq_printf(seq, " %d inodes on lru list\n", inode_cnt); | ||
| 1127 | |||
| 1128 | seq_printf(seq, "average:\n %llu us scan time\n", | ||
| 1129 | div_u64(es_stats->es_stats_scan_time, 1000)); | ||
| 1130 | seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); | ||
| 1131 | if (inode_cnt) | ||
| 1132 | seq_printf(seq, | ||
| 1133 | "maximum:\n %lu inode (%u objects, %u reclaimable)\n" | ||
| 1134 | " %llu us max scan time\n", | ||
| 1135 | max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, | ||
| 1136 | div_u64(es_stats->es_stats_max_scan_time, 1000)); | ||
| 1137 | |||
| 1138 | return 0; | ||
| 1139 | } | ||
| 1140 | |||
| 1141 | static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) | ||
| 1142 | { | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | static const struct seq_operations ext4_es_seq_shrinker_info_ops = { | ||
| 1146 | .start = ext4_es_seq_shrinker_info_start, | ||
| 1147 | .next = ext4_es_seq_shrinker_info_next, | ||
| 1148 | .stop = ext4_es_seq_shrinker_info_stop, | ||
| 1149 | .show = ext4_es_seq_shrinker_info_show, | ||
| 1150 | }; | ||
| 1151 | |||
| 1152 | static int | ||
| 1153 | ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) | ||
| 1154 | { | ||
| 1155 | int ret; | ||
| 1156 | |||
| 1157 | ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); | ||
| 1158 | if (!ret) { | ||
| 1159 | struct seq_file *m = file->private_data; | ||
| 1160 | m->private = PDE_DATA(inode); | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | return ret; | ||
| 1164 | } | ||
| 1165 | |||
| 1166 | static int | ||
| 1167 | ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) | ||
| 1168 | { | ||
| 1169 | return seq_release(inode, file); | ||
| 1170 | } | ||
| 1171 | |||
| 1172 | static const struct file_operations ext4_es_seq_shrinker_info_fops = { | ||
| 1173 | .owner = THIS_MODULE, | ||
| 1174 | .open = ext4_es_seq_shrinker_info_open, | ||
| 1175 | .read = seq_read, | ||
| 1176 | .llseek = seq_lseek, | ||
| 1177 | .release = ext4_es_seq_shrinker_info_release, | ||
| 1178 | }; | ||
| 1179 | |||
| 1180 | int ext4_es_register_shrinker(struct ext4_sb_info *sbi) | ||
| 1181 | { | ||
| 1182 | int err; | ||
| 1183 | |||
| 1050 | INIT_LIST_HEAD(&sbi->s_es_lru); | 1184 | INIT_LIST_HEAD(&sbi->s_es_lru); |
| 1051 | spin_lock_init(&sbi->s_es_lru_lock); | 1185 | spin_lock_init(&sbi->s_es_lru_lock); |
| 1052 | sbi->s_es_last_sorted = 0; | 1186 | sbi->s_es_stats.es_stats_last_sorted = 0; |
| 1187 | sbi->s_es_stats.es_stats_shrunk = 0; | ||
| 1188 | sbi->s_es_stats.es_stats_cache_hits = 0; | ||
| 1189 | sbi->s_es_stats.es_stats_cache_misses = 0; | ||
| 1190 | sbi->s_es_stats.es_stats_scan_time = 0; | ||
| 1191 | sbi->s_es_stats.es_stats_max_scan_time = 0; | ||
| 1192 | err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); | ||
| 1193 | if (err) | ||
| 1194 | return err; | ||
| 1195 | err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); | ||
| 1196 | if (err) | ||
| 1197 | goto err1; | ||
| 1198 | |||
| 1053 | sbi->s_es_shrinker.scan_objects = ext4_es_scan; | 1199 | sbi->s_es_shrinker.scan_objects = ext4_es_scan; |
| 1054 | sbi->s_es_shrinker.count_objects = ext4_es_count; | 1200 | sbi->s_es_shrinker.count_objects = ext4_es_count; |
| 1055 | sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; | 1201 | sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; |
| 1056 | register_shrinker(&sbi->s_es_shrinker); | 1202 | err = register_shrinker(&sbi->s_es_shrinker); |
| 1203 | if (err) | ||
| 1204 | goto err2; | ||
| 1205 | |||
| 1206 | if (sbi->s_proc) | ||
| 1207 | proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, | ||
| 1208 | &ext4_es_seq_shrinker_info_fops, sbi); | ||
| 1209 | |||
| 1210 | return 0; | ||
| 1211 | |||
| 1212 | err2: | ||
| 1213 | percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); | ||
| 1214 | err1: | ||
| 1215 | percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); | ||
| 1216 | return err; | ||
| 1057 | } | 1217 | } |
| 1058 | 1218 | ||
| 1059 | void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) | 1219 | void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) |
| 1060 | { | 1220 | { |
| 1221 | if (sbi->s_proc) | ||
| 1222 | remove_proc_entry("es_shrinker_info", sbi->s_proc); | ||
| 1223 | percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); | ||
| 1224 | percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); | ||
| 1061 | unregister_shrinker(&sbi->s_es_shrinker); | 1225 | unregister_shrinker(&sbi->s_es_shrinker); |
| 1062 | } | 1226 | } |
| 1063 | 1227 | ||
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f1b62a419920..efd5f970b501 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h | |||
| @@ -64,6 +64,17 @@ struct ext4_es_tree { | |||
| 64 | struct extent_status *cache_es; /* recently accessed extent */ | 64 | struct extent_status *cache_es; /* recently accessed extent */ |
| 65 | }; | 65 | }; |
| 66 | 66 | ||
| 67 | struct ext4_es_stats { | ||
| 68 | unsigned long es_stats_last_sorted; | ||
| 69 | unsigned long es_stats_shrunk; | ||
| 70 | unsigned long es_stats_cache_hits; | ||
| 71 | unsigned long es_stats_cache_misses; | ||
| 72 | u64 es_stats_scan_time; | ||
| 73 | u64 es_stats_max_scan_time; | ||
| 74 | struct percpu_counter es_stats_all_cnt; | ||
| 75 | struct percpu_counter es_stats_lru_cnt; | ||
| 76 | }; | ||
| 77 | |||
| 67 | extern int __init ext4_init_es(void); | 78 | extern int __init ext4_init_es(void); |
| 68 | extern void ext4_exit_es(void); | 79 | extern void ext4_exit_es(void); |
| 69 | extern void ext4_es_init_tree(struct ext4_es_tree *tree); | 80 | extern void ext4_es_init_tree(struct ext4_es_tree *tree); |
| @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, | |||
| 138 | (pb & ~ES_MASK)); | 149 | (pb & ~ES_MASK)); |
| 139 | } | 150 | } |
| 140 | 151 | ||
| 141 | extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); | 152 | extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); |
| 142 | extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); | 153 | extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); |
| 143 | extern void ext4_es_lru_add(struct inode *inode); | 154 | extern void ext4_es_lru_add(struct inode *inode); |
| 144 | extern void ext4_es_lru_del(struct inode *inode); | 155 | extern void ext4_es_lru_del(struct inode *inode); |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index aca7b24a4432..8131be8c0af3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
| @@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 137 | iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); | 137 | iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | iocb->private = &overwrite; | ||
| 140 | if (o_direct) { | 141 | if (o_direct) { |
| 141 | blk_start_plug(&plug); | 142 | blk_start_plug(&plug); |
| 142 | 143 | ||
| 143 | iocb->private = &overwrite; | ||
| 144 | 144 | ||
| 145 | /* check whether we do a DIO overwrite or not */ | 145 | /* check whether we do a DIO overwrite or not */ |
| 146 | if (ext4_should_dioread_nolock(inode) && !aio_mutex && | 146 | if (ext4_should_dioread_nolock(inode) && !aio_mutex && |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5b87fc36aab8..ac644c31ca67 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
| @@ -887,6 +887,10 @@ got: | |||
| 887 | struct buffer_head *block_bitmap_bh; | 887 | struct buffer_head *block_bitmap_bh; |
| 888 | 888 | ||
| 889 | block_bitmap_bh = ext4_read_block_bitmap(sb, group); | 889 | block_bitmap_bh = ext4_read_block_bitmap(sb, group); |
| 890 | if (!block_bitmap_bh) { | ||
| 891 | err = -EIO; | ||
| 892 | goto out; | ||
| 893 | } | ||
| 890 | BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); | 894 | BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); |
| 891 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); | 895 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); |
| 892 | if (err) { | 896 | if (err) { |
| @@ -1011,8 +1015,7 @@ got: | |||
| 1011 | spin_unlock(&sbi->s_next_gen_lock); | 1015 | spin_unlock(&sbi->s_next_gen_lock); |
| 1012 | 1016 | ||
| 1013 | /* Precompute checksum seed for inode metadata */ | 1017 | /* Precompute checksum seed for inode metadata */ |
| 1014 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 1018 | if (ext4_has_metadata_csum(sb)) { |
| 1015 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | ||
| 1016 | __u32 csum; | 1019 | __u32 csum; |
| 1017 | __le32 inum = cpu_to_le32(inode->i_ino); | 1020 | __le32 inum = cpu_to_le32(inode->i_ino); |
| 1018 | __le32 gen = cpu_to_le32(inode->i_generation); | 1021 | __le32 gen = cpu_to_le32(inode->i_generation); |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index e75f840000a0..36b369697a13 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
| @@ -318,34 +318,24 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | |||
| 318 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | 318 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain |
| 319 | * as described above and return 0. | 319 | * as described above and return 0. |
| 320 | */ | 320 | */ |
| 321 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | 321 | static int ext4_alloc_branch(handle_t *handle, |
| 322 | ext4_lblk_t iblock, int indirect_blks, | 322 | struct ext4_allocation_request *ar, |
| 323 | int *blks, ext4_fsblk_t goal, | 323 | int indirect_blks, ext4_lblk_t *offsets, |
| 324 | ext4_lblk_t *offsets, Indirect *branch) | 324 | Indirect *branch) |
| 325 | { | 325 | { |
| 326 | struct ext4_allocation_request ar; | ||
| 327 | struct buffer_head * bh; | 326 | struct buffer_head * bh; |
| 328 | ext4_fsblk_t b, new_blocks[4]; | 327 | ext4_fsblk_t b, new_blocks[4]; |
| 329 | __le32 *p; | 328 | __le32 *p; |
| 330 | int i, j, err, len = 1; | 329 | int i, j, err, len = 1; |
| 331 | 330 | ||
| 332 | /* | ||
| 333 | * Set up for the direct block allocation | ||
| 334 | */ | ||
| 335 | memset(&ar, 0, sizeof(ar)); | ||
| 336 | ar.inode = inode; | ||
| 337 | ar.len = *blks; | ||
| 338 | ar.logical = iblock; | ||
| 339 | if (S_ISREG(inode->i_mode)) | ||
| 340 | ar.flags = EXT4_MB_HINT_DATA; | ||
| 341 | |||
| 342 | for (i = 0; i <= indirect_blks; i++) { | 331 | for (i = 0; i <= indirect_blks; i++) { |
| 343 | if (i == indirect_blks) { | 332 | if (i == indirect_blks) { |
| 344 | ar.goal = goal; | 333 | new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); |
| 345 | new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); | ||
| 346 | } else | 334 | } else |
| 347 | goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, | 335 | ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, |
| 348 | goal, 0, NULL, &err); | 336 | ar->inode, ar->goal, |
| 337 | ar->flags & EXT4_MB_DELALLOC_RESERVED, | ||
| 338 | NULL, &err); | ||
| 349 | if (err) { | 339 | if (err) { |
| 350 | i--; | 340 | i--; |
| 351 | goto failed; | 341 | goto failed; |
| @@ -354,7 +344,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
| 354 | if (i == 0) | 344 | if (i == 0) |
| 355 | continue; | 345 | continue; |
| 356 | 346 | ||
| 357 | bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); | 347 | bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); |
| 358 | if (unlikely(!bh)) { | 348 | if (unlikely(!bh)) { |
| 359 | err = -ENOMEM; | 349 | err = -ENOMEM; |
| 360 | goto failed; | 350 | goto failed; |
| @@ -372,7 +362,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
| 372 | b = new_blocks[i]; | 362 | b = new_blocks[i]; |
| 373 | 363 | ||
| 374 | if (i == indirect_blks) | 364 | if (i == indirect_blks) |
| 375 | len = ar.len; | 365 | len = ar->len; |
| 376 | for (j = 0; j < len; j++) | 366 | for (j = 0; j < len; j++) |
| 377 | *p++ = cpu_to_le32(b++); | 367 | *p++ = cpu_to_le32(b++); |
| 378 | 368 | ||
| @@ -381,11 +371,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
| 381 | unlock_buffer(bh); | 371 | unlock_buffer(bh); |
| 382 | 372 | ||
| 383 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 373 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
| 384 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 374 | err = ext4_handle_dirty_metadata(handle, ar->inode, bh); |
| 385 | if (err) | 375 | if (err) |
| 386 | goto failed; | 376 | goto failed; |
| 387 | } | 377 | } |
| 388 | *blks = ar.len; | ||
| 389 | return 0; | 378 | return 0; |
| 390 | failed: | 379 | failed: |
| 391 | for (; i >= 0; i--) { | 380 | for (; i >= 0; i--) { |
| @@ -396,10 +385,10 @@ failed: | |||
| 396 | * existing before ext4_alloc_branch() was called. | 385 | * existing before ext4_alloc_branch() was called. |
| 397 | */ | 386 | */ |
| 398 | if (i > 0 && i != indirect_blks && branch[i].bh) | 387 | if (i > 0 && i != indirect_blks && branch[i].bh) |
| 399 | ext4_forget(handle, 1, inode, branch[i].bh, | 388 | ext4_forget(handle, 1, ar->inode, branch[i].bh, |
| 400 | branch[i].bh->b_blocknr); | 389 | branch[i].bh->b_blocknr); |
| 401 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], | 390 | ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], |
| 402 | (i == indirect_blks) ? ar.len : 1, 0); | 391 | (i == indirect_blks) ? ar->len : 1, 0); |
| 403 | } | 392 | } |
| 404 | return err; | 393 | return err; |
| 405 | } | 394 | } |
| @@ -419,9 +408,9 @@ failed: | |||
| 419 | * inode (->i_blocks, etc.). In case of success we end up with the full | 408 | * inode (->i_blocks, etc.). In case of success we end up with the full |
| 420 | * chain to new block and return 0. | 409 | * chain to new block and return 0. |
| 421 | */ | 410 | */ |
| 422 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | 411 | static int ext4_splice_branch(handle_t *handle, |
| 423 | ext4_lblk_t block, Indirect *where, int num, | 412 | struct ext4_allocation_request *ar, |
| 424 | int blks) | 413 | Indirect *where, int num) |
| 425 | { | 414 | { |
| 426 | int i; | 415 | int i; |
| 427 | int err = 0; | 416 | int err = 0; |
| @@ -446,9 +435,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, | |||
| 446 | * Update the host buffer_head or inode to point to more just allocated | 435 | * Update the host buffer_head or inode to point to more just allocated |
| 447 | * direct blocks blocks | 436 | * direct blocks blocks |
| 448 | */ | 437 | */ |
| 449 | if (num == 0 && blks > 1) { | 438 | if (num == 0 && ar->len > 1) { |
| 450 | current_block = le32_to_cpu(where->key) + 1; | 439 | current_block = le32_to_cpu(where->key) + 1; |
| 451 | for (i = 1; i < blks; i++) | 440 | for (i = 1; i < ar->len; i++) |
| 452 | *(where->p + i) = cpu_to_le32(current_block++); | 441 | *(where->p + i) = cpu_to_le32(current_block++); |
| 453 | } | 442 | } |
| 454 | 443 | ||
| @@ -465,14 +454,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, | |||
| 465 | */ | 454 | */ |
| 466 | jbd_debug(5, "splicing indirect only\n"); | 455 | jbd_debug(5, "splicing indirect only\n"); |
| 467 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | 456 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); |
| 468 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | 457 | err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); |
| 469 | if (err) | 458 | if (err) |
| 470 | goto err_out; | 459 | goto err_out; |
| 471 | } else { | 460 | } else { |
| 472 | /* | 461 | /* |
| 473 | * OK, we spliced it into the inode itself on a direct block. | 462 | * OK, we spliced it into the inode itself on a direct block. |
| 474 | */ | 463 | */ |
| 475 | ext4_mark_inode_dirty(handle, inode); | 464 | ext4_mark_inode_dirty(handle, ar->inode); |
| 476 | jbd_debug(5, "splicing direct\n"); | 465 | jbd_debug(5, "splicing direct\n"); |
| 477 | } | 466 | } |
| 478 | return err; | 467 | return err; |
| @@ -484,11 +473,11 @@ err_out: | |||
| 484 | * need to revoke the block, which is why we don't | 473 | * need to revoke the block, which is why we don't |
| 485 | * need to set EXT4_FREE_BLOCKS_METADATA. | 474 | * need to set EXT4_FREE_BLOCKS_METADATA. |
| 486 | */ | 475 | */ |
| 487 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | 476 | ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, |
| 488 | EXT4_FREE_BLOCKS_FORGET); | 477 | EXT4_FREE_BLOCKS_FORGET); |
| 489 | } | 478 | } |
| 490 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | 479 | ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), |
| 491 | blks, 0); | 480 | ar->len, 0); |
| 492 | 481 | ||
| 493 | return err; | 482 | return err; |
| 494 | } | 483 | } |
| @@ -525,11 +514,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | |||
| 525 | struct ext4_map_blocks *map, | 514 | struct ext4_map_blocks *map, |
| 526 | int flags) | 515 | int flags) |
| 527 | { | 516 | { |
| 517 | struct ext4_allocation_request ar; | ||
| 528 | int err = -EIO; | 518 | int err = -EIO; |
| 529 | ext4_lblk_t offsets[4]; | 519 | ext4_lblk_t offsets[4]; |
| 530 | Indirect chain[4]; | 520 | Indirect chain[4]; |
| 531 | Indirect *partial; | 521 | Indirect *partial; |
| 532 | ext4_fsblk_t goal; | ||
| 533 | int indirect_blks; | 522 | int indirect_blks; |
| 534 | int blocks_to_boundary = 0; | 523 | int blocks_to_boundary = 0; |
| 535 | int depth; | 524 | int depth; |
| @@ -579,7 +568,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | |||
| 579 | return -ENOSPC; | 568 | return -ENOSPC; |
| 580 | } | 569 | } |
| 581 | 570 | ||
| 582 | goal = ext4_find_goal(inode, map->m_lblk, partial); | 571 | /* Set up for the direct block allocation */ |
| 572 | memset(&ar, 0, sizeof(ar)); | ||
| 573 | ar.inode = inode; | ||
| 574 | ar.logical = map->m_lblk; | ||
| 575 | if (S_ISREG(inode->i_mode)) | ||
| 576 | ar.flags = EXT4_MB_HINT_DATA; | ||
| 577 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | ||
| 578 | ar.flags |= EXT4_MB_DELALLOC_RESERVED; | ||
| 579 | |||
| 580 | ar.goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
| 583 | 581 | ||
| 584 | /* the number of blocks need to allocate for [d,t]indirect blocks */ | 582 | /* the number of blocks need to allocate for [d,t]indirect blocks */ |
| 585 | indirect_blks = (chain + depth) - partial - 1; | 583 | indirect_blks = (chain + depth) - partial - 1; |
| @@ -588,13 +586,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | |||
| 588 | * Next look up the indirect map to count the totoal number of | 586 | * Next look up the indirect map to count the totoal number of |
| 589 | * direct blocks to allocate for this branch. | 587 | * direct blocks to allocate for this branch. |
| 590 | */ | 588 | */ |
| 591 | count = ext4_blks_to_allocate(partial, indirect_blks, | 589 | ar.len = ext4_blks_to_allocate(partial, indirect_blks, |
| 592 | map->m_len, blocks_to_boundary); | 590 | map->m_len, blocks_to_boundary); |
| 591 | |||
| 593 | /* | 592 | /* |
| 594 | * Block out ext4_truncate while we alter the tree | 593 | * Block out ext4_truncate while we alter the tree |
| 595 | */ | 594 | */ |
| 596 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | 595 | err = ext4_alloc_branch(handle, &ar, indirect_blks, |
| 597 | &count, goal, | ||
| 598 | offsets + (partial - chain), partial); | 596 | offsets + (partial - chain), partial); |
| 599 | 597 | ||
| 600 | /* | 598 | /* |
| @@ -605,14 +603,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | |||
| 605 | * may need to return -EAGAIN upwards in the worst case. --sct | 603 | * may need to return -EAGAIN upwards in the worst case. --sct |
| 606 | */ | 604 | */ |
| 607 | if (!err) | 605 | if (!err) |
| 608 | err = ext4_splice_branch(handle, inode, map->m_lblk, | 606 | err = ext4_splice_branch(handle, &ar, partial, indirect_blks); |
| 609 | partial, indirect_blks, count); | ||
| 610 | if (err) | 607 | if (err) |
| 611 | goto cleanup; | 608 | goto cleanup; |
| 612 | 609 | ||
| 613 | map->m_flags |= EXT4_MAP_NEW; | 610 | map->m_flags |= EXT4_MAP_NEW; |
| 614 | 611 | ||
| 615 | ext4_update_inode_fsync_trans(handle, inode, 1); | 612 | ext4_update_inode_fsync_trans(handle, inode, 1); |
| 613 | count = ar.len; | ||
| 616 | got_it: | 614 | got_it: |
| 617 | map->m_flags |= EXT4_MAP_MAPPED; | 615 | map->m_flags |= EXT4_MAP_MAPPED; |
| 618 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | 616 | map->m_pblk = le32_to_cpu(chain[depth-1].key); |
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bea662bd0ca6..3ea62695abce 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
| @@ -594,6 +594,7 @@ retry: | |||
| 594 | if (ret) { | 594 | if (ret) { |
| 595 | unlock_page(page); | 595 | unlock_page(page); |
| 596 | page_cache_release(page); | 596 | page_cache_release(page); |
| 597 | page = NULL; | ||
| 597 | ext4_orphan_add(handle, inode); | 598 | ext4_orphan_add(handle, inode); |
| 598 | up_write(&EXT4_I(inode)->xattr_sem); | 599 | up_write(&EXT4_I(inode)->xattr_sem); |
| 599 | sem_held = 0; | 600 | sem_held = 0; |
| @@ -613,7 +614,8 @@ retry: | |||
| 613 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 614 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
| 614 | goto retry; | 615 | goto retry; |
| 615 | 616 | ||
| 616 | block_commit_write(page, from, to); | 617 | if (page) |
| 618 | block_commit_write(page, from, to); | ||
| 617 | out: | 619 | out: |
| 618 | if (page) { | 620 | if (page) { |
| 619 | unlock_page(page); | 621 | unlock_page(page); |
| @@ -1126,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, | |||
| 1126 | memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, | 1128 | memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, |
| 1127 | inline_size - EXT4_INLINE_DOTDOT_SIZE); | 1129 | inline_size - EXT4_INLINE_DOTDOT_SIZE); |
| 1128 | 1130 | ||
| 1129 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 1131 | if (ext4_has_metadata_csum(inode->i_sb)) |
| 1130 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 1131 | csum_size = sizeof(struct ext4_dir_entry_tail); | 1132 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 1132 | 1133 | ||
| 1133 | inode->i_size = inode->i_sb->s_blocksize; | 1134 | inode->i_size = inode->i_sb->s_blocksize; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3aa26e9117c4..3356ab5395f4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, | |||
| 83 | 83 | ||
| 84 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 84 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
| 85 | cpu_to_le32(EXT4_OS_LINUX) || | 85 | cpu_to_le32(EXT4_OS_LINUX) || |
| 86 | !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 86 | !ext4_has_metadata_csum(inode->i_sb)) |
| 87 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 88 | return 1; | 87 | return 1; |
| 89 | 88 | ||
| 90 | provided = le16_to_cpu(raw->i_checksum_lo); | 89 | provided = le16_to_cpu(raw->i_checksum_lo); |
| @@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, | |||
| 105 | 104 | ||
| 106 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 105 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
| 107 | cpu_to_le32(EXT4_OS_LINUX) || | 106 | cpu_to_le32(EXT4_OS_LINUX) || |
| 108 | !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 107 | !ext4_has_metadata_csum(inode->i_sb)) |
| 109 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 110 | return; | 108 | return; |
| 111 | 109 | ||
| 112 | csum = ext4_inode_csum(inode, raw, ei); | 110 | csum = ext4_inode_csum(inode, raw, ei); |
| @@ -224,16 +222,15 @@ void ext4_evict_inode(struct inode *inode) | |||
| 224 | goto no_delete; | 222 | goto no_delete; |
| 225 | } | 223 | } |
| 226 | 224 | ||
| 227 | if (!is_bad_inode(inode)) | 225 | if (is_bad_inode(inode)) |
| 228 | dquot_initialize(inode); | 226 | goto no_delete; |
| 227 | dquot_initialize(inode); | ||
| 229 | 228 | ||
| 230 | if (ext4_should_order_data(inode)) | 229 | if (ext4_should_order_data(inode)) |
| 231 | ext4_begin_ordered_truncate(inode, 0); | 230 | ext4_begin_ordered_truncate(inode, 0); |
| 232 | truncate_inode_pages_final(&inode->i_data); | 231 | truncate_inode_pages_final(&inode->i_data); |
| 233 | 232 | ||
| 234 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | 233 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); |
| 235 | if (is_bad_inode(inode)) | ||
| 236 | goto no_delete; | ||
| 237 | 234 | ||
| 238 | /* | 235 | /* |
| 239 | * Protect us against freezing - iput() caller didn't have to have any | 236 | * Protect us against freezing - iput() caller didn't have to have any |
| @@ -590,20 +587,12 @@ found: | |||
| 590 | /* | 587 | /* |
| 591 | * New blocks allocate and/or writing to unwritten extent | 588 | * New blocks allocate and/or writing to unwritten extent |
| 592 | * will possibly result in updating i_data, so we take | 589 | * will possibly result in updating i_data, so we take |
| 593 | * the write lock of i_data_sem, and call get_blocks() | 590 | * the write lock of i_data_sem, and call get_block() |
| 594 | * with create == 1 flag. | 591 | * with create == 1 flag. |
| 595 | */ | 592 | */ |
| 596 | down_write(&EXT4_I(inode)->i_data_sem); | 593 | down_write(&EXT4_I(inode)->i_data_sem); |
| 597 | 594 | ||
| 598 | /* | 595 | /* |
| 599 | * if the caller is from delayed allocation writeout path | ||
| 600 | * we have already reserved fs blocks for allocation | ||
| 601 | * let the underlying get_block() function know to | ||
| 602 | * avoid double accounting | ||
| 603 | */ | ||
| 604 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | ||
| 605 | ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); | ||
| 606 | /* | ||
| 607 | * We need to check for EXT4 here because migrate | 596 | * We need to check for EXT4 here because migrate |
| 608 | * could have changed the inode type in between | 597 | * could have changed the inode type in between |
| 609 | */ | 598 | */ |
| @@ -631,8 +620,6 @@ found: | |||
| 631 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) | 620 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) |
| 632 | ext4_da_update_reserve_space(inode, retval, 1); | 621 | ext4_da_update_reserve_space(inode, retval, 1); |
| 633 | } | 622 | } |
| 634 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | ||
| 635 | ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); | ||
| 636 | 623 | ||
| 637 | if (retval > 0) { | 624 | if (retval > 0) { |
| 638 | unsigned int status; | 625 | unsigned int status; |
| @@ -734,11 +721,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock, | |||
| 734 | * `handle' can be NULL if create is zero | 721 | * `handle' can be NULL if create is zero |
| 735 | */ | 722 | */ |
| 736 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | 723 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, |
| 737 | ext4_lblk_t block, int create, int *errp) | 724 | ext4_lblk_t block, int create) |
| 738 | { | 725 | { |
| 739 | struct ext4_map_blocks map; | 726 | struct ext4_map_blocks map; |
| 740 | struct buffer_head *bh; | 727 | struct buffer_head *bh; |
| 741 | int fatal = 0, err; | 728 | int err; |
| 742 | 729 | ||
| 743 | J_ASSERT(handle != NULL || create == 0); | 730 | J_ASSERT(handle != NULL || create == 0); |
| 744 | 731 | ||
| @@ -747,21 +734,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |||
| 747 | err = ext4_map_blocks(handle, inode, &map, | 734 | err = ext4_map_blocks(handle, inode, &map, |
| 748 | create ? EXT4_GET_BLOCKS_CREATE : 0); | 735 | create ? EXT4_GET_BLOCKS_CREATE : 0); |
| 749 | 736 | ||
| 750 | /* ensure we send some value back into *errp */ | 737 | if (err == 0) |
| 751 | *errp = 0; | 738 | return create ? ERR_PTR(-ENOSPC) : NULL; |
| 752 | |||
| 753 | if (create && err == 0) | ||
| 754 | err = -ENOSPC; /* should never happen */ | ||
| 755 | if (err < 0) | 739 | if (err < 0) |
| 756 | *errp = err; | 740 | return ERR_PTR(err); |
| 757 | if (err <= 0) | ||
| 758 | return NULL; | ||
| 759 | 741 | ||
| 760 | bh = sb_getblk(inode->i_sb, map.m_pblk); | 742 | bh = sb_getblk(inode->i_sb, map.m_pblk); |
| 761 | if (unlikely(!bh)) { | 743 | if (unlikely(!bh)) |
| 762 | *errp = -ENOMEM; | 744 | return ERR_PTR(-ENOMEM); |
| 763 | return NULL; | ||
| 764 | } | ||
| 765 | if (map.m_flags & EXT4_MAP_NEW) { | 745 | if (map.m_flags & EXT4_MAP_NEW) { |
| 766 | J_ASSERT(create != 0); | 746 | J_ASSERT(create != 0); |
| 767 | J_ASSERT(handle != NULL); | 747 | J_ASSERT(handle != NULL); |
| @@ -775,44 +755,44 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |||
| 775 | */ | 755 | */ |
| 776 | lock_buffer(bh); | 756 | lock_buffer(bh); |
| 777 | BUFFER_TRACE(bh, "call get_create_access"); | 757 | BUFFER_TRACE(bh, "call get_create_access"); |
| 778 | fatal = ext4_journal_get_create_access(handle, bh); | 758 | err = ext4_journal_get_create_access(handle, bh); |
| 779 | if (!fatal && !buffer_uptodate(bh)) { | 759 | if (unlikely(err)) { |
| 760 | unlock_buffer(bh); | ||
| 761 | goto errout; | ||
| 762 | } | ||
| 763 | if (!buffer_uptodate(bh)) { | ||
| 780 | memset(bh->b_data, 0, inode->i_sb->s_blocksize); | 764 | memset(bh->b_data, 0, inode->i_sb->s_blocksize); |
| 781 | set_buffer_uptodate(bh); | 765 | set_buffer_uptodate(bh); |
| 782 | } | 766 | } |
| 783 | unlock_buffer(bh); | 767 | unlock_buffer(bh); |
| 784 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 768 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
| 785 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 769 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
| 786 | if (!fatal) | 770 | if (unlikely(err)) |
| 787 | fatal = err; | 771 | goto errout; |
| 788 | } else { | 772 | } else |
| 789 | BUFFER_TRACE(bh, "not a new buffer"); | 773 | BUFFER_TRACE(bh, "not a new buffer"); |
| 790 | } | ||
| 791 | if (fatal) { | ||
| 792 | *errp = fatal; | ||
| 793 | brelse(bh); | ||
| 794 | bh = NULL; | ||
| 795 | } | ||
| 796 | return bh; | 774 | return bh; |
| 775 | errout: | ||
| 776 | brelse(bh); | ||
| 777 | return ERR_PTR(err); | ||
| 797 | } | 778 | } |
| 798 | 779 | ||
| 799 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | 780 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, |
| 800 | ext4_lblk_t block, int create, int *err) | 781 | ext4_lblk_t block, int create) |
| 801 | { | 782 | { |
| 802 | struct buffer_head *bh; | 783 | struct buffer_head *bh; |
| 803 | 784 | ||
| 804 | bh = ext4_getblk(handle, inode, block, create, err); | 785 | bh = ext4_getblk(handle, inode, block, create); |
| 805 | if (!bh) | 786 | if (IS_ERR(bh)) |
| 806 | return bh; | 787 | return bh; |
| 807 | if (buffer_uptodate(bh)) | 788 | if (!bh || buffer_uptodate(bh)) |
| 808 | return bh; | 789 | return bh; |
| 809 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); | 790 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
| 810 | wait_on_buffer(bh); | 791 | wait_on_buffer(bh); |
| 811 | if (buffer_uptodate(bh)) | 792 | if (buffer_uptodate(bh)) |
| 812 | return bh; | 793 | return bh; |
| 813 | put_bh(bh); | 794 | put_bh(bh); |
| 814 | *err = -EIO; | 795 | return ERR_PTR(-EIO); |
| 815 | return NULL; | ||
| 816 | } | 796 | } |
| 817 | 797 | ||
| 818 | int ext4_walk_page_buffers(handle_t *handle, | 798 | int ext4_walk_page_buffers(handle_t *handle, |
| @@ -1536,7 +1516,7 @@ out_unlock: | |||
| 1536 | } | 1516 | } |
| 1537 | 1517 | ||
| 1538 | /* | 1518 | /* |
| 1539 | * This is a special get_blocks_t callback which is used by | 1519 | * This is a special get_block_t callback which is used by |
| 1540 | * ext4_da_write_begin(). It will either return mapped block or | 1520 | * ext4_da_write_begin(). It will either return mapped block or |
| 1541 | * reserve space for a single block. | 1521 | * reserve space for a single block. |
| 1542 | * | 1522 | * |
| @@ -2011,12 +1991,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) | |||
| 2011 | * in data loss. So use reserved blocks to allocate metadata if | 1991 | * in data loss. So use reserved blocks to allocate metadata if |
| 2012 | * possible. | 1992 | * possible. |
| 2013 | * | 1993 | * |
| 2014 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks | 1994 | * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if |
| 2015 | * in question are delalloc blocks. This affects functions in many | 1995 | * the blocks in question are delalloc blocks. This indicates |
| 2016 | * different parts of the allocation call path. This flag exists | 1996 | * that the blocks and quotas has already been checked when |
| 2017 | * primarily because we don't want to change *many* call functions, so | 1997 | * the data was copied into the page cache. |
| 2018 | * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag | ||
| 2019 | * once the inode's allocation semaphore is taken. | ||
| 2020 | */ | 1998 | */ |
| 2021 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | 1999 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | |
| 2022 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | 2000 | EXT4_GET_BLOCKS_METADATA_NOFAIL; |
| @@ -2515,6 +2493,20 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
| 2515 | return 0; | 2493 | return 0; |
| 2516 | } | 2494 | } |
| 2517 | 2495 | ||
| 2496 | /* We always reserve for an inode update; the superblock could be there too */ | ||
| 2497 | static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) | ||
| 2498 | { | ||
| 2499 | if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | ||
| 2500 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) | ||
| 2501 | return 1; | ||
| 2502 | |||
| 2503 | if (pos + len <= 0x7fffffffULL) | ||
| 2504 | return 1; | ||
| 2505 | |||
| 2506 | /* We might need to update the superblock to set LARGE_FILE */ | ||
| 2507 | return 2; | ||
| 2508 | } | ||
| 2509 | |||
| 2518 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | 2510 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, |
| 2519 | loff_t pos, unsigned len, unsigned flags, | 2511 | loff_t pos, unsigned len, unsigned flags, |
| 2520 | struct page **pagep, void **fsdata) | 2512 | struct page **pagep, void **fsdata) |
| @@ -2565,7 +2557,8 @@ retry_grab: | |||
| 2565 | * of file which has an already mapped buffer. | 2557 | * of file which has an already mapped buffer. |
| 2566 | */ | 2558 | */ |
| 2567 | retry_journal: | 2559 | retry_journal: |
| 2568 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); | 2560 | handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, |
| 2561 | ext4_da_write_credits(inode, pos, len)); | ||
| 2569 | if (IS_ERR(handle)) { | 2562 | if (IS_ERR(handle)) { |
| 2570 | page_cache_release(page); | 2563 | page_cache_release(page); |
| 2571 | return PTR_ERR(handle); | 2564 | return PTR_ERR(handle); |
| @@ -2658,10 +2651,7 @@ static int ext4_da_write_end(struct file *file, | |||
| 2658 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { | 2651 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { |
| 2659 | if (ext4_has_inline_data(inode) || | 2652 | if (ext4_has_inline_data(inode) || |
| 2660 | ext4_da_should_update_i_disksize(page, end)) { | 2653 | ext4_da_should_update_i_disksize(page, end)) { |
| 2661 | down_write(&EXT4_I(inode)->i_data_sem); | 2654 | ext4_update_i_disksize(inode, new_i_size); |
| 2662 | if (new_i_size > EXT4_I(inode)->i_disksize) | ||
| 2663 | EXT4_I(inode)->i_disksize = new_i_size; | ||
| 2664 | up_write(&EXT4_I(inode)->i_data_sem); | ||
| 2665 | /* We need to mark inode dirty even if | 2655 | /* We need to mark inode dirty even if |
| 2666 | * new_i_size is less that inode->i_size | 2656 | * new_i_size is less that inode->i_size |
| 2667 | * bu greater than i_disksize.(hint delalloc) | 2657 | * bu greater than i_disksize.(hint delalloc) |
| @@ -3936,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
| 3936 | ei->i_extra_isize = 0; | 3926 | ei->i_extra_isize = 0; |
| 3937 | 3927 | ||
| 3938 | /* Precompute checksum seed for inode metadata */ | 3928 | /* Precompute checksum seed for inode metadata */ |
| 3939 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3929 | if (ext4_has_metadata_csum(sb)) { |
| 3940 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | ||
| 3941 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 3930 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 3942 | __u32 csum; | 3931 | __u32 csum; |
| 3943 | __le32 inum = cpu_to_le32(inode->i_ino); | 3932 | __le32 inum = cpu_to_le32(inode->i_ino); |
| @@ -4127,6 +4116,13 @@ bad_inode: | |||
| 4127 | return ERR_PTR(ret); | 4116 | return ERR_PTR(ret); |
| 4128 | } | 4117 | } |
| 4129 | 4118 | ||
| 4119 | struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) | ||
| 4120 | { | ||
| 4121 | if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) | ||
| 4122 | return ERR_PTR(-EIO); | ||
| 4123 | return ext4_iget(sb, ino); | ||
| 4124 | } | ||
| 4125 | |||
| 4130 | static int ext4_inode_blocks_set(handle_t *handle, | 4126 | static int ext4_inode_blocks_set(handle_t *handle, |
| 4131 | struct ext4_inode *raw_inode, | 4127 | struct ext4_inode *raw_inode, |
| 4132 | struct ext4_inode_info *ei) | 4128 | struct ext4_inode_info *ei) |
| @@ -4226,7 +4222,8 @@ static int ext4_do_update_inode(handle_t *handle, | |||
| 4226 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); | 4222 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); |
| 4227 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); | 4223 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); |
| 4228 | 4224 | ||
| 4229 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) { | 4225 | err = ext4_inode_blocks_set(handle, raw_inode, ei); |
| 4226 | if (err) { | ||
| 4230 | spin_unlock(&ei->i_raw_lock); | 4227 | spin_unlock(&ei->i_raw_lock); |
| 4231 | goto out_brelse; | 4228 | goto out_brelse; |
| 4232 | } | 4229 | } |
| @@ -4536,8 +4533,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 4536 | ext4_orphan_del(NULL, inode); | 4533 | ext4_orphan_del(NULL, inode); |
| 4537 | goto err_out; | 4534 | goto err_out; |
| 4538 | } | 4535 | } |
| 4539 | } else | 4536 | } else { |
| 4537 | loff_t oldsize = inode->i_size; | ||
| 4538 | |||
| 4540 | i_size_write(inode, attr->ia_size); | 4539 | i_size_write(inode, attr->ia_size); |
| 4540 | pagecache_isize_extended(inode, oldsize, inode->i_size); | ||
| 4541 | } | ||
| 4541 | 4542 | ||
| 4542 | /* | 4543 | /* |
| 4543 | * Blocks are going to be removed from the inode. Wait | 4544 | * Blocks are going to be removed from the inode. Wait |
| @@ -4958,7 +4959,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
| 4958 | if (val) | 4959 | if (val) |
| 4959 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); | 4960 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
| 4960 | else { | 4961 | else { |
| 4961 | jbd2_journal_flush(journal); | 4962 | err = jbd2_journal_flush(journal); |
| 4963 | if (err < 0) { | ||
| 4964 | jbd2_journal_unlock_updates(journal); | ||
| 4965 | ext4_inode_resume_unlocked_dio(inode); | ||
| 4966 | return err; | ||
| 4967 | } | ||
| 4962 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); | 4968 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
| 4963 | } | 4969 | } |
| 4964 | ext4_set_aops(inode); | 4970 | ext4_set_aops(inode); |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0f2252ec274d..bfda18a15592 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
| @@ -331,8 +331,7 @@ flags_out: | |||
| 331 | if (!inode_owner_or_capable(inode)) | 331 | if (!inode_owner_or_capable(inode)) |
| 332 | return -EPERM; | 332 | return -EPERM; |
| 333 | 333 | ||
| 334 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 334 | if (ext4_has_metadata_csum(inode->i_sb)) { |
| 335 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | ||
| 336 | ext4_warning(sb, "Setting inode version is not " | 335 | ext4_warning(sb, "Setting inode version is not " |
| 337 | "supported with metadata_csum enabled."); | 336 | "supported with metadata_csum enabled."); |
| 338 | return -ENOTTY; | 337 | return -ENOTTY; |
| @@ -532,9 +531,17 @@ group_add_out: | |||
| 532 | } | 531 | } |
| 533 | 532 | ||
| 534 | case EXT4_IOC_SWAP_BOOT: | 533 | case EXT4_IOC_SWAP_BOOT: |
| 534 | { | ||
| 535 | int err; | ||
| 535 | if (!(filp->f_mode & FMODE_WRITE)) | 536 | if (!(filp->f_mode & FMODE_WRITE)) |
| 536 | return -EBADF; | 537 | return -EBADF; |
| 537 | return swap_inode_boot_loader(sb, inode); | 538 | err = mnt_want_write_file(filp); |
| 539 | if (err) | ||
| 540 | return err; | ||
| 541 | err = swap_inode_boot_loader(sb, inode); | ||
| 542 | mnt_drop_write_file(filp); | ||
| 543 | return err; | ||
| 544 | } | ||
| 538 | 545 | ||
| 539 | case EXT4_IOC_RESIZE_FS: { | 546 | case EXT4_IOC_RESIZE_FS: { |
| 540 | ext4_fsblk_t n_blocks_count; | 547 | ext4_fsblk_t n_blocks_count; |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 748c9136a60a..dbfe15c2533c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
| @@ -3155,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
| 3155 | "start %lu, size %lu, fe_logical %lu", | 3155 | "start %lu, size %lu, fe_logical %lu", |
| 3156 | (unsigned long) start, (unsigned long) size, | 3156 | (unsigned long) start, (unsigned long) size, |
| 3157 | (unsigned long) ac->ac_o_ex.fe_logical); | 3157 | (unsigned long) ac->ac_o_ex.fe_logical); |
| 3158 | BUG(); | ||
| 3158 | } | 3159 | } |
| 3159 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && | ||
| 3160 | start > ac->ac_o_ex.fe_logical); | ||
| 3161 | BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | 3160 | BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); |
| 3162 | 3161 | ||
| 3163 | /* now prepare goal request */ | 3162 | /* now prepare goal request */ |
| @@ -4410,14 +4409,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
| 4410 | if (IS_NOQUOTA(ar->inode)) | 4409 | if (IS_NOQUOTA(ar->inode)) |
| 4411 | ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; | 4410 | ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; |
| 4412 | 4411 | ||
| 4413 | /* | 4412 | if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { |
| 4414 | * For delayed allocation, we could skip the ENOSPC and | ||
| 4415 | * EDQUOT check, as blocks and quotas have been already | ||
| 4416 | * reserved when data being copied into pagecache. | ||
| 4417 | */ | ||
| 4418 | if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) | ||
| 4419 | ar->flags |= EXT4_MB_DELALLOC_RESERVED; | ||
| 4420 | else { | ||
| 4421 | /* Without delayed allocation we need to verify | 4413 | /* Without delayed allocation we need to verify |
| 4422 | * there is enough free blocks to do block allocation | 4414 | * there is enough free blocks to do block allocation |
| 4423 | * and verify allocation doesn't exceed the quota limits. | 4415 | * and verify allocation doesn't exceed the quota limits. |
| @@ -4528,8 +4520,7 @@ out: | |||
| 4528 | if (inquota && ar->len < inquota) | 4520 | if (inquota && ar->len < inquota) |
| 4529 | dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); | 4521 | dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); |
| 4530 | if (!ar->len) { | 4522 | if (!ar->len) { |
| 4531 | if (!ext4_test_inode_state(ar->inode, | 4523 | if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) |
| 4532 | EXT4_STATE_DELALLOC_RESERVED)) | ||
| 4533 | /* release all the reserved blocks if non delalloc */ | 4524 | /* release all the reserved blocks if non delalloc */ |
| 4534 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, | 4525 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, |
| 4535 | reserv_clstrs); | 4526 | reserv_clstrs); |
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d3567f27bae7..a432634f2e6a 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
| @@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode, | |||
| 41 | ext4_ext_store_pblock(&newext, lb->first_pblock); | 41 | ext4_ext_store_pblock(&newext, lb->first_pblock); |
| 42 | /* Locking only for convinience since we are operating on temp inode */ | 42 | /* Locking only for convinience since we are operating on temp inode */ |
| 43 | down_write(&EXT4_I(inode)->i_data_sem); | 43 | down_write(&EXT4_I(inode)->i_data_sem); |
| 44 | path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); | 44 | path = ext4_find_extent(inode, lb->first_block, NULL, 0); |
| 45 | |||
| 46 | if (IS_ERR(path)) { | 45 | if (IS_ERR(path)) { |
| 47 | retval = PTR_ERR(path); | 46 | retval = PTR_ERR(path); |
| 48 | path = NULL; | 47 | path = NULL; |
| @@ -81,13 +80,11 @@ static int finish_range(handle_t *handle, struct inode *inode, | |||
| 81 | goto err_out; | 80 | goto err_out; |
| 82 | } | 81 | } |
| 83 | } | 82 | } |
| 84 | retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); | 83 | retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); |
| 85 | err_out: | 84 | err_out: |
| 86 | up_write((&EXT4_I(inode)->i_data_sem)); | 85 | up_write((&EXT4_I(inode)->i_data_sem)); |
| 87 | if (path) { | 86 | ext4_ext_drop_refs(path); |
| 88 | ext4_ext_drop_refs(path); | 87 | kfree(path); |
| 89 | kfree(path); | ||
| 90 | } | ||
| 91 | lb->first_pblock = 0; | 88 | lb->first_pblock = 0; |
| 92 | return retval; | 89 | return retval; |
| 93 | } | 90 | } |
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 32bce844c2e1..8313ca3324ec 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c | |||
| @@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) | |||
| 20 | 20 | ||
| 21 | static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) | 21 | static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) |
| 22 | { | 22 | { |
| 23 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 23 | if (!ext4_has_metadata_csum(sb)) |
| 24 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 25 | return 1; | 24 | return 1; |
| 26 | 25 | ||
| 27 | return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); | 26 | return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); |
| @@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) | |||
| 29 | 28 | ||
| 30 | static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) | 29 | static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) |
| 31 | { | 30 | { |
| 32 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 31 | if (!ext4_has_metadata_csum(sb)) |
| 33 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 34 | return; | 32 | return; |
| 35 | 33 | ||
| 36 | mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); | 34 | mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); |
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 671a74b14fd7..9f2311bc9c4f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
| @@ -27,120 +27,26 @@ | |||
| 27 | * @lblock: logical block number to find an extent path | 27 | * @lblock: logical block number to find an extent path |
| 28 | * @path: pointer to an extent path pointer (for output) | 28 | * @path: pointer to an extent path pointer (for output) |
| 29 | * | 29 | * |
| 30 | * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value | 30 | * ext4_find_extent wrapper. Return 0 on success, or a negative error value |
| 31 | * on failure. | 31 | * on failure. |
| 32 | */ | 32 | */ |
| 33 | static inline int | 33 | static inline int |
| 34 | get_ext_path(struct inode *inode, ext4_lblk_t lblock, | 34 | get_ext_path(struct inode *inode, ext4_lblk_t lblock, |
| 35 | struct ext4_ext_path **orig_path) | 35 | struct ext4_ext_path **ppath) |
| 36 | { | 36 | { |
| 37 | int ret = 0; | ||
| 38 | struct ext4_ext_path *path; | 37 | struct ext4_ext_path *path; |
| 39 | 38 | ||
| 40 | path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); | 39 | path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); |
| 41 | if (IS_ERR(path)) | 40 | if (IS_ERR(path)) |
| 42 | ret = PTR_ERR(path); | 41 | return PTR_ERR(path); |
| 43 | else if (path[ext_depth(inode)].p_ext == NULL) | 42 | if (path[ext_depth(inode)].p_ext == NULL) { |
| 44 | ret = -ENODATA; | 43 | ext4_ext_drop_refs(path); |
| 45 | else | 44 | kfree(path); |
| 46 | *orig_path = path; | 45 | *ppath = NULL; |
| 47 | 46 | return -ENODATA; | |
| 48 | return ret; | ||
| 49 | } | ||
| 50 | |||
| 51 | /** | ||
| 52 | * copy_extent_status - Copy the extent's initialization status | ||
| 53 | * | ||
| 54 | * @src: an extent for getting initialize status | ||
| 55 | * @dest: an extent to be set the status | ||
| 56 | */ | ||
| 57 | static void | ||
| 58 | copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) | ||
| 59 | { | ||
| 60 | if (ext4_ext_is_unwritten(src)) | ||
| 61 | ext4_ext_mark_unwritten(dest); | ||
| 62 | else | ||
| 63 | dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); | ||
| 64 | } | ||
| 65 | |||
| 66 | /** | ||
| 67 | * mext_next_extent - Search for the next extent and set it to "extent" | ||
| 68 | * | ||
| 69 | * @inode: inode which is searched | ||
| 70 | * @path: this will obtain data for the next extent | ||
| 71 | * @extent: pointer to the next extent we have just gotten | ||
| 72 | * | ||
| 73 | * Search the next extent in the array of ext4_ext_path structure (@path) | ||
| 74 | * and set it to ext4_extent structure (@extent). In addition, the member of | ||
| 75 | * @path (->p_ext) also points the next extent. Return 0 on success, 1 if | ||
| 76 | * ext4_ext_path structure refers to the last extent, or a negative error | ||
| 77 | * value on failure. | ||
| 78 | */ | ||
| 79 | int | ||
| 80 | mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | ||
| 81 | struct ext4_extent **extent) | ||
| 82 | { | ||
| 83 | struct ext4_extent_header *eh; | ||
| 84 | int ppos, leaf_ppos = path->p_depth; | ||
| 85 | |||
| 86 | ppos = leaf_ppos; | ||
| 87 | if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { | ||
| 88 | /* leaf block */ | ||
| 89 | *extent = ++path[ppos].p_ext; | ||
| 90 | path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); | ||
| 91 | return 0; | ||
| 92 | } | ||
| 93 | |||
| 94 | while (--ppos >= 0) { | ||
| 95 | if (EXT_LAST_INDEX(path[ppos].p_hdr) > | ||
| 96 | path[ppos].p_idx) { | ||
| 97 | int cur_ppos = ppos; | ||
| 98 | |||
| 99 | /* index block */ | ||
| 100 | path[ppos].p_idx++; | ||
| 101 | path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); | ||
| 102 | if (path[ppos+1].p_bh) | ||
| 103 | brelse(path[ppos+1].p_bh); | ||
| 104 | path[ppos+1].p_bh = | ||
| 105 | sb_bread(inode->i_sb, path[ppos].p_block); | ||
| 106 | if (!path[ppos+1].p_bh) | ||
| 107 | return -EIO; | ||
| 108 | path[ppos+1].p_hdr = | ||
| 109 | ext_block_hdr(path[ppos+1].p_bh); | ||
| 110 | |||
| 111 | /* Halfway index block */ | ||
| 112 | while (++cur_ppos < leaf_ppos) { | ||
| 113 | path[cur_ppos].p_idx = | ||
| 114 | EXT_FIRST_INDEX(path[cur_ppos].p_hdr); | ||
| 115 | path[cur_ppos].p_block = | ||
| 116 | ext4_idx_pblock(path[cur_ppos].p_idx); | ||
| 117 | if (path[cur_ppos+1].p_bh) | ||
| 118 | brelse(path[cur_ppos+1].p_bh); | ||
| 119 | path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, | ||
| 120 | path[cur_ppos].p_block); | ||
| 121 | if (!path[cur_ppos+1].p_bh) | ||
| 122 | return -EIO; | ||
| 123 | path[cur_ppos+1].p_hdr = | ||
| 124 | ext_block_hdr(path[cur_ppos+1].p_bh); | ||
| 125 | } | ||
| 126 | |||
| 127 | path[leaf_ppos].p_ext = *extent = NULL; | ||
| 128 | |||
| 129 | eh = path[leaf_ppos].p_hdr; | ||
| 130 | if (le16_to_cpu(eh->eh_entries) == 0) | ||
| 131 | /* empty leaf is found */ | ||
| 132 | return -ENODATA; | ||
| 133 | |||
| 134 | /* leaf block */ | ||
| 135 | path[leaf_ppos].p_ext = *extent = | ||
| 136 | EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); | ||
| 137 | path[leaf_ppos].p_block = | ||
| 138 | ext4_ext_pblock(path[leaf_ppos].p_ext); | ||
| 139 | return 0; | ||
| 140 | } | ||
| 141 | } | 47 | } |
| 142 | /* We found the last extent */ | 48 | *ppath = path; |
| 143 | return 1; | 49 | return 0; |
| 144 | } | 50 | } |
| 145 | 51 | ||
| 146 | /** | 52 | /** |
| @@ -178,417 +84,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, | |||
| 178 | } | 84 | } |
| 179 | 85 | ||
| 180 | /** | 86 | /** |
| 181 | * mext_insert_across_blocks - Insert extents across leaf block | ||
| 182 | * | ||
| 183 | * @handle: journal handle | ||
| 184 | * @orig_inode: original inode | ||
| 185 | * @o_start: first original extent to be changed | ||
| 186 | * @o_end: last original extent to be changed | ||
| 187 | * @start_ext: first new extent to be inserted | ||
| 188 | * @new_ext: middle of new extent to be inserted | ||
| 189 | * @end_ext: last new extent to be inserted | ||
| 190 | * | ||
| 191 | * Allocate a new leaf block and insert extents into it. Return 0 on success, | ||
| 192 | * or a negative error value on failure. | ||
| 193 | */ | ||
| 194 | static int | ||
| 195 | mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, | ||
| 196 | struct ext4_extent *o_start, struct ext4_extent *o_end, | ||
| 197 | struct ext4_extent *start_ext, struct ext4_extent *new_ext, | ||
| 198 | struct ext4_extent *end_ext) | ||
| 199 | { | ||
| 200 | struct ext4_ext_path *orig_path = NULL; | ||
| 201 | ext4_lblk_t eblock = 0; | ||
| 202 | int new_flag = 0; | ||
| 203 | int end_flag = 0; | ||
| 204 | int err = 0; | ||
| 205 | |||
| 206 | if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { | ||
| 207 | if (o_start == o_end) { | ||
| 208 | |||
| 209 | /* start_ext new_ext end_ext | ||
| 210 | * donor |---------|-----------|--------| | ||
| 211 | * orig |------------------------------| | ||
| 212 | */ | ||
| 213 | end_flag = 1; | ||
| 214 | } else { | ||
| 215 | |||
| 216 | /* start_ext new_ext end_ext | ||
| 217 | * donor |---------|----------|---------| | ||
| 218 | * orig |---------------|--------------| | ||
| 219 | */ | ||
| 220 | o_end->ee_block = end_ext->ee_block; | ||
| 221 | o_end->ee_len = end_ext->ee_len; | ||
| 222 | ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); | ||
| 223 | } | ||
| 224 | |||
| 225 | o_start->ee_len = start_ext->ee_len; | ||
| 226 | eblock = le32_to_cpu(start_ext->ee_block); | ||
| 227 | new_flag = 1; | ||
| 228 | |||
| 229 | } else if (start_ext->ee_len && new_ext->ee_len && | ||
| 230 | !end_ext->ee_len && o_start == o_end) { | ||
| 231 | |||
| 232 | /* start_ext new_ext | ||
| 233 | * donor |--------------|---------------| | ||
| 234 | * orig |------------------------------| | ||
| 235 | */ | ||
| 236 | o_start->ee_len = start_ext->ee_len; | ||
| 237 | eblock = le32_to_cpu(start_ext->ee_block); | ||
| 238 | new_flag = 1; | ||
| 239 | |||
| 240 | } else if (!start_ext->ee_len && new_ext->ee_len && | ||
| 241 | end_ext->ee_len && o_start == o_end) { | ||
| 242 | |||
| 243 | /* new_ext end_ext | ||
| 244 | * donor |--------------|---------------| | ||
| 245 | * orig |------------------------------| | ||
| 246 | */ | ||
| 247 | o_end->ee_block = end_ext->ee_block; | ||
| 248 | o_end->ee_len = end_ext->ee_len; | ||
| 249 | ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); | ||
| 250 | |||
| 251 | /* | ||
| 252 | * Set 0 to the extent block if new_ext was | ||
| 253 | * the first block. | ||
| 254 | */ | ||
| 255 | if (new_ext->ee_block) | ||
| 256 | eblock = le32_to_cpu(new_ext->ee_block); | ||
| 257 | |||
| 258 | new_flag = 1; | ||
| 259 | } else { | ||
| 260 | ext4_debug("ext4 move extent: Unexpected insert case\n"); | ||
| 261 | return -EIO; | ||
| 262 | } | ||
| 263 | |||
| 264 | if (new_flag) { | ||
| 265 | err = get_ext_path(orig_inode, eblock, &orig_path); | ||
| 266 | if (err) | ||
| 267 | goto out; | ||
| 268 | |||
| 269 | if (ext4_ext_insert_extent(handle, orig_inode, | ||
| 270 | orig_path, new_ext, 0)) | ||
| 271 | goto out; | ||
| 272 | } | ||
| 273 | |||
| 274 | if (end_flag) { | ||
| 275 | err = get_ext_path(orig_inode, | ||
| 276 | le32_to_cpu(end_ext->ee_block) - 1, &orig_path); | ||
| 277 | if (err) | ||
| 278 | goto out; | ||
| 279 | |||
| 280 | if (ext4_ext_insert_extent(handle, orig_inode, | ||
| 281 | orig_path, end_ext, 0)) | ||
| 282 | goto out; | ||
| 283 | } | ||
| 284 | out: | ||
| 285 | if (orig_path) { | ||
| 286 | ext4_ext_drop_refs(orig_path); | ||
| 287 | kfree(orig_path); | ||
| 288 | } | ||
| 289 | |||
| 290 | return err; | ||
| 291 | |||
| 292 | } | ||
| 293 | |||
| 294 | /** | ||
| 295 | * mext_insert_inside_block - Insert new extent to the extent block | ||
| 296 | * | ||
| 297 | * @o_start: first original extent to be moved | ||
| 298 | * @o_end: last original extent to be moved | ||
| 299 | * @start_ext: first new extent to be inserted | ||
| 300 | * @new_ext: middle of new extent to be inserted | ||
| 301 | * @end_ext: last new extent to be inserted | ||
| 302 | * @eh: extent header of target leaf block | ||
| 303 | * @range_to_move: used to decide how to insert extent | ||
| 304 | * | ||
| 305 | * Insert extents into the leaf block. The extent (@o_start) is overwritten | ||
| 306 | * by inserted extents. | ||
| 307 | */ | ||
| 308 | static void | ||
| 309 | mext_insert_inside_block(struct ext4_extent *o_start, | ||
| 310 | struct ext4_extent *o_end, | ||
| 311 | struct ext4_extent *start_ext, | ||
| 312 | struct ext4_extent *new_ext, | ||
| 313 | struct ext4_extent *end_ext, | ||
| 314 | struct ext4_extent_header *eh, | ||
| 315 | int range_to_move) | ||
| 316 | { | ||
| 317 | int i = 0; | ||
| 318 | unsigned long len; | ||
| 319 | |||
| 320 | /* Move the existing extents */ | ||
| 321 | if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { | ||
| 322 | len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - | ||
| 323 | (unsigned long)(o_end + 1); | ||
| 324 | memmove(o_end + 1 + range_to_move, o_end + 1, len); | ||
| 325 | } | ||
| 326 | |||
| 327 | /* Insert start entry */ | ||
| 328 | if (start_ext->ee_len) | ||
| 329 | o_start[i++].ee_len = start_ext->ee_len; | ||
| 330 | |||
| 331 | /* Insert new entry */ | ||
| 332 | if (new_ext->ee_len) { | ||
| 333 | o_start[i] = *new_ext; | ||
| 334 | ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); | ||
| 335 | } | ||
| 336 | |||
| 337 | /* Insert end entry */ | ||
| 338 | if (end_ext->ee_len) | ||
| 339 | o_start[i] = *end_ext; | ||
| 340 | |||
| 341 | /* Increment the total entries counter on the extent block */ | ||
| 342 | le16_add_cpu(&eh->eh_entries, range_to_move); | ||
| 343 | } | ||
| 344 | |||
| 345 | /** | ||
| 346 | * mext_insert_extents - Insert new extent | ||
| 347 | * | ||
| 348 | * @handle: journal handle | ||
| 349 | * @orig_inode: original inode | ||
| 350 | * @orig_path: path indicates first extent to be changed | ||
| 351 | * @o_start: first original extent to be changed | ||
| 352 | * @o_end: last original extent to be changed | ||
| 353 | * @start_ext: first new extent to be inserted | ||
| 354 | * @new_ext: middle of new extent to be inserted | ||
| 355 | * @end_ext: last new extent to be inserted | ||
| 356 | * | ||
| 357 | * Call the function to insert extents. If we cannot add more extents into | ||
| 358 | * the leaf block, we call mext_insert_across_blocks() to create a | ||
| 359 | * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 | ||
| 360 | * on success, or a negative error value on failure. | ||
| 361 | */ | ||
| 362 | static int | ||
| 363 | mext_insert_extents(handle_t *handle, struct inode *orig_inode, | ||
| 364 | struct ext4_ext_path *orig_path, | ||
| 365 | struct ext4_extent *o_start, | ||
| 366 | struct ext4_extent *o_end, | ||
| 367 | struct ext4_extent *start_ext, | ||
| 368 | struct ext4_extent *new_ext, | ||
| 369 | struct ext4_extent *end_ext) | ||
| 370 | { | ||
| 371 | struct ext4_extent_header *eh; | ||
| 372 | unsigned long need_slots, slots_range; | ||
| 373 | int range_to_move, depth, ret; | ||
| 374 | |||
| 375 | /* | ||
| 376 | * The extents need to be inserted | ||
| 377 | * start_extent + new_extent + end_extent. | ||
| 378 | */ | ||
| 379 | need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + | ||
| 380 | (new_ext->ee_len ? 1 : 0); | ||
| 381 | |||
| 382 | /* The number of slots between start and end */ | ||
| 383 | slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) | ||
| 384 | / sizeof(struct ext4_extent); | ||
| 385 | |||
| 386 | /* Range to move the end of extent */ | ||
| 387 | range_to_move = need_slots - slots_range; | ||
| 388 | depth = orig_path->p_depth; | ||
| 389 | orig_path += depth; | ||
| 390 | eh = orig_path->p_hdr; | ||
| 391 | |||
| 392 | if (depth) { | ||
| 393 | /* Register to journal */ | ||
| 394 | BUFFER_TRACE(orig_path->p_bh, "get_write_access"); | ||
| 395 | ret = ext4_journal_get_write_access(handle, orig_path->p_bh); | ||
| 396 | if (ret) | ||
| 397 | return ret; | ||
| 398 | } | ||
| 399 | |||
| 400 | /* Expansion */ | ||
| 401 | if (range_to_move > 0 && | ||
| 402 | (range_to_move > le16_to_cpu(eh->eh_max) | ||
| 403 | - le16_to_cpu(eh->eh_entries))) { | ||
| 404 | |||
| 405 | ret = mext_insert_across_blocks(handle, orig_inode, o_start, | ||
| 406 | o_end, start_ext, new_ext, end_ext); | ||
| 407 | if (ret < 0) | ||
| 408 | return ret; | ||
| 409 | } else | ||
| 410 | mext_insert_inside_block(o_start, o_end, start_ext, new_ext, | ||
| 411 | end_ext, eh, range_to_move); | ||
| 412 | |||
| 413 | return ext4_ext_dirty(handle, orig_inode, orig_path); | ||
| 414 | } | ||
| 415 | |||
| 416 | /** | ||
| 417 | * mext_leaf_block - Move one leaf extent block into the inode. | ||
| 418 | * | ||
| 419 | * @handle: journal handle | ||
| 420 | * @orig_inode: original inode | ||
| 421 | * @orig_path: path indicates first extent to be changed | ||
| 422 | * @dext: donor extent | ||
| 423 | * @from: start offset on the target file | ||
| 424 | * | ||
| 425 | * In order to insert extents into the leaf block, we must divide the extent | ||
| 426 | * in the leaf block into three extents. The one is located to be inserted | ||
| 427 | * extents, and the others are located around it. | ||
| 428 | * | ||
| 429 | * Therefore, this function creates structures to save extents of the leaf | ||
| 430 | * block, and inserts extents by calling mext_insert_extents() with | ||
| 431 | * created extents. Return 0 on success, or a negative error value on failure. | ||
| 432 | */ | ||
| 433 | static int | ||
| 434 | mext_leaf_block(handle_t *handle, struct inode *orig_inode, | ||
| 435 | struct ext4_ext_path *orig_path, struct ext4_extent *dext, | ||
| 436 | ext4_lblk_t *from) | ||
| 437 | { | ||
| 438 | struct ext4_extent *oext, *o_start, *o_end, *prev_ext; | ||
| 439 | struct ext4_extent new_ext, start_ext, end_ext; | ||
| 440 | ext4_lblk_t new_ext_end; | ||
| 441 | int oext_alen, new_ext_alen, end_ext_alen; | ||
| 442 | int depth = ext_depth(orig_inode); | ||
| 443 | int ret; | ||
| 444 | |||
| 445 | start_ext.ee_block = end_ext.ee_block = 0; | ||
| 446 | o_start = o_end = oext = orig_path[depth].p_ext; | ||
| 447 | oext_alen = ext4_ext_get_actual_len(oext); | ||
| 448 | start_ext.ee_len = end_ext.ee_len = 0; | ||
| 449 | |||
| 450 | new_ext.ee_block = cpu_to_le32(*from); | ||
| 451 | ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); | ||
| 452 | new_ext.ee_len = dext->ee_len; | ||
| 453 | new_ext_alen = ext4_ext_get_actual_len(&new_ext); | ||
| 454 | new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; | ||
| 455 | |||
| 456 | /* | ||
| 457 | * Case: original extent is first | ||
| 458 | * oext |--------| | ||
| 459 | * new_ext |--| | ||
| 460 | * start_ext |--| | ||
| 461 | */ | ||
| 462 | if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && | ||
| 463 | le32_to_cpu(new_ext.ee_block) < | ||
| 464 | le32_to_cpu(oext->ee_block) + oext_alen) { | ||
| 465 | start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - | ||
| 466 | le32_to_cpu(oext->ee_block)); | ||
| 467 | start_ext.ee_block = oext->ee_block; | ||
| 468 | copy_extent_status(oext, &start_ext); | ||
| 469 | } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { | ||
| 470 | prev_ext = oext - 1; | ||
| 471 | /* | ||
| 472 | * We can merge new_ext into previous extent, | ||
| 473 | * if these are contiguous and same extent type. | ||
| 474 | */ | ||
| 475 | if (ext4_can_extents_be_merged(orig_inode, prev_ext, | ||
| 476 | &new_ext)) { | ||
| 477 | o_start = prev_ext; | ||
| 478 | start_ext.ee_len = cpu_to_le16( | ||
| 479 | ext4_ext_get_actual_len(prev_ext) + | ||
| 480 | new_ext_alen); | ||
| 481 | start_ext.ee_block = oext->ee_block; | ||
| 482 | copy_extent_status(prev_ext, &start_ext); | ||
| 483 | new_ext.ee_len = 0; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | |||
| 487 | /* | ||
| 488 | * Case: new_ext_end must be less than oext | ||
| 489 | * oext |-----------| | ||
| 490 | * new_ext |-------| | ||
| 491 | */ | ||
| 492 | if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { | ||
| 493 | EXT4_ERROR_INODE(orig_inode, | ||
| 494 | "new_ext_end(%u) should be less than or equal to " | ||
| 495 | "oext->ee_block(%u) + oext_alen(%d) - 1", | ||
| 496 | new_ext_end, le32_to_cpu(oext->ee_block), | ||
| 497 | oext_alen); | ||
| 498 | ret = -EIO; | ||
| 499 | goto out; | ||
| 500 | } | ||
| 501 | |||
| 502 | /* | ||
| 503 | * Case: new_ext is smaller than original extent | ||
| 504 | * oext |---------------| | ||
| 505 | * new_ext |-----------| | ||
| 506 | * end_ext |---| | ||
| 507 | */ | ||
| 508 | if (le32_to_cpu(oext->ee_block) <= new_ext_end && | ||
| 509 | new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { | ||
| 510 | end_ext.ee_len = | ||
| 511 | cpu_to_le16(le32_to_cpu(oext->ee_block) + | ||
| 512 | oext_alen - 1 - new_ext_end); | ||
| 513 | copy_extent_status(oext, &end_ext); | ||
| 514 | end_ext_alen = ext4_ext_get_actual_len(&end_ext); | ||
| 515 | ext4_ext_store_pblock(&end_ext, | ||
| 516 | (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); | ||
| 517 | end_ext.ee_block = | ||
| 518 | cpu_to_le32(le32_to_cpu(o_end->ee_block) + | ||
| 519 | oext_alen - end_ext_alen); | ||
| 520 | } | ||
| 521 | |||
| 522 | ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, | ||
| 523 | o_end, &start_ext, &new_ext, &end_ext); | ||
| 524 | out: | ||
| 525 | return ret; | ||
| 526 | } | ||
| 527 | |||
| 528 | /** | ||
| 529 | * mext_calc_swap_extents - Calculate extents for extent swapping. | ||
| 530 | * | ||
| 531 | * @tmp_dext: the extent that will belong to the original inode | ||
| 532 | * @tmp_oext: the extent that will belong to the donor inode | ||
| 533 | * @orig_off: block offset of original inode | ||
| 534 | * @donor_off: block offset of donor inode | ||
| 535 | * @max_count: the maximum length of extents | ||
| 536 | * | ||
| 537 | * Return 0 on success, or a negative error value on failure. | ||
| 538 | */ | ||
| 539 | static int | ||
| 540 | mext_calc_swap_extents(struct ext4_extent *tmp_dext, | ||
| 541 | struct ext4_extent *tmp_oext, | ||
| 542 | ext4_lblk_t orig_off, ext4_lblk_t donor_off, | ||
| 543 | ext4_lblk_t max_count) | ||
| 544 | { | ||
| 545 | ext4_lblk_t diff, orig_diff; | ||
| 546 | struct ext4_extent dext_old, oext_old; | ||
| 547 | |||
| 548 | BUG_ON(orig_off != donor_off); | ||
| 549 | |||
| 550 | /* original and donor extents have to cover the same block offset */ | ||
| 551 | if (orig_off < le32_to_cpu(tmp_oext->ee_block) || | ||
| 552 | le32_to_cpu(tmp_oext->ee_block) + | ||
| 553 | ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) | ||
| 554 | return -ENODATA; | ||
| 555 | |||
| 556 | if (orig_off < le32_to_cpu(tmp_dext->ee_block) || | ||
| 557 | le32_to_cpu(tmp_dext->ee_block) + | ||
| 558 | ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) | ||
| 559 | return -ENODATA; | ||
| 560 | |||
| 561 | dext_old = *tmp_dext; | ||
| 562 | oext_old = *tmp_oext; | ||
| 563 | |||
| 564 | /* When tmp_dext is too large, pick up the target range. */ | ||
| 565 | diff = donor_off - le32_to_cpu(tmp_dext->ee_block); | ||
| 566 | |||
| 567 | ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); | ||
| 568 | le32_add_cpu(&tmp_dext->ee_block, diff); | ||
| 569 | le16_add_cpu(&tmp_dext->ee_len, -diff); | ||
| 570 | |||
| 571 | if (max_count < ext4_ext_get_actual_len(tmp_dext)) | ||
| 572 | tmp_dext->ee_len = cpu_to_le16(max_count); | ||
| 573 | |||
| 574 | orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); | ||
| 575 | ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); | ||
| 576 | |||
| 577 | /* Adjust extent length if donor extent is larger than orig */ | ||
| 578 | if (ext4_ext_get_actual_len(tmp_dext) > | ||
| 579 | ext4_ext_get_actual_len(tmp_oext) - orig_diff) | ||
| 580 | tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - | ||
| 581 | orig_diff); | ||
| 582 | |||
| 583 | tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); | ||
| 584 | |||
| 585 | copy_extent_status(&oext_old, tmp_dext); | ||
| 586 | copy_extent_status(&dext_old, tmp_oext); | ||
| 587 | |||
| 588 | return 0; | ||
| 589 | } | ||
| 590 | |||
| 591 | /** | ||
| 592 | * mext_check_coverage - Check that all extents in range has the same type | 87 | * mext_check_coverage - Check that all extents in range has the same type |
| 593 | * | 88 | * |
| 594 | * @inode: inode in question | 89 | * @inode: inode in question |
| @@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, | |||
| 619 | } | 114 | } |
| 620 | ret = 1; | 115 | ret = 1; |
| 621 | out: | 116 | out: |
| 622 | if (path) { | 117 | ext4_ext_drop_refs(path); |
| 623 | ext4_ext_drop_refs(path); | 118 | kfree(path); |
| 624 | kfree(path); | ||
| 625 | } | ||
| 626 | return ret; | 119 | return ret; |
| 627 | } | 120 | } |
| 628 | 121 | ||
| 629 | /** | 122 | /** |
| 630 | * mext_replace_branches - Replace original extents with new extents | ||
| 631 | * | ||
| 632 | * @handle: journal handle | ||
| 633 | * @orig_inode: original inode | ||
| 634 | * @donor_inode: donor inode | ||
| 635 | * @from: block offset of orig_inode | ||
| 636 | * @count: block count to be replaced | ||
| 637 | * @err: pointer to save return value | ||
| 638 | * | ||
| 639 | * Replace original inode extents and donor inode extents page by page. | ||
| 640 | * We implement this replacement in the following three steps: | ||
| 641 | * 1. Save the block information of original and donor inodes into | ||
| 642 | * dummy extents. | ||
| 643 | * 2. Change the block information of original inode to point at the | ||
| 644 | * donor inode blocks. | ||
| 645 | * 3. Change the block information of donor inode to point at the saved | ||
| 646 | * original inode blocks in the dummy extents. | ||
| 647 | * | ||
| 648 | * Return replaced block count. | ||
| 649 | */ | ||
| 650 | static int | ||
| 651 | mext_replace_branches(handle_t *handle, struct inode *orig_inode, | ||
| 652 | struct inode *donor_inode, ext4_lblk_t from, | ||
| 653 | ext4_lblk_t count, int *err) | ||
| 654 | { | ||
| 655 | struct ext4_ext_path *orig_path = NULL; | ||
| 656 | struct ext4_ext_path *donor_path = NULL; | ||
| 657 | struct ext4_extent *oext, *dext; | ||
| 658 | struct ext4_extent tmp_dext, tmp_oext; | ||
| 659 | ext4_lblk_t orig_off = from, donor_off = from; | ||
| 660 | int depth; | ||
| 661 | int replaced_count = 0; | ||
| 662 | int dext_alen; | ||
| 663 | |||
| 664 | *err = ext4_es_remove_extent(orig_inode, from, count); | ||
| 665 | if (*err) | ||
| 666 | goto out; | ||
| 667 | |||
| 668 | *err = ext4_es_remove_extent(donor_inode, from, count); | ||
| 669 | if (*err) | ||
| 670 | goto out; | ||
| 671 | |||
| 672 | /* Get the original extent for the block "orig_off" */ | ||
| 673 | *err = get_ext_path(orig_inode, orig_off, &orig_path); | ||
| 674 | if (*err) | ||
| 675 | goto out; | ||
| 676 | |||
| 677 | /* Get the donor extent for the head */ | ||
| 678 | *err = get_ext_path(donor_inode, donor_off, &donor_path); | ||
| 679 | if (*err) | ||
| 680 | goto out; | ||
| 681 | depth = ext_depth(orig_inode); | ||
| 682 | oext = orig_path[depth].p_ext; | ||
| 683 | tmp_oext = *oext; | ||
| 684 | |||
| 685 | depth = ext_depth(donor_inode); | ||
| 686 | dext = donor_path[depth].p_ext; | ||
| 687 | if (unlikely(!dext)) | ||
| 688 | goto missing_donor_extent; | ||
| 689 | tmp_dext = *dext; | ||
| 690 | |||
| 691 | *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | ||
| 692 | donor_off, count); | ||
| 693 | if (*err) | ||
| 694 | goto out; | ||
| 695 | |||
| 696 | /* Loop for the donor extents */ | ||
| 697 | while (1) { | ||
| 698 | /* The extent for donor must be found. */ | ||
| 699 | if (unlikely(!dext)) { | ||
| 700 | missing_donor_extent: | ||
| 701 | EXT4_ERROR_INODE(donor_inode, | ||
| 702 | "The extent for donor must be found"); | ||
| 703 | *err = -EIO; | ||
| 704 | goto out; | ||
| 705 | } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { | ||
| 706 | EXT4_ERROR_INODE(donor_inode, | ||
| 707 | "Donor offset(%u) and the first block of donor " | ||
| 708 | "extent(%u) should be equal", | ||
| 709 | donor_off, | ||
| 710 | le32_to_cpu(tmp_dext.ee_block)); | ||
| 711 | *err = -EIO; | ||
| 712 | goto out; | ||
| 713 | } | ||
| 714 | |||
| 715 | /* Set donor extent to orig extent */ | ||
| 716 | *err = mext_leaf_block(handle, orig_inode, | ||
| 717 | orig_path, &tmp_dext, &orig_off); | ||
| 718 | if (*err) | ||
| 719 | goto out; | ||
| 720 | |||
| 721 | /* Set orig extent to donor extent */ | ||
| 722 | *err = mext_leaf_block(handle, donor_inode, | ||
| 723 | donor_path, &tmp_oext, &donor_off); | ||
| 724 | if (*err) | ||
| 725 | goto out; | ||
| 726 | |||
| 727 | dext_alen = ext4_ext_get_actual_len(&tmp_dext); | ||
| 728 | replaced_count += dext_alen; | ||
| 729 | donor_off += dext_alen; | ||
| 730 | orig_off += dext_alen; | ||
| 731 | |||
| 732 | BUG_ON(replaced_count > count); | ||
| 733 | /* Already moved the expected blocks */ | ||
| 734 | if (replaced_count >= count) | ||
| 735 | break; | ||
| 736 | |||
| 737 | if (orig_path) | ||
| 738 | ext4_ext_drop_refs(orig_path); | ||
| 739 | *err = get_ext_path(orig_inode, orig_off, &orig_path); | ||
| 740 | if (*err) | ||
| 741 | goto out; | ||
| 742 | depth = ext_depth(orig_inode); | ||
| 743 | oext = orig_path[depth].p_ext; | ||
| 744 | tmp_oext = *oext; | ||
| 745 | |||
| 746 | if (donor_path) | ||
| 747 | ext4_ext_drop_refs(donor_path); | ||
| 748 | *err = get_ext_path(donor_inode, donor_off, &donor_path); | ||
| 749 | if (*err) | ||
| 750 | goto out; | ||
| 751 | depth = ext_depth(donor_inode); | ||
| 752 | dext = donor_path[depth].p_ext; | ||
| 753 | tmp_dext = *dext; | ||
| 754 | |||
| 755 | *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | ||
| 756 | donor_off, count - replaced_count); | ||
| 757 | if (*err) | ||
| 758 | goto out; | ||
| 759 | } | ||
| 760 | |||
| 761 | out: | ||
| 762 | if (orig_path) { | ||
| 763 | ext4_ext_drop_refs(orig_path); | ||
| 764 | kfree(orig_path); | ||
| 765 | } | ||
| 766 | if (donor_path) { | ||
| 767 | ext4_ext_drop_refs(donor_path); | ||
| 768 | kfree(donor_path); | ||
| 769 | } | ||
| 770 | |||
| 771 | return replaced_count; | ||
| 772 | } | ||
| 773 | |||
| 774 | /** | ||
| 775 | * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 | 123 | * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 |
| 776 | * | 124 | * |
| 777 | * @inode1: the inode structure | 125 | * @inode1: the inode structure |
| 778 | * @inode2: the inode structure | 126 | * @inode2: the inode structure |
| 779 | * @index: page index | 127 | * @index1: page index |
| 128 | * @index2: page index | ||
| 780 | * @page: result page vector | 129 | * @page: result page vector |
| 781 | * | 130 | * |
| 782 | * Grab two locked pages for inode's by inode order | 131 | * Grab two locked pages for inode's by inode order |
| 783 | */ | 132 | */ |
| 784 | static int | 133 | static int |
| 785 | mext_page_double_lock(struct inode *inode1, struct inode *inode2, | 134 | mext_page_double_lock(struct inode *inode1, struct inode *inode2, |
| 786 | pgoff_t index, struct page *page[2]) | 135 | pgoff_t index1, pgoff_t index2, struct page *page[2]) |
| 787 | { | 136 | { |
| 788 | struct address_space *mapping[2]; | 137 | struct address_space *mapping[2]; |
| 789 | unsigned fl = AOP_FLAG_NOFS; | 138 | unsigned fl = AOP_FLAG_NOFS; |
| @@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, | |||
| 793 | mapping[0] = inode1->i_mapping; | 142 | mapping[0] = inode1->i_mapping; |
| 794 | mapping[1] = inode2->i_mapping; | 143 | mapping[1] = inode2->i_mapping; |
| 795 | } else { | 144 | } else { |
| 145 | pgoff_t tmp = index1; | ||
| 146 | index1 = index2; | ||
| 147 | index2 = tmp; | ||
| 796 | mapping[0] = inode2->i_mapping; | 148 | mapping[0] = inode2->i_mapping; |
| 797 | mapping[1] = inode1->i_mapping; | 149 | mapping[1] = inode1->i_mapping; |
| 798 | } | 150 | } |
| 799 | 151 | ||
| 800 | page[0] = grab_cache_page_write_begin(mapping[0], index, fl); | 152 | page[0] = grab_cache_page_write_begin(mapping[0], index1, fl); |
| 801 | if (!page[0]) | 153 | if (!page[0]) |
| 802 | return -ENOMEM; | 154 | return -ENOMEM; |
| 803 | 155 | ||
| 804 | page[1] = grab_cache_page_write_begin(mapping[1], index, fl); | 156 | page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); |
| 805 | if (!page[1]) { | 157 | if (!page[1]) { |
| 806 | unlock_page(page[0]); | 158 | unlock_page(page[0]); |
| 807 | page_cache_release(page[0]); | 159 | page_cache_release(page[0]); |
| @@ -893,25 +245,27 @@ out: | |||
| 893 | * @o_filp: file structure of original file | 245 | * @o_filp: file structure of original file |
| 894 | * @donor_inode: donor inode | 246 | * @donor_inode: donor inode |
| 895 | * @orig_page_offset: page index on original file | 247 | * @orig_page_offset: page index on original file |
| 248 | * @donor_page_offset: page index on donor file | ||
| 896 | * @data_offset_in_page: block index where data swapping starts | 249 | * @data_offset_in_page: block index where data swapping starts |
| 897 | * @block_len_in_page: the number of blocks to be swapped | 250 | * @block_len_in_page: the number of blocks to be swapped |
| 898 | * @unwritten: orig extent is unwritten or not | 251 | * @unwritten: orig extent is unwritten or not |
| 899 | * @err: pointer to save return value | 252 | * @err: pointer to save return value |
| 900 | * | 253 | * |
| 901 | * Save the data in original inode blocks and replace original inode extents | 254 | * Save the data in original inode blocks and replace original inode extents |
| 902 | * with donor inode extents by calling mext_replace_branches(). | 255 | * with donor inode extents by calling ext4_swap_extents(). |
| 903 | * Finally, write out the saved data in new original inode blocks. Return | 256 | * Finally, write out the saved data in new original inode blocks. Return |
| 904 | * replaced block count. | 257 | * replaced block count. |
| 905 | */ | 258 | */ |
| 906 | static int | 259 | static int |
| 907 | move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | 260 | move_extent_per_page(struct file *o_filp, struct inode *donor_inode, |
| 908 | pgoff_t orig_page_offset, int data_offset_in_page, | 261 | pgoff_t orig_page_offset, pgoff_t donor_page_offset, |
| 909 | int block_len_in_page, int unwritten, int *err) | 262 | int data_offset_in_page, |
| 263 | int block_len_in_page, int unwritten, int *err) | ||
| 910 | { | 264 | { |
| 911 | struct inode *orig_inode = file_inode(o_filp); | 265 | struct inode *orig_inode = file_inode(o_filp); |
| 912 | struct page *pagep[2] = {NULL, NULL}; | 266 | struct page *pagep[2] = {NULL, NULL}; |
| 913 | handle_t *handle; | 267 | handle_t *handle; |
| 914 | ext4_lblk_t orig_blk_offset; | 268 | ext4_lblk_t orig_blk_offset, donor_blk_offset; |
| 915 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; | 269 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; |
| 916 | unsigned int w_flags = 0; | 270 | unsigned int w_flags = 0; |
| 917 | unsigned int tmp_data_size, data_size, replaced_size; | 271 | unsigned int tmp_data_size, data_size, replaced_size; |
| @@ -939,6 +293,9 @@ again: | |||
| 939 | orig_blk_offset = orig_page_offset * blocks_per_page + | 293 | orig_blk_offset = orig_page_offset * blocks_per_page + |
| 940 | data_offset_in_page; | 294 | data_offset_in_page; |
| 941 | 295 | ||
| 296 | donor_blk_offset = donor_page_offset * blocks_per_page + | ||
| 297 | data_offset_in_page; | ||
| 298 | |||
| 942 | /* Calculate data_size */ | 299 | /* Calculate data_size */ |
| 943 | if ((orig_blk_offset + block_len_in_page - 1) == | 300 | if ((orig_blk_offset + block_len_in_page - 1) == |
| 944 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { | 301 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { |
| @@ -959,7 +316,7 @@ again: | |||
| 959 | replaced_size = data_size; | 316 | replaced_size = data_size; |
| 960 | 317 | ||
| 961 | *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, | 318 | *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, |
| 962 | pagep); | 319 | donor_page_offset, pagep); |
| 963 | if (unlikely(*err < 0)) | 320 | if (unlikely(*err < 0)) |
| 964 | goto stop_journal; | 321 | goto stop_journal; |
| 965 | /* | 322 | /* |
| @@ -978,7 +335,7 @@ again: | |||
| 978 | if (*err) | 335 | if (*err) |
| 979 | goto drop_data_sem; | 336 | goto drop_data_sem; |
| 980 | 337 | ||
| 981 | unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, | 338 | unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, |
| 982 | block_len_in_page, 1, err); | 339 | block_len_in_page, 1, err); |
| 983 | if (*err) | 340 | if (*err) |
| 984 | goto drop_data_sem; | 341 | goto drop_data_sem; |
| @@ -994,9 +351,10 @@ again: | |||
| 994 | *err = -EBUSY; | 351 | *err = -EBUSY; |
| 995 | goto drop_data_sem; | 352 | goto drop_data_sem; |
| 996 | } | 353 | } |
| 997 | replaced_count = mext_replace_branches(handle, orig_inode, | 354 | replaced_count = ext4_swap_extents(handle, orig_inode, |
| 998 | donor_inode, orig_blk_offset, | 355 | donor_inode, orig_blk_offset, |
| 999 | block_len_in_page, err); | 356 | donor_blk_offset, |
| 357 | block_len_in_page, 1, err); | ||
| 1000 | drop_data_sem: | 358 | drop_data_sem: |
| 1001 | ext4_double_up_write_data_sem(orig_inode, donor_inode); | 359 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 1002 | goto unlock_pages; | 360 | goto unlock_pages; |
| @@ -1014,9 +372,9 @@ data_copy: | |||
| 1014 | goto unlock_pages; | 372 | goto unlock_pages; |
| 1015 | } | 373 | } |
| 1016 | ext4_double_down_write_data_sem(orig_inode, donor_inode); | 374 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
| 1017 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, | 375 | replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, |
| 1018 | orig_blk_offset, | 376 | orig_blk_offset, donor_blk_offset, |
| 1019 | block_len_in_page, err); | 377 | block_len_in_page, 1, err); |
| 1020 | ext4_double_up_write_data_sem(orig_inode, donor_inode); | 378 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 1021 | if (*err) { | 379 | if (*err) { |
| 1022 | if (replaced_count) { | 380 | if (replaced_count) { |
| @@ -1061,9 +419,9 @@ repair_branches: | |||
| 1061 | * Try to swap extents to it's original places | 419 | * Try to swap extents to it's original places |
| 1062 | */ | 420 | */ |
| 1063 | ext4_double_down_write_data_sem(orig_inode, donor_inode); | 421 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
| 1064 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, | 422 | replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, |
| 1065 | orig_blk_offset, | 423 | orig_blk_offset, donor_blk_offset, |
| 1066 | block_len_in_page, &err2); | 424 | block_len_in_page, 0, &err2); |
| 1067 | ext4_double_up_write_data_sem(orig_inode, donor_inode); | 425 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 1068 | if (replaced_count != block_len_in_page) { | 426 | if (replaced_count != block_len_in_page) { |
| 1069 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), | 427 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), |
| @@ -1093,10 +451,14 @@ mext_check_arguments(struct inode *orig_inode, | |||
| 1093 | struct inode *donor_inode, __u64 orig_start, | 451 | struct inode *donor_inode, __u64 orig_start, |
| 1094 | __u64 donor_start, __u64 *len) | 452 | __u64 donor_start, __u64 *len) |
| 1095 | { | 453 | { |
| 1096 | ext4_lblk_t orig_blocks, donor_blocks; | 454 | __u64 orig_eof, donor_eof; |
| 1097 | unsigned int blkbits = orig_inode->i_blkbits; | 455 | unsigned int blkbits = orig_inode->i_blkbits; |
| 1098 | unsigned int blocksize = 1 << blkbits; | 456 | unsigned int blocksize = 1 << blkbits; |
| 1099 | 457 | ||
| 458 | orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; | ||
| 459 | donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; | ||
| 460 | |||
| 461 | |||
| 1100 | if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { | 462 | if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { |
| 1101 | ext4_debug("ext4 move extent: suid or sgid is set" | 463 | ext4_debug("ext4 move extent: suid or sgid is set" |
| 1102 | " to donor file [ino:orig %lu, donor %lu]\n", | 464 | " to donor file [ino:orig %lu, donor %lu]\n", |
| @@ -1112,7 +474,7 @@ mext_check_arguments(struct inode *orig_inode, | |||
| 1112 | ext4_debug("ext4 move extent: The argument files should " | 474 | ext4_debug("ext4 move extent: The argument files should " |
| 1113 | "not be swapfile [ino:orig %lu, donor %lu]\n", | 475 | "not be swapfile [ino:orig %lu, donor %lu]\n", |
| 1114 | orig_inode->i_ino, donor_inode->i_ino); | 476 | orig_inode->i_ino, donor_inode->i_ino); |
| 1115 | return -EINVAL; | 477 | return -EBUSY; |
| 1116 | } | 478 | } |
| 1117 | 479 | ||
| 1118 | /* Ext4 move extent supports only extent based file */ | 480 | /* Ext4 move extent supports only extent based file */ |
| @@ -1132,67 +494,28 @@ mext_check_arguments(struct inode *orig_inode, | |||
| 1132 | } | 494 | } |
| 1133 | 495 | ||
| 1134 | /* Start offset should be same */ | 496 | /* Start offset should be same */ |
| 1135 | if (orig_start != donor_start) { | 497 | if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != |
| 498 | (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { | ||
| 1136 | ext4_debug("ext4 move extent: orig and donor's start " | 499 | ext4_debug("ext4 move extent: orig and donor's start " |
| 1137 | "offset are not same [ino:orig %lu, donor %lu]\n", | 500 | "offset are not alligned [ino:orig %lu, donor %lu]\n", |
| 1138 | orig_inode->i_ino, donor_inode->i_ino); | 501 | orig_inode->i_ino, donor_inode->i_ino); |
| 1139 | return -EINVAL; | 502 | return -EINVAL; |
| 1140 | } | 503 | } |
| 1141 | 504 | ||
| 1142 | if ((orig_start >= EXT_MAX_BLOCKS) || | 505 | if ((orig_start >= EXT_MAX_BLOCKS) || |
| 506 | (donor_start >= EXT_MAX_BLOCKS) || | ||
| 1143 | (*len > EXT_MAX_BLOCKS) || | 507 | (*len > EXT_MAX_BLOCKS) || |
| 508 | (donor_start + *len >= EXT_MAX_BLOCKS) || | ||
| 1144 | (orig_start + *len >= EXT_MAX_BLOCKS)) { | 509 | (orig_start + *len >= EXT_MAX_BLOCKS)) { |
| 1145 | ext4_debug("ext4 move extent: Can't handle over [%u] blocks " | 510 | ext4_debug("ext4 move extent: Can't handle over [%u] blocks " |
| 1146 | "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, | 511 | "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, |
| 1147 | orig_inode->i_ino, donor_inode->i_ino); | 512 | orig_inode->i_ino, donor_inode->i_ino); |
| 1148 | return -EINVAL; | 513 | return -EINVAL; |
| 1149 | } | 514 | } |
| 1150 | 515 | if (orig_eof < orig_start + *len - 1) | |
| 1151 | if (orig_inode->i_size > donor_inode->i_size) { | 516 | *len = orig_eof - orig_start; |
| 1152 | donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; | 517 | if (donor_eof < donor_start + *len - 1) |
| 1153 | /* TODO: eliminate this artificial restriction */ | 518 | *len = donor_eof - donor_start; |
| 1154 | if (orig_start >= donor_blocks) { | ||
| 1155 | ext4_debug("ext4 move extent: orig start offset " | ||
| 1156 | "[%llu] should be less than donor file blocks " | ||
| 1157 | "[%u] [ino:orig %lu, donor %lu]\n", | ||
| 1158 | orig_start, donor_blocks, | ||
| 1159 | orig_inode->i_ino, donor_inode->i_ino); | ||
| 1160 | return -EINVAL; | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | /* TODO: eliminate this artificial restriction */ | ||
| 1164 | if (orig_start + *len > donor_blocks) { | ||
| 1165 | ext4_debug("ext4 move extent: End offset [%llu] should " | ||
| 1166 | "be less than donor file blocks [%u]." | ||
| 1167 | "So adjust length from %llu to %llu " | ||
| 1168 | "[ino:orig %lu, donor %lu]\n", | ||
| 1169 | orig_start + *len, donor_blocks, | ||
| 1170 | *len, donor_blocks - orig_start, | ||
| 1171 | orig_inode->i_ino, donor_inode->i_ino); | ||
| 1172 | *len = donor_blocks - orig_start; | ||
| 1173 | } | ||
| 1174 | } else { | ||
| 1175 | orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; | ||
| 1176 | if (orig_start >= orig_blocks) { | ||
| 1177 | ext4_debug("ext4 move extent: start offset [%llu] " | ||
| 1178 | "should be less than original file blocks " | ||
| 1179 | "[%u] [ino:orig %lu, donor %lu]\n", | ||
| 1180 | orig_start, orig_blocks, | ||
| 1181 | orig_inode->i_ino, donor_inode->i_ino); | ||
| 1182 | return -EINVAL; | ||
| 1183 | } | ||
| 1184 | |||
| 1185 | if (orig_start + *len > orig_blocks) { | ||
| 1186 | ext4_debug("ext4 move extent: Adjust length " | ||
| 1187 | "from %llu to %llu. Because it should be " | ||
| 1188 | "less than original file blocks " | ||
| 1189 | "[ino:orig %lu, donor %lu]\n", | ||
| 1190 | *len, orig_blocks - orig_start, | ||
| 1191 | orig_inode->i_ino, donor_inode->i_ino); | ||
| 1192 | *len = orig_blocks - orig_start; | ||
| 1193 | } | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | if (!*len) { | 519 | if (!*len) { |
| 1197 | ext4_debug("ext4 move extent: len should not be 0 " | 520 | ext4_debug("ext4 move extent: len should not be 0 " |
| 1198 | "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, | 521 | "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, |
| @@ -1208,60 +531,26 @@ mext_check_arguments(struct inode *orig_inode, | |||
| 1208 | * | 531 | * |
| 1209 | * @o_filp: file structure of the original file | 532 | * @o_filp: file structure of the original file |
| 1210 | * @d_filp: file structure of the donor file | 533 | * @d_filp: file structure of the donor file |
| 1211 | * @orig_start: start offset in block for orig | 534 | * @orig_blk: start offset in block for orig |
| 1212 | * @donor_start: start offset in block for donor | 535 | * @donor_blk: start offset in block for donor |
| 1213 | * @len: the number of blocks to be moved | 536 | * @len: the number of blocks to be moved |
| 1214 | * @moved_len: moved block length | 537 | * @moved_len: moved block length |
| 1215 | * | 538 | * |
| 1216 | * This function returns 0 and moved block length is set in moved_len | 539 | * This function returns 0 and moved block length is set in moved_len |
| 1217 | * if succeed, otherwise returns error value. | 540 | * if succeed, otherwise returns error value. |
| 1218 | * | 541 | * |
| 1219 | * Note: ext4_move_extents() proceeds the following order. | ||
| 1220 | * 1:ext4_move_extents() calculates the last block number of moving extent | ||
| 1221 | * function by the start block number (orig_start) and the number of blocks | ||
| 1222 | * to be moved (len) specified as arguments. | ||
| 1223 | * If the {orig, donor}_start points a hole, the extent's start offset | ||
| 1224 | * pointed by ext_cur (current extent), holecheck_path, orig_path are set | ||
| 1225 | * after hole behind. | ||
| 1226 | * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent | ||
| 1227 | * or the ext_cur exceeds the block_end which is last logical block number. | ||
| 1228 | * 3:To get the length of continues area, call mext_next_extent() | ||
| 1229 | * specified with the ext_cur (initial value is holecheck_path) re-cursive, | ||
| 1230 | * until find un-continuous extent, the start logical block number exceeds | ||
| 1231 | * the block_end or the extent points to the last extent. | ||
| 1232 | * 4:Exchange the original inode data with donor inode data | ||
| 1233 | * from orig_page_offset to seq_end_page. | ||
| 1234 | * The start indexes of data are specified as arguments. | ||
| 1235 | * That of the original inode is orig_page_offset, | ||
| 1236 | * and the donor inode is also orig_page_offset | ||
| 1237 | * (To easily handle blocksize != pagesize case, the offset for the | ||
| 1238 | * donor inode is block unit). | ||
| 1239 | * 5:Update holecheck_path and orig_path to points a next proceeding extent, | ||
| 1240 | * then returns to step 2. | ||
| 1241 | * 6:Release holecheck_path, orig_path and set the len to moved_len | ||
| 1242 | * which shows the number of moved blocks. | ||
| 1243 | * The moved_len is useful for the command to calculate the file offset | ||
| 1244 | * for starting next move extent ioctl. | ||
| 1245 | * 7:Return 0 on success, or a negative error value on failure. | ||
| 1246 | */ | 542 | */ |
| 1247 | int | 543 | int |
| 1248 | ext4_move_extents(struct file *o_filp, struct file *d_filp, | 544 | ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, |
| 1249 | __u64 orig_start, __u64 donor_start, __u64 len, | 545 | __u64 donor_blk, __u64 len, __u64 *moved_len) |
| 1250 | __u64 *moved_len) | ||
| 1251 | { | 546 | { |
| 1252 | struct inode *orig_inode = file_inode(o_filp); | 547 | struct inode *orig_inode = file_inode(o_filp); |
| 1253 | struct inode *donor_inode = file_inode(d_filp); | 548 | struct inode *donor_inode = file_inode(d_filp); |
| 1254 | struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; | 549 | struct ext4_ext_path *path = NULL; |
| 1255 | struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; | ||
| 1256 | ext4_lblk_t block_start = orig_start; | ||
| 1257 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; | ||
| 1258 | ext4_lblk_t rest_blocks; | ||
| 1259 | pgoff_t orig_page_offset = 0, seq_end_page; | ||
| 1260 | int ret, depth, last_extent = 0; | ||
| 1261 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | 550 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
| 1262 | int data_offset_in_page; | 551 | ext4_lblk_t o_end, o_start = orig_blk; |
| 1263 | int block_len_in_page; | 552 | ext4_lblk_t d_start = donor_blk; |
| 1264 | int unwritten; | 553 | int ret; |
| 1265 | 554 | ||
| 1266 | if (orig_inode->i_sb != donor_inode->i_sb) { | 555 | if (orig_inode->i_sb != donor_inode->i_sb) { |
| 1267 | ext4_debug("ext4 move extent: The argument files " | 556 | ext4_debug("ext4 move extent: The argument files " |
| @@ -1303,121 +592,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
| 1303 | /* Protect extent tree against block allocations via delalloc */ | 592 | /* Protect extent tree against block allocations via delalloc */ |
| 1304 | ext4_double_down_write_data_sem(orig_inode, donor_inode); | 593 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
| 1305 | /* Check the filesystem environment whether move_extent can be done */ | 594 | /* Check the filesystem environment whether move_extent can be done */ |
| 1306 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, | 595 | ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, |
| 1307 | donor_start, &len); | 596 | donor_blk, &len); |
| 1308 | if (ret) | 597 | if (ret) |
| 1309 | goto out; | 598 | goto out; |
| 599 | o_end = o_start + len; | ||
| 1310 | 600 | ||
| 1311 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; | 601 | while (o_start < o_end) { |
| 1312 | block_end = block_start + len - 1; | 602 | struct ext4_extent *ex; |
| 1313 | if (file_end < block_end) | 603 | ext4_lblk_t cur_blk, next_blk; |
| 1314 | len -= block_end - file_end; | 604 | pgoff_t orig_page_index, donor_page_index; |
| 605 | int offset_in_page; | ||
| 606 | int unwritten, cur_len; | ||
| 1315 | 607 | ||
| 1316 | ret = get_ext_path(orig_inode, block_start, &orig_path); | 608 | ret = get_ext_path(orig_inode, o_start, &path); |
| 1317 | if (ret) | 609 | if (ret) |
| 1318 | goto out; | ||
| 1319 | |||
| 1320 | /* Get path structure to check the hole */ | ||
| 1321 | ret = get_ext_path(orig_inode, block_start, &holecheck_path); | ||
| 1322 | if (ret) | ||
| 1323 | goto out; | ||
| 1324 | |||
| 1325 | depth = ext_depth(orig_inode); | ||
| 1326 | ext_cur = holecheck_path[depth].p_ext; | ||
| 1327 | |||
| 1328 | /* | ||
| 1329 | * Get proper starting location of block replacement if block_start was | ||
| 1330 | * within the hole. | ||
| 1331 | */ | ||
| 1332 | if (le32_to_cpu(ext_cur->ee_block) + | ||
| 1333 | ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { | ||
| 1334 | /* | ||
| 1335 | * The hole exists between extents or the tail of | ||
| 1336 | * original file. | ||
| 1337 | */ | ||
| 1338 | last_extent = mext_next_extent(orig_inode, | ||
| 1339 | holecheck_path, &ext_cur); | ||
| 1340 | if (last_extent < 0) { | ||
| 1341 | ret = last_extent; | ||
| 1342 | goto out; | ||
| 1343 | } | ||
| 1344 | last_extent = mext_next_extent(orig_inode, orig_path, | ||
| 1345 | &ext_dummy); | ||
| 1346 | if (last_extent < 0) { | ||
| 1347 | ret = last_extent; | ||
| 1348 | goto out; | 610 | goto out; |
| 1349 | } | 611 | ex = path[path->p_depth].p_ext; |
| 1350 | seq_start = le32_to_cpu(ext_cur->ee_block); | 612 | next_blk = ext4_ext_next_allocated_block(path); |
| 1351 | } else if (le32_to_cpu(ext_cur->ee_block) > block_start) | 613 | cur_blk = le32_to_cpu(ex->ee_block); |
| 1352 | /* The hole exists at the beginning of original file. */ | 614 | cur_len = ext4_ext_get_actual_len(ex); |
| 1353 | seq_start = le32_to_cpu(ext_cur->ee_block); | 615 | /* Check hole before the start pos */ |
| 1354 | else | 616 | if (cur_blk + cur_len - 1 < o_start) { |
| 1355 | seq_start = block_start; | 617 | if (next_blk == EXT_MAX_BLOCKS) { |
| 1356 | 618 | o_start = o_end; | |
| 1357 | /* No blocks within the specified range. */ | 619 | ret = -ENODATA; |
| 1358 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { | 620 | goto out; |
| 1359 | ext4_debug("ext4 move extent: The specified range of file " | 621 | } |
| 1360 | "may be the hole\n"); | 622 | d_start += next_blk - o_start; |
| 1361 | ret = -EINVAL; | 623 | o_start = next_blk; |
| 1362 | goto out; | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | /* Adjust start blocks */ | ||
| 1366 | add_blocks = min(le32_to_cpu(ext_cur->ee_block) + | ||
| 1367 | ext4_ext_get_actual_len(ext_cur), block_end + 1) - | ||
| 1368 | max(le32_to_cpu(ext_cur->ee_block), block_start); | ||
| 1369 | |||
| 1370 | while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { | ||
| 1371 | seq_blocks += add_blocks; | ||
| 1372 | |||
| 1373 | /* Adjust tail blocks */ | ||
| 1374 | if (seq_start + seq_blocks - 1 > block_end) | ||
| 1375 | seq_blocks = block_end - seq_start + 1; | ||
| 1376 | |||
| 1377 | ext_prev = ext_cur; | ||
| 1378 | last_extent = mext_next_extent(orig_inode, holecheck_path, | ||
| 1379 | &ext_cur); | ||
| 1380 | if (last_extent < 0) { | ||
| 1381 | ret = last_extent; | ||
| 1382 | break; | ||
| 1383 | } | ||
| 1384 | add_blocks = ext4_ext_get_actual_len(ext_cur); | ||
| 1385 | |||
| 1386 | /* | ||
| 1387 | * Extend the length of contiguous block (seq_blocks) | ||
| 1388 | * if extents are contiguous. | ||
| 1389 | */ | ||
| 1390 | if (ext4_can_extents_be_merged(orig_inode, | ||
| 1391 | ext_prev, ext_cur) && | ||
| 1392 | block_end >= le32_to_cpu(ext_cur->ee_block) && | ||
| 1393 | !last_extent) | ||
| 1394 | continue; | 624 | continue; |
| 1395 | 625 | /* Check hole after the start pos */ | |
| 1396 | /* Is original extent is unwritten */ | 626 | } else if (cur_blk > o_start) { |
| 1397 | unwritten = ext4_ext_is_unwritten(ext_prev); | 627 | /* Skip hole */ |
| 1398 | 628 | d_start += cur_blk - o_start; | |
| 1399 | data_offset_in_page = seq_start % blocks_per_page; | 629 | o_start = cur_blk; |
| 1400 | 630 | /* Extent inside requested range ?*/ | |
| 1401 | /* | 631 | if (cur_blk >= o_end) |
| 1402 | * Calculate data blocks count that should be swapped | 632 | goto out; |
| 1403 | * at the first page. | 633 | } else { /* in_range(o_start, o_blk, o_len) */ |
| 1404 | */ | 634 | cur_len += cur_blk - o_start; |
| 1405 | if (data_offset_in_page + seq_blocks > blocks_per_page) { | ||
| 1406 | /* Swapped blocks are across pages */ | ||
| 1407 | block_len_in_page = | ||
| 1408 | blocks_per_page - data_offset_in_page; | ||
| 1409 | } else { | ||
| 1410 | /* Swapped blocks are in a page */ | ||
| 1411 | block_len_in_page = seq_blocks; | ||
| 1412 | } | 635 | } |
| 1413 | 636 | unwritten = ext4_ext_is_unwritten(ex); | |
| 1414 | orig_page_offset = seq_start >> | 637 | if (o_end - o_start < cur_len) |
| 1415 | (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); | 638 | cur_len = o_end - o_start; |
| 1416 | seq_end_page = (seq_start + seq_blocks - 1) >> | 639 | |
| 1417 | (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); | 640 | orig_page_index = o_start >> (PAGE_CACHE_SHIFT - |
| 1418 | seq_start = le32_to_cpu(ext_cur->ee_block); | 641 | orig_inode->i_blkbits); |
| 1419 | rest_blocks = seq_blocks; | 642 | donor_page_index = d_start >> (PAGE_CACHE_SHIFT - |
| 1420 | 643 | donor_inode->i_blkbits); | |
| 644 | offset_in_page = o_start % blocks_per_page; | ||
| 645 | if (cur_len > blocks_per_page- offset_in_page) | ||
| 646 | cur_len = blocks_per_page - offset_in_page; | ||
| 1421 | /* | 647 | /* |
| 1422 | * Up semaphore to avoid following problems: | 648 | * Up semaphore to avoid following problems: |
| 1423 | * a. transaction deadlock among ext4_journal_start, | 649 | * a. transaction deadlock among ext4_journal_start, |
| @@ -1426,77 +652,29 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
| 1426 | * in move_extent_per_page | 652 | * in move_extent_per_page |
| 1427 | */ | 653 | */ |
| 1428 | ext4_double_up_write_data_sem(orig_inode, donor_inode); | 654 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 1429 | 655 | /* Swap original branches with new branches */ | |
| 1430 | while (orig_page_offset <= seq_end_page) { | 656 | move_extent_per_page(o_filp, donor_inode, |
| 1431 | 657 | orig_page_index, donor_page_index, | |
| 1432 | /* Swap original branches with new branches */ | 658 | offset_in_page, cur_len, |
| 1433 | block_len_in_page = move_extent_per_page( | 659 | unwritten, &ret); |
| 1434 | o_filp, donor_inode, | ||
| 1435 | orig_page_offset, | ||
| 1436 | data_offset_in_page, | ||
| 1437 | block_len_in_page, | ||
| 1438 | unwritten, &ret); | ||
| 1439 | |||
| 1440 | /* Count how many blocks we have exchanged */ | ||
| 1441 | *moved_len += block_len_in_page; | ||
| 1442 | if (ret < 0) | ||
| 1443 | break; | ||
| 1444 | if (*moved_len > len) { | ||
| 1445 | EXT4_ERROR_INODE(orig_inode, | ||
| 1446 | "We replaced blocks too much! " | ||
| 1447 | "sum of replaced: %llu requested: %llu", | ||
| 1448 | *moved_len, len); | ||
| 1449 | ret = -EIO; | ||
| 1450 | break; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | orig_page_offset++; | ||
| 1454 | data_offset_in_page = 0; | ||
| 1455 | rest_blocks -= block_len_in_page; | ||
| 1456 | if (rest_blocks > blocks_per_page) | ||
| 1457 | block_len_in_page = blocks_per_page; | ||
| 1458 | else | ||
| 1459 | block_len_in_page = rest_blocks; | ||
| 1460 | } | ||
| 1461 | |||
| 1462 | ext4_double_down_write_data_sem(orig_inode, donor_inode); | 660 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
| 1463 | if (ret < 0) | 661 | if (ret < 0) |
| 1464 | break; | 662 | break; |
| 1465 | 663 | o_start += cur_len; | |
| 1466 | /* Decrease buffer counter */ | 664 | d_start += cur_len; |
| 1467 | if (holecheck_path) | ||
| 1468 | ext4_ext_drop_refs(holecheck_path); | ||
| 1469 | ret = get_ext_path(orig_inode, seq_start, &holecheck_path); | ||
| 1470 | if (ret) | ||
| 1471 | break; | ||
| 1472 | depth = holecheck_path->p_depth; | ||
| 1473 | |||
| 1474 | /* Decrease buffer counter */ | ||
| 1475 | if (orig_path) | ||
| 1476 | ext4_ext_drop_refs(orig_path); | ||
| 1477 | ret = get_ext_path(orig_inode, seq_start, &orig_path); | ||
| 1478 | if (ret) | ||
| 1479 | break; | ||
| 1480 | |||
| 1481 | ext_cur = holecheck_path[depth].p_ext; | ||
| 1482 | add_blocks = ext4_ext_get_actual_len(ext_cur); | ||
| 1483 | seq_blocks = 0; | ||
| 1484 | |||
| 1485 | } | 665 | } |
| 666 | *moved_len = o_start - orig_blk; | ||
| 667 | if (*moved_len > len) | ||
| 668 | *moved_len = len; | ||
| 669 | |||
| 1486 | out: | 670 | out: |
| 1487 | if (*moved_len) { | 671 | if (*moved_len) { |
| 1488 | ext4_discard_preallocations(orig_inode); | 672 | ext4_discard_preallocations(orig_inode); |
| 1489 | ext4_discard_preallocations(donor_inode); | 673 | ext4_discard_preallocations(donor_inode); |
| 1490 | } | 674 | } |
| 1491 | 675 | ||
| 1492 | if (orig_path) { | 676 | ext4_ext_drop_refs(path); |
| 1493 | ext4_ext_drop_refs(orig_path); | 677 | kfree(path); |
| 1494 | kfree(orig_path); | ||
| 1495 | } | ||
| 1496 | if (holecheck_path) { | ||
| 1497 | ext4_ext_drop_refs(holecheck_path); | ||
| 1498 | kfree(holecheck_path); | ||
| 1499 | } | ||
| 1500 | ext4_double_up_write_data_sem(orig_inode, donor_inode); | 678 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 1501 | ext4_inode_resume_unlocked_dio(orig_inode); | 679 | ext4_inode_resume_unlocked_dio(orig_inode); |
| 1502 | ext4_inode_resume_unlocked_dio(donor_inode); | 680 | ext4_inode_resume_unlocked_dio(donor_inode); |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 603e4ebbd0ac..426211882f72 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle, | |||
| 53 | ext4_lblk_t *block) | 53 | ext4_lblk_t *block) |
| 54 | { | 54 | { |
| 55 | struct buffer_head *bh; | 55 | struct buffer_head *bh; |
| 56 | int err = 0; | 56 | int err; |
| 57 | 57 | ||
| 58 | if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && | 58 | if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && |
| 59 | ((inode->i_size >> 10) >= | 59 | ((inode->i_size >> 10) >= |
| @@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle, | |||
| 62 | 62 | ||
| 63 | *block = inode->i_size >> inode->i_sb->s_blocksize_bits; | 63 | *block = inode->i_size >> inode->i_sb->s_blocksize_bits; |
| 64 | 64 | ||
| 65 | bh = ext4_bread(handle, inode, *block, 1, &err); | 65 | bh = ext4_bread(handle, inode, *block, 1); |
| 66 | if (!bh) | 66 | if (IS_ERR(bh)) |
| 67 | return ERR_PTR(err); | 67 | return bh; |
| 68 | inode->i_size += inode->i_sb->s_blocksize; | 68 | inode->i_size += inode->i_sb->s_blocksize; |
| 69 | EXT4_I(inode)->i_disksize = inode->i_size; | 69 | EXT4_I(inode)->i_disksize = inode->i_size; |
| 70 | BUFFER_TRACE(bh, "get_write_access"); | 70 | BUFFER_TRACE(bh, "get_write_access"); |
| @@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, | |||
| 94 | { | 94 | { |
| 95 | struct buffer_head *bh; | 95 | struct buffer_head *bh; |
| 96 | struct ext4_dir_entry *dirent; | 96 | struct ext4_dir_entry *dirent; |
| 97 | int err = 0, is_dx_block = 0; | 97 | int is_dx_block = 0; |
| 98 | 98 | ||
| 99 | bh = ext4_bread(NULL, inode, block, 0, &err); | 99 | bh = ext4_bread(NULL, inode, block, 0); |
| 100 | if (!bh) { | 100 | if (IS_ERR(bh)) { |
| 101 | if (err == 0) { | ||
| 102 | ext4_error_inode(inode, __func__, line, block, | ||
| 103 | "Directory hole found"); | ||
| 104 | return ERR_PTR(-EIO); | ||
| 105 | } | ||
| 106 | __ext4_warning(inode->i_sb, __func__, line, | 101 | __ext4_warning(inode->i_sb, __func__, line, |
| 107 | "error reading directory block " | 102 | "error %ld reading directory block " |
| 108 | "(ino %lu, block %lu)", inode->i_ino, | 103 | "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, |
| 109 | (unsigned long) block); | 104 | (unsigned long) block); |
| 110 | return ERR_PTR(err); | 105 | |
| 106 | return bh; | ||
| 107 | } | ||
| 108 | if (!bh) { | ||
| 109 | ext4_error_inode(inode, __func__, line, block, "Directory hole found"); | ||
| 110 | return ERR_PTR(-EIO); | ||
| 111 | } | 111 | } |
| 112 | dirent = (struct ext4_dir_entry *) bh->b_data; | 112 | dirent = (struct ext4_dir_entry *) bh->b_data; |
| 113 | /* Determine whether or not we have an index block */ | 113 | /* Determine whether or not we have an index block */ |
| @@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, | |||
| 124 | "directory leaf block found instead of index block"); | 124 | "directory leaf block found instead of index block"); |
| 125 | return ERR_PTR(-EIO); | 125 | return ERR_PTR(-EIO); |
| 126 | } | 126 | } |
| 127 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 127 | if (!ext4_has_metadata_csum(inode->i_sb) || |
| 128 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || | ||
| 129 | buffer_verified(bh)) | 128 | buffer_verified(bh)) |
| 130 | return bh; | 129 | return bh; |
| 131 | 130 | ||
| @@ -253,8 +252,7 @@ static unsigned dx_node_limit(struct inode *dir); | |||
| 253 | static struct dx_frame *dx_probe(const struct qstr *d_name, | 252 | static struct dx_frame *dx_probe(const struct qstr *d_name, |
| 254 | struct inode *dir, | 253 | struct inode *dir, |
| 255 | struct dx_hash_info *hinfo, | 254 | struct dx_hash_info *hinfo, |
| 256 | struct dx_frame *frame, | 255 | struct dx_frame *frame); |
| 257 | int *err); | ||
| 258 | static void dx_release(struct dx_frame *frames); | 256 | static void dx_release(struct dx_frame *frames); |
| 259 | static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, | 257 | static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, |
| 260 | struct dx_hash_info *hinfo, struct dx_map_entry map[]); | 258 | struct dx_hash_info *hinfo, struct dx_map_entry map[]); |
| @@ -270,8 +268,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, | |||
| 270 | __u32 *start_hash); | 268 | __u32 *start_hash); |
| 271 | static struct buffer_head * ext4_dx_find_entry(struct inode *dir, | 269 | static struct buffer_head * ext4_dx_find_entry(struct inode *dir, |
| 272 | const struct qstr *d_name, | 270 | const struct qstr *d_name, |
| 273 | struct ext4_dir_entry_2 **res_dir, | 271 | struct ext4_dir_entry_2 **res_dir); |
| 274 | int *err); | ||
| 275 | static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | 272 | static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, |
| 276 | struct inode *inode); | 273 | struct inode *inode); |
| 277 | 274 | ||
| @@ -340,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) | |||
| 340 | { | 337 | { |
| 341 | struct ext4_dir_entry_tail *t; | 338 | struct ext4_dir_entry_tail *t; |
| 342 | 339 | ||
| 343 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 340 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 344 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 345 | return 1; | 341 | return 1; |
| 346 | 342 | ||
| 347 | t = get_dirent_tail(inode, dirent); | 343 | t = get_dirent_tail(inode, dirent); |
| @@ -362,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode, | |||
| 362 | { | 358 | { |
| 363 | struct ext4_dir_entry_tail *t; | 359 | struct ext4_dir_entry_tail *t; |
| 364 | 360 | ||
| 365 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 361 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 366 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 367 | return; | 362 | return; |
| 368 | 363 | ||
| 369 | t = get_dirent_tail(inode, dirent); | 364 | t = get_dirent_tail(inode, dirent); |
| @@ -438,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode, | |||
| 438 | struct dx_tail *t; | 433 | struct dx_tail *t; |
| 439 | int count_offset, limit, count; | 434 | int count_offset, limit, count; |
| 440 | 435 | ||
| 441 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 436 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 442 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 443 | return 1; | 437 | return 1; |
| 444 | 438 | ||
| 445 | c = get_dx_countlimit(inode, dirent, &count_offset); | 439 | c = get_dx_countlimit(inode, dirent, &count_offset); |
| @@ -468,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) | |||
| 468 | struct dx_tail *t; | 462 | struct dx_tail *t; |
| 469 | int count_offset, limit, count; | 463 | int count_offset, limit, count; |
| 470 | 464 | ||
| 471 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 465 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 472 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 473 | return; | 466 | return; |
| 474 | 467 | ||
| 475 | c = get_dx_countlimit(inode, dirent, &count_offset); | 468 | c = get_dx_countlimit(inode, dirent, &count_offset); |
| @@ -557,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) | |||
| 557 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - | 550 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - |
| 558 | EXT4_DIR_REC_LEN(2) - infosize; | 551 | EXT4_DIR_REC_LEN(2) - infosize; |
| 559 | 552 | ||
| 560 | if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, | 553 | if (ext4_has_metadata_csum(dir->i_sb)) |
| 561 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 562 | entry_space -= sizeof(struct dx_tail); | 554 | entry_space -= sizeof(struct dx_tail); |
| 563 | return entry_space / sizeof(struct dx_entry); | 555 | return entry_space / sizeof(struct dx_entry); |
| 564 | } | 556 | } |
| @@ -567,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir) | |||
| 567 | { | 559 | { |
| 568 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); | 560 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); |
| 569 | 561 | ||
| 570 | if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, | 562 | if (ext4_has_metadata_csum(dir->i_sb)) |
| 571 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 572 | entry_space -= sizeof(struct dx_tail); | 563 | entry_space -= sizeof(struct dx_tail); |
| 573 | return entry_space / sizeof(struct dx_entry); | 564 | return entry_space / sizeof(struct dx_entry); |
| 574 | } | 565 | } |
| @@ -641,7 +632,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, | |||
| 641 | u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; | 632 | u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; |
| 642 | struct stats stats; | 633 | struct stats stats; |
| 643 | printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); | 634 | printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); |
| 644 | if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; | 635 | bh = ext4_bread(NULL,dir, block, 0); |
| 636 | if (!bh || IS_ERR(bh)) | ||
| 637 | continue; | ||
| 645 | stats = levels? | 638 | stats = levels? |
| 646 | dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): | 639 | dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): |
| 647 | dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); | 640 | dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); |
| @@ -669,29 +662,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, | |||
| 669 | */ | 662 | */ |
| 670 | static struct dx_frame * | 663 | static struct dx_frame * |
| 671 | dx_probe(const struct qstr *d_name, struct inode *dir, | 664 | dx_probe(const struct qstr *d_name, struct inode *dir, |
| 672 | struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) | 665 | struct dx_hash_info *hinfo, struct dx_frame *frame_in) |
| 673 | { | 666 | { |
| 674 | unsigned count, indirect; | 667 | unsigned count, indirect; |
| 675 | struct dx_entry *at, *entries, *p, *q, *m; | 668 | struct dx_entry *at, *entries, *p, *q, *m; |
| 676 | struct dx_root *root; | 669 | struct dx_root *root; |
| 677 | struct buffer_head *bh; | ||
| 678 | struct dx_frame *frame = frame_in; | 670 | struct dx_frame *frame = frame_in; |
| 671 | struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); | ||
| 679 | u32 hash; | 672 | u32 hash; |
| 680 | 673 | ||
| 681 | frame->bh = NULL; | 674 | frame->bh = ext4_read_dirblock(dir, 0, INDEX); |
| 682 | bh = ext4_read_dirblock(dir, 0, INDEX); | 675 | if (IS_ERR(frame->bh)) |
| 683 | if (IS_ERR(bh)) { | 676 | return (struct dx_frame *) frame->bh; |
| 684 | *err = PTR_ERR(bh); | 677 | |
| 685 | goto fail; | 678 | root = (struct dx_root *) frame->bh->b_data; |
| 686 | } | ||
| 687 | root = (struct dx_root *) bh->b_data; | ||
| 688 | if (root->info.hash_version != DX_HASH_TEA && | 679 | if (root->info.hash_version != DX_HASH_TEA && |
| 689 | root->info.hash_version != DX_HASH_HALF_MD4 && | 680 | root->info.hash_version != DX_HASH_HALF_MD4 && |
| 690 | root->info.hash_version != DX_HASH_LEGACY) { | 681 | root->info.hash_version != DX_HASH_LEGACY) { |
| 691 | ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", | 682 | ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", |
| 692 | root->info.hash_version); | 683 | root->info.hash_version); |
| 693 | brelse(bh); | ||
| 694 | *err = ERR_BAD_DX_DIR; | ||
| 695 | goto fail; | 684 | goto fail; |
| 696 | } | 685 | } |
| 697 | hinfo->hash_version = root->info.hash_version; | 686 | hinfo->hash_version = root->info.hash_version; |
| @@ -705,16 +694,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir, | |||
| 705 | if (root->info.unused_flags & 1) { | 694 | if (root->info.unused_flags & 1) { |
| 706 | ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", | 695 | ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", |
| 707 | root->info.unused_flags); | 696 | root->info.unused_flags); |
| 708 | brelse(bh); | ||
| 709 | *err = ERR_BAD_DX_DIR; | ||
| 710 | goto fail; | 697 | goto fail; |
| 711 | } | 698 | } |
| 712 | 699 | ||
| 713 | if ((indirect = root->info.indirect_levels) > 1) { | 700 | if ((indirect = root->info.indirect_levels) > 1) { |
| 714 | ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", | 701 | ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", |
| 715 | root->info.indirect_levels); | 702 | root->info.indirect_levels); |
| 716 | brelse(bh); | ||
| 717 | *err = ERR_BAD_DX_DIR; | ||
| 718 | goto fail; | 703 | goto fail; |
| 719 | } | 704 | } |
| 720 | 705 | ||
| @@ -724,27 +709,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir, | |||
| 724 | if (dx_get_limit(entries) != dx_root_limit(dir, | 709 | if (dx_get_limit(entries) != dx_root_limit(dir, |
| 725 | root->info.info_length)) { | 710 | root->info.info_length)) { |
| 726 | ext4_warning(dir->i_sb, "dx entry: limit != root limit"); | 711 | ext4_warning(dir->i_sb, "dx entry: limit != root limit"); |
| 727 | brelse(bh); | ||
| 728 | *err = ERR_BAD_DX_DIR; | ||
| 729 | goto fail; | 712 | goto fail; |
| 730 | } | 713 | } |
| 731 | 714 | ||
| 732 | dxtrace(printk("Look up %x", hash)); | 715 | dxtrace(printk("Look up %x", hash)); |
| 733 | while (1) | 716 | while (1) { |
| 734 | { | ||
| 735 | count = dx_get_count(entries); | 717 | count = dx_get_count(entries); |
| 736 | if (!count || count > dx_get_limit(entries)) { | 718 | if (!count || count > dx_get_limit(entries)) { |
| 737 | ext4_warning(dir->i_sb, | 719 | ext4_warning(dir->i_sb, |
| 738 | "dx entry: no count or count > limit"); | 720 | "dx entry: no count or count > limit"); |
| 739 | brelse(bh); | 721 | goto fail; |
| 740 | *err = ERR_BAD_DX_DIR; | ||
| 741 | goto fail2; | ||
| 742 | } | 722 | } |
| 743 | 723 | ||
| 744 | p = entries + 1; | 724 | p = entries + 1; |
| 745 | q = entries + count - 1; | 725 | q = entries + count - 1; |
| 746 | while (p <= q) | 726 | while (p <= q) { |
| 747 | { | ||
| 748 | m = p + (q - p)/2; | 727 | m = p + (q - p)/2; |
| 749 | dxtrace(printk(".")); | 728 | dxtrace(printk(".")); |
| 750 | if (dx_get_hash(m) > hash) | 729 | if (dx_get_hash(m) > hash) |
| @@ -753,8 +732,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, | |||
| 753 | p = m + 1; | 732 | p = m + 1; |
| 754 | } | 733 | } |
| 755 | 734 | ||
| 756 | if (0) // linear search cross check | 735 | if (0) { // linear search cross check |
| 757 | { | ||
| 758 | unsigned n = count - 1; | 736 | unsigned n = count - 1; |
| 759 | at = entries; | 737 | at = entries; |
| 760 | while (n--) | 738 | while (n--) |
| @@ -771,38 +749,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir, | |||
| 771 | 749 | ||
| 772 | at = p - 1; | 750 | at = p - 1; |
| 773 | dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); | 751 | dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); |
| 774 | frame->bh = bh; | ||
| 775 | frame->entries = entries; | 752 | frame->entries = entries; |
| 776 | frame->at = at; | 753 | frame->at = at; |
| 777 | if (!indirect--) return frame; | 754 | if (!indirect--) |
| 778 | bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); | 755 | return frame; |
| 779 | if (IS_ERR(bh)) { | 756 | frame++; |
| 780 | *err = PTR_ERR(bh); | 757 | frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); |
| 781 | goto fail2; | 758 | if (IS_ERR(frame->bh)) { |
| 759 | ret_err = (struct dx_frame *) frame->bh; | ||
| 760 | frame->bh = NULL; | ||
| 761 | goto fail; | ||
| 782 | } | 762 | } |
| 783 | entries = ((struct dx_node *) bh->b_data)->entries; | 763 | entries = ((struct dx_node *) frame->bh->b_data)->entries; |
| 784 | 764 | ||
| 785 | if (dx_get_limit(entries) != dx_node_limit (dir)) { | 765 | if (dx_get_limit(entries) != dx_node_limit (dir)) { |
| 786 | ext4_warning(dir->i_sb, | 766 | ext4_warning(dir->i_sb, |
| 787 | "dx entry: limit != node limit"); | 767 | "dx entry: limit != node limit"); |
| 788 | brelse(bh); | 768 | goto fail; |
| 789 | *err = ERR_BAD_DX_DIR; | ||
| 790 | goto fail2; | ||
| 791 | } | 769 | } |
| 792 | frame++; | ||
| 793 | frame->bh = NULL; | ||
| 794 | } | 770 | } |
| 795 | fail2: | 771 | fail: |
| 796 | while (frame >= frame_in) { | 772 | while (frame >= frame_in) { |
| 797 | brelse(frame->bh); | 773 | brelse(frame->bh); |
| 798 | frame--; | 774 | frame--; |
| 799 | } | 775 | } |
| 800 | fail: | 776 | if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) |
| 801 | if (*err == ERR_BAD_DX_DIR) | ||
| 802 | ext4_warning(dir->i_sb, | 777 | ext4_warning(dir->i_sb, |
| 803 | "Corrupt dir inode %lu, running e2fsck is " | 778 | "Corrupt dir inode %lu, running e2fsck is " |
| 804 | "recommended.", dir->i_ino); | 779 | "recommended.", dir->i_ino); |
| 805 | return NULL; | 780 | return ret_err; |
| 806 | } | 781 | } |
| 807 | 782 | ||
| 808 | static void dx_release (struct dx_frame *frames) | 783 | static void dx_release (struct dx_frame *frames) |
| @@ -988,9 +963,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | |||
| 988 | } | 963 | } |
| 989 | hinfo.hash = start_hash; | 964 | hinfo.hash = start_hash; |
| 990 | hinfo.minor_hash = 0; | 965 | hinfo.minor_hash = 0; |
| 991 | frame = dx_probe(NULL, dir, &hinfo, frames, &err); | 966 | frame = dx_probe(NULL, dir, &hinfo, frames); |
| 992 | if (!frame) | 967 | if (IS_ERR(frame)) |
| 993 | return err; | 968 | return PTR_ERR(frame); |
| 994 | 969 | ||
| 995 | /* Add '.' and '..' from the htree header */ | 970 | /* Add '.' and '..' from the htree header */ |
| 996 | if (!start_hash && !start_minor_hash) { | 971 | if (!start_hash && !start_minor_hash) { |
| @@ -1227,8 +1202,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, | |||
| 1227 | buffer */ | 1202 | buffer */ |
| 1228 | int num = 0; | 1203 | int num = 0; |
| 1229 | ext4_lblk_t nblocks; | 1204 | ext4_lblk_t nblocks; |
| 1230 | int i, err = 0; | 1205 | int i, namelen; |
| 1231 | int namelen; | ||
| 1232 | 1206 | ||
| 1233 | *res_dir = NULL; | 1207 | *res_dir = NULL; |
| 1234 | sb = dir->i_sb; | 1208 | sb = dir->i_sb; |
| @@ -1258,17 +1232,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, | |||
| 1258 | goto restart; | 1232 | goto restart; |
| 1259 | } | 1233 | } |
| 1260 | if (is_dx(dir)) { | 1234 | if (is_dx(dir)) { |
| 1261 | bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); | 1235 | bh = ext4_dx_find_entry(dir, d_name, res_dir); |
| 1262 | /* | 1236 | /* |
| 1263 | * On success, or if the error was file not found, | 1237 | * On success, or if the error was file not found, |
| 1264 | * return. Otherwise, fall back to doing a search the | 1238 | * return. Otherwise, fall back to doing a search the |
| 1265 | * old fashioned way. | 1239 | * old fashioned way. |
| 1266 | */ | 1240 | */ |
| 1267 | if (err == -ENOENT) | 1241 | if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) |
| 1268 | return NULL; | ||
| 1269 | if (err && err != ERR_BAD_DX_DIR) | ||
| 1270 | return ERR_PTR(err); | ||
| 1271 | if (bh) | ||
| 1272 | return bh; | 1242 | return bh; |
| 1273 | dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " | 1243 | dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " |
| 1274 | "falling back\n")); | 1244 | "falling back\n")); |
| @@ -1298,10 +1268,10 @@ restart: | |||
| 1298 | break; | 1268 | break; |
| 1299 | } | 1269 | } |
| 1300 | num++; | 1270 | num++; |
| 1301 | bh = ext4_getblk(NULL, dir, b++, 0, &err); | 1271 | bh = ext4_getblk(NULL, dir, b++, 0); |
| 1302 | if (unlikely(err)) { | 1272 | if (unlikely(IS_ERR(bh))) { |
| 1303 | if (ra_max == 0) | 1273 | if (ra_max == 0) |
| 1304 | return ERR_PTR(err); | 1274 | return bh; |
| 1305 | break; | 1275 | break; |
| 1306 | } | 1276 | } |
| 1307 | bh_use[ra_max] = bh; | 1277 | bh_use[ra_max] = bh; |
| @@ -1366,7 +1336,7 @@ cleanup_and_exit: | |||
| 1366 | } | 1336 | } |
| 1367 | 1337 | ||
| 1368 | static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, | 1338 | static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, |
| 1369 | struct ext4_dir_entry_2 **res_dir, int *err) | 1339 | struct ext4_dir_entry_2 **res_dir) |
| 1370 | { | 1340 | { |
| 1371 | struct super_block * sb = dir->i_sb; | 1341 | struct super_block * sb = dir->i_sb; |
| 1372 | struct dx_hash_info hinfo; | 1342 | struct dx_hash_info hinfo; |
| @@ -1375,25 +1345,23 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q | |||
| 1375 | ext4_lblk_t block; | 1345 | ext4_lblk_t block; |
| 1376 | int retval; | 1346 | int retval; |
| 1377 | 1347 | ||
| 1378 | if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) | 1348 | frame = dx_probe(d_name, dir, &hinfo, frames); |
| 1379 | return NULL; | 1349 | if (IS_ERR(frame)) |
| 1350 | return (struct buffer_head *) frame; | ||
| 1380 | do { | 1351 | do { |
| 1381 | block = dx_get_block(frame->at); | 1352 | block = dx_get_block(frame->at); |
| 1382 | bh = ext4_read_dirblock(dir, block, DIRENT); | 1353 | bh = ext4_read_dirblock(dir, block, DIRENT); |
| 1383 | if (IS_ERR(bh)) { | 1354 | if (IS_ERR(bh)) |
| 1384 | *err = PTR_ERR(bh); | ||
| 1385 | goto errout; | 1355 | goto errout; |
| 1386 | } | 1356 | |
| 1387 | retval = search_dirblock(bh, dir, d_name, | 1357 | retval = search_dirblock(bh, dir, d_name, |
| 1388 | block << EXT4_BLOCK_SIZE_BITS(sb), | 1358 | block << EXT4_BLOCK_SIZE_BITS(sb), |
| 1389 | res_dir); | 1359 | res_dir); |
| 1390 | if (retval == 1) { /* Success! */ | 1360 | if (retval == 1) |
| 1391 | dx_release(frames); | 1361 | goto success; |
| 1392 | return bh; | ||
| 1393 | } | ||
| 1394 | brelse(bh); | 1362 | brelse(bh); |
| 1395 | if (retval == -1) { | 1363 | if (retval == -1) { |
| 1396 | *err = ERR_BAD_DX_DIR; | 1364 | bh = ERR_PTR(ERR_BAD_DX_DIR); |
| 1397 | goto errout; | 1365 | goto errout; |
| 1398 | } | 1366 | } |
| 1399 | 1367 | ||
| @@ -1402,18 +1370,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q | |||
| 1402 | frames, NULL); | 1370 | frames, NULL); |
| 1403 | if (retval < 0) { | 1371 | if (retval < 0) { |
| 1404 | ext4_warning(sb, | 1372 | ext4_warning(sb, |
| 1405 | "error reading index page in directory #%lu", | 1373 | "error %d reading index page in directory #%lu", |
| 1406 | dir->i_ino); | 1374 | retval, dir->i_ino); |
| 1407 | *err = retval; | 1375 | bh = ERR_PTR(retval); |
| 1408 | goto errout; | 1376 | goto errout; |
| 1409 | } | 1377 | } |
| 1410 | } while (retval == 1); | 1378 | } while (retval == 1); |
| 1411 | 1379 | ||
| 1412 | *err = -ENOENT; | 1380 | bh = NULL; |
| 1413 | errout: | 1381 | errout: |
| 1414 | dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); | 1382 | dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); |
| 1415 | dx_release (frames); | 1383 | success: |
| 1416 | return NULL; | 1384 | dx_release(frames); |
| 1385 | return bh; | ||
| 1417 | } | 1386 | } |
| 1418 | 1387 | ||
| 1419 | static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) | 1388 | static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) |
| @@ -1441,7 +1410,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi | |||
| 1441 | dentry); | 1410 | dentry); |
| 1442 | return ERR_PTR(-EIO); | 1411 | return ERR_PTR(-EIO); |
| 1443 | } | 1412 | } |
| 1444 | inode = ext4_iget(dir->i_sb, ino); | 1413 | inode = ext4_iget_normal(dir->i_sb, ino); |
| 1445 | if (inode == ERR_PTR(-ESTALE)) { | 1414 | if (inode == ERR_PTR(-ESTALE)) { |
| 1446 | EXT4_ERROR_INODE(dir, | 1415 | EXT4_ERROR_INODE(dir, |
| 1447 | "deleted inode referenced: %u", | 1416 | "deleted inode referenced: %u", |
| @@ -1474,7 +1443,7 @@ struct dentry *ext4_get_parent(struct dentry *child) | |||
| 1474 | return ERR_PTR(-EIO); | 1443 | return ERR_PTR(-EIO); |
| 1475 | } | 1444 | } |
| 1476 | 1445 | ||
| 1477 | return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); | 1446 | return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); |
| 1478 | } | 1447 | } |
| 1479 | 1448 | ||
| 1480 | /* | 1449 | /* |
| @@ -1533,7 +1502,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) | |||
| 1533 | */ | 1502 | */ |
| 1534 | static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | 1503 | static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, |
| 1535 | struct buffer_head **bh,struct dx_frame *frame, | 1504 | struct buffer_head **bh,struct dx_frame *frame, |
| 1536 | struct dx_hash_info *hinfo, int *error) | 1505 | struct dx_hash_info *hinfo) |
| 1537 | { | 1506 | { |
| 1538 | unsigned blocksize = dir->i_sb->s_blocksize; | 1507 | unsigned blocksize = dir->i_sb->s_blocksize; |
| 1539 | unsigned count, continued; | 1508 | unsigned count, continued; |
| @@ -1548,16 +1517,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
| 1548 | int csum_size = 0; | 1517 | int csum_size = 0; |
| 1549 | int err = 0, i; | 1518 | int err = 0, i; |
| 1550 | 1519 | ||
| 1551 | if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, | 1520 | if (ext4_has_metadata_csum(dir->i_sb)) |
| 1552 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 1553 | csum_size = sizeof(struct ext4_dir_entry_tail); | 1521 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 1554 | 1522 | ||
| 1555 | bh2 = ext4_append(handle, dir, &newblock); | 1523 | bh2 = ext4_append(handle, dir, &newblock); |
| 1556 | if (IS_ERR(bh2)) { | 1524 | if (IS_ERR(bh2)) { |
| 1557 | brelse(*bh); | 1525 | brelse(*bh); |
| 1558 | *bh = NULL; | 1526 | *bh = NULL; |
| 1559 | *error = PTR_ERR(bh2); | 1527 | return (struct ext4_dir_entry_2 *) bh2; |
| 1560 | return NULL; | ||
| 1561 | } | 1528 | } |
| 1562 | 1529 | ||
| 1563 | BUFFER_TRACE(*bh, "get_write_access"); | 1530 | BUFFER_TRACE(*bh, "get_write_access"); |
| @@ -1617,8 +1584,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
| 1617 | dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); | 1584 | dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); |
| 1618 | 1585 | ||
| 1619 | /* Which block gets the new entry? */ | 1586 | /* Which block gets the new entry? */ |
| 1620 | if (hinfo->hash >= hash2) | 1587 | if (hinfo->hash >= hash2) { |
| 1621 | { | ||
| 1622 | swap(*bh, bh2); | 1588 | swap(*bh, bh2); |
| 1623 | de = de2; | 1589 | de = de2; |
| 1624 | } | 1590 | } |
| @@ -1638,8 +1604,7 @@ journal_error: | |||
| 1638 | brelse(bh2); | 1604 | brelse(bh2); |
| 1639 | *bh = NULL; | 1605 | *bh = NULL; |
| 1640 | ext4_std_error(dir->i_sb, err); | 1606 | ext4_std_error(dir->i_sb, err); |
| 1641 | *error = err; | 1607 | return ERR_PTR(err); |
| 1642 | return NULL; | ||
| 1643 | } | 1608 | } |
| 1644 | 1609 | ||
| 1645 | int ext4_find_dest_de(struct inode *dir, struct inode *inode, | 1610 | int ext4_find_dest_de(struct inode *dir, struct inode *inode, |
| @@ -1718,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | |||
| 1718 | int csum_size = 0; | 1683 | int csum_size = 0; |
| 1719 | int err; | 1684 | int err; |
| 1720 | 1685 | ||
| 1721 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 1686 | if (ext4_has_metadata_csum(inode->i_sb)) |
| 1722 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 1723 | csum_size = sizeof(struct ext4_dir_entry_tail); | 1687 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 1724 | 1688 | ||
| 1725 | if (!de) { | 1689 | if (!de) { |
| @@ -1786,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
| 1786 | struct fake_dirent *fde; | 1750 | struct fake_dirent *fde; |
| 1787 | int csum_size = 0; | 1751 | int csum_size = 0; |
| 1788 | 1752 | ||
| 1789 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 1753 | if (ext4_has_metadata_csum(inode->i_sb)) |
| 1790 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 1791 | csum_size = sizeof(struct ext4_dir_entry_tail); | 1754 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 1792 | 1755 | ||
| 1793 | blocksize = dir->i_sb->s_blocksize; | 1756 | blocksize = dir->i_sb->s_blocksize; |
| @@ -1853,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
| 1853 | hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; | 1816 | hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; |
| 1854 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | 1817 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; |
| 1855 | ext4fs_dirhash(name, namelen, &hinfo); | 1818 | ext4fs_dirhash(name, namelen, &hinfo); |
| 1819 | memset(frames, 0, sizeof(frames)); | ||
| 1856 | frame = frames; | 1820 | frame = frames; |
| 1857 | frame->entries = entries; | 1821 | frame->entries = entries; |
| 1858 | frame->at = entries; | 1822 | frame->at = entries; |
| 1859 | frame->bh = bh; | 1823 | frame->bh = bh; |
| 1860 | bh = bh2; | 1824 | bh = bh2; |
| 1861 | 1825 | ||
| 1862 | ext4_handle_dirty_dx_node(handle, dir, frame->bh); | 1826 | retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); |
| 1863 | ext4_handle_dirty_dirent_node(handle, dir, bh); | 1827 | if (retval) |
| 1828 | goto out_frames; | ||
| 1829 | retval = ext4_handle_dirty_dirent_node(handle, dir, bh); | ||
| 1830 | if (retval) | ||
| 1831 | goto out_frames; | ||
| 1864 | 1832 | ||
| 1865 | de = do_split(handle,dir, &bh, frame, &hinfo, &retval); | 1833 | de = do_split(handle,dir, &bh, frame, &hinfo); |
| 1866 | if (!de) { | 1834 | if (IS_ERR(de)) { |
| 1867 | /* | 1835 | retval = PTR_ERR(de); |
| 1868 | * Even if the block split failed, we have to properly write | 1836 | goto out_frames; |
| 1869 | * out all the changes we did so far. Otherwise we can end up | ||
| 1870 | * with corrupted filesystem. | ||
| 1871 | */ | ||
| 1872 | ext4_mark_inode_dirty(handle, dir); | ||
| 1873 | dx_release(frames); | ||
| 1874 | return retval; | ||
| 1875 | } | 1837 | } |
| 1876 | dx_release(frames); | 1838 | dx_release(frames); |
| 1877 | 1839 | ||
| 1878 | retval = add_dirent_to_buf(handle, dentry, inode, de, bh); | 1840 | retval = add_dirent_to_buf(handle, dentry, inode, de, bh); |
| 1879 | brelse(bh); | 1841 | brelse(bh); |
| 1880 | return retval; | 1842 | return retval; |
| 1843 | out_frames: | ||
| 1844 | /* | ||
| 1845 | * Even if the block split failed, we have to properly write | ||
| 1846 | * out all the changes we did so far. Otherwise we can end up | ||
| 1847 | * with corrupted filesystem. | ||
| 1848 | */ | ||
| 1849 | ext4_mark_inode_dirty(handle, dir); | ||
| 1850 | dx_release(frames); | ||
| 1851 | return retval; | ||
| 1881 | } | 1852 | } |
| 1882 | 1853 | ||
| 1883 | /* | 1854 | /* |
| @@ -1904,8 +1875,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | |||
| 1904 | ext4_lblk_t block, blocks; | 1875 | ext4_lblk_t block, blocks; |
| 1905 | int csum_size = 0; | 1876 | int csum_size = 0; |
| 1906 | 1877 | ||
| 1907 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 1878 | if (ext4_has_metadata_csum(inode->i_sb)) |
| 1908 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 1909 | csum_size = sizeof(struct ext4_dir_entry_tail); | 1879 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 1910 | 1880 | ||
| 1911 | sb = dir->i_sb; | 1881 | sb = dir->i_sb; |
| @@ -1982,9 +1952,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
| 1982 | struct ext4_dir_entry_2 *de; | 1952 | struct ext4_dir_entry_2 *de; |
| 1983 | int err; | 1953 | int err; |
| 1984 | 1954 | ||
| 1985 | frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); | 1955 | frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); |
| 1986 | if (!frame) | 1956 | if (IS_ERR(frame)) |
| 1987 | return err; | 1957 | return PTR_ERR(frame); |
| 1988 | entries = frame->entries; | 1958 | entries = frame->entries; |
| 1989 | at = frame->at; | 1959 | at = frame->at; |
| 1990 | bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); | 1960 | bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); |
| @@ -2095,9 +2065,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
| 2095 | goto cleanup; | 2065 | goto cleanup; |
| 2096 | } | 2066 | } |
| 2097 | } | 2067 | } |
| 2098 | de = do_split(handle, dir, &bh, frame, &hinfo, &err); | 2068 | de = do_split(handle, dir, &bh, frame, &hinfo); |
| 2099 | if (!de) | 2069 | if (IS_ERR(de)) { |
| 2070 | err = PTR_ERR(de); | ||
| 2100 | goto cleanup; | 2071 | goto cleanup; |
| 2072 | } | ||
| 2101 | err = add_dirent_to_buf(handle, dentry, inode, de, bh); | 2073 | err = add_dirent_to_buf(handle, dentry, inode, de, bh); |
| 2102 | goto cleanup; | 2074 | goto cleanup; |
| 2103 | 2075 | ||
| @@ -2167,8 +2139,7 @@ static int ext4_delete_entry(handle_t *handle, | |||
| 2167 | return err; | 2139 | return err; |
| 2168 | } | 2140 | } |
| 2169 | 2141 | ||
| 2170 | if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, | 2142 | if (ext4_has_metadata_csum(dir->i_sb)) |
| 2171 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 2172 | csum_size = sizeof(struct ext4_dir_entry_tail); | 2143 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 2173 | 2144 | ||
| 2174 | BUFFER_TRACE(bh, "get_write_access"); | 2145 | BUFFER_TRACE(bh, "get_write_access"); |
| @@ -2387,8 +2358,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, | |||
| 2387 | int csum_size = 0; | 2358 | int csum_size = 0; |
| 2388 | int err; | 2359 | int err; |
| 2389 | 2360 | ||
| 2390 | if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, | 2361 | if (ext4_has_metadata_csum(dir->i_sb)) |
| 2391 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 2392 | csum_size = sizeof(struct ext4_dir_entry_tail); | 2362 | csum_size = sizeof(struct ext4_dir_entry_tail); |
| 2393 | 2363 | ||
| 2394 | if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { | 2364 | if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { |
| @@ -2403,10 +2373,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, | |||
| 2403 | dir_block = ext4_append(handle, inode, &block); | 2373 | dir_block = ext4_append(handle, inode, &block); |
| 2404 | if (IS_ERR(dir_block)) | 2374 | if (IS_ERR(dir_block)) |
| 2405 | return PTR_ERR(dir_block); | 2375 | return PTR_ERR(dir_block); |
| 2406 | BUFFER_TRACE(dir_block, "get_write_access"); | ||
| 2407 | err = ext4_journal_get_write_access(handle, dir_block); | ||
| 2408 | if (err) | ||
| 2409 | goto out; | ||
| 2410 | de = (struct ext4_dir_entry_2 *)dir_block->b_data; | 2376 | de = (struct ext4_dir_entry_2 *)dir_block->b_data; |
| 2411 | ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); | 2377 | ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); |
| 2412 | set_nlink(inode, 2); | 2378 | set_nlink(inode, 2); |
| @@ -2573,7 +2539,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
| 2573 | int err = 0, rc; | 2539 | int err = 0, rc; |
| 2574 | bool dirty = false; | 2540 | bool dirty = false; |
| 2575 | 2541 | ||
| 2576 | if (!sbi->s_journal) | 2542 | if (!sbi->s_journal || is_bad_inode(inode)) |
| 2577 | return 0; | 2543 | return 0; |
| 2578 | 2544 | ||
| 2579 | WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && | 2545 | WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && |
| @@ -3190,6 +3156,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) | |||
| 3190 | } | 3156 | } |
| 3191 | } | 3157 | } |
| 3192 | 3158 | ||
| 3159 | static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, | ||
| 3160 | int credits, handle_t **h) | ||
| 3161 | { | ||
| 3162 | struct inode *wh; | ||
| 3163 | handle_t *handle; | ||
| 3164 | int retries = 0; | ||
| 3165 | |||
| 3166 | /* | ||
| 3167 | * for inode block, sb block, group summaries, | ||
| 3168 | * and inode bitmap | ||
| 3169 | */ | ||
| 3170 | credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + | ||
| 3171 | EXT4_XATTR_TRANS_BLOCKS + 4); | ||
| 3172 | retry: | ||
| 3173 | wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, | ||
| 3174 | &ent->dentry->d_name, 0, NULL, | ||
| 3175 | EXT4_HT_DIR, credits); | ||
| 3176 | |||
| 3177 | handle = ext4_journal_current_handle(); | ||
| 3178 | if (IS_ERR(wh)) { | ||
| 3179 | if (handle) | ||
| 3180 | ext4_journal_stop(handle); | ||
| 3181 | if (PTR_ERR(wh) == -ENOSPC && | ||
| 3182 | ext4_should_retry_alloc(ent->dir->i_sb, &retries)) | ||
| 3183 | goto retry; | ||
| 3184 | } else { | ||
| 3185 | *h = handle; | ||
| 3186 | init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); | ||
| 3187 | wh->i_op = &ext4_special_inode_operations; | ||
| 3188 | } | ||
| 3189 | return wh; | ||
| 3190 | } | ||
| 3191 | |||
| 3193 | /* | 3192 | /* |
| 3194 | * Anybody can rename anything with this: the permission checks are left to the | 3193 | * Anybody can rename anything with this: the permission checks are left to the |
| 3195 | * higher-level routines. | 3194 | * higher-level routines. |
| @@ -3199,7 +3198,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) | |||
| 3199 | * This comes from rename(const char *oldpath, const char *newpath) | 3198 | * This comes from rename(const char *oldpath, const char *newpath) |
| 3200 | */ | 3199 | */ |
| 3201 | static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | 3200 | static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, |
| 3202 | struct inode *new_dir, struct dentry *new_dentry) | 3201 | struct inode *new_dir, struct dentry *new_dentry, |
| 3202 | unsigned int flags) | ||
| 3203 | { | 3203 | { |
| 3204 | handle_t *handle = NULL; | 3204 | handle_t *handle = NULL; |
| 3205 | struct ext4_renament old = { | 3205 | struct ext4_renament old = { |
| @@ -3214,6 +3214,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3214 | }; | 3214 | }; |
| 3215 | int force_reread; | 3215 | int force_reread; |
| 3216 | int retval; | 3216 | int retval; |
| 3217 | struct inode *whiteout = NULL; | ||
| 3218 | int credits; | ||
| 3219 | u8 old_file_type; | ||
| 3217 | 3220 | ||
| 3218 | dquot_initialize(old.dir); | 3221 | dquot_initialize(old.dir); |
| 3219 | dquot_initialize(new.dir); | 3222 | dquot_initialize(new.dir); |
| @@ -3252,11 +3255,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3252 | if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) | 3255 | if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) |
| 3253 | ext4_alloc_da_blocks(old.inode); | 3256 | ext4_alloc_da_blocks(old.inode); |
| 3254 | 3257 | ||
| 3255 | handle = ext4_journal_start(old.dir, EXT4_HT_DIR, | 3258 | credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + |
| 3256 | (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + | 3259 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); |
| 3257 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); | 3260 | if (!(flags & RENAME_WHITEOUT)) { |
| 3258 | if (IS_ERR(handle)) | 3261 | handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); |
| 3259 | return PTR_ERR(handle); | 3262 | if (IS_ERR(handle)) |
| 3263 | return PTR_ERR(handle); | ||
| 3264 | } else { | ||
| 3265 | whiteout = ext4_whiteout_for_rename(&old, credits, &handle); | ||
| 3266 | if (IS_ERR(whiteout)) | ||
| 3267 | return PTR_ERR(whiteout); | ||
| 3268 | } | ||
| 3260 | 3269 | ||
| 3261 | if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) | 3270 | if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) |
| 3262 | ext4_handle_sync(handle); | 3271 | ext4_handle_sync(handle); |
| @@ -3284,13 +3293,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3284 | */ | 3293 | */ |
| 3285 | force_reread = (new.dir->i_ino == old.dir->i_ino && | 3294 | force_reread = (new.dir->i_ino == old.dir->i_ino && |
| 3286 | ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); | 3295 | ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); |
| 3296 | |||
| 3297 | old_file_type = old.de->file_type; | ||
| 3298 | if (whiteout) { | ||
| 3299 | /* | ||
| 3300 | * Do this before adding a new entry, so the old entry is sure | ||
| 3301 | * to be still pointing to the valid old entry. | ||
| 3302 | */ | ||
| 3303 | retval = ext4_setent(handle, &old, whiteout->i_ino, | ||
| 3304 | EXT4_FT_CHRDEV); | ||
| 3305 | if (retval) | ||
| 3306 | goto end_rename; | ||
| 3307 | ext4_mark_inode_dirty(handle, whiteout); | ||
| 3308 | } | ||
| 3287 | if (!new.bh) { | 3309 | if (!new.bh) { |
| 3288 | retval = ext4_add_entry(handle, new.dentry, old.inode); | 3310 | retval = ext4_add_entry(handle, new.dentry, old.inode); |
| 3289 | if (retval) | 3311 | if (retval) |
| 3290 | goto end_rename; | 3312 | goto end_rename; |
| 3291 | } else { | 3313 | } else { |
| 3292 | retval = ext4_setent(handle, &new, | 3314 | retval = ext4_setent(handle, &new, |
| 3293 | old.inode->i_ino, old.de->file_type); | 3315 | old.inode->i_ino, old_file_type); |
| 3294 | if (retval) | 3316 | if (retval) |
| 3295 | goto end_rename; | 3317 | goto end_rename; |
| 3296 | } | 3318 | } |
| @@ -3305,10 +3327,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3305 | old.inode->i_ctime = ext4_current_time(old.inode); | 3327 | old.inode->i_ctime = ext4_current_time(old.inode); |
| 3306 | ext4_mark_inode_dirty(handle, old.inode); | 3328 | ext4_mark_inode_dirty(handle, old.inode); |
| 3307 | 3329 | ||
| 3308 | /* | 3330 | if (!whiteout) { |
| 3309 | * ok, that's it | 3331 | /* |
| 3310 | */ | 3332 | * ok, that's it |
| 3311 | ext4_rename_delete(handle, &old, force_reread); | 3333 | */ |
| 3334 | ext4_rename_delete(handle, &old, force_reread); | ||
| 3335 | } | ||
| 3312 | 3336 | ||
| 3313 | if (new.inode) { | 3337 | if (new.inode) { |
| 3314 | ext4_dec_count(handle, new.inode); | 3338 | ext4_dec_count(handle, new.inode); |
| @@ -3344,6 +3368,12 @@ end_rename: | |||
| 3344 | brelse(old.dir_bh); | 3368 | brelse(old.dir_bh); |
| 3345 | brelse(old.bh); | 3369 | brelse(old.bh); |
| 3346 | brelse(new.bh); | 3370 | brelse(new.bh); |
| 3371 | if (whiteout) { | ||
| 3372 | if (retval) | ||
| 3373 | drop_nlink(whiteout); | ||
| 3374 | unlock_new_inode(whiteout); | ||
| 3375 | iput(whiteout); | ||
| 3376 | } | ||
| 3347 | if (handle) | 3377 | if (handle) |
| 3348 | ext4_journal_stop(handle); | 3378 | ext4_journal_stop(handle); |
| 3349 | return retval; | 3379 | return retval; |
| @@ -3476,18 +3506,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3476 | struct inode *new_dir, struct dentry *new_dentry, | 3506 | struct inode *new_dir, struct dentry *new_dentry, |
| 3477 | unsigned int flags) | 3507 | unsigned int flags) |
| 3478 | { | 3508 | { |
| 3479 | if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) | 3509 | if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) |
| 3480 | return -EINVAL; | 3510 | return -EINVAL; |
| 3481 | 3511 | ||
| 3482 | if (flags & RENAME_EXCHANGE) { | 3512 | if (flags & RENAME_EXCHANGE) { |
| 3483 | return ext4_cross_rename(old_dir, old_dentry, | 3513 | return ext4_cross_rename(old_dir, old_dentry, |
| 3484 | new_dir, new_dentry); | 3514 | new_dir, new_dentry); |
| 3485 | } | 3515 | } |
| 3486 | /* | 3516 | |
| 3487 | * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" | 3517 | return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); |
| 3488 | * is equivalent to regular rename. | ||
| 3489 | */ | ||
| 3490 | return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
| 3491 | } | 3518 | } |
| 3492 | 3519 | ||
| 3493 | /* | 3520 | /* |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1e43b905ff98..ca4588388fc3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
| @@ -1081,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, | |||
| 1081 | break; | 1081 | break; |
| 1082 | 1082 | ||
| 1083 | if (meta_bg == 0) | 1083 | if (meta_bg == 0) |
| 1084 | backup_block = group * bpg + blk_off; | 1084 | backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; |
| 1085 | else | 1085 | else |
| 1086 | backup_block = (ext4_group_first_block_no(sb, group) + | 1086 | backup_block = (ext4_group_first_block_no(sb, group) + |
| 1087 | ext4_bg_has_super(sb, group)); | 1087 | ext4_bg_has_super(sb, group)); |
| @@ -1212,8 +1212,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, | |||
| 1212 | { | 1212 | { |
| 1213 | struct buffer_head *bh; | 1213 | struct buffer_head *bh; |
| 1214 | 1214 | ||
| 1215 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 1215 | if (!ext4_has_metadata_csum(sb)) |
| 1216 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 1217 | return 0; | 1216 | return 0; |
| 1218 | 1217 | ||
| 1219 | bh = ext4_get_bitmap(sb, group_data->inode_bitmap); | 1218 | bh = ext4_get_bitmap(sb, group_data->inode_bitmap); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 05c159218bc2..2c9e6864abd9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, | |||
| 70 | static void ext4_clear_journal_err(struct super_block *sb, | 70 | static void ext4_clear_journal_err(struct super_block *sb, |
| 71 | struct ext4_super_block *es); | 71 | struct ext4_super_block *es); |
| 72 | static int ext4_sync_fs(struct super_block *sb, int wait); | 72 | static int ext4_sync_fs(struct super_block *sb, int wait); |
| 73 | static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); | ||
| 74 | static int ext4_remount(struct super_block *sb, int *flags, char *data); | 73 | static int ext4_remount(struct super_block *sb, int *flags, char *data); |
| 75 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); | 74 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); |
| 76 | static int ext4_unfreeze(struct super_block *sb); | 75 | static int ext4_unfreeze(struct super_block *sb); |
| @@ -141,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb, | |||
| 141 | static int ext4_superblock_csum_verify(struct super_block *sb, | 140 | static int ext4_superblock_csum_verify(struct super_block *sb, |
| 142 | struct ext4_super_block *es) | 141 | struct ext4_super_block *es) |
| 143 | { | 142 | { |
| 144 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 143 | if (!ext4_has_metadata_csum(sb)) |
| 145 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 146 | return 1; | 144 | return 1; |
| 147 | 145 | ||
| 148 | return es->s_checksum == ext4_superblock_csum(sb, es); | 146 | return es->s_checksum == ext4_superblock_csum(sb, es); |
| @@ -152,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb) | |||
| 152 | { | 150 | { |
| 153 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 151 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
| 154 | 152 | ||
| 155 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 153 | if (!ext4_has_metadata_csum(sb)) |
| 156 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 157 | return; | 154 | return; |
| 158 | 155 | ||
| 159 | es->s_checksum = ext4_superblock_csum(sb, es); | 156 | es->s_checksum = ext4_superblock_csum(sb, es); |
| @@ -820,10 +817,9 @@ static void ext4_put_super(struct super_block *sb) | |||
| 820 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 817 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
| 821 | percpu_counter_destroy(&sbi->s_dirs_counter); | 818 | percpu_counter_destroy(&sbi->s_dirs_counter); |
| 822 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); | 819 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); |
| 823 | percpu_counter_destroy(&sbi->s_extent_cache_cnt); | ||
| 824 | brelse(sbi->s_sbh); | 820 | brelse(sbi->s_sbh); |
| 825 | #ifdef CONFIG_QUOTA | 821 | #ifdef CONFIG_QUOTA |
| 826 | for (i = 0; i < MAXQUOTAS; i++) | 822 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| 827 | kfree(sbi->s_qf_names[i]); | 823 | kfree(sbi->s_qf_names[i]); |
| 828 | #endif | 824 | #endif |
| 829 | 825 | ||
| @@ -885,6 +881,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 885 | ext4_es_init_tree(&ei->i_es_tree); | 881 | ext4_es_init_tree(&ei->i_es_tree); |
| 886 | rwlock_init(&ei->i_es_lock); | 882 | rwlock_init(&ei->i_es_lock); |
| 887 | INIT_LIST_HEAD(&ei->i_es_lru); | 883 | INIT_LIST_HEAD(&ei->i_es_lru); |
| 884 | ei->i_es_all_nr = 0; | ||
| 888 | ei->i_es_lru_nr = 0; | 885 | ei->i_es_lru_nr = 0; |
| 889 | ei->i_touch_when = 0; | 886 | ei->i_touch_when = 0; |
| 890 | ei->i_reserved_data_blocks = 0; | 887 | ei->i_reserved_data_blocks = 0; |
| @@ -1002,7 +999,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, | |||
| 1002 | * Currently we don't know the generation for parent directory, so | 999 | * Currently we don't know the generation for parent directory, so |
| 1003 | * a generation of 0 means "accept any" | 1000 | * a generation of 0 means "accept any" |
| 1004 | */ | 1001 | */ |
| 1005 | inode = ext4_iget(sb, ino); | 1002 | inode = ext4_iget_normal(sb, ino); |
| 1006 | if (IS_ERR(inode)) | 1003 | if (IS_ERR(inode)) |
| 1007 | return ERR_CAST(inode); | 1004 | return ERR_CAST(inode); |
| 1008 | if (generation && inode->i_generation != generation) { | 1005 | if (generation && inode->i_generation != generation) { |
| @@ -1124,25 +1121,6 @@ static const struct super_operations ext4_sops = { | |||
| 1124 | .bdev_try_to_free_page = bdev_try_to_free_page, | 1121 | .bdev_try_to_free_page = bdev_try_to_free_page, |
| 1125 | }; | 1122 | }; |
| 1126 | 1123 | ||
| 1127 | static const struct super_operations ext4_nojournal_sops = { | ||
| 1128 | .alloc_inode = ext4_alloc_inode, | ||
| 1129 | .destroy_inode = ext4_destroy_inode, | ||
| 1130 | .write_inode = ext4_write_inode, | ||
| 1131 | .dirty_inode = ext4_dirty_inode, | ||
| 1132 | .drop_inode = ext4_drop_inode, | ||
| 1133 | .evict_inode = ext4_evict_inode, | ||
| 1134 | .sync_fs = ext4_sync_fs_nojournal, | ||
| 1135 | .put_super = ext4_put_super, | ||
| 1136 | .statfs = ext4_statfs, | ||
| 1137 | .remount_fs = ext4_remount, | ||
| 1138 | .show_options = ext4_show_options, | ||
| 1139 | #ifdef CONFIG_QUOTA | ||
| 1140 | .quota_read = ext4_quota_read, | ||
| 1141 | .quota_write = ext4_quota_write, | ||
| 1142 | #endif | ||
| 1143 | .bdev_try_to_free_page = bdev_try_to_free_page, | ||
| 1144 | }; | ||
| 1145 | |||
| 1146 | static const struct export_operations ext4_export_ops = { | 1124 | static const struct export_operations ext4_export_ops = { |
| 1147 | .fh_to_dentry = ext4_fh_to_dentry, | 1125 | .fh_to_dentry = ext4_fh_to_dentry, |
| 1148 | .fh_to_parent = ext4_fh_to_parent, | 1126 | .fh_to_parent = ext4_fh_to_parent, |
| @@ -1712,13 +1690,6 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1712 | "not specified"); | 1690 | "not specified"); |
| 1713 | return 0; | 1691 | return 0; |
| 1714 | } | 1692 | } |
| 1715 | } else { | ||
| 1716 | if (sbi->s_jquota_fmt) { | ||
| 1717 | ext4_msg(sb, KERN_ERR, "journaled quota format " | ||
| 1718 | "specified with no journaling " | ||
| 1719 | "enabled"); | ||
| 1720 | return 0; | ||
| 1721 | } | ||
| 1722 | } | 1693 | } |
| 1723 | #endif | 1694 | #endif |
| 1724 | if (test_opt(sb, DIOREAD_NOLOCK)) { | 1695 | if (test_opt(sb, DIOREAD_NOLOCK)) { |
| @@ -2016,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | |||
| 2016 | __u16 crc = 0; | 1987 | __u16 crc = 0; |
| 2017 | __le32 le_group = cpu_to_le32(block_group); | 1988 | __le32 le_group = cpu_to_le32(block_group); |
| 2018 | 1989 | ||
| 2019 | if ((sbi->s_es->s_feature_ro_compat & | 1990 | if (ext4_has_metadata_csum(sbi->s_sb)) { |
| 2020 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { | ||
| 2021 | /* Use new metadata_csum algorithm */ | 1991 | /* Use new metadata_csum algorithm */ |
| 2022 | __le16 save_csum; | 1992 | __le16 save_csum; |
| 2023 | __u32 csum32; | 1993 | __u32 csum32; |
| @@ -2035,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | |||
| 2035 | } | 2005 | } |
| 2036 | 2006 | ||
| 2037 | /* old crc16 code */ | 2007 | /* old crc16 code */ |
| 2008 | if (!(sbi->s_es->s_feature_ro_compat & | ||
| 2009 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) | ||
| 2010 | return 0; | ||
| 2011 | |||
| 2038 | offset = offsetof(struct ext4_group_desc, bg_checksum); | 2012 | offset = offsetof(struct ext4_group_desc, bg_checksum); |
| 2039 | 2013 | ||
| 2040 | crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); | 2014 | crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); |
| @@ -2191,7 +2165,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 2191 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { | 2165 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { |
| 2192 | /* don't clear list on RO mount w/ errors */ | 2166 | /* don't clear list on RO mount w/ errors */ |
| 2193 | if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { | 2167 | if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { |
| 2194 | jbd_debug(1, "Errors on filesystem, " | 2168 | ext4_msg(sb, KERN_INFO, "Errors on filesystem, " |
| 2195 | "clearing orphan list.\n"); | 2169 | "clearing orphan list.\n"); |
| 2196 | es->s_last_orphan = 0; | 2170 | es->s_last_orphan = 0; |
| 2197 | } | 2171 | } |
| @@ -2207,7 +2181,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 2207 | /* Needed for iput() to work correctly and not trash data */ | 2181 | /* Needed for iput() to work correctly and not trash data */ |
| 2208 | sb->s_flags |= MS_ACTIVE; | 2182 | sb->s_flags |= MS_ACTIVE; |
| 2209 | /* Turn on quotas so that they are updated correctly */ | 2183 | /* Turn on quotas so that they are updated correctly */ |
| 2210 | for (i = 0; i < MAXQUOTAS; i++) { | 2184 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
| 2211 | if (EXT4_SB(sb)->s_qf_names[i]) { | 2185 | if (EXT4_SB(sb)->s_qf_names[i]) { |
| 2212 | int ret = ext4_quota_on_mount(sb, i); | 2186 | int ret = ext4_quota_on_mount(sb, i); |
| 2213 | if (ret < 0) | 2187 | if (ret < 0) |
| @@ -2263,7 +2237,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
| 2263 | PLURAL(nr_truncates)); | 2237 | PLURAL(nr_truncates)); |
| 2264 | #ifdef CONFIG_QUOTA | 2238 | #ifdef CONFIG_QUOTA |
| 2265 | /* Turn quotas off */ | 2239 | /* Turn quotas off */ |
| 2266 | for (i = 0; i < MAXQUOTAS; i++) { | 2240 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
| 2267 | if (sb_dqopt(sb)->files[i]) | 2241 | if (sb_dqopt(sb)->files[i]) |
| 2268 | dquot_quota_off(sb, i); | 2242 | dquot_quota_off(sb, i); |
| 2269 | } | 2243 | } |
| @@ -2548,6 +2522,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, | |||
| 2548 | return count; | 2522 | return count; |
| 2549 | } | 2523 | } |
| 2550 | 2524 | ||
| 2525 | static ssize_t es_ui_show(struct ext4_attr *a, | ||
| 2526 | struct ext4_sb_info *sbi, char *buf) | ||
| 2527 | { | ||
| 2528 | |||
| 2529 | unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + | ||
| 2530 | a->u.offset); | ||
| 2531 | |||
| 2532 | return snprintf(buf, PAGE_SIZE, "%u\n", *ui); | ||
| 2533 | } | ||
| 2534 | |||
| 2551 | static ssize_t reserved_clusters_show(struct ext4_attr *a, | 2535 | static ssize_t reserved_clusters_show(struct ext4_attr *a, |
| 2552 | struct ext4_sb_info *sbi, char *buf) | 2536 | struct ext4_sb_info *sbi, char *buf) |
| 2553 | { | 2537 | { |
| @@ -2601,14 +2585,29 @@ static struct ext4_attr ext4_attr_##_name = { \ | |||
| 2601 | .offset = offsetof(struct ext4_sb_info, _elname),\ | 2585 | .offset = offsetof(struct ext4_sb_info, _elname),\ |
| 2602 | }, \ | 2586 | }, \ |
| 2603 | } | 2587 | } |
| 2588 | |||
| 2589 | #define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ | ||
| 2590 | static struct ext4_attr ext4_attr_##_name = { \ | ||
| 2591 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | ||
| 2592 | .show = _show, \ | ||
| 2593 | .store = _store, \ | ||
| 2594 | .u = { \ | ||
| 2595 | .offset = offsetof(struct ext4_super_block, _elname), \ | ||
| 2596 | }, \ | ||
| 2597 | } | ||
| 2598 | |||
| 2604 | #define EXT4_ATTR(name, mode, show, store) \ | 2599 | #define EXT4_ATTR(name, mode, show, store) \ |
| 2605 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | 2600 | static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) |
| 2606 | 2601 | ||
| 2607 | #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) | 2602 | #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) |
| 2608 | #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) | 2603 | #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) |
| 2609 | #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) | 2604 | #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) |
| 2605 | |||
| 2606 | #define EXT4_RO_ATTR_ES_UI(name, elname) \ | ||
| 2607 | EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) | ||
| 2610 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ | 2608 | #define EXT4_RW_ATTR_SBI_UI(name, elname) \ |
| 2611 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) | 2609 | EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) |
| 2610 | |||
| 2612 | #define ATTR_LIST(name) &ext4_attr_##name.attr | 2611 | #define ATTR_LIST(name) &ext4_attr_##name.attr |
| 2613 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ | 2612 | #define EXT4_DEPRECATED_ATTR(_name, _val) \ |
| 2614 | static struct ext4_attr ext4_attr_##_name = { \ | 2613 | static struct ext4_attr ext4_attr_##_name = { \ |
| @@ -2641,6 +2640,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int | |||
| 2641 | EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); | 2640 | EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); |
| 2642 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); | 2641 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); |
| 2643 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); | 2642 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); |
| 2643 | EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); | ||
| 2644 | EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); | ||
| 2645 | EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); | ||
| 2644 | 2646 | ||
| 2645 | static struct attribute *ext4_attrs[] = { | 2647 | static struct attribute *ext4_attrs[] = { |
| 2646 | ATTR_LIST(delayed_allocation_blocks), | 2648 | ATTR_LIST(delayed_allocation_blocks), |
| @@ -2664,6 +2666,9 @@ static struct attribute *ext4_attrs[] = { | |||
| 2664 | ATTR_LIST(warning_ratelimit_burst), | 2666 | ATTR_LIST(warning_ratelimit_burst), |
| 2665 | ATTR_LIST(msg_ratelimit_interval_ms), | 2667 | ATTR_LIST(msg_ratelimit_interval_ms), |
| 2666 | ATTR_LIST(msg_ratelimit_burst), | 2668 | ATTR_LIST(msg_ratelimit_burst), |
| 2669 | ATTR_LIST(errors_count), | ||
| 2670 | ATTR_LIST(first_error_time), | ||
| 2671 | ATTR_LIST(last_error_time), | ||
| 2667 | NULL, | 2672 | NULL, |
| 2668 | }; | 2673 | }; |
| 2669 | 2674 | ||
| @@ -2723,9 +2728,25 @@ static void ext4_feat_release(struct kobject *kobj) | |||
| 2723 | complete(&ext4_feat->f_kobj_unregister); | 2728 | complete(&ext4_feat->f_kobj_unregister); |
| 2724 | } | 2729 | } |
| 2725 | 2730 | ||
| 2731 | static ssize_t ext4_feat_show(struct kobject *kobj, | ||
| 2732 | struct attribute *attr, char *buf) | ||
| 2733 | { | ||
| 2734 | return snprintf(buf, PAGE_SIZE, "supported\n"); | ||
| 2735 | } | ||
| 2736 | |||
| 2737 | /* | ||
| 2738 | * We can not use ext4_attr_show/store because it relies on the kobject | ||
| 2739 | * being embedded in the ext4_sb_info structure which is definitely not | ||
| 2740 | * true in this case. | ||
| 2741 | */ | ||
| 2742 | static const struct sysfs_ops ext4_feat_ops = { | ||
| 2743 | .show = ext4_feat_show, | ||
| 2744 | .store = NULL, | ||
| 2745 | }; | ||
| 2746 | |||
| 2726 | static struct kobj_type ext4_feat_ktype = { | 2747 | static struct kobj_type ext4_feat_ktype = { |
| 2727 | .default_attrs = ext4_feat_attrs, | 2748 | .default_attrs = ext4_feat_attrs, |
| 2728 | .sysfs_ops = &ext4_attr_ops, | 2749 | .sysfs_ops = &ext4_feat_ops, |
| 2729 | .release = ext4_feat_release, | 2750 | .release = ext4_feat_release, |
| 2730 | }; | 2751 | }; |
| 2731 | 2752 | ||
| @@ -3179,8 +3200,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) | |||
| 3179 | int compat, incompat; | 3200 | int compat, incompat; |
| 3180 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3201 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 3181 | 3202 | ||
| 3182 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3203 | if (ext4_has_metadata_csum(sb)) { |
| 3183 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | ||
| 3184 | /* journal checksum v3 */ | 3204 | /* journal checksum v3 */ |
| 3185 | compat = 0; | 3205 | compat = 0; |
| 3186 | incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; | 3206 | incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; |
| @@ -3190,6 +3210,10 @@ static int set_journal_csum_feature_set(struct super_block *sb) | |||
| 3190 | incompat = 0; | 3210 | incompat = 0; |
| 3191 | } | 3211 | } |
| 3192 | 3212 | ||
| 3213 | jbd2_journal_clear_features(sbi->s_journal, | ||
| 3214 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | ||
| 3215 | JBD2_FEATURE_INCOMPAT_CSUM_V3 | | ||
| 3216 | JBD2_FEATURE_INCOMPAT_CSUM_V2); | ||
| 3193 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { | 3217 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { |
| 3194 | ret = jbd2_journal_set_features(sbi->s_journal, | 3218 | ret = jbd2_journal_set_features(sbi->s_journal, |
| 3195 | compat, 0, | 3219 | compat, 0, |
| @@ -3202,11 +3226,8 @@ static int set_journal_csum_feature_set(struct super_block *sb) | |||
| 3202 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, | 3226 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, |
| 3203 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | 3227 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); |
| 3204 | } else { | 3228 | } else { |
| 3205 | jbd2_journal_clear_features(sbi->s_journal, | 3229 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, |
| 3206 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | 3230 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); |
| 3207 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | | ||
| 3208 | JBD2_FEATURE_INCOMPAT_CSUM_V3 | | ||
| 3209 | JBD2_FEATURE_INCOMPAT_CSUM_V2); | ||
| 3210 | } | 3231 | } |
| 3211 | 3232 | ||
| 3212 | return ret; | 3233 | return ret; |
| @@ -3436,7 +3457,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3436 | logical_sb_block = sb_block; | 3457 | logical_sb_block = sb_block; |
| 3437 | } | 3458 | } |
| 3438 | 3459 | ||
| 3439 | if (!(bh = sb_bread(sb, logical_sb_block))) { | 3460 | if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { |
| 3440 | ext4_msg(sb, KERN_ERR, "unable to read superblock"); | 3461 | ext4_msg(sb, KERN_ERR, "unable to read superblock"); |
| 3441 | goto out_fail; | 3462 | goto out_fail; |
| 3442 | } | 3463 | } |
| @@ -3487,8 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3487 | } | 3508 | } |
| 3488 | 3509 | ||
| 3489 | /* Precompute checksum seed for all metadata */ | 3510 | /* Precompute checksum seed for all metadata */ |
| 3490 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3511 | if (ext4_has_metadata_csum(sb)) |
| 3491 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 3492 | sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, | 3512 | sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, |
| 3493 | sizeof(es->s_uuid)); | 3513 | sizeof(es->s_uuid)); |
| 3494 | 3514 | ||
| @@ -3506,6 +3526,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3506 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 3526 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
| 3507 | set_opt(sb, POSIX_ACL); | 3527 | set_opt(sb, POSIX_ACL); |
| 3508 | #endif | 3528 | #endif |
| 3529 | /* don't forget to enable journal_csum when metadata_csum is enabled. */ | ||
| 3530 | if (ext4_has_metadata_csum(sb)) | ||
| 3531 | set_opt(sb, JOURNAL_CHECKSUM); | ||
| 3532 | |||
| 3509 | if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) | 3533 | if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) |
| 3510 | set_opt(sb, JOURNAL_DATA); | 3534 | set_opt(sb, JOURNAL_DATA); |
| 3511 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) | 3535 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) |
| @@ -3519,8 +3543,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3519 | set_opt(sb, ERRORS_CONT); | 3543 | set_opt(sb, ERRORS_CONT); |
| 3520 | else | 3544 | else |
| 3521 | set_opt(sb, ERRORS_RO); | 3545 | set_opt(sb, ERRORS_RO); |
| 3522 | if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) | 3546 | /* block_validity enabled by default; disable with noblock_validity */ |
| 3523 | set_opt(sb, BLOCK_VALIDITY); | 3547 | set_opt(sb, BLOCK_VALIDITY); |
| 3524 | if (def_mount_opts & EXT4_DEFM_DISCARD) | 3548 | if (def_mount_opts & EXT4_DEFM_DISCARD) |
| 3525 | set_opt(sb, DISCARD); | 3549 | set_opt(sb, DISCARD); |
| 3526 | 3550 | ||
| @@ -3646,7 +3670,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3646 | brelse(bh); | 3670 | brelse(bh); |
| 3647 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; | 3671 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; |
| 3648 | offset = do_div(logical_sb_block, blocksize); | 3672 | offset = do_div(logical_sb_block, blocksize); |
| 3649 | bh = sb_bread(sb, logical_sb_block); | 3673 | bh = sb_bread_unmovable(sb, logical_sb_block); |
| 3650 | if (!bh) { | 3674 | if (!bh) { |
| 3651 | ext4_msg(sb, KERN_ERR, | 3675 | ext4_msg(sb, KERN_ERR, |
| 3652 | "Can't read superblock on 2nd try"); | 3676 | "Can't read superblock on 2nd try"); |
| @@ -3868,7 +3892,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3868 | 3892 | ||
| 3869 | for (i = 0; i < db_count; i++) { | 3893 | for (i = 0; i < db_count; i++) { |
| 3870 | block = descriptor_loc(sb, logical_sb_block, i); | 3894 | block = descriptor_loc(sb, logical_sb_block, i); |
| 3871 | sbi->s_group_desc[i] = sb_bread(sb, block); | 3895 | sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); |
| 3872 | if (!sbi->s_group_desc[i]) { | 3896 | if (!sbi->s_group_desc[i]) { |
| 3873 | ext4_msg(sb, KERN_ERR, | 3897 | ext4_msg(sb, KERN_ERR, |
| 3874 | "can't read group descriptor %d", i); | 3898 | "can't read group descriptor %d", i); |
| @@ -3890,13 +3914,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3890 | sbi->s_err_report.data = (unsigned long) sb; | 3914 | sbi->s_err_report.data = (unsigned long) sb; |
| 3891 | 3915 | ||
| 3892 | /* Register extent status tree shrinker */ | 3916 | /* Register extent status tree shrinker */ |
| 3893 | ext4_es_register_shrinker(sbi); | 3917 | if (ext4_es_register_shrinker(sbi)) |
| 3894 | |||
| 3895 | err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); | ||
| 3896 | if (err) { | ||
| 3897 | ext4_msg(sb, KERN_ERR, "insufficient memory"); | ||
| 3898 | goto failed_mount3; | 3918 | goto failed_mount3; |
| 3899 | } | ||
| 3900 | 3919 | ||
| 3901 | sbi->s_stripe = ext4_get_stripe_size(sbi); | 3920 | sbi->s_stripe = ext4_get_stripe_size(sbi); |
| 3902 | sbi->s_extent_max_zeroout_kb = 32; | 3921 | sbi->s_extent_max_zeroout_kb = 32; |
| @@ -3904,11 +3923,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3904 | /* | 3923 | /* |
| 3905 | * set up enough so that it can read an inode | 3924 | * set up enough so that it can read an inode |
| 3906 | */ | 3925 | */ |
| 3907 | if (!test_opt(sb, NOLOAD) && | 3926 | sb->s_op = &ext4_sops; |
| 3908 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) | ||
| 3909 | sb->s_op = &ext4_sops; | ||
| 3910 | else | ||
| 3911 | sb->s_op = &ext4_nojournal_sops; | ||
| 3912 | sb->s_export_op = &ext4_export_ops; | 3927 | sb->s_export_op = &ext4_export_ops; |
| 3913 | sb->s_xattr = ext4_xattr_handlers; | 3928 | sb->s_xattr = ext4_xattr_handlers; |
| 3914 | #ifdef CONFIG_QUOTA | 3929 | #ifdef CONFIG_QUOTA |
| @@ -3932,7 +3947,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3932 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && | 3947 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && |
| 3933 | !(sb->s_flags & MS_RDONLY)) | 3948 | !(sb->s_flags & MS_RDONLY)) |
| 3934 | if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) | 3949 | if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) |
| 3935 | goto failed_mount3; | 3950 | goto failed_mount3a; |
| 3936 | 3951 | ||
| 3937 | /* | 3952 | /* |
| 3938 | * The first inode we look at is the journal inode. Don't try | 3953 | * The first inode we look at is the journal inode. Don't try |
| @@ -3941,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3941 | if (!test_opt(sb, NOLOAD) && | 3956 | if (!test_opt(sb, NOLOAD) && |
| 3942 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { | 3957 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { |
| 3943 | if (ext4_load_journal(sb, es, journal_devnum)) | 3958 | if (ext4_load_journal(sb, es, journal_devnum)) |
| 3944 | goto failed_mount3; | 3959 | goto failed_mount3a; |
| 3945 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && | 3960 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && |
| 3946 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { | 3961 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { |
| 3947 | ext4_msg(sb, KERN_ERR, "required journal recovery " | 3962 | ext4_msg(sb, KERN_ERR, "required journal recovery " |
| @@ -4229,10 +4244,10 @@ failed_mount_wq: | |||
| 4229 | jbd2_journal_destroy(sbi->s_journal); | 4244 | jbd2_journal_destroy(sbi->s_journal); |
| 4230 | sbi->s_journal = NULL; | 4245 | sbi->s_journal = NULL; |
| 4231 | } | 4246 | } |
| 4232 | failed_mount3: | 4247 | failed_mount3a: |
| 4233 | ext4_es_unregister_shrinker(sbi); | 4248 | ext4_es_unregister_shrinker(sbi); |
| 4249 | failed_mount3: | ||
| 4234 | del_timer_sync(&sbi->s_err_report); | 4250 | del_timer_sync(&sbi->s_err_report); |
| 4235 | percpu_counter_destroy(&sbi->s_extent_cache_cnt); | ||
| 4236 | if (sbi->s_mmp_tsk) | 4251 | if (sbi->s_mmp_tsk) |
| 4237 | kthread_stop(sbi->s_mmp_tsk); | 4252 | kthread_stop(sbi->s_mmp_tsk); |
| 4238 | failed_mount2: | 4253 | failed_mount2: |
| @@ -4247,7 +4262,7 @@ failed_mount: | |||
| 4247 | remove_proc_entry(sb->s_id, ext4_proc_root); | 4262 | remove_proc_entry(sb->s_id, ext4_proc_root); |
| 4248 | } | 4263 | } |
| 4249 | #ifdef CONFIG_QUOTA | 4264 | #ifdef CONFIG_QUOTA |
| 4250 | for (i = 0; i < MAXQUOTAS; i++) | 4265 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| 4251 | kfree(sbi->s_qf_names[i]); | 4266 | kfree(sbi->s_qf_names[i]); |
| 4252 | #endif | 4267 | #endif |
| 4253 | ext4_blkdev_remove(sbi); | 4268 | ext4_blkdev_remove(sbi); |
| @@ -4375,6 +4390,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
| 4375 | goto out_bdev; | 4390 | goto out_bdev; |
| 4376 | } | 4391 | } |
| 4377 | 4392 | ||
| 4393 | if ((le32_to_cpu(es->s_feature_ro_compat) & | ||
| 4394 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && | ||
| 4395 | es->s_checksum != ext4_superblock_csum(sb, es)) { | ||
| 4396 | ext4_msg(sb, KERN_ERR, "external journal has " | ||
| 4397 | "corrupt superblock"); | ||
| 4398 | brelse(bh); | ||
| 4399 | goto out_bdev; | ||
| 4400 | } | ||
| 4401 | |||
| 4378 | if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { | 4402 | if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { |
| 4379 | ext4_msg(sb, KERN_ERR, "journal UUID does not match"); | 4403 | ext4_msg(sb, KERN_ERR, "journal UUID does not match"); |
| 4380 | brelse(bh); | 4404 | brelse(bh); |
| @@ -4677,15 +4701,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
| 4677 | * being sent at the end of the function. But we can skip it if | 4701 | * being sent at the end of the function. But we can skip it if |
| 4678 | * transaction_commit will do it for us. | 4702 | * transaction_commit will do it for us. |
| 4679 | */ | 4703 | */ |
| 4680 | target = jbd2_get_latest_transaction(sbi->s_journal); | 4704 | if (sbi->s_journal) { |
| 4681 | if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && | 4705 | target = jbd2_get_latest_transaction(sbi->s_journal); |
| 4682 | !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) | 4706 | if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && |
| 4707 | !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) | ||
| 4708 | needs_barrier = true; | ||
| 4709 | |||
| 4710 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | ||
| 4711 | if (wait) | ||
| 4712 | ret = jbd2_log_wait_commit(sbi->s_journal, | ||
| 4713 | target); | ||
| 4714 | } | ||
| 4715 | } else if (wait && test_opt(sb, BARRIER)) | ||
| 4683 | needs_barrier = true; | 4716 | needs_barrier = true; |
| 4684 | |||
| 4685 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | ||
| 4686 | if (wait) | ||
| 4687 | ret = jbd2_log_wait_commit(sbi->s_journal, target); | ||
| 4688 | } | ||
| 4689 | if (needs_barrier) { | 4717 | if (needs_barrier) { |
| 4690 | int err; | 4718 | int err; |
| 4691 | err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | 4719 | err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); |
| @@ -4696,19 +4724,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
| 4696 | return ret; | 4724 | return ret; |
| 4697 | } | 4725 | } |
| 4698 | 4726 | ||
| 4699 | static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) | ||
| 4700 | { | ||
| 4701 | int ret = 0; | ||
| 4702 | |||
| 4703 | trace_ext4_sync_fs(sb, wait); | ||
| 4704 | flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | ||
| 4705 | dquot_writeback_dquots(sb, -1); | ||
| 4706 | if (wait && test_opt(sb, BARRIER)) | ||
| 4707 | ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); | ||
| 4708 | |||
| 4709 | return ret; | ||
| 4710 | } | ||
| 4711 | |||
| 4712 | /* | 4727 | /* |
| 4713 | * LVM calls this function before a (read-only) snapshot is created. This | 4728 | * LVM calls this function before a (read-only) snapshot is created. This |
| 4714 | * gives us a chance to flush the journal completely and mark the fs clean. | 4729 | * gives us a chance to flush the journal completely and mark the fs clean. |
| @@ -4727,23 +4742,26 @@ static int ext4_freeze(struct super_block *sb) | |||
| 4727 | 4742 | ||
| 4728 | journal = EXT4_SB(sb)->s_journal; | 4743 | journal = EXT4_SB(sb)->s_journal; |
| 4729 | 4744 | ||
| 4730 | /* Now we set up the journal barrier. */ | 4745 | if (journal) { |
| 4731 | jbd2_journal_lock_updates(journal); | 4746 | /* Now we set up the journal barrier. */ |
| 4747 | jbd2_journal_lock_updates(journal); | ||
| 4732 | 4748 | ||
| 4733 | /* | 4749 | /* |
| 4734 | * Don't clear the needs_recovery flag if we failed to flush | 4750 | * Don't clear the needs_recovery flag if we failed to |
| 4735 | * the journal. | 4751 | * flush the journal. |
| 4736 | */ | 4752 | */ |
| 4737 | error = jbd2_journal_flush(journal); | 4753 | error = jbd2_journal_flush(journal); |
| 4738 | if (error < 0) | 4754 | if (error < 0) |
| 4739 | goto out; | 4755 | goto out; |
| 4756 | } | ||
| 4740 | 4757 | ||
| 4741 | /* Journal blocked and flushed, clear needs_recovery flag. */ | 4758 | /* Journal blocked and flushed, clear needs_recovery flag. */ |
| 4742 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 4759 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
| 4743 | error = ext4_commit_super(sb, 1); | 4760 | error = ext4_commit_super(sb, 1); |
| 4744 | out: | 4761 | out: |
| 4745 | /* we rely on upper layer to stop further updates */ | 4762 | if (journal) |
| 4746 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | 4763 | /* we rely on upper layer to stop further updates */ |
| 4764 | jbd2_journal_unlock_updates(journal); | ||
| 4747 | return error; | 4765 | return error; |
| 4748 | } | 4766 | } |
| 4749 | 4767 | ||
| @@ -4774,7 +4792,7 @@ struct ext4_mount_options { | |||
| 4774 | u32 s_min_batch_time, s_max_batch_time; | 4792 | u32 s_min_batch_time, s_max_batch_time; |
| 4775 | #ifdef CONFIG_QUOTA | 4793 | #ifdef CONFIG_QUOTA |
| 4776 | int s_jquota_fmt; | 4794 | int s_jquota_fmt; |
| 4777 | char *s_qf_names[MAXQUOTAS]; | 4795 | char *s_qf_names[EXT4_MAXQUOTAS]; |
| 4778 | #endif | 4796 | #endif |
| 4779 | }; | 4797 | }; |
| 4780 | 4798 | ||
| @@ -4804,7 +4822,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 4804 | old_opts.s_max_batch_time = sbi->s_max_batch_time; | 4822 | old_opts.s_max_batch_time = sbi->s_max_batch_time; |
| 4805 | #ifdef CONFIG_QUOTA | 4823 | #ifdef CONFIG_QUOTA |
| 4806 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; | 4824 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; |
| 4807 | for (i = 0; i < MAXQUOTAS; i++) | 4825 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| 4808 | if (sbi->s_qf_names[i]) { | 4826 | if (sbi->s_qf_names[i]) { |
| 4809 | old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], | 4827 | old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], |
| 4810 | GFP_KERNEL); | 4828 | GFP_KERNEL); |
| @@ -4828,6 +4846,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 4828 | goto restore_opts; | 4846 | goto restore_opts; |
| 4829 | } | 4847 | } |
| 4830 | 4848 | ||
| 4849 | if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ | ||
| 4850 | test_opt(sb, JOURNAL_CHECKSUM)) { | ||
| 4851 | ext4_msg(sb, KERN_ERR, "changing journal_checksum " | ||
| 4852 | "during remount not supported"); | ||
| 4853 | err = -EINVAL; | ||
| 4854 | goto restore_opts; | ||
| 4855 | } | ||
| 4856 | |||
| 4831 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | 4857 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { |
| 4832 | if (test_opt2(sb, EXPLICIT_DELALLOC)) { | 4858 | if (test_opt2(sb, EXPLICIT_DELALLOC)) { |
| 4833 | ext4_msg(sb, KERN_ERR, "can't mount with " | 4859 | ext4_msg(sb, KERN_ERR, "can't mount with " |
| @@ -4965,7 +4991,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 4965 | 4991 | ||
| 4966 | #ifdef CONFIG_QUOTA | 4992 | #ifdef CONFIG_QUOTA |
| 4967 | /* Release old quota file names */ | 4993 | /* Release old quota file names */ |
| 4968 | for (i = 0; i < MAXQUOTAS; i++) | 4994 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| 4969 | kfree(old_opts.s_qf_names[i]); | 4995 | kfree(old_opts.s_qf_names[i]); |
| 4970 | if (enable_quota) { | 4996 | if (enable_quota) { |
| 4971 | if (sb_any_quota_suspended(sb)) | 4997 | if (sb_any_quota_suspended(sb)) |
| @@ -4994,7 +5020,7 @@ restore_opts: | |||
| 4994 | sbi->s_max_batch_time = old_opts.s_max_batch_time; | 5020 | sbi->s_max_batch_time = old_opts.s_max_batch_time; |
| 4995 | #ifdef CONFIG_QUOTA | 5021 | #ifdef CONFIG_QUOTA |
| 4996 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; | 5022 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; |
| 4997 | for (i = 0; i < MAXQUOTAS; i++) { | 5023 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
| 4998 | kfree(sbi->s_qf_names[i]); | 5024 | kfree(sbi->s_qf_names[i]); |
| 4999 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; | 5025 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; |
| 5000 | } | 5026 | } |
| @@ -5197,7 +5223,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, | |||
| 5197 | { | 5223 | { |
| 5198 | int err; | 5224 | int err; |
| 5199 | struct inode *qf_inode; | 5225 | struct inode *qf_inode; |
| 5200 | unsigned long qf_inums[MAXQUOTAS] = { | 5226 | unsigned long qf_inums[EXT4_MAXQUOTAS] = { |
| 5201 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), | 5227 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), |
| 5202 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) | 5228 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) |
| 5203 | }; | 5229 | }; |
| @@ -5225,13 +5251,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, | |||
| 5225 | static int ext4_enable_quotas(struct super_block *sb) | 5251 | static int ext4_enable_quotas(struct super_block *sb) |
| 5226 | { | 5252 | { |
| 5227 | int type, err = 0; | 5253 | int type, err = 0; |
| 5228 | unsigned long qf_inums[MAXQUOTAS] = { | 5254 | unsigned long qf_inums[EXT4_MAXQUOTAS] = { |
| 5229 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), | 5255 | le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), |
| 5230 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) | 5256 | le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) |
| 5231 | }; | 5257 | }; |
| 5232 | 5258 | ||
| 5233 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; | 5259 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; |
| 5234 | for (type = 0; type < MAXQUOTAS; type++) { | 5260 | for (type = 0; type < EXT4_MAXQUOTAS; type++) { |
| 5235 | if (qf_inums[type]) { | 5261 | if (qf_inums[type]) { |
| 5236 | err = ext4_quota_enable(sb, type, QFMT_VFS_V1, | 5262 | err = ext4_quota_enable(sb, type, QFMT_VFS_V1, |
| 5237 | DQUOT_USAGE_ENABLED); | 5263 | DQUOT_USAGE_ENABLED); |
| @@ -5309,7 +5335,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, | |||
| 5309 | { | 5335 | { |
| 5310 | struct inode *inode = sb_dqopt(sb)->files[type]; | 5336 | struct inode *inode = sb_dqopt(sb)->files[type]; |
| 5311 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); | 5337 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); |
| 5312 | int err = 0; | ||
| 5313 | int offset = off & (sb->s_blocksize - 1); | 5338 | int offset = off & (sb->s_blocksize - 1); |
| 5314 | int tocopy; | 5339 | int tocopy; |
| 5315 | size_t toread; | 5340 | size_t toread; |
| @@ -5324,9 +5349,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, | |||
| 5324 | while (toread > 0) { | 5349 | while (toread > 0) { |
| 5325 | tocopy = sb->s_blocksize - offset < toread ? | 5350 | tocopy = sb->s_blocksize - offset < toread ? |
| 5326 | sb->s_blocksize - offset : toread; | 5351 | sb->s_blocksize - offset : toread; |
| 5327 | bh = ext4_bread(NULL, inode, blk, 0, &err); | 5352 | bh = ext4_bread(NULL, inode, blk, 0); |
| 5328 | if (err) | 5353 | if (IS_ERR(bh)) |
| 5329 | return err; | 5354 | return PTR_ERR(bh); |
| 5330 | if (!bh) /* A hole? */ | 5355 | if (!bh) /* A hole? */ |
| 5331 | memset(data, 0, tocopy); | 5356 | memset(data, 0, tocopy); |
| 5332 | else | 5357 | else |
| @@ -5347,8 +5372,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
| 5347 | { | 5372 | { |
| 5348 | struct inode *inode = sb_dqopt(sb)->files[type]; | 5373 | struct inode *inode = sb_dqopt(sb)->files[type]; |
| 5349 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); | 5374 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); |
| 5350 | int err = 0; | 5375 | int err, offset = off & (sb->s_blocksize - 1); |
| 5351 | int offset = off & (sb->s_blocksize - 1); | ||
| 5352 | struct buffer_head *bh; | 5376 | struct buffer_head *bh; |
| 5353 | handle_t *handle = journal_current_handle(); | 5377 | handle_t *handle = journal_current_handle(); |
| 5354 | 5378 | ||
| @@ -5369,14 +5393,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
| 5369 | return -EIO; | 5393 | return -EIO; |
| 5370 | } | 5394 | } |
| 5371 | 5395 | ||
| 5372 | bh = ext4_bread(handle, inode, blk, 1, &err); | 5396 | bh = ext4_bread(handle, inode, blk, 1); |
| 5397 | if (IS_ERR(bh)) | ||
| 5398 | return PTR_ERR(bh); | ||
| 5373 | if (!bh) | 5399 | if (!bh) |
| 5374 | goto out; | 5400 | goto out; |
| 5375 | BUFFER_TRACE(bh, "get write access"); | 5401 | BUFFER_TRACE(bh, "get write access"); |
| 5376 | err = ext4_journal_get_write_access(handle, bh); | 5402 | err = ext4_journal_get_write_access(handle, bh); |
| 5377 | if (err) { | 5403 | if (err) { |
| 5378 | brelse(bh); | 5404 | brelse(bh); |
| 5379 | goto out; | 5405 | return err; |
| 5380 | } | 5406 | } |
| 5381 | lock_buffer(bh); | 5407 | lock_buffer(bh); |
| 5382 | memcpy(bh->b_data+offset, data, len); | 5408 | memcpy(bh->b_data+offset, data, len); |
| @@ -5385,8 +5411,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
| 5385 | err = ext4_handle_dirty_metadata(handle, NULL, bh); | 5411 | err = ext4_handle_dirty_metadata(handle, NULL, bh); |
| 5386 | brelse(bh); | 5412 | brelse(bh); |
| 5387 | out: | 5413 | out: |
| 5388 | if (err) | ||
| 5389 | return err; | ||
| 5390 | if (inode->i_size < off + len) { | 5414 | if (inode->i_size < off + len) { |
| 5391 | i_size_write(inode, off + len); | 5415 | i_size_write(inode, off + len); |
| 5392 | EXT4_I(inode)->i_disksize = inode->i_size; | 5416 | EXT4_I(inode)->i_disksize = inode->i_size; |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e7387337060c..1e09fc77395c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
| @@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, | |||
| 142 | sector_t block_nr, | 142 | sector_t block_nr, |
| 143 | struct ext4_xattr_header *hdr) | 143 | struct ext4_xattr_header *hdr) |
| 144 | { | 144 | { |
| 145 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 145 | if (ext4_has_metadata_csum(inode->i_sb) && |
| 146 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && | ||
| 147 | (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) | 146 | (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) |
| 148 | return 0; | 147 | return 0; |
| 149 | return 1; | 148 | return 1; |
| @@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode, | |||
| 153 | sector_t block_nr, | 152 | sector_t block_nr, |
| 154 | struct ext4_xattr_header *hdr) | 153 | struct ext4_xattr_header *hdr) |
| 155 | { | 154 | { |
| 156 | if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | 155 | if (!ext4_has_metadata_csum(inode->i_sb)) |
| 157 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) | ||
| 158 | return; | 156 | return; |
| 159 | 157 | ||
| 160 | hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); | 158 | hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); |
| @@ -190,14 +188,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
| 190 | } | 188 | } |
| 191 | 189 | ||
| 192 | static int | 190 | static int |
| 193 | ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) | 191 | ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, |
| 192 | void *value_start) | ||
| 194 | { | 193 | { |
| 195 | while (!IS_LAST_ENTRY(entry)) { | 194 | struct ext4_xattr_entry *e = entry; |
| 196 | struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); | 195 | |
| 196 | while (!IS_LAST_ENTRY(e)) { | ||
| 197 | struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); | ||
| 197 | if ((void *)next >= end) | 198 | if ((void *)next >= end) |
| 198 | return -EIO; | 199 | return -EIO; |
| 199 | entry = next; | 200 | e = next; |
| 200 | } | 201 | } |
| 202 | |||
| 203 | while (!IS_LAST_ENTRY(entry)) { | ||
| 204 | if (entry->e_value_size != 0 && | ||
| 205 | (value_start + le16_to_cpu(entry->e_value_offs) < | ||
| 206 | (void *)e + sizeof(__u32) || | ||
| 207 | value_start + le16_to_cpu(entry->e_value_offs) + | ||
| 208 | le32_to_cpu(entry->e_value_size) > end)) | ||
| 209 | return -EIO; | ||
| 210 | entry = EXT4_XATTR_NEXT(entry); | ||
| 211 | } | ||
| 212 | |||
| 201 | return 0; | 213 | return 0; |
| 202 | } | 214 | } |
| 203 | 215 | ||
| @@ -214,7 +226,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) | |||
| 214 | return -EIO; | 226 | return -EIO; |
| 215 | if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) | 227 | if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) |
| 216 | return -EIO; | 228 | return -EIO; |
| 217 | error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); | 229 | error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, |
| 230 | bh->b_data); | ||
| 218 | if (!error) | 231 | if (!error) |
| 219 | set_buffer_verified(bh); | 232 | set_buffer_verified(bh); |
| 220 | return error; | 233 | return error; |
| @@ -331,7 +344,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, | |||
| 331 | header = IHDR(inode, raw_inode); | 344 | header = IHDR(inode, raw_inode); |
| 332 | entry = IFIRST(header); | 345 | entry = IFIRST(header); |
| 333 | end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; | 346 | end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; |
| 334 | error = ext4_xattr_check_names(entry, end); | 347 | error = ext4_xattr_check_names(entry, end, entry); |
| 335 | if (error) | 348 | if (error) |
| 336 | goto cleanup; | 349 | goto cleanup; |
| 337 | error = ext4_xattr_find_entry(&entry, name_index, name, | 350 | error = ext4_xattr_find_entry(&entry, name_index, name, |
| @@ -463,7 +476,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) | |||
| 463 | raw_inode = ext4_raw_inode(&iloc); | 476 | raw_inode = ext4_raw_inode(&iloc); |
| 464 | header = IHDR(inode, raw_inode); | 477 | header = IHDR(inode, raw_inode); |
| 465 | end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; | 478 | end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; |
| 466 | error = ext4_xattr_check_names(IFIRST(header), end); | 479 | error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); |
| 467 | if (error) | 480 | if (error) |
| 468 | goto cleanup; | 481 | goto cleanup; |
| 469 | error = ext4_xattr_list_entries(dentry, IFIRST(header), | 482 | error = ext4_xattr_list_entries(dentry, IFIRST(header), |
| @@ -899,14 +912,8 @@ inserted: | |||
| 899 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 912 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
| 900 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | 913 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; |
| 901 | 914 | ||
| 902 | /* | ||
| 903 | * take i_data_sem because we will test | ||
| 904 | * i_delalloc_reserved_flag in ext4_mb_new_blocks | ||
| 905 | */ | ||
| 906 | down_read(&EXT4_I(inode)->i_data_sem); | ||
| 907 | block = ext4_new_meta_blocks(handle, inode, goal, 0, | 915 | block = ext4_new_meta_blocks(handle, inode, goal, 0, |
| 908 | NULL, &error); | 916 | NULL, &error); |
| 909 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
| 910 | if (error) | 917 | if (error) |
| 911 | goto cleanup; | 918 | goto cleanup; |
| 912 | 919 | ||
| @@ -986,7 +993,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, | |||
| 986 | is->s.here = is->s.first; | 993 | is->s.here = is->s.first; |
| 987 | is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; | 994 | is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; |
| 988 | if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { | 995 | if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { |
| 989 | error = ext4_xattr_check_names(IFIRST(header), is->s.end); | 996 | error = ext4_xattr_check_names(IFIRST(header), is->s.end, |
| 997 | IFIRST(header)); | ||
| 990 | if (error) | 998 | if (error) |
| 991 | return error; | 999 | return error; |
| 992 | /* Find the named attribute. */ | 1000 | /* Find the named attribute. */ |
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6df8d3d885e5..b8b92c2f9683 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c | |||
| @@ -736,7 +736,12 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, | |||
| 736 | } | 736 | } |
| 737 | 737 | ||
| 738 | alias = d_find_alias(inode); | 738 | alias = d_find_alias(inode); |
| 739 | if (alias && !vfat_d_anon_disconn(alias)) { | 739 | /* |
| 740 | * Checking "alias->d_parent == dentry->d_parent" to make sure | ||
| 741 | * FS is not corrupted (especially double linked dir). | ||
| 742 | */ | ||
| 743 | if (alias && alias->d_parent == dentry->d_parent && | ||
| 744 | !vfat_d_anon_disconn(alias)) { | ||
| 740 | /* | 745 | /* |
| 741 | * This inode has non anonymous-DCACHE_DISCONNECTED | 746 | * This inode has non anonymous-DCACHE_DISCONNECTED |
| 742 | * dentry. This means, the user did ->lookup() by an | 747 | * dentry. This means, the user did ->lookup() by an |
| @@ -755,12 +760,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, | |||
| 755 | 760 | ||
| 756 | out: | 761 | out: |
| 757 | mutex_unlock(&MSDOS_SB(sb)->s_lock); | 762 | mutex_unlock(&MSDOS_SB(sb)->s_lock); |
| 758 | dentry->d_time = dentry->d_parent->d_inode->i_version; | 763 | if (!inode) |
| 759 | dentry = d_splice_alias(inode, dentry); | 764 | dentry->d_time = dir->i_version; |
| 760 | if (dentry) | 765 | return d_splice_alias(inode, dentry); |
| 761 | dentry->d_time = dentry->d_parent->d_inode->i_version; | ||
| 762 | return dentry; | ||
| 763 | |||
| 764 | error: | 766 | error: |
| 765 | mutex_unlock(&MSDOS_SB(sb)->s_lock); | 767 | mutex_unlock(&MSDOS_SB(sb)->s_lock); |
| 766 | return ERR_PTR(err); | 768 | return ERR_PTR(err); |
| @@ -793,7 +795,6 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 793 | inode->i_mtime = inode->i_atime = inode->i_ctime = ts; | 795 | inode->i_mtime = inode->i_atime = inode->i_ctime = ts; |
| 794 | /* timestamp is already written, so mark_inode_dirty() is unneeded. */ | 796 | /* timestamp is already written, so mark_inode_dirty() is unneeded. */ |
| 795 | 797 | ||
| 796 | dentry->d_time = dentry->d_parent->d_inode->i_version; | ||
| 797 | d_instantiate(dentry, inode); | 798 | d_instantiate(dentry, inode); |
| 798 | out: | 799 | out: |
| 799 | mutex_unlock(&MSDOS_SB(sb)->s_lock); | 800 | mutex_unlock(&MSDOS_SB(sb)->s_lock); |
| @@ -824,6 +825,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 824 | clear_nlink(inode); | 825 | clear_nlink(inode); |
| 825 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; | 826 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; |
| 826 | fat_detach(inode); | 827 | fat_detach(inode); |
| 828 | dentry->d_time = dir->i_version; | ||
| 827 | out: | 829 | out: |
| 828 | mutex_unlock(&MSDOS_SB(sb)->s_lock); | 830 | mutex_unlock(&MSDOS_SB(sb)->s_lock); |
| 829 | 831 | ||
| @@ -849,6 +851,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) | |||
| 849 | clear_nlink(inode); | 851 | clear_nlink(inode); |
| 850 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; | 852 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; |
| 851 | fat_detach(inode); | 853 | fat_detach(inode); |
| 854 | dentry->d_time = dir->i_version; | ||
| 852 | out: | 855 | out: |
| 853 | mutex_unlock(&MSDOS_SB(sb)->s_lock); | 856 | mutex_unlock(&MSDOS_SB(sb)->s_lock); |
| 854 | 857 | ||
| @@ -889,7 +892,6 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 889 | inode->i_mtime = inode->i_atime = inode->i_ctime = ts; | 892 | inode->i_mtime = inode->i_atime = inode->i_ctime = ts; |
| 890 | /* timestamp is already written, so mark_inode_dirty() is unneeded. */ | 893 | /* timestamp is already written, so mark_inode_dirty() is unneeded. */ |
| 891 | 894 | ||
| 892 | dentry->d_time = dentry->d_parent->d_inode->i_version; | ||
| 893 | d_instantiate(dentry, inode); | 895 | d_instantiate(dentry, inode); |
| 894 | 896 | ||
| 895 | mutex_unlock(&MSDOS_SB(sb)->s_lock); | 897 | mutex_unlock(&MSDOS_SB(sb)->s_lock); |
diff --git a/fs/internal.h b/fs/internal.h index 9477f8f6aefc..757ba2abf21e 100644 --- a/fs/internal.h +++ b/fs/internal.h | |||
| @@ -47,7 +47,6 @@ extern void __init chrdev_init(void); | |||
| 47 | /* | 47 | /* |
| 48 | * namei.c | 48 | * namei.c |
| 49 | */ | 49 | */ |
| 50 | extern int __inode_permission(struct inode *, int); | ||
| 51 | extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); | 50 | extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); |
| 52 | extern int vfs_path_lookup(struct dentry *, struct vfsmount *, | 51 | extern int vfs_path_lookup(struct dentry *, struct vfsmount *, |
| 53 | const char *, unsigned int, struct path *); | 52 | const char *, unsigned int, struct path *); |
| @@ -139,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, | |||
| 139 | extern int rw_verify_area(int, struct file *, const loff_t *, size_t); | 138 | extern int rw_verify_area(int, struct file *, const loff_t *, size_t); |
| 140 | 139 | ||
| 141 | /* | 140 | /* |
| 142 | * splice.c | ||
| 143 | */ | ||
| 144 | extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | ||
| 145 | loff_t *opos, size_t len, unsigned int flags); | ||
| 146 | |||
| 147 | /* | ||
| 148 | * pipe.c | 141 | * pipe.c |
| 149 | */ | 142 | */ |
| 150 | extern const struct file_operations pipefifo_fops; | 143 | extern const struct file_operations pipefifo_fops; |
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 881b3bd0143f..d67a16f2a45d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c | |||
| @@ -29,13 +29,9 @@ | |||
| 29 | #define BEQUIET | 29 | #define BEQUIET |
| 30 | 30 | ||
| 31 | static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); | 31 | static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); |
| 32 | static int isofs_hash(const struct dentry *parent, struct qstr *qstr); | ||
| 33 | static int isofs_dentry_cmpi(const struct dentry *parent, | 32 | static int isofs_dentry_cmpi(const struct dentry *parent, |
| 34 | const struct dentry *dentry, | 33 | const struct dentry *dentry, |
| 35 | unsigned int len, const char *str, const struct qstr *name); | 34 | unsigned int len, const char *str, const struct qstr *name); |
| 36 | static int isofs_dentry_cmp(const struct dentry *parent, | ||
| 37 | const struct dentry *dentry, | ||
| 38 | unsigned int len, const char *str, const struct qstr *name); | ||
| 39 | 35 | ||
| 40 | #ifdef CONFIG_JOLIET | 36 | #ifdef CONFIG_JOLIET |
| 41 | static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); | 37 | static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); |
| @@ -135,10 +131,6 @@ static const struct super_operations isofs_sops = { | |||
| 135 | 131 | ||
| 136 | static const struct dentry_operations isofs_dentry_ops[] = { | 132 | static const struct dentry_operations isofs_dentry_ops[] = { |
| 137 | { | 133 | { |
| 138 | .d_hash = isofs_hash, | ||
| 139 | .d_compare = isofs_dentry_cmp, | ||
| 140 | }, | ||
| 141 | { | ||
| 142 | .d_hash = isofs_hashi, | 134 | .d_hash = isofs_hashi, |
| 143 | .d_compare = isofs_dentry_cmpi, | 135 | .d_compare = isofs_dentry_cmpi, |
| 144 | }, | 136 | }, |
| @@ -182,27 +174,6 @@ struct iso9660_options{ | |||
| 182 | * Compute the hash for the isofs name corresponding to the dentry. | 174 | * Compute the hash for the isofs name corresponding to the dentry. |
| 183 | */ | 175 | */ |
| 184 | static int | 176 | static int |
| 185 | isofs_hash_common(struct qstr *qstr, int ms) | ||
| 186 | { | ||
| 187 | const char *name; | ||
| 188 | int len; | ||
| 189 | |||
| 190 | len = qstr->len; | ||
| 191 | name = qstr->name; | ||
| 192 | if (ms) { | ||
| 193 | while (len && name[len-1] == '.') | ||
| 194 | len--; | ||
| 195 | } | ||
| 196 | |||
| 197 | qstr->hash = full_name_hash(name, len); | ||
| 198 | |||
| 199 | return 0; | ||
| 200 | } | ||
| 201 | |||
| 202 | /* | ||
| 203 | * Compute the hash for the isofs name corresponding to the dentry. | ||
| 204 | */ | ||
| 205 | static int | ||
| 206 | isofs_hashi_common(struct qstr *qstr, int ms) | 177 | isofs_hashi_common(struct qstr *qstr, int ms) |
| 207 | { | 178 | { |
| 208 | const char *name; | 179 | const char *name; |
| @@ -258,32 +229,40 @@ static int isofs_dentry_cmp_common( | |||
| 258 | } | 229 | } |
| 259 | 230 | ||
| 260 | static int | 231 | static int |
| 261 | isofs_hash(const struct dentry *dentry, struct qstr *qstr) | ||
| 262 | { | ||
| 263 | return isofs_hash_common(qstr, 0); | ||
| 264 | } | ||
| 265 | |||
| 266 | static int | ||
| 267 | isofs_hashi(const struct dentry *dentry, struct qstr *qstr) | 232 | isofs_hashi(const struct dentry *dentry, struct qstr *qstr) |
| 268 | { | 233 | { |
| 269 | return isofs_hashi_common(qstr, 0); | 234 | return isofs_hashi_common(qstr, 0); |
| 270 | } | 235 | } |
| 271 | 236 | ||
| 272 | static int | 237 | static int |
| 273 | isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry, | 238 | isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, |
| 274 | unsigned int len, const char *str, const struct qstr *name) | 239 | unsigned int len, const char *str, const struct qstr *name) |
| 275 | { | 240 | { |
| 276 | return isofs_dentry_cmp_common(len, str, name, 0, 0); | 241 | return isofs_dentry_cmp_common(len, str, name, 0, 1); |
| 277 | } | 242 | } |
| 278 | 243 | ||
| 244 | #ifdef CONFIG_JOLIET | ||
| 245 | /* | ||
| 246 | * Compute the hash for the isofs name corresponding to the dentry. | ||
| 247 | */ | ||
| 279 | static int | 248 | static int |
| 280 | isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, | 249 | isofs_hash_common(struct qstr *qstr, int ms) |
| 281 | unsigned int len, const char *str, const struct qstr *name) | ||
| 282 | { | 250 | { |
| 283 | return isofs_dentry_cmp_common(len, str, name, 0, 1); | 251 | const char *name; |
| 252 | int len; | ||
| 253 | |||
| 254 | len = qstr->len; | ||
| 255 | name = qstr->name; | ||
| 256 | if (ms) { | ||
| 257 | while (len && name[len-1] == '.') | ||
| 258 | len--; | ||
| 259 | } | ||
| 260 | |||
| 261 | qstr->hash = full_name_hash(name, len); | ||
| 262 | |||
| 263 | return 0; | ||
| 284 | } | 264 | } |
| 285 | 265 | ||
| 286 | #ifdef CONFIG_JOLIET | ||
| 287 | static int | 266 | static int |
| 288 | isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) | 267 | isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) |
| 289 | { | 268 | { |
| @@ -930,7 +909,8 @@ root_found: | |||
| 930 | if (opt.check == 'r') | 909 | if (opt.check == 'r') |
| 931 | table++; | 910 | table++; |
| 932 | 911 | ||
| 933 | s->s_d_op = &isofs_dentry_ops[table]; | 912 | if (table) |
| 913 | s->s_d_op = &isofs_dentry_ops[table - 1]; | ||
| 934 | 914 | ||
| 935 | /* get the root dentry */ | 915 | /* get the root dentry */ |
| 936 | s->s_root = d_make_root(inode); | 916 | s->s_root = d_make_root(inode); |
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 95295640d9c8..7b543e6b6526 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c | |||
| @@ -18,25 +18,10 @@ static int | |||
| 18 | isofs_cmp(struct dentry *dentry, const char *compare, int dlen) | 18 | isofs_cmp(struct dentry *dentry, const char *compare, int dlen) |
| 19 | { | 19 | { |
| 20 | struct qstr qstr; | 20 | struct qstr qstr; |
| 21 | |||
| 22 | if (!compare) | ||
| 23 | return 1; | ||
| 24 | |||
| 25 | /* check special "." and ".." files */ | ||
| 26 | if (dlen == 1) { | ||
| 27 | /* "." */ | ||
| 28 | if (compare[0] == 0) { | ||
| 29 | if (!dentry->d_name.len) | ||
| 30 | return 0; | ||
| 31 | compare = "."; | ||
| 32 | } else if (compare[0] == 1) { | ||
| 33 | compare = ".."; | ||
| 34 | dlen = 2; | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | qstr.name = compare; | 21 | qstr.name = compare; |
| 39 | qstr.len = dlen; | 22 | qstr.len = dlen; |
| 23 | if (likely(!dentry->d_op)) | ||
| 24 | return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen); | ||
| 40 | return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr); | 25 | return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr); |
| 41 | } | 26 | } |
| 42 | 27 | ||
| @@ -146,7 +131,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, | |||
| 146 | (!(de->flags[-sbi->s_high_sierra] & 1))) && | 131 | (!(de->flags[-sbi->s_high_sierra] & 1))) && |
| 147 | (sbi->s_showassoc || | 132 | (sbi->s_showassoc || |
| 148 | (!(de->flags[-sbi->s_high_sierra] & 4)))) { | 133 | (!(de->flags[-sbi->s_high_sierra] & 4)))) { |
| 149 | match = (isofs_cmp(dentry, dpnt, dlen) == 0); | 134 | if (dpnt && (dlen > 1 || dpnt[0] > 1)) |
| 135 | match = (isofs_cmp(dentry, dpnt, dlen) == 0); | ||
| 150 | } | 136 | } |
| 151 | if (match) { | 137 | if (match) { |
| 152 | isofs_normalize_block_and_offset(de, | 138 | isofs_normalize_block_and_offset(de, |
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 06fe11e0abfa..aab8549591e7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c | |||
| @@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode) | |||
| 886 | goto out_err; | 886 | goto out_err; |
| 887 | } | 887 | } |
| 888 | 888 | ||
| 889 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | 889 | bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); |
| 890 | if (!bh) { | 890 | if (!bh) { |
| 891 | printk(KERN_ERR | 891 | printk(KERN_ERR |
| 892 | "%s: Cannot get buffer for journal superblock\n", | 892 | "%s: Cannot get buffer for journal superblock\n", |
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 8898bbd2b61e..dcead636c33b 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c | |||
| @@ -93,6 +93,7 @@ | |||
| 93 | #include <linux/bio.h> | 93 | #include <linux/bio.h> |
| 94 | #endif | 94 | #endif |
| 95 | #include <linux/log2.h> | 95 | #include <linux/log2.h> |
| 96 | #include <linux/hash.h> | ||
| 96 | 97 | ||
| 97 | static struct kmem_cache *revoke_record_cache; | 98 | static struct kmem_cache *revoke_record_cache; |
| 98 | static struct kmem_cache *revoke_table_cache; | 99 | static struct kmem_cache *revoke_table_cache; |
| @@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int); | |||
| 129 | 130 | ||
| 130 | /* Utility functions to maintain the revoke table */ | 131 | /* Utility functions to maintain the revoke table */ |
| 131 | 132 | ||
| 132 | /* Borrowed from buffer.c: this is a tried and tested block hash function */ | ||
| 133 | static inline int hash(journal_t *journal, unsigned int block) | 133 | static inline int hash(journal_t *journal, unsigned int block) |
| 134 | { | 134 | { |
| 135 | struct jbd_revoke_table_s *table = journal->j_revoke; | 135 | struct jbd_revoke_table_s *table = journal->j_revoke; |
| 136 | int hash_shift = table->hash_shift; | ||
| 137 | 136 | ||
| 138 | return ((block << (hash_shift - 6)) ^ | 137 | return hash_32(block, table->hash_shift); |
| 139 | (block >> 13) ^ | ||
| 140 | (block << (hash_shift - 12))) & (table->hash_size - 1); | ||
| 141 | } | 138 | } |
| 142 | 139 | ||
| 143 | static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, | 140 | static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 7f34f4716165..988b32ed4c87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
| @@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) | |||
| 96 | 96 | ||
| 97 | if (jh->b_transaction == NULL && !buffer_locked(bh) && | 97 | if (jh->b_transaction == NULL && !buffer_locked(bh) && |
| 98 | !buffer_dirty(bh) && !buffer_write_io_error(bh)) { | 98 | !buffer_dirty(bh) && !buffer_write_io_error(bh)) { |
| 99 | /* | ||
| 100 | * Get our reference so that bh cannot be freed before | ||
| 101 | * we unlock it | ||
| 102 | */ | ||
| 103 | get_bh(bh); | ||
| 104 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 99 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
| 105 | ret = __jbd2_journal_remove_checkpoint(jh) + 1; | 100 | ret = __jbd2_journal_remove_checkpoint(jh) + 1; |
| 106 | BUFFER_TRACE(bh, "release"); | ||
| 107 | __brelse(bh); | ||
| 108 | } | 101 | } |
| 109 | return ret; | 102 | return ret; |
| 110 | } | 103 | } |
| @@ -122,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
| 122 | 115 | ||
| 123 | nblocks = jbd2_space_needed(journal); | 116 | nblocks = jbd2_space_needed(journal); |
| 124 | while (jbd2_log_space_left(journal) < nblocks) { | 117 | while (jbd2_log_space_left(journal) < nblocks) { |
| 125 | if (journal->j_flags & JBD2_ABORT) | ||
| 126 | return; | ||
| 127 | write_unlock(&journal->j_state_lock); | 118 | write_unlock(&journal->j_state_lock); |
| 128 | mutex_lock(&journal->j_checkpoint_mutex); | 119 | mutex_lock(&journal->j_checkpoint_mutex); |
| 129 | 120 | ||
| @@ -139,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
| 139 | * trace for forensic evidence. | 130 | * trace for forensic evidence. |
| 140 | */ | 131 | */ |
| 141 | write_lock(&journal->j_state_lock); | 132 | write_lock(&journal->j_state_lock); |
| 133 | if (journal->j_flags & JBD2_ABORT) { | ||
| 134 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
| 135 | return; | ||
| 136 | } | ||
| 142 | spin_lock(&journal->j_list_lock); | 137 | spin_lock(&journal->j_list_lock); |
| 143 | nblocks = jbd2_space_needed(journal); | 138 | nblocks = jbd2_space_needed(journal); |
| 144 | space_left = jbd2_log_space_left(journal); | 139 | space_left = jbd2_log_space_left(journal); |
| @@ -183,58 +178,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
| 183 | } | 178 | } |
| 184 | } | 179 | } |
| 185 | 180 | ||
| 186 | /* | ||
| 187 | * Clean up transaction's list of buffers submitted for io. | ||
| 188 | * We wait for any pending IO to complete and remove any clean | ||
| 189 | * buffers. Note that we take the buffers in the opposite ordering | ||
| 190 | * from the one in which they were submitted for IO. | ||
| 191 | * | ||
| 192 | * Return 0 on success, and return <0 if some buffers have failed | ||
| 193 | * to be written out. | ||
| 194 | * | ||
| 195 | * Called with j_list_lock held. | ||
| 196 | */ | ||
| 197 | static int __wait_cp_io(journal_t *journal, transaction_t *transaction) | ||
| 198 | { | ||
| 199 | struct journal_head *jh; | ||
| 200 | struct buffer_head *bh; | ||
| 201 | tid_t this_tid; | ||
| 202 | int released = 0; | ||
| 203 | int ret = 0; | ||
| 204 | |||
| 205 | this_tid = transaction->t_tid; | ||
| 206 | restart: | ||
| 207 | /* Did somebody clean up the transaction in the meanwhile? */ | ||
| 208 | if (journal->j_checkpoint_transactions != transaction || | ||
| 209 | transaction->t_tid != this_tid) | ||
| 210 | return ret; | ||
| 211 | while (!released && transaction->t_checkpoint_io_list) { | ||
| 212 | jh = transaction->t_checkpoint_io_list; | ||
| 213 | bh = jh2bh(jh); | ||
| 214 | get_bh(bh); | ||
| 215 | if (buffer_locked(bh)) { | ||
| 216 | spin_unlock(&journal->j_list_lock); | ||
| 217 | wait_on_buffer(bh); | ||
| 218 | /* the journal_head may have gone by now */ | ||
| 219 | BUFFER_TRACE(bh, "brelse"); | ||
| 220 | __brelse(bh); | ||
| 221 | spin_lock(&journal->j_list_lock); | ||
| 222 | goto restart; | ||
| 223 | } | ||
| 224 | if (unlikely(buffer_write_io_error(bh))) | ||
| 225 | ret = -EIO; | ||
| 226 | |||
| 227 | /* | ||
| 228 | * Now in whatever state the buffer currently is, we know that | ||
| 229 | * it has been written out and so we can drop it from the list | ||
| 230 | */ | ||
| 231 | released = __jbd2_journal_remove_checkpoint(jh); | ||
| 232 | __brelse(bh); | ||
| 233 | } | ||
| 234 | |||
| 235 | return ret; | ||
| 236 | } | ||
| 237 | |||
| 238 | static void | 181 | static void |
| 239 | __flush_batch(journal_t *journal, int *batch_count) | 182 | __flush_batch(journal_t *journal, int *batch_count) |
| 240 | { | 183 | { |
| @@ -255,81 +198,6 @@ __flush_batch(journal_t *journal, int *batch_count) | |||
| 255 | } | 198 | } |
| 256 | 199 | ||
| 257 | /* | 200 | /* |
| 258 | * Try to flush one buffer from the checkpoint list to disk. | ||
| 259 | * | ||
| 260 | * Return 1 if something happened which requires us to abort the current | ||
| 261 | * scan of the checkpoint list. Return <0 if the buffer has failed to | ||
| 262 | * be written out. | ||
| 263 | * | ||
| 264 | * Called with j_list_lock held and drops it if 1 is returned | ||
| 265 | */ | ||
| 266 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | ||
| 267 | int *batch_count, transaction_t *transaction) | ||
| 268 | { | ||
| 269 | struct buffer_head *bh = jh2bh(jh); | ||
| 270 | int ret = 0; | ||
| 271 | |||
| 272 | if (buffer_locked(bh)) { | ||
| 273 | get_bh(bh); | ||
| 274 | spin_unlock(&journal->j_list_lock); | ||
| 275 | wait_on_buffer(bh); | ||
| 276 | /* the journal_head may have gone by now */ | ||
| 277 | BUFFER_TRACE(bh, "brelse"); | ||
| 278 | __brelse(bh); | ||
| 279 | ret = 1; | ||
| 280 | } else if (jh->b_transaction != NULL) { | ||
| 281 | transaction_t *t = jh->b_transaction; | ||
| 282 | tid_t tid = t->t_tid; | ||
| 283 | |||
| 284 | transaction->t_chp_stats.cs_forced_to_close++; | ||
| 285 | spin_unlock(&journal->j_list_lock); | ||
| 286 | if (unlikely(journal->j_flags & JBD2_UNMOUNT)) | ||
| 287 | /* | ||
| 288 | * The journal thread is dead; so starting and | ||
| 289 | * waiting for a commit to finish will cause | ||
| 290 | * us to wait for a _very_ long time. | ||
| 291 | */ | ||
| 292 | printk(KERN_ERR "JBD2: %s: " | ||
| 293 | "Waiting for Godot: block %llu\n", | ||
| 294 | journal->j_devname, | ||
| 295 | (unsigned long long) bh->b_blocknr); | ||
| 296 | jbd2_log_start_commit(journal, tid); | ||
| 297 | jbd2_log_wait_commit(journal, tid); | ||
| 298 | ret = 1; | ||
| 299 | } else if (!buffer_dirty(bh)) { | ||
| 300 | ret = 1; | ||
| 301 | if (unlikely(buffer_write_io_error(bh))) | ||
| 302 | ret = -EIO; | ||
| 303 | get_bh(bh); | ||
| 304 | BUFFER_TRACE(bh, "remove from checkpoint"); | ||
| 305 | __jbd2_journal_remove_checkpoint(jh); | ||
| 306 | spin_unlock(&journal->j_list_lock); | ||
| 307 | __brelse(bh); | ||
| 308 | } else { | ||
| 309 | /* | ||
| 310 | * Important: we are about to write the buffer, and | ||
| 311 | * possibly block, while still holding the journal lock. | ||
| 312 | * We cannot afford to let the transaction logic start | ||
| 313 | * messing around with this buffer before we write it to | ||
| 314 | * disk, as that would break recoverability. | ||
| 315 | */ | ||
| 316 | BUFFER_TRACE(bh, "queue"); | ||
| 317 | get_bh(bh); | ||
| 318 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | ||
| 319 | journal->j_chkpt_bhs[*batch_count] = bh; | ||
| 320 | __buffer_relink_io(jh); | ||
| 321 | transaction->t_chp_stats.cs_written++; | ||
| 322 | (*batch_count)++; | ||
| 323 | if (*batch_count == JBD2_NR_BATCH) { | ||
| 324 | spin_unlock(&journal->j_list_lock); | ||
| 325 | __flush_batch(journal, batch_count); | ||
| 326 | ret = 1; | ||
| 327 | } | ||
| 328 | } | ||
| 329 | return ret; | ||
| 330 | } | ||
| 331 | |||
| 332 | /* | ||
| 333 | * Perform an actual checkpoint. We take the first transaction on the | 201 | * Perform an actual checkpoint. We take the first transaction on the |
| 334 | * list of transactions to be checkpointed and send all its buffers | 202 | * list of transactions to be checkpointed and send all its buffers |
| 335 | * to disk. We submit larger chunks of data at once. | 203 | * to disk. We submit larger chunks of data at once. |
| @@ -339,9 +207,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
| 339 | */ | 207 | */ |
| 340 | int jbd2_log_do_checkpoint(journal_t *journal) | 208 | int jbd2_log_do_checkpoint(journal_t *journal) |
| 341 | { | 209 | { |
| 342 | transaction_t *transaction; | 210 | struct journal_head *jh; |
| 343 | tid_t this_tid; | 211 | struct buffer_head *bh; |
| 344 | int result; | 212 | transaction_t *transaction; |
| 213 | tid_t this_tid; | ||
| 214 | int result, batch_count = 0; | ||
| 345 | 215 | ||
| 346 | jbd_debug(1, "Start checkpoint\n"); | 216 | jbd_debug(1, "Start checkpoint\n"); |
| 347 | 217 | ||
| @@ -374,45 +244,117 @@ restart: | |||
| 374 | * done (maybe it's a new transaction, but it fell at the same | 244 | * done (maybe it's a new transaction, but it fell at the same |
| 375 | * address). | 245 | * address). |
| 376 | */ | 246 | */ |
| 377 | if (journal->j_checkpoint_transactions == transaction && | 247 | if (journal->j_checkpoint_transactions != transaction || |
| 378 | transaction->t_tid == this_tid) { | 248 | transaction->t_tid != this_tid) |
| 379 | int batch_count = 0; | 249 | goto out; |
| 380 | struct journal_head *jh; | 250 | |
| 381 | int retry = 0, err; | 251 | /* checkpoint all of the transaction's buffers */ |
| 382 | 252 | while (transaction->t_checkpoint_list) { | |
| 383 | while (!retry && transaction->t_checkpoint_list) { | 253 | jh = transaction->t_checkpoint_list; |
| 384 | jh = transaction->t_checkpoint_list; | 254 | bh = jh2bh(jh); |
| 385 | retry = __process_buffer(journal, jh, &batch_count, | 255 | |
| 386 | transaction); | 256 | if (buffer_locked(bh)) { |
| 387 | if (retry < 0 && !result) | 257 | spin_unlock(&journal->j_list_lock); |
| 388 | result = retry; | 258 | get_bh(bh); |
| 389 | if (!retry && (need_resched() || | 259 | wait_on_buffer(bh); |
| 390 | spin_needbreak(&journal->j_list_lock))) { | 260 | /* the journal_head may have gone by now */ |
| 391 | spin_unlock(&journal->j_list_lock); | 261 | BUFFER_TRACE(bh, "brelse"); |
| 392 | retry = 1; | 262 | __brelse(bh); |
| 393 | break; | 263 | goto retry; |
| 394 | } | ||
| 395 | } | 264 | } |
| 265 | if (jh->b_transaction != NULL) { | ||
| 266 | transaction_t *t = jh->b_transaction; | ||
| 267 | tid_t tid = t->t_tid; | ||
| 396 | 268 | ||
| 397 | if (batch_count) { | 269 | transaction->t_chp_stats.cs_forced_to_close++; |
| 398 | if (!retry) { | 270 | spin_unlock(&journal->j_list_lock); |
| 399 | spin_unlock(&journal->j_list_lock); | 271 | if (unlikely(journal->j_flags & JBD2_UNMOUNT)) |
| 400 | retry = 1; | 272 | /* |
| 401 | } | 273 | * The journal thread is dead; so |
| 402 | __flush_batch(journal, &batch_count); | 274 | * starting and waiting for a commit |
| 275 | * to finish will cause us to wait for | ||
| 276 | * a _very_ long time. | ||
| 277 | */ | ||
| 278 | printk(KERN_ERR | ||
| 279 | "JBD2: %s: Waiting for Godot: block %llu\n", | ||
| 280 | journal->j_devname, (unsigned long long) bh->b_blocknr); | ||
| 281 | |||
| 282 | jbd2_log_start_commit(journal, tid); | ||
| 283 | jbd2_log_wait_commit(journal, tid); | ||
| 284 | goto retry; | ||
| 285 | } | ||
| 286 | if (!buffer_dirty(bh)) { | ||
| 287 | if (unlikely(buffer_write_io_error(bh)) && !result) | ||
| 288 | result = -EIO; | ||
| 289 | BUFFER_TRACE(bh, "remove from checkpoint"); | ||
| 290 | if (__jbd2_journal_remove_checkpoint(jh)) | ||
| 291 | /* The transaction was released; we're done */ | ||
| 292 | goto out; | ||
| 293 | continue; | ||
| 403 | } | 294 | } |
| 295 | /* | ||
| 296 | * Important: we are about to write the buffer, and | ||
| 297 | * possibly block, while still holding the journal | ||
| 298 | * lock. We cannot afford to let the transaction | ||
| 299 | * logic start messing around with this buffer before | ||
| 300 | * we write it to disk, as that would break | ||
| 301 | * recoverability. | ||
| 302 | */ | ||
| 303 | BUFFER_TRACE(bh, "queue"); | ||
| 304 | get_bh(bh); | ||
| 305 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | ||
| 306 | journal->j_chkpt_bhs[batch_count++] = bh; | ||
| 307 | __buffer_relink_io(jh); | ||
| 308 | transaction->t_chp_stats.cs_written++; | ||
| 309 | if ((batch_count == JBD2_NR_BATCH) || | ||
| 310 | need_resched() || | ||
| 311 | spin_needbreak(&journal->j_list_lock)) | ||
| 312 | goto unlock_and_flush; | ||
| 313 | } | ||
| 404 | 314 | ||
| 405 | if (retry) { | 315 | if (batch_count) { |
| 316 | unlock_and_flush: | ||
| 317 | spin_unlock(&journal->j_list_lock); | ||
| 318 | retry: | ||
| 319 | if (batch_count) | ||
| 320 | __flush_batch(journal, &batch_count); | ||
| 406 | spin_lock(&journal->j_list_lock); | 321 | spin_lock(&journal->j_list_lock); |
| 407 | goto restart; | 322 | goto restart; |
| 323 | } | ||
| 324 | |||
| 325 | /* | ||
| 326 | * Now we issued all of the transaction's buffers, let's deal | ||
| 327 | * with the buffers that are out for I/O. | ||
| 328 | */ | ||
| 329 | restart2: | ||
| 330 | /* Did somebody clean up the transaction in the meanwhile? */ | ||
| 331 | if (journal->j_checkpoint_transactions != transaction || | ||
| 332 | transaction->t_tid != this_tid) | ||
| 333 | goto out; | ||
| 334 | |||
| 335 | while (transaction->t_checkpoint_io_list) { | ||
| 336 | jh = transaction->t_checkpoint_io_list; | ||
| 337 | bh = jh2bh(jh); | ||
| 338 | if (buffer_locked(bh)) { | ||
| 339 | spin_unlock(&journal->j_list_lock); | ||
| 340 | get_bh(bh); | ||
| 341 | wait_on_buffer(bh); | ||
| 342 | /* the journal_head may have gone by now */ | ||
| 343 | BUFFER_TRACE(bh, "brelse"); | ||
| 344 | __brelse(bh); | ||
| 345 | spin_lock(&journal->j_list_lock); | ||
| 346 | goto restart2; | ||
| 408 | } | 347 | } |
| 348 | if (unlikely(buffer_write_io_error(bh)) && !result) | ||
| 349 | result = -EIO; | ||
| 350 | |||
| 409 | /* | 351 | /* |
| 410 | * Now we have cleaned up the first transaction's checkpoint | 352 | * Now in whatever state the buffer currently is, we |
| 411 | * list. Let's clean up the second one | 353 | * know that it has been written out and so we can |
| 354 | * drop it from the list | ||
| 412 | */ | 355 | */ |
| 413 | err = __wait_cp_io(journal, transaction); | 356 | if (__jbd2_journal_remove_checkpoint(jh)) |
| 414 | if (!result) | 357 | break; |
| 415 | result = err; | ||
| 416 | } | 358 | } |
| 417 | out: | 359 | out: |
| 418 | spin_unlock(&journal->j_list_lock); | 360 | spin_unlock(&journal->j_list_lock); |
| @@ -478,18 +420,16 @@ int jbd2_cleanup_journal_tail(journal_t *journal) | |||
| 478 | * Find all the written-back checkpoint buffers in the given list and | 420 | * Find all the written-back checkpoint buffers in the given list and |
| 479 | * release them. | 421 | * release them. |
| 480 | * | 422 | * |
| 481 | * Called with the journal locked. | ||
| 482 | * Called with j_list_lock held. | 423 | * Called with j_list_lock held. |
| 483 | * Returns number of buffers reaped (for debug) | 424 | * Returns 1 if we freed the transaction, 0 otherwise. |
| 484 | */ | 425 | */ |
| 485 | 426 | static int journal_clean_one_cp_list(struct journal_head *jh) | |
| 486 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | ||
| 487 | { | 427 | { |
| 488 | struct journal_head *last_jh; | 428 | struct journal_head *last_jh; |
| 489 | struct journal_head *next_jh = jh; | 429 | struct journal_head *next_jh = jh; |
| 490 | int ret, freed = 0; | 430 | int ret; |
| 431 | int freed = 0; | ||
| 491 | 432 | ||
| 492 | *released = 0; | ||
| 493 | if (!jh) | 433 | if (!jh) |
| 494 | return 0; | 434 | return 0; |
| 495 | 435 | ||
| @@ -498,13 +438,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | |||
| 498 | jh = next_jh; | 438 | jh = next_jh; |
| 499 | next_jh = jh->b_cpnext; | 439 | next_jh = jh->b_cpnext; |
| 500 | ret = __try_to_free_cp_buf(jh); | 440 | ret = __try_to_free_cp_buf(jh); |
| 501 | if (ret) { | 441 | if (!ret) |
| 502 | freed++; | 442 | return freed; |
| 503 | if (ret == 2) { | 443 | if (ret == 2) |
| 504 | *released = 1; | 444 | return 1; |
| 505 | return freed; | 445 | freed = 1; |
| 506 | } | ||
| 507 | } | ||
| 508 | /* | 446 | /* |
| 509 | * This function only frees up some memory | 447 | * This function only frees up some memory |
| 510 | * if possible so we dont have an obligation | 448 | * if possible so we dont have an obligation |
| @@ -523,49 +461,49 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | |||
| 523 | * | 461 | * |
| 524 | * Find all the written-back checkpoint buffers in the journal and release them. | 462 | * Find all the written-back checkpoint buffers in the journal and release them. |
| 525 | * | 463 | * |
| 526 | * Called with the journal locked. | ||
| 527 | * Called with j_list_lock held. | 464 | * Called with j_list_lock held. |
| 528 | * Returns number of buffers reaped (for debug) | ||
| 529 | */ | 465 | */ |
| 530 | 466 | void __jbd2_journal_clean_checkpoint_list(journal_t *journal) | |
| 531 | int __jbd2_journal_clean_checkpoint_list(journal_t *journal) | ||
| 532 | { | 467 | { |
| 533 | transaction_t *transaction, *last_transaction, *next_transaction; | 468 | transaction_t *transaction, *last_transaction, *next_transaction; |
| 534 | int ret = 0; | 469 | int ret; |
| 535 | int released; | ||
| 536 | 470 | ||
| 537 | transaction = journal->j_checkpoint_transactions; | 471 | transaction = journal->j_checkpoint_transactions; |
| 538 | if (!transaction) | 472 | if (!transaction) |
| 539 | goto out; | 473 | return; |
| 540 | 474 | ||
| 541 | last_transaction = transaction->t_cpprev; | 475 | last_transaction = transaction->t_cpprev; |
| 542 | next_transaction = transaction; | 476 | next_transaction = transaction; |
| 543 | do { | 477 | do { |
| 544 | transaction = next_transaction; | 478 | transaction = next_transaction; |
| 545 | next_transaction = transaction->t_cpnext; | 479 | next_transaction = transaction->t_cpnext; |
| 546 | ret += journal_clean_one_cp_list(transaction-> | 480 | ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); |
| 547 | t_checkpoint_list, &released); | ||
| 548 | /* | 481 | /* |
| 549 | * This function only frees up some memory if possible so we | 482 | * This function only frees up some memory if possible so we |
| 550 | * dont have an obligation to finish processing. Bail out if | 483 | * dont have an obligation to finish processing. Bail out if |
| 551 | * preemption requested: | 484 | * preemption requested: |
| 552 | */ | 485 | */ |
| 553 | if (need_resched()) | 486 | if (need_resched()) |
| 554 | goto out; | 487 | return; |
| 555 | if (released) | 488 | if (ret) |
| 556 | continue; | 489 | continue; |
| 557 | /* | 490 | /* |
| 558 | * It is essential that we are as careful as in the case of | 491 | * It is essential that we are as careful as in the case of |
| 559 | * t_checkpoint_list with removing the buffer from the list as | 492 | * t_checkpoint_list with removing the buffer from the list as |
| 560 | * we can possibly see not yet submitted buffers on io_list | 493 | * we can possibly see not yet submitted buffers on io_list |
| 561 | */ | 494 | */ |
| 562 | ret += journal_clean_one_cp_list(transaction-> | 495 | ret = journal_clean_one_cp_list(transaction-> |
| 563 | t_checkpoint_io_list, &released); | 496 | t_checkpoint_io_list); |
| 564 | if (need_resched()) | 497 | if (need_resched()) |
| 565 | goto out; | 498 | return; |
| 499 | /* | ||
| 500 | * Stop scanning if we couldn't free the transaction. This | ||
| 501 | * avoids pointless scanning of transactions which still | ||
| 502 | * weren't checkpointed. | ||
| 503 | */ | ||
| 504 | if (!ret) | ||
| 505 | return; | ||
| 566 | } while (transaction != last_transaction); | 506 | } while (transaction != last_transaction); |
| 567 | out: | ||
| 568 | return ret; | ||
| 569 | } | 507 | } |
| 570 | 508 | ||
| 571 | /* | 509 | /* |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 19d74d86d99c..1df94fabe4eb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
| @@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) | |||
| 1237 | goto out_err; | 1237 | goto out_err; |
| 1238 | } | 1238 | } |
| 1239 | 1239 | ||
| 1240 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | 1240 | bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); |
| 1241 | if (!bh) { | 1241 | if (!bh) { |
| 1242 | printk(KERN_ERR | 1242 | printk(KERN_ERR |
| 1243 | "%s: Cannot get buffer for journal superblock\n", | 1243 | "%s: Cannot get buffer for journal superblock\n", |
| @@ -1522,14 +1522,6 @@ static int journal_get_superblock(journal_t *journal) | |||
| 1522 | goto out; | 1522 | goto out; |
| 1523 | } | 1523 | } |
| 1524 | 1524 | ||
| 1525 | if (jbd2_journal_has_csum_v2or3(journal) && | ||
| 1526 | JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
| 1527 | /* Can't have checksum v1 and v2 on at the same time! */ | ||
| 1528 | printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " | ||
| 1529 | "at the same time!\n"); | ||
| 1530 | goto out; | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && | 1525 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && |
| 1534 | JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { | 1526 | JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { |
| 1535 | /* Can't have checksum v2 and v3 at the same time! */ | 1527 | /* Can't have checksum v2 and v3 at the same time! */ |
| @@ -1538,6 +1530,14 @@ static int journal_get_superblock(journal_t *journal) | |||
| 1538 | goto out; | 1530 | goto out; |
| 1539 | } | 1531 | } |
| 1540 | 1532 | ||
| 1533 | if (jbd2_journal_has_csum_v2or3(journal) && | ||
| 1534 | JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
| 1535 | /* Can't have checksum v1 and v2 on at the same time! */ | ||
| 1536 | printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " | ||
| 1537 | "at the same time!\n"); | ||
| 1538 | goto out; | ||
| 1539 | } | ||
| 1540 | |||
| 1541 | if (!jbd2_verify_csum_type(journal, sb)) { | 1541 | if (!jbd2_verify_csum_type(journal, sb)) { |
| 1542 | printk(KERN_ERR "JBD2: Unknown checksum type\n"); | 1542 | printk(KERN_ERR "JBD2: Unknown checksum type\n"); |
| 1543 | goto out; | 1543 | goto out; |
| @@ -1853,13 +1853,12 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
| 1853 | journal->j_chksum_driver = NULL; | 1853 | journal->j_chksum_driver = NULL; |
| 1854 | return 0; | 1854 | return 0; |
| 1855 | } | 1855 | } |
| 1856 | } | ||
| 1857 | 1856 | ||
| 1858 | /* Precompute checksum seed for all metadata */ | 1857 | /* Precompute checksum seed for all metadata */ |
| 1859 | if (jbd2_journal_has_csum_v2or3(journal)) | ||
| 1860 | journal->j_csum_seed = jbd2_chksum(journal, ~0, | 1858 | journal->j_csum_seed = jbd2_chksum(journal, ~0, |
| 1861 | sb->s_uuid, | 1859 | sb->s_uuid, |
| 1862 | sizeof(sb->s_uuid)); | 1860 | sizeof(sb->s_uuid)); |
| 1861 | } | ||
| 1863 | } | 1862 | } |
| 1864 | 1863 | ||
| 1865 | /* If enabling v1 checksums, downgrade superblock */ | 1864 | /* If enabling v1 checksums, downgrade superblock */ |
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9b329b55ffe3..bcbef08a4d8f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
| @@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal, | |||
| 525 | !jbd2_descr_block_csum_verify(journal, | 525 | !jbd2_descr_block_csum_verify(journal, |
| 526 | bh->b_data)) { | 526 | bh->b_data)) { |
| 527 | err = -EIO; | 527 | err = -EIO; |
| 528 | brelse(bh); | ||
| 528 | goto failed; | 529 | goto failed; |
| 529 | } | 530 | } |
| 530 | 531 | ||
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index d5e95a175c92..c6cbaef2bda1 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
| @@ -92,6 +92,7 @@ | |||
| 92 | #include <linux/init.h> | 92 | #include <linux/init.h> |
| 93 | #include <linux/bio.h> | 93 | #include <linux/bio.h> |
| 94 | #include <linux/log2.h> | 94 | #include <linux/log2.h> |
| 95 | #include <linux/hash.h> | ||
| 95 | #endif | 96 | #endif |
| 96 | 97 | ||
| 97 | static struct kmem_cache *jbd2_revoke_record_cache; | 98 | static struct kmem_cache *jbd2_revoke_record_cache; |
| @@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int); | |||
| 130 | 131 | ||
| 131 | /* Utility functions to maintain the revoke table */ | 132 | /* Utility functions to maintain the revoke table */ |
| 132 | 133 | ||
| 133 | /* Borrowed from buffer.c: this is a tried and tested block hash function */ | ||
| 134 | static inline int hash(journal_t *journal, unsigned long long block) | 134 | static inline int hash(journal_t *journal, unsigned long long block) |
| 135 | { | 135 | { |
| 136 | struct jbd2_revoke_table_s *table = journal->j_revoke; | 136 | return hash_64(block, journal->j_revoke->hash_shift); |
| 137 | int hash_shift = table->hash_shift; | ||
| 138 | int hash = (int)block ^ (int)((block >> 31) >> 1); | ||
| 139 | |||
| 140 | return ((hash << (hash_shift - 6)) ^ | ||
| 141 | (hash >> 13) ^ | ||
| 142 | (hash << (hash_shift - 12))) & (table->hash_size - 1); | ||
| 143 | } | 137 | } |
| 144 | 138 | ||
| 145 | static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, | 139 | static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, |
diff --git a/fs/namei.c b/fs/namei.c index 43927d14db67..db5fe86319e6 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask) | |||
| 416 | 416 | ||
| 417 | return security_inode_permission(inode, mask); | 417 | return security_inode_permission(inode, mask); |
| 418 | } | 418 | } |
| 419 | EXPORT_SYMBOL(__inode_permission); | ||
| 419 | 420 | ||
| 420 | /** | 421 | /** |
| 421 | * sb_permission - Check superblock-level permissions | 422 | * sb_permission - Check superblock-level permissions |
| @@ -2383,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, | |||
| 2383 | } | 2384 | } |
| 2384 | EXPORT_SYMBOL(kern_path_mountpoint); | 2385 | EXPORT_SYMBOL(kern_path_mountpoint); |
| 2385 | 2386 | ||
| 2386 | /* | 2387 | int __check_sticky(struct inode *dir, struct inode *inode) |
| 2387 | * It's inline, so penalty for filesystems that don't use sticky bit is | ||
| 2388 | * minimal. | ||
| 2389 | */ | ||
| 2390 | static inline int check_sticky(struct inode *dir, struct inode *inode) | ||
| 2391 | { | 2388 | { |
| 2392 | kuid_t fsuid = current_fsuid(); | 2389 | kuid_t fsuid = current_fsuid(); |
| 2393 | 2390 | ||
| 2394 | if (!(dir->i_mode & S_ISVTX)) | ||
| 2395 | return 0; | ||
| 2396 | if (uid_eq(inode->i_uid, fsuid)) | 2391 | if (uid_eq(inode->i_uid, fsuid)) |
| 2397 | return 0; | 2392 | return 0; |
| 2398 | if (uid_eq(dir->i_uid, fsuid)) | 2393 | if (uid_eq(dir->i_uid, fsuid)) |
| 2399 | return 0; | 2394 | return 0; |
| 2400 | return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); | 2395 | return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); |
| 2401 | } | 2396 | } |
| 2397 | EXPORT_SYMBOL(__check_sticky); | ||
| 2402 | 2398 | ||
| 2403 | /* | 2399 | /* |
| 2404 | * Check whether we can remove a link victim from directory dir, check | 2400 | * Check whether we can remove a link victim from directory dir, check |
| @@ -2501,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) | |||
| 2501 | } | 2497 | } |
| 2502 | 2498 | ||
| 2503 | mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); | 2499 | mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); |
| 2504 | mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); | 2500 | mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); |
| 2505 | return NULL; | 2501 | return NULL; |
| 2506 | } | 2502 | } |
| 2507 | EXPORT_SYMBOL(lock_rename); | 2503 | EXPORT_SYMBOL(lock_rename); |
| @@ -3064,9 +3060,12 @@ finish_open_created: | |||
| 3064 | error = may_open(&nd->path, acc_mode, open_flag); | 3060 | error = may_open(&nd->path, acc_mode, open_flag); |
| 3065 | if (error) | 3061 | if (error) |
| 3066 | goto out; | 3062 | goto out; |
| 3067 | file->f_path.mnt = nd->path.mnt; | 3063 | |
| 3068 | error = finish_open(file, nd->path.dentry, NULL, opened); | 3064 | BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ |
| 3069 | if (error) { | 3065 | error = vfs_open(&nd->path, file, current_cred()); |
| 3066 | if (!error) { | ||
| 3067 | *opened |= FILE_OPENED; | ||
| 3068 | } else { | ||
| 3070 | if (error == -EOPENSTALE) | 3069 | if (error == -EOPENSTALE) |
| 3071 | goto stale_open; | 3070 | goto stale_open; |
| 3072 | goto out; | 3071 | goto out; |
| @@ -3155,7 +3154,8 @@ static int do_tmpfile(int dfd, struct filename *pathname, | |||
| 3155 | if (error) | 3154 | if (error) |
| 3156 | goto out2; | 3155 | goto out2; |
| 3157 | audit_inode(pathname, nd->path.dentry, 0); | 3156 | audit_inode(pathname, nd->path.dentry, 0); |
| 3158 | error = may_open(&nd->path, op->acc_mode, op->open_flag); | 3157 | /* Don't check for other permissions, the inode was just created */ |
| 3158 | error = may_open(&nd->path, MAY_OPEN, op->open_flag); | ||
| 3159 | if (error) | 3159 | if (error) |
| 3160 | goto out2; | 3160 | goto out2; |
| 3161 | file->f_path.mnt = nd->path.mnt; | 3161 | file->f_path.mnt = nd->path.mnt; |
| @@ -4210,12 +4210,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, | |||
| 4210 | bool should_retry = false; | 4210 | bool should_retry = false; |
| 4211 | int error; | 4211 | int error; |
| 4212 | 4212 | ||
| 4213 | if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) | 4213 | if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) |
| 4214 | return -EINVAL; | 4214 | return -EINVAL; |
| 4215 | 4215 | ||
| 4216 | if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) | 4216 | if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && |
| 4217 | (flags & RENAME_EXCHANGE)) | ||
| 4217 | return -EINVAL; | 4218 | return -EINVAL; |
| 4218 | 4219 | ||
| 4220 | if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) | ||
| 4221 | return -EPERM; | ||
| 4222 | |||
| 4219 | retry: | 4223 | retry: |
| 4220 | from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); | 4224 | from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); |
| 4221 | if (IS_ERR(from)) { | 4225 | if (IS_ERR(from)) { |
| @@ -4347,6 +4351,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna | |||
| 4347 | return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); | 4351 | return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); |
| 4348 | } | 4352 | } |
| 4349 | 4353 | ||
| 4354 | int vfs_whiteout(struct inode *dir, struct dentry *dentry) | ||
| 4355 | { | ||
| 4356 | int error = may_create(dir, dentry); | ||
| 4357 | if (error) | ||
| 4358 | return error; | ||
| 4359 | |||
| 4360 | if (!dir->i_op->mknod) | ||
| 4361 | return -EPERM; | ||
| 4362 | |||
| 4363 | return dir->i_op->mknod(dir, dentry, | ||
| 4364 | S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); | ||
| 4365 | } | ||
| 4366 | EXPORT_SYMBOL(vfs_whiteout); | ||
| 4367 | |||
| 4350 | int readlink_copy(char __user *buffer, int buflen, const char *link) | 4368 | int readlink_copy(char __user *buffer, int buflen, const char *link) |
| 4351 | { | 4369 | { |
| 4352 | int len = PTR_ERR(link); | 4370 | int len = PTR_ERR(link); |
diff --git a/fs/namespace.c b/fs/namespace.c index fbba8b17330d..5b66b2b3624d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
| @@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) | |||
| 1686 | namespace_unlock(); | 1686 | namespace_unlock(); |
| 1687 | } | 1687 | } |
| 1688 | 1688 | ||
| 1689 | /** | ||
| 1690 | * clone_private_mount - create a private clone of a path | ||
| 1691 | * | ||
| 1692 | * This creates a new vfsmount, which will be the clone of @path. The new will | ||
| 1693 | * not be attached anywhere in the namespace and will be private (i.e. changes | ||
| 1694 | * to the originating mount won't be propagated into this). | ||
| 1695 | * | ||
| 1696 | * Release with mntput(). | ||
| 1697 | */ | ||
| 1698 | struct vfsmount *clone_private_mount(struct path *path) | ||
| 1699 | { | ||
| 1700 | struct mount *old_mnt = real_mount(path->mnt); | ||
| 1701 | struct mount *new_mnt; | ||
| 1702 | |||
| 1703 | if (IS_MNT_UNBINDABLE(old_mnt)) | ||
| 1704 | return ERR_PTR(-EINVAL); | ||
| 1705 | |||
| 1706 | down_read(&namespace_sem); | ||
| 1707 | new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); | ||
| 1708 | up_read(&namespace_sem); | ||
| 1709 | if (IS_ERR(new_mnt)) | ||
| 1710 | return ERR_CAST(new_mnt); | ||
| 1711 | |||
| 1712 | return &new_mnt->mnt; | ||
| 1713 | } | ||
| 1714 | EXPORT_SYMBOL_GPL(clone_private_mount); | ||
| 1715 | |||
| 1689 | int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, | 1716 | int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, |
| 1690 | struct vfsmount *root) | 1717 | struct vfsmount *root) |
| 1691 | { | 1718 | { |
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 5228f201d3d5..4f46f7a05289 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c | |||
| @@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) | |||
| 378 | loff_t offset = header->args.offset; | 378 | loff_t offset = header->args.offset; |
| 379 | size_t count = header->args.count; | 379 | size_t count = header->args.count; |
| 380 | struct page **pages = header->args.pages; | 380 | struct page **pages = header->args.pages; |
| 381 | int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; | 381 | int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; |
| 382 | unsigned int pg_len; | 382 | unsigned int pg_len; |
| 383 | struct blk_plug plug; | 383 | struct blk_plug plug; |
| 384 | int i; | 384 | int i; |
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index e966c023b1b7..acbf9ca4018c 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c | |||
| @@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, | |||
| 65 | 65 | ||
| 66 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | 66 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); |
| 67 | 67 | ||
| 68 | mutex_lock(&nn->bl_mutex); | ||
| 68 | bl_pipe_msg.bl_wq = &nn->bl_wq; | 69 | bl_pipe_msg.bl_wq = &nn->bl_wq; |
| 69 | 70 | ||
| 70 | b->simple.len += 4; /* single volume */ | 71 | b->simple.len += 4; /* single volume */ |
| 71 | if (b->simple.len > PAGE_SIZE) | 72 | if (b->simple.len > PAGE_SIZE) |
| 72 | return -EIO; | 73 | goto out_unlock; |
| 73 | 74 | ||
| 74 | memset(msg, 0, sizeof(*msg)); | 75 | memset(msg, 0, sizeof(*msg)); |
| 75 | msg->len = sizeof(*bl_msg) + b->simple.len; | 76 | msg->len = sizeof(*bl_msg) + b->simple.len; |
| 76 | msg->data = kzalloc(msg->len, gfp_mask); | 77 | msg->data = kzalloc(msg->len, gfp_mask); |
| 77 | if (!msg->data) | 78 | if (!msg->data) |
| 78 | goto out; | 79 | goto out_free_data; |
| 79 | 80 | ||
| 80 | bl_msg = msg->data; | 81 | bl_msg = msg->data; |
| 81 | bl_msg->type = BL_DEVICE_MOUNT, | 82 | bl_msg->type = BL_DEVICE_MOUNT, |
| @@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, | |||
| 87 | rc = rpc_queue_upcall(nn->bl_device_pipe, msg); | 88 | rc = rpc_queue_upcall(nn->bl_device_pipe, msg); |
| 88 | if (rc < 0) { | 89 | if (rc < 0) { |
| 89 | remove_wait_queue(&nn->bl_wq, &wq); | 90 | remove_wait_queue(&nn->bl_wq, &wq); |
| 90 | goto out; | 91 | goto out_free_data; |
| 91 | } | 92 | } |
| 92 | 93 | ||
| 93 | set_current_state(TASK_UNINTERRUPTIBLE); | 94 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -97,12 +98,14 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, | |||
| 97 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | 98 | if (reply->status != BL_DEVICE_REQUEST_PROC) { |
| 98 | printk(KERN_WARNING "%s failed to decode device: %d\n", | 99 | printk(KERN_WARNING "%s failed to decode device: %d\n", |
| 99 | __func__, reply->status); | 100 | __func__, reply->status); |
| 100 | goto out; | 101 | goto out_free_data; |
| 101 | } | 102 | } |
| 102 | 103 | ||
| 103 | dev = MKDEV(reply->major, reply->minor); | 104 | dev = MKDEV(reply->major, reply->minor); |
| 104 | out: | 105 | out_free_data: |
| 105 | kfree(msg->data); | 106 | kfree(msg->data); |
| 107 | out_unlock: | ||
| 108 | mutex_unlock(&nn->bl_mutex); | ||
| 106 | return dev; | 109 | return dev; |
| 107 | } | 110 | } |
| 108 | 111 | ||
| @@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net) | |||
| 232 | struct nfs_net *nn = net_generic(net, nfs_net_id); | 235 | struct nfs_net *nn = net_generic(net, nfs_net_id); |
| 233 | struct dentry *dentry; | 236 | struct dentry *dentry; |
| 234 | 237 | ||
| 238 | mutex_init(&nn->bl_mutex); | ||
| 235 | init_waitqueue_head(&nn->bl_wq); | 239 | init_waitqueue_head(&nn->bl_wq); |
| 236 | nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); | 240 | nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); |
| 237 | if (IS_ERR(nn->bl_device_pipe)) | 241 | if (IS_ERR(nn->bl_device_pipe)) |
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5853f53db732..7f3f60641344 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c | |||
| @@ -125,6 +125,8 @@ again: | |||
| 125 | continue; | 125 | continue; |
| 126 | if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) | 126 | if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) |
| 127 | continue; | 127 | continue; |
| 128 | if (!nfs4_valid_open_stateid(state)) | ||
| 129 | continue; | ||
| 128 | if (!nfs4_stateid_match(&state->stateid, stateid)) | 130 | if (!nfs4_stateid_match(&state->stateid, stateid)) |
| 129 | continue; | 131 | continue; |
| 130 | get_nfs_open_context(ctx); | 132 | get_nfs_open_context(ctx); |
| @@ -193,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * | |||
| 193 | { | 195 | { |
| 194 | int res = 0; | 196 | int res = 0; |
| 195 | 197 | ||
| 196 | res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); | 198 | if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) |
| 199 | res = nfs4_proc_delegreturn(inode, | ||
| 200 | delegation->cred, | ||
| 201 | &delegation->stateid, | ||
| 202 | issync); | ||
| 197 | nfs_free_delegation(delegation); | 203 | nfs_free_delegation(delegation); |
| 198 | return res; | 204 | return res; |
| 199 | } | 205 | } |
| @@ -380,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation | |||
| 380 | { | 386 | { |
| 381 | struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; | 387 | struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; |
| 382 | struct nfs_inode *nfsi = NFS_I(inode); | 388 | struct nfs_inode *nfsi = NFS_I(inode); |
| 383 | int err; | 389 | int err = 0; |
| 384 | 390 | ||
| 385 | if (delegation == NULL) | 391 | if (delegation == NULL) |
| 386 | return 0; | 392 | return 0; |
| 387 | do { | 393 | do { |
| 394 | if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) | ||
| 395 | break; | ||
| 388 | err = nfs_delegation_claim_opens(inode, &delegation->stateid); | 396 | err = nfs_delegation_claim_opens(inode, &delegation->stateid); |
| 389 | if (!issync || err != -EAGAIN) | 397 | if (!issync || err != -EAGAIN) |
| 390 | break; | 398 | break; |
| @@ -605,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl | |||
| 605 | rcu_read_unlock(); | 613 | rcu_read_unlock(); |
| 606 | } | 614 | } |
| 607 | 615 | ||
| 616 | static void nfs_revoke_delegation(struct inode *inode) | ||
| 617 | { | ||
| 618 | struct nfs_delegation *delegation; | ||
| 619 | rcu_read_lock(); | ||
| 620 | delegation = rcu_dereference(NFS_I(inode)->delegation); | ||
| 621 | if (delegation != NULL) { | ||
| 622 | set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); | ||
| 623 | nfs_mark_return_delegation(NFS_SERVER(inode), delegation); | ||
| 624 | } | ||
| 625 | rcu_read_unlock(); | ||
| 626 | } | ||
| 627 | |||
| 608 | void nfs_remove_bad_delegation(struct inode *inode) | 628 | void nfs_remove_bad_delegation(struct inode *inode) |
| 609 | { | 629 | { |
| 610 | struct nfs_delegation *delegation; | 630 | struct nfs_delegation *delegation; |
| 611 | 631 | ||
| 632 | nfs_revoke_delegation(inode); | ||
| 612 | delegation = nfs_inode_detach_delegation(inode); | 633 | delegation = nfs_inode_detach_delegation(inode); |
| 613 | if (delegation) { | 634 | if (delegation) { |
| 614 | nfs_inode_find_state_and_recover(inode, &delegation->stateid); | 635 | nfs_inode_find_state_and_recover(inode, &delegation->stateid); |
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 5c1cce39297f..e3c20a3ccc93 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h | |||
| @@ -31,6 +31,7 @@ enum { | |||
| 31 | NFS_DELEGATION_RETURN_IF_CLOSED, | 31 | NFS_DELEGATION_RETURN_IF_CLOSED, |
| 32 | NFS_DELEGATION_REFERENCED, | 32 | NFS_DELEGATION_REFERENCED, |
| 33 | NFS_DELEGATION_RETURNING, | 33 | NFS_DELEGATION_RETURNING, |
| 34 | NFS_DELEGATION_REVOKED, | ||
| 34 | }; | 35 | }; |
| 35 | 36 | ||
| 36 | int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); | 37 | int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 06e8cfcbb670..6e62155abf26 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
| @@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, | |||
| 1527 | case -ENOENT: | 1527 | case -ENOENT: |
| 1528 | d_drop(dentry); | 1528 | d_drop(dentry); |
| 1529 | d_add(dentry, NULL); | 1529 | d_add(dentry, NULL); |
| 1530 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); | ||
| 1530 | break; | 1531 | break; |
| 1531 | case -EISDIR: | 1532 | case -EISDIR: |
| 1532 | case -ENOTDIR: | 1533 | case -ENOTDIR: |
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 20cffc830468..10bf07280f4a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c | |||
| @@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref) | |||
| 266 | { | 266 | { |
| 267 | struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); | 267 | struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); |
| 268 | 268 | ||
| 269 | nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); | ||
| 269 | if (dreq->l_ctx != NULL) | 270 | if (dreq->l_ctx != NULL) |
| 270 | nfs_put_lock_context(dreq->l_ctx); | 271 | nfs_put_lock_context(dreq->l_ctx); |
| 271 | if (dreq->ctx != NULL) | 272 | if (dreq->ctx != NULL) |
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 46fab1cb455a..7afb52f6a25a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c | |||
| @@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, | |||
| 145 | case -NFS4ERR_DELEG_REVOKED: | 145 | case -NFS4ERR_DELEG_REVOKED: |
| 146 | case -NFS4ERR_ADMIN_REVOKED: | 146 | case -NFS4ERR_ADMIN_REVOKED: |
| 147 | case -NFS4ERR_BAD_STATEID: | 147 | case -NFS4ERR_BAD_STATEID: |
| 148 | if (state == NULL) | ||
| 149 | break; | ||
| 150 | nfs_remove_bad_delegation(state->inode); | ||
| 151 | case -NFS4ERR_OPENMODE: | 148 | case -NFS4ERR_OPENMODE: |
| 152 | if (state == NULL) | 149 | if (state == NULL) |
| 153 | break; | 150 | break; |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6388a59f2add..00689a8a85e4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
| @@ -626,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) | |||
| 626 | { | 626 | { |
| 627 | struct inode *inode = dentry->d_inode; | 627 | struct inode *inode = dentry->d_inode; |
| 628 | int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; | 628 | int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; |
| 629 | int err; | 629 | int err = 0; |
| 630 | 630 | ||
| 631 | trace_nfs_getattr_enter(inode); | 631 | trace_nfs_getattr_enter(inode); |
| 632 | /* Flush out writes to the server in order to update c/mtime. */ | 632 | /* Flush out writes to the server in order to update c/mtime. */ |
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index ef221fb8a183..f0e06e4acbef 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h | |||
| @@ -19,6 +19,7 @@ struct nfs_net { | |||
| 19 | struct rpc_pipe *bl_device_pipe; | 19 | struct rpc_pipe *bl_device_pipe; |
| 20 | struct bl_dev_msg bl_mount_reply; | 20 | struct bl_dev_msg bl_mount_reply; |
| 21 | wait_queue_head_t bl_wq; | 21 | wait_queue_head_t bl_wq; |
| 22 | struct mutex bl_mutex; | ||
| 22 | struct list_head nfs_client_list; | 23 | struct list_head nfs_client_list; |
| 23 | struct list_head nfs_volume_list; | 24 | struct list_head nfs_volume_list; |
| 24 | #if IS_ENABLED(CONFIG_NFS_V4) | 25 | #if IS_ENABLED(CONFIG_NFS_V4) |
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 405bd95c1f58..69dc20a743f9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
| @@ -370,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc | |||
| 370 | case -NFS4ERR_DELEG_REVOKED: | 370 | case -NFS4ERR_DELEG_REVOKED: |
| 371 | case -NFS4ERR_ADMIN_REVOKED: | 371 | case -NFS4ERR_ADMIN_REVOKED: |
| 372 | case -NFS4ERR_BAD_STATEID: | 372 | case -NFS4ERR_BAD_STATEID: |
| 373 | if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) { | ||
| 374 | nfs_remove_bad_delegation(inode); | ||
| 375 | exception->retry = 1; | ||
| 376 | break; | ||
| 377 | } | ||
| 378 | if (state == NULL) | 373 | if (state == NULL) |
| 379 | break; | 374 | break; |
| 380 | ret = nfs4_schedule_stateid_recovery(server, state); | 375 | ret = nfs4_schedule_stateid_recovery(server, state); |
| @@ -1654,7 +1649,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct | |||
| 1654 | nfs_inode_find_state_and_recover(state->inode, | 1649 | nfs_inode_find_state_and_recover(state->inode, |
| 1655 | stateid); | 1650 | stateid); |
| 1656 | nfs4_schedule_stateid_recovery(server, state); | 1651 | nfs4_schedule_stateid_recovery(server, state); |
| 1657 | return 0; | 1652 | return -EAGAIN; |
| 1658 | case -NFS4ERR_DELAY: | 1653 | case -NFS4ERR_DELAY: |
| 1659 | case -NFS4ERR_GRACE: | 1654 | case -NFS4ERR_GRACE: |
| 1660 | set_bit(NFS_DELEGATED_STATE, &state->flags); | 1655 | set_bit(NFS_DELEGATED_STATE, &state->flags); |
| @@ -2109,46 +2104,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta | |||
| 2109 | return ret; | 2104 | return ret; |
| 2110 | } | 2105 | } |
| 2111 | 2106 | ||
| 2107 | static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) | ||
| 2108 | { | ||
| 2109 | nfs_remove_bad_delegation(state->inode); | ||
| 2110 | write_seqlock(&state->seqlock); | ||
| 2111 | nfs4_stateid_copy(&state->stateid, &state->open_stateid); | ||
| 2112 | write_sequnlock(&state->seqlock); | ||
| 2113 | clear_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 2114 | } | ||
| 2115 | |||
| 2116 | static void nfs40_clear_delegation_stateid(struct nfs4_state *state) | ||
| 2117 | { | ||
| 2118 | if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) | ||
| 2119 | nfs_finish_clear_delegation_stateid(state); | ||
| 2120 | } | ||
| 2121 | |||
| 2122 | static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) | ||
| 2123 | { | ||
| 2124 | /* NFSv4.0 doesn't allow for delegation recovery on open expire */ | ||
| 2125 | nfs40_clear_delegation_stateid(state); | ||
| 2126 | return nfs4_open_expired(sp, state); | ||
| 2127 | } | ||
| 2128 | |||
| 2112 | #if defined(CONFIG_NFS_V4_1) | 2129 | #if defined(CONFIG_NFS_V4_1) |
| 2113 | static void nfs41_clear_delegation_stateid(struct nfs4_state *state) | 2130 | static void nfs41_check_delegation_stateid(struct nfs4_state *state) |
| 2114 | { | 2131 | { |
| 2115 | struct nfs_server *server = NFS_SERVER(state->inode); | 2132 | struct nfs_server *server = NFS_SERVER(state->inode); |
| 2116 | nfs4_stateid *stateid = &state->stateid; | 2133 | nfs4_stateid stateid; |
| 2117 | struct nfs_delegation *delegation; | 2134 | struct nfs_delegation *delegation; |
| 2118 | struct rpc_cred *cred = NULL; | 2135 | struct rpc_cred *cred; |
| 2119 | int status = -NFS4ERR_BAD_STATEID; | 2136 | int status; |
| 2120 | |||
| 2121 | /* If a state reset has been done, test_stateid is unneeded */ | ||
| 2122 | if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) | ||
| 2123 | return; | ||
| 2124 | 2137 | ||
| 2125 | /* Get the delegation credential for use by test/free_stateid */ | 2138 | /* Get the delegation credential for use by test/free_stateid */ |
| 2126 | rcu_read_lock(); | 2139 | rcu_read_lock(); |
| 2127 | delegation = rcu_dereference(NFS_I(state->inode)->delegation); | 2140 | delegation = rcu_dereference(NFS_I(state->inode)->delegation); |
| 2128 | if (delegation != NULL && | 2141 | if (delegation == NULL) { |
| 2129 | nfs4_stateid_match(&delegation->stateid, stateid)) { | ||
| 2130 | cred = get_rpccred(delegation->cred); | ||
| 2131 | rcu_read_unlock(); | ||
| 2132 | status = nfs41_test_stateid(server, stateid, cred); | ||
| 2133 | trace_nfs4_test_delegation_stateid(state, NULL, status); | ||
| 2134 | } else | ||
| 2135 | rcu_read_unlock(); | 2142 | rcu_read_unlock(); |
| 2143 | return; | ||
| 2144 | } | ||
| 2145 | |||
| 2146 | nfs4_stateid_copy(&stateid, &delegation->stateid); | ||
| 2147 | cred = get_rpccred(delegation->cred); | ||
| 2148 | rcu_read_unlock(); | ||
| 2149 | status = nfs41_test_stateid(server, &stateid, cred); | ||
| 2150 | trace_nfs4_test_delegation_stateid(state, NULL, status); | ||
| 2136 | 2151 | ||
| 2137 | if (status != NFS_OK) { | 2152 | if (status != NFS_OK) { |
| 2138 | /* Free the stateid unless the server explicitly | 2153 | /* Free the stateid unless the server explicitly |
| 2139 | * informs us the stateid is unrecognized. */ | 2154 | * informs us the stateid is unrecognized. */ |
| 2140 | if (status != -NFS4ERR_BAD_STATEID) | 2155 | if (status != -NFS4ERR_BAD_STATEID) |
| 2141 | nfs41_free_stateid(server, stateid, cred); | 2156 | nfs41_free_stateid(server, &stateid, cred); |
| 2142 | nfs_remove_bad_delegation(state->inode); | 2157 | nfs_finish_clear_delegation_stateid(state); |
| 2143 | |||
| 2144 | write_seqlock(&state->seqlock); | ||
| 2145 | nfs4_stateid_copy(&state->stateid, &state->open_stateid); | ||
| 2146 | write_sequnlock(&state->seqlock); | ||
| 2147 | clear_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 2148 | } | 2158 | } |
| 2149 | 2159 | ||
| 2150 | if (cred != NULL) | 2160 | put_rpccred(cred); |
| 2151 | put_rpccred(cred); | ||
| 2152 | } | 2161 | } |
| 2153 | 2162 | ||
| 2154 | /** | 2163 | /** |
| @@ -2192,7 +2201,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st | |||
| 2192 | { | 2201 | { |
| 2193 | int status; | 2202 | int status; |
| 2194 | 2203 | ||
| 2195 | nfs41_clear_delegation_stateid(state); | 2204 | nfs41_check_delegation_stateid(state); |
| 2196 | status = nfs41_check_open_stateid(state); | 2205 | status = nfs41_check_open_stateid(state); |
| 2197 | if (status != NFS_OK) | 2206 | if (status != NFS_OK) |
| 2198 | status = nfs4_open_expired(sp, state); | 2207 | status = nfs4_open_expired(sp, state); |
| @@ -2231,19 +2240,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, | |||
| 2231 | seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); | 2240 | seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); |
| 2232 | 2241 | ||
| 2233 | ret = _nfs4_proc_open(opendata); | 2242 | ret = _nfs4_proc_open(opendata); |
| 2234 | if (ret != 0) { | 2243 | if (ret != 0) |
| 2235 | if (ret == -ENOENT) { | ||
| 2236 | dentry = opendata->dentry; | ||
| 2237 | if (dentry->d_inode) | ||
| 2238 | d_delete(dentry); | ||
| 2239 | else if (d_unhashed(dentry)) | ||
| 2240 | d_add(dentry, NULL); | ||
| 2241 | |||
| 2242 | nfs_set_verifier(dentry, | ||
| 2243 | nfs_save_change_attribute(opendata->dir->d_inode)); | ||
| 2244 | } | ||
| 2245 | goto out; | 2244 | goto out; |
| 2246 | } | ||
| 2247 | 2245 | ||
| 2248 | state = nfs4_opendata_to_nfs4_state(opendata); | 2246 | state = nfs4_opendata_to_nfs4_state(opendata); |
| 2249 | ret = PTR_ERR(state); | 2247 | ret = PTR_ERR(state); |
| @@ -4841,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, | |||
| 4841 | case -NFS4ERR_DELEG_REVOKED: | 4839 | case -NFS4ERR_DELEG_REVOKED: |
| 4842 | case -NFS4ERR_ADMIN_REVOKED: | 4840 | case -NFS4ERR_ADMIN_REVOKED: |
| 4843 | case -NFS4ERR_BAD_STATEID: | 4841 | case -NFS4ERR_BAD_STATEID: |
| 4844 | if (state == NULL) | ||
| 4845 | break; | ||
| 4846 | nfs_remove_bad_delegation(state->inode); | ||
| 4847 | case -NFS4ERR_OPENMODE: | 4842 | case -NFS4ERR_OPENMODE: |
| 4848 | if (state == NULL) | 4843 | if (state == NULL) |
| 4849 | break; | 4844 | break; |
| @@ -8341,7 +8336,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { | |||
| 8341 | static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { | 8336 | static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { |
| 8342 | .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, | 8337 | .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, |
| 8343 | .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, | 8338 | .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, |
| 8344 | .recover_open = nfs4_open_expired, | 8339 | .recover_open = nfs40_open_expired, |
| 8345 | .recover_lock = nfs4_lock_expired, | 8340 | .recover_lock = nfs4_lock_expired, |
| 8346 | .establish_clid = nfs4_init_clientid, | 8341 | .establish_clid = nfs4_init_clientid, |
| 8347 | }; | 8342 | }; |
| @@ -8408,8 +8403,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | |||
| 8408 | | NFS_CAP_CHANGE_ATTR | 8403 | | NFS_CAP_CHANGE_ATTR |
| 8409 | | NFS_CAP_POSIX_LOCK | 8404 | | NFS_CAP_POSIX_LOCK |
| 8410 | | NFS_CAP_STATEID_NFSV41 | 8405 | | NFS_CAP_STATEID_NFSV41 |
| 8411 | | NFS_CAP_ATOMIC_OPEN_V1 | 8406 | | NFS_CAP_ATOMIC_OPEN_V1, |
| 8412 | | NFS_CAP_SEEK, | ||
| 8413 | .init_client = nfs41_init_client, | 8407 | .init_client = nfs41_init_client, |
| 8414 | .shutdown_client = nfs41_shutdown_client, | 8408 | .shutdown_client = nfs41_shutdown_client, |
| 8415 | .match_stateid = nfs41_match_stateid, | 8409 | .match_stateid = nfs41_match_stateid, |
| @@ -8431,7 +8425,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | |||
| 8431 | | NFS_CAP_CHANGE_ATTR | 8425 | | NFS_CAP_CHANGE_ATTR |
| 8432 | | NFS_CAP_POSIX_LOCK | 8426 | | NFS_CAP_POSIX_LOCK |
| 8433 | | NFS_CAP_STATEID_NFSV41 | 8427 | | NFS_CAP_STATEID_NFSV41 |
| 8434 | | NFS_CAP_ATOMIC_OPEN_V1, | 8428 | | NFS_CAP_ATOMIC_OPEN_V1 |
| 8429 | | NFS_CAP_SEEK, | ||
| 8435 | .init_client = nfs41_init_client, | 8430 | .init_client = nfs41_init_client, |
| 8436 | .shutdown_client = nfs41_shutdown_client, | 8431 | .shutdown_client = nfs41_shutdown_client, |
| 8437 | .match_stateid = nfs41_match_stateid, | 8432 | .match_stateid = nfs41_match_stateid, |
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6e4bda63000..9e5bc42180e4 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * All rights reserved. | 5 | * All rights reserved. |
| 6 | * | 6 | * |
| 7 | * Benny Halevy <bhalevy@panasas.com> | 7 | * Benny Halevy <bhalevy@panasas.com> |
| 8 | * Boaz Harrosh <bharrosh@panasas.com> | 8 | * Boaz Harrosh <ooo@electrozaur.com> |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License version 2 | 11 | * it under the terms of the GNU General Public License version 2 |
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index c89357c7a914..919efd4a1a23 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * All rights reserved. | 5 | * All rights reserved. |
| 6 | * | 6 | * |
| 7 | * Benny Halevy <bhalevy@panasas.com> | 7 | * Benny Halevy <bhalevy@panasas.com> |
| 8 | * Boaz Harrosh <bharrosh@panasas.com> | 8 | * Boaz Harrosh <ooo@electrozaur.com> |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License version 2 | 11 | * it under the terms of the GNU General Public License version 2 |
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 3a0828d57339..2641dbad345c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h | |||
| @@ -6,7 +6,7 @@ | |||
| 6 | * All rights reserved. | 6 | * All rights reserved. |
| 7 | * | 7 | * |
| 8 | * Benny Halevy <bhalevy@panasas.com> | 8 | * Benny Halevy <bhalevy@panasas.com> |
| 9 | * Boaz Harrosh <bharrosh@panasas.com> | 9 | * Boaz Harrosh <ooo@electrozaur.com> |
| 10 | * | 10 | * |
| 11 | * This program is free software; you can redistribute it and/or modify | 11 | * This program is free software; you can redistribute it and/or modify |
| 12 | * it under the terms of the GNU General Public License version 2 | 12 | * it under the terms of the GNU General Public License version 2 |
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index b3918f7ac34d..f093c7ec983b 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * All rights reserved. | 5 | * All rights reserved. |
| 6 | * | 6 | * |
| 7 | * Benny Halevy <bhalevy@panasas.com> | 7 | * Benny Halevy <bhalevy@panasas.com> |
| 8 | * Boaz Harrosh <bharrosh@panasas.com> | 8 | * Boaz Harrosh <ooo@electrozaur.com> |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify | 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License version 2 | 11 | * it under the terms of the GNU General Public License version 2 |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 12493846a2d3..f83b02dc9166 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
| @@ -715,8 +715,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) | |||
| 715 | 715 | ||
| 716 | if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) | 716 | if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) |
| 717 | nfs_release_request(req); | 717 | nfs_release_request(req); |
| 718 | else | ||
| 719 | WARN_ON_ONCE(1); | ||
| 720 | } | 718 | } |
| 721 | 719 | ||
| 722 | static void | 720 | static void |
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ed2b1151b171..7cbdf1b2e4ab 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c | |||
| @@ -774,8 +774,12 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) | |||
| 774 | { | 774 | { |
| 775 | if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { | 775 | if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { |
| 776 | rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); | 776 | rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); |
| 777 | dprintk("%s slot is busy\n", __func__); | 777 | /* Race breaker */ |
| 778 | return false; | 778 | if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { |
| 779 | dprintk("%s slot is busy\n", __func__); | ||
| 780 | return false; | ||
| 781 | } | ||
| 782 | rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); | ||
| 779 | } | 783 | } |
| 780 | return true; | 784 | return true; |
| 781 | } | 785 | } |
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cdeb3cfd6f32..0beb023f25ac 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c | |||
| @@ -1272,7 +1272,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) | |||
| 1272 | */ | 1272 | */ |
| 1273 | if (argp->opcnt == resp->opcnt) | 1273 | if (argp->opcnt == resp->opcnt) |
| 1274 | return false; | 1274 | return false; |
| 1275 | 1275 | if (next->opnum == OP_ILLEGAL) | |
| 1276 | return false; | ||
| 1276 | nextd = OPDESC(next); | 1277 | nextd = OPDESC(next); |
| 1277 | /* | 1278 | /* |
| 1278 | * Rest of 2.6.3.1.1: certain operations will return WRONGSEC | 1279 | * Rest of 2.6.3.1.1: certain operations will return WRONGSEC |
| @@ -1589,7 +1590,8 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op | |||
| 1589 | static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, | 1590 | static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, |
| 1590 | struct nfsd4_op *op) | 1591 | struct nfsd4_op *op) |
| 1591 | { | 1592 | { |
| 1592 | return NFS4_MAX_SESSIONID_LEN + 20; | 1593 | return (op_encode_hdr_size |
| 1594 | + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); | ||
| 1593 | } | 1595 | } |
| 1594 | 1596 | ||
| 1595 | static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | 1597 | static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) |
| @@ -1893,6 +1895,7 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
| 1893 | .op_func = (nfsd4op_func)nfsd4_sequence, | 1895 | .op_func = (nfsd4op_func)nfsd4_sequence, |
| 1894 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, | 1896 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, |
| 1895 | .op_name = "OP_SEQUENCE", | 1897 | .op_name = "OP_SEQUENCE", |
| 1898 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize, | ||
| 1896 | }, | 1899 | }, |
| 1897 | [OP_DESTROY_CLIENTID] = { | 1900 | [OP_DESTROY_CLIENTID] = { |
| 1898 | .op_func = (nfsd4op_func)nfsd4_destroy_clientid, | 1901 | .op_func = (nfsd4op_func)nfsd4_destroy_clientid, |
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 747f3b95bd11..33a46a8dfaf7 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h | |||
| @@ -335,12 +335,15 @@ void nfsd_lockd_shutdown(void); | |||
| 335 | (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) | 335 | (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) |
| 336 | 336 | ||
| 337 | #ifdef CONFIG_NFSD_V4_SECURITY_LABEL | 337 | #ifdef CONFIG_NFSD_V4_SECURITY_LABEL |
| 338 | #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ | 338 | #define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL |
| 339 | (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL) | ||
| 340 | #else | 339 | #else |
| 341 | #define NFSD4_2_SUPPORTED_ATTRS_WORD2 0 | 340 | #define NFSD4_2_SECURITY_ATTRS 0 |
| 342 | #endif | 341 | #endif |
| 343 | 342 | ||
| 343 | #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ | ||
| 344 | (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ | ||
| 345 | NFSD4_2_SECURITY_ATTRS) | ||
| 346 | |||
| 344 | static inline u32 nfsd_suppattrs0(u32 minorversion) | 347 | static inline u32 nfsd_suppattrs0(u32 minorversion) |
| 345 | { | 348 | { |
| 346 | return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 | 349 | return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 |
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9d3e9c50066a..89326acd4561 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c | |||
| @@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
| 229 | &fsnotify_mark_srcu); | 229 | &fsnotify_mark_srcu); |
| 230 | } | 230 | } |
| 231 | 231 | ||
| 232 | /* | ||
| 233 | * We need to merge inode & vfsmount mark lists so that inode mark | ||
| 234 | * ignore masks are properly reflected for mount mark notifications. | ||
| 235 | * That's why this traversal is so complicated... | ||
| 236 | */ | ||
| 232 | while (inode_node || vfsmount_node) { | 237 | while (inode_node || vfsmount_node) { |
| 233 | inode_group = vfsmount_group = NULL; | 238 | inode_group = NULL; |
| 239 | inode_mark = NULL; | ||
| 240 | vfsmount_group = NULL; | ||
| 241 | vfsmount_mark = NULL; | ||
| 234 | 242 | ||
| 235 | if (inode_node) { | 243 | if (inode_node) { |
| 236 | inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), | 244 | inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), |
| @@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
| 244 | vfsmount_group = vfsmount_mark->group; | 252 | vfsmount_group = vfsmount_mark->group; |
| 245 | } | 253 | } |
| 246 | 254 | ||
| 247 | if (inode_group > vfsmount_group) { | 255 | if (inode_group && vfsmount_group) { |
| 248 | /* handle inode */ | 256 | int cmp = fsnotify_compare_groups(inode_group, |
| 249 | ret = send_to_group(to_tell, inode_mark, NULL, mask, | 257 | vfsmount_group); |
| 250 | data, data_is, cookie, file_name); | 258 | if (cmp > 0) { |
| 251 | /* we didn't use the vfsmount_mark */ | 259 | inode_group = NULL; |
| 252 | vfsmount_group = NULL; | 260 | inode_mark = NULL; |
| 253 | } else if (vfsmount_group > inode_group) { | 261 | } else if (cmp < 0) { |
| 254 | ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, | 262 | vfsmount_group = NULL; |
| 255 | data, data_is, cookie, file_name); | 263 | vfsmount_mark = NULL; |
| 256 | inode_group = NULL; | 264 | } |
| 257 | } else { | ||
| 258 | ret = send_to_group(to_tell, inode_mark, vfsmount_mark, | ||
| 259 | mask, data, data_is, cookie, | ||
| 260 | file_name); | ||
| 261 | } | 265 | } |
| 266 | ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, | ||
| 267 | data, data_is, cookie, file_name); | ||
| 262 | 268 | ||
| 263 | if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) | 269 | if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) |
| 264 | goto out; | 270 | goto out; |
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 9c0898c4cfe1..3b68b0ae0a97 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h | |||
| @@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); | |||
| 12 | /* protects reads of inode and vfsmount marks list */ | 12 | /* protects reads of inode and vfsmount marks list */ |
| 13 | extern struct srcu_struct fsnotify_mark_srcu; | 13 | extern struct srcu_struct fsnotify_mark_srcu; |
| 14 | 14 | ||
| 15 | /* compare two groups for sorting of marks lists */ | ||
| 16 | extern int fsnotify_compare_groups(struct fsnotify_group *a, | ||
| 17 | struct fsnotify_group *b); | ||
| 18 | |||
| 15 | extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, | 19 | extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, |
| 16 | __u32 mask); | 20 | __u32 mask); |
| 17 | /* add a mark to an inode */ | 21 | /* add a mark to an inode */ |
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 9ce062218de9..dfbf5447eea4 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c | |||
| @@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | |||
| 194 | { | 194 | { |
| 195 | struct fsnotify_mark *lmark, *last = NULL; | 195 | struct fsnotify_mark *lmark, *last = NULL; |
| 196 | int ret = 0; | 196 | int ret = 0; |
| 197 | int cmp; | ||
| 197 | 198 | ||
| 198 | mark->flags |= FSNOTIFY_MARK_FLAG_INODE; | 199 | mark->flags |= FSNOTIFY_MARK_FLAG_INODE; |
| 199 | 200 | ||
| @@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, | |||
| 219 | goto out; | 220 | goto out; |
| 220 | } | 221 | } |
| 221 | 222 | ||
| 222 | if (mark->group->priority < lmark->group->priority) | 223 | cmp = fsnotify_compare_groups(lmark->group, mark->group); |
| 223 | continue; | 224 | if (cmp < 0) |
| 224 | |||
| 225 | if ((mark->group->priority == lmark->group->priority) && | ||
| 226 | (mark->group < lmark->group)) | ||
| 227 | continue; | 225 | continue; |
| 228 | 226 | ||
| 229 | hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); | 227 | hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); |
| @@ -288,20 +286,25 @@ void fsnotify_unmount_inodes(struct list_head *list) | |||
| 288 | spin_unlock(&inode->i_lock); | 286 | spin_unlock(&inode->i_lock); |
| 289 | 287 | ||
| 290 | /* In case the dropping of a reference would nuke next_i. */ | 288 | /* In case the dropping of a reference would nuke next_i. */ |
| 291 | if ((&next_i->i_sb_list != list) && | 289 | while (&next_i->i_sb_list != list) { |
| 292 | atomic_read(&next_i->i_count)) { | ||
| 293 | spin_lock(&next_i->i_lock); | 290 | spin_lock(&next_i->i_lock); |
| 294 | if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { | 291 | if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && |
| 292 | atomic_read(&next_i->i_count)) { | ||
| 295 | __iget(next_i); | 293 | __iget(next_i); |
| 296 | need_iput = next_i; | 294 | need_iput = next_i; |
| 295 | spin_unlock(&next_i->i_lock); | ||
| 296 | break; | ||
| 297 | } | 297 | } |
| 298 | spin_unlock(&next_i->i_lock); | 298 | spin_unlock(&next_i->i_lock); |
| 299 | next_i = list_entry(next_i->i_sb_list.next, | ||
| 300 | struct inode, i_sb_list); | ||
| 299 | } | 301 | } |
| 300 | 302 | ||
| 301 | /* | 303 | /* |
| 302 | * We can safely drop inode_sb_list_lock here because we hold | 304 | * We can safely drop inode_sb_list_lock here because either |
| 303 | * references on both inode and next_i. Also no new inodes | 305 | * we actually hold references on both inode and next_i or |
| 304 | * will be added since the umount has begun. | 306 | * end of list. Also no new inodes will be added since the |
| 307 | * umount has begun. | ||
| 305 | */ | 308 | */ |
| 306 | spin_unlock(&inode_sb_list_lock); | 309 | spin_unlock(&inode_sb_list_lock); |
| 307 | 310 | ||
diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d90deaa08e78..34c38fabf514 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c | |||
| @@ -210,6 +210,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas | |||
| 210 | } | 210 | } |
| 211 | 211 | ||
| 212 | /* | 212 | /* |
| 213 | * Sorting function for lists of fsnotify marks. | ||
| 214 | * | ||
| 215 | * Fanotify supports different notification classes (reflected as priority of | ||
| 216 | * notification group). Events shall be passed to notification groups in | ||
| 217 | * decreasing priority order. To achieve this marks in notification lists for | ||
| 218 | * inodes and vfsmounts are sorted so that priorities of corresponding groups | ||
| 219 | * are descending. | ||
| 220 | * | ||
| 221 | * Furthermore correct handling of the ignore mask requires processing inode | ||
| 222 | * and vfsmount marks of each group together. Using the group address as | ||
| 223 | * further sort criterion provides a unique sorting order and thus we can | ||
| 224 | * merge inode and vfsmount lists of marks in linear time and find groups | ||
| 225 | * present in both lists. | ||
| 226 | * | ||
| 227 | * A return value of 1 signifies that b has priority over a. | ||
| 228 | * A return value of 0 signifies that the two marks have to be handled together. | ||
| 229 | * A return value of -1 signifies that a has priority over b. | ||
| 230 | */ | ||
| 231 | int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) | ||
| 232 | { | ||
| 233 | if (a == b) | ||
| 234 | return 0; | ||
| 235 | if (!a) | ||
| 236 | return 1; | ||
| 237 | if (!b) | ||
| 238 | return -1; | ||
| 239 | if (a->priority < b->priority) | ||
| 240 | return 1; | ||
| 241 | if (a->priority > b->priority) | ||
| 242 | return -1; | ||
| 243 | if (a < b) | ||
| 244 | return 1; | ||
| 245 | return -1; | ||
| 246 | } | ||
| 247 | |||
| 248 | /* | ||
| 213 | * Attach an initialized mark to a given group and fs object. | 249 | * Attach an initialized mark to a given group and fs object. |
| 214 | * These marks may be used for the fsnotify backend to determine which | 250 | * These marks may be used for the fsnotify backend to determine which |
| 215 | * event types should be delivered to which group. | 251 | * event types should be delivered to which group. |
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index ac851e8376b1..faefa72a11eb 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c | |||
| @@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
| 153 | struct mount *m = real_mount(mnt); | 153 | struct mount *m = real_mount(mnt); |
| 154 | struct fsnotify_mark *lmark, *last = NULL; | 154 | struct fsnotify_mark *lmark, *last = NULL; |
| 155 | int ret = 0; | 155 | int ret = 0; |
| 156 | int cmp; | ||
| 156 | 157 | ||
| 157 | mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; | 158 | mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; |
| 158 | 159 | ||
| @@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
| 178 | goto out; | 179 | goto out; |
| 179 | } | 180 | } |
| 180 | 181 | ||
| 181 | if (mark->group->priority < lmark->group->priority) | 182 | cmp = fsnotify_compare_groups(lmark->group, mark->group); |
| 182 | continue; | 183 | if (cmp < 0) |
| 183 | |||
| 184 | if ((mark->group->priority == lmark->group->priority) && | ||
| 185 | (mark->group < lmark->group)) | ||
| 186 | continue; | 184 | continue; |
| 187 | 185 | ||
| 188 | hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); | 186 | hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 97de0fbd9f78..a96044004064 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
| @@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, | |||
| 925 | size_t veclen, size_t total) | 925 | size_t veclen, size_t total) |
| 926 | { | 926 | { |
| 927 | int ret; | 927 | int ret; |
| 928 | struct msghdr msg; | 928 | struct msghdr msg = {.msg_flags = 0,}; |
| 929 | 929 | ||
| 930 | if (sock == NULL) { | 930 | if (sock == NULL) { |
| 931 | ret = -EINVAL; | 931 | ret = -EINVAL; |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8add6f1030d7..b931e04e3388 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
| @@ -158,7 +158,7 @@ bail_add: | |||
| 158 | * NOTE: This dentry already has ->d_op set from | 158 | * NOTE: This dentry already has ->d_op set from |
| 159 | * ocfs2_get_parent() and ocfs2_get_dentry() | 159 | * ocfs2_get_parent() and ocfs2_get_dentry() |
| 160 | */ | 160 | */ |
| 161 | if (ret) | 161 | if (!IS_ERR_OR_NULL(ret)) |
| 162 | dentry = ret; | 162 | dentry = ret; |
| 163 | 163 | ||
| 164 | status = ocfs2_dentry_attach_lock(dentry, inode, | 164 | status = ocfs2_dentry_attach_lock(dentry, inode, |
| @@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags, | |||
| 823 | f = get_empty_filp(); | 823 | f = get_empty_filp(); |
| 824 | if (!IS_ERR(f)) { | 824 | if (!IS_ERR(f)) { |
| 825 | f->f_flags = flags; | 825 | f->f_flags = flags; |
| 826 | f->f_path = *path; | 826 | error = vfs_open(path, f, cred); |
| 827 | error = do_dentry_open(f, NULL, cred); | ||
| 828 | if (!error) { | 827 | if (!error) { |
| 829 | /* from now on we need fput() to dispose of f */ | 828 | /* from now on we need fput() to dispose of f */ |
| 830 | error = open_check_o_direct(f); | 829 | error = open_check_o_direct(f); |
| @@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags, | |||
| 841 | } | 840 | } |
| 842 | EXPORT_SYMBOL(dentry_open); | 841 | EXPORT_SYMBOL(dentry_open); |
| 843 | 842 | ||
| 843 | /** | ||
| 844 | * vfs_open - open the file at the given path | ||
| 845 | * @path: path to open | ||
| 846 | * @filp: newly allocated file with f_flag initialized | ||
| 847 | * @cred: credentials to use | ||
| 848 | */ | ||
| 849 | int vfs_open(const struct path *path, struct file *filp, | ||
| 850 | const struct cred *cred) | ||
| 851 | { | ||
| 852 | struct inode *inode = path->dentry->d_inode; | ||
| 853 | |||
| 854 | if (inode->i_op->dentry_open) | ||
| 855 | return inode->i_op->dentry_open(path->dentry, filp, cred); | ||
| 856 | else { | ||
| 857 | filp->f_path = *path; | ||
| 858 | return do_dentry_open(filp, NULL, cred); | ||
| 859 | } | ||
| 860 | } | ||
| 861 | EXPORT_SYMBOL(vfs_open); | ||
| 862 | |||
| 844 | static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) | 863 | static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) |
| 845 | { | 864 | { |
| 846 | int lookup_flags = 0; | 865 | int lookup_flags = 0; |
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000000..34355818a2e0 --- /dev/null +++ b/fs/overlayfs/Kconfig | |||
| @@ -0,0 +1,10 @@ | |||
| 1 | config OVERLAY_FS | ||
| 2 | tristate "Overlay filesystem support" | ||
| 3 | help | ||
| 4 | An overlay filesystem combines two filesystems - an 'upper' filesystem | ||
| 5 | and a 'lower' filesystem. When a name exists in both filesystems, the | ||
| 6 | object in the 'upper' filesystem is visible while the object in the | ||
| 7 | 'lower' filesystem is either hidden or, in the case of directories, | ||
| 8 | merged with the 'upper' object. | ||
| 9 | |||
| 10 | For more information see Documentation/filesystems/overlayfs.txt | ||
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000000..900daed3e91d --- /dev/null +++ b/fs/overlayfs/Makefile | |||
| @@ -0,0 +1,7 @@ | |||
| 1 | # | ||
| 2 | # Makefile for the overlay filesystem. | ||
| 3 | # | ||
| 4 | |||
| 5 | obj-$(CONFIG_OVERLAY_FS) += overlay.o | ||
| 6 | |||
| 7 | overlay-objs := super.o inode.o dir.o readdir.o copy_up.o | ||
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000000..ea10a8719107 --- /dev/null +++ b/fs/overlayfs/copy_up.c | |||
| @@ -0,0 +1,414 @@ | |||
| 1 | /* | ||
| 2 | * | ||
| 3 | * Copyright (C) 2011 Novell Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms of the GNU General Public License version 2 as published by | ||
| 7 | * the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/fs.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/file.h> | ||
| 13 | #include <linux/splice.h> | ||
| 14 | #include <linux/xattr.h> | ||
| 15 | #include <linux/security.h> | ||
| 16 | #include <linux/uaccess.h> | ||
| 17 | #include <linux/sched.h> | ||
| 18 | #include <linux/namei.h> | ||
| 19 | #include "overlayfs.h" | ||
| 20 | |||
| 21 | #define OVL_COPY_UP_CHUNK_SIZE (1 << 20) | ||
| 22 | |||
| 23 | int ovl_copy_xattr(struct dentry *old, struct dentry *new) | ||
| 24 | { | ||
| 25 | ssize_t list_size, size; | ||
| 26 | char *buf, *name, *value; | ||
| 27 | int error; | ||
| 28 | |||
| 29 | if (!old->d_inode->i_op->getxattr || | ||
| 30 | !new->d_inode->i_op->getxattr) | ||
| 31 | return 0; | ||
| 32 | |||
| 33 | list_size = vfs_listxattr(old, NULL, 0); | ||
| 34 | if (list_size <= 0) { | ||
| 35 | if (list_size == -EOPNOTSUPP) | ||
| 36 | return 0; | ||
| 37 | return list_size; | ||
| 38 | } | ||
| 39 | |||
| 40 | buf = kzalloc(list_size, GFP_KERNEL); | ||
| 41 | if (!buf) | ||
| 42 | return -ENOMEM; | ||
| 43 | |||
| 44 | error = -ENOMEM; | ||
| 45 | value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); | ||
| 46 | if (!value) | ||
| 47 | goto out; | ||
| 48 | |||
| 49 | list_size = vfs_listxattr(old, buf, list_size); | ||
| 50 | if (list_size <= 0) { | ||
| 51 | error = list_size; | ||
| 52 | goto out_free_value; | ||
| 53 | } | ||
| 54 | |||
| 55 | for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { | ||
| 56 | size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); | ||
| 57 | if (size <= 0) { | ||
| 58 | error = size; | ||
| 59 | goto out_free_value; | ||
| 60 | } | ||
| 61 | error = vfs_setxattr(new, name, value, size, 0); | ||
| 62 | if (error) | ||
| 63 | goto out_free_value; | ||
| 64 | } | ||
| 65 | |||
| 66 | out_free_value: | ||
| 67 | kfree(value); | ||
| 68 | out: | ||
| 69 | kfree(buf); | ||
| 70 | return error; | ||
| 71 | } | ||
| 72 | |||
| 73 | static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) | ||
| 74 | { | ||
| 75 | struct file *old_file; | ||
| 76 | struct file *new_file; | ||
| 77 | loff_t old_pos = 0; | ||
| 78 | loff_t new_pos = 0; | ||
| 79 | int error = 0; | ||
| 80 | |||
| 81 | if (len == 0) | ||
| 82 | return 0; | ||
| 83 | |||
| 84 | old_file = ovl_path_open(old, O_RDONLY); | ||
| 85 | if (IS_ERR(old_file)) | ||
| 86 | return PTR_ERR(old_file); | ||
| 87 | |||
| 88 | new_file = ovl_path_open(new, O_WRONLY); | ||
| 89 | if (IS_ERR(new_file)) { | ||
| 90 | error = PTR_ERR(new_file); | ||
| 91 | goto out_fput; | ||
| 92 | } | ||
| 93 | |||
| 94 | /* FIXME: copy up sparse files efficiently */ | ||
| 95 | while (len) { | ||
| 96 | size_t this_len = OVL_COPY_UP_CHUNK_SIZE; | ||
| 97 | long bytes; | ||
| 98 | |||
| 99 | if (len < this_len) | ||
| 100 | this_len = len; | ||
| 101 | |||
| 102 | if (signal_pending_state(TASK_KILLABLE, current)) { | ||
| 103 | error = -EINTR; | ||
| 104 | break; | ||
| 105 | } | ||
| 106 | |||
| 107 | bytes = do_splice_direct(old_file, &old_pos, | ||
| 108 | new_file, &new_pos, | ||
| 109 | this_len, SPLICE_F_MOVE); | ||
| 110 | if (bytes <= 0) { | ||
| 111 | error = bytes; | ||
| 112 | break; | ||
| 113 | } | ||
| 114 | WARN_ON(old_pos != new_pos); | ||
| 115 | |||
| 116 | len -= bytes; | ||
| 117 | } | ||
| 118 | |||
| 119 | fput(new_file); | ||
| 120 | out_fput: | ||
| 121 | fput(old_file); | ||
| 122 | return error; | ||
| 123 | } | ||
| 124 | |||
| 125 | static char *ovl_read_symlink(struct dentry *realdentry) | ||
| 126 | { | ||
| 127 | int res; | ||
| 128 | char *buf; | ||
| 129 | struct inode *inode = realdentry->d_inode; | ||
| 130 | mm_segment_t old_fs; | ||
| 131 | |||
| 132 | res = -EINVAL; | ||
| 133 | if (!inode->i_op->readlink) | ||
| 134 | goto err; | ||
| 135 | |||
| 136 | res = -ENOMEM; | ||
| 137 | buf = (char *) __get_free_page(GFP_KERNEL); | ||
| 138 | if (!buf) | ||
| 139 | goto err; | ||
| 140 | |||
| 141 | old_fs = get_fs(); | ||
| 142 | set_fs(get_ds()); | ||
| 143 | /* The cast to a user pointer is valid due to the set_fs() */ | ||
| 144 | res = inode->i_op->readlink(realdentry, | ||
| 145 | (char __user *)buf, PAGE_SIZE - 1); | ||
| 146 | set_fs(old_fs); | ||
| 147 | if (res < 0) { | ||
| 148 | free_page((unsigned long) buf); | ||
| 149 | goto err; | ||
| 150 | } | ||
| 151 | buf[res] = '\0'; | ||
| 152 | |||
| 153 | return buf; | ||
| 154 | |||
| 155 | err: | ||
| 156 | return ERR_PTR(res); | ||
| 157 | } | ||
| 158 | |||
| 159 | static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) | ||
| 160 | { | ||
| 161 | struct iattr attr = { | ||
| 162 | .ia_valid = | ||
| 163 | ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, | ||
| 164 | .ia_atime = stat->atime, | ||
| 165 | .ia_mtime = stat->mtime, | ||
| 166 | }; | ||
| 167 | |||
| 168 | return notify_change(upperdentry, &attr, NULL); | ||
| 169 | } | ||
| 170 | |||
| 171 | int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) | ||
| 172 | { | ||
| 173 | int err = 0; | ||
| 174 | |||
| 175 | if (!S_ISLNK(stat->mode)) { | ||
| 176 | struct iattr attr = { | ||
| 177 | .ia_valid = ATTR_MODE, | ||
| 178 | .ia_mode = stat->mode, | ||
| 179 | }; | ||
| 180 | err = notify_change(upperdentry, &attr, NULL); | ||
| 181 | } | ||
| 182 | if (!err) { | ||
| 183 | struct iattr attr = { | ||
| 184 | .ia_valid = ATTR_UID | ATTR_GID, | ||
| 185 | .ia_uid = stat->uid, | ||
| 186 | .ia_gid = stat->gid, | ||
| 187 | }; | ||
| 188 | err = notify_change(upperdentry, &attr, NULL); | ||
| 189 | } | ||
| 190 | if (!err) | ||
| 191 | ovl_set_timestamps(upperdentry, stat); | ||
| 192 | |||
| 193 | return err; | ||
| 194 | |||
| 195 | } | ||
| 196 | |||
/*
 * Do the actual copy-up with workdir and upperdir locked (caller holds
 * the lock_rename() of both): build a complete copy of the lower object
 * as a temporary file in workdir, then rename it into place in upperdir.
 *
 * @attr, if non-NULL, is an additional setattr to apply to the new copy
 * (the setattr that triggered this copy-up).  @link is the symlink
 * target for S_IFLNK objects.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	/* The (negative) name in upperdir the copy will be renamed to */
	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;
		ovl_path_upper(dentry, &upperpath);
		/* Not copied up yet, so there must be no upper dentry */
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* Atomically move the finished copy into place */
	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	/* Hand the reference to the overlay dentry; don't dput it below */
	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	/* Remove the partially built temporary from workdir */
	ovl_cleanup(wdir, newdentry);
	goto out;
}
| 273 | |||
/*
 * Copy up a single dentry
 *
 * Directory renames only allowed on "pure upper" (already created on
 * upper filesystem, never copied up). Directories which are on lower or
 * are merged may not be renamed. For these -EXDEV is returned and
 * userspace has to deal with it. This means, when copying up a
 * directory we can rely on it and ancestors being stable.
 *
 * Non-directory renames start with copy up of source if necessary. The
 * actual rename will only proceed once the copy up was successful. Copy
 * up uses upper parent i_mutex for exclusion. Since rename can change
 * d_parent it is possible that the copy up will lock the old parent. At
 * that point the file will have already been copied up anyway.
 */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr)
{
	struct dentry *workdir = ovl_workdir(dentry);
	int err;
	struct kstat pstat;
	struct path parentpath;
	struct dentry *upperdir;
	struct dentry *upperdentry;
	const struct cred *old_cred;
	struct cred *override_cred;
	char *link = NULL;

	ovl_path_upper(parent, &parentpath);
	upperdir = parentpath.dentry;

	/* Remember the parent's timestamps so they can be restored below */
	err = vfs_getattr(&parentpath, &pstat);
	if (err)
		return err;

	/* Read the symlink target up front; ovl_copy_up_locked() needs it */
	if (S_ISLNK(stat->mode)) {
		link = ovl_read_symlink(lowerpath->dentry);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}

	err = -ENOMEM;
	override_cred = prepare_creds();
	if (!override_cred)
		goto out_free_link;

	/* Create the copy with the lower file's ownership */
	override_cred->fsuid = stat->uid;
	override_cred->fsgid = stat->gid;
	/*
	 * CAP_SYS_ADMIN for copying up extended attributes
	 * CAP_DAC_OVERRIDE for create
	 * CAP_FOWNER for chmod, timestamp update
	 * CAP_FSETID for chmod
	 * CAP_CHOWN for chown
	 * CAP_MKNOD for mknod
	 */
	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(override_cred->cap_effective, CAP_FOWNER);
	cap_raise(override_cred->cap_effective, CAP_FSETID);
	cap_raise(override_cred->cap_effective, CAP_CHOWN);
	cap_raise(override_cred->cap_effective, CAP_MKNOD);
	old_cred = override_creds(override_cred);

	err = -EIO;
	/* A non-NULL return means workdir/upperdir are in an ancestry
	 * relationship, which must not happen */
	if (lock_rename(workdir, upperdir) != NULL) {
		pr_err("overlayfs: failed to lock workdir+upperdir\n");
		goto out_unlock;
	}
	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		unlock_rename(workdir, upperdir);
		err = 0;
		/* Raced with another copy-up?  Do the setattr here */
		if (attr) {
			mutex_lock(&upperdentry->d_inode->i_mutex);
			err = notify_change(upperdentry, attr, NULL);
			mutex_unlock(&upperdentry->d_inode->i_mutex);
		}
		goto out_put_cred;
	}

	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
				 stat, attr, link);
	if (!err) {
		/* Restore timestamps on parent (best effort) */
		ovl_set_timestamps(upperdir, &pstat);
	}
out_unlock:
	unlock_rename(workdir, upperdir);
out_put_cred:
	revert_creds(old_cred);
	put_cred(override_cred);

out_free_link:
	if (link)
		free_page((unsigned long) link);

	return err;
}
| 375 | |||
| 376 | int ovl_copy_up(struct dentry *dentry) | ||
| 377 | { | ||
| 378 | int err; | ||
| 379 | |||
| 380 | err = 0; | ||
| 381 | while (!err) { | ||
| 382 | struct dentry *next; | ||
| 383 | struct dentry *parent; | ||
| 384 | struct path lowerpath; | ||
| 385 | struct kstat stat; | ||
| 386 | enum ovl_path_type type = ovl_path_type(dentry); | ||
| 387 | |||
| 388 | if (type != OVL_PATH_LOWER) | ||
| 389 | break; | ||
| 390 | |||
| 391 | next = dget(dentry); | ||
| 392 | /* find the topmost dentry not yet copied up */ | ||
| 393 | for (;;) { | ||
| 394 | parent = dget_parent(next); | ||
| 395 | |||
| 396 | type = ovl_path_type(parent); | ||
| 397 | if (type != OVL_PATH_LOWER) | ||
| 398 | break; | ||
| 399 | |||
| 400 | dput(next); | ||
| 401 | next = parent; | ||
| 402 | } | ||
| 403 | |||
| 404 | ovl_path_lower(next, &lowerpath); | ||
| 405 | err = vfs_getattr(&lowerpath, &stat); | ||
| 406 | if (!err) | ||
| 407 | err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); | ||
| 408 | |||
| 409 | dput(parent); | ||
| 410 | dput(next); | ||
| 411 | } | ||
| 412 | |||
| 413 | return err; | ||
| 414 | } | ||
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000000..8ffc4b980f1b --- /dev/null +++ b/fs/overlayfs/dir.c | |||
| @@ -0,0 +1,928 @@ | |||
| 1 | /* | ||
| 2 | * | ||
| 3 | * Copyright (C) 2011 Novell Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms of the GNU General Public License version 2 as published by | ||
| 7 | * the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/fs.h> | ||
| 11 | #include <linux/namei.h> | ||
| 12 | #include <linux/xattr.h> | ||
| 13 | #include <linux/security.h> | ||
| 14 | #include <linux/cred.h> | ||
| 15 | #include "overlayfs.h" | ||
| 16 | |||
| 17 | void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) | ||
| 18 | { | ||
| 19 | int err; | ||
| 20 | |||
| 21 | dget(wdentry); | ||
| 22 | if (S_ISDIR(wdentry->d_inode->i_mode)) | ||
| 23 | err = ovl_do_rmdir(wdir, wdentry); | ||
| 24 | else | ||
| 25 | err = ovl_do_unlink(wdir, wdentry); | ||
| 26 | dput(wdentry); | ||
| 27 | |||
| 28 | if (err) { | ||
| 29 | pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", | ||
| 30 | wdentry, err); | ||
| 31 | } | ||
| 32 | } | ||
| 33 | |||
| 34 | struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) | ||
| 35 | { | ||
| 36 | struct dentry *temp; | ||
| 37 | char name[20]; | ||
| 38 | |||
| 39 | snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); | ||
| 40 | |||
| 41 | temp = lookup_one_len(name, workdir, strlen(name)); | ||
| 42 | if (!IS_ERR(temp) && temp->d_inode) { | ||
| 43 | pr_err("overlayfs: workdir/%s already exists\n", name); | ||
| 44 | dput(temp); | ||
| 45 | temp = ERR_PTR(-EIO); | ||
| 46 | } | ||
| 47 | |||
| 48 | return temp; | ||
| 49 | } | ||
| 50 | |||
| 51 | /* caller holds i_mutex on workdir */ | ||
| 52 | static struct dentry *ovl_whiteout(struct dentry *workdir, | ||
| 53 | struct dentry *dentry) | ||
| 54 | { | ||
| 55 | int err; | ||
| 56 | struct dentry *whiteout; | ||
| 57 | struct inode *wdir = workdir->d_inode; | ||
| 58 | |||
| 59 | whiteout = ovl_lookup_temp(workdir, dentry); | ||
| 60 | if (IS_ERR(whiteout)) | ||
| 61 | return whiteout; | ||
| 62 | |||
| 63 | err = ovl_do_whiteout(wdir, whiteout); | ||
| 64 | if (err) { | ||
| 65 | dput(whiteout); | ||
| 66 | whiteout = ERR_PTR(err); | ||
| 67 | } | ||
| 68 | |||
| 69 | return whiteout; | ||
| 70 | } | ||
| 71 | |||
| 72 | int ovl_create_real(struct inode *dir, struct dentry *newdentry, | ||
| 73 | struct kstat *stat, const char *link, | ||
| 74 | struct dentry *hardlink, bool debug) | ||
| 75 | { | ||
| 76 | int err; | ||
| 77 | |||
| 78 | if (newdentry->d_inode) | ||
| 79 | return -ESTALE; | ||
| 80 | |||
| 81 | if (hardlink) { | ||
| 82 | err = ovl_do_link(hardlink, dir, newdentry, debug); | ||
| 83 | } else { | ||
| 84 | switch (stat->mode & S_IFMT) { | ||
| 85 | case S_IFREG: | ||
| 86 | err = ovl_do_create(dir, newdentry, stat->mode, debug); | ||
| 87 | break; | ||
| 88 | |||
| 89 | case S_IFDIR: | ||
| 90 | err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); | ||
| 91 | break; | ||
| 92 | |||
| 93 | case S_IFCHR: | ||
| 94 | case S_IFBLK: | ||
| 95 | case S_IFIFO: | ||
| 96 | case S_IFSOCK: | ||
| 97 | err = ovl_do_mknod(dir, newdentry, | ||
| 98 | stat->mode, stat->rdev, debug); | ||
| 99 | break; | ||
| 100 | |||
| 101 | case S_IFLNK: | ||
| 102 | err = ovl_do_symlink(dir, newdentry, link, debug); | ||
| 103 | break; | ||
| 104 | |||
| 105 | default: | ||
| 106 | err = -EPERM; | ||
| 107 | } | ||
| 108 | } | ||
| 109 | if (!err && WARN_ON(!newdentry->d_inode)) { | ||
| 110 | /* | ||
| 111 | * Not quite sure if non-instantiated dentry is legal or not. | ||
| 112 | * VFS doesn't seem to care so check and warn here. | ||
| 113 | */ | ||
| 114 | err = -ENOENT; | ||
| 115 | } | ||
| 116 | return err; | ||
| 117 | } | ||
| 118 | |||
/* Set the opaque xattr on @upperdentry (value "y", create-only flags 0) */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
}
| 123 | |||
| 124 | static void ovl_remove_opaque(struct dentry *upperdentry) | ||
| 125 | { | ||
| 126 | int err; | ||
| 127 | |||
| 128 | err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); | ||
| 129 | if (err) { | ||
| 130 | pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", | ||
| 131 | upperdentry->d_name.name, err); | ||
| 132 | } | ||
| 133 | } | ||
| 134 | |||
| 135 | static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 136 | struct kstat *stat) | ||
| 137 | { | ||
| 138 | int err; | ||
| 139 | enum ovl_path_type type; | ||
| 140 | struct path realpath; | ||
| 141 | |||
| 142 | type = ovl_path_real(dentry, &realpath); | ||
| 143 | err = vfs_getattr(&realpath, stat); | ||
| 144 | if (err) | ||
| 145 | return err; | ||
| 146 | |||
| 147 | stat->dev = dentry->d_sb->s_dev; | ||
| 148 | stat->ino = dentry->d_inode->i_ino; | ||
| 149 | |||
| 150 | /* | ||
| 151 | * It's probably not worth it to count subdirs to get the | ||
| 152 | * correct link count. nlink=1 seems to pacify 'find' and | ||
| 153 | * other utilities. | ||
| 154 | */ | ||
| 155 | if (type == OVL_PATH_MERGE) | ||
| 156 | stat->nlink = 1; | ||
| 157 | |||
| 158 | return 0; | ||
| 159 | } | ||
| 160 | |||
/*
 * Create @dentry directly in the upper directory (no whiteout to hide,
 * so no workdir dance is needed) and instantiate the overlay @inode.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
				   dentry->d_name.len);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;
	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
	if (err)
		goto out_dput;

	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* Reference handed to ovl_dentry_update(); skip the dput below */
	newdentry = NULL;
out_dput:
	dput(newdentry);
out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}
| 191 | |||
| 192 | static int ovl_lock_rename_workdir(struct dentry *workdir, | ||
| 193 | struct dentry *upperdir) | ||
| 194 | { | ||
| 195 | /* Workdir should not be the same as upperdir */ | ||
| 196 | if (workdir == upperdir) | ||
| 197 | goto err; | ||
| 198 | |||
| 199 | /* Workdir should not be subdir of upperdir and vice versa */ | ||
| 200 | if (lock_rename(workdir, upperdir) != NULL) | ||
| 201 | goto err_unlock; | ||
| 202 | |||
| 203 | return 0; | ||
| 204 | |||
| 205 | err_unlock: | ||
| 206 | unlock_rename(workdir, upperdir); | ||
| 207 | err: | ||
| 208 | pr_err("overlayfs: failed to lock workdir+upperdir\n"); | ||
| 209 | return -EIO; | ||
| 210 | } | ||
| 211 | |||
/*
 * Replace the upper directory of @dentry (which is logically empty: it
 * contains only the whiteouts collected in @list) with a freshly built
 * opaque directory from workdir, via RENAME_EXCHANGE.  The whiteouts
 * and the old directory are then cleaned up in workdir.
 *
 * Returns the new (opaque) directory dentry, or an ERR_PTR.
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	/* Re-check under lock: still a directory in the expected parent? */
	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* Swap the old directory (now in workdir) with the opaque one */
	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
| 286 | |||
| 287 | static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) | ||
| 288 | { | ||
| 289 | int err; | ||
| 290 | struct dentry *ret = NULL; | ||
| 291 | LIST_HEAD(list); | ||
| 292 | |||
| 293 | err = ovl_check_empty_dir(dentry, &list); | ||
| 294 | if (err) | ||
| 295 | ret = ERR_PTR(err); | ||
| 296 | else { | ||
| 297 | /* | ||
| 298 | * If no upperdentry then skip clearing whiteouts. | ||
| 299 | * | ||
| 300 | * Can race with copy-up, since we don't hold the upperdir | ||
| 301 | * mutex. Doesn't matter, since copy-up can't create a | ||
| 302 | * non-empty directory from an empty one. | ||
| 303 | */ | ||
| 304 | if (ovl_dentry_upper(dentry)) | ||
| 305 | ret = ovl_clear_empty(dentry, &list); | ||
| 306 | } | ||
| 307 | |||
| 308 | ovl_cache_free(&list); | ||
| 309 | |||
| 310 | return ret; | ||
| 311 | } | ||
| 312 | |||
/*
 * Create @dentry where a whiteout currently sits in the upper layer:
 * build the new object in workdir, then rename it over the whiteout.
 * New directories are made opaque first (to hide any lower entries) and
 * swapped in with RENAME_EXCHANGE so the displaced whiteout can be
 * cleaned up from workdir.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* Remove the whiteout, now exchanged into workdir */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* Reference handed to ovl_dentry_update(); skip the dput below */
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
| 378 | |||
/*
 * Common implementation for create/mkdir/mknod/symlink/link: copy up
 * the parent, then create either directly in upperdir or, if a whiteout
 * is in the way (the dentry is opaque), via the workdir with elevated
 * credentials.  On success the new overlay inode is consumed by
 * d_instantiate(); on failure it is dropped here.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	int err;
	struct inode *inode;
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};

	err = -ENOMEM;
	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (!ovl_dentry_is_opaque(dentry)) {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_iput;

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	}

	/* On success the inode reference now belongs to the dentry */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
out:
	return err;
}
| 433 | |||
| 434 | static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, | ||
| 435 | const char *link) | ||
| 436 | { | ||
| 437 | int err; | ||
| 438 | |||
| 439 | err = ovl_want_write(dentry); | ||
| 440 | if (!err) { | ||
| 441 | err = ovl_create_or_link(dentry, mode, rdev, link, NULL); | ||
| 442 | ovl_drop_write(dentry); | ||
| 443 | } | ||
| 444 | |||
| 445 | return err; | ||
| 446 | } | ||
| 447 | |||
/* ->create: regular file; only permission bits of @mode are kept (@excl is
 * unused since the dentry is known negative) */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
| 453 | |||
/* ->mkdir: directory; only permission bits of @mode are kept */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
| 458 | |||
/* ->mknod: special file, except the overlay whiteout device itself */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	/* Don't allow creation of "whiteout" on overlay */
	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}
| 468 | |||
/* ->symlink: create a symlink with target @link */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	return ovl_create_object(dentry, S_IFLNK, 0, link);
}
| 474 | |||
| 475 | static int ovl_link(struct dentry *old, struct inode *newdir, | ||
| 476 | struct dentry *new) | ||
| 477 | { | ||
| 478 | int err; | ||
| 479 | struct dentry *upper; | ||
| 480 | |||
| 481 | err = ovl_want_write(old); | ||
| 482 | if (err) | ||
| 483 | goto out; | ||
| 484 | |||
| 485 | err = ovl_copy_up(old); | ||
| 486 | if (err) | ||
| 487 | goto out_drop_write; | ||
| 488 | |||
| 489 | upper = ovl_dentry_upper(old); | ||
| 490 | err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); | ||
| 491 | |||
| 492 | out_drop_write: | ||
| 493 | ovl_drop_write(old); | ||
| 494 | out: | ||
| 495 | return err; | ||
| 496 | } | ||
| 497 | |||
/*
 * Remove @dentry by renaming a whiteout from workdir over its name in
 * upperdir.  If there is no upper object, the whiteout simply takes the
 * (negative) name; otherwise the existing object is displaced into
 * workdir (RENAME_EXCHANGE for directories) and cleaned up there.
 * Directories are first checked for emptiness and their whiteouts
 * cleared via ovl_check_empty_and_clear().
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;

	if (is_dir) {
		opaquedir = ovl_check_empty_and_clear(dentry);
		err = PTR_ERR(opaquedir);
		if (IS_ERR(opaquedir))
			goto out;
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_unlock;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		/* Lower-only: whiteout just takes the negative upper name */
		upper = lookup_one_len(dentry->d_name.name, upperdir,
				       dentry->d_name.len);
		err = PTR_ERR(upper);
		if (IS_ERR(upper))
			goto kill_whiteout;

		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
		dput(upper);
		if (err)
			goto kill_whiteout;
	} else {
		int flags = 0;

		/* ovl_clear_empty() replaced the upper directory */
		if (opaquedir)
			upper = opaquedir;
		err = -ESTALE;
		if (upper->d_parent != upperdir)
			goto kill_whiteout;

		if (is_dir)
			flags |= RENAME_EXCHANGE;

		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
		if (err)
			goto kill_whiteout;

		/* Remove the old directory, now exchanged into workdir */
		if (is_dir)
			ovl_cleanup(wdir, upper);
	}
	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	/* Remove the unused whiteout from workdir */
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
| 571 | |||
/*
 * Remove a pure-upper @dentry: plain rmdir/unlink on the upper
 * filesystem, no whiteout needed.  Fails with -ESTALE if the upper
 * dentry has moved out of the locked parent.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper = ovl_dentry_upper(dentry);
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	err = -ESTALE;
	if (upper->d_parent == upperdir) {
		/* Don't let d_delete() think it can reset d_inode */
		dget(upper);
		if (is_dir)
			err = vfs_rmdir(dir, upper);
		else
			err = vfs_unlink(dir, upper, NULL);
		dput(upper);
		ovl_dentry_version_inc(dentry->d_parent);
	}

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	d_drop(dentry);
	mutex_unlock(&dir->i_mutex);

	return err;
}
| 603 | |||
| 604 | static inline int ovl_check_sticky(struct dentry *dentry) | ||
| 605 | { | ||
| 606 | struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; | ||
| 607 | struct inode *inode = ovl_dentry_real(dentry)->d_inode; | ||
| 608 | |||
| 609 | if (check_sticky(dir, inode)) | ||
| 610 | return -EPERM; | ||
| 611 | |||
| 612 | return 0; | ||
| 613 | } | ||
| 614 | |||
/*
 * Common unlink/rmdir implementation: copy up the parent, then either
 * remove a pure-upper entry directly or whiteout it (with elevated
 * credentials for the workdir manipulation).
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		goto out;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (type == OVL_PATH_PURE_UPPER) {
		err = ovl_remove_upper(dentry, is_dir);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);

		err = ovl_remove_and_whiteout(dentry, is_dir);

		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
| 668 | |||
/* ->unlink: remove a non-directory (see ovl_do_remove()) */
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, false);
}
| 673 | |||
/* ->rmdir: remove a directory (see ovl_do_remove()) */
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, true);
}
| 678 | |||
| 679 | static int ovl_rename2(struct inode *olddir, struct dentry *old, | ||
| 680 | struct inode *newdir, struct dentry *new, | ||
| 681 | unsigned int flags) | ||
| 682 | { | ||
| 683 | int err; | ||
| 684 | enum ovl_path_type old_type; | ||
| 685 | enum ovl_path_type new_type; | ||
| 686 | struct dentry *old_upperdir; | ||
| 687 | struct dentry *new_upperdir; | ||
| 688 | struct dentry *olddentry; | ||
| 689 | struct dentry *newdentry; | ||
| 690 | struct dentry *trap; | ||
| 691 | bool old_opaque; | ||
| 692 | bool new_opaque; | ||
| 693 | bool new_create = false; | ||
| 694 | bool cleanup_whiteout = false; | ||
| 695 | bool overwrite = !(flags & RENAME_EXCHANGE); | ||
| 696 | bool is_dir = S_ISDIR(old->d_inode->i_mode); | ||
| 697 | bool new_is_dir = false; | ||
| 698 | struct dentry *opaquedir = NULL; | ||
| 699 | const struct cred *old_cred = NULL; | ||
| 700 | struct cred *override_cred = NULL; | ||
| 701 | |||
| 702 | err = -EINVAL; | ||
| 703 | if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) | ||
| 704 | goto out; | ||
| 705 | |||
| 706 | flags &= ~RENAME_NOREPLACE; | ||
| 707 | |||
| 708 | err = ovl_check_sticky(old); | ||
| 709 | if (err) | ||
| 710 | goto out; | ||
| 711 | |||
| 712 | /* Don't copy up directory trees */ | ||
| 713 | old_type = ovl_path_type(old); | ||
| 714 | err = -EXDEV; | ||
| 715 | if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) | ||
| 716 | goto out; | ||
| 717 | |||
| 718 | if (new->d_inode) { | ||
| 719 | err = ovl_check_sticky(new); | ||
| 720 | if (err) | ||
| 721 | goto out; | ||
| 722 | |||
| 723 | if (S_ISDIR(new->d_inode->i_mode)) | ||
| 724 | new_is_dir = true; | ||
| 725 | |||
| 726 | new_type = ovl_path_type(new); | ||
| 727 | err = -EXDEV; | ||
| 728 | if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) | ||
| 729 | goto out; | ||
| 730 | |||
| 731 | err = 0; | ||
| 732 | if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { | ||
| 733 | if (ovl_dentry_lower(old)->d_inode == | ||
| 734 | ovl_dentry_lower(new)->d_inode) | ||
| 735 | goto out; | ||
| 736 | } | ||
| 737 | if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { | ||
| 738 | if (ovl_dentry_upper(old)->d_inode == | ||
| 739 | ovl_dentry_upper(new)->d_inode) | ||
| 740 | goto out; | ||
| 741 | } | ||
| 742 | } else { | ||
| 743 | if (ovl_dentry_is_opaque(new)) | ||
| 744 | new_type = OVL_PATH_UPPER; | ||
| 745 | else | ||
| 746 | new_type = OVL_PATH_PURE_UPPER; | ||
| 747 | } | ||
| 748 | |||
| 749 | err = ovl_want_write(old); | ||
| 750 | if (err) | ||
| 751 | goto out; | ||
| 752 | |||
| 753 | err = ovl_copy_up(old); | ||
| 754 | if (err) | ||
| 755 | goto out_drop_write; | ||
| 756 | |||
| 757 | err = ovl_copy_up(new->d_parent); | ||
| 758 | if (err) | ||
| 759 | goto out_drop_write; | ||
| 760 | if (!overwrite) { | ||
| 761 | err = ovl_copy_up(new); | ||
| 762 | if (err) | ||
| 763 | goto out_drop_write; | ||
| 764 | } | ||
| 765 | |||
| 766 | old_opaque = old_type != OVL_PATH_PURE_UPPER; | ||
| 767 | new_opaque = new_type != OVL_PATH_PURE_UPPER; | ||
| 768 | |||
| 769 | if (old_opaque || new_opaque) { | ||
| 770 | err = -ENOMEM; | ||
| 771 | override_cred = prepare_creds(); | ||
| 772 | if (!override_cred) | ||
| 773 | goto out_drop_write; | ||
| 774 | |||
| 775 | /* | ||
| 776 | * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir | ||
| 777 | * CAP_DAC_OVERRIDE for create in workdir | ||
| 778 | * CAP_FOWNER for removing whiteout from sticky dir | ||
| 779 | * CAP_FSETID for chmod of opaque dir | ||
| 780 | * CAP_CHOWN for chown of opaque dir | ||
| 781 | */ | ||
| 782 | cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); | ||
| 783 | cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); | ||
| 784 | cap_raise(override_cred->cap_effective, CAP_FOWNER); | ||
| 785 | cap_raise(override_cred->cap_effective, CAP_FSETID); | ||
| 786 | cap_raise(override_cred->cap_effective, CAP_CHOWN); | ||
| 787 | old_cred = override_creds(override_cred); | ||
| 788 | } | ||
| 789 | |||
| 790 | if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { | ||
| 791 | opaquedir = ovl_check_empty_and_clear(new); | ||
| 792 | err = PTR_ERR(opaquedir); | ||
| 793 | if (IS_ERR(opaquedir)) { | ||
| 794 | opaquedir = NULL; | ||
| 795 | goto out_revert_creds; | ||
| 796 | } | ||
| 797 | } | ||
| 798 | |||
| 799 | if (overwrite) { | ||
| 800 | if (old_opaque) { | ||
| 801 | if (new->d_inode || !new_opaque) { | ||
| 802 | /* Whiteout source */ | ||
| 803 | flags |= RENAME_WHITEOUT; | ||
| 804 | } else { | ||
| 805 | /* Switch whiteouts */ | ||
| 806 | flags |= RENAME_EXCHANGE; | ||
| 807 | } | ||
| 808 | } else if (is_dir && !new->d_inode && new_opaque) { | ||
| 809 | flags |= RENAME_EXCHANGE; | ||
| 810 | cleanup_whiteout = true; | ||
| 811 | } | ||
| 812 | } | ||
| 813 | |||
| 814 | old_upperdir = ovl_dentry_upper(old->d_parent); | ||
| 815 | new_upperdir = ovl_dentry_upper(new->d_parent); | ||
| 816 | |||
| 817 | trap = lock_rename(new_upperdir, old_upperdir); | ||
| 818 | |||
| 819 | olddentry = ovl_dentry_upper(old); | ||
| 820 | newdentry = ovl_dentry_upper(new); | ||
| 821 | if (newdentry) { | ||
| 822 | if (opaquedir) { | ||
| 823 | newdentry = opaquedir; | ||
| 824 | opaquedir = NULL; | ||
| 825 | } else { | ||
| 826 | dget(newdentry); | ||
| 827 | } | ||
| 828 | } else { | ||
| 829 | new_create = true; | ||
| 830 | newdentry = lookup_one_len(new->d_name.name, new_upperdir, | ||
| 831 | new->d_name.len); | ||
| 832 | err = PTR_ERR(newdentry); | ||
| 833 | if (IS_ERR(newdentry)) | ||
| 834 | goto out_unlock; | ||
| 835 | } | ||
| 836 | |||
| 837 | err = -ESTALE; | ||
| 838 | if (olddentry->d_parent != old_upperdir) | ||
| 839 | goto out_dput; | ||
| 840 | if (newdentry->d_parent != new_upperdir) | ||
| 841 | goto out_dput; | ||
| 842 | if (olddentry == trap) | ||
| 843 | goto out_dput; | ||
| 844 | if (newdentry == trap) | ||
| 845 | goto out_dput; | ||
| 846 | |||
| 847 | if (is_dir && !old_opaque && new_opaque) { | ||
| 848 | err = ovl_set_opaque(olddentry); | ||
| 849 | if (err) | ||
| 850 | goto out_dput; | ||
| 851 | } | ||
| 852 | if (!overwrite && new_is_dir && old_opaque && !new_opaque) { | ||
| 853 | err = ovl_set_opaque(newdentry); | ||
| 854 | if (err) | ||
| 855 | goto out_dput; | ||
| 856 | } | ||
| 857 | |||
| 858 | if (old_opaque || new_opaque) { | ||
| 859 | err = ovl_do_rename(old_upperdir->d_inode, olddentry, | ||
| 860 | new_upperdir->d_inode, newdentry, | ||
| 861 | flags); | ||
| 862 | } else { | ||
| 863 | /* No debug for the plain case */ | ||
| 864 | BUG_ON(flags & ~RENAME_EXCHANGE); | ||
| 865 | err = vfs_rename(old_upperdir->d_inode, olddentry, | ||
| 866 | new_upperdir->d_inode, newdentry, | ||
| 867 | NULL, flags); | ||
| 868 | } | ||
| 869 | |||
| 870 | if (err) { | ||
| 871 | if (is_dir && !old_opaque && new_opaque) | ||
| 872 | ovl_remove_opaque(olddentry); | ||
| 873 | if (!overwrite && new_is_dir && old_opaque && !new_opaque) | ||
| 874 | ovl_remove_opaque(newdentry); | ||
| 875 | goto out_dput; | ||
| 876 | } | ||
| 877 | |||
| 878 | if (is_dir && old_opaque && !new_opaque) | ||
| 879 | ovl_remove_opaque(olddentry); | ||
| 880 | if (!overwrite && new_is_dir && !old_opaque && new_opaque) | ||
| 881 | ovl_remove_opaque(newdentry); | ||
| 882 | |||
| 883 | if (old_opaque != new_opaque) { | ||
| 884 | ovl_dentry_set_opaque(old, new_opaque); | ||
| 885 | if (!overwrite) | ||
| 886 | ovl_dentry_set_opaque(new, old_opaque); | ||
| 887 | } | ||
| 888 | |||
| 889 | if (cleanup_whiteout) | ||
| 890 | ovl_cleanup(old_upperdir->d_inode, newdentry); | ||
| 891 | |||
| 892 | ovl_dentry_version_inc(old->d_parent); | ||
| 893 | ovl_dentry_version_inc(new->d_parent); | ||
| 894 | |||
| 895 | out_dput: | ||
| 896 | dput(newdentry); | ||
| 897 | out_unlock: | ||
| 898 | unlock_rename(new_upperdir, old_upperdir); | ||
| 899 | out_revert_creds: | ||
| 900 | if (old_opaque || new_opaque) { | ||
| 901 | revert_creds(old_cred); | ||
| 902 | put_cred(override_cred); | ||
| 903 | } | ||
| 904 | out_drop_write: | ||
| 905 | ovl_drop_write(old); | ||
| 906 | out: | ||
| 907 | dput(opaquedir); | ||
| 908 | return err; | ||
| 909 | } | ||
| 910 | |||
/* Inode operations for overlay directories (merged and pure upper). */
const struct inode_operations ovl_dir_inode_operations = {
	.lookup		= ovl_lookup,
	.mkdir		= ovl_mkdir,
	.symlink	= ovl_symlink,
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,
	.link		= ovl_link,
	.setattr	= ovl_setattr,
	.create		= ovl_create,
	.mknod		= ovl_mknod,
	.permission	= ovl_permission,
	.getattr	= ovl_dir_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000000..07d74b24913b --- /dev/null +++ b/fs/overlayfs/inode.c | |||
| @@ -0,0 +1,434 @@ | |||
| 1 | /* | ||
| 2 | * | ||
| 3 | * Copyright (C) 2011 Novell Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms of the GNU General Public License version 2 as published by | ||
| 7 | * the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/fs.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/xattr.h> | ||
| 13 | #include "overlayfs.h" | ||
| 14 | |||
| 15 | static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, | ||
| 16 | bool no_data) | ||
| 17 | { | ||
| 18 | int err; | ||
| 19 | struct dentry *parent; | ||
| 20 | struct kstat stat; | ||
| 21 | struct path lowerpath; | ||
| 22 | |||
| 23 | parent = dget_parent(dentry); | ||
| 24 | err = ovl_copy_up(parent); | ||
| 25 | if (err) | ||
| 26 | goto out_dput_parent; | ||
| 27 | |||
| 28 | ovl_path_lower(dentry, &lowerpath); | ||
| 29 | err = vfs_getattr(&lowerpath, &stat); | ||
| 30 | if (err) | ||
| 31 | goto out_dput_parent; | ||
| 32 | |||
| 33 | if (no_data) | ||
| 34 | stat.size = 0; | ||
| 35 | |||
| 36 | err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); | ||
| 37 | |||
| 38 | out_dput_parent: | ||
| 39 | dput(parent); | ||
| 40 | return err; | ||
| 41 | } | ||
| 42 | |||
| 43 | int ovl_setattr(struct dentry *dentry, struct iattr *attr) | ||
| 44 | { | ||
| 45 | int err; | ||
| 46 | struct dentry *upperdentry; | ||
| 47 | |||
| 48 | err = ovl_want_write(dentry); | ||
| 49 | if (err) | ||
| 50 | goto out; | ||
| 51 | |||
| 52 | upperdentry = ovl_dentry_upper(dentry); | ||
| 53 | if (upperdentry) { | ||
| 54 | mutex_lock(&upperdentry->d_inode->i_mutex); | ||
| 55 | err = notify_change(upperdentry, attr, NULL); | ||
| 56 | mutex_unlock(&upperdentry->d_inode->i_mutex); | ||
| 57 | } else { | ||
| 58 | err = ovl_copy_up_last(dentry, attr, false); | ||
| 59 | } | ||
| 60 | ovl_drop_write(dentry); | ||
| 61 | out: | ||
| 62 | return err; | ||
| 63 | } | ||
| 64 | |||
| 65 | static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 66 | struct kstat *stat) | ||
| 67 | { | ||
| 68 | struct path realpath; | ||
| 69 | |||
| 70 | ovl_path_real(dentry, &realpath); | ||
| 71 | return vfs_getattr(&realpath, stat); | ||
| 72 | } | ||
| 73 | |||
/*
 * ->permission: check access against the backing (real) inode.
 *
 * Must be RCU-walk safe for directories (their ovl_entry hangs off
 * i_private); non-directories need a dentry alias, which cannot be
 * looked up in RCU mode, so MAY_NOT_BLOCK returns -ECHILD there.
 */
int ovl_permission(struct inode *inode, int mask)
{
	struct ovl_entry *oe;
	struct dentry *alias = NULL;
	struct inode *realinode;
	struct dentry *realdentry;
	bool is_upper;
	int err;

	if (S_ISDIR(inode->i_mode)) {
		oe = inode->i_private;
	} else if (mask & MAY_NOT_BLOCK) {
		return -ECHILD;
	} else {
		/*
		 * For non-directories find an alias and get the info
		 * from there.
		 */
		alias = d_find_any_alias(inode);
		if (WARN_ON(!alias))
			return -ENOENT;

		oe = alias->d_fsdata;
	}

	realdentry = ovl_entry_real(oe, &is_upper);

	/* Careful in RCU walk mode */
	realinode = ACCESS_ONCE(realdentry->d_inode);
	if (!realinode) {
		/* only expected while racing in RCU mode */
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		err = -ENOENT;
		goto out_dput;
	}

	if (mask & MAY_WRITE) {
		umode_t mode = realinode->i_mode;

		/*
		 * Writes will always be redirected to upper layer, so
		 * ignore lower layer being read-only.
		 *
		 * If the overlay itself is read-only then proceed
		 * with the permission check, don't return EROFS.
		 * This will only happen if this is the lower layer of
		 * another overlayfs.
		 *
		 * If upper fs becomes read-only after the overlay was
		 * constructed return EROFS to prevent modification of
		 * upper layer.
		 */
		err = -EROFS;
		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			goto out_dput;
	}

	err = __inode_permission(realinode, mask);
out_dput:
	dput(alias);	/* dput(NULL) is a no-op for the directory case */
	return err;
}
| 136 | |||
| 137 | |||
/* State carried from ovl_follow_link() to ovl_put_link(). */
struct ovl_link_data {
	struct dentry *realdentry;	/* real symlink dentry followed */
	void *cookie;			/* real fs's follow_link() cookie */
};
| 142 | |||
/*
 * ->follow_link: delegate to the real symlink's inode operation.
 * If the real fs has a put_link method, wrap its cookie so that
 * ovl_put_link() can forward to the right dentry later; on allocation
 * failure the real put_link is called immediately to avoid a leak.
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	void *ret;
	struct dentry *realdentry;
	struct inode *realinode;

	realdentry = ovl_dentry_real(dentry);
	realinode = realdentry->d_inode;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	ret = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(ret))
		return ret;

	if (realinode->i_op->put_link) {
		struct ovl_link_data *data;

		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
		if (!data) {
			realinode->i_op->put_link(realdentry, nd, ret);
			return ERR_PTR(-ENOMEM);
		}
		data->realdentry = realdentry;
		data->cookie = ret;

		return data;
	} else {
		/* NULL cookie tells ovl_put_link() there is nothing to undo */
		return NULL;
	}
}
| 175 | |||
| 176 | static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) | ||
| 177 | { | ||
| 178 | struct inode *realinode; | ||
| 179 | struct ovl_link_data *data = c; | ||
| 180 | |||
| 181 | if (!data) | ||
| 182 | return; | ||
| 183 | |||
| 184 | realinode = data->realdentry->d_inode; | ||
| 185 | realinode->i_op->put_link(data->realdentry, nd, data->cookie); | ||
| 186 | kfree(data); | ||
| 187 | } | ||
| 188 | |||
| 189 | static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) | ||
| 190 | { | ||
| 191 | struct path realpath; | ||
| 192 | struct inode *realinode; | ||
| 193 | |||
| 194 | ovl_path_real(dentry, &realpath); | ||
| 195 | realinode = realpath.dentry->d_inode; | ||
| 196 | |||
| 197 | if (!realinode->i_op->readlink) | ||
| 198 | return -EINVAL; | ||
| 199 | |||
| 200 | touch_atime(&realpath); | ||
| 201 | |||
| 202 | return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); | ||
| 203 | } | ||
| 204 | |||
| 205 | |||
/*
 * Is @name in overlayfs's private xattr namespace ("trusted.overlay.*")?
 * These are implementation details (whiteout/opaque markers) and must be
 * hidden from and protected against userspace.
 *
 * Fix: the prefix "trusted.overlay." is 16 characters long, but the old
 * code compared only 14 ("trusted.overla"), so unrelated names such as
 * "trusted.overlaid" were wrongly classified as private.  Derive the
 * length from the literal itself so they can never drift apart again.
 */
static bool ovl_is_private_xattr(const char *name)
{
	return strncmp(name, "trusted.overlay.",
		       sizeof("trusted.overlay.") - 1) == 0;
}
| 210 | |||
| 211 | int ovl_setxattr(struct dentry *dentry, const char *name, | ||
| 212 | const void *value, size_t size, int flags) | ||
| 213 | { | ||
| 214 | int err; | ||
| 215 | struct dentry *upperdentry; | ||
| 216 | |||
| 217 | err = ovl_want_write(dentry); | ||
| 218 | if (err) | ||
| 219 | goto out; | ||
| 220 | |||
| 221 | err = -EPERM; | ||
| 222 | if (ovl_is_private_xattr(name)) | ||
| 223 | goto out_drop_write; | ||
| 224 | |||
| 225 | err = ovl_copy_up(dentry); | ||
| 226 | if (err) | ||
| 227 | goto out_drop_write; | ||
| 228 | |||
| 229 | upperdentry = ovl_dentry_upper(dentry); | ||
| 230 | err = vfs_setxattr(upperdentry, name, value, size, flags); | ||
| 231 | |||
| 232 | out_drop_write: | ||
| 233 | ovl_drop_write(dentry); | ||
| 234 | out: | ||
| 235 | return err; | ||
| 236 | } | ||
| 237 | |||
| 238 | static bool ovl_need_xattr_filter(struct dentry *dentry, | ||
| 239 | enum ovl_path_type type) | ||
| 240 | { | ||
| 241 | return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode); | ||
| 242 | } | ||
| 243 | |||
| 244 | ssize_t ovl_getxattr(struct dentry *dentry, const char *name, | ||
| 245 | void *value, size_t size) | ||
| 246 | { | ||
| 247 | struct path realpath; | ||
| 248 | enum ovl_path_type type = ovl_path_real(dentry, &realpath); | ||
| 249 | |||
| 250 | if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) | ||
| 251 | return -ENODATA; | ||
| 252 | |||
| 253 | return vfs_getxattr(realpath.dentry, name, value, size); | ||
| 254 | } | ||
| 255 | |||
/*
 * ->listxattr: list the real dentry's xattrs, then compact the returned
 * NUL-separated name list in place to drop private overlay entries.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	/* error, empty list, or size-probe call (size == 0): nothing to filter */
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;	/* include the terminating NUL */

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			/* shrink first so (res - off) is the remaining byte count */
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}
| 287 | |||
/*
 * ->removexattr: remove an xattr from the upper copy.  A lower-only file
 * is first probed (so a missing attribute fails without a needless
 * copy-up) and then copied up before the removal.
 */
int ovl_removexattr(struct dentry *dentry, const char *name)
{
	int err;
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	/* private overlay xattrs are hidden, so report them as absent */
	err = -ENODATA;
	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		goto out_drop_write;

	if (type == OVL_PATH_LOWER) {
		/* probe existence on the lower copy before copying up */
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		/* now operate on the freshly created upper copy */
		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
| 320 | |||
| 321 | static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, | ||
| 322 | struct dentry *realdentry) | ||
| 323 | { | ||
| 324 | if (type != OVL_PATH_LOWER) | ||
| 325 | return false; | ||
| 326 | |||
| 327 | if (special_file(realdentry->d_inode->i_mode)) | ||
| 328 | return false; | ||
| 329 | |||
| 330 | if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) | ||
| 331 | return false; | ||
| 332 | |||
| 333 | return true; | ||
| 334 | } | ||
| 335 | |||
/*
 * ->dentry_open: open the real file, copying a lower file up first when
 * the open is for writing.  O_TRUNC opens skip copying the data (size is
 * forced to zero) since it would be discarded anyway.
 */
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
			   const struct cred *cred)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;
	bool want_write = false;

	type = ovl_path_real(dentry, &realpath);
	if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
		want_write = true;
		err = ovl_want_write(dentry);
		if (err)
			goto out;

		if (file->f_flags & O_TRUNC)
			err = ovl_copy_up_last(dentry, NULL, true);
		else
			err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		/* open the new upper copy, not the stale lower path */
		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_open(&realpath, file, cred);
out_drop_write:
	if (want_write)
		ovl_drop_write(dentry);
out:
	return err;
}
| 368 | |||
/* Inode operations for regular and special overlay files. */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
	.dentry_open	= ovl_dentry_open,
};
| 379 | |||
/* Inode operations for overlay symlinks (link following is delegated). */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.follow_link	= ovl_follow_link,
	.put_link	= ovl_put_link,
	.readlink	= ovl_readlink,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
| 391 | |||
| 392 | struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, | ||
| 393 | struct ovl_entry *oe) | ||
| 394 | { | ||
| 395 | struct inode *inode; | ||
| 396 | |||
| 397 | inode = new_inode(sb); | ||
| 398 | if (!inode) | ||
| 399 | return NULL; | ||
| 400 | |||
| 401 | mode &= S_IFMT; | ||
| 402 | |||
| 403 | inode->i_ino = get_next_ino(); | ||
| 404 | inode->i_mode = mode; | ||
| 405 | inode->i_flags |= S_NOATIME | S_NOCMTIME; | ||
| 406 | |||
| 407 | switch (mode) { | ||
| 408 | case S_IFDIR: | ||
| 409 | inode->i_private = oe; | ||
| 410 | inode->i_op = &ovl_dir_inode_operations; | ||
| 411 | inode->i_fop = &ovl_dir_operations; | ||
| 412 | break; | ||
| 413 | |||
| 414 | case S_IFLNK: | ||
| 415 | inode->i_op = &ovl_symlink_inode_operations; | ||
| 416 | break; | ||
| 417 | |||
| 418 | case S_IFREG: | ||
| 419 | case S_IFSOCK: | ||
| 420 | case S_IFBLK: | ||
| 421 | case S_IFCHR: | ||
| 422 | case S_IFIFO: | ||
| 423 | inode->i_op = &ovl_file_inode_operations; | ||
| 424 | break; | ||
| 425 | |||
| 426 | default: | ||
| 427 | WARN(1, "illegal file type: %i\n", mode); | ||
| 428 | iput(inode); | ||
| 429 | inode = NULL; | ||
| 430 | } | ||
| 431 | |||
| 432 | return inode; | ||
| 433 | |||
| 434 | } | ||
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000000..814bed33dd07 --- /dev/null +++ b/fs/overlayfs/overlayfs.h | |||
| @@ -0,0 +1,191 @@ | |||
| 1 | /* | ||
| 2 | * | ||
| 3 | * Copyright (C) 2011 Novell Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms of the GNU General Public License version 2 as published by | ||
| 7 | * the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/kernel.h> | ||
| 11 | |||
| 12 | struct ovl_entry; | ||
| 13 | |||
/*
 * How an overlay dentry is backed by the layers.
 * NOTE(review): meanings inferred from usage in dir.c/inode.c —
 * confirm against ovl_path_type()'s implementation.
 */
enum ovl_path_type {
	OVL_PATH_PURE_UPPER,	/* upper only; presumably nothing hidden below */
	OVL_PATH_UPPER,		/* upper backed, but not "pure" (e.g. copied up) */
	OVL_PATH_MERGE,		/* merged directory: upper + lower contents */
	OVL_PATH_LOWER,		/* lower layer only */
};
| 20 | |||
| 21 | extern const char *ovl_opaque_xattr; | ||
| 22 | |||
/* vfs_rmdir() wrapper that traces the call and its result. */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err;

	err = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}
| 29 | |||
| 30 | static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) | ||
| 31 | { | ||
| 32 | int err = vfs_unlink(dir, dentry, NULL); | ||
| 33 | pr_debug("unlink(%pd2) = %i\n", dentry, err); | ||
| 34 | return err; | ||
| 35 | } | ||
| 36 | |||
| 37 | static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, | ||
| 38 | struct dentry *new_dentry, bool debug) | ||
| 39 | { | ||
| 40 | int err = vfs_link(old_dentry, dir, new_dentry, NULL); | ||
| 41 | if (debug) { | ||
| 42 | pr_debug("link(%pd2, %pd2) = %i\n", | ||
| 43 | old_dentry, new_dentry, err); | ||
| 44 | } | ||
| 45 | return err; | ||
| 46 | } | ||
| 47 | |||
| 48 | static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, | ||
| 49 | umode_t mode, bool debug) | ||
| 50 | { | ||
| 51 | int err = vfs_create(dir, dentry, mode, true); | ||
| 52 | if (debug) | ||
| 53 | pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); | ||
| 54 | return err; | ||
| 55 | } | ||
| 56 | |||
| 57 | static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, | ||
| 58 | umode_t mode, bool debug) | ||
| 59 | { | ||
| 60 | int err = vfs_mkdir(dir, dentry, mode); | ||
| 61 | if (debug) | ||
| 62 | pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); | ||
| 63 | return err; | ||
| 64 | } | ||
| 65 | |||
| 66 | static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, | ||
| 67 | umode_t mode, dev_t dev, bool debug) | ||
| 68 | { | ||
| 69 | int err = vfs_mknod(dir, dentry, mode, dev); | ||
| 70 | if (debug) { | ||
| 71 | pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", | ||
| 72 | dentry, mode, dev, err); | ||
| 73 | } | ||
| 74 | return err; | ||
| 75 | } | ||
| 76 | |||
| 77 | static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, | ||
| 78 | const char *oldname, bool debug) | ||
| 79 | { | ||
| 80 | int err = vfs_symlink(dir, dentry, oldname); | ||
| 81 | if (debug) | ||
| 82 | pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); | ||
| 83 | return err; | ||
| 84 | } | ||
| 85 | |||
/*
 * vfs_setxattr() wrapper that always traces the call.
 * NOTE(review): the trace prints @value with "%*s" — only safe/meaningful
 * for text values; binary xattr values will print garbage up to @size.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);
	pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
| 94 | |||
/* vfs_removexattr() wrapper that traces the call and its result. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err;

	err = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}
| 101 | |||
| 102 | static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, | ||
| 103 | struct inode *newdir, struct dentry *newdentry, | ||
| 104 | unsigned int flags) | ||
| 105 | { | ||
| 106 | int err; | ||
| 107 | |||
| 108 | pr_debug("rename2(%pd2, %pd2, 0x%x)\n", | ||
| 109 | olddentry, newdentry, flags); | ||
| 110 | |||
| 111 | err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); | ||
| 112 | |||
| 113 | if (err) { | ||
| 114 | pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", | ||
| 115 | olddentry, newdentry, err); | ||
| 116 | } | ||
| 117 | return err; | ||
| 118 | } | ||
| 119 | |||
/* vfs_whiteout() wrapper that traces the call and its result. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err;

	err = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
	return err;
}
| 126 | |||
| 127 | enum ovl_path_type ovl_path_type(struct dentry *dentry); | ||
| 128 | u64 ovl_dentry_version_get(struct dentry *dentry); | ||
| 129 | void ovl_dentry_version_inc(struct dentry *dentry); | ||
| 130 | void ovl_path_upper(struct dentry *dentry, struct path *path); | ||
| 131 | void ovl_path_lower(struct dentry *dentry, struct path *path); | ||
| 132 | enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); | ||
| 133 | struct dentry *ovl_dentry_upper(struct dentry *dentry); | ||
| 134 | struct dentry *ovl_dentry_lower(struct dentry *dentry); | ||
| 135 | struct dentry *ovl_dentry_real(struct dentry *dentry); | ||
| 136 | struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); | ||
| 137 | struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); | ||
| 138 | void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); | ||
| 139 | struct dentry *ovl_workdir(struct dentry *dentry); | ||
| 140 | int ovl_want_write(struct dentry *dentry); | ||
| 141 | void ovl_drop_write(struct dentry *dentry); | ||
| 142 | bool ovl_dentry_is_opaque(struct dentry *dentry); | ||
| 143 | void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); | ||
| 144 | bool ovl_is_whiteout(struct dentry *dentry); | ||
| 145 | void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); | ||
| 146 | struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, | ||
| 147 | unsigned int flags); | ||
| 148 | struct file *ovl_path_open(struct path *path, int flags); | ||
| 149 | |||
| 150 | struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, | ||
| 151 | struct kstat *stat, const char *link); | ||
| 152 | |||
| 153 | /* readdir.c */ | ||
| 154 | extern const struct file_operations ovl_dir_operations; | ||
| 155 | int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); | ||
| 156 | void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); | ||
| 157 | void ovl_cache_free(struct list_head *list); | ||
| 158 | |||
| 159 | /* inode.c */ | ||
| 160 | int ovl_setattr(struct dentry *dentry, struct iattr *attr); | ||
| 161 | int ovl_permission(struct inode *inode, int mask); | ||
| 162 | int ovl_setxattr(struct dentry *dentry, const char *name, | ||
| 163 | const void *value, size_t size, int flags); | ||
| 164 | ssize_t ovl_getxattr(struct dentry *dentry, const char *name, | ||
| 165 | void *value, size_t size); | ||
| 166 | ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); | ||
| 167 | int ovl_removexattr(struct dentry *dentry, const char *name); | ||
| 168 | |||
| 169 | struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, | ||
| 170 | struct ovl_entry *oe); | ||
/* Copy ownership (uid/gid only) from the real inode to the overlay inode. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}
| 176 | |||
| 177 | /* dir.c */ | ||
| 178 | extern const struct inode_operations ovl_dir_inode_operations; | ||
| 179 | struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); | ||
| 180 | int ovl_create_real(struct inode *dir, struct dentry *newdentry, | ||
| 181 | struct kstat *stat, const char *link, | ||
| 182 | struct dentry *hardlink, bool debug); | ||
| 183 | void ovl_cleanup(struct inode *dir, struct dentry *dentry); | ||
| 184 | |||
| 185 | /* copy_up.c */ | ||
| 186 | int ovl_copy_up(struct dentry *dentry); | ||
| 187 | int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, | ||
| 188 | struct path *lowerpath, struct kstat *stat, | ||
| 189 | struct iattr *attr); | ||
| 190 | int ovl_copy_xattr(struct dentry *old, struct dentry *new); | ||
| 191 | int ovl_set_attr(struct dentry *upper, struct kstat *stat); | ||
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000000..ab1e3dcbed95 --- /dev/null +++ b/fs/overlayfs/readdir.c | |||
| @@ -0,0 +1,586 @@ | |||
| 1 | /* | ||
| 2 | * | ||
| 3 | * Copyright (C) 2011 Novell Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms of the GNU General Public License version 2 as published by | ||
| 7 | * the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/fs.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/namei.h> | ||
| 13 | #include <linux/file.h> | ||
| 14 | #include <linux/xattr.h> | ||
| 15 | #include <linux/rbtree.h> | ||
| 16 | #include <linux/security.h> | ||
| 17 | #include <linux/cred.h> | ||
| 18 | #include "overlayfs.h" | ||
| 19 | |||
/* One directory entry in the merged-readdir cache. */
struct ovl_cache_entry {
	unsigned int len;	/* name length, excluding the NUL */
	unsigned int type;	/* d_type as reported by the real fs */
	u64 ino;		/* inode number from the real fs */
	struct list_head l_node;	/* position in ovl_dir_cache.entries */
	struct rb_node node;	/* dedup lookup by name (rb keyed on name) */
	bool is_whiteout;	/* entry is a whiteout: hide it */
	bool is_cursor;		/* fake entry marking a reader's position */
	char name[];		/* NUL-terminated name (flexible array) */
};
| 30 | |||
/* Cached, merged contents of a directory, shared between open readers. */
struct ovl_dir_cache {
	long refcount;		/* number of ovl_dir_file users */
	u64 version;		/* dentry version at fill time; stale if changed */
	struct list_head entries;	/* list of ovl_cache_entry */
};
| 36 | |||
/* Per-iteration state while filling the cache via iterate_dir(). */
struct ovl_readdir_data {
	struct dir_context ctx;	/* must be first: cast back from ctx */
	bool is_merge;		/* reading the upper half of a merged dir */
	struct rb_root root;	/* entries seen so far, keyed by name */
	struct list_head *list;	/* output list being filled */
	struct list_head middle;	/* insertion point for merge entries */
	int count;		/* number of entries added */
	int err;		/* first error encountered, if any */
};
| 46 | |||
/* Per-open-file state for an overlay directory. */
struct ovl_dir_file {
	bool is_real;		/* not a merge dir: delegate to the real dir */
	bool is_upper;		/* opened on the upper layer */
	struct ovl_dir_cache *cache;
	struct ovl_cache_entry cursor;	/* is_cursor entry tracking f_pos */
	struct file *realfile;
	struct file *upperfile;	/* lazily opened upper dir, for fsync */
};
| 55 | |||
/* Convert an rb-tree node back to its containing cache entry. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return container_of(n, struct ovl_cache_entry, node);
}
| 60 | |||
/*
 * Find a cache entry by (name, len) in the rb-tree, or return NULL.
 *
 * Ordering note: strncmp() over @len bytes plus the "len < p->len" tie
 * break sorts a strict prefix before the longer name.  cmp == 0 implies
 * len <= p->len, because comparing a longer @name against the shorter,
 * NUL-terminated p->name yields a positive result at p->name's NUL.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		cmp = strncmp(name, p->name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}
| 81 | |||
| 82 | static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, | ||
| 83 | u64 ino, unsigned int d_type) | ||
| 84 | { | ||
| 85 | struct ovl_cache_entry *p; | ||
| 86 | size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); | ||
| 87 | |||
| 88 | p = kmalloc(size, GFP_KERNEL); | ||
| 89 | if (p) { | ||
| 90 | memcpy(p->name, name, len); | ||
| 91 | p->name[len] = '\0'; | ||
| 92 | p->len = len; | ||
| 93 | p->type = d_type; | ||
| 94 | p->ino = ino; | ||
| 95 | p->is_whiteout = false; | ||
| 96 | p->is_cursor = false; | ||
| 97 | } | ||
| 98 | |||
| 99 | return p; | ||
| 100 | } | ||
| 101 | |||
/*
 * Upper pass: insert a new entry into the rb-tree and append it to the
 * output list.  A duplicate name is silently ignored (returns 0).
 * Returns 0 on success or -ENOMEM.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		/* same ordering rule as ovl_cache_entry_find() */
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;
	}

	p = ovl_cache_entry_new(name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, &rdd->root);

	return 0;
}
| 135 | |||
/*
 * Lower (merge) pass: if the name already exists in the cache, move
 * that entry onto the temporary "middle" list; otherwise create a new
 * entry there.  This places lower-layer entries before upper-only ones
 * in the final listing.  Returns rdd->err (0 or -ENOMEM).
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (p) {
		list_move_tail(&p->l_node, &rdd->middle);
	} else {
		p = ovl_cache_entry_new(name, namelen, ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err;
}
| 155 | |||
| 156 | void ovl_cache_free(struct list_head *list) | ||
| 157 | { | ||
| 158 | struct ovl_cache_entry *p; | ||
| 159 | struct ovl_cache_entry *n; | ||
| 160 | |||
| 161 | list_for_each_entry_safe(p, n, list, l_node) | ||
| 162 | kfree(p); | ||
| 163 | |||
| 164 | INIT_LIST_HEAD(list); | ||
| 165 | } | ||
| 166 | |||
/*
 * Drop one file's reference on the directory cache.  The file's cursor
 * is unlinked from the entry list first; on the last reference the
 * cache is detached from the dentry (if still attached) and freed.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	list_del_init(&od->cursor.l_node);
	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		if (ovl_dir_cache(dentry) == cache)
			ovl_set_dir_cache(dentry, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}
| 182 | |||
| 183 | static int ovl_fill_merge(void *buf, const char *name, int namelen, | ||
| 184 | loff_t offset, u64 ino, unsigned int d_type) | ||
| 185 | { | ||
| 186 | struct ovl_readdir_data *rdd = buf; | ||
| 187 | |||
| 188 | rdd->count++; | ||
| 189 | if (!rdd->is_merge) | ||
| 190 | return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); | ||
| 191 | else | ||
| 192 | return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); | ||
| 193 | } | ||
| 194 | |||
/*
 * Open the real directory at @realpath and feed its entries to
 * rdd->ctx.actor.  iterate_dir() is called repeatedly until a pass
 * produces no entries (rdd->count == 0), so the whole directory is
 * consumed.  Returns 0 or a negative errno, either from iterate_dir()
 * itself or propagated from the actor via rdd->err.
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
		err = iterate_dir(realfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;
	} while (!err && rdd->count);
	fput(realfile);

	return err;
}
| 217 | |||
/*
 * Rewind handling (f_pos == 0): discard a stale cache if the directory
 * version changed, and notice when a formerly real dir has become a
 * merge dir; the reverse transition is unexpected, hence the WARN_ON.
 */
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct dentry *dentry = file->f_path.dentry;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
	}
	WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
	if (od->is_real && type == OVL_PATH_MERGE)
		od->is_real = false;
}
| 233 | |||
/*
 * Walk the just-read upper entries and set is_whiteout on those whose
 * upper object is a whiteout.  Only DT_CHR entries can be whiteouts,
 * so all others are skipped without a lookup.  The lookup is done with
 * CAP_DAC_OVERRIDE raised in a temporary credential.  Returns 0 or
 * -ENOMEM (in which case the list has been freed).
 */
static int ovl_dir_mark_whiteouts(struct dentry *dir,
				  struct ovl_readdir_data *rdd)
{
	struct ovl_cache_entry *p;
	struct dentry *dentry;
	const struct cred *old_cred;
	struct cred *override_cred;

	override_cred = prepare_creds();
	if (!override_cred) {
		ovl_cache_free(rdd->list);
		return -ENOMEM;
	}

	/*
	 * CAP_DAC_OVERRIDE for lookup
	 */
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	old_cred = override_creds(override_cred);

	mutex_lock(&dir->d_inode->i_mutex);
	list_for_each_entry(p, rdd->list, l_node) {
		if (p->is_cursor)
			continue;

		if (p->type != DT_CHR)
			continue;

		dentry = lookup_one_len(p->name, dir, p->len);
		if (IS_ERR(dentry))
			continue;	/* lookup failure: leave unmarked */

		p->is_whiteout = ovl_is_whiteout(dentry);
		dput(dentry);
	}
	mutex_unlock(&dir->d_inode->i_mutex);

	revert_creds(old_cred);
	put_cred(override_cred);

	return 0;
}
| 276 | |||
| 277 | static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) | ||
| 278 | { | ||
| 279 | int err; | ||
| 280 | struct path lowerpath; | ||
| 281 | struct path upperpath; | ||
| 282 | struct ovl_readdir_data rdd = { | ||
| 283 | .ctx.actor = ovl_fill_merge, | ||
| 284 | .list = list, | ||
| 285 | .root = RB_ROOT, | ||
| 286 | .is_merge = false, | ||
| 287 | }; | ||
| 288 | |||
| 289 | ovl_path_lower(dentry, &lowerpath); | ||
| 290 | ovl_path_upper(dentry, &upperpath); | ||
| 291 | |||
| 292 | if (upperpath.dentry) { | ||
| 293 | err = ovl_dir_read(&upperpath, &rdd); | ||
| 294 | if (err) | ||
| 295 | goto out; | ||
| 296 | |||
| 297 | if (lowerpath.dentry) { | ||
| 298 | err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd); | ||
| 299 | if (err) | ||
| 300 | goto out; | ||
| 301 | } | ||
| 302 | } | ||
| 303 | if (lowerpath.dentry) { | ||
| 304 | /* | ||
| 305 | * Insert lowerpath entries before upperpath ones, this allows | ||
| 306 | * offsets to be reasonably constant | ||
| 307 | */ | ||
| 308 | list_add(&rdd.middle, rdd.list); | ||
| 309 | rdd.is_merge = true; | ||
| 310 | err = ovl_dir_read(&lowerpath, &rdd); | ||
| 311 | list_del(&rdd.middle); | ||
| 312 | } | ||
| 313 | out: | ||
| 314 | return err; | ||
| 315 | } | ||
| 316 | |||
/*
 * Position the file's cursor so the next emitted entry is the pos'th
 * non-cursor entry of the cache.  Other files' cursors don't consume
 * an offset.  If pos is at or past the end, the loop terminates with
 * &p->l_node aliasing the list head, so list_move_tail() parks the
 * cursor at the very end of the list.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct ovl_cache_entry *p;
	loff_t off = 0;

	list_for_each_entry(p, &od->cache->entries, l_node) {
		if (p->is_cursor)
			continue;
		if (off >= pos)
			break;
		off++;
	}
	list_move_tail(&od->cursor.l_node, &p->l_node);
}
| 331 | |||
/*
 * Get a referenced directory cache for @dentry: reuse the attached one
 * if its version still matches, otherwise build a fresh merged listing
 * and attach it.  Returns the cache or an ERR_PTR.  Caller must hold
 * i_mutex (required by ovl_dentry_version_get()).
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(dentry);
	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	res = ovl_dir_read_merged(dentry, &cache->entries);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}
| 363 | |||
/*
 * ->iterate for overlay directories.
 *
 * Non-merge dirs delegate directly to the real underlying directory.
 * Merge dirs build (or reuse) the merged cache on first use, then emit
 * entries starting at this file's cursor: whiteouts are skipped without
 * emitting but still consume an offset, other files' cursors consume
 * nothing.  ctx->pos counts emitted/visible entries, not bytes.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor.l_node.next != &od->cache->entries) {
		struct ovl_cache_entry *p;

		p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
		/* Skip cursors */
		if (!p->is_cursor) {
			if (!p->is_whiteout) {
				if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
					break;
			}
			ctx->pos++;
		}
		list_move(&od->cursor.l_node, &p->l_node);
	}
	return 0;
}
| 402 | |||
/*
 * ->llseek for overlay directories.  Real dirs forward to vfs_llseek()
 * and mirror the resulting position.  Merge dirs treat f_pos as a
 * logical entry index: only SEEK_SET/SEEK_CUR are supported, and the
 * cursor is repositioned within the cache if one exists.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}
| 442 | |||
/*
 * ->fsync for overlay directories: sync the relevant real directory.
 *
 * A dir opened while on the lower layer may have been copied up since;
 * then the upper dir is opened lazily, exactly once, with a lockless
 * fast path (lockless_dereference of od->upperfile) and an i_mutex
 * protected slow path that resolves the open race.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) {
		struct inode *inode = file_inode(file);

		realfile = lockless_dereference(od->upperfile);
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_path_open(&upperpath, O_RDONLY);
			/*
			 * NOTE(review): barrier orders the open's stores
			 * before publishing od->upperfile below — presumably
			 * pairing with the lockless_dereference() above;
			 * confirm against memory-barriers.txt.
			 */
			smp_mb__before_spinlock();
			mutex_lock(&inode->i_mutex);
			if (!od->upperfile) {
				if (IS_ERR(realfile)) {
					mutex_unlock(&inode->i_mutex);
					return PTR_ERR(realfile);
				}
				od->upperfile = realfile;
			} else {
				/* somebody has beaten us to it */
				if (!IS_ERR(realfile))
					fput(realfile);
				realfile = od->upperfile;
			}
			mutex_unlock(&inode->i_mutex);
		}
	}

	return vfs_fsync_range(realfile, start, end, datasync);
}
| 482 | |||
/*
 * ->release: drop our cache reference under i_mutex, close the real
 * file(s) and free the per-open-file state.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;

	if (od->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(od, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}
	fput(od->realfile);
	if (od->upperfile)
		fput(od->upperfile);
	kfree(od);

	return 0;
}
| 499 | |||
/*
 * ->open: open the real (upper or lower) directory, record whether the
 * overlay dir is merged, and initialize the embedded cursor entry used
 * by ovl_iterate().
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	INIT_LIST_HEAD(&od->cursor.l_node);
	od->realfile = realfile;
	od->is_real = (type != OVL_PATH_MERGE);
	od->is_upper = (type != OVL_PATH_LOWER);
	od->cursor.is_cursor = true;
	file->private_data = od;

	return 0;
}
| 526 | |||
/* File operations for overlay directories. */
const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.iterate	= ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};
| 535 | |||
| 536 | int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) | ||
| 537 | { | ||
| 538 | int err; | ||
| 539 | struct ovl_cache_entry *p; | ||
| 540 | |||
| 541 | err = ovl_dir_read_merged(dentry, list); | ||
| 542 | if (err) | ||
| 543 | return err; | ||
| 544 | |||
| 545 | err = 0; | ||
| 546 | |||
| 547 | list_for_each_entry(p, list, l_node) { | ||
| 548 | if (p->is_whiteout) | ||
| 549 | continue; | ||
| 550 | |||
| 551 | if (p->name[0] == '.') { | ||
| 552 | if (p->len == 1) | ||
| 553 | continue; | ||
| 554 | if (p->len == 2 && p->name[1] == '.') | ||
| 555 | continue; | ||
| 556 | } | ||
| 557 | err = -ENOTEMPTY; | ||
| 558 | break; | ||
| 559 | } | ||
| 560 | |||
| 561 | return err; | ||
| 562 | } | ||
| 563 | |||
/*
 * Remove from the real upper directory @upper every entry in @list that
 * is marked is_whiteout.  Lookup failures are logged and skipped so the
 * remaining whiteouts are still cleaned up.
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *p;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (!p->is_whiteout)
			continue;

		dentry = lookup_one_len(p->name, upper, p->len);
		if (IS_ERR(dentry)) {
			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
			       upper->d_name.name, p->len, p->name,
			       (int) PTR_ERR(dentry));
			continue;
		}
		ovl_cleanup(upper->d_inode, dentry);
		dput(dentry);
	}
	mutex_unlock(&upper->d_inode->i_mutex);
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000000..f16d318b71f8 --- /dev/null +++ b/fs/overlayfs/super.c | |||
| @@ -0,0 +1,833 @@ | |||
| 1 | /* | ||
| 2 | * | ||
| 3 | * Copyright (C) 2011 Novell Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms of the GNU General Public License version 2 as published by | ||
| 7 | * the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/fs.h> | ||
| 11 | #include <linux/namei.h> | ||
| 12 | #include <linux/xattr.h> | ||
| 13 | #include <linux/security.h> | ||
| 14 | #include <linux/mount.h> | ||
| 15 | #include <linux/slab.h> | ||
| 16 | #include <linux/parser.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | #include <linux/sched.h> | ||
| 19 | #include <linux/statfs.h> | ||
| 20 | #include <linux/seq_file.h> | ||
| 21 | #include "overlayfs.h" | ||
| 22 | |||
| 23 | MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); | ||
| 24 | MODULE_DESCRIPTION("Overlay filesystem"); | ||
| 25 | MODULE_LICENSE("GPL"); | ||
| 26 | |||
| 27 | #define OVERLAYFS_SUPER_MAGIC 0x794c7630 | ||
| 28 | |||
/* Mount option strings, kept verbatim for ovl_show_options(). */
struct ovl_config {
	char *lowerdir;
	char *upperdir;
	char *workdir;
};
| 34 | |||
/* private information held for overlayfs's superblock */
struct ovl_fs {
	struct vfsmount *upper_mnt;
	struct vfsmount *lower_mnt;
	struct dentry *workdir;
	long lower_namelen;	/* lower fs f_namelen, merged into statfs */
	/* pathnames of lower and upper dirs, for show_options */
	struct ovl_config config;
};
| 44 | |||
| 45 | struct ovl_dir_cache; | ||
| 46 | |||
/* private information held for every overlayfs dentry */
struct ovl_entry {
	struct dentry *__upperdentry;	/* read via ovl_upperdentry_dereference() */
	struct dentry *lowerdentry;
	struct ovl_dir_cache *cache;	/* merged readdir cache, for dirs */
	union {
		struct {
			u64 version;	/* bumped on directory modification */
			bool opaque;	/* upper hides lower content */
		};
		struct rcu_head rcu;	/* for kfree_rcu() in d_release */
	};
};
| 60 | |||
/* xattr marking an upper directory as opaque (value 'y' hides lower content) */
const char *ovl_opaque_xattr = "trusted.overlay.opaque";
| 62 | |||
| 63 | |||
| 64 | enum ovl_path_type ovl_path_type(struct dentry *dentry) | ||
| 65 | { | ||
| 66 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 67 | |||
| 68 | if (oe->__upperdentry) { | ||
| 69 | if (oe->lowerdentry) { | ||
| 70 | if (S_ISDIR(dentry->d_inode->i_mode)) | ||
| 71 | return OVL_PATH_MERGE; | ||
| 72 | else | ||
| 73 | return OVL_PATH_UPPER; | ||
| 74 | } else { | ||
| 75 | if (oe->opaque) | ||
| 76 | return OVL_PATH_UPPER; | ||
| 77 | else | ||
| 78 | return OVL_PATH_PURE_UPPER; | ||
| 79 | } | ||
| 80 | } else { | ||
| 81 | return OVL_PATH_LOWER; | ||
| 82 | } | ||
| 83 | } | ||
| 84 | |||
/*
 * Lockless read of the upper dentry; pairs with the smp_wmb() in
 * ovl_dentry_update() so a non-NULL result is fully initialized.
 */
static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
{
	return lockless_dereference(oe->__upperdentry);
}
| 89 | |||
/* Fill @path with the upper layer mount/dentry; dentry may be NULL. */
void ovl_path_upper(struct dentry *dentry, struct path *path)
{
	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
	struct ovl_entry *oe = dentry->d_fsdata;

	path->mnt = ofs->upper_mnt;
	path->dentry = ovl_upperdentry_dereference(oe);
}
| 98 | |||
| 99 | enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) | ||
| 100 | { | ||
| 101 | |||
| 102 | enum ovl_path_type type = ovl_path_type(dentry); | ||
| 103 | |||
| 104 | if (type == OVL_PATH_LOWER) | ||
| 105 | ovl_path_lower(dentry, path); | ||
| 106 | else | ||
| 107 | ovl_path_upper(dentry, path); | ||
| 108 | |||
| 109 | return type; | ||
| 110 | } | ||
| 111 | |||
| 112 | struct dentry *ovl_dentry_upper(struct dentry *dentry) | ||
| 113 | { | ||
| 114 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 115 | |||
| 116 | return ovl_upperdentry_dereference(oe); | ||
| 117 | } | ||
| 118 | |||
| 119 | struct dentry *ovl_dentry_lower(struct dentry *dentry) | ||
| 120 | { | ||
| 121 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 122 | |||
| 123 | return oe->lowerdentry; | ||
| 124 | } | ||
| 125 | |||
| 126 | struct dentry *ovl_dentry_real(struct dentry *dentry) | ||
| 127 | { | ||
| 128 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 129 | struct dentry *realdentry; | ||
| 130 | |||
| 131 | realdentry = ovl_upperdentry_dereference(oe); | ||
| 132 | if (!realdentry) | ||
| 133 | realdentry = oe->lowerdentry; | ||
| 134 | |||
| 135 | return realdentry; | ||
| 136 | } | ||
| 137 | |||
| 138 | struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) | ||
| 139 | { | ||
| 140 | struct dentry *realdentry; | ||
| 141 | |||
| 142 | realdentry = ovl_upperdentry_dereference(oe); | ||
| 143 | if (realdentry) { | ||
| 144 | *is_upper = true; | ||
| 145 | } else { | ||
| 146 | realdentry = oe->lowerdentry; | ||
| 147 | *is_upper = false; | ||
| 148 | } | ||
| 149 | return realdentry; | ||
| 150 | } | ||
| 151 | |||
| 152 | struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) | ||
| 153 | { | ||
| 154 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 155 | |||
| 156 | return oe->cache; | ||
| 157 | } | ||
| 158 | |||
| 159 | void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) | ||
| 160 | { | ||
| 161 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 162 | |||
| 163 | oe->cache = cache; | ||
| 164 | } | ||
| 165 | |||
/* Fill @path with the lower layer mount/dentry; dentry may be NULL. */
void ovl_path_lower(struct dentry *dentry, struct path *path)
{
	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
	struct ovl_entry *oe = dentry->d_fsdata;

	path->mnt = ofs->lower_mnt;
	path->dentry = oe->lowerdentry;
}
| 174 | |||
| 175 | int ovl_want_write(struct dentry *dentry) | ||
| 176 | { | ||
| 177 | struct ovl_fs *ofs = dentry->d_sb->s_fs_info; | ||
| 178 | return mnt_want_write(ofs->upper_mnt); | ||
| 179 | } | ||
| 180 | |||
| 181 | void ovl_drop_write(struct dentry *dentry) | ||
| 182 | { | ||
| 183 | struct ovl_fs *ofs = dentry->d_sb->s_fs_info; | ||
| 184 | mnt_drop_write(ofs->upper_mnt); | ||
| 185 | } | ||
| 186 | |||
| 187 | struct dentry *ovl_workdir(struct dentry *dentry) | ||
| 188 | { | ||
| 189 | struct ovl_fs *ofs = dentry->d_sb->s_fs_info; | ||
| 190 | return ofs->workdir; | ||
| 191 | } | ||
| 192 | |||
| 193 | bool ovl_dentry_is_opaque(struct dentry *dentry) | ||
| 194 | { | ||
| 195 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 196 | return oe->opaque; | ||
| 197 | } | ||
| 198 | |||
| 199 | void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) | ||
| 200 | { | ||
| 201 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 202 | oe->opaque = opaque; | ||
| 203 | } | ||
| 204 | |||
/*
 * Publish @upperdentry as this dentry's upper layer (presumably after
 * copy-up — the callers are outside this file).  Caller must hold the
 * upper parent's i_mutex and the dentry must not already have an upper.
 * The smp_wmb() pairs with the lockless read side in
 * ovl_upperdentry_dereference().
 */
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
{
	struct ovl_entry *oe = dentry->d_fsdata;

	WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
	WARN_ON(oe->__upperdentry);
	BUG_ON(!upperdentry->d_inode);
	/*
	 * Make sure upperdentry is consistent before making it visible to
	 * ovl_upperdentry_dereference().
	 */
	smp_wmb();
	oe->__upperdentry = upperdentry;
}
| 219 | |||
| 220 | void ovl_dentry_version_inc(struct dentry *dentry) | ||
| 221 | { | ||
| 222 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 223 | |||
| 224 | WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | ||
| 225 | oe->version++; | ||
| 226 | } | ||
| 227 | |||
| 228 | u64 ovl_dentry_version_get(struct dentry *dentry) | ||
| 229 | { | ||
| 230 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 231 | |||
| 232 | WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | ||
| 233 | return oe->version; | ||
| 234 | } | ||
| 235 | |||
| 236 | bool ovl_is_whiteout(struct dentry *dentry) | ||
| 237 | { | ||
| 238 | struct inode *inode = dentry->d_inode; | ||
| 239 | |||
| 240 | return inode && IS_WHITEOUT(inode); | ||
| 241 | } | ||
| 242 | |||
| 243 | static bool ovl_is_opaquedir(struct dentry *dentry) | ||
| 244 | { | ||
| 245 | int res; | ||
| 246 | char val; | ||
| 247 | struct inode *inode = dentry->d_inode; | ||
| 248 | |||
| 249 | if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) | ||
| 250 | return false; | ||
| 251 | |||
| 252 | res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); | ||
| 253 | if (res == 1 && val == 'y') | ||
| 254 | return true; | ||
| 255 | |||
| 256 | return false; | ||
| 257 | } | ||
| 258 | |||
| 259 | static void ovl_dentry_release(struct dentry *dentry) | ||
| 260 | { | ||
| 261 | struct ovl_entry *oe = dentry->d_fsdata; | ||
| 262 | |||
| 263 | if (oe) { | ||
| 264 | dput(oe->__upperdentry); | ||
| 265 | dput(oe->lowerdentry); | ||
| 266 | kfree_rcu(oe, rcu); | ||
| 267 | } | ||
| 268 | } | ||
| 269 | |||
/* Dentry operations: free our per-dentry state when the dentry dies. */
static const struct dentry_operations ovl_dentry_operations = {
	.d_release = ovl_dentry_release,
};
| 273 | |||
| 274 | static struct ovl_entry *ovl_alloc_entry(void) | ||
| 275 | { | ||
| 276 | return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); | ||
| 277 | } | ||
| 278 | |||
/*
 * Look up @name in the real directory @dir (taking its i_mutex).
 * Returns a positive dentry, NULL if the name does not exist (both
 * -ENOENT and a negative dentry map to NULL), or an ERR_PTR for other
 * errors.
 */
static inline struct dentry *ovl_lookup_real(struct dentry *dir,
					     struct qstr *name)
{
	struct dentry *dentry;

	mutex_lock(&dir->d_inode->i_mutex);
	dentry = lookup_one_len(name->name, dir, name->len);
	mutex_unlock(&dir->d_inode->i_mutex);

	if (IS_ERR(dentry)) {
		if (PTR_ERR(dentry) == -ENOENT)
			dentry = NULL;
	} else if (!dentry->d_inode) {
		dput(dentry);
		dentry = NULL;
	}
	return dentry;
}
| 297 | |||
/*
 * ->lookup: find @dentry in the parent's upper and lower layers and
 * build its ovl_entry.
 *
 * The upper layer is consulted first: a whiteout there hides the lower
 * name, and an opaque upper directory hides lower content (recorded in
 * oe->opaque).  The lower layer is searched only when not hidden.  A
 * non-directory on either layer prevents merging — the upper object
 * wins.  Returns NULL on success (dentry instantiated, possibly
 * negative) or an ERR_PTR.
 */
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags)
{
	struct ovl_entry *oe;
	struct dentry *upperdir;
	struct dentry *lowerdir;
	struct dentry *upperdentry = NULL;
	struct dentry *lowerdentry = NULL;
	struct inode *inode = NULL;
	int err;

	err = -ENOMEM;
	oe = ovl_alloc_entry();
	if (!oe)
		goto out;

	upperdir = ovl_dentry_upper(dentry->d_parent);
	lowerdir = ovl_dentry_lower(dentry->d_parent);

	if (upperdir) {
		upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
		err = PTR_ERR(upperdentry);
		if (IS_ERR(upperdentry))
			goto out_put_dir;

		if (lowerdir && upperdentry) {
			if (ovl_is_whiteout(upperdentry)) {
				/* whiteout: name doesn't exist, hide lower */
				dput(upperdentry);
				upperdentry = NULL;
				oe->opaque = true;
			} else if (ovl_is_opaquedir(upperdentry)) {
				oe->opaque = true;
			}
		}
	}
	if (lowerdir && !oe->opaque) {
		lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
		err = PTR_ERR(lowerdentry);
		if (IS_ERR(lowerdentry))
			goto out_dput_upper;
	}

	/* only two directories can be merged; otherwise the upper wins */
	if (lowerdentry && upperdentry &&
	    (!S_ISDIR(upperdentry->d_inode->i_mode) ||
	     !S_ISDIR(lowerdentry->d_inode->i_mode))) {
		dput(lowerdentry);
		lowerdentry = NULL;
		oe->opaque = true;
	}

	if (lowerdentry || upperdentry) {
		struct dentry *realdentry;

		realdentry = upperdentry ? upperdentry : lowerdentry;
		err = -ENOMEM;
		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
				      oe);
		if (!inode)
			goto out_dput;
		ovl_copyattr(realdentry->d_inode, inode);
	}

	oe->__upperdentry = upperdentry;
	oe->lowerdentry = lowerdentry;

	dentry->d_fsdata = oe;
	d_add(dentry, inode);

	return NULL;

out_dput:
	dput(lowerdentry);
out_dput_upper:
	dput(upperdentry);
out_put_dir:
	kfree(oe);
out:
	return ERR_PTR(err);
}
| 377 | |||
/* Open the real object at @path with the current credentials. */
struct file *ovl_path_open(struct path *path, int flags)
{
	return dentry_open(path, flags, current_cred());
}
| 382 | |||
/*
 * Tear down the overlay superblock: drop the workdir reference, the
 * layer mounts, and the option strings kept for show_options.
 */
static void ovl_put_super(struct super_block *sb)
{
	struct ovl_fs *ufs = sb->s_fs_info;

	dput(ufs->workdir);
	mntput(ufs->upper_mnt);
	mntput(ufs->lower_mnt);

	kfree(ufs->config.lowerdir);
	kfree(ufs->config.upperdir);
	kfree(ufs->config.workdir);
	kfree(ufs);
}
| 396 | |||
/**
 * ovl_statfs
 * @sb: The overlayfs super block
 * @buf: The struct kstatfs to fill in with stats
 *
 * Get the filesystem statistics. As writes always target the upper layer
 * filesystem pass the statfs to the same filesystem.
 */
static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
	struct dentry *root_dentry = dentry->d_sb->s_root;
	struct path path;
	int err;

	ovl_path_upper(root_dentry, &path);

	err = vfs_statfs(&path, buf);
	if (!err) {
		/* advertise the larger name limit of the two layers */
		buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
		buf->f_type = OVERLAYFS_SUPER_MAGIC;
	}

	return err;
}
| 422 | |||
/**
 * ovl_show_options
 *
 * Prints the mount options for a given superblock.
 * Returns zero; does not fail.
 *
 * NOTE(review): the stored paths are printed verbatim; a ',' or other
 * special character in lowerdir/upperdir/workdir would corrupt the
 * /proc/mounts output.  Consider escaping (cf. seq_escape()) — confirm
 * against how mount option values are conventionally quoted.
 */
static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
{
	struct super_block *sb = dentry->d_sb;
	struct ovl_fs *ufs = sb->s_fs_info;

	seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
	seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
	seq_printf(m, ",workdir=%s", ufs->config.workdir);
	return 0;
}
| 439 | |||
| 440 | static const struct super_operations ovl_super_operations = { | ||
| 441 | .put_super = ovl_put_super, | ||
| 442 | .statfs = ovl_statfs, | ||
| 443 | .show_options = ovl_show_options, | ||
| 444 | }; | ||
| 445 | |||
| 446 | enum { | ||
| 447 | OPT_LOWERDIR, | ||
| 448 | OPT_UPPERDIR, | ||
| 449 | OPT_WORKDIR, | ||
| 450 | OPT_ERR, | ||
| 451 | }; | ||
| 452 | |||
| 453 | static const match_table_t ovl_tokens = { | ||
| 454 | {OPT_LOWERDIR, "lowerdir=%s"}, | ||
| 455 | {OPT_UPPERDIR, "upperdir=%s"}, | ||
| 456 | {OPT_WORKDIR, "workdir=%s"}, | ||
| 457 | {OPT_ERR, NULL} | ||
| 458 | }; | ||
| 459 | |||
| 460 | static char *ovl_next_opt(char **s) | ||
| 461 | { | ||
| 462 | char *sbegin = *s; | ||
| 463 | char *p; | ||
| 464 | |||
| 465 | if (sbegin == NULL) | ||
| 466 | return NULL; | ||
| 467 | |||
| 468 | for (p = sbegin; *p; p++) { | ||
| 469 | if (*p == '\\') { | ||
| 470 | p++; | ||
| 471 | if (!*p) | ||
| 472 | break; | ||
| 473 | } else if (*p == ',') { | ||
| 474 | *p = '\0'; | ||
| 475 | *s = p + 1; | ||
| 476 | return sbegin; | ||
| 477 | } | ||
| 478 | } | ||
| 479 | *s = NULL; | ||
| 480 | return sbegin; | ||
| 481 | } | ||
| 482 | |||
| 483 | static int ovl_parse_opt(char *opt, struct ovl_config *config) | ||
| 484 | { | ||
| 485 | char *p; | ||
| 486 | |||
| 487 | while ((p = ovl_next_opt(&opt)) != NULL) { | ||
| 488 | int token; | ||
| 489 | substring_t args[MAX_OPT_ARGS]; | ||
| 490 | |||
| 491 | if (!*p) | ||
| 492 | continue; | ||
| 493 | |||
| 494 | token = match_token(p, ovl_tokens, args); | ||
| 495 | switch (token) { | ||
| 496 | case OPT_UPPERDIR: | ||
| 497 | kfree(config->upperdir); | ||
| 498 | config->upperdir = match_strdup(&args[0]); | ||
| 499 | if (!config->upperdir) | ||
| 500 | return -ENOMEM; | ||
| 501 | break; | ||
| 502 | |||
| 503 | case OPT_LOWERDIR: | ||
| 504 | kfree(config->lowerdir); | ||
| 505 | config->lowerdir = match_strdup(&args[0]); | ||
| 506 | if (!config->lowerdir) | ||
| 507 | return -ENOMEM; | ||
| 508 | break; | ||
| 509 | |||
| 510 | case OPT_WORKDIR: | ||
| 511 | kfree(config->workdir); | ||
| 512 | config->workdir = match_strdup(&args[0]); | ||
| 513 | if (!config->workdir) | ||
| 514 | return -ENOMEM; | ||
| 515 | break; | ||
| 516 | |||
| 517 | default: | ||
| 518 | return -EINVAL; | ||
| 519 | } | ||
| 520 | } | ||
| 521 | return 0; | ||
| 522 | } | ||
| 523 | |||
| 524 | #define OVL_WORKDIR_NAME "work" | ||
| 525 | |||
| 526 | static struct dentry *ovl_workdir_create(struct vfsmount *mnt, | ||
| 527 | struct dentry *dentry) | ||
| 528 | { | ||
| 529 | struct inode *dir = dentry->d_inode; | ||
| 530 | struct dentry *work; | ||
| 531 | int err; | ||
| 532 | bool retried = false; | ||
| 533 | |||
| 534 | err = mnt_want_write(mnt); | ||
| 535 | if (err) | ||
| 536 | return ERR_PTR(err); | ||
| 537 | |||
| 538 | mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); | ||
| 539 | retry: | ||
| 540 | work = lookup_one_len(OVL_WORKDIR_NAME, dentry, | ||
| 541 | strlen(OVL_WORKDIR_NAME)); | ||
| 542 | |||
| 543 | if (!IS_ERR(work)) { | ||
| 544 | struct kstat stat = { | ||
| 545 | .mode = S_IFDIR | 0, | ||
| 546 | }; | ||
| 547 | |||
| 548 | if (work->d_inode) { | ||
| 549 | err = -EEXIST; | ||
| 550 | if (retried) | ||
| 551 | goto out_dput; | ||
| 552 | |||
| 553 | retried = true; | ||
| 554 | ovl_cleanup(dir, work); | ||
| 555 | dput(work); | ||
| 556 | goto retry; | ||
| 557 | } | ||
| 558 | |||
| 559 | err = ovl_create_real(dir, work, &stat, NULL, NULL, true); | ||
| 560 | if (err) | ||
| 561 | goto out_dput; | ||
| 562 | } | ||
| 563 | out_unlock: | ||
| 564 | mutex_unlock(&dir->i_mutex); | ||
| 565 | mnt_drop_write(mnt); | ||
| 566 | |||
| 567 | return work; | ||
| 568 | |||
| 569 | out_dput: | ||
| 570 | dput(work); | ||
| 571 | work = ERR_PTR(err); | ||
| 572 | goto out_unlock; | ||
| 573 | } | ||
| 574 | |||
| 575 | static void ovl_unescape(char *s) | ||
| 576 | { | ||
| 577 | char *d = s; | ||
| 578 | |||
| 579 | for (;; s++, d++) { | ||
| 580 | if (*s == '\\') | ||
| 581 | s++; | ||
| 582 | *d = *s; | ||
| 583 | if (!*s) | ||
| 584 | break; | ||
| 585 | } | ||
| 586 | } | ||
| 587 | |||
| 588 | static int ovl_mount_dir(const char *name, struct path *path) | ||
| 589 | { | ||
| 590 | int err; | ||
| 591 | char *tmp = kstrdup(name, GFP_KERNEL); | ||
| 592 | |||
| 593 | if (!tmp) | ||
| 594 | return -ENOMEM; | ||
| 595 | |||
| 596 | ovl_unescape(tmp); | ||
| 597 | err = kern_path(tmp, LOOKUP_FOLLOW, path); | ||
| 598 | if (err) { | ||
| 599 | pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err); | ||
| 600 | err = -EINVAL; | ||
| 601 | } | ||
| 602 | kfree(tmp); | ||
| 603 | return err; | ||
| 604 | } | ||
| 605 | |||
| 606 | static bool ovl_is_allowed_fs_type(struct dentry *root) | ||
| 607 | { | ||
| 608 | const struct dentry_operations *dop = root->d_op; | ||
| 609 | |||
| 610 | /* | ||
| 611 | * We don't support: | ||
| 612 | * - automount filesystems | ||
| 613 | * - filesystems with revalidate (FIXME for lower layer) | ||
| 614 | * - filesystems with case insensitive names | ||
| 615 | */ | ||
| 616 | if (dop && | ||
| 617 | (dop->d_manage || dop->d_automount || | ||
| 618 | dop->d_revalidate || dop->d_weak_revalidate || | ||
| 619 | dop->d_compare || dop->d_hash)) { | ||
| 620 | return false; | ||
| 621 | } | ||
| 622 | return true; | ||
| 623 | } | ||
| 624 | |||
| 625 | /* Workdir should not be subdir of upperdir and vice versa */ | ||
| 626 | static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) | ||
| 627 | { | ||
| 628 | bool ok = false; | ||
| 629 | |||
| 630 | if (workdir != upperdir) { | ||
| 631 | ok = (lock_rename(workdir, upperdir) == NULL); | ||
| 632 | unlock_rename(workdir, upperdir); | ||
| 633 | } | ||
| 634 | return ok; | ||
| 635 | } | ||
| 636 | |||
| 637 | static int ovl_fill_super(struct super_block *sb, void *data, int silent) | ||
| 638 | { | ||
| 639 | struct path lowerpath; | ||
| 640 | struct path upperpath; | ||
| 641 | struct path workpath; | ||
| 642 | struct inode *root_inode; | ||
| 643 | struct dentry *root_dentry; | ||
| 644 | struct ovl_entry *oe; | ||
| 645 | struct ovl_fs *ufs; | ||
| 646 | struct kstatfs statfs; | ||
| 647 | int err; | ||
| 648 | |||
| 649 | err = -ENOMEM; | ||
| 650 | ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); | ||
| 651 | if (!ufs) | ||
| 652 | goto out; | ||
| 653 | |||
| 654 | err = ovl_parse_opt((char *) data, &ufs->config); | ||
| 655 | if (err) | ||
| 656 | goto out_free_config; | ||
| 657 | |||
| 658 | /* FIXME: workdir is not needed for a R/O mount */ | ||
| 659 | err = -EINVAL; | ||
| 660 | if (!ufs->config.upperdir || !ufs->config.lowerdir || | ||
| 661 | !ufs->config.workdir) { | ||
| 662 | pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); | ||
| 663 | goto out_free_config; | ||
| 664 | } | ||
| 665 | |||
| 666 | err = -ENOMEM; | ||
| 667 | oe = ovl_alloc_entry(); | ||
| 668 | if (oe == NULL) | ||
| 669 | goto out_free_config; | ||
| 670 | |||
| 671 | err = ovl_mount_dir(ufs->config.upperdir, &upperpath); | ||
| 672 | if (err) | ||
| 673 | goto out_free_oe; | ||
| 674 | |||
| 675 | err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); | ||
| 676 | if (err) | ||
| 677 | goto out_put_upperpath; | ||
| 678 | |||
| 679 | err = ovl_mount_dir(ufs->config.workdir, &workpath); | ||
| 680 | if (err) | ||
| 681 | goto out_put_lowerpath; | ||
| 682 | |||
| 683 | err = -EINVAL; | ||
| 684 | if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || | ||
| 685 | !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || | ||
| 686 | !S_ISDIR(workpath.dentry->d_inode->i_mode)) { | ||
| 687 | pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); | ||
| 688 | goto out_put_workpath; | ||
| 689 | } | ||
| 690 | |||
| 691 | if (upperpath.mnt != workpath.mnt) { | ||
| 692 | pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); | ||
| 693 | goto out_put_workpath; | ||
| 694 | } | ||
| 695 | if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { | ||
| 696 | pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); | ||
| 697 | goto out_put_workpath; | ||
| 698 | } | ||
| 699 | |||
| 700 | if (!ovl_is_allowed_fs_type(upperpath.dentry)) { | ||
| 701 | pr_err("overlayfs: filesystem of upperdir is not supported\n"); | ||
| 702 | goto out_put_workpath; | ||
| 703 | } | ||
| 704 | |||
| 705 | if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { | ||
| 706 | pr_err("overlayfs: filesystem of lowerdir is not supported\n"); | ||
| 707 | goto out_put_workpath; | ||
| 708 | } | ||
| 709 | |||
| 710 | err = vfs_statfs(&lowerpath, &statfs); | ||
| 711 | if (err) { | ||
| 712 | pr_err("overlayfs: statfs failed on lowerpath\n"); | ||
| 713 | goto out_put_workpath; | ||
| 714 | } | ||
| 715 | ufs->lower_namelen = statfs.f_namelen; | ||
| 716 | |||
| 717 | sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, | ||
| 718 | lowerpath.mnt->mnt_sb->s_stack_depth) + 1; | ||
| 719 | |||
| 720 | err = -EINVAL; | ||
| 721 | if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { | ||
| 722 | pr_err("overlayfs: maximum fs stacking depth exceeded\n"); | ||
| 723 | goto out_put_workpath; | ||
| 724 | } | ||
| 725 | |||
| 726 | ufs->upper_mnt = clone_private_mount(&upperpath); | ||
| 727 | err = PTR_ERR(ufs->upper_mnt); | ||
| 728 | if (IS_ERR(ufs->upper_mnt)) { | ||
| 729 | pr_err("overlayfs: failed to clone upperpath\n"); | ||
| 730 | goto out_put_workpath; | ||
| 731 | } | ||
| 732 | |||
| 733 | ufs->lower_mnt = clone_private_mount(&lowerpath); | ||
| 734 | err = PTR_ERR(ufs->lower_mnt); | ||
| 735 | if (IS_ERR(ufs->lower_mnt)) { | ||
| 736 | pr_err("overlayfs: failed to clone lowerpath\n"); | ||
| 737 | goto out_put_upper_mnt; | ||
| 738 | } | ||
| 739 | |||
| 740 | ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); | ||
| 741 | err = PTR_ERR(ufs->workdir); | ||
| 742 | if (IS_ERR(ufs->workdir)) { | ||
| 743 | pr_err("overlayfs: failed to create directory %s/%s\n", | ||
| 744 | ufs->config.workdir, OVL_WORKDIR_NAME); | ||
| 745 | goto out_put_lower_mnt; | ||
| 746 | } | ||
| 747 | |||
| 748 | /* | ||
| 749 | * Make lower_mnt R/O. That way fchmod/fchown on lower file | ||
| 750 | * will fail instead of modifying lower fs. | ||
| 751 | */ | ||
| 752 | ufs->lower_mnt->mnt_flags |= MNT_READONLY; | ||
| 753 | |||
| 754 | /* If the upper fs is r/o, we mark overlayfs r/o too */ | ||
| 755 | if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) | ||
| 756 | sb->s_flags |= MS_RDONLY; | ||
| 757 | |||
| 758 | sb->s_d_op = &ovl_dentry_operations; | ||
| 759 | |||
| 760 | err = -ENOMEM; | ||
| 761 | root_inode = ovl_new_inode(sb, S_IFDIR, oe); | ||
| 762 | if (!root_inode) | ||
| 763 | goto out_put_workdir; | ||
| 764 | |||
| 765 | root_dentry = d_make_root(root_inode); | ||
| 766 | if (!root_dentry) | ||
| 767 | goto out_put_workdir; | ||
| 768 | |||
| 769 | mntput(upperpath.mnt); | ||
| 770 | mntput(lowerpath.mnt); | ||
| 771 | path_put(&workpath); | ||
| 772 | |||
| 773 | oe->__upperdentry = upperpath.dentry; | ||
| 774 | oe->lowerdentry = lowerpath.dentry; | ||
| 775 | |||
| 776 | root_dentry->d_fsdata = oe; | ||
| 777 | |||
| 778 | sb->s_magic = OVERLAYFS_SUPER_MAGIC; | ||
| 779 | sb->s_op = &ovl_super_operations; | ||
| 780 | sb->s_root = root_dentry; | ||
| 781 | sb->s_fs_info = ufs; | ||
| 782 | |||
| 783 | return 0; | ||
| 784 | |||
| 785 | out_put_workdir: | ||
| 786 | dput(ufs->workdir); | ||
| 787 | out_put_lower_mnt: | ||
| 788 | mntput(ufs->lower_mnt); | ||
| 789 | out_put_upper_mnt: | ||
| 790 | mntput(ufs->upper_mnt); | ||
| 791 | out_put_workpath: | ||
| 792 | path_put(&workpath); | ||
| 793 | out_put_lowerpath: | ||
| 794 | path_put(&lowerpath); | ||
| 795 | out_put_upperpath: | ||
| 796 | path_put(&upperpath); | ||
| 797 | out_free_oe: | ||
| 798 | kfree(oe); | ||
| 799 | out_free_config: | ||
| 800 | kfree(ufs->config.lowerdir); | ||
| 801 | kfree(ufs->config.upperdir); | ||
| 802 | kfree(ufs->config.workdir); | ||
| 803 | kfree(ufs); | ||
| 804 | out: | ||
| 805 | return err; | ||
| 806 | } | ||
| 807 | |||
| 808 | static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, | ||
| 809 | const char *dev_name, void *raw_data) | ||
| 810 | { | ||
| 811 | return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); | ||
| 812 | } | ||
| 813 | |||
| 814 | static struct file_system_type ovl_fs_type = { | ||
| 815 | .owner = THIS_MODULE, | ||
| 816 | .name = "overlay", | ||
| 817 | .mount = ovl_mount, | ||
| 818 | .kill_sb = kill_anon_super, | ||
| 819 | }; | ||
| 820 | MODULE_ALIAS_FS("overlay"); | ||
| 821 | |||
| 822 | static int __init ovl_init(void) | ||
| 823 | { | ||
| 824 | return register_filesystem(&ovl_fs_type); | ||
| 825 | } | ||
| 826 | |||
| 827 | static void __exit ovl_exit(void) | ||
| 828 | { | ||
| 829 | unregister_filesystem(&ovl_fs_type); | ||
| 830 | } | ||
| 831 | |||
| 832 | module_init(ovl_init); | ||
| 833 | module_exit(ovl_exit); | ||
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8b663b2d9562..6b4527216a7f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c | |||
| @@ -634,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) | |||
| 634 | dqstats_inc(DQST_LOOKUPS); | 634 | dqstats_inc(DQST_LOOKUPS); |
| 635 | err = sb->dq_op->write_dquot(dquot); | 635 | err = sb->dq_op->write_dquot(dquot); |
| 636 | if (!ret && err) | 636 | if (!ret && err) |
| 637 | err = ret; | 637 | ret = err; |
| 638 | dqput(dquot); | 638 | dqput(dquot); |
| 639 | spin_lock(&dq_list_lock); | 639 | spin_lock(&dq_list_lock); |
| 640 | } | 640 | } |
diff --git a/fs/splice.c b/fs/splice.c index f5cb9ba84510..75c6058eabf2 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
| @@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | |||
| 1330 | 1330 | ||
| 1331 | return ret; | 1331 | return ret; |
| 1332 | } | 1332 | } |
| 1333 | EXPORT_SYMBOL(do_splice_direct); | ||
| 1333 | 1334 | ||
| 1334 | static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, | 1335 | static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, |
| 1335 | struct pipe_inode_info *opipe, | 1336 | struct pipe_inode_info *opipe, |
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 92e8f99a5857..281002689d64 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c | |||
| @@ -1338,7 +1338,10 @@ xfs_free_file_space( | |||
| 1338 | goto out; | 1338 | goto out; |
| 1339 | } | 1339 | } |
| 1340 | 1340 | ||
| 1341 | 1341 | /* | |
| 1342 | * Preallocate and zero a range of a file. This mechanism has the allocation | ||
| 1343 | * semantics of fallocate and in addition converts data in the range to zeroes. | ||
| 1344 | */ | ||
| 1342 | int | 1345 | int |
| 1343 | xfs_zero_file_space( | 1346 | xfs_zero_file_space( |
| 1344 | struct xfs_inode *ip, | 1347 | struct xfs_inode *ip, |
| @@ -1346,65 +1349,30 @@ xfs_zero_file_space( | |||
| 1346 | xfs_off_t len) | 1349 | xfs_off_t len) |
| 1347 | { | 1350 | { |
| 1348 | struct xfs_mount *mp = ip->i_mount; | 1351 | struct xfs_mount *mp = ip->i_mount; |
| 1349 | uint granularity; | 1352 | uint blksize; |
| 1350 | xfs_off_t start_boundary; | ||
| 1351 | xfs_off_t end_boundary; | ||
| 1352 | int error; | 1353 | int error; |
| 1353 | 1354 | ||
| 1354 | trace_xfs_zero_file_space(ip); | 1355 | trace_xfs_zero_file_space(ip); |
| 1355 | 1356 | ||
| 1356 | granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); | 1357 | blksize = 1 << mp->m_sb.sb_blocklog; |
| 1357 | 1358 | ||
| 1358 | /* | 1359 | /* |
| 1359 | * Round the range of extents we are going to convert inwards. If the | 1360 | * Punch a hole and prealloc the range. We use hole punch rather than |
| 1360 | * offset is aligned, then it doesn't get changed so we zero from the | 1361 | * unwritten extent conversion for two reasons: |
| 1361 | * start of the block offset points to. | 1362 | * |
| 1363 | * 1.) Hole punch handles partial block zeroing for us. | ||
| 1364 | * | ||
| 1365 | * 2.) If prealloc returns ENOSPC, the file range is still zero-valued | ||
| 1366 | * by virtue of the hole punch. | ||
| 1362 | */ | 1367 | */ |
| 1363 | start_boundary = round_up(offset, granularity); | 1368 | error = xfs_free_file_space(ip, offset, len); |
| 1364 | end_boundary = round_down(offset + len, granularity); | 1369 | if (error) |
| 1365 | 1370 | goto out; | |
| 1366 | ASSERT(start_boundary >= offset); | ||
| 1367 | ASSERT(end_boundary <= offset + len); | ||
| 1368 | |||
| 1369 | if (start_boundary < end_boundary - 1) { | ||
| 1370 | /* | ||
| 1371 | * Writeback the range to ensure any inode size updates due to | ||
| 1372 | * appending writes make it to disk (otherwise we could just | ||
| 1373 | * punch out the delalloc blocks). | ||
| 1374 | */ | ||
| 1375 | error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, | ||
| 1376 | start_boundary, end_boundary - 1); | ||
| 1377 | if (error) | ||
| 1378 | goto out; | ||
| 1379 | truncate_pagecache_range(VFS_I(ip), start_boundary, | ||
| 1380 | end_boundary - 1); | ||
| 1381 | |||
| 1382 | /* convert the blocks */ | ||
| 1383 | error = xfs_alloc_file_space(ip, start_boundary, | ||
| 1384 | end_boundary - start_boundary - 1, | ||
| 1385 | XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT); | ||
| 1386 | if (error) | ||
| 1387 | goto out; | ||
| 1388 | |||
| 1389 | /* We've handled the interior of the range, now for the edges */ | ||
| 1390 | if (start_boundary != offset) { | ||
| 1391 | error = xfs_iozero(ip, offset, start_boundary - offset); | ||
| 1392 | if (error) | ||
| 1393 | goto out; | ||
| 1394 | } | ||
| 1395 | |||
| 1396 | if (end_boundary != offset + len) | ||
| 1397 | error = xfs_iozero(ip, end_boundary, | ||
| 1398 | offset + len - end_boundary); | ||
| 1399 | |||
| 1400 | } else { | ||
| 1401 | /* | ||
| 1402 | * It's either a sub-granularity range or the range spanned lies | ||
| 1403 | * partially across two adjacent blocks. | ||
| 1404 | */ | ||
| 1405 | error = xfs_iozero(ip, offset, len); | ||
| 1406 | } | ||
| 1407 | 1371 | ||
| 1372 | error = xfs_alloc_file_space(ip, round_down(offset, blksize), | ||
| 1373 | round_up(offset + len, blksize) - | ||
| 1374 | round_down(offset, blksize), | ||
| 1375 | XFS_BMAPI_PREALLOC); | ||
| 1408 | out: | 1376 | out: |
| 1409 | return error; | 1377 | return error; |
| 1410 | 1378 | ||
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f1deb961a296..894924a5129b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
| @@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk( | |||
| 236 | XFS_WANT_CORRUPTED_RETURN(stat == 1); | 236 | XFS_WANT_CORRUPTED_RETURN(stat == 1); |
| 237 | 237 | ||
| 238 | /* Check if the record contains the inode in request */ | 238 | /* Check if the record contains the inode in request */ |
| 239 | if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) | 239 | if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { |
| 240 | return -EINVAL; | 240 | *icount = 0; |
| 241 | return 0; | ||
| 242 | } | ||
| 241 | 243 | ||
| 242 | idx = agino - irec->ir_startino + 1; | 244 | idx = agino - irec->ir_startino + 1; |
| 243 | if (idx < XFS_INODES_PER_CHUNK && | 245 | if (idx < XFS_INODES_PER_CHUNK && |
| @@ -262,75 +264,76 @@ xfs_bulkstat_grab_ichunk( | |||
| 262 | 264 | ||
| 263 | #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) | 265 | #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) |
| 264 | 266 | ||
| 267 | struct xfs_bulkstat_agichunk { | ||
| 268 | char __user **ac_ubuffer;/* pointer into user's buffer */ | ||
| 269 | int ac_ubleft; /* bytes left in user's buffer */ | ||
| 270 | int ac_ubelem; /* spaces used in user's buffer */ | ||
| 271 | }; | ||
| 272 | |||
| 265 | /* | 273 | /* |
| 266 | * Process inodes in chunk with a pointer to a formatter function | 274 | * Process inodes in chunk with a pointer to a formatter function |
| 267 | * that will iget the inode and fill in the appropriate structure. | 275 | * that will iget the inode and fill in the appropriate structure. |
| 268 | */ | 276 | */ |
| 269 | int | 277 | static int |
| 270 | xfs_bulkstat_ag_ichunk( | 278 | xfs_bulkstat_ag_ichunk( |
| 271 | struct xfs_mount *mp, | 279 | struct xfs_mount *mp, |
| 272 | xfs_agnumber_t agno, | 280 | xfs_agnumber_t agno, |
| 273 | struct xfs_inobt_rec_incore *irbp, | 281 | struct xfs_inobt_rec_incore *irbp, |
| 274 | bulkstat_one_pf formatter, | 282 | bulkstat_one_pf formatter, |
| 275 | size_t statstruct_size, | 283 | size_t statstruct_size, |
| 276 | struct xfs_bulkstat_agichunk *acp) | 284 | struct xfs_bulkstat_agichunk *acp, |
| 285 | xfs_agino_t *last_agino) | ||
| 277 | { | 286 | { |
| 278 | xfs_ino_t lastino = acp->ac_lastino; | ||
| 279 | char __user **ubufp = acp->ac_ubuffer; | 287 | char __user **ubufp = acp->ac_ubuffer; |
| 280 | int ubleft = acp->ac_ubleft; | 288 | int chunkidx; |
| 281 | int ubelem = acp->ac_ubelem; | ||
| 282 | int chunkidx, clustidx; | ||
| 283 | int error = 0; | 289 | int error = 0; |
| 284 | xfs_agino_t agino; | 290 | xfs_agino_t agino = irbp->ir_startino; |
| 285 | 291 | ||
| 286 | for (agino = irbp->ir_startino, chunkidx = clustidx = 0; | 292 | for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; |
| 287 | XFS_BULKSTAT_UBLEFT(ubleft) && | 293 | chunkidx++, agino++) { |
| 288 | irbp->ir_freecount < XFS_INODES_PER_CHUNK; | 294 | int fmterror; |
| 289 | chunkidx++, clustidx++, agino++) { | ||
| 290 | int fmterror; /* bulkstat formatter result */ | ||
| 291 | int ubused; | 295 | int ubused; |
| 292 | xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino); | ||
| 293 | 296 | ||
| 294 | ASSERT(chunkidx < XFS_INODES_PER_CHUNK); | 297 | /* inode won't fit in buffer, we are done */ |
| 298 | if (acp->ac_ubleft < statstruct_size) | ||
| 299 | break; | ||
| 295 | 300 | ||
| 296 | /* Skip if this inode is free */ | 301 | /* Skip if this inode is free */ |
| 297 | if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { | 302 | if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) |
| 298 | lastino = ino; | ||
| 299 | continue; | 303 | continue; |
| 300 | } | ||
| 301 | |||
| 302 | /* | ||
| 303 | * Count used inodes as free so we can tell when the | ||
| 304 | * chunk is used up. | ||
| 305 | */ | ||
| 306 | irbp->ir_freecount++; | ||
| 307 | 304 | ||
| 308 | /* Get the inode and fill in a single buffer */ | 305 | /* Get the inode and fill in a single buffer */ |
| 309 | ubused = statstruct_size; | 306 | ubused = statstruct_size; |
| 310 | error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror); | 307 | error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino), |
| 311 | if (fmterror == BULKSTAT_RV_NOTHING) { | 308 | *ubufp, acp->ac_ubleft, &ubused, &fmterror); |
| 312 | if (error && error != -ENOENT && error != -EINVAL) { | 309 | |
| 313 | ubleft = 0; | 310 | if (fmterror == BULKSTAT_RV_GIVEUP || |
| 314 | break; | 311 | (error && error != -ENOENT && error != -EINVAL)) { |
| 315 | } | 312 | acp->ac_ubleft = 0; |
| 316 | lastino = ino; | ||
| 317 | continue; | ||
| 318 | } | ||
| 319 | if (fmterror == BULKSTAT_RV_GIVEUP) { | ||
| 320 | ubleft = 0; | ||
| 321 | ASSERT(error); | 313 | ASSERT(error); |
| 322 | break; | 314 | break; |
| 323 | } | 315 | } |
| 324 | if (*ubufp) | 316 | |
| 325 | *ubufp += ubused; | 317 | /* be careful not to leak error if at end of chunk */ |
| 326 | ubleft -= ubused; | 318 | if (fmterror == BULKSTAT_RV_NOTHING || error) { |
| 327 | ubelem++; | 319 | error = 0; |
| 328 | lastino = ino; | 320 | continue; |
| 321 | } | ||
| 322 | |||
| 323 | *ubufp += ubused; | ||
| 324 | acp->ac_ubleft -= ubused; | ||
| 325 | acp->ac_ubelem++; | ||
| 329 | } | 326 | } |
| 330 | 327 | ||
| 331 | acp->ac_lastino = lastino; | 328 | /* |
| 332 | acp->ac_ubleft = ubleft; | 329 | * Post-update *last_agino. At this point, agino will always point one |
| 333 | acp->ac_ubelem = ubelem; | 330 | * inode past the last inode we processed successfully. Hence we |
| 331 | * substract that inode when setting the *last_agino cursor so that we | ||
| 332 | * return the correct cookie to userspace. On the next bulkstat call, | ||
| 333 | * the inode under the lastino cookie will be skipped as we have already | ||
| 334 | * processed it here. | ||
| 335 | */ | ||
| 336 | *last_agino = agino - 1; | ||
| 334 | 337 | ||
| 335 | return error; | 338 | return error; |
| 336 | } | 339 | } |
| @@ -353,45 +356,33 @@ xfs_bulkstat( | |||
| 353 | xfs_agino_t agino; /* inode # in allocation group */ | 356 | xfs_agino_t agino; /* inode # in allocation group */ |
| 354 | xfs_agnumber_t agno; /* allocation group number */ | 357 | xfs_agnumber_t agno; /* allocation group number */ |
| 355 | xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ | 358 | xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ |
| 356 | int end_of_ag; /* set if we've seen the ag end */ | ||
| 357 | int error; /* error code */ | ||
| 358 | int fmterror;/* bulkstat formatter result */ | ||
| 359 | int i; /* loop index */ | ||
| 360 | int icount; /* count of inodes good in irbuf */ | ||
| 361 | size_t irbsize; /* size of irec buffer in bytes */ | 359 | size_t irbsize; /* size of irec buffer in bytes */ |
| 362 | xfs_ino_t ino; /* inode number (filesystem) */ | ||
| 363 | xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ | ||
| 364 | xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ | 360 | xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ |
| 365 | xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ | ||
| 366 | xfs_ino_t lastino; /* last inode number returned */ | ||
| 367 | int nirbuf; /* size of irbuf */ | 361 | int nirbuf; /* size of irbuf */ |
| 368 | int rval; /* return value error code */ | ||
| 369 | int tmp; /* result value from btree calls */ | ||
| 370 | int ubcount; /* size of user's buffer */ | 362 | int ubcount; /* size of user's buffer */ |
| 371 | int ubleft; /* bytes left in user's buffer */ | 363 | struct xfs_bulkstat_agichunk ac; |
| 372 | char __user *ubufp; /* pointer into user's buffer */ | 364 | int error = 0; |
| 373 | int ubelem; /* spaces used in user's buffer */ | ||
| 374 | 365 | ||
| 375 | /* | 366 | /* |
| 376 | * Get the last inode value, see if there's nothing to do. | 367 | * Get the last inode value, see if there's nothing to do. |
| 377 | */ | 368 | */ |
| 378 | ino = (xfs_ino_t)*lastinop; | 369 | agno = XFS_INO_TO_AGNO(mp, *lastinop); |
| 379 | lastino = ino; | 370 | agino = XFS_INO_TO_AGINO(mp, *lastinop); |
| 380 | agno = XFS_INO_TO_AGNO(mp, ino); | ||
| 381 | agino = XFS_INO_TO_AGINO(mp, ino); | ||
| 382 | if (agno >= mp->m_sb.sb_agcount || | 371 | if (agno >= mp->m_sb.sb_agcount || |
| 383 | ino != XFS_AGINO_TO_INO(mp, agno, agino)) { | 372 | *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) { |
| 384 | *done = 1; | 373 | *done = 1; |
| 385 | *ubcountp = 0; | 374 | *ubcountp = 0; |
| 386 | return 0; | 375 | return 0; |
| 387 | } | 376 | } |
| 388 | 377 | ||
| 389 | ubcount = *ubcountp; /* statstruct's */ | 378 | ubcount = *ubcountp; /* statstruct's */ |
| 390 | ubleft = ubcount * statstruct_size; /* bytes */ | 379 | ac.ac_ubuffer = &ubuffer; |
| 391 | *ubcountp = ubelem = 0; | 380 | ac.ac_ubleft = ubcount * statstruct_size; /* bytes */; |
| 381 | ac.ac_ubelem = 0; | ||
| 382 | |||
| 383 | *ubcountp = 0; | ||
| 392 | *done = 0; | 384 | *done = 0; |
| 393 | fmterror = 0; | 385 | |
| 394 | ubufp = ubuffer; | ||
| 395 | irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); | 386 | irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); |
| 396 | if (!irbuf) | 387 | if (!irbuf) |
| 397 | return -ENOMEM; | 388 | return -ENOMEM; |
| @@ -402,9 +393,13 @@ xfs_bulkstat( | |||
| 402 | * Loop over the allocation groups, starting from the last | 393 | * Loop over the allocation groups, starting from the last |
| 403 | * inode returned; 0 means start of the allocation group. | 394 | * inode returned; 0 means start of the allocation group. |
| 404 | */ | 395 | */ |
| 405 | rval = 0; | 396 | while (agno < mp->m_sb.sb_agcount) { |
| 406 | while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { | 397 | struct xfs_inobt_rec_incore *irbp = irbuf; |
| 407 | cond_resched(); | 398 | struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; |
| 399 | bool end_of_ag = false; | ||
| 400 | int icount = 0; | ||
| 401 | int stat; | ||
| 402 | |||
| 408 | error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); | 403 | error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); |
| 409 | if (error) | 404 | if (error) |
| 410 | break; | 405 | break; |
| @@ -414,10 +409,6 @@ xfs_bulkstat( | |||
| 414 | */ | 409 | */ |
| 415 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, | 410 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, |
| 416 | XFS_BTNUM_INO); | 411 | XFS_BTNUM_INO); |
| 417 | irbp = irbuf; | ||
| 418 | irbufend = irbuf + nirbuf; | ||
| 419 | end_of_ag = 0; | ||
| 420 | icount = 0; | ||
| 421 | if (agino > 0) { | 412 | if (agino > 0) { |
| 422 | /* | 413 | /* |
| 423 | * In the middle of an allocation group, we need to get | 414 | * In the middle of an allocation group, we need to get |
| @@ -427,22 +418,23 @@ xfs_bulkstat( | |||
| 427 | 418 | ||
| 428 | error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); | 419 | error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); |
| 429 | if (error) | 420 | if (error) |
| 430 | break; | 421 | goto del_cursor; |
| 431 | if (icount) { | 422 | if (icount) { |
| 432 | irbp->ir_startino = r.ir_startino; | 423 | irbp->ir_startino = r.ir_startino; |
| 433 | irbp->ir_freecount = r.ir_freecount; | 424 | irbp->ir_freecount = r.ir_freecount; |
| 434 | irbp->ir_free = r.ir_free; | 425 | irbp->ir_free = r.ir_free; |
| 435 | irbp++; | 426 | irbp++; |
| 436 | agino = r.ir_startino + XFS_INODES_PER_CHUNK; | ||
| 437 | } | 427 | } |
| 438 | /* Increment to the next record */ | 428 | /* Increment to the next record */ |
| 439 | error = xfs_btree_increment(cur, 0, &tmp); | 429 | error = xfs_btree_increment(cur, 0, &stat); |
| 440 | } else { | 430 | } else { |
| 441 | /* Start of ag. Lookup the first inode chunk */ | 431 | /* Start of ag. Lookup the first inode chunk */ |
| 442 | error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); | 432 | error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); |
| 433 | } | ||
| 434 | if (error || stat == 0) { | ||
| 435 | end_of_ag = true; | ||
| 436 | goto del_cursor; | ||
| 443 | } | 437 | } |
| 444 | if (error) | ||
| 445 | break; | ||
| 446 | 438 | ||
| 447 | /* | 439 | /* |
| 448 | * Loop through inode btree records in this ag, | 440 | * Loop through inode btree records in this ag, |
| @@ -451,10 +443,10 @@ xfs_bulkstat( | |||
| 451 | while (irbp < irbufend && icount < ubcount) { | 443 | while (irbp < irbufend && icount < ubcount) { |
| 452 | struct xfs_inobt_rec_incore r; | 444 | struct xfs_inobt_rec_incore r; |
| 453 | 445 | ||
| 454 | error = xfs_inobt_get_rec(cur, &r, &i); | 446 | error = xfs_inobt_get_rec(cur, &r, &stat); |
| 455 | if (error || i == 0) { | 447 | if (error || stat == 0) { |
| 456 | end_of_ag = 1; | 448 | end_of_ag = true; |
| 457 | break; | 449 | goto del_cursor; |
| 458 | } | 450 | } |
| 459 | 451 | ||
| 460 | /* | 452 | /* |
| @@ -469,77 +461,79 @@ xfs_bulkstat( | |||
| 469 | irbp++; | 461 | irbp++; |
| 470 | icount += XFS_INODES_PER_CHUNK - r.ir_freecount; | 462 | icount += XFS_INODES_PER_CHUNK - r.ir_freecount; |
| 471 | } | 463 | } |
| 472 | /* | 464 | error = xfs_btree_increment(cur, 0, &stat); |
| 473 | * Set agino to after this chunk and bump the cursor. | 465 | if (error || stat == 0) { |
| 474 | */ | 466 | end_of_ag = true; |
| 475 | agino = r.ir_startino + XFS_INODES_PER_CHUNK; | 467 | goto del_cursor; |
| 476 | error = xfs_btree_increment(cur, 0, &tmp); | 468 | } |
| 477 | cond_resched(); | 469 | cond_resched(); |
| 478 | } | 470 | } |
| 471 | |||
| 479 | /* | 472 | /* |
| 480 | * Drop the btree buffers and the agi buffer. | 473 | * Drop the btree buffers and the agi buffer as we can't hold any |
| 481 | * We can't hold any of the locks these represent | 474 | * of the locks these represent when calling iget. If there is a |
| 482 | * when calling iget. | 475 | * pending error, then we are done. |
| 483 | */ | 476 | */ |
| 477 | del_cursor: | ||
| 484 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | 478 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); |
| 485 | xfs_buf_relse(agbp); | 479 | xfs_buf_relse(agbp); |
| 480 | if (error) | ||
| 481 | break; | ||
| 486 | /* | 482 | /* |
| 487 | * Now format all the good inodes into the user's buffer. | 483 | * Now format all the good inodes into the user's buffer. The |
| 484 | * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer | ||
| 485 | * for the next loop iteration. | ||
| 488 | */ | 486 | */ |
| 489 | irbufend = irbp; | 487 | irbufend = irbp; |
| 490 | for (irbp = irbuf; | 488 | for (irbp = irbuf; |
| 491 | irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { | 489 | irbp < irbufend && ac.ac_ubleft >= statstruct_size; |
| 492 | struct xfs_bulkstat_agichunk ac; | 490 | irbp++) { |
| 493 | |||
| 494 | ac.ac_lastino = lastino; | ||
| 495 | ac.ac_ubuffer = &ubuffer; | ||
| 496 | ac.ac_ubleft = ubleft; | ||
| 497 | ac.ac_ubelem = ubelem; | ||
| 498 | error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, | 491 | error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, |
| 499 | formatter, statstruct_size, &ac); | 492 | formatter, statstruct_size, &ac, |
| 493 | &agino); | ||
| 500 | if (error) | 494 | if (error) |
| 501 | rval = error; | 495 | break; |
| 502 | |||
| 503 | lastino = ac.ac_lastino; | ||
| 504 | ubleft = ac.ac_ubleft; | ||
| 505 | ubelem = ac.ac_ubelem; | ||
| 506 | 496 | ||
| 507 | cond_resched(); | 497 | cond_resched(); |
| 508 | } | 498 | } |
| 499 | |||
| 509 | /* | 500 | /* |
| 510 | * Set up for the next loop iteration. | 501 | * If we've run out of space or had a formatting error, we |
| 502 | * are now done | ||
| 511 | */ | 503 | */ |
| 512 | if (XFS_BULKSTAT_UBLEFT(ubleft)) { | 504 | if (ac.ac_ubleft < statstruct_size || error) |
| 513 | if (end_of_ag) { | ||
| 514 | agno++; | ||
| 515 | agino = 0; | ||
| 516 | } else | ||
| 517 | agino = XFS_INO_TO_AGINO(mp, lastino); | ||
| 518 | } else | ||
| 519 | break; | 505 | break; |
| 506 | |||
| 507 | if (end_of_ag) { | ||
| 508 | agno++; | ||
| 509 | agino = 0; | ||
| 510 | } | ||
| 520 | } | 511 | } |
| 521 | /* | 512 | /* |
| 522 | * Done, we're either out of filesystem or space to put the data. | 513 | * Done, we're either out of filesystem or space to put the data. |
| 523 | */ | 514 | */ |
| 524 | kmem_free(irbuf); | 515 | kmem_free(irbuf); |
| 525 | *ubcountp = ubelem; | 516 | *ubcountp = ac.ac_ubelem; |
| 517 | |||
| 526 | /* | 518 | /* |
| 527 | * Found some inodes, return them now and return the error next time. | 519 | * We found some inodes, so clear the error status and return them. |
| 520 | * The lastino pointer will point directly at the inode that triggered | ||
| 521 | * any error that occurred, so on the next call the error will be | ||
| 522 | * triggered again and propagated to userspace as there will be no | ||
| 523 | * formatted inodes in the buffer. | ||
| 528 | */ | 524 | */ |
| 529 | if (ubelem) | 525 | if (ac.ac_ubelem) |
| 530 | rval = 0; | 526 | error = 0; |
| 531 | if (agno >= mp->m_sb.sb_agcount) { | 527 | |
| 532 | /* | 528 | /* |
| 533 | * If we ran out of filesystem, mark lastino as off | 529 | * If we ran out of filesystem, lastino will point off the end of |
| 534 | * the end of the filesystem, so the next call | 530 | * the filesystem so the next call will return immediately. |
| 535 | * will return immediately. | 531 | */ |
| 536 | */ | 532 | *lastinop = XFS_AGINO_TO_INO(mp, agno, agino); |
| 537 | *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); | 533 | if (agno >= mp->m_sb.sb_agcount) |
| 538 | *done = 1; | 534 | *done = 1; |
| 539 | } else | ||
| 540 | *lastinop = (xfs_ino_t)lastino; | ||
| 541 | 535 | ||
| 542 | return rval; | 536 | return error; |
| 543 | } | 537 | } |
| 544 | 538 | ||
| 545 | int | 539 | int |
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index aaed08022eb9..6ea8b3912fa4 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h | |||
| @@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, | |||
| 30 | int *ubused, | 30 | int *ubused, |
| 31 | int *stat); | 31 | int *stat); |
| 32 | 32 | ||
| 33 | struct xfs_bulkstat_agichunk { | ||
| 34 | xfs_ino_t ac_lastino; /* last inode returned */ | ||
| 35 | char __user **ac_ubuffer;/* pointer into user's buffer */ | ||
| 36 | int ac_ubleft; /* bytes left in user's buffer */ | ||
| 37 | int ac_ubelem; /* spaces used in user's buffer */ | ||
| 38 | }; | ||
| 39 | |||
| 40 | int | ||
| 41 | xfs_bulkstat_ag_ichunk( | ||
| 42 | struct xfs_mount *mp, | ||
| 43 | xfs_agnumber_t agno, | ||
| 44 | struct xfs_inobt_rec_incore *irbp, | ||
| 45 | bulkstat_one_pf formatter, | ||
| 46 | size_t statstruct_size, | ||
| 47 | struct xfs_bulkstat_agichunk *acp); | ||
| 48 | |||
| 49 | /* | 33 | /* |
| 50 | * Values for stat return value. | 34 | * Values for stat return value. |
| 51 | */ | 35 | */ |
