-rw-r--r--  fs/adfs/super.c              |    1
-rw-r--r--  fs/affs/super.c              |    1
-rw-r--r--  fs/befs/linuxvfs.c           |    1
-rw-r--r--  fs/btrfs/super.c             |    1
-rw-r--r--  fs/cifs/cifsfs.c             |    1
-rw-r--r--  fs/coda/inode.c              |    1
-rw-r--r--  fs/cramfs/inode.c            |    1
-rw-r--r--  fs/debugfs/inode.c           |    1
-rw-r--r--  fs/devpts/inode.c            |    1
-rw-r--r--  fs/efs/super.c               |    1
-rw-r--r--  fs/ext2/super.c              |    1
-rw-r--r--  fs/ext3/super.c              |    2
-rw-r--r--  fs/ext4/ext4.h               |   11
-rw-r--r--  fs/ext4/ext4_jbd2.c          |   10
-rw-r--r--  fs/ext4/extents.c            |  818
-rw-r--r--  fs/ext4/extents_status.c     |   28
-rw-r--r--  fs/ext4/extents_status.h     |    9
-rw-r--r--  fs/ext4/inode.c              |  120
-rw-r--r--  fs/ext4/ioctl.c              |   24
-rw-r--r--  fs/ext4/mballoc.c            |    7
-rw-r--r--  fs/ext4/mballoc.h            |    4
-rw-r--r--  fs/ext4/move_extent.c        |    5
-rw-r--r--  fs/ext4/super.c              |   40
-rw-r--r--  fs/ext4/xattr.c              |   59
-rw-r--r--  fs/ext4/xattr.h              |    6
-rw-r--r--  fs/f2fs/super.c              |    2
-rw-r--r--  fs/fat/inode.c               |    2
-rw-r--r--  fs/freevxfs/vxfs_super.c     |    1
-rw-r--r--  fs/fuse/inode.c              |    1
-rw-r--r--  fs/gfs2/super.c              |    2
-rw-r--r--  fs/hfs/super.c               |    1
-rw-r--r--  fs/hfsplus/super.c           |    1
-rw-r--r--  fs/hpfs/super.c              |    2
-rw-r--r--  fs/inode.c                   |   31
-rw-r--r--  fs/isofs/inode.c             |    1
-rw-r--r--  fs/jbd2/commit.c             |   77
-rw-r--r--  fs/jbd2/journal.c            |   10
-rw-r--r--  fs/jbd2/transaction.c        |   46
-rw-r--r--  fs/jffs2/super.c             |    1
-rw-r--r--  fs/jfs/super.c               |    1
-rw-r--r--  fs/mbcache.c                 |  540
-rw-r--r--  fs/minix/inode.c             |    1
-rw-r--r--  fs/ncpfs/inode.c             |    1
-rw-r--r--  fs/nfs/super.c               |    2
-rw-r--r--  fs/nilfs2/super.c            |    1
-rw-r--r--  fs/ntfs/super.c              |    2
-rw-r--r--  fs/ocfs2/super.c             |    2
-rw-r--r--  fs/openpromfs/inode.c        |    1
-rw-r--r--  fs/proc/root.c               |    2
-rw-r--r--  fs/pstore/inode.c            |    1
-rw-r--r--  fs/qnx4/inode.c              |    1
-rw-r--r--  fs/qnx6/inode.c              |    1
-rw-r--r--  fs/reiserfs/super.c          |    1
-rw-r--r--  fs/romfs/super.c             |    1
-rw-r--r--  fs/squashfs/super.c          |    1
-rw-r--r--  fs/super.c                   |    2
-rw-r--r--  fs/sysv/inode.c              |    1
-rw-r--r--  fs/ubifs/super.c             |    1
-rw-r--r--  fs/udf/super.c               |    1
-rw-r--r--  fs/ufs/super.c               |    1
-rw-r--r--  fs/xfs/xfs_super.c           |    1
-rw-r--r--  include/linux/fs.h           |    3
-rw-r--r--  include/linux/mbcache.h      |   12
-rw-r--r--  include/trace/events/ext4.h  |  102
64 files changed, 1514 insertions, 501 deletions
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7b3003cb6f1b..952aeb048349 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return parse_options(sb, data);
 }
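
Every remount handler touched by this series gains the same opening call: the series pushes sync_filesystem() out of the generic remount path and into each filesystem's ->remount_fs, so dirty state is flushed before new mount options take effect. A minimal sketch of the pattern, assuming a hypothetical filesystem "foofs" (the handler and option-parser names are illustrative, not from the patch):

	/* Sketch of the pattern this series applies to each ->remount_fs
	 * below: flush the filesystem before any options change, so a
	 * transition such as rw -> ro never races against unsynced state.
	 * "foofs" and foofs_parse_options() are hypothetical names. */
	static int foofs_remount(struct super_block *sb, int *flags, char *data)
	{
		sync_filesystem(sb);		/* the one-line change, per fs */
		*flags |= MS_NODIRATIME;
		return foofs_parse_options(sb, data);
	}
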
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d098731b82ff..307453086c3f 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -530,6 +530,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 
 	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 
 	memcpy(volume, sbi->s_volume, 32);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 5188f1222987..d626756ff721 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 static int
 befs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	if (!(*flags & MS_RDONLY))
 		return -EINVAL;
 	return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d4878ddba87a..9dbf42395153 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1380,6 +1380,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	unsigned int old_metadata_ratio = fs_info->metadata_ratio;
 	int ret;
 
+	sync_filesystem(sb);
 	btrfs_remount_prepare(fs_info);
 
 	ret = btrfs_parse_options(root, data);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ab8ad2546c3e..2c70cbe35d39 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -541,6 +541,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
 
 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 626abc02b694..d9c7751f10ac 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)
 
 static int coda_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NOATIME;
 	return 0;
 }
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a1f801c14fbc..ddcfe590b8a8 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -243,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 
 static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index ca4a08f38374..8c41b52da358 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
 	int err;
 	struct debugfs_fs_info *fsi = sb->s_fs_info;
 
+	sync_filesystem(sb);
 	err = debugfs_parse_options(data, &fsi->mount_opts);
 	if (err)
 		goto fail;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index a726b9f29cb7..c71038079b47 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
+	sync_filesystem(sb);
 	err = parse_mount_options(data, PARSE_REMOUNT, opts);
 
 	/*
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f8def1acf08c..3befcc9f5d63 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -114,6 +114,7 @@ static void destroy_inodecache(void)
 
 static int efs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 20d6697bd638..d260115c0350 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
+	sync_filesystem(sb);
 	spin_lock(&sbi->s_lock);
 
 	/* Store the old options */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31ed16e7..95c6c5a6d0c5 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif
 
+	sync_filesystem(sb);
+
 	/* Store the original options */
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3a534fdc5ff..f1c65dc7cc0a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -31,6 +31,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
+#include <linux/falloc.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -567,6 +568,8 @@ enum {
 #define EXT4_GET_BLOCKS_NO_LOCK			0x0100
 	/* Do not put hole in extent cache */
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
+	/* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -998,6 +1001,8 @@ struct ext4_inode_info {
 #define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
 						      size of blocksize * 8
 						      blocks */
+#define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated
+						      file systems */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1326,6 +1331,7 @@ struct ext4_sb_info {
 	struct list_head s_es_lru;
 	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
+	struct mb_cache *s_mb_cache;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
@@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
-		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2757,6 +2761,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2766,6 +2771,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+			    struct ext4_extent **extent);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
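
The new EXT4_GET_BLOCKS_CONVERT_UNWRITTEN bit added above extends the existing EXT4_GET_BLOCKS_* mask; each flag is a distinct power of two, so they are OR-ed into the same int. A sketch of a caller requesting written-to-unwritten conversion, mirroring what ext4_zero_range() does later in this diff (the surrounding handle, inode, and map setup are assumed):

	/* Sketch (ext4-internal; handle, inode and map assumed set up as
	 * in ext4_zero_range() below): allocate unwritten extents and
	 * convert any initialized extents found in the range. */
	int flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
		    EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;	/* new 0x0400 bit */
	int ret = ext4_map_blocks(handle, inode, &map, flags);
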
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3fe29de832c8..c3fb607413ed 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 	if (WARN_ON_ONCE(err)) {
 		ext4_journal_abort_handle(where, line, __func__, bh,
 					  handle, err);
+		if (inode == NULL) {
+			pr_err("EXT4: jbd2_journal_dirty_metadata "
+			       "failed: handle type %u started at "
+			       "line %u, credits %u/%u, errcode %d",
+			       handle->h_type,
+			       handle->h_line_no,
+			       handle->h_requested_credits,
+			       handle->h_buffer_credits, err);
+			return err;
+		}
 		ext4_error_inode(inode, where, line,
 				 bh->b_blocknr,
 				 "journal_dirty_metadata failed: "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74bc2d549c58..82df3ce9874a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/falloc.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * the extent that was written properly split out and conversion to
  * initialized is trivial.
  */
-	if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
+	if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
 		return 0;
 
 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 */
 	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
 		return 0;
+	if (ext4_ext_is_uninitialized(ex1) &&
+	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
+	     (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
+		return 0;
 #ifdef AGGRESSIVE_TEST
 	if (ext1_ee_len >= 4)
 		return 0;
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 {
 	struct ext4_extent_header *eh;
 	unsigned int depth, len;
-	int merge_done = 0;
+	int merge_done = 0, uninit;
 
 	depth = ext_depth(inode);
 	BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
 			break;
 		/* merge with next extent! */
+		uninit = ext4_ext_is_uninitialized(ex);
 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 				+ ext4_ext_get_actual_len(ex + 1));
+		if (uninit)
+			ext4_ext_mark_uninitialized(ex);
 
 		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
 			len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	struct ext4_ext_path *npath = NULL;
 	int depth, len, err;
 	ext4_lblk_t next;
-	int mb_flags = 0;
+	int mb_flags = 0, uninit;
 
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 						  path + depth);
 		if (err)
 			return err;
-
+		uninit = ext4_ext_is_uninitialized(ex);
 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 				+ ext4_ext_get_actual_len(newext));
+		if (uninit)
+			ext4_ext_mark_uninitialized(ex);
 		eh = path[depth].p_hdr;
 		nearex = ex;
 		goto merge;
@@ -1971,10 +1980,13 @@ prepend:
 		if (err)
 			return err;
 
+		uninit = ext4_ext_is_uninitialized(ex);
 		ex->ee_block = newext->ee_block;
 		ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 				+ ext4_ext_get_actual_len(newext));
+		if (uninit)
+			ext4_ext_mark_uninitialized(ex);
 		eh = path[depth].p_hdr;
 		nearex = ex;
 		goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
 
+	/*
+	 * If we're starting with an extent other than the last one in the
+	 * node, we need to see if it shares a cluster with the extent to
+	 * the right (towards the end of the file). If its leftmost cluster
+	 * is this extent's rightmost cluster and it is not cluster aligned,
+	 * we'll mark it as a partial that is not to be deallocated.
+	 */
+
+	if (ex != EXT_LAST_EXTENT(eh)) {
+		ext4_fsblk_t current_pblk, right_pblk;
+		long long current_cluster, right_cluster;
+
+		current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+		current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+		right_pblk = ext4_ext_pblock(ex + 1);
+		right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+		if (current_cluster == right_cluster &&
+			EXT4_PBLK_COFF(sbi, right_pblk))
+			*partial_cluster = -right_cluster;
+	}
+
 	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * Free the partial cluster only if the current extent does not
-	 * reference it. Otherwise we might free used cluster.
+	 * If there's a partial cluster and at least one extent remains in
+	 * the leaf, free the partial cluster if it isn't shared with the
+	 * current extent.  If there's a partial cluster and no extents
+	 * remain in the leaf, it can't be freed here.  It can only be
+	 * freed when it's possible to determine if it's not shared with
+	 * any other extent - when the next leaf is processed or when space
+	 * removal is complete.
 	 */
-	if (*partial_cluster > 0 &&
+	if (*partial_cluster > 0 && eh->eh_entries &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
 		int flags = get_default_free_blocks_flags(inode);
@@ -3569,6 +3607,8 @@ out:
  * b> Splits in two extents: Write is happening at either end of the extent
  * c> Splits in three extents: Somone is writing in middle of the extent
  *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
  * One of more index blocks maybe needed if the extent tree grow after
  * the uninitialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
@@ -3579,7 +3619,7 @@ out:
  *
  * Returns the size of uninitialized extent to be written on success.
  */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
 					struct inode *inode,
 					struct ext4_map_blocks *map,
 					struct ext4_ext_path *path,
@@ -3591,9 +3631,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	unsigned int ee_len;
 	int split_flag = 0, depth;
 
-	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+		  __func__, inode->i_ino,
+		  (unsigned long long)map->m_lblk, map->m_len);
 
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3648,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 
-	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-	split_flag |= EXT4_EXT_MARK_UNINIT2;
-	if (flags & EXT4_GET_BLOCKS_CONVERT)
-		split_flag |= EXT4_EXT_DATA_VALID2;
+	/* Convert to unwritten */
+	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		split_flag |= EXT4_EXT_DATA_VALID1;
+	/* Convert to initialized */
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		split_flag |= ee_block + ee_len <= eof_block ?
+			      EXT4_EXT_MAY_ZEROOUT : 0;
+		split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
+	}
 	flags |= EXT4_GET_BLOCKS_PRE_IO;
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
+static int ext4_convert_initialized_extents(handle_t *handle,
+					    struct inode *inode,
+					    struct ext4_map_blocks *map,
+					    struct ext4_ext_path *path)
+{
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	unsigned int ee_len;
+	int depth;
+	int err = 0;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	ext_debug("%s: inode %lu, logical"
+		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
+		(unsigned long long)ee_block, ee_len);
+
+	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+		err = ext4_split_convert_extents(handle, inode, map, path,
+				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+		if (err < 0)
+			goto out;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* first mark the extent as uninitialized */
+	ext4_ext_mark_uninitialized(ex);
+
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
+	 */
+	ext4_ext_try_to_merge(handle, inode, path, ex);
+
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						struct inode *inode,
 						struct ext4_map_blocks *map,
@@ -3649,8 +3748,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 		  inode->i_ino, (unsigned long long)ee_block, ee_len,
 		  (unsigned long long)map->m_lblk, map->m_len);
 #endif
-	err = ext4_split_unwritten_extents(handle, inode, map, path,
-					   EXT4_GET_BLOCKS_CONVERT);
+	err = ext4_split_convert_extents(handle, inode, map, path,
+					 EXT4_GET_BLOCKS_CONVERT);
 	if (err < 0)
 		goto out;
 	ext4_ext_drop_refs(path);
@@ -3851,6 +3950,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 }
 
 static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+					struct ext4_map_blocks *map,
+					struct ext4_ext_path *path, int flags,
+					unsigned int allocated, ext4_fsblk_t newblock)
+{
+	int ret = 0;
+	int err = 0;
+
+	/*
+	 * Make sure that the extent is no bigger than we support with
+	 * uninitialized extent
+	 */
+	if (map->m_len > EXT_UNINIT_MAX_LEN)
+		map->m_len = EXT_UNINIT_MAX_LEN / 2;
+
+	ret = ext4_convert_initialized_extents(handle, inode, map,
+						path);
+	if (ret >= 0) {
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+		err = check_eofblocks_fl(handle, inode, map->m_lblk,
+					 path, map->m_len);
+	} else
+		err = ret;
+	map->m_flags |= EXT4_MAP_UNWRITTEN;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_len = allocated;
+
+	return err ? err : allocated;
+}
+
+static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
 			struct ext4_ext_path *path, int flags,
@@ -3877,8 +4008,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* get_block() before submit the IO, split the extent */
 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-		ret = ext4_split_unwritten_extents(handle, inode, map,
-						   path, flags);
+		ret = ext4_split_convert_extents(handle, inode, map,
+					 path, flags | EXT4_GET_BLOCKS_CONVERT);
 		if (ret <= 0)
 			goto out;
 		/*
@@ -3993,10 +4124,6 @@ out1:
 	map->m_pblk = newblock;
 	map->m_len = allocated;
 out2:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
 	return err ? err : allocated;
 }
 
@@ -4128,7 +4255,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_extent newex, *ex, *ex2;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_fsblk_t newblock = 0;
-	int free_on_err = 0, err = 0, depth;
+	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
 	unsigned int allocated_clusters = 0;
 	struct ext4_allocation_request ar;
@@ -4170,6 +4297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
 		unsigned short ee_len;
 
+
 		/*
 		 * Uninitialized extents are treated as holes, except that
 		 * we split out initialized portions during a write.
@@ -4186,13 +4314,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			if (!ext4_ext_is_uninitialized(ex))
+			/*
+			 * If the extent is initialized check whether the
+			 * caller wants to convert it to unwritten.
+			 */
+			if ((!ext4_ext_is_uninitialized(ex)) &&
+			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+				allocated = ext4_ext_convert_initialized_extent(
+						handle, inode, map, path, flags,
+						allocated, newblock);
+				goto out2;
+			} else if (!ext4_ext_is_uninitialized(ex))
 				goto out;
 
-			allocated = ext4_ext_handle_uninitialized_extents(
+			ret = ext4_ext_handle_uninitialized_extents(
 				handle, inode, map, path, flags,
 				allocated, newblock);
-			goto out3;
+			if (ret < 0)
+				err = ret;
+			else
+				allocated = ret;
+			goto out2;
 		}
 	}
4198 4340
@@ -4473,7 +4615,6 @@ out2:
 		kfree(path);
 	}
 
-out3:
 	trace_ext4_ext_map_blocks_exit(inode, flags, map,
 				       err ? err : allocated);
 	ext4_es_lru_add(inode);
@@ -4514,34 +4655,200 @@ retry:
 	ext4_std_error(inode->i_sb, err);
 }
 
-static void ext4_falloc_update_inode(struct inode *inode,
-				     int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+				  ext4_lblk_t len, int flags, int mode)
 {
-	struct timespec now;
+	struct inode *inode = file_inode(file);
+	handle_t *handle;
+	int ret = 0;
+	int ret2 = 0;
+	int retries = 0;
+	struct ext4_map_blocks map;
+	unsigned int credits;
 
-	if (update_ctime) {
-		now = current_fs_time(inode->i_sb);
-		if (!timespec_equal(&inode->i_ctime, &now))
-			inode->i_ctime = now;
+	map.m_lblk = offset;
+	/*
+	 * Don't normalize the request if it can fit in one extent so
+	 * that it doesn't get unnecessarily split into multiple
+	 * extents.
+	 */
+	if (len <= EXT_UNINIT_MAX_LEN)
+		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+	/*
+	 * credits to insert 1 extent into extent tree
+	 */
+	credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+	while (ret >= 0 && ret < len) {
+		map.m_lblk = map.m_lblk + ret;
+		map.m_len = len = len - ret;
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+		ret = ext4_map_blocks(handle, inode, &map, flags);
+		if (ret <= 0) {
+			ext4_debug("inode #%lu: block %u: len %u: "
+				   "ext4_ext_map_blocks returned %d",
+				   inode->i_ino, map.m_lblk,
+				   map.m_len, ret);
+			ext4_mark_inode_dirty(handle, inode);
+			ret2 = ext4_journal_stop(handle);
+			break;
+		}
+		ret2 = ext4_journal_stop(handle);
+		if (ret2)
+			break;
+	}
+	if (ret == -ENOSPC &&
+	    ext4_should_retry_alloc(inode->i_sb, &retries)) {
+		ret = 0;
+		goto retry;
 	}
+
+	return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+			    loff_t len, int mode)
+{
+	struct inode *inode = file_inode(file);
+	handle_t *handle = NULL;
+	unsigned int max_blocks;
+	loff_t new_size = 0;
+	int ret = 0;
+	int flags;
+	int partial;
+	loff_t start, end;
+	ext4_lblk_t lblk;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned int blkbits = inode->i_blkbits;
+
+	trace_ext4_zero_range(inode, offset, len, mode);
+
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + len - 1);
+		if (ret)
+			return ret;
+	}
+
 	/*
-	 * Update only when preallocation was requested beyond
-	 * the file size.
+	 * Round up offset. This is not fallocate, we neet to zero out
+	 * blocks, so convert interior block aligned part of the range to
+	 * unwritten and possibly manually zero out unaligned parts of the
+	 * range.
 	 */
-	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+	start = round_up(offset, 1 << blkbits);
+	end = round_down((offset + len), 1 << blkbits);
+
+	if (start < offset || end > offset + len)
+		return -EINVAL;
+	partial = (offset + len) & ((1 << blkbits) - 1);
+
+	lblk = start >> blkbits;
+	max_blocks = (end >> blkbits);
+	if (max_blocks < lblk)
+		max_blocks = 0;
+	else
+		max_blocks -= lblk;
+
+	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Indirect files do not support unwritten extnets
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out_mutex;
+		/*
+		 * If we have a partial block after EOF we have to allocate
+		 * the entire block.
+		 */
+		if (partial)
+			max_blocks += 1;
+	}
+
+	if (max_blocks > 0) {
+
+		/* Now release the pages and zero block aligned part of pages*/
+		truncate_pagecache_range(inode, start, end - 1);
+
+		/* Wait all existing dio workers, newcomers will block on i_mutex */
+		ext4_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+
+		/*
+		 * Remove entire range from the extent status tree.
+		 */
+		ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+		if (ret)
+			goto out_dio;
+
+		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+					     mode);
+		if (ret)
+			goto out_dio;
+	}
+
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, ret);
+		goto out_dio;
+	}
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+	if (new_size) {
 		if (new_size > i_size_read(inode))
 			i_size_write(inode, new_size);
 		if (new_size > EXT4_I(inode)->i_disksize)
 			ext4_update_i_disksize(inode, new_size);
 	} else {
 		/*
 		 * Mark that we allocate beyond EOF so the subsequent truncate
 		 * can proceed even if the new size is the same as i_size.
 		 */
-		if (new_size > i_size_read(inode))
+		if ((offset + len) > i_size_read(inode))
 			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 
+	ext4_mark_inode_dirty(handle, inode);
+
+	/* Zero out partial block at the edges of the range */
+	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 /*
@@ -4555,22 +4862,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	handle_t *handle;
-	loff_t new_size;
+	loff_t new_size = 0;
 	unsigned int max_blocks;
 	int ret = 0;
-	int ret2 = 0;
-	int retries = 0;
 	int flags;
-	struct ext4_map_blocks map;
-	unsigned int credits, blkbits = inode->i_blkbits;
+	ext4_lblk_t lblk;
+	struct timespec tv;
+	unsigned int blkbits = inode->i_blkbits;
 
 	/* Return error if mode is not supported */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		return ext4_punch_hole(inode, offset, len);
 
+	if (mode & FALLOC_FL_COLLAPSE_RANGE)
+		return ext4_collapse_range(inode, offset, len);
+
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
 		return ret;
@@ -4582,83 +4892,66 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_ZERO_RANGE)
+		return ext4_zero_range(file, offset, len, mode);
+
 	trace_ext4_fallocate_enter(inode, offset, len, mode);
-	map.m_lblk = offset >> blkbits;
+	lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-		- map.m_lblk;
-	/*
-	 * credits to insert 1 extent into extent tree
-	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
-	mutex_lock(&inode->i_mutex);
-	ret = inode_newsize_ok(inode, (len + offset));
-	if (ret) {
-		mutex_unlock(&inode->i_mutex);
-		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-		return ret;
-	}
+		- lblk;
+
 	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
-	/*
-	 * Don't normalize the request if it can fit in one extent so
-	 * that it doesn't get unnecessarily split into multiple
-	 * extents.
-	 */
-	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
-		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-retry:
-	while (ret >= 0 && ret < max_blocks) {
-		map.m_lblk = map.m_lblk + ret;
-		map.m_len = max_blocks = max_blocks - ret;
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-					    credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
-		}
-		ret = ext4_map_blocks(handle, inode, &map, flags);
-		if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
-			ext4_warning(inode->i_sb,
-				     "inode #%lu: block %u: len %u: "
-				     "ext4_ext_map_blocks returned %d",
-				     inode->i_ino, map.m_lblk,
-				     map.m_len, ret);
-#endif
-			ext4_mark_inode_dirty(handle, inode);
-			ret2 = ext4_journal_stop(handle);
-			break;
-		}
-		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
-						blkbits) >> blkbits))
-			new_size = offset + len;
-		else
-			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+	mutex_lock(&inode->i_mutex);
 
-		ext4_falloc_update_inode(inode, mode, new_size,
-					 (map.m_flags & EXT4_MAP_NEW));
-		ext4_mark_inode_dirty(handle, inode);
-		if ((file->f_flags & O_SYNC) && ret >= max_blocks)
-			ext4_handle_sync(handle);
-		ret2 = ext4_journal_stop(handle);
-		if (ret2)
-			break;
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out;
 	}
-	if (ret == -ENOSPC &&
-	    ext4_should_retry_alloc(inode->i_sb, &retries)) {
-		ret = 0;
-		goto retry;
+
+	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+	if (ret)
+		goto out;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle))
+		goto out;
+
+	tv = inode->i_ctime = ext4_current_time(inode);
+
+	if (new_size) {
+		if (new_size > i_size_read(inode)) {
+			i_size_write(inode, new_size);
+			inode->i_mtime = tv;
+		}
+		if (new_size > EXT4_I(inode)->i_disksize)
+			ext4_update_i_disksize(inode, new_size);
+	} else {
+		/*
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if ((offset + len) > i_size_read(inode))
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
+	ext4_mark_inode_dirty(handle, inode);
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+out:
 	mutex_unlock(&inode->i_mutex);
-	trace_ext4_fallocate_exit(inode, offset, max_blocks,
-				  ret > 0 ? ret2 : ret);
-	return ret > 0 ? ret2 : ret;
+	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+	return ret;
 }
 
 /*
@@ -4869,3 +5162,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	ext4_es_lru_add(inode);
 	return error;
 }
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *path)
+{
+	int credits, err;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	/*
+	 * Check if need to extend journal credits
+	 * 3 for leaf, sb, and inode plus 2 (bmap and group
+	 * descriptor) for each block group; assume two block
+	 * groups
+	 */
+	if (handle->h_buffer_credits < 7) {
+		credits = ext4_writepage_trans_blocks(inode);
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+		/* EAGAIN is success */
+		if (err && err != -EAGAIN)
+			return err;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path);
+	return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+			    struct inode *inode, handle_t *handle,
+			    ext4_lblk_t *start)
+{
+	int depth, err = 0;
+	struct ext4_extent *ex_start, *ex_last;
+	bool update = 0;
+	depth = path->p_depth;
+
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			ex_start = path[depth].p_ext;
+			if (!ex_start)
+				return -EIO;
+
+			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+			if (!ex_last)
+				return -EIO;
+
+			err = ext4_access_path(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+				update = 1;
+
+			*start = ex_last->ee_block +
+				ext4_ext_get_actual_len(ex_last);
+
+			while (ex_start <= ex_last) {
+				ex_start->ee_block -= shift;
+				if (ex_start >
+					EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+					if (ext4_ext_try_to_merge_right(inode,
+						path, ex_start - 1))
+						ex_last--;
+				}
+				ex_start++;
+			}
+			err = ext4_ext_dirty(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			if (--depth < 0 || !update)
+				break;
+		}
+
+		/* Update index too */
+		err = ext4_access_path(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		path[depth].p_idx->ei_block -= shift;
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		/* we are done if current index is not a starting index */
+		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+			break;
+
+		depth--;
+	}
+
+out:
+	return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+		       ext4_lblk_t start, ext4_lblk_t shift)
+{
+	struct ext4_ext_path *path;
+	int ret = 0, depth;
+	struct ext4_extent *extent;
+	ext4_lblk_t stop_block, current_block;
+	ext4_lblk_t ex_start, ex_end;
+
+	/* Let path point to the last extent */
+	path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	depth = path->p_depth;
+	extent = path[depth].p_ext;
+	if (!extent) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		return ret;
+	}
+
+	stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	/* Nothing to shift, if hole is at the end of file */
+	if (start >= stop_block)
+		return ret;
+
+	/*
+	 * Don't start shifting extents until we make sure the hole is big
+	 * enough to accomodate the shift.
+	 */
+	path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+	depth = path->p_depth;
+	extent = path[depth].p_ext;
+	ex_start = extent->ee_block;
+	ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	if ((start == ex_start && shift > ex_start) ||
+	    (shift > start - ex_end))
+		return -EINVAL;
+
+	/* Its safe to start updating extents */
+	while (start < stop_block) {
+		path = ext4_ext_find_extent(inode, start, NULL, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = path->p_depth;
+		extent = path[depth].p_ext;
+		current_block = extent->ee_block;
+		if (start > current_block) {
+			/* Hole, move to the next extent */
+			ret = mext_next_extent(inode, path, &extent);
+			if (ret != 0) {
+				ext4_ext_drop_refs(path);
+				kfree(path);
+				if (ret == 1)
+					ret = 0;
+				break;
+			}
+		}
+		ret = ext4_ext_shift_path_extents(path, shift, inode,
+						  handle, &start);
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t punch_start, punch_stop;
+	handle_t *handle;
+	unsigned int credits;
+	loff_t new_size;
+	int ret;
+
+	BUG_ON(offset + len > i_size_read(inode));
+
+	/* Collapse range works only on fs block size aligned offsets. */
+	if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+	    len & (EXT4_BLOCK_SIZE(sb) - 1))
+		return -EINVAL;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	trace_ext4_collapse_range(inode, offset, len);
+
+	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
+	if (ret)
+		return ret;
+
+	/* Take mutex lock */
+	mutex_lock(&inode->i_mutex);
+
+	/* It's not possible punch hole on append only file */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+		ret = -EPERM;
+		goto out_mutex;
+	}
+
+	if (IS_SWAPFILE(inode)) {
+		ret = -ETXTBSY;
+		goto out_mutex;
+	}
+
+	/* Currently just for extent based files */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	truncate_pagecache_range(inode, offset, -1);
+
+	/* Wait for existing dio to complete */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	credits = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_dio;
+	}
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_es_remove_extent(inode, punch_start,
+				    EXT_MAX_BLOCKS - punch_start - 1);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+				     punch_stop - punch_start);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	new_size = i_size_read(inode) - len;
+	truncate_setsize(inode, new_size);
+	EXT4_I(inode)->i_disksize = new_size;
+
+	ext4_discard_preallocations(inode);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
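
For reference, the two new fallocate modes wired up above are driven from userspace through fallocate(2). A usage sketch, not from the patch: COLLAPSE_RANGE requires offset and len aligned to the filesystem block size and a range that ends before EOF, while ZERO_RANGE accepts unaligned edges (ext4 zeroes the partial blocks manually).

	/* Userspace sketch: exercise both new modes on an ext4 file.
	 * Error handling beyond the return codes is elided. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>

	static int zero_then_collapse(int fd, off_t blksz)
	{
		/* turn 16 blocks starting at block 4 into zero-filled space */
		if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4 * blksz, 16 * blksz) < 0)
			return -1;
		/* drop 8 of those blocks and shift the file's tail down */
		return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4 * blksz, 8 * blksz);
	}
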
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff783950..0a014a7194b2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
 	while (node) {
 		struct extent_status *es;
 		es = rb_entry(node, struct extent_status, rb_node);
-		printk(KERN_DEBUG " [%u/%u) %llu %llx",
+		printk(KERN_DEBUG " [%u/%u) %llu %x",
 		       es->es_lblk, es->es_len,
 		       ext4_es_pblock(es), ext4_es_status(es));
 		node = rb_next(node);
@@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 			pr_warn("ES insert assertion failed for "
 				"inode: %lu we can find an extent "
 				"at block [%d/%d/%llu/%c], but we "
-				"want to add an delayed/hole extent "
-				"[%d/%d/%llu/%llx]\n",
+				"want to add a delayed/hole extent "
+				"[%d/%d/%llu/%x]\n",
 				inode->i_ino, ee_block, ee_len,
 				ee_start, ee_status ? 'u' : 'w',
 				es->es_lblk, es->es_len,
@@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"can't find an extent at block %d but we want "
-				"to add an written/unwritten extent "
-				"[%d/%d/%llu/%llx]\n", inode->i_ino,
+				"to add a written/unwritten extent "
+				"[%d/%d/%llu/%x]\n", inode->i_ino,
 				es->es_lblk, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 		}
@@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 	 */
 	pr_warn("ES insert assertion failed for inode: %lu "
 		"We can find blocks but we want to add a "
-		"delayed/hole extent [%d/%d/%llu/%llx]\n",
+		"delayed/hole extent [%d/%d/%llu/%x]\n",
 		inode->i_ino, es->es_lblk, es->es_len,
 		ext4_es_pblock(es), ext4_es_status(es));
 	return;
@@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 	if (ext4_es_is_written(es)) {
 		pr_warn("ES insert assertion failed for inode: %lu "
 			"We can't find the block but we want to add "
-			"an written extent [%d/%d/%llu/%llx]\n",
+			"a written extent [%d/%d/%llu/%x]\n",
 			inode->i_ino, es->es_lblk, es->es_len,
 			ext4_es_pblock(es), ext4_es_status(es));
 		return;
@@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 
 	newes.es_lblk = lblk;
 	newes.es_len = len;
-	ext4_es_store_pblock(&newes, pblk);
-	ext4_es_store_status(&newes, status);
+	ext4_es_store_pblock_status(&newes, pblk, status);
 	trace_ext4_es_insert_extent(inode, &newes);
 
 	ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
 
 	newes.es_lblk = lblk;
 	newes.es_len = len;
-	ext4_es_store_pblock(&newes, pblk);
-	ext4_es_store_status(&newes, status);
+	ext4_es_store_pblock_status(&newes, pblk, status);
 	trace_ext4_es_cache_extent(inode, &newes);
 
 	if (!len)
@@ -812,13 +810,13 @@ retry:
812 810
813 newes.es_lblk = end + 1; 811 newes.es_lblk = end + 1;
814 newes.es_len = len2; 812 newes.es_len = len2;
813 block = 0x7FDEADBEEF;
815 if (ext4_es_is_written(&orig_es) || 814 if (ext4_es_is_written(&orig_es) ||
816 ext4_es_is_unwritten(&orig_es)) { 815 ext4_es_is_unwritten(&orig_es))
817 block = ext4_es_pblock(&orig_es) + 816 block = ext4_es_pblock(&orig_es) +
818 orig_es.es_len - len2; 817 orig_es.es_len - len2;
819 ext4_es_store_pblock(&newes, block); 818 ext4_es_store_pblock_status(&newes, block,
820 } 819 ext4_es_status(&orig_es));
821 ext4_es_store_status(&newes, ext4_es_status(&orig_es));
822 err = __es_insert_extent(inode, &newes); 820 err = __es_insert_extent(inode, &newes);
823 if (err) { 821 if (err) {
824 es->es_lblk = orig_es.es_lblk; 822 es->es_lblk = orig_es.es_lblk;
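Note on the split above: when a hole is punched out of the middle of a cached extent, the surviving right-hand piece keeps the tail of the original physical range, so its start block is pblock(orig) + orig_len - len2; the new 0x7FDEADBEEF assignment poisons the variable first so any path that forgets to set it shows up immediately. A self-contained sketch of that arithmetic (the values are illustrative, not from the kernel):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* original cached extent: logical [100..149] mapped to pblk 9000 */
    uint64_t orig_lblk = 100, orig_len = 50, orig_pblk = 9000;
    uint64_t end = 120;                               /* last logical block removed */

    uint64_t len2 = orig_lblk + orig_len - 1 - end;   /* length of the right piece */
    uint64_t new_lblk = end + 1;
    uint64_t new_pblk = orig_pblk + orig_len - len2;  /* tail of the physical range */

    printf("right piece: lblk=%llu len=%llu pblk=%llu\n",
           (unsigned long long)new_lblk, (unsigned long long)len2,
           (unsigned long long)new_pblk);             /* prints 121/29/9021 */
    return 0;
}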
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc3..f1b62a419920 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
129 (es->es_pblk & ~ES_MASK)); 129 (es->es_pblk & ~ES_MASK));
130} 130}
131 131
132static inline void ext4_es_store_pblock_status(struct extent_status *es,
133 ext4_fsblk_t pb,
134 unsigned int status)
135{
136 es->es_pblk = (((ext4_fsblk_t)
137 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
138 (pb & ~ES_MASK));
139}
140
132extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 141extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
133extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 142extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
134extern void ext4_es_lru_add(struct inode *inode); 143extern void ext4_es_lru_add(struct inode *inode);
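The new ext4_es_store_pblock_status helper above folds what used to be two stores (pblock, then status) into a single assignment to es_pblk, so the field is never observed with only one half updated. The packing is plain bit arithmetic: the status flags live in the bits above ES_SHIFT, the physical block in the bits below. A minimal userspace sketch, with ES_SHIFT and the flag width assumed for illustration rather than taken from the kernel headers:

#include <stdint.h>
#include <stdio.h>

#define ES_SHIFT      60                          /* assumed: status in the top bits */
#define ES_MASK       (~((1ULL << ES_SHIFT) - 1))
#define STATUS_FLAGS  0xfULL                      /* assumed width of the status field */

static uint64_t store_pblock_status(uint64_t pblk, unsigned int status)
{
    /* one store: high bits carry the status, low bits carry the block */
    return ((uint64_t)(status & STATUS_FLAGS) << ES_SHIFT) | (pblk & ~ES_MASK);
}

static unsigned int get_status(uint64_t es_pblk) { return es_pblk >> ES_SHIFT; }
static uint64_t get_pblock(uint64_t es_pblk)     { return es_pblk & ~ES_MASK; }

int main(void)
{
    uint64_t es_pblk = store_pblock_status(123456, 0x8);
    printf("pblk=%llu status=%#x\n",
           (unsigned long long)get_pblock(es_pblk), get_status(es_pblk));
    return 0;
}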
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 175c3f933816..5b0d2c7d5408 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
504{ 504{
505 struct extent_status es; 505 struct extent_status es;
506 int retval; 506 int retval;
507 int ret = 0;
507#ifdef ES_AGGRESSIVE_TEST 508#ifdef ES_AGGRESSIVE_TEST
508 struct ext4_map_blocks orig_map; 509 struct ext4_map_blocks orig_map;
509 510
@@ -515,6 +516,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
515 "logical block %lu\n", inode->i_ino, flags, map->m_len, 516 "logical block %lu\n", inode->i_ino, flags, map->m_len,
516 (unsigned long) map->m_lblk); 517 (unsigned long) map->m_lblk);
517 518
519 /*
520 * ext4_map_blocks returns an int, and m_len is an unsigned int
521 */
522 if (unlikely(map->m_len > INT_MAX))
523 map->m_len = INT_MAX;
524
518 /* Lookup extent status tree firstly */ 525 /* Lookup extent status tree firstly */
519 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 526 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
520 ext4_es_lru_add(inode); 527 ext4_es_lru_add(inode);
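The INT_MAX clamp above exists because map->m_len is an unsigned int while ext4_map_blocks returns the mapped length as an int: a request longer than INT_MAX would wrap negative on return and read as an error. A toy illustration of the hazard and the fix (the mapper here is a stand-in, not the real function):

#include <limits.h>
#include <stdio.h>

/* Stand-in for a mapper that reports the mapped length as int. */
static int map_blocks(unsigned int *m_len)
{
    if (*m_len > INT_MAX)       /* the clamp added by the patch */
        *m_len = INT_MAX;
    return (int)*m_len;         /* safe: can no longer wrap negative */
}

int main(void)
{
    unsigned int len = 3000000000u;   /* larger than INT_MAX */
    printf("mapped %d blocks\n", map_blocks(&len));
    return 0;
}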
@@ -553,7 +560,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
553 EXT4_GET_BLOCKS_KEEP_SIZE); 560 EXT4_GET_BLOCKS_KEEP_SIZE);
554 } 561 }
555 if (retval > 0) { 562 if (retval > 0) {
556 int ret;
557 unsigned int status; 563 unsigned int status;
558 564
559 if (unlikely(retval != map->m_len)) { 565 if (unlikely(retval != map->m_len)) {
@@ -580,7 +586,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
580 586
581found: 587found:
582 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 588 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
583 int ret = check_block_validity(inode, map); 589 ret = check_block_validity(inode, map);
584 if (ret != 0) 590 if (ret != 0)
585 return ret; 591 return ret;
586 } 592 }
@@ -597,7 +603,13 @@ found:
597 * with buffer head unmapped. 603 * with buffer head unmapped.
598 */ 604 */
599 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 605 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
600 return retval; 606 /*
607 * If we need to convert extent to unwritten
608 * we continue and do the actual work in
609 * ext4_ext_map_blocks()
610 */
611 if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
612 return retval;
601 613
602 /* 614 /*
603 * Here we clear m_flags because after allocating an new extent, 615 * Here we clear m_flags because after allocating an new extent,
@@ -653,7 +665,6 @@ found:
653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 665 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
654 666
655 if (retval > 0) { 667 if (retval > 0) {
656 int ret;
657 unsigned int status; 668 unsigned int status;
658 669
659 if (unlikely(retval != map->m_len)) { 670 if (unlikely(retval != map->m_len)) {
@@ -688,7 +699,7 @@ found:
688has_zeroout: 699has_zeroout:
689 up_write((&EXT4_I(inode)->i_data_sem)); 700 up_write((&EXT4_I(inode)->i_data_sem));
690 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 701 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
691 int ret = check_block_validity(inode, map); 702 ret = check_block_validity(inode, map);
692 if (ret != 0) 703 if (ret != 0)
693 return ret; 704 return ret;
694 } 705 }
@@ -3313,33 +3324,13 @@ void ext4_set_aops(struct inode *inode)
3313} 3324}
3314 3325
3315/* 3326/*
3316 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3317 * up to the end of the block which corresponds to `from'.
3318 * This required during truncate. We need to physically zero the tail end
3319 * of that block so it doesn't yield old data if the file is later grown.
3320 */
3321int ext4_block_truncate_page(handle_t *handle,
3322 struct address_space *mapping, loff_t from)
3323{
3324 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3325 unsigned length;
3326 unsigned blocksize;
3327 struct inode *inode = mapping->host;
3328
3329 blocksize = inode->i_sb->s_blocksize;
3330 length = blocksize - (offset & (blocksize - 1));
3331
3332 return ext4_block_zero_page_range(handle, mapping, from, length);
3333}
3334
3335/*
3336 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3327 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3337 * starting from file offset 'from'. The range to be zero'd must 3328 * starting from file offset 'from'. The range to be zero'd must
3338 * be contained with in one block. If the specified range exceeds 3329 * be contained with in one block. If the specified range exceeds
3339 * the end of the block it will be shortened to end of the block 3330 * the end of the block it will be shortened to end of the block
3340 * that cooresponds to 'from' 3331 * that cooresponds to 'from'
3341 */ 3332 */
3342int ext4_block_zero_page_range(handle_t *handle, 3333static int ext4_block_zero_page_range(handle_t *handle,
3343 struct address_space *mapping, loff_t from, loff_t length) 3334 struct address_space *mapping, loff_t from, loff_t length)
3344{ 3335{
3345 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3336 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3429,6 +3420,26 @@ unlock:
3429 return err; 3420 return err;
3430} 3421}
3431 3422
3423/*
3424 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3425 * up to the end of the block which corresponds to `from'.
3426 * This required during truncate. We need to physically zero the tail end
3427 * of that block so it doesn't yield old data if the file is later grown.
3428 */
3429int ext4_block_truncate_page(handle_t *handle,
3430 struct address_space *mapping, loff_t from)
3431{
3432 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3433 unsigned length;
3434 unsigned blocksize;
3435 struct inode *inode = mapping->host;
3436
3437 blocksize = inode->i_sb->s_blocksize;
3438 length = blocksize - (offset & (blocksize - 1));
3439
3440 return ext4_block_zero_page_range(handle, mapping, from, length);
3441}
3442
3432int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3443int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3433 loff_t lstart, loff_t length) 3444 loff_t lstart, loff_t length)
3434{ 3445{
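The relocated ext4_block_truncate_page is now a thin wrapper that computes the tail range and delegates to the (newly static) ext4_block_zero_page_range: the length is the distance from the truncate point to the end of its containing block. The arithmetic, runnable on its own (a 4 KiB page and 1 KiB blocks are assumed here):

#include <stdio.h>

#define PAGE_SIZE 4096u

int main(void)
{
    unsigned blocksize = 1024;
    unsigned long long from = 10500;          /* truncate point in the file */

    unsigned offset = from & (PAGE_SIZE - 1); /* offset within the page */
    unsigned length = blocksize - (offset & (blocksize - 1));

    /* zeroes [10500, 11264): exactly up to the end of the 1 KiB block */
    printf("zero %u bytes starting at offset %llu\n", length, from);
    return 0;
}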
@@ -3502,7 +3513,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3502 if (!S_ISREG(inode->i_mode)) 3513 if (!S_ISREG(inode->i_mode))
3503 return -EOPNOTSUPP; 3514 return -EOPNOTSUPP;
3504 3515
3505 trace_ext4_punch_hole(inode, offset, length); 3516 trace_ext4_punch_hole(inode, offset, length, 0);
3506 3517
3507 /* 3518 /*
3508 * Write out all dirty pages to avoid race conditions 3519 * Write out all dirty pages to avoid race conditions
@@ -3609,6 +3620,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3609 up_write(&EXT4_I(inode)->i_data_sem); 3620 up_write(&EXT4_I(inode)->i_data_sem);
3610 if (IS_SYNC(inode)) 3621 if (IS_SYNC(inode))
3611 ext4_handle_sync(handle); 3622 ext4_handle_sync(handle);
3623
3624 /* Now release the pages again to reduce race window */
3625 if (last_block_offset > first_block_offset)
3626 truncate_pagecache_range(inode, first_block_offset,
3627 last_block_offset);
3628
3612 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3629 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3613 ext4_mark_inode_dirty(handle, inode); 3630 ext4_mark_inode_dirty(handle, inode);
3614out_stop: 3631out_stop:
@@ -3682,7 +3699,7 @@ void ext4_truncate(struct inode *inode)
3682 3699
3683 /* 3700 /*
3684 * There is a possibility that we're either freeing the inode 3701 * There is a possibility that we're either freeing the inode
3685 * or it completely new indode. In those cases we might not 3702 * or it's a completely new inode. In those cases we might not
3686 * have i_mutex locked because it's not necessary. 3703 * have i_mutex locked because it's not necessary.
3687 */ 3704 */
3688 if (!(inode->i_state & (I_NEW|I_FREEING))) 3705 if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,8 +3951,8 @@ void ext4_set_inode_flags(struct inode *inode)
3934 new_fl |= S_NOATIME; 3951 new_fl |= S_NOATIME;
3935 if (flags & EXT4_DIRSYNC_FL) 3952 if (flags & EXT4_DIRSYNC_FL)
3936 new_fl |= S_DIRSYNC; 3953 new_fl |= S_DIRSYNC;
3937 set_mask_bits(&inode->i_flags, 3954 inode_set_flags(inode, new_fl,
3938 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); 3955 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3939} 3956}
3940 3957
3941/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3958/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4154,11 +4171,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4154 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4171 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4155 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4172 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4156 4173
4157 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4174 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4158 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4175 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4159 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4176 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4160 inode->i_version |= 4177 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4161 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4178 inode->i_version |=
4179 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4180 }
4162 } 4181 }
4163 4182
4164 ret = 0; 4183 ret = 0;
@@ -4328,8 +4347,7 @@ static int ext4_do_update_inode(handle_t *handle,
4328 goto out_brelse; 4347 goto out_brelse;
4329 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4348 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4330 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4349 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
4331 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4350 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
4332 cpu_to_le32(EXT4_OS_HURD))
4333 raw_inode->i_file_acl_high = 4351 raw_inode->i_file_acl_high =
4334 cpu_to_le16(ei->i_file_acl >> 32); 4352 cpu_to_le16(ei->i_file_acl >> 32);
4335 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4353 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4374,12 +4392,15 @@ static int ext4_do_update_inode(handle_t *handle,
4374 raw_inode->i_block[block] = ei->i_data[block]; 4392 raw_inode->i_block[block] = ei->i_data[block];
4375 } 4393 }
4376 4394
4377 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4395 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4378 if (ei->i_extra_isize) { 4396 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4379 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4397 if (ei->i_extra_isize) {
4380 raw_inode->i_version_hi = 4398 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4381 cpu_to_le32(inode->i_version >> 32); 4399 raw_inode->i_version_hi =
4382 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4400 cpu_to_le32(inode->i_version >> 32);
4401 raw_inode->i_extra_isize =
4402 cpu_to_le16(ei->i_extra_isize);
4403 }
4383 } 4404 }
4384 4405
4385 ext4_inode_csum_set(inode, raw_inode, ei); 4406 ext4_inode_csum_set(inode, raw_inode, ei);
@@ -4446,7 +4467,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4446 return -EIO; 4467 return -EIO;
4447 } 4468 }
4448 4469
4449 if (wbc->sync_mode != WB_SYNC_ALL) 4470 /*
4471 * No need to force transaction in WB_SYNC_NONE mode. Also
4472 * ext4_sync_fs() will force the commit after everything is
4473 * written.
4474 */
4475 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
4450 return 0; 4476 return 0;
4451 4477
4452 err = ext4_force_commit(inode->i_sb); 4478 err = ext4_force_commit(inode->i_sb);
@@ -4456,7 +4482,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4456 err = __ext4_get_inode_loc(inode, &iloc, 0); 4482 err = __ext4_get_inode_loc(inode, &iloc, 0);
4457 if (err) 4483 if (err)
4458 return err; 4484 return err;
4459 if (wbc->sync_mode == WB_SYNC_ALL) 4485 /*
4486 * sync(2) will flush the whole buffer cache. No need to do
4487 * it here separately for each inode.
4488 */
4489 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4460 sync_dirty_buffer(iloc.bh); 4490 sync_dirty_buffer(iloc.bh);
4461 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 4491 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
4462 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 4492 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
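Both ext4_write_inode hunks above test the same new predicate: per-inode flushing only pays off for WB_SYNC_ALL writeback that was not initiated by sync(2), since ext4_sync_fs() will force the commit and sync(2) flushes the whole buffer cache anyway. Paraphrased as a pure function (a restatement for clarity, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum { WB_SYNC_NONE, WB_SYNC_ALL };
struct wbc { int sync_mode; bool for_sync; };

/* Both call sites reduce to this one test. */
static bool per_inode_flush_needed(const struct wbc *wbc)
{
    return wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync;
}

int main(void)
{
    struct wbc fsync_wb = { WB_SYNC_ALL, false };
    struct wbc sync2_wb = { WB_SYNC_ALL, true };
    printf("fsync: %d, sync(2): %d\n",
           per_inode_flush_needed(&fsync_wb),
           per_inode_flush_needed(&sync2_wb));   /* prints 1, 0 */
    return 0;
}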
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2a837f00407..0f2252ec274d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
104 struct ext4_inode_info *ei_bl; 104 struct ext4_inode_info *ei_bl;
105 struct ext4_sb_info *sbi = EXT4_SB(sb); 105 struct ext4_sb_info *sbi = EXT4_SB(sb);
106 106
107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { 107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
108 err = -EINVAL; 108 return -EINVAL;
109 goto swap_boot_out;
110 }
111 109
112 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { 110 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
113 err = -EPERM; 111 return -EPERM;
114 goto swap_boot_out;
115 }
116 112
117 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); 113 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
118 if (IS_ERR(inode_bl)) { 114 if (IS_ERR(inode_bl))
119 err = PTR_ERR(inode_bl); 115 return PTR_ERR(inode_bl);
120 goto swap_boot_out;
121 }
122 ei_bl = EXT4_I(inode_bl); 116 ei_bl = EXT4_I(inode_bl);
123 117
124 filemap_flush(inode->i_mapping); 118 filemap_flush(inode->i_mapping);
@@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
193 ext4_mark_inode_dirty(handle, inode); 187 ext4_mark_inode_dirty(handle, inode);
194 } 188 }
195 } 189 }
196
197 ext4_journal_stop(handle); 190 ext4_journal_stop(handle);
198
199 ext4_double_up_write_data_sem(inode, inode_bl); 191 ext4_double_up_write_data_sem(inode, inode_bl);
200 192
201journal_err_out: 193journal_err_out:
202 ext4_inode_resume_unlocked_dio(inode); 194 ext4_inode_resume_unlocked_dio(inode);
203 ext4_inode_resume_unlocked_dio(inode_bl); 195 ext4_inode_resume_unlocked_dio(inode_bl);
204
205 unlock_two_nondirectories(inode, inode_bl); 196 unlock_two_nondirectories(inode, inode_bl);
206
207 iput(inode_bl); 197 iput(inode_bl);
208
209swap_boot_out:
210 return err; 198 return err;
211} 199}
212 200
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 04a5c7504be9..a888cac76e9c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1808 ext4_lock_group(ac->ac_sb, group); 1808 ext4_lock_group(ac->ac_sb, group);
1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1810 ac->ac_g_ex.fe_len, &ex); 1810 ac->ac_g_ex.fe_len, &ex);
1811 ex.fe_logical = 0xDEADFA11; /* debug value */
1811 1812
1812 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1813 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1813 ext4_fsblk_t start; 1814 ext4_fsblk_t start;
@@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1936 */ 1937 */
1937 break; 1938 break;
1938 } 1939 }
1939 1940 ex.fe_logical = 0xDEADC0DE; /* debug value */
1940 ext4_mb_measure_extent(ac, &ex, e4b); 1941 ext4_mb_measure_extent(ac, &ex, e4b);
1941 1942
1942 i += ex.fe_len; 1943 i += ex.fe_len;
@@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1977 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 1978 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1978 if (max >= sbi->s_stripe) { 1979 if (max >= sbi->s_stripe) {
1979 ac->ac_found++; 1980 ac->ac_found++;
1981 ex.fe_logical = 0xDEADF00D; /* debug value */
1980 ac->ac_b_ex = ex; 1982 ac->ac_b_ex = ex;
1981 ext4_mb_use_best_found(ac, e4b); 1983 ext4_mb_use_best_found(ac, e4b);
1982 break; 1984 break;
@@ -4006,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4006 (unsigned long)ac->ac_b_ex.fe_len, 4008 (unsigned long)ac->ac_b_ex.fe_len,
4007 (unsigned long)ac->ac_b_ex.fe_logical, 4009 (unsigned long)ac->ac_b_ex.fe_logical,
4008 (int)ac->ac_criteria); 4010 (int)ac->ac_criteria);
4009 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", 4011 ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4010 ac->ac_ex_scanned, ac->ac_found);
4011 ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); 4012 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4012 ngroups = ext4_get_groups_count(sb); 4013 ngroups = ext4_get_groups_count(sb);
4013 for (i = 0; i < ngroups; i++) { 4014 for (i = 0; i < ngroups; i++) {
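The three 0xDEAD... constants added to mballoc.c poison fe_logical on paths where it was previously left stale; any consumer that wrongly reads the field then reports an unmistakable sentinel instead of plausible-looking garbage. The pattern in miniature:

#include <stdio.h>

struct free_extent { unsigned long fe_logical; unsigned long fe_len; };

int main(void)
{
    struct free_extent ex;
    ex.fe_len = 8;
    ex.fe_logical = 0xDEADC0DE;   /* poison: no valid consumer of this field */

    /* a buggy reader now stands out in logs and crash dumps */
    printf("fe_logical=%#lx fe_len=%lu\n", ex.fe_logical, ex.fe_len);
    return 0;
}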
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd5..d634e183b4d4 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
48 } \ 48 } \
49 } while (0) 49 } while (0)
50#else 50#else
51#define mb_debug(n, fmt, a...) 51#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
52#endif 52#endif
53 53
54#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 54#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
175 /* copy of the best found extent taken before preallocation efforts */ 175 /* copy of the best found extent taken before preallocation efforts */
176 struct ext4_free_extent ac_f_ex; 176 struct ext4_free_extent ac_f_ex;
177 177
178 /* number of iterations done. we have to track to limit searching */
179 unsigned long ac_ex_scanned;
180 __u16 ac_groups_scanned; 178 __u16 ac_groups_scanned;
181 __u16 ac_found; 179 __u16 ac_found;
182 __u16 ac_tail; 180 __u16 ac_tail;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 773b503bd18c..58ee7dc87669 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
76 * ext4_ext_path structure refers to the last extent, or a negative error 76 * ext4_ext_path structure refers to the last extent, or a negative error
77 * value on failure. 77 * value on failure.
78 */ 78 */
79static int 79int
80mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 80mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
81 struct ext4_extent **extent) 81 struct ext4_extent **extent)
82{ 82{
@@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
861 } 861 }
862 if (!buffer_mapped(bh)) { 862 if (!buffer_mapped(bh)) {
863 zero_user(page, block_start, blocksize); 863 zero_user(page, block_start, blocksize);
864 if (!err) 864 set_buffer_uptodate(bh);
865 set_buffer_uptodate(bh);
866 continue; 865 continue;
867 } 866 }
868 } 867 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 710fed2377d4..f3c667091618 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
59static struct ext4_lazy_init *ext4_li_info; 59static struct ext4_lazy_init *ext4_li_info;
60static struct mutex ext4_li_mtx; 60static struct mutex ext4_li_mtx;
61static struct ext4_features *ext4_feat; 61static struct ext4_features *ext4_feat;
62static int ext4_mballoc_ready;
62 63
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 64static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 65 unsigned long journal_devnum);
@@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
845 invalidate_bdev(sbi->journal_bdev); 846 invalidate_bdev(sbi->journal_bdev);
846 ext4_blkdev_remove(sbi); 847 ext4_blkdev_remove(sbi);
847 } 848 }
849 if (sbi->s_mb_cache) {
850 ext4_xattr_destroy_cache(sbi->s_mb_cache);
851 sbi->s_mb_cache = NULL;
852 }
848 if (sbi->s_mmp_tsk) 853 if (sbi->s_mmp_tsk)
849 kthread_stop(sbi->s_mmp_tsk); 854 kthread_stop(sbi->s_mmp_tsk);
850 sb->s_fs_info = NULL; 855 sb->s_fs_info = NULL;
@@ -940,7 +945,7 @@ static void init_once(void *foo)
940 inode_init_once(&ei->vfs_inode); 945 inode_init_once(&ei->vfs_inode);
941} 946}
942 947
943static int init_inodecache(void) 948static int __init init_inodecache(void)
944{ 949{
945 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", 950 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
946 sizeof(struct ext4_inode_info), 951 sizeof(struct ext4_inode_info),
@@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3575 "feature flags set on rev 0 fs, " 3580 "feature flags set on rev 0 fs, "
3576 "running e2fsck is recommended"); 3581 "running e2fsck is recommended");
3577 3582
3583 if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3584 set_opt2(sb, HURD_COMPAT);
3585 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
3586 EXT4_FEATURE_INCOMPAT_64BIT)) {
3587 ext4_msg(sb, KERN_ERR,
3588 "The Hurd can't support 64-bit file systems");
3589 goto failed_mount;
3590 }
3591 }
3592
3578 if (IS_EXT2_SB(sb)) { 3593 if (IS_EXT2_SB(sb)) {
3579 if (ext2_feature_set_ok(sb)) 3594 if (ext2_feature_set_ok(sb))
3580 ext4_msg(sb, KERN_INFO, "mounting ext2 file system " 3595 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -4010,6 +4025,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4010 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); 4025 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
4011 4026
4012no_journal: 4027no_journal:
4028 if (ext4_mballoc_ready) {
4029 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
4030 if (!sbi->s_mb_cache) {
4031 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
4032 goto failed_mount_wq;
4033 }
4034 }
4035
4013 /* 4036 /*
4014 * Get the # of file system overhead blocks from the 4037 * Get the # of file system overhead blocks from the
4015 * superblock if present. 4038 * superblock if present.
@@ -4835,6 +4858,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4835 } 4858 }
4836 4859
4837 if (*flags & MS_RDONLY) { 4860 if (*flags & MS_RDONLY) {
4861 err = sync_filesystem(sb);
4862 if (err < 0)
4863 goto restore_opts;
4838 err = dquot_suspend(sb, -1); 4864 err = dquot_suspend(sb, -1);
4839 if (err < 0) 4865 if (err < 0)
4840 goto restore_opts; 4866 goto restore_opts;
@@ -5516,11 +5542,9 @@ static int __init ext4_init_fs(void)
5516 5542
5517 err = ext4_init_mballoc(); 5543 err = ext4_init_mballoc();
5518 if (err) 5544 if (err)
5519 goto out3;
5520
5521 err = ext4_init_xattr();
5522 if (err)
5523 goto out2; 5545 goto out2;
5546 else
5547 ext4_mballoc_ready = 1;
5524 err = init_inodecache(); 5548 err = init_inodecache();
5525 if (err) 5549 if (err)
5526 goto out1; 5550 goto out1;
@@ -5536,10 +5560,9 @@ out:
5536 unregister_as_ext3(); 5560 unregister_as_ext3();
5537 destroy_inodecache(); 5561 destroy_inodecache();
5538out1: 5562out1:
5539 ext4_exit_xattr(); 5563 ext4_mballoc_ready = 0;
5540out2:
5541 ext4_exit_mballoc(); 5564 ext4_exit_mballoc();
5542out3: 5565out2:
5543 ext4_exit_feat_adverts(); 5566 ext4_exit_feat_adverts();
5544out4: 5567out4:
5545 if (ext4_proc_root) 5568 if (ext4_proc_root)
@@ -5562,7 +5585,6 @@ static void __exit ext4_exit_fs(void)
5562 unregister_as_ext3(); 5585 unregister_as_ext3();
5563 unregister_filesystem(&ext4_fs_type); 5586 unregister_filesystem(&ext4_fs_type);
5564 destroy_inodecache(); 5587 destroy_inodecache();
5565 ext4_exit_xattr();
5566 ext4_exit_mballoc(); 5588 ext4_exit_mballoc();
5567 ext4_exit_feat_adverts(); 5589 ext4_exit_feat_adverts();
5568 remove_proc_entry("fs/ext4", NULL); 5590 remove_proc_entry("fs/ext4", NULL);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e175e94116ac..1f5cf5880718 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
82#endif 82#endif
83 83
84static void ext4_xattr_cache_insert(struct buffer_head *); 84static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
85static struct buffer_head *ext4_xattr_cache_find(struct inode *, 85static struct buffer_head *ext4_xattr_cache_find(struct inode *,
86 struct ext4_xattr_header *, 86 struct ext4_xattr_header *,
87 struct mb_cache_entry **); 87 struct mb_cache_entry **);
@@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
90static int ext4_xattr_list(struct dentry *dentry, char *buffer, 90static int ext4_xattr_list(struct dentry *dentry, char *buffer,
91 size_t buffer_size); 91 size_t buffer_size);
92 92
93static struct mb_cache *ext4_xattr_cache;
94
95static const struct xattr_handler *ext4_xattr_handler_map[] = { 93static const struct xattr_handler *ext4_xattr_handler_map[] = {
96 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 94 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
97#ifdef CONFIG_EXT4_FS_POSIX_ACL 95#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
117 NULL 115 NULL
118}; 116};
119 117
118#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
119 inode->i_sb->s_fs_info)->s_mb_cache)
120
120static __le32 ext4_xattr_block_csum(struct inode *inode, 121static __le32 ext4_xattr_block_csum(struct inode *inode,
121 sector_t block_nr, 122 sector_t block_nr,
122 struct ext4_xattr_header *hdr) 123 struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
265 struct ext4_xattr_entry *entry; 266 struct ext4_xattr_entry *entry;
266 size_t size; 267 size_t size;
267 int error; 268 int error;
269 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
268 270
269 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", 271 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
270 name_index, name, buffer, (long)buffer_size); 272 name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
286 error = -EIO; 288 error = -EIO;
287 goto cleanup; 289 goto cleanup;
288 } 290 }
289 ext4_xattr_cache_insert(bh); 291 ext4_xattr_cache_insert(ext4_mb_cache, bh);
290 entry = BFIRST(bh); 292 entry = BFIRST(bh);
291 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); 293 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
292 if (error == -EIO) 294 if (error == -EIO)
@@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
409 struct inode *inode = dentry->d_inode; 411 struct inode *inode = dentry->d_inode;
410 struct buffer_head *bh = NULL; 412 struct buffer_head *bh = NULL;
411 int error; 413 int error;
414 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
412 415
413 ea_idebug(inode, "buffer=%p, buffer_size=%ld", 416 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
414 buffer, (long)buffer_size); 417 buffer, (long)buffer_size);
@@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
430 error = -EIO; 433 error = -EIO;
431 goto cleanup; 434 goto cleanup;
432 } 435 }
433 ext4_xattr_cache_insert(bh); 436 ext4_xattr_cache_insert(ext4_mb_cache, bh);
434 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); 437 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
435 438
436cleanup: 439cleanup:
@@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
526{ 529{
527 struct mb_cache_entry *ce = NULL; 530 struct mb_cache_entry *ce = NULL;
528 int error = 0; 531 int error = 0;
532 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
529 533
530 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); 534 ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
531 error = ext4_journal_get_write_access(handle, bh); 535 error = ext4_journal_get_write_access(handle, bh);
532 if (error) 536 if (error)
533 goto out; 537 goto out;
@@ -567,12 +571,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
567 size_t *min_offs, void *base, int *total) 571 size_t *min_offs, void *base, int *total)
568{ 572{
569 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { 573 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
570 *total += EXT4_XATTR_LEN(last->e_name_len);
571 if (!last->e_value_block && last->e_value_size) { 574 if (!last->e_value_block && last->e_value_size) {
572 size_t offs = le16_to_cpu(last->e_value_offs); 575 size_t offs = le16_to_cpu(last->e_value_offs);
573 if (offs < *min_offs) 576 if (offs < *min_offs)
574 *min_offs = offs; 577 *min_offs = offs;
575 } 578 }
579 if (total)
580 *total += EXT4_XATTR_LEN(last->e_name_len);
576 } 581 }
577 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 582 return (*min_offs - ((void *)last - base) - sizeof(__u32));
578} 583}
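Guarding the *total accumulation with if (total) turns it into an optional out-parameter, which is what lets the later hunk pass NULL and drop the write-only total_blk variable. The idiom, in a generic form:

#include <stddef.h>
#include <stdio.h>

/* Sum an array; 'count' is an optional out-parameter. */
static int sum(const int *v, int n, int *count)
{
    int s = 0;
    for (int i = 0; i < n; i++) {
        s += v[i];
        if (count)            /* only touch the out-param if the caller cares */
            (*count)++;
    }
    return s;
}

int main(void)
{
    int v[] = { 1, 2, 3 };
    int n = 0;
    printf("%d\n", sum(v, 3, NULL));  /* caller opts out of the count */
    printf("%d (n=%d)\n", sum(v, 3, &n), n);
    return 0;
}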
@@ -745,13 +750,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
745 struct ext4_xattr_search *s = &bs->s; 750 struct ext4_xattr_search *s = &bs->s;
746 struct mb_cache_entry *ce = NULL; 751 struct mb_cache_entry *ce = NULL;
747 int error = 0; 752 int error = 0;
753 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
748 754
749#define header(x) ((struct ext4_xattr_header *)(x)) 755#define header(x) ((struct ext4_xattr_header *)(x))
750 756
751 if (i->value && i->value_len > sb->s_blocksize) 757 if (i->value && i->value_len > sb->s_blocksize)
752 return -ENOSPC; 758 return -ENOSPC;
753 if (s->base) { 759 if (s->base) {
754 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, 760 ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
755 bs->bh->b_blocknr); 761 bs->bh->b_blocknr);
756 error = ext4_journal_get_write_access(handle, bs->bh); 762 error = ext4_journal_get_write_access(handle, bs->bh);
757 if (error) 763 if (error)
@@ -769,7 +775,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
769 if (!IS_LAST_ENTRY(s->first)) 775 if (!IS_LAST_ENTRY(s->first))
770 ext4_xattr_rehash(header(s->base), 776 ext4_xattr_rehash(header(s->base),
771 s->here); 777 s->here);
772 ext4_xattr_cache_insert(bs->bh); 778 ext4_xattr_cache_insert(ext4_mb_cache,
779 bs->bh);
773 } 780 }
774 unlock_buffer(bs->bh); 781 unlock_buffer(bs->bh);
775 if (error == -EIO) 782 if (error == -EIO)
@@ -905,7 +912,7 @@ getblk_failed:
905 memcpy(new_bh->b_data, s->base, new_bh->b_size); 912 memcpy(new_bh->b_data, s->base, new_bh->b_size);
906 set_buffer_uptodate(new_bh); 913 set_buffer_uptodate(new_bh);
907 unlock_buffer(new_bh); 914 unlock_buffer(new_bh);
908 ext4_xattr_cache_insert(new_bh); 915 ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
909 error = ext4_handle_dirty_xattr_block(handle, 916 error = ext4_handle_dirty_xattr_block(handle,
910 inode, new_bh); 917 inode, new_bh);
911 if (error) 918 if (error)
@@ -1228,7 +1235,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1228 struct ext4_xattr_block_find *bs = NULL; 1235 struct ext4_xattr_block_find *bs = NULL;
1229 char *buffer = NULL, *b_entry_name = NULL; 1236 char *buffer = NULL, *b_entry_name = NULL;
1230 size_t min_offs, free; 1237 size_t min_offs, free;
1231 int total_ino, total_blk; 1238 int total_ino;
1232 void *base, *start, *end; 1239 void *base, *start, *end;
1233 int extra_isize = 0, error = 0, tried_min_extra_isize = 0; 1240 int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
1234 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); 1241 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1293,7 @@ retry:
1286 first = BFIRST(bh); 1293 first = BFIRST(bh);
1287 end = bh->b_data + bh->b_size; 1294 end = bh->b_data + bh->b_size;
1288 min_offs = end - base; 1295 min_offs = end - base;
1289 free = ext4_xattr_free_space(first, &min_offs, base, 1296 free = ext4_xattr_free_space(first, &min_offs, base, NULL);
1290 &total_blk);
1291 if (free < new_extra_isize) { 1297 if (free < new_extra_isize) {
1292 if (!tried_min_extra_isize && s_min_extra_isize) { 1298 if (!tried_min_extra_isize && s_min_extra_isize) {
1293 tried_min_extra_isize++; 1299 tried_min_extra_isize++;
@@ -1495,13 +1501,13 @@ ext4_xattr_put_super(struct super_block *sb)
1495 * Returns 0, or a negative error number on failure. 1501 * Returns 0, or a negative error number on failure.
1496 */ 1502 */
1497static void 1503static void
1498ext4_xattr_cache_insert(struct buffer_head *bh) 1504ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
1499{ 1505{
1500 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1506 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1501 struct mb_cache_entry *ce; 1507 struct mb_cache_entry *ce;
1502 int error; 1508 int error;
1503 1509
1504 ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); 1510 ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
1505 if (!ce) { 1511 if (!ce) {
1506 ea_bdebug(bh, "out of memory"); 1512 ea_bdebug(bh, "out of memory");
1507 return; 1513 return;
@@ -1573,12 +1579,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1573{ 1579{
1574 __u32 hash = le32_to_cpu(header->h_hash); 1580 __u32 hash = le32_to_cpu(header->h_hash);
1575 struct mb_cache_entry *ce; 1581 struct mb_cache_entry *ce;
1582 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
1576 1583
1577 if (!header->h_hash) 1584 if (!header->h_hash)
1578 return NULL; /* never share */ 1585 return NULL; /* never share */
1579 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1586 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1580again: 1587again:
1581 ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, 1588 ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
1582 hash); 1589 hash);
1583 while (ce) { 1590 while (ce) {
1584 struct buffer_head *bh; 1591 struct buffer_head *bh;
@@ -1676,19 +1683,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1676 1683
1677#undef BLOCK_HASH_SHIFT 1684#undef BLOCK_HASH_SHIFT
1678 1685
1679int __init 1686#define HASH_BUCKET_BITS 10
1680ext4_init_xattr(void) 1687
1688struct mb_cache *
1689ext4_xattr_create_cache(char *name)
1681{ 1690{
1682 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1691 return mb_cache_create(name, HASH_BUCKET_BITS);
1683 if (!ext4_xattr_cache)
1684 return -ENOMEM;
1685 return 0;
1686} 1692}
1687 1693
1688void 1694void ext4_xattr_destroy_cache(struct mb_cache *cache)
1689ext4_exit_xattr(void)
1690{ 1695{
1691 if (ext4_xattr_cache) 1696 if (cache)
1692 mb_cache_destroy(ext4_xattr_cache); 1697 mb_cache_destroy(cache);
1693 ext4_xattr_cache = NULL;
1694} 1698}
1699
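The net effect of the xattr.c changes: the module-global ext4_xattr_cache is replaced by one mb_cache per mounted filesystem, created via ext4_xattr_create_cache(sb->s_id) at mount and torn down from ext4_put_super, so ext4_init_xattr/ext4_exit_xattr have nothing left to manage. A userspace analogue of the ownership change (the cache type here is invented for illustration):

#include <stdio.h>
#include <stdlib.h>

struct cache { char name[32]; };        /* stand-in for struct mb_cache */
struct sb_info { struct cache *mb_cache; };

static struct cache *cache_create(const char *name)
{
    struct cache *c = calloc(1, sizeof(*c));
    if (c)
        snprintf(c->name, sizeof(c->name), "%s", name);
    return c;
}

static void cache_destroy(struct cache *c) { free(c); }

static int mount_fs(struct sb_info *sbi, const char *id)
{
    sbi->mb_cache = cache_create(id);   /* per-mount state, not module-global */
    return sbi->mb_cache ? 0 : -1;
}

static void put_super(struct sb_info *sbi)
{
    if (sbi->mb_cache) {                /* mirrors the ext4_put_super hunk */
        cache_destroy(sbi->mb_cache);
        sbi->mb_cache = NULL;
    }
}

int main(void)
{
    struct sb_info sbi;
    if (mount_fs(&sbi, "sda1") == 0)
        put_super(&sbi);
    return 0;
}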
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 819d6398833f..29bedf5589f6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
110extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 110extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
111 struct ext4_inode *raw_inode, handle_t *handle); 111 struct ext4_inode *raw_inode, handle_t *handle);
112 112
113extern int __init ext4_init_xattr(void);
114extern void ext4_exit_xattr(void);
115
116extern const struct xattr_handler *ext4_xattr_handlers[]; 113extern const struct xattr_handler *ext4_xattr_handlers[];
117 114
118extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, 115extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
124 struct ext4_xattr_info *i, 121 struct ext4_xattr_info *i,
125 struct ext4_xattr_ibody_find *is); 122 struct ext4_xattr_ibody_find *is);
126 123
124extern struct mb_cache *ext4_xattr_create_cache(char *name);
125extern void ext4_xattr_destroy_cache(struct mb_cache *);
126
127#ifdef CONFIG_EXT4_FS_SECURITY 127#ifdef CONFIG_EXT4_FS_SECURITY
128extern int ext4_init_security(handle_t *handle, struct inode *inode, 128extern int ext4_init_security(handle_t *handle, struct inode *inode,
129 struct inode *dir, const struct qstr *qstr); 129 struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1a85f83abd53..856bdf994c0a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -568,6 +568,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
568 struct f2fs_mount_info org_mount_opt; 568 struct f2fs_mount_info org_mount_opt;
569 int err, active_logs; 569 int err, active_logs;
570 570
571 sync_filesystem(sb);
572
571 /* 573 /*
572 * Save the old mount options in case we 574 * Save the old mount options in case we
573 * need to restore them. 575 * need to restore them.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c68d9f27135e..b3361fe2bcb5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
635 struct msdos_sb_info *sbi = MSDOS_SB(sb); 635 struct msdos_sb_info *sbi = MSDOS_SB(sb);
636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); 636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
637 637
638 sync_filesystem(sb);
639
638 /* make sure we update state on remount. */ 640 /* make sure we update state on remount. */
639 new_rdonly = *flags & MS_RDONLY; 641 new_rdonly = *flags & MS_RDONLY;
640 if (new_rdonly != (sb->s_flags & MS_RDONLY)) { 642 if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e37eb274e492..7ca8c75d50d3 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
124 124
125static int vxfs_remount(struct super_block *sb, int *flags, char *data) 125static int vxfs_remount(struct super_block *sb, int *flags, char *data)
126{ 126{
127 sync_filesystem(sb);
127 *flags |= MS_RDONLY; 128 *flags |= MS_RDONLY;
128 return 0; 129 return 0;
129} 130}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b4bff1b15028..8d611696fcad 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
135 135
136static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) 136static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
137{ 137{
138 sync_filesystem(sb);
138 if (*flags & MS_MANDLOCK) 139 if (*flags & MS_MANDLOCK)
139 return -EINVAL; 140 return -EINVAL;
140 141
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 033ee975a895..de8afad89e51 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1167,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1167 struct gfs2_tune *gt = &sdp->sd_tune; 1167 struct gfs2_tune *gt = &sdp->sd_tune;
1168 int error; 1168 int error;
1169 1169
1170 sync_filesystem(sb);
1171
1170 spin_lock(&gt->gt_spin); 1172 spin_lock(&gt->gt_spin);
1171 args.ar_commit = gt->gt_logd_secs; 1173 args.ar_commit = gt->gt_logd_secs;
1172 args.ar_quota_quantum = gt->gt_quota_quantum; 1174 args.ar_quota_quantum = gt->gt_quota_quantum;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 2d2039e754cd..eee7206c38d1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
112 112
113static int hfs_remount(struct super_block *sb, int *flags, char *data) 113static int hfs_remount(struct super_block *sb, int *flags, char *data)
114{ 114{
115 sync_filesystem(sb);
115 *flags |= MS_NODIRATIME; 116 *flags |= MS_NODIRATIME;
116 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 117 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
117 return 0; 118 return 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index a6abf87d79d0..a513d2d36be9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
323 323
324static int hfsplus_remount(struct super_block *sb, int *flags, char *data) 324static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
325{ 325{
326 sync_filesystem(sb);
326 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 327 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
327 return 0; 328 return 0;
328 if (!(*flags & MS_RDONLY)) { 329 if (!(*flags & MS_RDONLY)) {
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4534ff688b76..fe3463a43236 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
421 struct hpfs_sb_info *sbi = hpfs_sb(s); 421 struct hpfs_sb_info *sbi = hpfs_sb(s);
422 char *new_opts = kstrdup(data, GFP_KERNEL); 422 char *new_opts = kstrdup(data, GFP_KERNEL);
423 423
424 sync_filesystem(s);
425
424 *flags |= MS_NOATIME; 426 *flags |= MS_NOATIME;
425 427
426 hpfs_lock(s); 428 hpfs_lock(s);
diff --git a/fs/inode.c b/fs/inode.c
index fb59ba7967f1..f96d2a6f88cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1898,3 +1898,34 @@ void inode_dio_done(struct inode *inode)
1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
1899} 1899}
1900EXPORT_SYMBOL(inode_dio_done); 1900EXPORT_SYMBOL(inode_dio_done);
1901
1902/*
1903 * inode_set_flags - atomically set some inode flags
1904 *
1905 * Note: the caller should be holding i_mutex, or else be sure that
1906 * they have exclusive access to the inode structure (i.e., while the
1907 * inode is being instantiated). The reason for the cmpxchg() loop
1908 * --- which wouldn't be necessary if all code paths which modify
1909 * i_flags actually followed this rule, is that there is at least one
1910 * code path which doesn't today --- for example,
1911 * __generic_file_aio_write() calls file_remove_suid() without holding
1912 * i_mutex --- so we use cmpxchg() out of an abundance of caution.
1913 *
1914 * In the long run, i_mutex is overkill, and we should probably look
1915 * at using the i_lock spinlock to protect i_flags, and then make sure
1916 * it is so documented in include/linux/fs.h and that all code follows
1917 * the locking convention!!
1918 */
1919void inode_set_flags(struct inode *inode, unsigned int flags,
1920 unsigned int mask)
1921{
1922 unsigned int old_flags, new_flags;
1923
1924 WARN_ON_ONCE(flags & ~mask);
1925 do {
1926 old_flags = ACCESS_ONCE(inode->i_flags);
1927 new_flags = (old_flags & ~mask) | flags;
1928 } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
1929 new_flags) != old_flags));
1930}
1931EXPORT_SYMBOL(inode_set_flags);
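inode_set_flags above is the classic lock-free read-modify-write: snapshot the word, compute the masked update, and retry if another writer raced in between. The same shape in portable C11 atomics (a sketch of the technique, not the kernel helper):

#include <stdatomic.h>
#include <stdio.h>

static void set_flags(_Atomic unsigned int *word,
                      unsigned int flags, unsigned int mask)
{
    unsigned int old = atomic_load(word);
    unsigned int new;

    do {
        new = (old & ~mask) | flags;
        /* on failure, 'old' is reloaded with the word's current value */
    } while (!atomic_compare_exchange_weak(word, &old, new));
}

int main(void)
{
    _Atomic unsigned int i_flags = 0xf0;
    set_flags(&i_flags, 0x3, 0xf);            /* replace only the low nibble */
    printf("%#x\n", atomic_load(&i_flags));   /* prints 0xf3 */
    return 0;
}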
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4a9e10ea13f2..6af66ee56390 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
117 117
118static int isofs_remount(struct super_block *sb, int *flags, char *data) 118static int isofs_remount(struct super_block *sb, int *flags, char *data)
119{ 119{
120 sync_filesystem(sb);
120 if (!(*flags & MS_RDONLY)) 121 if (!(*flags & MS_RDONLY))
121 return -EROFS; 122 return -EROFS;
122 return 0; 123 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index cf2fc0594063..5f26139a165a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
555 blk_start_plug(&plug); 555 blk_start_plug(&plug);
556 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
557 &log_bufs, WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
558 blk_finish_plug(&plug);
559 558
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 559 jbd_debug(3, "JBD2: commit phase 2b\n");
561 560
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
582 err = 0; 581 err = 0;
583 bufs = 0; 582 bufs = 0;
584 descriptor = NULL; 583 descriptor = NULL;
585 blk_start_plug(&plug);
586 while (commit_transaction->t_buffers) { 584 while (commit_transaction->t_buffers) {
587 585
588 /* Find the next buffer to be journaled... */ 586 /* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
1067 goto restart_loop; 1065 goto restart_loop;
1068 } 1066 }
1069 1067
1068 /* Add the transaction to the checkpoint list
1069 * __journal_remove_checkpoint() can not destroy transaction
1070 * under us because it is not marked as T_FINISHED yet */
1071 if (journal->j_checkpoint_transactions == NULL) {
1072 journal->j_checkpoint_transactions = commit_transaction;
1073 commit_transaction->t_cpnext = commit_transaction;
1074 commit_transaction->t_cpprev = commit_transaction;
1075 } else {
1076 commit_transaction->t_cpnext =
1077 journal->j_checkpoint_transactions;
1078 commit_transaction->t_cpprev =
1079 commit_transaction->t_cpnext->t_cpprev;
1080 commit_transaction->t_cpnext->t_cpprev =
1081 commit_transaction;
1082 commit_transaction->t_cpprev->t_cpnext =
1083 commit_transaction;
1084 }
1085 spin_unlock(&journal->j_list_lock);
1086
1070 /* Done with this transaction! */ 1087 /* Done with this transaction! */
1071 1088
1072 jbd_debug(3, "JBD2: commit phase 7\n"); 1089 jbd_debug(3, "JBD2: commit phase 7\n");
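The checkpoint list is a hand-rolled circular doubly-linked list threaded through t_cpnext/t_cpprev, and the hunk above moves the insertion to before the transaction is marked T_FINISHED so __journal_remove_checkpoint cannot free it under the commit path. The insertion logic, isolated and runnable:

#include <stdio.h>

struct txn { int tid; struct txn *cpnext, *cpprev; };

/* Insert t before *head in a circular doubly-linked list,
 * or make it the sole element if the list is empty. */
static void checkpoint_add(struct txn **head, struct txn *t)
{
    if (*head == NULL) {
        *head = t;
        t->cpnext = t->cpprev = t;
    } else {
        t->cpnext = *head;
        t->cpprev = (*head)->cpprev;
        t->cpnext->cpprev = t;
        t->cpprev->cpnext = t;
    }
}

int main(void)
{
    struct txn a = { .tid = 1 }, b = { .tid = 2 };
    struct txn *head = NULL;

    checkpoint_add(&head, &a);
    checkpoint_add(&head, &b);
    printf("%d -> %d -> %d\n", head->tid, head->cpnext->tid,
           head->cpnext->cpnext->tid);   /* 1 -> 2 -> 1: the ring is closed */
    return 0;
}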
@@ -1085,24 +1102,7 @@ restart_loop:
1085 atomic_read(&commit_transaction->t_handle_count); 1102 atomic_read(&commit_transaction->t_handle_count);
1086 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1103 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1087 commit_transaction->t_tid, &stats.run); 1104 commit_transaction->t_tid, &stats.run);
1088 1105 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1089 /*
1090 * Calculate overall stats
1091 */
1092 spin_lock(&journal->j_history_lock);
1093 journal->j_stats.ts_tid++;
1094 if (commit_transaction->t_requested)
1095 journal->j_stats.ts_requested++;
1096 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1097 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1098 journal->j_stats.run.rs_running += stats.run.rs_running;
1099 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1100 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1101 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1102 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1103 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1104 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1105 spin_unlock(&journal->j_history_lock);
1106 1106
1107 commit_transaction->t_state = T_COMMIT_CALLBACK; 1107 commit_transaction->t_state = T_COMMIT_CALLBACK;
1108 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1108 J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
1122 1122
1123 write_unlock(&journal->j_state_lock); 1123 write_unlock(&journal->j_state_lock);
1124 1124
1125 if (journal->j_checkpoint_transactions == NULL) {
1126 journal->j_checkpoint_transactions = commit_transaction;
1127 commit_transaction->t_cpnext = commit_transaction;
1128 commit_transaction->t_cpprev = commit_transaction;
1129 } else {
1130 commit_transaction->t_cpnext =
1131 journal->j_checkpoint_transactions;
1132 commit_transaction->t_cpprev =
1133 commit_transaction->t_cpnext->t_cpprev;
1134 commit_transaction->t_cpnext->t_cpprev =
1135 commit_transaction;
1136 commit_transaction->t_cpprev->t_cpnext =
1137 commit_transaction;
1138 }
1139 spin_unlock(&journal->j_list_lock);
1140 /* Drop all spin_locks because commit_callback may be block.
1141 * __journal_remove_checkpoint() can not destroy transaction
1142 * under us because it is not marked as T_FINISHED yet */
1143 if (journal->j_commit_callback) 1125 if (journal->j_commit_callback)
1144 journal->j_commit_callback(journal, commit_transaction); 1126 journal->j_commit_callback(journal, commit_transaction);
1145 1127
@@ -1150,7 +1132,7 @@ restart_loop:
1150 write_lock(&journal->j_state_lock); 1132 write_lock(&journal->j_state_lock);
1151 spin_lock(&journal->j_list_lock); 1133 spin_lock(&journal->j_list_lock);
1152 commit_transaction->t_state = T_FINISHED; 1134 commit_transaction->t_state = T_FINISHED;
1153 /* Recheck checkpoint lists after j_list_lock was dropped */ 1135 /* Check if the transaction can be dropped now that we are finished */
1154 if (commit_transaction->t_checkpoint_list == NULL && 1136 if (commit_transaction->t_checkpoint_list == NULL &&
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1137 commit_transaction->t_checkpoint_io_list == NULL) {
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1138 __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
1159 spin_unlock(&journal->j_list_lock); 1141 spin_unlock(&journal->j_list_lock);
1160 write_unlock(&journal->j_state_lock); 1142 write_unlock(&journal->j_state_lock);
1161 wake_up(&journal->j_wait_done_commit); 1143 wake_up(&journal->j_wait_done_commit);
1144
1145 /*
1146 * Calculate overall stats
1147 */
1148 spin_lock(&journal->j_history_lock);
1149 journal->j_stats.ts_tid++;
1150 journal->j_stats.ts_requested += stats.ts_requested;
1151 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1152 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1153 journal->j_stats.run.rs_running += stats.run.rs_running;
1154 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1155 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1156 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1157 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1158 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1159 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1160 spin_unlock(&journal->j_history_lock);
1162} 1161}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5fa344afb49a..67b8e303946c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
122#endif 122#endif
123 123
124/* Checksumming functions */ 124/* Checksumming functions */
125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
128 return 1; 128 return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
143 return cpu_to_be32(csum); 143 return cpu_to_be32(csum);
144} 144}
145 145
146int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
149 return 1; 149 return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
152} 152}
153 153
154void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
157 return; 157 return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
302 journal->j_flags |= JBD2_UNMOUNT; 302 journal->j_flags |= JBD2_UNMOUNT;
303 303
304 while (journal->j_task) { 304 while (journal->j_task) {
305 wake_up(&journal->j_wait_commit);
306 write_unlock(&journal->j_state_lock); 305 write_unlock(&journal->j_state_lock);
306 wake_up(&journal->j_wait_commit);
307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
308 write_lock(&journal->j_state_lock); 308 write_lock(&journal->j_state_lock);
309 } 309 }
@@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
710 while (tid_gt(tid, journal->j_commit_sequence)) { 710 while (tid_gt(tid, journal->j_commit_sequence)) {
711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", 711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
712 tid, journal->j_commit_sequence); 712 tid, journal->j_commit_sequence);
713 wake_up(&journal->j_wait_commit);
714 read_unlock(&journal->j_state_lock); 713 read_unlock(&journal->j_state_lock);
714 wake_up(&journal->j_wait_commit);
715 wait_event(journal->j_wait_done_commit, 715 wait_event(journal->j_wait_done_commit,
716 !tid_gt(tid, journal->j_commit_sequence)); 716 !tid_gt(tid, journal->j_commit_sequence));
717 read_lock(&journal->j_state_lock); 717 read_lock(&journal->j_state_lock);
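Both journal.c hunks above apply the same micro-optimization: move wake_up() to after the lock is released, so a woken waiter does not immediately block on the lock the waker still holds. The equivalent ordering with POSIX threads (illustrative; kernel wait queues differ in detail):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int done;

static void *waiter(void *arg)
{
    pthread_mutex_lock(&lock);
    while (!done)
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
    printf("waiter released\n");
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, waiter, NULL);

    pthread_mutex_lock(&lock);
    done = 1;
    pthread_mutex_unlock(&lock);
    pthread_cond_signal(&cond);   /* signal after unlock: the waiter can run at once */

    pthread_join(t, NULL);
    return 0;
}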
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 60bb365f54a5..38cfcf5f6fce 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1073 * reused here. 1073 * reused here.
1074 */ 1074 */
1075 jbd_lock_bh_state(bh); 1075 jbd_lock_bh_state(bh);
1076 spin_lock(&journal->j_list_lock);
1077 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1076 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
1078 jh->b_transaction == NULL || 1077 jh->b_transaction == NULL ||
1079 (jh->b_transaction == journal->j_committing_transaction && 1078 (jh->b_transaction == journal->j_committing_transaction &&
@@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1096 jh->b_modified = 0; 1095 jh->b_modified = 0;
1097 1096
1098 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1097 JBUFFER_TRACE(jh, "file as BJ_Reserved");
1098 spin_lock(&journal->j_list_lock);
1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1100 } else if (jh->b_transaction == journal->j_committing_transaction) { 1100 } else if (jh->b_transaction == journal->j_committing_transaction) {
1101 /* first access by this transaction */ 1101 /* first access by this transaction */
1102 jh->b_modified = 0; 1102 jh->b_modified = 0;
1103 1103
1104 JBUFFER_TRACE(jh, "set next transaction"); 1104 JBUFFER_TRACE(jh, "set next transaction");
1105 spin_lock(&journal->j_list_lock);
1105 jh->b_next_transaction = transaction; 1106 jh->b_next_transaction = transaction;
1106 } 1107 }
1107 spin_unlock(&journal->j_list_lock); 1108 spin_unlock(&journal->j_list_lock);
@@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1312 journal->j_running_transaction)) { 1313 journal->j_running_transaction)) {
1313 printk(KERN_ERR "JBD2: %s: " 1314 printk(KERN_ERR "JBD2: %s: "
1314 "jh->b_transaction (%llu, %p, %u) != " 1315 "jh->b_transaction (%llu, %p, %u) != "
1315 "journal->j_running_transaction (%p, %u)", 1316 "journal->j_running_transaction (%p, %u)\n",
1316 journal->j_devname, 1317 journal->j_devname,
1317 (unsigned long long) bh->b_blocknr, 1318 (unsigned long long) bh->b_blocknr,
1318 jh->b_transaction, 1319 jh->b_transaction,
@@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1335 */ 1336 */
1336 if (jh->b_transaction != transaction) { 1337 if (jh->b_transaction != transaction) {
1337 JBUFFER_TRACE(jh, "already on other transaction"); 1338 JBUFFER_TRACE(jh, "already on other transaction");
1338 if (unlikely(jh->b_transaction != 1339 if (unlikely(((jh->b_transaction !=
1339 journal->j_committing_transaction)) { 1340 journal->j_committing_transaction)) ||
1340 printk(KERN_ERR "JBD2: %s: " 1341 (jh->b_next_transaction != transaction))) {
1341 "jh->b_transaction (%llu, %p, %u) != " 1342 printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1342 "journal->j_committing_transaction (%p, %u)", 1343 "bad jh for block %llu: "
1344 "transaction (%p, %u), "
1345 "jh->b_transaction (%p, %u), "
1346 "jh->b_next_transaction (%p, %u), jlist %u\n",
1343 journal->j_devname, 1347 journal->j_devname,
1344 (unsigned long long) bh->b_blocknr, 1348 (unsigned long long) bh->b_blocknr,
1349 transaction, transaction->t_tid,
1345 jh->b_transaction, 1350 jh->b_transaction,
1346 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1351 jh->b_transaction ?
1347 journal->j_committing_transaction, 1352 jh->b_transaction->t_tid : 0,
1348 journal->j_committing_transaction ?
1349 journal->j_committing_transaction->t_tid : 0);
1350 ret = -EINVAL;
1351 }
1352 if (unlikely(jh->b_next_transaction != transaction)) {
1353 printk(KERN_ERR "JBD2: %s: "
1354 "jh->b_next_transaction (%llu, %p, %u) != "
1355 "transaction (%p, %u)",
1356 journal->j_devname,
1357 (unsigned long long) bh->b_blocknr,
1358 jh->b_next_transaction, 1353 jh->b_next_transaction,
1359 jh->b_next_transaction ? 1354 jh->b_next_transaction ?
1360 jh->b_next_transaction->t_tid : 0, 1355 jh->b_next_transaction->t_tid : 0,
1361 transaction, transaction->t_tid); 1356 jh->b_jlist);
1357 WARN_ON(1);
1362 ret = -EINVAL; 1358 ret = -EINVAL;
1363 } 1359 }
1364 /* And this case is illegal: we can't reuse another 1360 /* And this case is illegal: we can't reuse another
@@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1415 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1416 1412
1417 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
1418 spin_lock(&journal->j_list_lock);
1419 1414
1420 if (!buffer_jbd(bh)) 1415 if (!buffer_jbd(bh))
1421 goto not_jbd; 1416 goto not_jbd;
@@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1468 * we know to remove the checkpoint after we commit. 1463 * we know to remove the checkpoint after we commit.
1469 */ 1464 */
1470 1465
1466 spin_lock(&journal->j_list_lock);
1471 if (jh->b_cp_transaction) { 1467 if (jh->b_cp_transaction) {
1472 __jbd2_journal_temp_unlink_buffer(jh); 1468 __jbd2_journal_temp_unlink_buffer(jh);
1473 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1469 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1480 goto drop; 1476 goto drop;
1481 } 1477 }
1482 } 1478 }
1479 spin_unlock(&journal->j_list_lock);
1483 } else if (jh->b_transaction) { 1480 } else if (jh->b_transaction) {
1484 J_ASSERT_JH(jh, (jh->b_transaction == 1481 J_ASSERT_JH(jh, (jh->b_transaction ==
1485 journal->j_committing_transaction)); 1482 journal->j_committing_transaction));
@@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1491 1488
1492 if (jh->b_next_transaction) { 1489 if (jh->b_next_transaction) {
1493 J_ASSERT(jh->b_next_transaction == transaction); 1490 J_ASSERT(jh->b_next_transaction == transaction);
1491 spin_lock(&journal->j_list_lock);
1494 jh->b_next_transaction = NULL; 1492 jh->b_next_transaction = NULL;
1493 spin_unlock(&journal->j_list_lock);
1495 1494
1496 /* 1495 /*
1497 * only drop a reference if this transaction modified 1496 * only drop a reference if this transaction modified
@@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1503 } 1502 }
1504 1503
1505not_jbd: 1504not_jbd:
1506 spin_unlock(&journal->j_list_lock);
1507 jbd_unlock_bh_state(bh); 1505 jbd_unlock_bh_state(bh);
1508 __brelse(bh); 1506 __brelse(bh);
1509drop: 1507drop:
@@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1821 if (buffer_locked(bh) || buffer_dirty(bh)) 1819 if (buffer_locked(bh) || buffer_dirty(bh))
1822 goto out; 1820 goto out;
1823 1821
1824 if (jh->b_next_transaction != NULL) 1822 if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
1825 goto out; 1823 goto out;
1826 1824
1827 spin_lock(&journal->j_list_lock); 1825 spin_lock(&journal->j_list_lock);
1828 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1826 if (jh->b_cp_transaction != NULL) {
1829 /* written-back checkpointed metadata buffer */ 1827 /* written-back checkpointed metadata buffer */
1830 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1828 JBUFFER_TRACE(jh, "remove from checkpoint list");
1831 __jbd2_journal_remove_checkpoint(jh); 1829 __jbd2_journal_remove_checkpoint(jh);
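
The transaction.c hunks share one theme: journal->j_list_lock is no longer held across the whole of jbd2_journal_get_create_access() and jbd2_journal_forget(), but only around the list manipulations that actually need it, while the bh state lock still covers the full section. Sketched from the hunks above:

    jbd_lock_bh_state(bh);
    /* assertions on jh->b_transaction need only the bh state lock */
    spin_lock(&journal->j_list_lock);
    __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
    spin_unlock(&journal->j_list_lock);
    jbd_unlock_bh_state(bh);

Shorter j_list_lock hold times matter because the commit and checkpoint paths contend on the same lock.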
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0defb1cc2a35..0918f0e2e266 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
244 int err; 244 int err;
245 245
246 sync_filesystem(sb);
246 err = jffs2_parse_options(c, data); 247 err = jffs2_parse_options(c, data);
247 if (err) 248 if (err)
248 return -EINVAL; 249 return -EINVAL;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e2b7483444fd..97f7fda51890 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
418 int flag = JFS_SBI(sb)->flag; 418 int flag = JFS_SBI(sb)->flag;
419 int ret; 419 int ret;
420 420
421 sync_filesystem(sb);
421 if (!parse_options(data, sb, &newLVSize, &flag)) { 422 if (!parse_options(data, sb, &newLVSize, &flag)) {
422 return -EINVAL; 423 return -EINVAL;
423 } 424 }
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45bf673..bf166e388f0d 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,41 @@
26 * back on the lru list. 26 * back on the lru list.
27 */ 27 */
28 28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
 38 * Each mb_cache_entry has a per-entry spinlock (see mb_cache_bg_lock below)
 39 * to serialize accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
 45 * lock), and mb_cache_spinlock, with the lowest order. While holding
 46 * either a block or index hash chain lock, a thread can acquire an
 47 * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
 52 * index hash chains, each needs to lock the corresponding hash chain. For each
 53 * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
 54 * prevent any simultaneous release or free of the entry, and also
 55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
 59 * block hash chain and also no longer being referenced, with both e_used
 60 * and e_queued zero. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
29#include <linux/kernel.h> 64#include <linux/kernel.h>
30#include <linux/module.h> 65#include <linux/module.h>
31 66
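
The ordering documented in the comment block above runs from the block hash chain lock down to mb_cache_spinlock. A thread taking more than one of them must acquire in that order; a hedged sketch using the file-local helpers this patch introduces (no single path in the patch takes all four at once, this only illustrates the hierarchy):

    hlist_bl_lock(ce->e_block_hash_p);      /* highest: block hash chain */
    hlist_bl_lock(ce->e_index_hash_p);      /* then: index hash chain */
    __spin_lock_mb_cache_entry(ce);         /* then: per-entry lock (mb_cache_bg_lock) */
    spin_lock(&mb_cache_spinlock);          /* lowest: global cache/lru lists */
    /* ... critical section ... */
    spin_unlock(&mb_cache_spinlock);
    __spin_unlock_mb_cache_entry(ce);
    hlist_bl_unlock(ce->e_index_hash_p);
    hlist_bl_unlock(ce->e_block_hash_p);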
@@ -34,9 +69,10 @@
34#include <linux/mm.h> 69#include <linux/mm.h>
35#include <linux/slab.h> 70#include <linux/slab.h>
36#include <linux/sched.h> 71#include <linux/sched.h>
37#include <linux/init.h> 72#include <linux/list_bl.h>
38#include <linux/mbcache.h> 73#include <linux/mbcache.h>
39 74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
40 76
41#ifdef MB_CACHE_DEBUG 77#ifdef MB_CACHE_DEBUG
42# define mb_debug(f...) do { \ 78# define mb_debug(f...) do { \
@@ -57,8 +93,14 @@
57 93
58#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 94#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
59 95
96#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS)
97#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
98 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99
60static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); 100static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
61 101static struct blockgroup_lock *mb_cache_bg_lock;
102static struct kmem_cache *mb_cache_kmem_cache;
103
62MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); 104MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
63MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); 105MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
64MODULE_LICENSE("GPL"); 106MODULE_LICENSE("GPL");
@@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list);
86static LIST_HEAD(mb_cache_lru_list); 128static LIST_HEAD(mb_cache_lru_list);
87static DEFINE_SPINLOCK(mb_cache_spinlock); 129static DEFINE_SPINLOCK(mb_cache_spinlock);
88 130
131static inline void
132__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
133{
134 spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
135 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
136}
137
138static inline void
139__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
140{
141 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
142 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
143}
144
89static inline int 145static inline int
90__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 146__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
91{ 147{
92 return !list_empty(&ce->e_block_list); 148 return !hlist_bl_unhashed(&ce->e_block_list);
93} 149}
94 150
95 151
96static void 152static inline void
97__mb_cache_entry_unhash(struct mb_cache_entry *ce) 153__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
98{ 154{
99 if (__mb_cache_entry_is_hashed(ce)) { 155 if (__mb_cache_entry_is_block_hashed(ce))
100 list_del_init(&ce->e_block_list); 156 hlist_bl_del_init(&ce->e_block_list);
101 list_del(&ce->e_index.o_list);
102 }
103} 157}
104 158
159static inline int
160__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
161{
162 return !hlist_bl_unhashed(&ce->e_index.o_list);
163}
164
165static inline void
166__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
167{
168 if (__mb_cache_entry_is_index_hashed(ce))
169 hlist_bl_del_init(&ce->e_index.o_list);
170}
171
172/*
173 * __mb_cache_entry_unhash_unlock()
174 *
 175 * This function is called to unhash the entry from both the block and
 176 * index hash chains.
 177 * It assumes both the block and index hash chains are locked upon entry.
 178 * It also unlocks both hash chains upon exit.
179 */
180static inline void
181__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
182{
183 __mb_cache_entry_unhash_index(ce);
184 hlist_bl_unlock(ce->e_index_hash_p);
185 __mb_cache_entry_unhash_block(ce);
186 hlist_bl_unlock(ce->e_block_hash_p);
187}
105 188
106static void 189static void
107__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 190__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
108{ 191{
109 struct mb_cache *cache = ce->e_cache; 192 struct mb_cache *cache = ce->e_cache;
110 193
111 mb_assert(!(ce->e_used || ce->e_queued)); 194 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
112 kmem_cache_free(cache->c_entry_cache, ce); 195 kmem_cache_free(cache->c_entry_cache, ce);
113 atomic_dec(&cache->c_entry_count); 196 atomic_dec(&cache->c_entry_count);
114} 197}
115 198
116
117static void 199static void
118__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 200__mb_cache_entry_release(struct mb_cache_entry *ce)
119 __releases(mb_cache_spinlock)
120{ 201{
202 /* First lock the entry to serialize access to its local data. */
203 __spin_lock_mb_cache_entry(ce);
121 /* Wake up all processes queuing for this cache entry. */ 204 /* Wake up all processes queuing for this cache entry. */
122 if (ce->e_queued) 205 if (ce->e_queued)
123 wake_up_all(&mb_cache_queue); 206 wake_up_all(&mb_cache_queue);
124 if (ce->e_used >= MB_CACHE_WRITER) 207 if (ce->e_used >= MB_CACHE_WRITER)
125 ce->e_used -= MB_CACHE_WRITER; 208 ce->e_used -= MB_CACHE_WRITER;
209 /*
 210 * Make sure that all cache entries on the lru_list have
 211 * both e_used and e_queued equal to 0.
212 */
126 ce->e_used--; 213 ce->e_used--;
127 if (!(ce->e_used || ce->e_queued)) { 214 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
128 if (!__mb_cache_entry_is_hashed(ce)) 215 if (!__mb_cache_entry_is_block_hashed(ce)) {
216 __spin_unlock_mb_cache_entry(ce);
129 goto forget; 217 goto forget;
130 mb_assert(list_empty(&ce->e_lru_list)); 218 }
131 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 219 /*
 220 * Need access to the lru list; mb_cache_spinlock has the
 221 * lowest order, so it can nest inside the entry lock.
222 */
223 spin_lock(&mb_cache_spinlock);
224 if (list_empty(&ce->e_lru_list))
225 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
226 spin_unlock(&mb_cache_spinlock);
132 } 227 }
133 spin_unlock(&mb_cache_spinlock); 228 __spin_unlock_mb_cache_entry(ce);
134 return; 229 return;
135forget: 230forget:
136 spin_unlock(&mb_cache_spinlock); 231 mb_assert(list_empty(&ce->e_lru_list));
137 __mb_cache_entry_forget(ce, GFP_KERNEL); 232 __mb_cache_entry_forget(ce, GFP_KERNEL);
138} 233}
139 234
140
141/* 235/*
142 * mb_cache_shrink_scan() memory pressure callback 236 * mb_cache_shrink_scan() memory pressure callback
143 * 237 *
@@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
160 254
161 mb_debug("trying to free %d entries", nr_to_scan); 255 mb_debug("trying to free %d entries", nr_to_scan);
162 spin_lock(&mb_cache_spinlock); 256 spin_lock(&mb_cache_spinlock);
163 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 257 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
164 struct mb_cache_entry *ce = 258 struct mb_cache_entry *ce =
165 list_entry(mb_cache_lru_list.next, 259 list_entry(mb_cache_lru_list.next,
166 struct mb_cache_entry, e_lru_list); 260 struct mb_cache_entry, e_lru_list);
167 list_move_tail(&ce->e_lru_list, &free_list); 261 list_del_init(&ce->e_lru_list);
168 __mb_cache_entry_unhash(ce); 262 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
169 freed++; 263 continue;
264 spin_unlock(&mb_cache_spinlock);
265 /* Prevent any find or get operation on the entry */
266 hlist_bl_lock(ce->e_block_hash_p);
267 hlist_bl_lock(ce->e_index_hash_p);
268 /* Ignore if it is touched by a find/get */
269 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
270 !list_empty(&ce->e_lru_list)) {
271 hlist_bl_unlock(ce->e_index_hash_p);
272 hlist_bl_unlock(ce->e_block_hash_p);
273 spin_lock(&mb_cache_spinlock);
274 continue;
275 }
276 __mb_cache_entry_unhash_unlock(ce);
277 list_add_tail(&ce->e_lru_list, &free_list);
278 spin_lock(&mb_cache_spinlock);
170 } 279 }
171 spin_unlock(&mb_cache_spinlock); 280 spin_unlock(&mb_cache_spinlock);
281
172 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 282 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
173 __mb_cache_entry_forget(entry, gfp_mask); 283 __mb_cache_entry_forget(entry, gfp_mask);
284 freed++;
174 } 285 }
175 return freed; 286 return freed;
176} 287}
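
Because mb_cache_spinlock has the lowest order, the shrinker cannot take the hash chain locks while holding it. The rewritten loop therefore drops the global lock, takes both chain locks, and revalidates the entry before freeing it; the same drop/lock/recheck dance recurs in mb_cache_shrink() and mb_cache_entry_alloc() below. Schematically:

    spin_lock(&mb_cache_spinlock);
    list_del_init(&ce->e_lru_list);
    spin_unlock(&mb_cache_spinlock);        /* hash locks must not nest under this */
    hlist_bl_lock(ce->e_block_hash_p);
    hlist_bl_lock(ce->e_index_hash_p);
    if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
        !list_empty(&ce->e_lru_list)) {
            /* raced with a find/get while the locks were dropped: skip it */
    }

Entries that became visible to a find/get during the window are skipped rather than freed.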
@@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits)
215 int n, bucket_count = 1 << bucket_bits; 326 int n, bucket_count = 1 << bucket_bits;
216 struct mb_cache *cache = NULL; 327 struct mb_cache *cache = NULL;
217 328
329 if (!mb_cache_bg_lock) {
330 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
331 GFP_KERNEL);
332 if (!mb_cache_bg_lock)
333 return NULL;
334 bgl_lock_init(mb_cache_bg_lock);
335 }
336
218 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); 337 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
219 if (!cache) 338 if (!cache)
220 return NULL; 339 return NULL;
221 cache->c_name = name; 340 cache->c_name = name;
222 atomic_set(&cache->c_entry_count, 0); 341 atomic_set(&cache->c_entry_count, 0);
223 cache->c_bucket_bits = bucket_bits; 342 cache->c_bucket_bits = bucket_bits;
224 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 343 cache->c_block_hash = kmalloc(bucket_count *
225 GFP_KERNEL); 344 sizeof(struct hlist_bl_head), GFP_KERNEL);
226 if (!cache->c_block_hash) 345 if (!cache->c_block_hash)
227 goto fail; 346 goto fail;
228 for (n=0; n<bucket_count; n++) 347 for (n=0; n<bucket_count; n++)
229 INIT_LIST_HEAD(&cache->c_block_hash[n]); 348 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
230 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), 349 cache->c_index_hash = kmalloc(bucket_count *
231 GFP_KERNEL); 350 sizeof(struct hlist_bl_head), GFP_KERNEL);
232 if (!cache->c_index_hash) 351 if (!cache->c_index_hash)
233 goto fail; 352 goto fail;
234 for (n=0; n<bucket_count; n++) 353 for (n=0; n<bucket_count; n++)
235 INIT_LIST_HEAD(&cache->c_index_hash[n]); 354 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
236 cache->c_entry_cache = kmem_cache_create(name, 355 if (!mb_cache_kmem_cache) {
237 sizeof(struct mb_cache_entry), 0, 356 mb_cache_kmem_cache = kmem_cache_create(name,
238 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 357 sizeof(struct mb_cache_entry), 0,
239 if (!cache->c_entry_cache) 358 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
240 goto fail2; 359 if (!mb_cache_kmem_cache)
360 goto fail2;
361 }
362 cache->c_entry_cache = mb_cache_kmem_cache;
241 363
242 /* 364 /*
243 * Set an upper limit on the number of cache entries so that the hash 365 * Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +395,47 @@ void
273mb_cache_shrink(struct block_device *bdev) 395mb_cache_shrink(struct block_device *bdev)
274{ 396{
275 LIST_HEAD(free_list); 397 LIST_HEAD(free_list);
276 struct list_head *l, *ltmp; 398 struct list_head *l;
399 struct mb_cache_entry *ce, *tmp;
277 400
401 l = &mb_cache_lru_list;
278 spin_lock(&mb_cache_spinlock); 402 spin_lock(&mb_cache_spinlock);
279 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 403 while (!list_is_last(l, &mb_cache_lru_list)) {
280 struct mb_cache_entry *ce = 404 l = l->next;
281 list_entry(l, struct mb_cache_entry, e_lru_list); 405 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
282 if (ce->e_bdev == bdev) { 406 if (ce->e_bdev == bdev) {
283 list_move_tail(&ce->e_lru_list, &free_list); 407 list_del_init(&ce->e_lru_list);
284 __mb_cache_entry_unhash(ce); 408 if (ce->e_used || ce->e_queued ||
409 atomic_read(&ce->e_refcnt))
410 continue;
411 spin_unlock(&mb_cache_spinlock);
412 /*
413 * Prevent any find or get operation on the entry.
414 */
415 hlist_bl_lock(ce->e_block_hash_p);
416 hlist_bl_lock(ce->e_index_hash_p);
417 /* Ignore if it is touched by a find/get */
418 if (ce->e_used || ce->e_queued ||
419 atomic_read(&ce->e_refcnt) ||
420 !list_empty(&ce->e_lru_list)) {
421 hlist_bl_unlock(ce->e_index_hash_p);
422 hlist_bl_unlock(ce->e_block_hash_p);
423 l = &mb_cache_lru_list;
424 spin_lock(&mb_cache_spinlock);
425 continue;
426 }
427 __mb_cache_entry_unhash_unlock(ce);
428 mb_assert(!(ce->e_used || ce->e_queued ||
429 atomic_read(&ce->e_refcnt)));
430 list_add_tail(&ce->e_lru_list, &free_list);
431 l = &mb_cache_lru_list;
432 spin_lock(&mb_cache_spinlock);
285 } 433 }
286 } 434 }
287 spin_unlock(&mb_cache_spinlock); 435 spin_unlock(&mb_cache_spinlock);
288 list_for_each_safe(l, ltmp, &free_list) { 436
289 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 437 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
290 e_lru_list), GFP_KERNEL); 438 __mb_cache_entry_forget(ce, GFP_KERNEL);
291 } 439 }
292} 440}
293 441
@@ -303,23 +451,27 @@ void
303mb_cache_destroy(struct mb_cache *cache) 451mb_cache_destroy(struct mb_cache *cache)
304{ 452{
305 LIST_HEAD(free_list); 453 LIST_HEAD(free_list);
306 struct list_head *l, *ltmp; 454 struct mb_cache_entry *ce, *tmp;
307 455
308 spin_lock(&mb_cache_spinlock); 456 spin_lock(&mb_cache_spinlock);
309 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 457 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
310 struct mb_cache_entry *ce = 458 if (ce->e_cache == cache)
311 list_entry(l, struct mb_cache_entry, e_lru_list);
312 if (ce->e_cache == cache) {
313 list_move_tail(&ce->e_lru_list, &free_list); 459 list_move_tail(&ce->e_lru_list, &free_list);
314 __mb_cache_entry_unhash(ce);
315 }
316 } 460 }
317 list_del(&cache->c_cache_list); 461 list_del(&cache->c_cache_list);
318 spin_unlock(&mb_cache_spinlock); 462 spin_unlock(&mb_cache_spinlock);
319 463
320 list_for_each_safe(l, ltmp, &free_list) { 464 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
321 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 465 list_del_init(&ce->e_lru_list);
322 e_lru_list), GFP_KERNEL); 466 /*
467 * Prevent any find or get operation on the entry.
468 */
469 hlist_bl_lock(ce->e_block_hash_p);
470 hlist_bl_lock(ce->e_index_hash_p);
471 mb_assert(!(ce->e_used || ce->e_queued ||
472 atomic_read(&ce->e_refcnt)));
473 __mb_cache_entry_unhash_unlock(ce);
474 __mb_cache_entry_forget(ce, GFP_KERNEL);
323 } 475 }
324 476
325 if (atomic_read(&cache->c_entry_count) > 0) { 477 if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache)
328 atomic_read(&cache->c_entry_count)); 480 atomic_read(&cache->c_entry_count));
329 } 481 }
330 482
331 kmem_cache_destroy(cache->c_entry_cache); 483 if (list_empty(&mb_cache_list)) {
332 484 kmem_cache_destroy(mb_cache_kmem_cache);
485 mb_cache_kmem_cache = NULL;
486 }
333 kfree(cache->c_index_hash); 487 kfree(cache->c_index_hash);
334 kfree(cache->c_block_hash); 488 kfree(cache->c_block_hash);
335 kfree(cache); 489 kfree(cache);
@@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache)
346struct mb_cache_entry * 500struct mb_cache_entry *
347mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 501mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
348{ 502{
349 struct mb_cache_entry *ce = NULL; 503 struct mb_cache_entry *ce;
350 504
351 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { 505 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
506 struct list_head *l;
507
508 l = &mb_cache_lru_list;
352 spin_lock(&mb_cache_spinlock); 509 spin_lock(&mb_cache_spinlock);
353 if (!list_empty(&mb_cache_lru_list)) { 510 while (!list_is_last(l, &mb_cache_lru_list)) {
354 ce = list_entry(mb_cache_lru_list.next, 511 l = l->next;
355 struct mb_cache_entry, e_lru_list); 512 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
356 list_del_init(&ce->e_lru_list); 513 if (ce->e_cache == cache) {
357 __mb_cache_entry_unhash(ce); 514 list_del_init(&ce->e_lru_list);
515 if (ce->e_used || ce->e_queued ||
516 atomic_read(&ce->e_refcnt))
517 continue;
518 spin_unlock(&mb_cache_spinlock);
519 /*
520 * Prevent any find or get operation on the
521 * entry.
522 */
523 hlist_bl_lock(ce->e_block_hash_p);
524 hlist_bl_lock(ce->e_index_hash_p);
525 /* Ignore if it is touched by a find/get */
526 if (ce->e_used || ce->e_queued ||
527 atomic_read(&ce->e_refcnt) ||
528 !list_empty(&ce->e_lru_list)) {
529 hlist_bl_unlock(ce->e_index_hash_p);
530 hlist_bl_unlock(ce->e_block_hash_p);
531 l = &mb_cache_lru_list;
532 spin_lock(&mb_cache_spinlock);
533 continue;
534 }
535 mb_assert(list_empty(&ce->e_lru_list));
536 mb_assert(!(ce->e_used || ce->e_queued ||
537 atomic_read(&ce->e_refcnt)));
538 __mb_cache_entry_unhash_unlock(ce);
539 goto found;
540 }
358 } 541 }
359 spin_unlock(&mb_cache_spinlock); 542 spin_unlock(&mb_cache_spinlock);
360 } 543 }
361 if (!ce) { 544
362 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 545 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
363 if (!ce) 546 if (!ce)
364 return NULL; 547 return NULL;
365 atomic_inc(&cache->c_entry_count); 548 atomic_inc(&cache->c_entry_count);
366 INIT_LIST_HEAD(&ce->e_lru_list); 549 INIT_LIST_HEAD(&ce->e_lru_list);
367 INIT_LIST_HEAD(&ce->e_block_list); 550 INIT_HLIST_BL_NODE(&ce->e_block_list);
368 ce->e_cache = cache; 551 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
369 ce->e_queued = 0; 552 ce->e_cache = cache;
370 } 553 ce->e_queued = 0;
554 atomic_set(&ce->e_refcnt, 0);
555found:
556 ce->e_block_hash_p = &cache->c_block_hash[0];
557 ce->e_index_hash_p = &cache->c_index_hash[0];
371 ce->e_used = 1 + MB_CACHE_WRITER; 558 ce->e_used = 1 + MB_CACHE_WRITER;
372 return ce; 559 return ce;
373} 560}
@@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
393{ 580{
394 struct mb_cache *cache = ce->e_cache; 581 struct mb_cache *cache = ce->e_cache;
395 unsigned int bucket; 582 unsigned int bucket;
396 struct list_head *l; 583 struct hlist_bl_node *l;
397 int error = -EBUSY; 584 struct hlist_bl_head *block_hash_p;
585 struct hlist_bl_head *index_hash_p;
586 struct mb_cache_entry *lce;
398 587
588 mb_assert(ce);
399 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 589 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
400 cache->c_bucket_bits); 590 cache->c_bucket_bits);
401 spin_lock(&mb_cache_spinlock); 591 block_hash_p = &cache->c_block_hash[bucket];
402 list_for_each_prev(l, &cache->c_block_hash[bucket]) { 592 hlist_bl_lock(block_hash_p);
403 struct mb_cache_entry *ce = 593 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
404 list_entry(l, struct mb_cache_entry, e_block_list); 594 if (lce->e_bdev == bdev && lce->e_block == block) {
405 if (ce->e_bdev == bdev && ce->e_block == block) 595 hlist_bl_unlock(block_hash_p);
406 goto out; 596 return -EBUSY;
597 }
407 } 598 }
408 __mb_cache_entry_unhash(ce); 599 mb_assert(!__mb_cache_entry_is_block_hashed(ce));
600 __mb_cache_entry_unhash_block(ce);
601 __mb_cache_entry_unhash_index(ce);
409 ce->e_bdev = bdev; 602 ce->e_bdev = bdev;
410 ce->e_block = block; 603 ce->e_block = block;
411 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 604 ce->e_block_hash_p = block_hash_p;
412 ce->e_index.o_key = key; 605 ce->e_index.o_key = key;
606 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
607 hlist_bl_unlock(block_hash_p);
413 bucket = hash_long(key, cache->c_bucket_bits); 608 bucket = hash_long(key, cache->c_bucket_bits);
414 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); 609 index_hash_p = &cache->c_index_hash[bucket];
415 error = 0; 610 hlist_bl_lock(index_hash_p);
416out: 611 ce->e_index_hash_p = index_hash_p;
417 spin_unlock(&mb_cache_spinlock); 612 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
418 return error; 613 hlist_bl_unlock(index_hash_p);
614 return 0;
419} 615}
420 616
421 617
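
Combined with mb_cache_entry_alloc() above, the caller-side pattern (as in the ext* xattr code) looks roughly like the following hedged sketch; cache, bdev, block and key are assumed to come from the caller:

    struct mb_cache_entry *ce;
    int err;

    ce = mb_cache_entry_alloc(cache, GFP_NOFS);
    if (!ce)
            return -ENOMEM;
    err = mb_cache_entry_insert(ce, bdev, block, key);
    if (err)
            mb_cache_entry_free(ce);        /* e.g. -EBUSY: (bdev, block) already hashed */
    else
            mb_cache_entry_release(ce);     /* drop the writer reference from alloc */
    return err;

Note that the rewritten insert takes the two chain locks one at a time, block first and index second, never both together.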
@@ -429,24 +625,26 @@ out:
429void 625void
430mb_cache_entry_release(struct mb_cache_entry *ce) 626mb_cache_entry_release(struct mb_cache_entry *ce)
431{ 627{
432 spin_lock(&mb_cache_spinlock); 628 __mb_cache_entry_release(ce);
433 __mb_cache_entry_release_unlock(ce);
434} 629}
435 630
436 631
437/* 632/*
438 * mb_cache_entry_free() 633 * mb_cache_entry_free()
439 * 634 *
440 * This is equivalent to the sequence mb_cache_entry_takeout() --
441 * mb_cache_entry_release().
442 */ 635 */
443void 636void
444mb_cache_entry_free(struct mb_cache_entry *ce) 637mb_cache_entry_free(struct mb_cache_entry *ce)
445{ 638{
446 spin_lock(&mb_cache_spinlock); 639 mb_assert(ce);
447 mb_assert(list_empty(&ce->e_lru_list)); 640 mb_assert(list_empty(&ce->e_lru_list));
448 __mb_cache_entry_unhash(ce); 641 hlist_bl_lock(ce->e_index_hash_p);
449 __mb_cache_entry_release_unlock(ce); 642 __mb_cache_entry_unhash_index(ce);
643 hlist_bl_unlock(ce->e_index_hash_p);
644 hlist_bl_lock(ce->e_block_hash_p);
645 __mb_cache_entry_unhash_block(ce);
646 hlist_bl_unlock(ce->e_block_hash_p);
647 __mb_cache_entry_release(ce);
450} 648}
451 649
452 650
@@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
463 sector_t block) 661 sector_t block)
464{ 662{
465 unsigned int bucket; 663 unsigned int bucket;
466 struct list_head *l; 664 struct hlist_bl_node *l;
467 struct mb_cache_entry *ce; 665 struct mb_cache_entry *ce;
666 struct hlist_bl_head *block_hash_p;
468 667
469 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 668 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
470 cache->c_bucket_bits); 669 cache->c_bucket_bits);
471 spin_lock(&mb_cache_spinlock); 670 block_hash_p = &cache->c_block_hash[bucket];
 472 list_for_each(l, &cache->c_block_hash[bucket]) { 671 /* First serialize access to the corresponding block hash chain. */
473 ce = list_entry(l, struct mb_cache_entry, e_block_list); 672 hlist_bl_lock(block_hash_p);
673 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
674 mb_assert(ce->e_block_hash_p == block_hash_p);
474 if (ce->e_bdev == bdev && ce->e_block == block) { 675 if (ce->e_bdev == bdev && ce->e_block == block) {
475 DEFINE_WAIT(wait); 676 /*
677 * Prevent a free from removing the entry.
678 */
679 atomic_inc(&ce->e_refcnt);
680 hlist_bl_unlock(block_hash_p);
681 __spin_lock_mb_cache_entry(ce);
682 atomic_dec(&ce->e_refcnt);
683 if (ce->e_used > 0) {
684 DEFINE_WAIT(wait);
685 while (ce->e_used > 0) {
686 ce->e_queued++;
687 prepare_to_wait(&mb_cache_queue, &wait,
688 TASK_UNINTERRUPTIBLE);
689 __spin_unlock_mb_cache_entry(ce);
690 schedule();
691 __spin_lock_mb_cache_entry(ce);
692 ce->e_queued--;
693 }
694 finish_wait(&mb_cache_queue, &wait);
695 }
696 ce->e_used += 1 + MB_CACHE_WRITER;
697 __spin_unlock_mb_cache_entry(ce);
476 698
477 if (!list_empty(&ce->e_lru_list)) 699 if (!list_empty(&ce->e_lru_list)) {
700 spin_lock(&mb_cache_spinlock);
478 list_del_init(&ce->e_lru_list); 701 list_del_init(&ce->e_lru_list);
479
480 while (ce->e_used > 0) {
481 ce->e_queued++;
482 prepare_to_wait(&mb_cache_queue, &wait,
483 TASK_UNINTERRUPTIBLE);
484 spin_unlock(&mb_cache_spinlock); 702 spin_unlock(&mb_cache_spinlock);
485 schedule();
486 spin_lock(&mb_cache_spinlock);
487 ce->e_queued--;
488 } 703 }
489 finish_wait(&mb_cache_queue, &wait); 704 if (!__mb_cache_entry_is_block_hashed(ce)) {
490 ce->e_used += 1 + MB_CACHE_WRITER; 705 __mb_cache_entry_release(ce);
491
492 if (!__mb_cache_entry_is_hashed(ce)) {
493 __mb_cache_entry_release_unlock(ce);
494 return NULL; 706 return NULL;
495 } 707 }
496 goto cleanup; 708 return ce;
497 } 709 }
498 } 710 }
499 ce = NULL; 711 hlist_bl_unlock(block_hash_p);
500 712 return NULL;
501cleanup:
502 spin_unlock(&mb_cache_spinlock);
503 return ce;
504} 713}
505 714
506#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 715#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
507 716
508static struct mb_cache_entry * 717static struct mb_cache_entry *
509__mb_cache_entry_find(struct list_head *l, struct list_head *head, 718__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
510 struct block_device *bdev, unsigned int key) 719 struct block_device *bdev, unsigned int key)
511{ 720{
512 while (l != head) { 721
 722 /* The index hash chain is already locked by the caller. */
723 while (l != NULL) {
513 struct mb_cache_entry *ce = 724 struct mb_cache_entry *ce =
514 list_entry(l, struct mb_cache_entry, e_index.o_list); 725 hlist_bl_entry(l, struct mb_cache_entry,
726 e_index.o_list);
727 mb_assert(ce->e_index_hash_p == head);
515 if (ce->e_bdev == bdev && ce->e_index.o_key == key) { 728 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
516 DEFINE_WAIT(wait); 729 /*
517 730 * Prevent a free from removing the entry.
518 if (!list_empty(&ce->e_lru_list)) 731 */
519 list_del_init(&ce->e_lru_list); 732 atomic_inc(&ce->e_refcnt);
520 733 hlist_bl_unlock(head);
734 __spin_lock_mb_cache_entry(ce);
735 atomic_dec(&ce->e_refcnt);
736 ce->e_used++;
521 /* Incrementing before holding the lock gives readers 737 /* Incrementing before holding the lock gives readers
522 priority over writers. */ 738 priority over writers. */
523 ce->e_used++; 739 if (ce->e_used >= MB_CACHE_WRITER) {
524 while (ce->e_used >= MB_CACHE_WRITER) { 740 DEFINE_WAIT(wait);
525 ce->e_queued++; 741
526 prepare_to_wait(&mb_cache_queue, &wait, 742 while (ce->e_used >= MB_CACHE_WRITER) {
527 TASK_UNINTERRUPTIBLE); 743 ce->e_queued++;
528 spin_unlock(&mb_cache_spinlock); 744 prepare_to_wait(&mb_cache_queue, &wait,
529 schedule(); 745 TASK_UNINTERRUPTIBLE);
530 spin_lock(&mb_cache_spinlock); 746 __spin_unlock_mb_cache_entry(ce);
531 ce->e_queued--; 747 schedule();
748 __spin_lock_mb_cache_entry(ce);
749 ce->e_queued--;
750 }
751 finish_wait(&mb_cache_queue, &wait);
532 } 752 }
533 finish_wait(&mb_cache_queue, &wait); 753 __spin_unlock_mb_cache_entry(ce);
534 754 if (!list_empty(&ce->e_lru_list)) {
535 if (!__mb_cache_entry_is_hashed(ce)) {
536 __mb_cache_entry_release_unlock(ce);
537 spin_lock(&mb_cache_spinlock); 755 spin_lock(&mb_cache_spinlock);
756 list_del_init(&ce->e_lru_list);
757 spin_unlock(&mb_cache_spinlock);
758 }
759 if (!__mb_cache_entry_is_block_hashed(ce)) {
760 __mb_cache_entry_release(ce);
538 return ERR_PTR(-EAGAIN); 761 return ERR_PTR(-EAGAIN);
539 } 762 }
540 return ce; 763 return ce;
541 } 764 }
542 l = l->next; 765 l = l->next;
543 } 766 }
767 hlist_bl_unlock(head);
544 return NULL; 768 return NULL;
545} 769}
546 770
@@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
562 unsigned int key) 786 unsigned int key)
563{ 787{
564 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 788 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
565 struct list_head *l; 789 struct hlist_bl_node *l;
566 struct mb_cache_entry *ce; 790 struct mb_cache_entry *ce = NULL;
567 791 struct hlist_bl_head *index_hash_p;
568 spin_lock(&mb_cache_spinlock); 792
569 l = cache->c_index_hash[bucket].next; 793 index_hash_p = &cache->c_index_hash[bucket];
570 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 794 hlist_bl_lock(index_hash_p);
571 spin_unlock(&mb_cache_spinlock); 795 if (!hlist_bl_empty(index_hash_p)) {
796 l = hlist_bl_first(index_hash_p);
797 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
798 } else
799 hlist_bl_unlock(index_hash_p);
572 return ce; 800 return ce;
573} 801}
574 802
@@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
597{ 825{
598 struct mb_cache *cache = prev->e_cache; 826 struct mb_cache *cache = prev->e_cache;
599 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 827 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
600 struct list_head *l; 828 struct hlist_bl_node *l;
601 struct mb_cache_entry *ce; 829 struct mb_cache_entry *ce;
830 struct hlist_bl_head *index_hash_p;
602 831
603 spin_lock(&mb_cache_spinlock); 832 index_hash_p = &cache->c_index_hash[bucket];
833 mb_assert(prev->e_index_hash_p == index_hash_p);
834 hlist_bl_lock(index_hash_p);
835 mb_assert(!hlist_bl_empty(index_hash_p));
604 l = prev->e_index.o_list.next; 836 l = prev->e_index.o_list.next;
605 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 837 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
606 __mb_cache_entry_release_unlock(prev); 838 __mb_cache_entry_release(prev);
607 return ce; 839 return ce;
608} 840}
609 841
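
A lookup walks every entry sharing an index key through find_first/find_next; each call returns holding a reader reference on the entry it found, releases the previous one, and may return ERR_PTR(-EAGAIN) if the entry was unhashed underneath it. A sketch of the usual loop, with a hypothetical match() predicate:

    struct mb_cache_entry *ce;

    ce = mb_cache_entry_find_first(cache, bdev, key);
    while (ce) {
            if (IS_ERR(ce))         /* -EAGAIN: entry vanished; caller may restart */
                    return NULL;
            if (match(ce))          /* hypothetical: compare against the candidate */
                    return ce;      /* reader reference still held */
            ce = mb_cache_entry_find_next(ce, bdev, key);
    }
    return NULL;

Only the relevant index hash chain is locked during the walk, where the old code serialized every lookup on the single global spinlock.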
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0ad2ec9601de..f007a3355570 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
123 struct minix_sb_info * sbi = minix_sb(sb); 123 struct minix_sb_info * sbi = minix_sb(sb);
124 struct minix_super_block * ms; 124 struct minix_super_block * ms;
125 125
126 sync_filesystem(sb);
126 ms = sbi->s_ms; 127 ms = sbi->s_ms;
127 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 128 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
128 return 0; 129 return 0;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index ee59d35ff069..647d86d2db39 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -99,6 +99,7 @@ static void destroy_inodecache(void)
99 99
100static int ncp_remount(struct super_block *sb, int *flags, char* data) 100static int ncp_remount(struct super_block *sb, int *flags, char* data)
101{ 101{
102 sync_filesystem(sb);
102 *flags |= MS_NODIRATIME; 103 *flags |= MS_NODIRATIME;
103 return 0; 104 return 0;
104} 105}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 910ed906eb82..2cb56943e232 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; 2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version; 2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
2217 2217
2218 sync_filesystem(sb);
2219
2218 /* 2220 /*
2219 * Userspace mount programs that send binary options generally send 2221 * Userspace mount programs that send binary options generally send
2220 * them populated with default values. We have no way to know which 2222 * them populated with default values. We have no way to know which
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7ac2a122ca1d..8c532b2ca3ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1129 unsigned long old_mount_opt; 1129 unsigned long old_mount_opt;
1130 int err; 1130 int err;
1131 1131
1132 sync_filesystem(sb);
1132 old_sb_flags = sb->s_flags; 1133 old_sb_flags = sb->s_flags;
1133 old_mount_opt = nilfs->ns_mount_opt; 1134 old_mount_opt = nilfs->ns_mount_opt;
1134 1135
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 82650d52d916..bd5610d48242 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -468,6 +468,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
468 468
469 ntfs_debug("Entering with remount options string: %s", opt); 469 ntfs_debug("Entering with remount options string: %s", opt);
470 470
471 sync_filesystem(sb);
472
471#ifndef NTFS_RW 473#ifndef NTFS_RW
472 /* For read-only compiled driver, enforce read-only flag. */ 474 /* For read-only compiled driver, enforce read-only flag. */
473 *flags |= MS_RDONLY; 475 *flags |= MS_RDONLY;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1aecd626e645..a7cdd56f4c79 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -634,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
634 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
635 u32 tmp; 635 u32 tmp;
636 636
637 sync_filesystem(sb);
638
637 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
638 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
639 ret = -EINVAL; 641 ret = -EINVAL;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 8c0ceb8dd1f7..15e4500cda3e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
368 368
369static int openprom_remount(struct super_block *sb, int *flags, char *data) 369static int openprom_remount(struct super_block *sb, int *flags, char *data)
370{ 370{
371 sync_filesystem(sb);
371 *flags |= MS_NOATIME; 372 *flags |= MS_NOATIME;
372 return 0; 373 return 0;
373} 374}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7bbeb5257af1..5dbadecb234d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
92int proc_remount(struct super_block *sb, int *flags, char *data) 92int proc_remount(struct super_block *sb, int *flags, char *data)
93{ 93{
94 struct pid_namespace *pid = sb->s_fs_info; 94 struct pid_namespace *pid = sb->s_fs_info;
95
96 sync_filesystem(sb);
95 return !proc_parse_options(data, pid); 97 return !proc_parse_options(data, pid);
96} 98}
97 99
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d324..192297b0090d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -249,6 +249,7 @@ static void parse_options(char *options)
249 249
250static int pstore_remount(struct super_block *sb, int *flags, char *data) 250static int pstore_remount(struct super_block *sb, int *flags, char *data)
251{ 251{
252 sync_filesystem(sb);
252 parse_options(data); 253 parse_options(data);
253 254
254 return 0; 255 return 0;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 89558810381c..c4bcb778886e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
44{ 44{
45 struct qnx4_sb_info *qs; 45 struct qnx4_sb_info *qs;
46 46
47 sync_filesystem(sb);
47 qs = qnx4_sb(sb); 48 qs = qnx4_sb(sb);
48 qs->Version = QNX4_VERSION; 49 qs->Version = QNX4_VERSION;
49 *flags |= MS_RDONLY; 50 *flags |= MS_RDONLY;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 8d941edfefa1..65cdaab3ed49 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
55 55
56static int qnx6_remount(struct super_block *sb, int *flags, char *data) 56static int qnx6_remount(struct super_block *sb, int *flags, char *data)
57{ 57{
58 sync_filesystem(sb);
58 *flags |= MS_RDONLY; 59 *flags |= MS_RDONLY;
59 return 0; 60 return 0;
60} 61}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ed54a04c33bd..9fb20426005e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1318,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1318 int i; 1318 int i;
1319#endif 1319#endif
1320 1320
1321 sync_filesystem(s);
1321 reiserfs_write_lock(s); 1322 reiserfs_write_lock(s);
1322 1323
1323#ifdef CONFIG_QUOTA 1324#ifdef CONFIG_QUOTA
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d8418782862b..ef90e8bca95a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
432 */ 432 */
433static int romfs_remount(struct super_block *sb, int *flags, char *data) 433static int romfs_remount(struct super_block *sb, int *flags, char *data)
434{ 434{
435 sync_filesystem(sb);
435 *flags |= MS_RDONLY; 436 *flags |= MS_RDONLY;
436 return 0; 437 return 0;
437} 438}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 202df6312d4e..031c8d67fd51 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
371 371
372static int squashfs_remount(struct super_block *sb, int *flags, char *data) 372static int squashfs_remount(struct super_block *sb, int *flags, char *data)
373{ 373{
374 sync_filesystem(sb);
374 *flags |= MS_RDONLY; 375 *flags |= MS_RDONLY;
375 return 0; 376 return 0;
376} 377}
diff --git a/fs/super.c b/fs/super.c
index 80d5cf2ca765..e9dc3c3fe159 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
724 if (sb->s_op->remount_fs) { 722 if (sb->s_op->remount_fs) {
725 retval = sb->s_op->remount_fs(sb, &flags, data); 723 retval = sb->s_op->remount_fs(sb, &flags, data);
726 if (retval) { 724 if (retval) {
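
The long run of one-line remount hunks and this fs/super.c removal are two halves of one change: sync_filesystem() moves out of do_remount_sb() and into each filesystem's ->remount_fs(), so a filesystem can order the sync against its own remount work. The expected shape for a new filesystem, sketched with a hypothetical foofs:

    static int foofs_remount(struct super_block *sb, int *flags, char *data)
    {
            sync_filesystem(sb);    /* now the filesystem's own responsibility */
            /* ... parse options, adjust *flags ... */
            return 0;
    }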
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 5625ca920f5e..88956309cc86 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
60{ 60{
61 struct sysv_sb_info *sbi = SYSV_SB(sb); 61 struct sysv_sb_info *sbi = SYSV_SB(sb);
62 62
63 sync_filesystem(sb);
63 if (sbi->s_forced_ro) 64 if (sbi->s_forced_ro)
64 *flags |= MS_RDONLY; 65 *flags |= MS_RDONLY;
65 return 0; 66 return 0;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 48f943f7f5d5..a1266089eca1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1827 int err; 1827 int err;
1828 struct ubifs_info *c = sb->s_fs_info; 1828 struct ubifs_info *c = sb->s_fs_info;
1829 1829
1830 sync_filesystem(sb);
1830 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); 1831 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
1831 1832
1832 err = ubifs_parse_options(c, data, 1); 1833 err = ubifs_parse_options(c, data, 1);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3306b9f69bed..64f2b7334d08 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -646,6 +646,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
646 int error = 0; 646 int error = 0;
647 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); 647 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
648 648
649 sync_filesystem(sb);
649 if (lvidiu) { 650 if (lvidiu) {
650 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); 651 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
651 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) 652 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7ed..b8c6791f046f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1280,6 +1280,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 unsigned new_mount_opt, ufstype; 1280 unsigned new_mount_opt, ufstype;
1281 unsigned flags; 1281 unsigned flags;
1282 1282
1283 sync_filesystem(sb);
1283 lock_ufs(sb); 1284 lock_ufs(sb);
1284 mutex_lock(&UFS_SB(sb)->s_lock); 1285 mutex_lock(&UFS_SB(sb)->s_lock);
1285 uspi = UFS_SB(sb)->s_uspi; 1286 uspi = UFS_SB(sb)->s_uspi;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0ef599218991..205376776377 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
1197 char *p; 1197 char *p;
1198 int error; 1198 int error;
1199 1199
1200 sync_filesystem(sb);
1200 while ((p = strsep(&options, ",")) != NULL) { 1201 while ((p = strsep(&options, ",")) != NULL) {
1201 int token; 1202 int token;
1202 1203
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a877ed3f389f..ea80f1cdff06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2572,6 +2572,9 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2572void inode_dio_wait(struct inode *inode); 2572void inode_dio_wait(struct inode *inode);
2573void inode_dio_done(struct inode *inode); 2573void inode_dio_done(struct inode *inode);
2574 2574
2575extern void inode_set_flags(struct inode *inode, unsigned int flags,
2576 unsigned int mask);
2577
2575extern const struct file_operations generic_ro_fops; 2578extern const struct file_operations generic_ro_fops;
2576 2579
2577#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) 2580#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 5525d370701d..6a392e7a723a 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -3,19 +3,21 @@
3 3
4 (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 4 (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/ 5*/
6
7struct mb_cache_entry { 6struct mb_cache_entry {
8 struct list_head e_lru_list; 7 struct list_head e_lru_list;
9 struct mb_cache *e_cache; 8 struct mb_cache *e_cache;
10 unsigned short e_used; 9 unsigned short e_used;
11 unsigned short e_queued; 10 unsigned short e_queued;
11 atomic_t e_refcnt;
12 struct block_device *e_bdev; 12 struct block_device *e_bdev;
13 sector_t e_block; 13 sector_t e_block;
14 struct list_head e_block_list; 14 struct hlist_bl_node e_block_list;
15 struct { 15 struct {
16 struct list_head o_list; 16 struct hlist_bl_node o_list;
17 unsigned int o_key; 17 unsigned int o_key;
18 } e_index; 18 } e_index;
19 struct hlist_bl_head *e_block_hash_p;
20 struct hlist_bl_head *e_index_hash_p;
19}; 21};
20 22
21struct mb_cache { 23struct mb_cache {
@@ -25,8 +27,8 @@ struct mb_cache {
25 int c_max_entries; 27 int c_max_entries;
26 int c_bucket_bits; 28 int c_bucket_bits;
27 struct kmem_cache *c_entry_cache; 29 struct kmem_cache *c_entry_cache;
28 struct list_head *c_block_hash; 30 struct hlist_bl_head *c_block_hash;
29 struct list_head *c_index_hash; 31 struct hlist_bl_head *c_index_hash;
30}; 32};
31 33
32/* Functions on caches */ 34/* Functions on caches */
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 197d3125df2a..010ea89eeb0e 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -16,6 +16,15 @@ struct mpage_da_data;
16struct ext4_map_blocks; 16struct ext4_map_blocks;
17struct extent_status; 17struct extent_status;
18 18
19/* shim until we merge in the xfs_collapse_range branch */
20#ifndef FALLOC_FL_COLLAPSE_RANGE
21#define FALLOC_FL_COLLAPSE_RANGE 0x08
22#endif
23
24#ifndef FALLOC_FL_ZERO_RANGE
25#define FALLOC_FL_ZERO_RANGE 0x10
26#endif
27
19#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 28#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
20 29
21#define show_mballoc_flags(flags) __print_flags(flags, "|", \ 30#define show_mballoc_flags(flags) __print_flags(flags, "|", \
@@ -68,6 +77,13 @@ struct extent_status;
68 { EXTENT_STATUS_DELAYED, "D" }, \ 77 { EXTENT_STATUS_DELAYED, "D" }, \
69 { EXTENT_STATUS_HOLE, "H" }) 78 { EXTENT_STATUS_HOLE, "H" })
70 79
80#define show_falloc_mode(mode) __print_flags(mode, "|", \
81 { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \
82 { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \
83 { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \
84 { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \
85 { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
86
71 87
72TRACE_EVENT(ext4_free_inode, 88TRACE_EVENT(ext4_free_inode,
73 TP_PROTO(struct inode *inode), 89 TP_PROTO(struct inode *inode),
@@ -1328,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit,
1328 __entry->rw, __entry->ret) 1344 __entry->rw, __entry->ret)
1329); 1345);
1330 1346
1331TRACE_EVENT(ext4_fallocate_enter, 1347DECLARE_EVENT_CLASS(ext4__fallocate_mode,
1332 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1348 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
1333 1349
1334 TP_ARGS(inode, offset, len, mode), 1350 TP_ARGS(inode, offset, len, mode),
@@ -1336,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter,
1336 TP_STRUCT__entry( 1352 TP_STRUCT__entry(
1337 __field( dev_t, dev ) 1353 __field( dev_t, dev )
1338 __field( ino_t, ino ) 1354 __field( ino_t, ino )
1339 __field( loff_t, pos ) 1355 __field( loff_t, offset )
1340 __field( loff_t, len ) 1356 __field( loff_t, len )
1341 __field( int, mode ) 1357 __field( int, mode )
1342 ), 1358 ),
1343 1359
1344 TP_fast_assign( 1360 TP_fast_assign(
1345 __entry->dev = inode->i_sb->s_dev; 1361 __entry->dev = inode->i_sb->s_dev;
1346 __entry->ino = inode->i_ino; 1362 __entry->ino = inode->i_ino;
1347 __entry->pos = offset; 1363 __entry->offset = offset;
1348 __entry->len = len; 1364 __entry->len = len;
1349 __entry->mode = mode; 1365 __entry->mode = mode;
1350 ), 1366 ),
1351 1367
1352 TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", 1368 TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
1353 MAJOR(__entry->dev), MINOR(__entry->dev), 1369 MAJOR(__entry->dev), MINOR(__entry->dev),
1354 (unsigned long) __entry->ino, __entry->pos, 1370 (unsigned long) __entry->ino,
1355 __entry->len, __entry->mode) 1371 __entry->offset, __entry->len,
1372 show_falloc_mode(__entry->mode))
1373);
1374
1375DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
1376
1377 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
1378
1379 TP_ARGS(inode, offset, len, mode)
1380);
1381
1382DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
1383
1384 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
1385
1386 TP_ARGS(inode, offset, len, mode)
1387);
1388
1389DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
1390
1391 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
1392
1393 TP_ARGS(inode, offset, len, mode)
1356); 1394);
1357 1395
1358TRACE_EVENT(ext4_fallocate_exit, 1396TRACE_EVENT(ext4_fallocate_exit,
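
With ext4_fallocate_enter, ext4_punch_hole and ext4_zero_range folded into one DECLARE_EVENT_CLASS, any further mode-reporting tracepoint costs a single DEFINE_EVENT stanza. A hypothetical illustration (not part of this patch):

    DEFINE_EVENT(ext4__fallocate_mode, ext4_insert_range,   /* hypothetical event */

            TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

            TP_ARGS(inode, offset, len, mode)
    );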
@@ -1384,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit,
1384 __entry->ret) 1422 __entry->ret)
1385); 1423);
1386 1424
1387TRACE_EVENT(ext4_punch_hole,
1388 TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
1389
1390 TP_ARGS(inode, offset, len),
1391
1392 TP_STRUCT__entry(
1393 __field( dev_t, dev )
1394 __field( ino_t, ino )
1395 __field( loff_t, offset )
1396 __field( loff_t, len )
1397 ),
1398
1399 TP_fast_assign(
1400 __entry->dev = inode->i_sb->s_dev;
1401 __entry->ino = inode->i_ino;
1402 __entry->offset = offset;
1403 __entry->len = len;
1404 ),
1405
1406 TP_printk("dev %d,%d ino %lu offset %lld len %lld",
1407 MAJOR(__entry->dev), MINOR(__entry->dev),
1408 (unsigned long) __entry->ino,
1409 __entry->offset, __entry->len)
1410);
1411
1412TRACE_EVENT(ext4_unlink_enter, 1425TRACE_EVENT(ext4_unlink_enter,
1413 TP_PROTO(struct inode *parent, struct dentry *dentry), 1426 TP_PROTO(struct inode *parent, struct dentry *dentry),
1414 1427
@@ -2410,6 +2423,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
2410 __entry->shrunk_nr, __entry->cache_cnt) 2423 __entry->shrunk_nr, __entry->cache_cnt)
2411); 2424);
2412 2425
2426TRACE_EVENT(ext4_collapse_range,
2427 TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
2428
2429 TP_ARGS(inode, offset, len),
2430
2431 TP_STRUCT__entry(
2432 __field(dev_t, dev)
2433 __field(ino_t, ino)
2434 __field(loff_t, offset)
2435 __field(loff_t, len)
2436 ),
2437
2438 TP_fast_assign(
2439 __entry->dev = inode->i_sb->s_dev;
2440 __entry->ino = inode->i_ino;
2441 __entry->offset = offset;
2442 __entry->len = len;
2443 ),
2444
2445 TP_printk("dev %d,%d ino %lu offset %lld len %lld",
2446 MAJOR(__entry->dev), MINOR(__entry->dev),
2447 (unsigned long) __entry->ino,
2448 __entry->offset, __entry->len)
2449);
2450
2413#endif /* _TRACE_EXT4_H */ 2451#endif /* _TRACE_EXT4_H */
2414 2452
2415/* This part must be outside protection */ 2453/* This part must be outside protection */