author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/ext4
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/Makefile         |    5
-rw-r--r--  fs/ext4/acl.c            |   13
-rw-r--r--  fs/ext4/acl.h            |    2
-rw-r--r--  fs/ext4/balloc.c         |  157
-rw-r--r--  fs/ext4/block_validity.c |    7
-rw-r--r--  fs/ext4/dir.c            |   58
-rw-r--r--  fs/ext4/ext4.h           |  347
-rw-r--r--  fs/ext4/ext4_extents.h   |   82
-rw-r--r--  fs/ext4/ext4_jbd2.c      |   14
-rw-r--r--  fs/ext4/ext4_jbd2.h      |   18
-rw-r--r--  fs/ext4/extents.c        | 2024
-rw-r--r--  fs/ext4/file.c           |  129
-rw-r--r--  fs/ext4/fsync.c          |  142
-rw-r--r--  fs/ext4/ialloc.c         |  147
-rw-r--r--  fs/ext4/inode.c          | 1198
-rw-r--r--  fs/ext4/ioctl.c          |   39
-rw-r--r--  fs/ext4/mballoc.c        |  893
-rw-r--r--  fs/ext4/mballoc.h        |    8
-rw-r--r--  fs/ext4/migrate.c        |   18
-rw-r--r--  fs/ext4/mmp.c            |  351
-rw-r--r--  fs/ext4/move_extent.c    |   35
-rw-r--r--  fs/ext4/namei.c          |  236
-rw-r--r--  fs/ext4/page-io.c        |  417
-rw-r--r--  fs/ext4/resize.c         |  125
-rw-r--r--  fs/ext4/super.c          | 1231
-rw-r--r--  fs/ext4/xattr.c          |   40
-rw-r--r--  fs/ext4/xattr.h          |   14
-rw-r--r--  fs/ext4/xattr_security.c |    5
28 files changed, 5144 insertions(+), 2611 deletions(-)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,9 +4,10 @@
 
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ead..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext4_check_acl(struct inode *inode, int mask)
+ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
 
+	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
@@ -426,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
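
The new IPERM_FLAG_RCU branch above follows the VFS RCU-walk contract: while the walk holds only RCU, a permission callback may not block. A minimal sketch of that contract, assuming the 2.6.38-era helper names used in the hunk (IPERM_FLAG_RCU, negative_cached_acl()):

	/* Hedged sketch of the RCU-walk contract ext4_check_acl() now honours. */
	static int sketch_check_acl(struct inode *inode, int mask, unsigned int flags)
	{
		if (flags & IPERM_FLAG_RCU) {
			/* May not block here: answer only from the ACL cache. */
			if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
				return -ECHILD;	/* unknown: VFS retries in ref-walk */
			return -EAGAIN;		/* cached "no ACL": generic check decides */
		}
		/* Ref-walk path (may block) reads the ACL from disk, as above. */
		return -EAGAIN;
	}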
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac4..dec821168fd4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_check_acl(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int, unsigned int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
 #include "ext4_jbd2.h"
 #include "mballoc.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -171,7 +173,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * less than the blocksize * 8 ( which is the size
 		 * of bitmap ), set rest of the block bitmap to 1
 		 */
-		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+		ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+				     bh->b_data);
 	}
 	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -341,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	 * We do it here so the bitmap uptodate bit
 	 * get set with buffer lock held.
 	 */
+	trace_ext4_read_block_bitmap_load(sb, block_group);
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
@@ -358,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:		handle to this transaction
- * @sb:			super block
- * @block:		start physcial block to add to the block group
- * @count:		number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata. Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -489,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -503,11 +384,6 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -518,7 +394,9 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope.  Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -527,9 +405,9 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -543,14 +421,14 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  *
  * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
  * it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
  * return TRUE.
  *
  * if the total number of retries exceed three times, return FALSE.
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -573,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -583,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
@@ -591,7 +471,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	 * Account for the allocated meta blocks.  We will never
 	 * fail EDQUOT for metdata, but we do account for it.
 	 */
-	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+	if (!(*errp) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
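
With the flags argument now threaded through ext4_has_free_blocks(), a caller that must not fail on a nearly-full filesystem can opt into the root-reserved pool. A hedged usage sketch against the new ext4_new_meta_blocks() signature (handle, inode and goal are assumed to exist in the caller):

	unsigned long count = 1;
	int err;
	ext4_fsblk_t block;

	/* EXT4_MB_USE_ROOT_BLOCKS lets this allocation dip into the
	 * root-reserved blocks checked in ext4_has_free_blocks() above. */
	block = ext4_new_meta_blocks(handle, inode, goal,
				     EXT4_MB_USE_ROOT_BLOCKS, &count, &err);
	if (!block)
		return err;	/* 0 return means failure, *errp holds why */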
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
 
 static struct kmem_cache *ext4_system_zone_cachep;
 
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
-	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
-					     SLAB_RECLAIM_ACCOUNT);
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
 	if (ext4_system_zone_cachep == NULL)
 		return -ENOMEM;
 	return 0;
 }
 
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
 	kmem_cache_destroy(ext4_system_zone_cachep);
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
 				  struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= ext4_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext4_readdir,		/* we take BKL. needed?*/
 	.unlocked_ioctl = ext4_ioctl,
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 	return (ext4_filetype_table[filetype]);
 }
 
-
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
-			   struct inode *dir,
+			   struct inode *dir, struct file *filp,
 			   struct ext4_dir_entry_2 *de,
 			   struct buffer_head *bh,
 			   unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	const int rlen = ext4_rec_len_from_disk(de->rec_len,
 						dir->i_sb->s_blocksize);
 
-	if (rlen < EXT4_DIR_REC_LEN(1))
+	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
 		error_msg = "rec_len is smaller than minimal";
-	else if (rlen % 4 != 0)
+	else if (unlikely(rlen % 4 != 0))
 		error_msg = "rec_len % 4 != 0";
-	else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+	else if (unlikely(((char *) de - bh->b_data) + rlen >
+			  dir->i_sb->s_blocksize))
 		error_msg = "directory entry across blocks";
-	else if (le32_to_cpu(de->inode) >
-			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+	else if (unlikely(le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
+	else
+		return 0;
 
-	if (error_msg != NULL)
-		ext4_error_inode(dir, function, line, bh->b_blocknr,
-			"bad entry in directory: %s - "
-			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-			error_msg, (unsigned) (offset%bh->b_size), offset,
-			le32_to_cpu(de->inode),
-			rlen, de->name_len);
-	return error_msg == NULL ? 1 : 0;
+	if (filp)
+		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset%bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+	else
+		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset%bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+
+	return 1;
 }
 
 static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				EXT4_ERROR_INODE(inode, "directory "
-					   "contains a hole at offset %Lu",
+				EXT4_ERROR_FILE(filp, 0,
+						"directory contains a "
+						"hole at offset %llu",
 					   (unsigned long long) filp->f_pos);
 				dir_has_error = 1;
 			}
@@ -194,8 +210,8 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry(inode, de,
-						  bh, offset)) {
+			if (ext4_check_dir_entry(inode, filp, de,
+						 bh, offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
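
The sense of the check is inverted by this commit (0 now means a good entry), so callers branch on nonzero. Roughly, per the hunks above and ext4_readdir()'s existing recovery path:

	if (ext4_check_dir_entry(inode, filp, de, bh, offset)) {
		/* Bad entry: skip f_pos ahead to the next block boundary,
		 * which is the recovery step ext4_readdir() already used. */
		filp->f_pos = (filp->f_pos | (sb->s_blocksize - 1)) + 1;
		break;	/* simplified; the real loop releases bh and continues */
	}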
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..1921392cd708 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)			\
 	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 
-#define EXT4_ERROR_FILE(file, fmt, a...)	\
-	ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+#define EXT4_ERROR_FILE(file, block, fmt, a...)				\
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -168,7 +169,20 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	EXT4_IO_UNWRITTEN	0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define	EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_ERROR	0x0002
+
+struct ext4_io_page {
+	struct page	*p_page;
+	atomic_t	p_count;
+};
+
+#define MAX_IO_PAGES 128
+
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
@@ -179,13 +193,25 @@ typedef struct ext4_io_end {
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	int			num_io_pages;
+	struct ext4_io_page	*pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	struct ext4_io_page	*io_page;
+	sector_t		io_next_block;
+};
+
 /*
  * Special inodes numbers
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -205,6 +231,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE		1024
 #define	EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
@@ -488,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -537,23 +568,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
-
-/*
- *  Mount options
- */
-struct ext4_mount_options {
-	unsigned long s_mount_opt;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned long s_commit_interval;
-	u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-	int s_jquota_fmt;
-	char *s_qf_names[MAXQUOTAS];
-#endif
-};
-
-/* Max physical block we can addres w/o extents */
+/* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
 /*
@@ -685,6 +700,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
 		ext4_decode_extra_time(&(inode)->xtime,			       \
 				       raw_inode->xtime ## _extra);	       \
+	else								       \
+		(inode)->xtime.tv_nsec = 0;				       \
 } while (0)
 
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)			       \
@@ -695,6 +712,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
 		ext4_decode_extra_time(&(einode)->xtime,		       \
 				       raw_inode->xtime ## _extra);	       \
+	else								       \
+		(einode)->xtime.tv_nsec = 0;				       \
 } while (0)
 
 #define i_disk_version osd1.linux1.l_i_version
@@ -726,12 +745,13 @@ do { \
 
 /*
  * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
  */
 struct ext4_ext_cache {
 	ext4_fsblk_t	ec_start;
 	ext4_lblk_t	ec_block;
 	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
 };
 
 /*
@@ -750,10 +770,12 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
+	ext4_lblk_t	i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
 	unsigned long	i_state_flags;		/* Dynamic state flags */
+#endif
 	unsigned long	i_flags;
 
-	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
@@ -796,7 +818,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
-	struct jbd2_inode jinode;
+	struct jbd2_inode *jinode;
 
 	struct ext4_ext_cache i_cached_extent;
 	/*
@@ -816,14 +838,12 @@ struct ext4_inode_info {
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-	sector_t i_da_metadata_calc_last_lblock;
+	ext4_lblk_t i_da_metadata_calc_last_lblock;
 	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
-	spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -832,8 +852,12 @@ struct ext4_inode_info {
 	/* completed IOs that might need unwritten extents handling */
 	struct list_head i_completed_io_list;
 	spinlock_t i_completed_io_lock;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+
+	spinlock_t i_block_reservation_lock;
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -885,24 +909,35 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
-#define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
-#define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
+						~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
+						EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
 					 EXT4_MOUNT_##opt)
 
-#define ext4_set_bit			ext2_set_bit
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
+#define ext4_set_bit			__test_and_set_bit_le
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
-#define ext4_clear_bit			ext2_clear_bit
+#define ext4_clear_bit			__test_and_clear_bit_le
 #define ext4_clear_bit_atomic		ext2_clear_bit_atomic
-#define ext4_test_bit			ext2_test_bit
-#define ext4_find_first_zero_bit	ext2_find_first_zero_bit
-#define ext4_find_next_zero_bit		ext2_find_next_zero_bit
-#define ext4_find_next_bit		ext2_find_next_bit
+#define ext4_test_bit			test_bit_le
+#define ext4_find_first_zero_bit	find_first_zero_bit_le
+#define ext4_find_next_zero_bit		find_next_zero_bit_le
+#define ext4_find_next_bit		find_next_bit_le
 
 /*
  * Maximal mount counts between two filesystem checks
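
set_opt()/clear_opt() now take the superblock instead of the raw option word, and the *_opt2 variants address the new s_mount_opt2 field. A hedged usage sketch (MBLK_IO_SUBMIT is one of the flags defined above; FOO stands in for any hypothetical EXT4_MOUNT2_* bit):

	/* Old callers wrote: set_opt(sbi->s_mount_opt, DELALLOC);  Now: */
	set_opt(sb, DELALLOC);
	if (test_opt(sb, MBLK_IO_SUBMIT))
		clear_opt(sb, MBLK_IO_SUBMIT);
	/* Bits in the second option word go through the *_opt2 macros: */
	set_opt2(sb, FOO);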
@@ -1000,7 +1035,7 @@ struct ext4_super_block {
 	__le16  s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1060,6 +1095,7 @@ struct ext4_sb_info {
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head **s_group_desc;
 	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -1087,7 +1123,6 @@ struct ext4_sb_info {
 	struct completion s_kobj_unregister;
 
 	/* Journaling */
-	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
@@ -1116,14 +1151,14 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
 	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 
@@ -1141,7 +1176,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 
 	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
 	atomic_t s_bal_allocated;	/* in blocks */
@@ -1172,6 +1206,14 @@ struct ext4_sb_info {
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1210,24 +1252,39 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 };
 
-#define EXT4_INODE_BIT_FNS(name, field)					\
+#define EXT4_INODE_BIT_FNS(name, field, offset)				\
 static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
 {									\
-	return test_bit(bit, &EXT4_I(inode)->i_##field);		\
+	return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);	\
 }									\
 static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
 {									\
-	set_bit(bit, &EXT4_I(inode)->i_##field);			\
+	set_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }									\
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 {									\
-	clear_bit(bit, &EXT4_I(inode)->i_##field);			\
+	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }
 
-EXT4_INODE_BIT_FNS(flag, flags)
-EXT4_INODE_BIT_FNS(state, state_flags)
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	(ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	/* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
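
The extra offset argument is what lets 64-bit builds drop i_state_flags: dynamic state bits simply live in the upper half of i_flags (offset 32), saving a word per inode, while 32-bit builds keep the separate word. Call sites are unchanged either way:

	/* On 64-bit this expands to test_bit(bit + 32, &EXT4_I(inode)->i_flags);
	 * on 32-bit it stays test_bit(bit, &EXT4_I(inode)->i_state_flags). */
	if (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN))
		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);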
@@ -1294,6 +1351,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1307,13 +1365,29 @@ EXT4_INODE_BIT_FNS(state, state_flags)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
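
These *_SUPP masks let one driver mount ext2/ext3/ext4 images with the feature envelope of the chosen personality; mount-time code rejects a filesystem whose incompat bits fall outside the mask. A hedged sketch using the EXT4_HAS_INCOMPAT_FEATURE() helper defined elsewhere in this header:

	/* Refuse the mount if the image needs an incompat feature we lack. */
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
		ext4_msg(sb, KERN_ERR,
			 "couldn't mount: unsupported optional features");
		return 0;
	}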
@@ -1533,7 +1607,97 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT			10
+#define EXT4_DEF_LI_MAX_START_DELAY		5
+#define EXT4_LAZYINIT_QUIT			0x0001
+#define EXT4_LAZYINIT_RUNNING			0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
+
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
 
 /*
  * Function prototypes
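
The comments above describe the whole MMP protocol: a live node keeps bumping mmp_seq every update interval, and an opener concludes the filesystem is busy if the sequence moves while it watches. A hedged sketch of that open-time decision (the real logic lives in the new fs/ext4/mmp.c; this assumes the caller re-reads the buffer before the final comparison, e.g. via the read_mmp_block() reader mentioned in struct mmpd_data):

	static int sketch_mmp_busy(struct super_block *sb, struct buffer_head *bh)
	{
		struct mmp_struct *mmp = (struct mmp_struct *)bh->b_data;
		u32 seq = le32_to_cpu(mmp->mmp_seq);
		unsigned long wait;

		if (seq == EXT4_MMP_SEQ_CLEAN)
			return 0;	/* last user unmounted cleanly */
		if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
			return 1;	/* fsck or unknown code: never safe */

		/* Watch one (clamped) check interval: a live node keeps
		 * bumping mmp_seq, so a changed value means the fs is in
		 * use elsewhere. */
		wait = EXT4_MMP_CHECK_MULT * le16_to_cpu(mmp->mmp_check_interval);
		if (wait < EXT4_MMP_MIN_CHECK_INTERVAL)
			wait = EXT4_MMP_MIN_CHECK_INTERVAL;
		schedule_timeout_interruptible(wait * HZ);

		return le32_to_cpu(mmp->mmp_seq) != seq; /* assumes bh re-read */
	}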
@@ -1559,11 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1581,10 +1746,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct file *,
 				  struct ext4_dir_entry_2 *,
 				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
-	__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+#define ext4_check_dir_entry(dir, filp, de, bh, offset)			\
+	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+					(de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1592,6 +1759,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1605,11 +1773,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1786,17 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-						ext4_group_t, int);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 					ext4_lblk_t, int, int *);
@@ -1646,24 +1813,25 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1696,8 +1864,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
 			ext4_fsblk_t, const char *, ...)
 	__attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-			    const char *, ...)
-	__attribute__ ((format (printf, 4, 5)));
+			    ext4_fsblk_t, const char *, ...)
+	__attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -1712,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 		       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -1960,6 +2132,7 @@ extern const struct file_operations ext4_dir_operations;
1960/* file.c */ 2132/* file.c */
1961extern const struct inode_operations ext4_file_inode_operations; 2133extern const struct inode_operations ext4_file_inode_operations;
1962extern const struct file_operations ext4_file_operations; 2134extern const struct file_operations ext4_file_operations;
2135extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
1963 2136
1964/* namei.c */ 2137/* namei.c */
1965extern const struct inode_operations ext4_dir_inode_operations; 2138extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2146,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1973/* block_validity */ 2146/* block_validity */
1974extern void ext4_release_system_zone(struct super_block *sb); 2147extern void ext4_release_system_zone(struct super_block *sb);
1975extern int ext4_setup_system_zone(struct super_block *sb); 2148extern int ext4_setup_system_zone(struct super_block *sb);
1976extern int __init init_ext4_system_zone(void); 2149extern int __init ext4_init_system_zone(void);
1977extern void exit_ext4_system_zone(void); 2150extern void ext4_exit_system_zone(void);
1978extern int ext4_data_block_valid(struct ext4_sb_info *sbi, 2151extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1979 ext4_fsblk_t start_blk, 2152 ext4_fsblk_t start_blk,
1980 unsigned int count); 2153 unsigned int count);
@@ -1987,9 +2160,11 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1987extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 2160extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1988 struct ext4_map_blocks *map, int flags); 2161 struct ext4_map_blocks *map, int flags);
1989extern void ext4_ext_truncate(struct inode *); 2162extern void ext4_ext_truncate(struct inode *);
2163extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
2164 loff_t length);
1990extern void ext4_ext_init(struct super_block *); 2165extern void ext4_ext_init(struct super_block *);
1991extern void ext4_ext_release(struct super_block *); 2166extern void ext4_ext_release(struct super_block *);
1992extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 2167extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
1993 loff_t len); 2168 loff_t len);
1994extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 2169extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1995 ssize_t len); 2170 ssize_t len);
@@ -2002,6 +2177,21 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2002 __u64 start_orig, __u64 start_donor, 2177 __u64 start_orig, __u64 start_donor,
2003 __u64 len, __u64 *moved_len); 2178 __u64 len, __u64 *moved_len);
2004 2179
2180/* page-io.c */
2181extern int __init ext4_init_pageio(void);
2182extern void ext4_exit_pageio(void);
2183extern void ext4_ioend_wait(struct inode *);
2184extern void ext4_free_io_end(ext4_io_end_t *io);
2185extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2186extern int ext4_end_io_nolock(ext4_io_end_t *io);
2187extern void ext4_io_submit(struct ext4_io_submit *io);
2188extern int ext4_bio_write_page(struct ext4_io_submit *io,
2189 struct page *page,
2190 int len,
2191 struct writeback_control *wbc);
2192
2193/* mmp.c */
2194extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2005 2195
2006/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2196/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
2007enum ext4_state_bits { 2197enum ext4_state_bits {
@@ -2031,6 +2221,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2031 2221
2032#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2222#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2033 2223
2224/* For ioend & aio unwritten conversion wait queues */
2225#define EXT4_WQ_HASH_SZ 37
2226#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2227 EXT4_WQ_HASH_SZ])
2228#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2229 EXT4_WQ_HASH_SZ])
2230extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2231extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2232
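A user-space sketch of the scheme these two macros encode: the address of a kernel object, reduced modulo a small prime, selects one queue from a fixed pool, so ioend/AIO waiters need no per-inode queue allocation. All names and types below are illustrative stand-ins, not kernel API.

#include <stdio.h>

#define WQ_HASH_SZ 37	/* small prime, mirroring EXT4_WQ_HASH_SZ */

struct waitqueue { int unused; };
static struct waitqueue wq_pool[WQ_HASH_SZ];

/* map an object's address onto one queue in the pool */
static struct waitqueue *wq_for(const void *obj)
{
	return &wq_pool[(unsigned long)obj % WQ_HASH_SZ];
}

int main(void)
{
	int a, b;

	printf("a -> slot %ld\n", (long)(wq_for(&a) - wq_pool));
	printf("b -> slot %ld\n", (long)(wq_for(&b) - wq_pool));
	return 0;
}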
2034#endif /* __KERNEL__ */ 2233#endif /* __KERNEL__ */
2035 2234
2036#endif /* _EXT4_H */ 2235#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..095c36f3b612 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,17 +119,13 @@ struct ext4_ext_path {
119 * structure for external API 119 * structure for external API
120 */ 120 */
121 121
122#define EXT4_EXT_CACHE_NO 0
123#define EXT4_EXT_CACHE_GAP 1
124#define EXT4_EXT_CACHE_EXTENT 2
125
126/* 122/*
127 * to be called by ext4_ext_walk_space() 123 * to be called by ext4_ext_walk_space()
128 * negative retcode - error 124 * negative retcode - error
129 * positive retcode - signal for ext4_ext_walk_space(), see below 125 * positive retcode - signal for ext4_ext_walk_space(), see below
130 * callback must return valid extent (passed or newly created) 126 * callback must return valid extent (passed or newly created)
131 */ 127 */
132typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, 128typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
133 struct ext4_ext_cache *, 129 struct ext4_ext_cache *,
134 struct ext4_extent *, void *); 130 struct ext4_extent *, void *);
135 131
@@ -137,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
137#define EXT_BREAK 1 133#define EXT_BREAK 1
138#define EXT_REPEAT 2 134#define EXT_REPEAT 2
139 135
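A hypothetical callback under the new ext_prepare_callback signature, assuming EXT_CONTINUE is the zero "keep walking" value defined next to EXT_BREAK/EXT_REPEAT, and that a cached gap is reported with ec_start == 0 (as ext4_ext_walk_space() does once ec_type is gone):

/* stop the walk at the first mapped extent and hand it back to the caller */
static int first_extent_cb(struct inode *inode, ext4_lblk_t next,
			   struct ext4_ext_cache *cex,
			   struct ext4_extent *ex, void *data)
{
	if (cex->ec_start != 0) {	/* a real extent, not a gap */
		*(struct ext4_ext_cache *)data = *cex;
		return EXT_BREAK;	/* positive retcode: stop walking */
	}
	return EXT_CONTINUE;
}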
140/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ 136/*
141#define EXT_MAX_BLOCK 0xffffffff 137 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
138 * __le32.
139 */
140#define EXT_MAX_BLOCKS 0xffffffff
142 141
143/* 142/*
144 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an 143 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
@@ -197,7 +196,7 @@ static inline unsigned short ext_depth(struct inode *inode)
197static inline void 196static inline void
198ext4_ext_invalidate_cache(struct inode *inode) 197ext4_ext_invalidate_cache(struct inode *inode)
199{ 198{
200 EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; 199 EXT4_I(inode)->i_cached_extent.ec_len = 0;
201} 200}
202 201
203static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) 202static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -225,11 +224,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 224 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 225}
227 226
227/*
228 * ext4_ext_pblock:
229 * combine low and high parts of physical block number into ext4_fsblk_t
230 */
231static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
232{
233 ext4_fsblk_t block;
234
235 block = le32_to_cpu(ex->ee_start_lo);
236 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
237 return block;
238}
239
240/*
241 * ext4_idx_pblock:
242 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
243 */
244static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
245{
246 ext4_fsblk_t block;
247
248 block = le32_to_cpu(ix->ei_leaf_lo);
249 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
250 return block;
251}
252
253/*
254 * ext4_ext_store_pblock:
255 * stores a large physical block number into an extent struct,
256 * breaking it into parts
257 */
258static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
259 ext4_fsblk_t pb)
260{
261 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
262 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
263 0xffff);
264}
265
266/*
267 * ext4_idx_store_pblock:
268 * stores a large physical block number into an index struct,
269 * breaking it into parts
270 */
271static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
272 ext4_fsblk_t pb)
273{
274 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
275 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
276 0xffff);
277}
278
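A self-contained round trip of the lo/hi split used by the four helpers above; plain stdint types stand in for the little-endian on-disk fields, and the double shift (pb >> 31) >> 1 mirrors the kernel's way of shifting by 32 bits (presumably kept to stay safe if the block type were ever only 32 bits wide):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t pb = 0x0000123456789abcULL;	/* a 48-bit physical block number */
	uint32_t lo = (uint32_t)(pb & 0xffffffff);
	uint16_t hi = (uint16_t)(((pb >> 31) >> 1) & 0xffff);
	uint64_t back = (uint64_t)lo | (((uint64_t)hi << 31) << 1);

	assert(back == pb);	/* lossless for any block number below 2^48 */
	return 0;
}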
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, 279extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks); 280 ext4_lblk_t lblocks);
230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
233extern int ext4_extent_tree_init(handle_t *, struct inode *); 281extern int ext4_extent_tree_init(handle_t *, struct inode *);
234extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 282extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
235 int num, 283 int num,
@@ -237,19 +285,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
237extern int ext4_can_extents_be_merged(struct inode *inode, 285extern int ext4_can_extents_be_merged(struct inode *inode,
238 struct ext4_extent *ex1, 286 struct ext4_extent *ex1,
239 struct ext4_extent *ex2); 287 struct ext4_extent *ex2);
240extern int ext4_ext_try_to_merge(struct inode *inode,
241 struct ext4_ext_path *path,
242 struct ext4_extent *);
243extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
244extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); 288extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
245extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
246 ext_prepare_callback, void *);
247extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 289extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
248 struct ext4_ext_path *); 290 struct ext4_ext_path *);
249extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
250 ext4_lblk_t *, ext4_fsblk_t *);
251extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
252 ext4_lblk_t *, ext4_fsblk_t *);
253extern void ext4_ext_drop_refs(struct ext4_ext_path *); 291extern void ext4_ext_drop_refs(struct ext4_ext_path *);
254extern int ext4_ext_check_inode(struct inode *inode); 292extern int ext4_ext_check_inode(struct inode *inode);
255#endif /* _EXT4_EXTENTS */ 293#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 handle_t *handle, struct buffer_head *bh)
11{
12 int err = 0;
13
14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err)
17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err);
19 }
20 return err;
21}
22
23int __ext4_journal_get_write_access(const char *where, unsigned int line, 9int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 handle_t *handle, struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
25{ 11{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
86 86
87#ifdef CONFIG_QUOTA 87#ifdef CONFIG_QUOTA
88/* Amount of blocks needed for quota update - we know that the structure was 88/* Amount of blocks needed for quota update - we know that the structure was
89 * allocated so we need to update only inode+data */ 89 * allocated so we need to update only the data block */
90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) 90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91/* Amount of blocks needed for quota insert/delete - we do some block writes 91/* Amount of blocks needed for quota insert/delete - we do some block writes
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn, 126 const char *err_fn,
127 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
128 128
129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
130 handle_t *handle, struct buffer_head *bh);
131
132int __ext4_journal_get_write_access(const char *where, unsigned int line, 129int __ext4_journal_get_write_access(const char *where, unsigned int line,
133 handle_t *handle, struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
134 131
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
146int __ext4_handle_dirty_super(const char *where, unsigned int line, 143int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb); 144 handle_t *handle, struct super_block *sb);
148 145
149#define ext4_journal_get_undo_access(handle, bh) \
150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
151#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
@@ -202,13 +197,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
202 return 1; 197 return 1;
203} 198}
204 199
205static inline void ext4_journal_release_buffer(handle_t *handle,
206 struct buffer_head *bh)
207{
208 if (ext4_handle_valid(handle))
209 jbd2_journal_release_buffer(handle, bh);
210}
211
212static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 200static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
213{ 201{
214 return ext4_journal_start_sb(inode->i_sb, nblocks); 202 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -253,7 +241,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
253static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 241static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
254{ 242{
255 if (ext4_handle_valid(handle)) 243 if (ext4_handle_valid(handle))
256 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 244 return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
257 return 0; 245 return 0;
258} 246}
259 247
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..f815cc81e7a2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,54 +44,14 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47#include <trace/events/ext4.h>
47 48
48/* 49static int ext4_split_extent(handle_t *handle,
49 * ext_pblock: 50 struct inode *inode,
50 * combine low and high parts of physical block number into ext4_fsblk_t 51 struct ext4_ext_path *path,
51 */ 52 struct ext4_map_blocks *map,
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex) 53 int split_flag,
53{ 54 int flags);
54 ext4_fsblk_t block;
55
56 block = le32_to_cpu(ex->ee_start_lo);
57 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58 return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67 ext4_fsblk_t block;
68
69 block = le32_to_cpu(ix->ei_leaf_lo);
70 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71 return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
95 55
96static int ext4_ext_truncate_extend_restart(handle_t *handle, 56static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode, 57 struct inode *inode,
@@ -166,10 +126,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
166 struct ext4_extent *ex; 126 struct ext4_extent *ex;
167 depth = path->p_depth; 127 depth = path->p_depth;
168 128
169 /* try to predict block placement */ 129 /*
130 * Try to predict block placement assuming that we are
131 * filling in a file which will eventually be
132 * non-sparse --- i.e., in the case of libbfd writing
133 * an ELF object's sections out-of-order but in a way
134 * that eventually results in a contiguous object or
135 * executable file, or some database extending a table
136 * space file. However, this is actually somewhat
137 * non-ideal if we are writing a sparse file such as
138 * qemu or KVM writing a raw image file that is going
139 * to stay fairly sparse, since it will end up
140 * fragmenting the file system's free space. Maybe we
141 * should have some heuristics or some way to allow
142 * userspace to pass a hint to the file system,
143 * especially if the latter case turns out to be
144 * common.
145 */
170 ex = path[depth].p_ext; 146 ex = path[depth].p_ext;
171 if (ex) 147 if (ex) {
172 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); 148 ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
149 ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
150
151 if (block > ext_block)
152 return ext_pblk + (block - ext_block);
153 else
154 return ext_pblk - (ext_block - block);
155 }
173 156
174 /* it looks like index is empty; 157 /* it looks like index is empty;
175 * try to find starting block from index itself */ 158 * try to find starting block from index itself */
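Restated with plain integer types, the new branch above keeps the requested block at the same logical-to-physical distance as the neighbouring extent, and now handles targets on either side of the extent's start (the removed one-liner wrapped around when block sat to the left). A hypothetical standalone version:

#include <stdint.h>

/* keep the same logical->physical offset as a nearby extent */
uint64_t goal_near_extent(uint64_t ext_pblk, uint32_t ext_lblk, uint32_t block)
{
	if (block > ext_lblk)
		return ext_pblk + (block - ext_lblk);	/* target to the right */
	return ext_pblk - (ext_lblk - block);		/* target to the left */
}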
@@ -216,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
216static ext4_fsblk_t 199static ext4_fsblk_t
217ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, 200ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
218 struct ext4_ext_path *path, 201 struct ext4_ext_path *path,
219 struct ext4_extent *ex, int *err) 202 struct ext4_extent *ex, int *err, unsigned int flags)
220{ 203{
221 ext4_fsblk_t goal, newblock; 204 ext4_fsblk_t goal, newblock;
222 205
223 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 206 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
224 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); 207 newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
208 NULL, err);
225 return newblock; 209 return newblock;
226} 210}
227 211
@@ -292,7 +276,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
292 * to allocate @blocks 276 * to allocate @blocks
293 * Worst case is one block per extent 277
294 */ 278 */
295int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) 279int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
296{ 280{
297 struct ext4_inode_info *ei = EXT4_I(inode); 281 struct ext4_inode_info *ei = EXT4_I(inode);
298 int idxs, num = 0; 282 int idxs, num = 0;
@@ -354,7 +338,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
354 338
355static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 339static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
356{ 340{
357 ext4_fsblk_t block = ext_pblock(ext); 341 ext4_fsblk_t block = ext4_ext_pblock(ext);
358 int len = ext4_ext_get_actual_len(ext); 342 int len = ext4_ext_get_actual_len(ext);
359 343
360 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 344 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +347,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
363static int ext4_valid_extent_idx(struct inode *inode, 347static int ext4_valid_extent_idx(struct inode *inode,
364 struct ext4_extent_idx *ext_idx) 348 struct ext4_extent_idx *ext_idx)
365{ 349{
366 ext4_fsblk_t block = idx_pblock(ext_idx); 350 ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
367 351
368 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); 352 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
369} 353}
@@ -463,13 +447,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
463 for (k = 0; k <= l; k++, path++) { 447 for (k = 0; k <= l; k++, path++) {
464 if (path->p_idx) { 448 if (path->p_idx) {
465 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 449 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
466 idx_pblock(path->p_idx)); 450 ext4_idx_pblock(path->p_idx));
467 } else if (path->p_ext) { 451 } else if (path->p_ext) {
468 ext_debug(" %d:[%d]%d:%llu ", 452 ext_debug(" %d:[%d]%d:%llu ",
469 le32_to_cpu(path->p_ext->ee_block), 453 le32_to_cpu(path->p_ext->ee_block),
470 ext4_ext_is_uninitialized(path->p_ext), 454 ext4_ext_is_uninitialized(path->p_ext),
471 ext4_ext_get_actual_len(path->p_ext), 455 ext4_ext_get_actual_len(path->p_ext),
472 ext_pblock(path->p_ext)); 456 ext4_ext_pblock(path->p_ext));
473 } else 457 } else
474 ext_debug(" []"); 458 ext_debug(" []");
475 } 459 }
@@ -494,13 +478,47 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
494 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 478 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
495 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 479 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
496 ext4_ext_is_uninitialized(ex), 480 ext4_ext_is_uninitialized(ex),
497 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 481 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
498 } 482 }
499 ext_debug("\n"); 483 ext_debug("\n");
500} 484}
485
486static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
487 ext4_fsblk_t newblock, int level)
488{
489 int depth = ext_depth(inode);
490 struct ext4_extent *ex;
491
492 if (depth != level) {
493 struct ext4_extent_idx *idx;
494 idx = path[level].p_idx;
495 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
496 ext_debug("%d: move %d:%llu in new index %llu\n", level,
497 le32_to_cpu(idx->ei_block),
498 ext4_idx_pblock(idx),
499 newblock);
500 idx++;
501 }
502
503 return;
504 }
505
506 ex = path[depth].p_ext;
507 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
508 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
509 le32_to_cpu(ex->ee_block),
510 ext4_ext_pblock(ex),
511 ext4_ext_is_uninitialized(ex),
512 ext4_ext_get_actual_len(ex),
513 newblock);
514 ex++;
515 }
516}
517
501#else 518#else
502#define ext4_ext_show_path(inode, path) 519#define ext4_ext_show_path(inode, path)
503#define ext4_ext_show_leaf(inode, path) 520#define ext4_ext_show_leaf(inode, path)
521#define ext4_ext_show_move(inode, path, newblock, level)
504#endif 522#endif
505 523
506void ext4_ext_drop_refs(struct ext4_ext_path *path) 524void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -545,7 +563,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
545 563
546 path->p_idx = l - 1; 564 path->p_idx = l - 1;
547 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 565 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
548 idx_pblock(path->p_idx)); 566 ext4_idx_pblock(path->p_idx));
549 567
550#ifdef CHECK_BINSEARCH 568#ifdef CHECK_BINSEARCH
551 { 569 {
@@ -614,7 +632,7 @@ ext4_ext_binsearch(struct inode *inode,
614 path->p_ext = l - 1; 632 path->p_ext = l - 1;
615 ext_debug(" -> %d:%llu:[%d]%d ", 633 ext_debug(" -> %d:%llu:[%d]%d ",
616 le32_to_cpu(path->p_ext->ee_block), 634 le32_to_cpu(path->p_ext->ee_block),
617 ext_pblock(path->p_ext), 635 ext4_ext_pblock(path->p_ext),
618 ext4_ext_is_uninitialized(path->p_ext), 636 ext4_ext_is_uninitialized(path->p_ext),
619 ext4_ext_get_actual_len(path->p_ext)); 637 ext4_ext_get_actual_len(path->p_ext));
620 638
@@ -682,7 +700,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
682 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 700 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
683 701
684 ext4_ext_binsearch_idx(inode, path + ppos, block); 702 ext4_ext_binsearch_idx(inode, path + ppos, block);
685 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 703 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
686 path[ppos].p_depth = i; 704 path[ppos].p_depth = i;
687 path[ppos].p_ext = NULL; 705 path[ppos].p_ext = NULL;
688 706
@@ -690,6 +708,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
690 if (unlikely(!bh)) 708 if (unlikely(!bh))
691 goto err; 709 goto err;
692 if (!bh_uptodate_or_lock(bh)) { 710 if (!bh_uptodate_or_lock(bh)) {
711 trace_ext4_ext_load_extent(inode, block,
712 path[ppos].p_block);
693 if (bh_submit_read(bh) < 0) { 713 if (bh_submit_read(bh) < 0) {
694 put_bh(bh); 714 put_bh(bh);
695 goto err; 715 goto err;
@@ -721,7 +741,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
721 ext4_ext_binsearch(inode, path + ppos, block); 741 ext4_ext_binsearch(inode, path + ppos, block);
722 /* if not an empty leaf */ 742 /* if not an empty leaf */
723 if (path[ppos].p_ext) 743 if (path[ppos].p_ext)
724 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 744 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
725 745
726 ext4_ext_show_path(inode, path); 746 ext4_ext_show_path(inode, path);
727 747
@@ -739,9 +759,9 @@ err:
739 * insert new index [@logical;@ptr] into the block at @curp; 759 * insert new index [@logical;@ptr] into the block at @curp;
740 * check where to insert: before @curp or after @curp 760 * check where to insert: before @curp or after @curp
741 */ 761 */
742int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 762static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
743 struct ext4_ext_path *curp, 763 struct ext4_ext_path *curp,
744 int logical, ext4_fsblk_t ptr) 764 int logical, ext4_fsblk_t ptr)
745{ 765{
746 struct ext4_extent_idx *ix; 766 struct ext4_extent_idx *ix;
747 int len, err; 767 int len, err;
@@ -814,14 +834,14 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
814 * - initializes subtree 834 * - initializes subtree
815 */ 835 */
816static int ext4_ext_split(handle_t *handle, struct inode *inode, 836static int ext4_ext_split(handle_t *handle, struct inode *inode,
817 struct ext4_ext_path *path, 837 unsigned int flags,
818 struct ext4_extent *newext, int at) 838 struct ext4_ext_path *path,
839 struct ext4_extent *newext, int at)
819{ 840{
820 struct buffer_head *bh = NULL; 841 struct buffer_head *bh = NULL;
821 int depth = ext_depth(inode); 842 int depth = ext_depth(inode);
822 struct ext4_extent_header *neh; 843 struct ext4_extent_header *neh;
823 struct ext4_extent_idx *fidx; 844 struct ext4_extent_idx *fidx;
824 struct ext4_extent *ex;
825 int i = at, k, m, a; 845 int i = at, k, m, a;
826 ext4_fsblk_t newblock, oldblock; 846 ext4_fsblk_t newblock, oldblock;
827 __le32 border; 847 __le32 border;
@@ -869,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
869 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 889 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
870 for (a = 0; a < depth - at; a++) { 890 for (a = 0; a < depth - at; a++) {
871 newblock = ext4_ext_new_meta_block(handle, inode, path, 891 newblock = ext4_ext_new_meta_block(handle, inode, path,
872 newext, &err); 892 newext, &err, flags);
873 if (newblock == 0) 893 if (newblock == 0)
874 goto cleanup; 894 goto cleanup;
875 ablocks[a] = newblock; 895 ablocks[a] = newblock;
@@ -898,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
898 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 918 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
899 neh->eh_magic = EXT4_EXT_MAGIC; 919 neh->eh_magic = EXT4_EXT_MAGIC;
900 neh->eh_depth = 0; 920 neh->eh_depth = 0;
901 ex = EXT_FIRST_EXTENT(neh);
902 921
903 /* move remainder of path[depth] to the new leaf */ 922 /* move remainder of path[depth] to the new leaf */
904 if (unlikely(path[depth].p_hdr->eh_entries != 923 if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -910,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
910 goto cleanup; 929 goto cleanup;
911 } 930 }
912 /* start copy from next extent */ 931 /* start copy from next extent */
913 /* TODO: we could do it by single memmove */ 932 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
914 m = 0; 933 ext4_ext_show_move(inode, path, newblock, depth);
915 path[depth].p_ext++;
916 while (path[depth].p_ext <=
917 EXT_MAX_EXTENT(path[depth].p_hdr)) {
918 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
919 le32_to_cpu(path[depth].p_ext->ee_block),
920 ext_pblock(path[depth].p_ext),
921 ext4_ext_is_uninitialized(path[depth].p_ext),
922 ext4_ext_get_actual_len(path[depth].p_ext),
923 newblock);
924 /*memmove(ex++, path[depth].p_ext++,
925 sizeof(struct ext4_extent));
926 neh->eh_entries++;*/
927 path[depth].p_ext++;
928 m++;
929 }
930 if (m) { 934 if (m) {
931 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); 935 struct ext4_extent *ex;
936 ex = EXT_FIRST_EXTENT(neh);
937 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
932 le16_add_cpu(&neh->eh_entries, m); 938 le16_add_cpu(&neh->eh_entries, m);
933 } 939 }
934 940
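The rewritten copy above boils down to: count the entries sitting after the current position by pointer subtraction, then move them with one memmove instead of a per-entry loop. A small generic model of that pattern (all names hypothetical):

#include <string.h>

struct entry { int payload; };

/* move every entry strictly after *cur, up to *last, into dst; returns count */
int move_tail(struct entry *dst, struct entry *cur, struct entry *last)
{
	int m = (int)(last - cur);	/* number of entries after cur */

	if (m > 0)
		memmove(dst, cur + 1, sizeof(*dst) * m);
	return m;
}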
@@ -990,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
990 996
991 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 997 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
992 i, newblock, le32_to_cpu(border), oldblock); 998 i, newblock, le32_to_cpu(border), oldblock);
993 /* copy indexes */
994 m = 0;
995 path[i].p_idx++;
996 999
997 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 /* move remainder of path[i] to the new index block */
998 EXT_MAX_INDEX(path[i].p_hdr));
999 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1001 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1000 EXT_LAST_INDEX(path[i].p_hdr))) { 1002 EXT_LAST_INDEX(path[i].p_hdr))) {
1001 EXT4_ERROR_INODE(inode, 1003 EXT4_ERROR_INODE(inode,
@@ -1004,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1004 err = -EIO; 1006 err = -EIO;
1005 goto cleanup; 1007 goto cleanup;
1006 } 1008 }
1007 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1009 /* start copy indexes */
1008 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1010 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1009 le32_to_cpu(path[i].p_idx->ei_block), 1011 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1010 idx_pblock(path[i].p_idx), 1012 EXT_MAX_INDEX(path[i].p_hdr));
1011 newblock); 1013 ext4_ext_show_move(inode, path, newblock, i);
1012 /*memmove(++fidx, path[i].p_idx++,
1013 sizeof(struct ext4_extent_idx));
1014 neh->eh_entries++;
1015 BUG_ON(neh->eh_entries > neh->eh_max);*/
1016 path[i].p_idx++;
1017 m++;
1018 }
1019 if (m) { 1014 if (m) {
1020 memmove(++fidx, path[i].p_idx - m, 1015 memmove(++fidx, path[i].p_idx,
1021 sizeof(struct ext4_extent_idx) * m); 1016 sizeof(struct ext4_extent_idx) * m);
1022 le16_add_cpu(&neh->eh_entries, m); 1017 le16_add_cpu(&neh->eh_entries, m);
1023 } 1018 }
@@ -1060,7 +1055,7 @@ cleanup:
1060 for (i = 0; i < depth; i++) { 1055 for (i = 0; i < depth; i++) {
1061 if (!ablocks[i]) 1056 if (!ablocks[i])
1062 continue; 1057 continue;
1063 ext4_free_blocks(handle, inode, 0, ablocks[i], 1, 1058 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1064 EXT4_FREE_BLOCKS_METADATA); 1059 EXT4_FREE_BLOCKS_METADATA);
1065 } 1060 }
1066 } 1061 }
@@ -1078,8 +1073,9 @@ cleanup:
1078 * just created block 1073 * just created block
1079 */ 1074 */
1080static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1075static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1081 struct ext4_ext_path *path, 1076 unsigned int flags,
1082 struct ext4_extent *newext) 1077 struct ext4_ext_path *path,
1078 struct ext4_extent *newext)
1083{ 1079{
1084 struct ext4_ext_path *curp = path; 1080 struct ext4_ext_path *curp = path;
1085 struct ext4_extent_header *neh; 1081 struct ext4_extent_header *neh;
@@ -1087,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1087 ext4_fsblk_t newblock; 1083 ext4_fsblk_t newblock;
1088 int err = 0; 1084 int err = 0;
1089 1085
1090 newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); 1086 newblock = ext4_ext_new_meta_block(handle, inode, path,
1087 newext, &err, flags);
1091 if (newblock == 0) 1088 if (newblock == 0)
1092 return err; 1089 return err;
1093 1090
@@ -1146,7 +1143,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1146 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1143 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1147 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1144 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1148 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1145 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149 idx_pblock(EXT_FIRST_INDEX(neh))); 1146 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1150 1147
1151 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1148 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1152 err = ext4_ext_dirty(handle, inode, curp); 1149 err = ext4_ext_dirty(handle, inode, curp);
@@ -1162,8 +1159,9 @@ out:
1162 * if no free index is found, then it requests in-depth growing. 1159 * if no free index is found, then it requests in-depth growing.
1163 */ 1160 */
1164static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1161static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1165 struct ext4_ext_path *path, 1162 unsigned int flags,
1166 struct ext4_extent *newext) 1163 struct ext4_ext_path *path,
1164 struct ext4_extent *newext)
1167{ 1165{
1168 struct ext4_ext_path *curp; 1166 struct ext4_ext_path *curp;
1169 int depth, i, err = 0; 1167 int depth, i, err = 0;
@@ -1183,7 +1181,7 @@ repeat:
1183 if (EXT_HAS_FREE_INDEX(curp)) { 1181 if (EXT_HAS_FREE_INDEX(curp)) {
1184 /* if we found index with free entry, then use that 1182 /* if we found index with free entry, then use that
1185 * entry: create all needed subtree and add new leaf */ 1183 * entry: create all needed subtree and add new leaf */
1186 err = ext4_ext_split(handle, inode, path, newext, i); 1184 err = ext4_ext_split(handle, inode, flags, path, newext, i);
1187 if (err) 1185 if (err)
1188 goto out; 1186 goto out;
1189 1187
@@ -1196,7 +1194,8 @@ repeat:
1196 err = PTR_ERR(path); 1194 err = PTR_ERR(path);
1197 } else { 1195 } else {
1198 /* tree is full, time to grow in depth */ 1196 /* tree is full, time to grow in depth */
1199 err = ext4_ext_grow_indepth(handle, inode, path, newext); 1197 err = ext4_ext_grow_indepth(handle, inode, flags,
1198 path, newext);
1200 if (err) 1199 if (err)
1201 goto out; 1200 goto out;
1202 1201
@@ -1232,9 +1231,9 @@ out:
1232 * returns 0 at @phys 1231 * returns 0 at @phys
1233 * return value contains 0 (success) or error code 1232 * return value contains 0 (success) or error code
1234 */ 1233 */
1235int 1234static int ext4_ext_search_left(struct inode *inode,
1236ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, 1235 struct ext4_ext_path *path,
1237 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1236 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1238{ 1237{
1239 struct ext4_extent_idx *ix; 1238 struct ext4_extent_idx *ix;
1240 struct ext4_extent *ex; 1239 struct ext4_extent *ex;
@@ -1286,7 +1285,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1286 } 1285 }
1287 1286
1288 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1287 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1289 *phys = ext_pblock(ex) + ee_len - 1; 1288 *phys = ext4_ext_pblock(ex) + ee_len - 1;
1290 return 0; 1289 return 0;
1291} 1290}
1292 1291
@@ -1297,9 +1296,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1297 * returns 0 at @phys 1296 * returns 0 at @phys
1298 * return value contains 0 (success) or error code 1297 * return value contains 0 (success) or error code
1299 */ 1298 */
1300int 1299static int ext4_ext_search_right(struct inode *inode,
1301ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, 1300 struct ext4_ext_path *path,
1302 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1301 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1303{ 1302{
1304 struct buffer_head *bh = NULL; 1303 struct buffer_head *bh = NULL;
1305 struct ext4_extent_header *eh; 1304 struct ext4_extent_header *eh;
@@ -1342,7 +1341,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1342 } 1341 }
1343 } 1342 }
1344 *logical = le32_to_cpu(ex->ee_block); 1343 *logical = le32_to_cpu(ex->ee_block);
1345 *phys = ext_pblock(ex); 1344 *phys = ext4_ext_pblock(ex);
1346 return 0; 1345 return 0;
1347 } 1346 }
1348 1347
@@ -1357,7 +1356,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1357 /* next allocated block in this leaf */ 1356 /* next allocated block in this leaf */
1358 ex++; 1357 ex++;
1359 *logical = le32_to_cpu(ex->ee_block); 1358 *logical = le32_to_cpu(ex->ee_block);
1360 *phys = ext_pblock(ex); 1359 *phys = ext4_ext_pblock(ex);
1361 return 0; 1360 return 0;
1362 } 1361 }
1363 1362
@@ -1376,7 +1375,7 @@ got_index:
1376 * follow it and find the closest allocated 1375 * follow it and find the closest allocated
1377 * block to the right */ 1376 * block to the right */
1378 ix++; 1377 ix++;
1379 block = idx_pblock(ix); 1378 block = ext4_idx_pblock(ix);
1380 while (++depth < path->p_depth) { 1379 while (++depth < path->p_depth) {
1381 bh = sb_bread(inode->i_sb, block); 1380 bh = sb_bread(inode->i_sb, block);
1382 if (bh == NULL) 1381 if (bh == NULL)
@@ -1388,7 +1387,7 @@ got_index:
1388 return -EIO; 1387 return -EIO;
1389 } 1388 }
1390 ix = EXT_FIRST_INDEX(eh); 1389 ix = EXT_FIRST_INDEX(eh);
1391 block = idx_pblock(ix); 1390 block = ext4_idx_pblock(ix);
1392 put_bh(bh); 1391 put_bh(bh);
1393 } 1392 }
1394 1393
@@ -1402,14 +1401,14 @@ got_index:
1402 } 1401 }
1403 ex = EXT_FIRST_EXTENT(eh); 1402 ex = EXT_FIRST_EXTENT(eh);
1404 *logical = le32_to_cpu(ex->ee_block); 1403 *logical = le32_to_cpu(ex->ee_block);
1405 *phys = ext_pblock(ex); 1404 *phys = ext4_ext_pblock(ex);
1406 put_bh(bh); 1405 put_bh(bh);
1407 return 0; 1406 return 0;
1408} 1407}
1409 1408
1410/* 1409/*
1411 * ext4_ext_next_allocated_block: 1410 * ext4_ext_next_allocated_block:
1412 * returns allocated block in subsequent extent or EXT_MAX_BLOCK. 1411 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1413 * NOTE: it considers block number from index entry as 1412 * NOTE: it considers block number from index entry as
1414 * allocated block. Thus, index entries have to be consistent 1413 * allocated block. Thus, index entries have to be consistent
1415 * with leaves. 1414 * with leaves.
@@ -1423,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1423 depth = path->p_depth; 1422 depth = path->p_depth;
1424 1423
1425 if (depth == 0 && path->p_ext == NULL) 1424 if (depth == 0 && path->p_ext == NULL)
1426 return EXT_MAX_BLOCK; 1425 return EXT_MAX_BLOCKS;
1427 1426
1428 while (depth >= 0) { 1427 while (depth >= 0) {
1429 if (depth == path->p_depth) { 1428 if (depth == path->p_depth) {
@@ -1440,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1440 depth--; 1439 depth--;
1441 } 1440 }
1442 1441
1443 return EXT_MAX_BLOCK; 1442 return EXT_MAX_BLOCKS;
1444} 1443}
1445 1444
1446/* 1445/*
1447 * ext4_ext_next_leaf_block: 1446 * ext4_ext_next_leaf_block:
1448 * returns first allocated block from next leaf or EXT_MAX_BLOCK 1447 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1449 */ 1448 */
1450static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, 1449static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1451 struct ext4_ext_path *path) 1450 struct ext4_ext_path *path)
@@ -1457,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1457 1456
1458 /* zero-tree has no leaf blocks at all */ 1457 /* zero-tree has no leaf blocks at all */
1459 if (depth == 0) 1458 if (depth == 0)
1460 return EXT_MAX_BLOCK; 1459 return EXT_MAX_BLOCKS;
1461 1460
1462 /* go to index block */ 1461 /* go to index block */
1463 depth--; 1462 depth--;
@@ -1470,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1470 depth--; 1469 depth--;
1471 } 1470 }
1472 1471
1473 return EXT_MAX_BLOCK; 1472 return EXT_MAX_BLOCKS;
1474} 1473}
1475 1474
1476/* 1475/*
@@ -1573,7 +1572,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1573 return 0; 1572 return 0;
1574#endif 1573#endif
1575 1574
1576 if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) 1575 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1577 return 1; 1576 return 1;
1578 return 0; 1577 return 0;
1579} 1578}
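In plain terms, two extents qualify for merging only when they are contiguous in both address spaces at once; the logical check happens earlier in the function and the line above supplies the physical one. A simplified model (the real code also caps the combined length and treats uninitialized extents specially):

#include <stdbool.h>
#include <stdint.h>

struct xext {		/* simplified stand-in for struct ext4_extent */
	uint32_t lblk;	/* first logical block */
	uint64_t pblk;	/* first physical block */
	uint16_t len;	/* number of blocks */
};

bool mergeable(const struct xext *a, const struct xext *b)
{
	return a->lblk + a->len == b->lblk &&	/* logically contiguous */
	       a->pblk + a->len == b->pblk;	/* physically contiguous */
}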
@@ -1585,9 +1584,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1585 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1584 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1586 * 1 if they got merged. 1585 * 1 if they got merged.
1587 */ 1586 */
1588int ext4_ext_try_to_merge(struct inode *inode, 1587static int ext4_ext_try_to_merge_right(struct inode *inode,
1589 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1590 struct ext4_extent *ex) 1589 struct ext4_extent *ex)
1591{ 1590{
1592 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1593 unsigned int depth, len; 1592 unsigned int depth, len;
@@ -1625,6 +1624,31 @@ int ext4_ext_try_to_merge(struct inode *inode,
1625} 1624}
1626 1625
1627/* 1626/*
1627 * This function tries to merge the @ex extent with its neighbours in the tree.
1628 * Returns 1 if it merged left, else 0.
1629 */
1630static int ext4_ext_try_to_merge(struct inode *inode,
1631 struct ext4_ext_path *path,
1632 struct ext4_extent *ex) {
1633 struct ext4_extent_header *eh;
1634 unsigned int depth;
1635 int merge_done = 0;
1636 int ret = 0;
1637
1638 depth = ext_depth(inode);
1639 BUG_ON(path[depth].p_hdr == NULL);
1640 eh = path[depth].p_hdr;
1641
1642 if (ex > EXT_FIRST_EXTENT(eh))
1643 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1644
1645 if (!merge_done)
1646 ret = ext4_ext_try_to_merge_right(inode, path, ex);
1647
1648 return ret;
1649}
1650
1651/*
1628 * check if a portion of the "newext" extent overlaps with an 1652 * check if a portion of the "newext" extent overlaps with an
1629 * existing extent. 1653 * existing extent.
1630 * 1654 *
@@ -1632,9 +1656,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1632 * such that there will be no overlap, and then returns 1. 1656 * such that there will be no overlap, and then returns 1.
1633 * If there is no overlap found, it returns 0. 1657 * If there is no overlap found, it returns 0.
1634 */ 1658 */
1635unsigned int ext4_ext_check_overlap(struct inode *inode, 1659static unsigned int ext4_ext_check_overlap(struct inode *inode,
1636 struct ext4_extent *newext, 1660 struct ext4_extent *newext,
1637 struct ext4_ext_path *path) 1661 struct ext4_ext_path *path)
1638{ 1662{
1639 ext4_lblk_t b1, b2; 1663 ext4_lblk_t b1, b2;
1640 unsigned int depth, len1; 1664 unsigned int depth, len1;
@@ -1653,13 +1677,13 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1653 */ 1677 */
1654 if (b2 < b1) { 1678 if (b2 < b1) {
1655 b2 = ext4_ext_next_allocated_block(path); 1679 b2 = ext4_ext_next_allocated_block(path);
1656 if (b2 == EXT_MAX_BLOCK) 1680 if (b2 == EXT_MAX_BLOCKS)
1657 goto out; 1681 goto out;
1658 } 1682 }
1659 1683
1660 /* check for wrap through zero on extent logical start block*/ 1684 /* check for wrap through zero on extent logical start block*/
1661 if (b1 + len1 < b1) { 1685 if (b1 + len1 < b1) {
1662 len1 = EXT_MAX_BLOCK - b1; 1686 len1 = EXT_MAX_BLOCKS - b1;
1663 newext->ee_len = cpu_to_le16(len1); 1687 newext->ee_len = cpu_to_le16(len1);
1664 ret = 1; 1688 ret = 1;
1665 } 1689 }
@@ -1690,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1690 int depth, len, err; 1714 int depth, len, err;
1691 ext4_lblk_t next; 1715 ext4_lblk_t next;
1692 unsigned uninitialized = 0; 1716 unsigned uninitialized = 0;
1717 int flags = 0;
1693 1718
1694 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1719 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1695 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1720 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1706,11 +1731,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1706 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1731 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1707 && ext4_can_extents_be_merged(inode, ex, newext)) { 1732 && ext4_can_extents_be_merged(inode, ex, newext)) {
1708 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1733 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1709 ext4_ext_is_uninitialized(newext), 1734 ext4_ext_is_uninitialized(newext),
1710 ext4_ext_get_actual_len(newext), 1735 ext4_ext_get_actual_len(newext),
1711 le32_to_cpu(ex->ee_block), 1736 le32_to_cpu(ex->ee_block),
1712 ext4_ext_is_uninitialized(ex), 1737 ext4_ext_is_uninitialized(ex),
1713 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1738 ext4_ext_get_actual_len(ex),
1739 ext4_ext_pblock(ex));
1714 err = ext4_ext_get_access(handle, inode, path + depth); 1740 err = ext4_ext_get_access(handle, inode, path + depth);
1715 if (err) 1741 if (err)
1716 return err; 1742 return err;
@@ -1741,7 +1767,7 @@ repeat:
1741 fex = EXT_LAST_EXTENT(eh); 1767 fex = EXT_LAST_EXTENT(eh);
1742 next = ext4_ext_next_leaf_block(inode, path); 1768 next = ext4_ext_next_leaf_block(inode, path);
1743 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) 1769 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1744 && next != EXT_MAX_BLOCK) { 1770 && next != EXT_MAX_BLOCKS) {
1745 ext_debug("next leaf block - %d\n", next); 1771 ext_debug("next leaf block - %d\n", next);
1746 BUG_ON(npath != NULL); 1772 BUG_ON(npath != NULL);
1747 npath = ext4_ext_find_extent(inode, next, NULL); 1773 npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1750,7 +1776,7 @@ repeat:
1750 BUG_ON(npath->p_depth != path->p_depth); 1776 BUG_ON(npath->p_depth != path->p_depth);
1751 eh = npath[depth].p_hdr; 1777 eh = npath[depth].p_hdr;
1752 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1778 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1753 ext_debug("next leaf isnt full(%d)\n", 1779 ext_debug("next leaf isn't full(%d)\n",
1754 le16_to_cpu(eh->eh_entries)); 1780 le16_to_cpu(eh->eh_entries));
1755 path = npath; 1781 path = npath;
1756 goto repeat; 1782 goto repeat;
@@ -1763,7 +1789,9 @@ repeat:
1763 * There is no free space in the found leaf. 1789 * There is no free space in the found leaf.
1764 * We're gonna add a new leaf in the tree. 1790 * We're gonna add a new leaf in the tree.
1765 */ 1791 */
1766 err = ext4_ext_create_new_leaf(handle, inode, path, newext); 1792 if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1793 flags = EXT4_MB_USE_ROOT_BLOCKS;
1794 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1767 if (err) 1795 if (err)
1768 goto cleanup; 1796 goto cleanup;
1769 depth = ext_depth(inode); 1797 depth = ext_depth(inode);
@@ -1780,7 +1808,7 @@ has_space:
1780 /* there is no extent in this leaf, create first one */ 1808 /* there is no extent in this leaf, create first one */
1781 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1809 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1782 le32_to_cpu(newext->ee_block), 1810 le32_to_cpu(newext->ee_block),
1783 ext_pblock(newext), 1811 ext4_ext_pblock(newext),
1784 ext4_ext_is_uninitialized(newext), 1812 ext4_ext_is_uninitialized(newext),
1785 ext4_ext_get_actual_len(newext)); 1813 ext4_ext_get_actual_len(newext));
1786 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1814 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1822,7 @@ has_space:
1794 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " 1822 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1795 "move %d from 0x%p to 0x%p\n", 1823 "move %d from 0x%p to 0x%p\n",
1796 le32_to_cpu(newext->ee_block), 1824 le32_to_cpu(newext->ee_block),
1797 ext_pblock(newext), 1825 ext4_ext_pblock(newext),
1798 ext4_ext_is_uninitialized(newext), 1826 ext4_ext_is_uninitialized(newext),
1799 ext4_ext_get_actual_len(newext), 1827 ext4_ext_get_actual_len(newext),
1800 nearex, len, nearex + 1, nearex + 2); 1828 nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1836,7 @@ has_space:
1808 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " 1836 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1809 "move %d from 0x%p to 0x%p\n", 1837 "move %d from 0x%p to 0x%p\n",
1810 le32_to_cpu(newext->ee_block), 1838 le32_to_cpu(newext->ee_block),
1811 ext_pblock(newext), 1839 ext4_ext_pblock(newext),
1812 ext4_ext_is_uninitialized(newext), 1840 ext4_ext_is_uninitialized(newext),
1813 ext4_ext_get_actual_len(newext), 1841 ext4_ext_get_actual_len(newext),
1814 nearex, len, nearex + 1, nearex + 2); 1842 nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1847,7 @@ has_space:
1819 le16_add_cpu(&eh->eh_entries, 1); 1847 le16_add_cpu(&eh->eh_entries, 1);
1820 nearex = path[depth].p_ext; 1848 nearex = path[depth].p_ext;
1821 nearex->ee_block = newext->ee_block; 1849 nearex->ee_block = newext->ee_block;
1822 ext4_ext_store_pblock(nearex, ext_pblock(newext)); 1850 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1823 nearex->ee_len = newext->ee_len; 1851 nearex->ee_len = newext->ee_len;
1824 1852
1825merge: 1853merge:
@@ -1845,9 +1873,9 @@ cleanup:
1845 return err; 1873 return err;
1846} 1874}
1847 1875
1848int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1876static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1849 ext4_lblk_t num, ext_prepare_callback func, 1877 ext4_lblk_t num, ext_prepare_callback func,
1850 void *cbdata) 1878 void *cbdata)
1851{ 1879{
1852 struct ext4_ext_path *path = NULL; 1880 struct ext4_ext_path *path = NULL;
1853 struct ext4_ext_cache cbex; 1881 struct ext4_ext_cache cbex;
@@ -1859,7 +1887,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1859 BUG_ON(func == NULL); 1887 BUG_ON(func == NULL);
1860 BUG_ON(inode == NULL); 1888 BUG_ON(inode == NULL);
1861 1889
1862 while (block < last && block != EXT_MAX_BLOCK) { 1890 while (block < last && block != EXT_MAX_BLOCKS) {
1863 num = last - block; 1891 num = last - block;
1864 /* find extent for this block */ 1892 /* find extent for this block */
1865 down_read(&EXT4_I(inode)->i_data_sem); 1893 down_read(&EXT4_I(inode)->i_data_sem);
@@ -1919,12 +1947,10 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1919 cbex.ec_block = start; 1947 cbex.ec_block = start;
1920 cbex.ec_len = end - start; 1948 cbex.ec_len = end - start;
1921 cbex.ec_start = 0; 1949 cbex.ec_start = 0;
1922 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1923 } else { 1950 } else {
1924 cbex.ec_block = le32_to_cpu(ex->ee_block); 1951 cbex.ec_block = le32_to_cpu(ex->ee_block);
1925 cbex.ec_len = ext4_ext_get_actual_len(ex); 1952 cbex.ec_len = ext4_ext_get_actual_len(ex);
1926 cbex.ec_start = ext_pblock(ex); 1953 cbex.ec_start = ext4_ext_pblock(ex);
1927 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1928 } 1954 }
1929 1955
1930 if (unlikely(cbex.ec_len == 0)) { 1956 if (unlikely(cbex.ec_len == 0)) {
@@ -1932,7 +1958,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1932 err = -EIO; 1958 err = -EIO;
1933 break; 1959 break;
1934 } 1960 }
1935 err = func(inode, path, &cbex, ex, cbdata); 1961 err = func(inode, next, &cbex, ex, cbdata);
1936 ext4_ext_drop_refs(path); 1962 ext4_ext_drop_refs(path);
1937 1963
1938 if (err < 0) 1964 if (err < 0)
@@ -1964,13 +1990,12 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1964 1990
1965static void 1991static void
1966ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1992ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1967 __u32 len, ext4_fsblk_t start, int type) 1993 __u32 len, ext4_fsblk_t start)
1968{ 1994{
1969 struct ext4_ext_cache *cex; 1995 struct ext4_ext_cache *cex;
1970 BUG_ON(len == 0); 1996 BUG_ON(len == 0);
1971 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1997 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1972 cex = &EXT4_I(inode)->i_cached_extent; 1998 cex = &EXT4_I(inode)->i_cached_extent;
1973 cex->ec_type = type;
1974 cex->ec_block = block; 1999 cex->ec_block = block;
1975 cex->ec_len = len; 2000 cex->ec_len = len;
1976 cex->ec_start = start; 2001 cex->ec_start = start;
@@ -1995,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1995 if (ex == NULL) { 2020 if (ex == NULL) {
1996 /* there is no extent yet, so gap is [0;-] */ 2021 /* there is no extent yet, so gap is [0;-] */
1997 lblock = 0; 2022 lblock = 0;
1998 len = EXT_MAX_BLOCK; 2023 len = EXT_MAX_BLOCKS;
1999 ext_debug("cache gap(whole file):"); 2024 ext_debug("cache gap(whole file):");
2000 } else if (block < le32_to_cpu(ex->ee_block)) { 2025 } else if (block < le32_to_cpu(ex->ee_block)) {
2001 lblock = block; 2026 lblock = block;
@@ -2023,43 +2048,90 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2023 } 2048 }
2024 2049
2025 ext_debug(" -> %u:%lu\n", lblock, len); 2050 ext_debug(" -> %u:%lu\n", lblock, len);
2026 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); 2051 ext4_ext_put_in_cache(inode, lblock, len, 0);
2027} 2052}
2028 2053
2029static int 2054/*
2030ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2055 * ext4_ext_in_cache()
2031 struct ext4_extent *ex) 2056 * Checks to see if the given block is in the cache.
2032{ 2057 * If it is, the cached extent is stored in the given
2058 * cache extent pointer. If the cached extent is a hole,
2059 * this routine should be used instead of
2060 * ext4_ext_in_cache if the calling function needs to
2061 * know the size of the hole.
2062 *
2063 * @inode: The file's inode
2064 * @block: The block to look for in the cache
2065 * @ex: Pointer where the cached extent will be stored
2066 * if it contains block
2067 *
2068 * Return 0 if cache is invalid; 1 if the cache is valid
2069 */
2070static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2071 struct ext4_ext_cache *ex) {
2033 struct ext4_ext_cache *cex; 2072 struct ext4_ext_cache *cex;
2034 int ret = EXT4_EXT_CACHE_NO; 2073 struct ext4_sb_info *sbi;
2074 int ret = 0;
2035 2075
2036 /* 2076 /*
2037 * We borrow i_block_reservation_lock to protect i_cached_extent 2077 * We borrow i_block_reservation_lock to protect i_cached_extent
2038 */ 2078 */
2039 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2079 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2040 cex = &EXT4_I(inode)->i_cached_extent; 2080 cex = &EXT4_I(inode)->i_cached_extent;
2081 sbi = EXT4_SB(inode->i_sb);
2041 2082
2042 /* has cache valid data? */ 2083 /* has cache valid data? */
2043 if (cex->ec_type == EXT4_EXT_CACHE_NO) 2084 if (cex->ec_len == 0)
2044 goto errout; 2085 goto errout;
2045 2086
2046 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2047 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2048 if (in_range(block, cex->ec_block, cex->ec_len)) { 2087 if (in_range(block, cex->ec_block, cex->ec_len)) {
2049 ex->ee_block = cpu_to_le32(cex->ec_block); 2088 memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2050 ext4_ext_store_pblock(ex, cex->ec_start);
2051 ex->ee_len = cpu_to_le16(cex->ec_len);
2052 ext_debug("%u cached by %u:%u:%llu\n", 2089 ext_debug("%u cached by %u:%u:%llu\n",
2053 block, 2090 block,
2054 cex->ec_block, cex->ec_len, cex->ec_start); 2091 cex->ec_block, cex->ec_len, cex->ec_start);
2055 ret = cex->ec_type; 2092 ret = 1;
2056 } 2093 }
2057errout: 2094errout:
2095 if (!ret)
2096 sbi->extent_cache_misses++;
2097 else
2098 sbi->extent_cache_hits++;
2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2099 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2059 return ret; 2100 return ret;
2060} 2101}
2061 2102
2062/* 2103/*
2104 * ext4_ext_in_cache()
2105 * Checks to see if the given block is in the cache.
2106 * If it is, the cached extent is stored in the given
2107 * extent pointer.
2108 *
 2109 * @inode: The file's inode
2110 * @block: The block to look for in the cache
2111 * @ex: Pointer where the cached extent will be stored
2112 * if it contains block
2113 *
2114 * Return 0 if cache is invalid; 1 if the cache is valid
2115 */
2116static int
2117ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2118 struct ext4_extent *ex)
2119{
2120 struct ext4_ext_cache cex;
2121 int ret = 0;
2122
2123 if (ext4_ext_check_cache(inode, block, &cex)) {
2124 ex->ee_block = cpu_to_le32(cex.ec_block);
2125 ext4_ext_store_pblock(ex, cex.ec_start);
2126 ex->ee_len = cpu_to_le16(cex.ec_len);
2127 ret = 1;
2128 }
2129
2130 return ret;
2131}
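/*
 * Illustrative sketch (hypothetical helper, not part of this patch):
 * a caller that needs the size of a cached hole uses
 * ext4_ext_check_cache() directly, since the ext4_extent-based
 * wrapper above cannot represent holes.
 */
static ext4_lblk_t sketch_cached_hole_len(struct inode *inode,
					  ext4_lblk_t lblk)
{
	struct ext4_ext_cache cex;

	if (!ext4_ext_check_cache(inode, lblk, &cex))
		return 0;	/* nothing cached for this block */
	if (cex.ec_start != 0)
		return 0;	/* cached entry is a real extent, not a hole */
	/* remaining length of the cached hole, starting at lblk */
	return cex.ec_len - (lblk - cex.ec_block);
}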
2132
2133
2134/*
2063 * ext4_ext_rm_idx: 2135 * ext4_ext_rm_idx:
2064 * removes index from the index block. 2136 * removes index from the index block.
2065 * It's used in truncate case only, thus all requests are for 2137 * It's used in truncate case only, thus all requests are for
@@ -2073,7 +2145,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2073 2145
2074 /* free index block */ 2146 /* free index block */
2075 path--; 2147 path--;
2076 leaf = idx_pblock(path->p_idx); 2148 leaf = ext4_idx_pblock(path->p_idx);
2077 if (unlikely(path->p_hdr->eh_entries == 0)) { 2149 if (unlikely(path->p_hdr->eh_entries == 0)) {
2078 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2150 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2079 return -EIO; 2151 return -EIO;
@@ -2086,7 +2158,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2086 if (err) 2158 if (err)
2087 return err; 2159 return err;
2088 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2160 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2089 ext4_free_blocks(handle, inode, 0, leaf, 1, 2161 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2090 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2162 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2091 return err; 2163 return err;
2092} 2164}
@@ -2181,13 +2253,21 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2181 ext4_fsblk_t start; 2253 ext4_fsblk_t start;
2182 2254
2183 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2255 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2184 start = ext_pblock(ex) + ee_len - num; 2256 start = ext4_ext_pblock(ex) + ee_len - num;
2185 ext_debug("free last %u blocks starting %llu\n", num, start); 2257 ext_debug("free last %u blocks starting %llu\n", num, start);
2186 ext4_free_blocks(handle, inode, 0, start, num, flags); 2258 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2187 } else if (from == le32_to_cpu(ex->ee_block) 2259 } else if (from == le32_to_cpu(ex->ee_block)
2188 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2260 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2189 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2261 /* head removal */
2190 from, to, le32_to_cpu(ex->ee_block), ee_len); 2262 ext4_lblk_t num;
2263 ext4_fsblk_t start;
2264
2265 num = to - from;
2266 start = ext4_ext_pblock(ex);
2267
2268 ext_debug("free first %u blocks starting %llu\n", num, start);
 2269 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2270
2191 } else { 2271 } else {
2192 printk(KERN_INFO "strange request: removal(2) " 2272 printk(KERN_INFO "strange request: removal(2) "
2193 "%u-%u from %u:%u\n", 2273 "%u-%u from %u:%u\n",
@@ -2196,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2196 return 0; 2276 return 0;
2197} 2277}
2198 2278
2279
2280/*
 2281 * ext4_ext_rm_leaf() removes the extents associated with the
 2282 * blocks appearing between "start" and "end", and splits the extents
 2283 * if "start" and "end" appear in the same extent
 2284 *
 2285 * @handle: The journal handle
 2286 * @inode: The file's inode
2287 * @path: The path to the leaf
2288 * @start: The first block to remove
2289 * @end: The last block to remove
2290 */
2199static int 2291static int
2200ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2292ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2201 struct ext4_ext_path *path, ext4_lblk_t start) 2293 struct ext4_ext_path *path, ext4_lblk_t start,
2294 ext4_lblk_t end)
2202{ 2295{
2203 int err = 0, correct_index = 0; 2296 int err = 0, correct_index = 0;
2204 int depth = ext_depth(inode), credits; 2297 int depth = ext_depth(inode), credits;
@@ -2209,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2209 unsigned short ex_ee_len; 2302 unsigned short ex_ee_len;
2210 unsigned uninitialized = 0; 2303 unsigned uninitialized = 0;
2211 struct ext4_extent *ex; 2304 struct ext4_extent *ex;
2305 struct ext4_map_blocks map;
2212 2306
2213 /* the header must be checked already in ext4_ext_remove_space() */ 2307 /* the header must be checked already in ext4_ext_remove_space() */
2214 ext_debug("truncate since %u in leaf\n", start); 2308 ext_debug("truncate since %u in leaf\n", start);
@@ -2238,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2238 path[depth].p_ext = ex; 2332 path[depth].p_ext = ex;
2239 2333
2240 a = ex_ee_block > start ? ex_ee_block : start; 2334 a = ex_ee_block > start ? ex_ee_block : start;
2241 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 2335 b = ex_ee_block+ex_ee_len - 1 < end ?
2242 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2336 ex_ee_block+ex_ee_len - 1 : end;
2243 2337
2244 ext_debug(" border %u:%u\n", a, b); 2338 ext_debug(" border %u:%u\n", a, b);
2245 2339
2246 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2340 /* If this extent is beyond the end of the hole, skip it */
2247 block = 0; 2341 if (end <= ex_ee_block) {
2248 num = 0; 2342 ex--;
2249 BUG(); 2343 ex_ee_block = le32_to_cpu(ex->ee_block);
2344 ex_ee_len = ext4_ext_get_actual_len(ex);
2345 continue;
2346 } else if (a != ex_ee_block &&
2347 b != ex_ee_block + ex_ee_len - 1) {
2348 /*
2349 * If this is a truncate, then this condition should
2350 * never happen because at least one of the end points
2351 * needs to be on the edge of the extent.
2352 */
2353 if (end == EXT_MAX_BLOCKS - 1) {
2354 ext_debug(" bad truncate %u:%u\n",
2355 start, end);
2356 block = 0;
2357 num = 0;
2358 err = -EIO;
2359 goto out;
2360 }
2361 /*
2362 * else this is a hole punch, so the extent needs to
2363 * be split since neither edge of the hole is on the
2364 * extent edge
2365 */
 2366 else {
2367 map.m_pblk = ext4_ext_pblock(ex);
2368 map.m_lblk = ex_ee_block;
2369 map.m_len = b - ex_ee_block;
2370
2371 err = ext4_split_extent(handle,
2372 inode, path, &map, 0,
2373 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2374 EXT4_GET_BLOCKS_PRE_IO);
2375
2376 if (err < 0)
2377 goto out;
2378
2379 ex_ee_len = ext4_ext_get_actual_len(ex);
2380
2381 b = ex_ee_block+ex_ee_len - 1 < end ?
2382 ex_ee_block+ex_ee_len - 1 : end;
2383
2384 /* Then remove tail of this extent */
2385 block = ex_ee_block;
2386 num = a - block;
2387 }
2250 } else if (a != ex_ee_block) { 2388 } else if (a != ex_ee_block) {
2251 /* remove tail of the extent */ 2389 /* remove tail of the extent */
2252 block = ex_ee_block; 2390 block = ex_ee_block;
2253 num = a - block; 2391 num = a - block;
2254 } else if (b != ex_ee_block + ex_ee_len - 1) { 2392 } else if (b != ex_ee_block + ex_ee_len - 1) {
2255 /* remove head of the extent */ 2393 /* remove head of the extent */
2256 block = a; 2394 block = b;
2257 num = b - a; 2395 num = ex_ee_block + ex_ee_len - b;
2258 /* there is no "make a hole" API yet */ 2396
2259 BUG(); 2397 /*
2398 * If this is a truncate, this condition
2399 * should never happen
2400 */
2401 if (end == EXT_MAX_BLOCKS - 1) {
2402 ext_debug(" bad truncate %u:%u\n",
2403 start, end);
2404 err = -EIO;
2405 goto out;
2406 }
2260 } else { 2407 } else {
2261 /* remove whole extent: excellent! */ 2408 /* remove whole extent: excellent! */
2262 block = ex_ee_block; 2409 block = ex_ee_block;
2263 num = 0; 2410 num = 0;
2264 BUG_ON(a != ex_ee_block); 2411 if (a != ex_ee_block) {
2265 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2412 ext_debug(" bad truncate %u:%u\n",
2413 start, end);
2414 err = -EIO;
2415 goto out;
2416 }
2417
2418 if (b != ex_ee_block + ex_ee_len - 1) {
2419 ext_debug(" bad truncate %u:%u\n",
2420 start, end);
2421 err = -EIO;
2422 goto out;
2423 }
2266 } 2424 }
2267 2425
2268 /* 2426 /*
@@ -2293,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2293 if (num == 0) { 2451 if (num == 0) {
2294 /* this extent is removed; mark slot entirely unused */ 2452 /* this extent is removed; mark slot entirely unused */
2295 ext4_ext_store_pblock(ex, 0); 2453 ext4_ext_store_pblock(ex, 0);
2296 le16_add_cpu(&eh->eh_entries, -1); 2454 } else if (block != ex_ee_block) {
2455 /*
2456 * If this was a head removal, then we need to update
2457 * the physical block since it is now at a different
2458 * location
2459 */
2460 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2297 } 2461 }
2298 2462
2299 ex->ee_block = cpu_to_le32(block); 2463 ex->ee_block = cpu_to_le32(block);
@@ -2309,8 +2473,29 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2309 if (err) 2473 if (err)
2310 goto out; 2474 goto out;
2311 2475
2476 /*
2477 * If the extent was completely released,
2478 * we need to remove it from the leaf
2479 */
2480 if (num == 0) {
2481 if (end != EXT_MAX_BLOCKS - 1) {
2482 /*
2483 * For hole punching, we need to scoot all the
2484 * extents up when an extent is removed so that
 2485 * we don't have blank extents in the middle
2486 */
2487 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2488 sizeof(struct ext4_extent));
2489
2490 /* Now get rid of the one at the end */
2491 memset(EXT_LAST_EXTENT(eh), 0,
2492 sizeof(struct ext4_extent));
2493 }
2494 le16_add_cpu(&eh->eh_entries, -1);
2495 }
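/*
 * Illustration (hypothetical leaf): with extents [A][B][C][D] and B
 * fully released by a hole punch, the memmove above shifts C and D
 * left, giving [A][C][D][D]; the memset clears the stale tail to
 * [A][C][D][0]; and le16_add_cpu() drops eh_entries from 4 to 3.
 */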
2496
2312 ext_debug("new extent: %u:%u:%llu\n", block, num, 2497 ext_debug("new extent: %u:%u:%llu\n", block, num,
2313 ext_pblock(ex)); 2498 ext4_ext_pblock(ex));
2314 ex--; 2499 ex--;
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2500 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2501 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2349,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2349 return 1; 2534 return 1;
2350} 2535}
2351 2536
2352static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2538 ext4_lblk_t end)
2353{ 2539{
2354 struct super_block *sb = inode->i_sb; 2540 struct super_block *sb = inode->i_sb;
2355 int depth = ext_depth(inode); 2541 int depth = ext_depth(inode);
@@ -2388,7 +2574,8 @@ again:
2388 while (i >= 0 && err == 0) { 2574 while (i >= 0 && err == 0) {
2389 if (i == depth) { 2575 if (i == depth) {
2390 /* this is leaf block */ 2576 /* this is leaf block */
2391 err = ext4_ext_rm_leaf(handle, inode, path, start); 2577 err = ext4_ext_rm_leaf(handle, inode, path,
2578 start, end);
2392 /* root level has p_bh == NULL, brelse() eats this */ 2579 /* root level has p_bh == NULL, brelse() eats this */
2393 brelse(path[i].p_bh); 2580 brelse(path[i].p_bh);
2394 path[i].p_bh = NULL; 2581 path[i].p_bh = NULL;
@@ -2421,9 +2608,9 @@ again:
2421 struct buffer_head *bh; 2608 struct buffer_head *bh;
2422 /* go to the next level */ 2609 /* go to the next level */
2423 ext_debug("move to level %d (block %llu)\n", 2610 ext_debug("move to level %d (block %llu)\n",
2424 i + 1, idx_pblock(path[i].p_idx)); 2611 i + 1, ext4_idx_pblock(path[i].p_idx));
2425 memset(path + i + 1, 0, sizeof(*path)); 2612 memset(path + i + 1, 0, sizeof(*path));
2426 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2613 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2427 if (!bh) { 2614 if (!bh) {
2428 /* should we reset i_size? */ 2615 /* should we reset i_size? */
2429 err = -EIO; 2616 err = -EIO;
@@ -2535,84 +2722,217 @@ void ext4_ext_release(struct super_block *sb)
2535#endif 2722#endif
2536} 2723}
2537 2724
2538static void bi_complete(struct bio *bio, int error)
2539{
2540 complete((struct completion *)bio->bi_private);
2541}
2542
2543/* FIXME!! we need to try to merge to left or right after zero-out */ 2725/* FIXME!! we need to try to merge to left or right after zero-out */
2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2726static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2545{ 2727{
2728 ext4_fsblk_t ee_pblock;
2729 unsigned int ee_len;
2546 int ret; 2730 int ret;
2547 struct bio *bio;
2548 int blkbits, blocksize;
2549 sector_t ee_pblock;
2550 struct completion event;
2551 unsigned int ee_len, len, done, offset;
2552 2731
2553
2554 blkbits = inode->i_blkbits;
2555 blocksize = inode->i_sb->s_blocksize;
2556 ee_len = ext4_ext_get_actual_len(ex); 2732 ee_len = ext4_ext_get_actual_len(ex);
2557 ee_pblock = ext_pblock(ex); 2733 ee_pblock = ext4_ext_pblock(ex);
2734
2735 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2736 if (ret > 0)
2737 ret = 0;
2738
2739 return ret;
2740}
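/*
 * Sketch of what the helper used above does (an assumption about the
 * generic block-layer API, shown only for illustration): it converts
 * filesystem blocks to 512-byte sectors and lets the block layer
 * zero them, replacing the hand-rolled bio loop this patch removes.
 */
static inline int sketch_zeroout(struct super_block *sb,
				 sector_t block, sector_t nr_blocks)
{
	return blkdev_issue_zeroout(sb->s_bdev,
			block << (sb->s_blocksize_bits - 9),
			nr_blocks << (sb->s_blocksize_bits - 9),
			GFP_NOFS);
}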
2741
2742/*
 2743 * Flags used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
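/*
 * Example (illustrative): a DIO write into an uninitialized extent
 * wants both halves of any split to stay uninitialized, and allows
 * falling back to zeroout if the split hits ENOSPC:
 *
 *	split_flag = EXT4_EXT_MAY_ZEROOUT |
 *		     EXT4_EXT_MARK_UNINIT1 |
 *		     EXT4_EXT_MARK_UNINIT2;
 */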
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
 2756 * @split: the logical block where the extent is split.
 2757 * @split_flag: indicates if the extent can be zeroed out when the split fails,
 2758 * and the states (init or uninit) of the new extents.
 2759 * @flags: flags used to insert the new extent into the extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
 2763 * of which are determined by split_flag.
2764 *
2765 * There are two cases:
 2766 * a> the extent is split into two extents.
 2767 * b> split is not needed, and the extent is just marked.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extents_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2558 2787
2559 /* convert ee_pblock to 512 byte sectors */ 2788 ext4_ext_show_leaf(inode, path);
2560 ee_pblock = ee_pblock << (blkbits - 9); 2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2561 2795
2562 while (ee_len > 0) { 2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2563 2797
2564 if (ee_len > BIO_MAX_PAGES) 2798 err = ext4_ext_get_access(handle, inode, path + depth);
2565 len = BIO_MAX_PAGES; 2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2566 else 2810 else
2567 len = ee_len; 2811 ext4_ext_mark_initialized(ex);
2568 2812
2569 bio = bio_alloc(GFP_NOIO, len); 2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2570 if (!bio) 2814 ext4_ext_try_to_merge(inode, path, ex);
2571 return -ENOMEM;
2572 2815
2573 bio->bi_sector = ee_pblock; 2816 err = ext4_ext_dirty(handle, inode, path + depth);
2574 bio->bi_bdev = inode->i_sb->s_bdev; 2817 goto out;
2818 }
2575 2819
2576 done = 0; 2820 /* case a */
2577 offset = 0; 2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2578 while (done < len) { 2822 ex->ee_len = cpu_to_le16(split - ee_block);
2579 ret = bio_add_page(bio, ZERO_PAGE(0), 2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2580 blocksize, offset); 2824 ext4_ext_mark_uninitialized(ex);
2581 if (ret != blocksize) {
2582 /*
2583 * We can't add any more pages because of
2584 * hardware limitations. Start a new bio.
2585 */
2586 break;
2587 }
2588 done++;
2589 offset += blocksize;
2590 if (offset >= PAGE_CACHE_SIZE)
2591 offset = 0;
2592 }
2593 2825
2594 init_completion(&event); 2826 /*
2595 bio->bi_private = &event; 2827 * path may lead to new leaf, not to original leaf any more
2596 bio->bi_end_io = bi_complete; 2828 * after ext4_ext_insert_extent() returns,
2597 submit_bio(WRITE, bio); 2829 */
2598 wait_for_completion(&event); 2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2599 2833
2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2834 ex2 = &newex;
2601 bio_put(bio); 2835 ex2->ee_block = cpu_to_le32(split);
2602 return -EIO; 2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2603 } 2837 ext4_ext_store_pblock(ex2, newblock);
2604 bio_put(bio); 2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2605 ee_len -= done; 2839 ext4_ext_mark_uninitialized(ex2);
2606 ee_pblock += done << (blkbits - 9); 2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
 2847 ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
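/*
 * Usage sketch (hypothetical values): given an uninitialized extent
 * covering logical blocks [100, 199], the call
 *
 *	ext4_split_extent_at(handle, inode, path, 150,
 *			EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNINIT2, 0);
 *
 * yields an initialized extent [100, 149] and an uninitialized one
 * [150, 199]; if inserting the second extent hits ENOSPC, the whole
 * original extent is zeroed out and marked initialized instead.
 */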
2863
2864/*
 2865 * ext4_split_extent() splits an extent and marks the extent covered
 2866 * by @map as split_flag indicates
2867 *
 2868 * It may result in splitting the extent into multiple extents (up to three)
2869 * There are three possibilities:
2870 * a> There is no split required
2871 * b> Splits in two extents: Split is happening at either end of the extent
 2872 * c> Splits in three extents: Someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2607 } 2906 }
2608 return 0; 2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2609} 2929}
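/*
 * Worked example (hypothetical values): extent [100, 199] with
 * map->m_lblk = 120 and map->m_len = 30.  Since 150 < 200, the first
 * ext4_split_extent_at() splits at 150, giving [100, 149] and
 * [150, 199].  The path is re-looked-up at block 120, and since
 * 120 > 100 the second call splits at 120, leaving three extents:
 * [100, 119], [120, 149] and [150, 199].
 */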
2610 2930
2611#define EXT4_EXT_ZERO_LEN 7 2931#define EXT4_EXT_ZERO_LEN 7
2612/* 2932/*
2613 * This function is called by ext4_ext_map_blocks() if someone tries to write 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2934 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (upto three - one initialized and two 2935 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2936 * uninitialized).
2617 * There are three possibilities: 2937 * There are three possibilities:
2618 * a> There is no split required: Entire extent should be initialized 2938 * a> There is no split required: Entire extent should be initialized
@@ -2624,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2624 struct ext4_map_blocks *map, 2944 struct ext4_map_blocks *map,
2625 struct ext4_ext_path *path) 2945 struct ext4_ext_path *path)
2626{ 2946{
2627 struct ext4_extent *ex, newex, orig_ex; 2947 struct ext4_map_blocks split_map;
2628 struct ext4_extent *ex1 = NULL; 2948 struct ext4_extent zero_ex;
2629 struct ext4_extent *ex2 = NULL; 2949 struct ext4_extent *ex;
2630 struct ext4_extent *ex3 = NULL;
2631 struct ext4_extent_header *eh;
2632 ext4_lblk_t ee_block, eof_block; 2950 ext4_lblk_t ee_block, eof_block;
2633 unsigned int allocated, ee_len, depth; 2951 unsigned int allocated, ee_len, depth;
2634 ext4_fsblk_t newblock;
2635 int err = 0; 2952 int err = 0;
2636 int ret = 0; 2953 int split_flag = 0;
2637 int may_zeroout;
2638 2954
2639 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2640 "block %llu, max_blocks %u\n", inode->i_ino, 2956 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2646,279 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2646 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2647 2963
2648 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2649 eh = path[depth].p_hdr;
2650 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2651 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2652 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2653 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2654 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2655
2656 ex2 = ex;
2657 orig_ex.ee_block = ex->ee_block;
2658 orig_ex.ee_len = cpu_to_le16(ee_len);
2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2660 2969
2970 WARN_ON(map->m_lblk < ee_block);
2661 /* 2971 /*
2662 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
 2663 * zeroout only if extent is fully inside i_size or new_size. 2973 * zeroout only if extent is fully inside i_size or new_size.
2664 */ 2974 */
2665 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2666 2976
2667 err = ext4_ext_get_access(handle, inode, path + depth);
2668 if (err)
2669 goto out;
 2670 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */ 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */
2671 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2672 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2673 if (err) 2981 if (err)
2674 goto fix_extent_len;
2675 /* update the extent length and mark as initialized */
2676 ex->ee_block = orig_ex.ee_block;
2677 ex->ee_len = orig_ex.ee_len;
2678 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2679 ext4_ext_dirty(handle, inode, path + depth);
2680 /* zeroed the full extent */
2681 return allocated;
2682 }
2683
2684 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2685 if (map->m_lblk > ee_block) {
2686 ex1 = ex;
2687 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2688 ext4_ext_mark_uninitialized(ex1);
2689 ex2 = &newex;
2690 }
2691 /*
2692 * for sanity, update the length of the ex2 extent before
2693 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2694 * overlap of blocks.
2695 */
2696 if (!ex1 && allocated > map->m_len)
2697 ex2->ee_len = cpu_to_le16(map->m_len);
2698 /* ex3: to ee_block + ee_len : uninitialised */
2699 if (allocated > map->m_len) {
2700 unsigned int newdepth;
2701 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2702 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2703 /*
2704 * map->m_lblk == ee_block is handled by the zerouout
2705 * at the beginning.
2706 * Mark first half uninitialized.
2707 * Mark second half initialized and zero out the
2708 * initialized extent
2709 */
2710 ex->ee_block = orig_ex.ee_block;
2711 ex->ee_len = cpu_to_le16(ee_len - allocated);
2712 ext4_ext_mark_uninitialized(ex);
2713 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2714 ext4_ext_dirty(handle, inode, path + depth);
2715
2716 ex3 = &newex;
2717 ex3->ee_block = cpu_to_le32(map->m_lblk);
2718 ext4_ext_store_pblock(ex3, newblock);
2719 ex3->ee_len = cpu_to_le16(allocated);
2720 err = ext4_ext_insert_extent(handle, inode, path,
2721 ex3, 0);
2722 if (err == -ENOSPC) {
2723 err = ext4_ext_zeroout(inode, &orig_ex);
2724 if (err)
2725 goto fix_extent_len;
2726 ex->ee_block = orig_ex.ee_block;
2727 ex->ee_len = orig_ex.ee_len;
2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2729 ext4_ext_dirty(handle, inode, path + depth);
2730 /* blocks available from map->m_lblk */
2731 return allocated;
2732
2733 } else if (err)
2734 goto fix_extent_len;
2735
2736 /*
2737 * We need to zero out the second half because
2738 * an fallocate request can update file size and
2739 * converting the second half to initialized extent
2740 * implies that we can leak some junk data to user
2741 * space.
2742 */
2743 err = ext4_ext_zeroout(inode, ex3);
2744 if (err) {
2745 /*
2746 * We should actually mark the
2747 * second half as uninit and return error
2748 * Insert would have changed the extent
2749 */
2750 depth = ext_depth(inode);
2751 ext4_ext_drop_refs(path);
2752 path = ext4_ext_find_extent(inode, map->m_lblk,
2753 path);
2754 if (IS_ERR(path)) {
2755 err = PTR_ERR(path);
2756 return err;
2757 }
2758 /* get the second half extent details */
2759 ex = path[depth].p_ext;
2760 err = ext4_ext_get_access(handle, inode,
2761 path + depth);
2762 if (err)
2763 return err;
2764 ext4_ext_mark_uninitialized(ex);
2765 ext4_ext_dirty(handle, inode, path + depth);
2766 return err;
2767 }
2768
2769 /* zeroed the second half */
2770 return allocated;
2771 }
2772 ex3 = &newex;
2773 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2774 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2775 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2776 ext4_ext_mark_uninitialized(ex3);
2777 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2778 if (err == -ENOSPC && may_zeroout) {
2779 err = ext4_ext_zeroout(inode, &orig_ex);
2780 if (err)
2781 goto fix_extent_len;
2782 /* update the extent length and mark as initialized */
2783 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2786 ext4_ext_dirty(handle, inode, path + depth);
2787 /* zeroed the full extent */
2788 /* blocks available from map->m_lblk */
2789 return allocated;
2790
2791 } else if (err)
2792 goto fix_extent_len;
2793 /*
2794 * The depth, and hence eh & ex might change
2795 * as part of the insert above.
2796 */
2797 newdepth = ext_depth(inode);
2798 /*
2799 * update the extent length after successful insert of the
2800 * split extent
2801 */
2802 ee_len -= ext4_ext_get_actual_len(ex3);
2803 orig_ex.ee_len = cpu_to_le16(ee_len);
2804 may_zeroout = ee_block + ee_len <= eof_block;
2805
2806 depth = newdepth;
2807 ext4_ext_drop_refs(path);
2808 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2809 if (IS_ERR(path)) {
2810 err = PTR_ERR(path);
2811 goto out; 2982 goto out;
2812 }
2813 eh = path[depth].p_hdr;
2814 ex = path[depth].p_ext;
2815 if (ex2 != &newex)
2816 ex2 = ex;
2817 2983
2818 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2819 if (err) 2985 if (err)
2820 goto out; 2986 goto out;
2821 2987 ext4_ext_mark_initialized(ex);
2822 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2823 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2824 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2825 * to insert a extent in the middle zerout directly
2826 * otherwise give the extent a chance to merge to left
2827 */
2828 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2829 map->m_lblk != ee_block && may_zeroout) {
2830 err = ext4_ext_zeroout(inode, &orig_ex);
2831 if (err)
2832 goto fix_extent_len;
2833 /* update the extent length and mark as initialized */
2834 ex->ee_block = orig_ex.ee_block;
2835 ex->ee_len = orig_ex.ee_len;
2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2837 ext4_ext_dirty(handle, inode, path + depth);
2838 /* zero out the first half */
2839 /* blocks available from map->m_lblk */
2840 return allocated;
2841 }
2842 }
2843 /*
2844 * If there was a change of depth as part of the
2845 * insertion of ex3 above, we need to update the length
2846 * of the ex1 extent again here
2847 */
2848 if (ex1 && ex1 != ex) {
2849 ex1 = ex;
2850 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2851 ext4_ext_mark_uninitialized(ex1);
2852 ex2 = &newex;
2853 }
2854 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2855 ex2->ee_block = cpu_to_le32(map->m_lblk);
2856 ext4_ext_store_pblock(ex2, newblock);
2857 ex2->ee_len = cpu_to_le16(allocated);
2858 if (ex2 != ex)
2859 goto insert;
2860 /*
2861 * New (initialized) extent starts from the first block
2862 * in the current extent. i.e., ex2 == ex
2863 * We have to see if it can be merged with the extent
2864 * on the left.
2865 */
2866 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2867 /*
2868 * To merge left, pass "ex2 - 1" to try_to_merge(),
2869 * since it merges towards right _only_.
2870 */
2871 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2872 if (ret) {
2873 err = ext4_ext_correct_indexes(handle, inode, path);
2874 if (err)
2875 goto out;
2876 depth = ext_depth(inode);
2877 ex2--;
2878 }
2879 } 2991 }
2992
2880 /* 2993 /*
2881 * Try to Merge towards right. This might be required 2994 * four cases:
2882 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2883 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
 2998 * 4. split the extent into two extents without zeroout.
2884 */ 2999 */
2885 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2886 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2887 if (ret) { 3002
2888 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2889 if (err) 3013 if (err)
2890 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2891 } 3035 }
2892 } 3036 }
2893 /* Mark modified extent as dirty */ 3037
2894 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2895 goto out; 3039 &split_map, split_flag, 0);
2896insert: 3040 if (allocated < 0)
2897 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2898 if (err == -ENOSPC && may_zeroout) { 3042
2899 err = ext4_ext_zeroout(inode, &orig_ex);
2900 if (err)
2901 goto fix_extent_len;
2902 /* update the extent length and mark as initialized */
2903 ex->ee_block = orig_ex.ee_block;
2904 ex->ee_len = orig_ex.ee_len;
2905 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2906 ext4_ext_dirty(handle, inode, path + depth);
2907 /* zero out the first half */
2908 return allocated;
2909 } else if (err)
2910 goto fix_extent_len;
2911out: 3043out:
2912 ext4_ext_show_leaf(inode, path);
2913 return err ? err : allocated; 3044 return err ? err : allocated;
2914
2915fix_extent_len:
2916 ex->ee_block = orig_ex.ee_block;
2917 ex->ee_len = orig_ex.ee_len;
2918 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2919 ext4_ext_mark_uninitialized(ex);
2920 ext4_ext_dirty(handle, inode, path + depth);
2921 return err;
2922} 3045}
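/*
 * Worked example for case 2 above (hypothetical values):
 * uninitialized extent [100, 199], write with map->m_lblk = 102 and
 * map->m_len = 3.  Then allocated = 98 > 3, and 102 - 100 + 3 = 5 is
 * below EXT4_EXT_ZERO_LEN, so the head [100, 101] is zeroed out,
 * split_map grows to cover [100, 104], and the split leaves an
 * initialized extent [100, 104] plus an uninitialized [105, 199].
 */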
2923 3046
2924/* 3047/*
@@ -2926,15 +3049,15 @@ fix_extent_len:
2926 * ext4_get_blocks_dio_write() when DIO to write 3049 * ext4_get_blocks_dio_write() when DIO to write
2927 * to an uninitialized extent. 3050 * to an uninitialized extent.
2928 * 3051 *
2929 * Writing to an uninitized extent may result in splitting the uninitialized 3052 * Writing to an uninitialized extent may result in splitting the uninitialized
2930 * extent into multiple /intialized unintialized extents (up to three) 3053 * extent into multiple /initialized uninitialized extents (up to three)
2931 * There are three possibilities: 3054 * There are three possibilities:
2932 * a> There is no split required: Entire extent should be uninitialized 3055 * a> There is no split required: Entire extent should be uninitialized
2933 * b> Splits in two extents: Write is happening at either end of the extent 3056 * b> Splits in two extents: Write is happening at either end of the extent
 2934 * c> Splits in three extents: Someone is writing in the middle of the extent 3057 * c> Splits in three extents: Someone is writing in the middle of the extent
2935 * 3058 *
 2936 * One or more index blocks may be needed if the extent tree grows after 3059 * One or more index blocks may be needed if the extent tree grows after
 2937 * the unintialized extent split. To prevent ENOSPC from occurring when the 3060 * the uninitialized extent split. To prevent ENOSPC from occurring when the
 2938 * IO completes, we need to split the uninitialized extent before DIO submits 3061 * IO completes, we need to split the uninitialized extent before DIO submits
 2939 * the IO. The uninitialized extent handled at this time will be split 3062 * the IO. The uninitialized extent handled at this time will be split
 2940 * into (at most) three uninitialized extents. After the IO completes, the part 3063 * into (at most) three uninitialized extents. After the IO completes, the part
@@ -2949,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2949 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2950 int flags) 3073 int flags)
2951{ 3074{
2952 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2953 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2954 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2955 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2956 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2957 unsigned int allocated, ee_len, depth;
2958 ext4_fsblk_t newblock;
2959 int err = 0;
2960 int may_zeroout;
2961 3080
2962 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2963 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2967,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2967 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2968 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2969 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2970
2971 depth = ext_depth(inode);
2972 ex = path[depth].p_ext;
2973 ee_block = le32_to_cpu(ex->ee_block);
2974 ee_len = ext4_ext_get_actual_len(ex);
2975 allocated = ee_len - (map->m_lblk - ee_block);
2976 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2977
2978 ex2 = ex;
2979 orig_ex.ee_block = ex->ee_block;
2980 orig_ex.ee_len = cpu_to_le16(ee_len);
2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2982
2983 /* 3089 /*
2984 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
 2985 * zeroout only if extent is fully inside i_size or new_size. 3091 * zeroout only if extent is fully inside i_size or new_size.
2986 */ 3092 */
2987 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2988 3094 ex = path[depth].p_ext;
2989 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2990 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2991 * block where the write begins, and the write completely
2992 * covers the extent, then we don't need to split it.
2993 */
2994 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2995 return allocated;
2996
2997 err = ext4_ext_get_access(handle, inode, path + depth);
2998 if (err)
2999 goto out;
3000 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
3001 if (map->m_lblk > ee_block) {
3002 ex1 = ex;
3003 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3004 ext4_ext_mark_uninitialized(ex1);
3005 ex2 = &newex;
3006 }
3007 /*
3008 * for sanity, update the length of the ex2 extent before
3009 * we insert ex3, if ex1 is NULL. This is to avoid temporary
3010 * overlap of blocks.
3011 */
3012 if (!ex1 && allocated > map->m_len)
3013 ex2->ee_len = cpu_to_le16(map->m_len);
3014 /* ex3: to ee_block + ee_len : uninitialised */
3015 if (allocated > map->m_len) {
3016 unsigned int newdepth;
3017 ex3 = &newex;
3018 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
3019 ext4_ext_store_pblock(ex3, newblock + map->m_len);
3020 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
3021 ext4_ext_mark_uninitialized(ex3);
3022 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
3023 if (err == -ENOSPC && may_zeroout) {
3024 err = ext4_ext_zeroout(inode, &orig_ex);
3025 if (err)
3026 goto fix_extent_len;
3027 /* update the extent length and mark as initialized */
3028 ex->ee_block = orig_ex.ee_block;
3029 ex->ee_len = orig_ex.ee_len;
3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3031 ext4_ext_dirty(handle, inode, path + depth);
3032 /* zeroed the full extent */
3033 /* blocks available from map->m_lblk */
3034 return allocated;
3035
3036 } else if (err)
3037 goto fix_extent_len;
3038 /*
3039 * The depth, and hence eh & ex might change
3040 * as part of the insert above.
3041 */
3042 newdepth = ext_depth(inode);
3043 /*
3044 * update the extent length after successful insert of the
3045 * split extent
3046 */
3047 ee_len -= ext4_ext_get_actual_len(ex3);
3048 orig_ex.ee_len = cpu_to_le16(ee_len);
3049 may_zeroout = ee_block + ee_len <= eof_block;
3050
3051 depth = newdepth;
3052 ext4_ext_drop_refs(path);
3053 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3054 if (IS_ERR(path)) {
3055 err = PTR_ERR(path);
3056 goto out;
3057 }
3058 ex = path[depth].p_ext;
3059 if (ex2 != &newex)
3060 ex2 = ex;
3061
3062 err = ext4_ext_get_access(handle, inode, path + depth);
3063 if (err)
3064 goto out;
3065 3097
3066 allocated = map->m_len; 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3067 } 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
3068 /*
3069 * If there was a change of depth as part of the
3070 * insertion of ex3 above, we need to update the length
3071 * of the ex1 extent again here
3072 */
3073 if (ex1 && ex1 != ex) {
3074 ex1 = ex;
3075 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3076 ext4_ext_mark_uninitialized(ex1);
3077 ex2 = &newex;
3078 }
3079 /*
3080 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3081 * using direct I/O, uninitialised still.
3082 */
3083 ex2->ee_block = cpu_to_le32(map->m_lblk);
3084 ext4_ext_store_pblock(ex2, newblock);
3085 ex2->ee_len = cpu_to_le16(allocated);
3086 ext4_ext_mark_uninitialized(ex2);
3087 if (ex2 != ex)
3088 goto insert;
3089 /* Mark modified extent as dirty */
3090 err = ext4_ext_dirty(handle, inode, path + depth);
3091 ext_debug("out here\n");
3092 goto out;
3093insert:
3094 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3095 if (err == -ENOSPC && may_zeroout) {
3096 err = ext4_ext_zeroout(inode, &orig_ex);
3097 if (err)
3098 goto fix_extent_len;
3099 /* update the extent length and mark as initialized */
3100 ex->ee_block = orig_ex.ee_block;
3101 ex->ee_len = orig_ex.ee_len;
3102 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3103 ext4_ext_dirty(handle, inode, path + depth);
3104 /* zero out the first half */
3105 return allocated;
3106 } else if (err)
3107 goto fix_extent_len;
3108out:
3109 ext4_ext_show_leaf(inode, path);
3110 return err ? err : allocated;
3111 3100
3112fix_extent_len: 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
3113 ex->ee_block = orig_ex.ee_block; 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3114 ex->ee_len = orig_ex.ee_len;
3115 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3116 ext4_ext_mark_uninitialized(ex);
3117 ext4_ext_dirty(handle, inode, path + depth);
3118 return err;
3119} 3103}
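/*
 * Note (sketch): the manual three-way split removed by this hunk now
 * reduces to a single ext4_split_extent() call of the form
 *
 *	split_flag = EXT4_EXT_MARK_UNINIT2 |
 *		     (zeroout_safe ? EXT4_EXT_MAY_ZEROOUT : 0);
 *	ext4_split_extent(handle, inode, path, map, split_flag,
 *			  flags | EXT4_GET_BLOCKS_PRE_IO);
 *
 * where "zeroout_safe" is shorthand (not a real variable) for
 * ee_block + ee_len <= eof_block.
 */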
3104
3120static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3121 struct inode *inode, 3106 struct inode *inode,
3122 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3125,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3125 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3126 int depth; 3111 int depth;
3127 int err = 0; 3112 int err = 0;
3128 int ret = 0;
3129 3113
3130 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3131 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3132 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3133 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3134 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3135 if (err) 3124 if (err)
3136 goto out; 3125 goto out;
3137 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3138 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3139 3128
3140 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3141 * We have to see if it can be merged with the extent 3130 * borders are not changed
3142 * on the left.
3143 */
3144 if (ex > EXT_FIRST_EXTENT(eh)) {
3145 /*
3146 * To merge left, pass "ex - 1" to try_to_merge(),
3147 * since it merges towards right _only_.
3148 */
3149 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3150 if (ret) {
3151 err = ext4_ext_correct_indexes(handle, inode, path);
3152 if (err)
3153 goto out;
3154 depth = ext_depth(inode);
3155 ex--;
3156 }
3157 }
3158 /*
3159 * Try to Merge towards right.
3160 */ 3131 */
3161 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3162 if (ret) { 3133
3163 err = ext4_ext_correct_indexes(handle, inode, path);
3164 if (err)
3165 goto out;
3166 depth = ext_depth(inode);
3167 }
3168 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3169 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3170out: 3136out:
@@ -3180,6 +3146,56 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3180 unmap_underlying_metadata(bdev, block + i); 3146 unmap_underlying_metadata(bdev, block + i);
3181} 3147}
3182 3148
3149/*
3150 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3151 */
3152static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3153 ext4_lblk_t lblk,
3154 struct ext4_ext_path *path,
3155 unsigned int len)
3156{
3157 int i, depth;
3158 struct ext4_extent_header *eh;
3159 struct ext4_extent *last_ex;
3160
3161 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3162 return 0;
3163
3164 depth = ext_depth(inode);
3165 eh = path[depth].p_hdr;
3166
3167 if (unlikely(!eh->eh_entries)) {
3168 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3169 "EOFBLOCKS_FL set");
3170 return -EIO;
3171 }
3172 last_ex = EXT_LAST_EXTENT(eh);
3173 /*
3174 * We should clear the EOFBLOCKS_FL flag if we are writing the
3175 * last block in the last extent in the file. We test this by
3176 * first checking to see if the caller to
3177 * ext4_ext_get_blocks() was interested in the last block (or
3178 * a block beyond the last block) in the current extent. If
3179 * this turns out to be false, we can bail out from this
3180 * function immediately.
3181 */
3182 if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3183 ext4_ext_get_actual_len(last_ex))
3184 return 0;
3185 /*
3186 * If the caller does appear to be planning to write at or
3187 * beyond the end of the current extent, we then test to see
3188 * if the current extent is the last extent in the file, by
3189 * checking to make sure it was reached via the rightmost node
3190 * at each level of the tree.
3191 */
3192 for (i = depth-1; i >= 0; i--)
3193 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3194 return 0;
3195 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3196 return ext4_mark_inode_dirty(handle, inode);
3197}
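/*
 * Sketch of the rightmost-path test above, for a hypothetical tree of
 * depth 2: EOFBLOCKS_FL is cleared only when
 *
 *	path[0].p_idx == EXT_LAST_INDEX(path[0].p_hdr) &&
 *	path[1].p_idx == EXT_LAST_INDEX(path[1].p_hdr)
 *
 * i.e. the leaf holding last_ex was reached through the last index at
 * every interior level, so last_ex really is the file's last extent.
 */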
3198
3183static int 3199static int
3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3200ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3185 struct ext4_map_blocks *map, 3201 struct ext4_map_blocks *map,
@@ -3202,12 +3218,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3202 path, flags); 3218 path, flags);
3203 /* 3219 /*
3204 * Flag the inode(non aio case) or end_io struct (aio case) 3220 * Flag the inode(non aio case) or end_io struct (aio case)
 3205 * that this IO needs to convertion to written when IO is 3221 * that this IO needs conversion to written when IO is
3206 * completed 3222 * completed
3207 */ 3223 */
3208 if (io) 3224 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3209 io->flag = EXT4_IO_UNWRITTEN; 3225 io->flag = EXT4_IO_END_UNWRITTEN;
3210 else 3226 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3227 } else
3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3228 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3212 if (ext4_should_dioread_nolock(inode)) 3229 if (ext4_should_dioread_nolock(inode))
3213 map->m_flags |= EXT4_MAP_UNINIT; 3230 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3217,8 +3234,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3217 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3234 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3218 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3235 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3219 path); 3236 path);
3220 if (ret >= 0) 3237 if (ret >= 0) {
3221 ext4_update_inode_fsync_trans(handle, inode, 1); 3238 ext4_update_inode_fsync_trans(handle, inode, 1);
3239 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3240 path, map->m_len);
3241 } else
3242 err = ret;
3222 goto out2; 3243 goto out2;
3223 } 3244 }
3224 /* buffered IO case */ 3245 /* buffered IO case */
@@ -3244,8 +3265,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3244 3265
3245 /* buffered write, writepage time, convert*/ 3266 /* buffered write, writepage time, convert*/
3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3267 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3247 if (ret >= 0) 3268 if (ret >= 0) {
3248 ext4_update_inode_fsync_trans(handle, inode, 1); 3269 ext4_update_inode_fsync_trans(handle, inode, 1);
3270 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3271 map->m_len);
3272 if (err < 0)
3273 goto out2;
3274 }
3275
3249out: 3276out:
3250 if (ret <= 0) { 3277 if (ret <= 0) {
3251 err = ret; 3278 err = ret;
@@ -3292,6 +3319,7 @@ out2:
3292 } 3319 }
3293 return err ? err : allocated; 3320 return err ? err : allocated;
3294} 3321}
3322
3295/* 3323/*
3296 * Block allocation/map/preallocation routine for extents based files 3324 * Block allocation/map/preallocation routine for extents based files
3297 * 3325 *
@@ -3314,21 +3342,24 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3314 struct ext4_map_blocks *map, int flags) 3342 struct ext4_map_blocks *map, int flags)
3315{ 3343{
3316 struct ext4_ext_path *path = NULL; 3344 struct ext4_ext_path *path = NULL;
3317 struct ext4_extent_header *eh; 3345 struct ext4_extent newex, *ex;
3318 struct ext4_extent newex, *ex, *last_ex; 3346 ext4_fsblk_t newblock = 0;
3319 ext4_fsblk_t newblock; 3347 int err = 0, depth, ret;
3320 int i, err = 0, depth, ret, cache_type;
3321 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3322 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3324 3354
3325 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3326 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3327 3358
3328 /* check in cache */ 3359 /* check in cache */
3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3330 if (cache_type) { 3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3331 if (cache_type == EXT4_EXT_CACHE_GAP) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3364 /*
3334 * block isn't allocated yet and 3365 * block isn't allocated yet and
@@ -3337,17 +3368,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3337 goto out2; 3368 goto out2;
3338 } 3369 }
3339 /* we should allocate requested block */ 3370 /* we should allocate requested block */
3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3371 } else {
3341 /* block is already allocated */ 3372 /* block is already allocated */
3342 newblock = map->m_lblk 3373 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3374 - le32_to_cpu(newex.ee_block)
3344 + ext_pblock(&newex); 3375 + ext4_ext_pblock(&newex);
3345 /* number of remaining blocks in the extent */ 3376 /* number of remaining blocks in the extent */
3346 allocated = ext4_ext_get_actual_len(&newex) - 3377 allocated = ext4_ext_get_actual_len(&newex) -
3347 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3378 (map->m_lblk - le32_to_cpu(newex.ee_block));
3348 goto out; 3379 goto out;
3349 } else {
3350 BUG();
3351 } 3380 }
3352 } 3381 }
3353 3382
@@ -3374,12 +3403,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3374 err = -EIO; 3403 err = -EIO;
3375 goto out2; 3404 goto out2;
3376 } 3405 }
3377 eh = path[depth].p_hdr;
3378 3406
3379 ex = path[depth].p_ext; 3407 ex = path[depth].p_ext;
3380 if (ex) { 3408 if (ex) {
3381 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3409 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3382 ext4_fsblk_t ee_start = ext_pblock(ex); 3410 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3383 unsigned short ee_len; 3411 unsigned short ee_len;
3384 3412
3385 /* 3413 /*
@@ -3395,17 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3395 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3396 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3397 3425
3398 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3399 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3400 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3401 ee_len, ee_start, 3429 * in the cache
3402 EXT4_EXT_CACHE_EXTENT); 3430 */
3403 goto out; 3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3404 } 3440 }
3405 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3406 inode, map, path, flags, allocated, 3442 /*
3407 newblock); 3443 * Punch out the map length, but only to the
3408 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
 3450 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3409 } 3504 }
3410 } 3505 }
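/*
 * Caller sketch (hypothetical helper, not part of this patch; it
 * assumes ext4_ext_map_blocks() returns the number of blocks it
 * punched out): a punch-hole loop must iterate, because each call
 * only punches to the end of one extent and never more than
 * EXT_UNINIT_MAX_LEN blocks.
 */
static int sketch_punch_range(handle_t *handle, struct inode *inode,
			      ext4_lblk_t lblk, ext4_lblk_t len)
{
	struct ext4_map_blocks map;
	int ret;

	while (len > 0) {
		map.m_lblk = lblk;
		map.m_len = len;
		ret = ext4_ext_map_blocks(handle, inode, &map,
					  EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
		if (ret < 0)
			return ret;
		if (ret == 0)
			ret = 1;	/* already a hole; skip one block */
		lblk += ret;
		len -= ret;
	}
	return 0;
}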
3411 3506
@@ -3467,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3467 else 3562 else
3468 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3469 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3470 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3471 if (!newblock) 3568 if (!newblock)
3472 goto out2; 3569 goto out2;
@@ -3481,15 +3578,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3481 ext4_ext_mark_uninitialized(&newex); 3578 ext4_ext_mark_uninitialized(&newex);
3482 /* 3579 /*
3483 * io_end structure was created for every IO write to an 3580 * io_end structure was created for every IO write to an
3484 * uninitialized extent. To avoid unecessary conversion, 3581 * uninitialized extent. To avoid unnecessary conversion,
3485 * here we flag the IO that really needs the conversion. 3582 * here we flag the IO that really needs the conversion.
3486 * For the non-async direct IO case, flag the inode state 3583 * For the non-async direct IO case, flag the inode state
3487 * that we need to perform convertion when IO is done. 3584 * that we need to perform conversion when IO is done.
3488 */ 3585 */
3489 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3586 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3490 if (io) 3587 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3491 io->flag = EXT4_IO_UNWRITTEN; 3588 io->flag = EXT4_IO_END_UNWRITTEN;
3492 else 3589 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3590 } else
3493 ext4_set_inode_state(inode, 3591 ext4_set_inode_state(inode,
3494 EXT4_STATE_DIO_UNWRITTEN); 3592 EXT4_STATE_DIO_UNWRITTEN);
3495 } 3593 }
@@ -3497,44 +3595,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3497 map->m_flags |= EXT4_MAP_UNINIT; 3595 map->m_flags |= EXT4_MAP_UNINIT;
3498 } 3596 }
3499 3597
3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3598 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3501 if (unlikely(!eh->eh_entries)) { 3599 if (err)
3502 EXT4_ERROR_INODE(inode, 3600 goto out2;
3503 "eh->eh_entries == 0 and " 3601
3504 "EOFBLOCKS_FL set");
3505 err = -EIO;
3506 goto out2;
3507 }
3508 last_ex = EXT_LAST_EXTENT(eh);
3509 /*
3510 * If the current leaf block was reached by looking at
3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3524 }
3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3602 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3526 if (err) { 3603 if (err) {
3527 /* free data blocks we just allocated */ 3604 /* free data blocks we just allocated */
3528 /* not a good idea to call discard here directly, 3605 /* not a good idea to call discard here directly,
3529 * but otherwise we'd need to call it every free() */ 3606 * but otherwise we'd need to call it every free() */
3530 ext4_discard_preallocations(inode); 3607 ext4_discard_preallocations(inode);
3531 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3608 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3532 ext4_ext_get_actual_len(&newex), 0); 3609 ext4_ext_get_actual_len(&newex), 0);
3533 goto out2; 3610 goto out2;
3534 } 3611 }
3535 3612
3536 /* previous routine could use block we allocated */ 3613 /* previous routine could use block we allocated */
3537 newblock = ext_pblock(&newex); 3614 newblock = ext4_ext_pblock(&newex);
3538 allocated = ext4_ext_get_actual_len(&newex); 3615 allocated = ext4_ext_get_actual_len(&newex);
3539 if (allocated > map->m_len) 3616 if (allocated > map->m_len)
3540 allocated = map->m_len; 3617 allocated = map->m_len;
@@ -3552,8 +3629,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3552 * when it is _not_ an uninitialized extent. 3629 * when it is _not_ an uninitialized extent.
3553 */ 3630 */
3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3631 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3632 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3556 EXT4_EXT_CACHE_EXTENT);
3557 ext4_update_inode_fsync_trans(handle, inode, 1); 3633 ext4_update_inode_fsync_trans(handle, inode, 1);
3558 } else 3634 } else
3559 ext4_update_inode_fsync_trans(handle, inode, 0); 3635 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3569,7 +3645,13 @@ out2:
3569 ext4_ext_drop_refs(path); 3645 ext4_ext_drop_refs(path);
3570 kfree(path); 3646 kfree(path);
3571 } 3647 }
3572 return err ? err : allocated; 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3649 newblock, map->m_len, err ? err : allocated);
3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3573} 3655}
3574 3656
3575void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3581,6 +3663,12 @@ void ext4_ext_truncate(struct inode *inode)
3581 int err = 0; 3663 int err = 0;
3582 3664
3583 /* 3665 /*
3666 * finish any pending end_io work so we won't run the risk of
3667 * converting any truncated blocks to initialized later
3668 */
3669 ext4_flush_completed_IO(inode);
3670
3671 /*
3584 * probably first extent we're gonna free will be last in block 3672 * probably first extent we're gonna free will be last in block
3585 */ 3673 */
3586 err = ext4_writepage_trans_blocks(inode); 3674 err = ext4_writepage_trans_blocks(inode);
@@ -3611,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3611 3699
3612 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3613 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3614 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
3615 3703
3616 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3617 * transaction synchronous. 3705 * transaction synchronous.
@@ -3619,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3619 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3620 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3621 3709
3622out_stop:
3623 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3624 /* 3713 /*
3625 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3626 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3667,14 +3756,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
3667} 3756}
3668 3757
3669/* 3758/*
3670 * preallocate space for a file. This implements ext4's fallocate inode 3759 * preallocate space for a file. This implements ext4's fallocate file
3671 * operation, which gets called from the sys_fallocate system call. 3760 * operation, which gets called from the sys_fallocate system call.
3672 * For block-mapped files, posix_fallocate should fall back to the method 3761 * For block-mapped files, posix_fallocate should fall back to the method
3673 * of writing zeroes to the required new blocks (the same behavior which is 3762 * of writing zeroes to the required new blocks (the same behavior which is
3674 * expected for file systems which do not support fallocate() system call). 3763 * expected for file systems which do not support fallocate() system call).
3675 */ 3764 */
3676long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3765long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3677{ 3766{
3767 struct inode *inode = file->f_path.dentry->d_inode;
3678 handle_t *handle; 3768 handle_t *handle;
3679 loff_t new_size; 3769 loff_t new_size;
3680 unsigned int max_blocks; 3770 unsigned int max_blocks;
@@ -3691,10 +3781,14 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3692 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3693 3783
3694 /* preallocation to directories is currently not supported */ 3784 /* Return error if mode is not supported */
3695 if (S_ISDIR(inode->i_mode)) 3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3696 return -ENODEV; 3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3697 3790
3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3698 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3699 /* 3793 /*
3700 * We can't just convert len to max_blocks because 3794 * We can't just convert len to max_blocks because
@@ -3710,6 +3804,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3710 ret = inode_newsize_ok(inode, (len + offset)); 3804 ret = inode_newsize_ok(inode, (len + offset));
3711 if (ret) { 3805 if (ret) {
3712 mutex_unlock(&inode->i_mutex); 3806 mutex_unlock(&inode->i_mutex);
3807 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3713 return ret; 3808 return ret;
3714 } 3809 }
3715retry: 3810retry:
@@ -3722,14 +3817,15 @@ retry:
3722 break; 3817 break;
3723 } 3818 }
3724 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3725 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3726 if (ret <= 0) { 3822 if (ret <= 0) {
3727#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3728 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
3729 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3825 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3730 "returned error inode#%lu, block=%u, " 3826 "returned error inode#%lu, block=%u, "
3731 "max_blocks=%u", __func__, 3827 "max_blocks=%u", __func__,
3732 inode->i_ino, block, max_blocks); 3828 inode->i_ino, map.m_lblk, max_blocks);
3733#endif 3829#endif
3734 ext4_mark_inode_dirty(handle, inode); 3830 ext4_mark_inode_dirty(handle, inode);
3735 ret2 = ext4_journal_stop(handle); 3831 ret2 = ext4_journal_stop(handle);
@@ -3754,6 +3850,8 @@ retry:
3754 goto retry; 3850 goto retry;
3755 } 3851 }
3756 mutex_unlock(&inode->i_mutex); 3852 mutex_unlock(&inode->i_mutex);
3853 trace_ext4_fallocate_exit(inode, offset, max_blocks,
3854 ret > 0 ? ret2 : ret);
3757 return ret > 0 ? ret2 : ret; 3855 return ret > 0 ? ret2 : ret;
3758} 3856}
3759 3857
@@ -3812,45 +3910,190 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3812 } 3910 }
3813 return ret > 0 ? ret2 : ret; 3911 return ret > 0 ? ret2 : ret;
3814} 3912}
3913
3815/* 3914/*
3816 * Callback function called for each extent to gather FIEMAP information. 3915 * Callback function called for each extent to gather FIEMAP information.
3817 */ 3916 */
3818static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3917static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
3819 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3918 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3820 void *data) 3919 void *data)
3821{ 3920{
3822 struct fiemap_extent_info *fieinfo = data;
3823 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3824 __u64 logical; 3921 __u64 logical;
3825 __u64 physical; 3922 __u64 physical;
3826 __u64 length; 3923 __u64 length;
3827 __u32 flags = 0; 3924 __u32 flags = 0;
3828 int error; 3925 int ret = 0;
3926 struct fiemap_extent_info *fieinfo = data;
3927 unsigned char blksize_bits;
3829 3928
3830 logical = (__u64)newex->ec_block << blksize_bits; 3929 blksize_bits = inode->i_sb->s_blocksize_bits;
3930 logical = (__u64)newex->ec_block << blksize_bits;
3831 3931
3832 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3932 if (newex->ec_start == 0) {
3833 pgoff_t offset; 3933 /*
3834 struct page *page; 3934 * If no extent in the extent tree contains block @newex->ec_start,
3935 * the block may lie in 1) a hole or 2) a delayed extent.
3936 *
3937 * Holes or delayed extents are processed as follows.
3938 * 1. Look up dirty pages within the specified range in the pagecache.
3939 * If no page is found, there is no delayed extent;
3940 * return EXT_CONTINUE.
3941 * 2. Find the 1st mapped buffer.
3942 * 3. Check whether the mapped buffer is both in the request range
3943 * and a delayed buffer. If not, there is no delayed extent;
3944 * then return.
3945 * 4. A delayed extent was found; the extent will be collected.
3946 */
3947 ext4_lblk_t end = 0;
3948 pgoff_t last_offset;
3949 pgoff_t offset;
3950 pgoff_t index;
3951 pgoff_t start_index = 0;
3952 struct page **pages = NULL;
3835 struct buffer_head *bh = NULL; 3953 struct buffer_head *bh = NULL;
3954 struct buffer_head *head = NULL;
3955 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
3956
3957 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
3958 if (pages == NULL)
3959 return -ENOMEM;
3836 3960
3837 offset = logical >> PAGE_SHIFT; 3961 offset = logical >> PAGE_SHIFT;
3838 page = find_get_page(inode->i_mapping, offset); 3962repeat:
3839 if (!page || !page_has_buffers(page)) 3963 last_offset = offset;
3840 return EXT_CONTINUE; 3964 head = NULL;
3965 ret = find_get_pages_tag(inode->i_mapping, &offset,
3966 PAGECACHE_TAG_DIRTY, nr_pages, pages);
3967
3968 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3969 /* First time, try to find a mapped buffer. */
3970 if (ret == 0) {
3971out:
3972 for (index = 0; index < ret; index++)
3973 page_cache_release(pages[index]);
3974 /* just a hole. */
3975 kfree(pages);
3976 return EXT_CONTINUE;
3977 }
3978 index = 0;
3841 3979
3842 bh = page_buffers(page); 3980next_page:
3981 /* Try to find the 1st mapped buffer. */
3982 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3983 blksize_bits;
3984 if (!page_has_buffers(pages[index]))
3985 goto out;
3986 head = page_buffers(pages[index]);
3987 if (!head)
3988 goto out;
3843 3989
3844 if (!bh) 3990 index++;
3845 return EXT_CONTINUE; 3991 bh = head;
3992 do {
3993 if (end >= newex->ec_block +
3994 newex->ec_len)
3995 /* The buffer is out of
3996 * the request range.
3997 */
3998 goto out;
3999
4000 if (buffer_mapped(bh) &&
4001 end >= newex->ec_block) {
4002 start_index = index - 1;
4003 /* get the 1st mapped buffer. */
4004 goto found_mapped_buffer;
4005 }
4006
4007 bh = bh->b_this_page;
4008 end++;
4009 } while (bh != head);
3846 4010
3847 if (buffer_delay(bh)) { 4011 /* No mapped buffer in the range was found in this page;
3848 flags |= FIEMAP_EXTENT_DELALLOC; 4012 * we need to look up the next page.
3849 page_cache_release(page); 4013 */
4014 if (index >= ret) {
4015 /* There is no page left, but we need to limit
4016 * newex->ec_len.
4017 */
4018 newex->ec_len = end - newex->ec_block;
4019 goto out;
4020 }
4021 goto next_page;
3850 } else { 4022 } else {
3851 page_cache_release(page); 4023 /* Find contiguous delayed buffers. */
3852 return EXT_CONTINUE; 4024 if (ret > 0 && pages[0]->index == last_offset)
4025 head = page_buffers(pages[0]);
4026 bh = head;
4027 index = 1;
4028 start_index = 0;
4029 }
4030
4031found_mapped_buffer:
4032 if (bh != NULL && buffer_delay(bh)) {
4033 /* 1st or contiguous delayed buffer found. */
4034 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4035 /*
4036 * 1st delayed buffer found, record
4037 * the start of extent.
4038 */
4039 flags |= FIEMAP_EXTENT_DELALLOC;
4040 newex->ec_block = end;
4041 logical = (__u64)end << blksize_bits;
4042 }
4043 /* Find contiguous delayed buffers. */
4044 do {
4045 if (!buffer_delay(bh))
4046 goto found_delayed_extent;
4047 bh = bh->b_this_page;
4048 end++;
4049 } while (bh != head);
4050
4051 for (; index < ret; index++) {
4052 if (!page_has_buffers(pages[index])) {
4053 bh = NULL;
4054 break;
4055 }
4056 head = page_buffers(pages[index]);
4057 if (!head) {
4058 bh = NULL;
4059 break;
4060 }
4061
4062 if (pages[index]->index !=
4063 pages[start_index]->index + index
4064 - start_index) {
4065 /* Blocks are not contiguous. */
4066 bh = NULL;
4067 break;
4068 }
4069 bh = head;
4070 do {
4071 if (!buffer_delay(bh))
4072 /* Delayed-extent ends. */
4073 goto found_delayed_extent;
4074 bh = bh->b_this_page;
4075 end++;
4076 } while (bh != head);
4077 }
4078 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4079 /* a hole found. */
4080 goto out;
4081
4082found_delayed_extent:
4083 newex->ec_len = min(end - newex->ec_block,
4084 (ext4_lblk_t)EXT_INIT_MAX_LEN);
4085 if (ret == nr_pages && bh != NULL &&
4086 newex->ec_len < EXT_INIT_MAX_LEN &&
4087 buffer_delay(bh)) {
4088 /* Have not collected an extent and continue. */
4089 for (index = 0; index < ret; index++)
4090 page_cache_release(pages[index]);
4091 goto repeat;
3853 } 4092 }
4093
4094 for (index = 0; index < ret; index++)
4095 page_cache_release(pages[index]);
4096 kfree(pages);
3854 } 4097 }
3855 4098
3856 physical = (__u64)newex->ec_start << blksize_bits; 4099 physical = (__u64)newex->ec_start << blksize_bits;
@@ -3859,32 +4102,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3859 if (ex && ext4_ext_is_uninitialized(ex)) 4102 if (ex && ext4_ext_is_uninitialized(ex))
3860 flags |= FIEMAP_EXTENT_UNWRITTEN; 4103 flags |= FIEMAP_EXTENT_UNWRITTEN;
3861 4104
3862 /* 4105 if (next == EXT_MAX_BLOCKS)
3863 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3864 *
3865 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3866 * this also indicates no more allocated blocks.
3867 *
3868 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3869 */
3870 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3871 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3872 loff_t size = i_size_read(inode);
3873 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3874
3875 flags |= FIEMAP_EXTENT_LAST; 4106 flags |= FIEMAP_EXTENT_LAST;
3876 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3877 logical+length > size)
3878 length = (size - logical + bs - 1) & ~(bs-1);
3879 }
3880 4107
3881 error = fiemap_fill_next_extent(fieinfo, logical, physical, 4108 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
3882 length, flags); 4109 length, flags);
3883 if (error < 0) 4110 if (ret < 0)
3884 return error; 4111 return ret;
3885 if (error == 1) 4112 if (ret == 1)
3886 return EXT_BREAK; 4113 return EXT_BREAK;
3887
3888 return EXT_CONTINUE; 4114 return EXT_CONTINUE;
3889} 4115}
3890 4116
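The delalloc branch above boils down to: find the first mapped buffer in range and, if it is delayed, extend over the contiguous run of delayed buffers. A self-contained toy model of that scan, with an int array standing in for buffer_head state (no kernel API; a simplification of the real page-by-page walk):

#include <stdio.h>

/* Toy model, not kernel code: MAPPED ~ buffer_mapped(), DELAY ~
 * buffer_delay(); mirrors steps 2-4 of the comment above. */
#define B_MAPPED 1
#define B_DELAY  2

static int find_delayed_extent(const int *blk, int nblocks, int *lblk, int *len)
{
	int i = 0;

	while (i < nblocks && !(blk[i] & B_MAPPED))	/* step 2: 1st mapped */
		i++;
	if (i == nblocks || !(blk[i] & B_DELAY))	/* step 3: just a hole */
		return 0;
	*lblk = i;					/* step 4: collect it */
	for (*len = 0; i + *len < nblocks && (blk[i + *len] & B_DELAY); (*len)++)
		;
	return 1;
}

int main(void)
{
	int blocks[] = { 0, B_MAPPED | B_DELAY, B_MAPPED | B_DELAY, B_MAPPED, 0 };
	int lblk, len;

	if (find_delayed_extent(blocks, 5, &lblk, &len))
		printf("delayed extent at block %d, len %d\n", lblk, len); /* 1, 2 */
	return 0;
}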
@@ -3926,6 +4152,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
3926 return (error < 0 ? error : 0); 4152 return (error < 0 ? error : 0);
3927} 4153}
3928 4154
4155/*
4156 * ext4_ext_punch_hole
4157 *
4158 * Punches a hole of "length" bytes in a file starting
4159 * at byte "offset"
4160 *
4161 * @inode: The inode of the file to punch a hole in
4162 * @offset: The starting byte offset of the hole
4163 * @length: The length of the hole
4164 *
4165 * Returns 0 on success, or a negative error code on failure
4166 */
4167int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4168{
4169 struct inode *inode = file->f_path.dentry->d_inode;
4170 struct super_block *sb = inode->i_sb;
4171 struct ext4_ext_cache cache_ex;
4172 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4173 struct address_space *mapping = inode->i_mapping;
4174 struct ext4_map_blocks map;
4175 handle_t *handle;
4176 loff_t first_block_offset, last_block_offset, block_len;
4177 loff_t first_page, last_page, first_page_offset, last_page_offset;
4178 int ret, credits, blocks_released, err = 0;
4179
4180 first_block = (offset + sb->s_blocksize - 1) >>
4181 EXT4_BLOCK_SIZE_BITS(sb);
4182 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4183
4184 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4185 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4186
4187 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4188 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4189
4190 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4191 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4192
4193 /*
4194 * Write out all dirty pages to avoid race conditions
4195 * Then release them.
4196 */
4197 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4198 err = filemap_write_and_wait_range(mapping,
4199 first_page_offset == 0 ? 0 : first_page_offset-1,
4200 last_page_offset);
4201
4202 if (err)
4203 return err;
4204 }
4205
4206 /* Now release the pages */
4207 if (last_page_offset > first_page_offset) {
4208 truncate_inode_pages_range(mapping, first_page_offset,
4209 last_page_offset-1);
4210 }
4211
4212 /* finish any pending end_io work */
4213 ext4_flush_completed_IO(inode);
4214
4215 credits = ext4_writepage_trans_blocks(inode);
4216 handle = ext4_journal_start(inode, credits);
4217 if (IS_ERR(handle))
4218 return PTR_ERR(handle);
4219
4220 err = ext4_orphan_add(handle, inode);
4221 if (err)
4222 goto out;
4223
4224 /*
4225 * Now we need to zero out the non-block-aligned data.
4226 * If the hole is contained within a single block, just
4227 * zero out the middle of that block.
4228 */
4229 if (first_block > last_block)
4230 ext4_block_zero_page_range(handle, mapping, offset, length);
4231 else {
4232 /* zero out the head of the hole before the first block */
4233 block_len = first_block_offset - offset;
4234 if (block_len > 0)
4235 ext4_block_zero_page_range(handle, mapping,
4236 offset, block_len);
4237
4238 /* zero out the tail of the hole after the last block */
4239 block_len = offset + length - last_block_offset;
4240 if (block_len > 0) {
4241 ext4_block_zero_page_range(handle, mapping,
4242 last_block_offset, block_len);
4243 }
4244 }
4245
4246 /* If there are no blocks to remove, return now */
4247 if (first_block >= last_block)
4248 goto out;
4249
4250 down_write(&EXT4_I(inode)->i_data_sem);
4251 ext4_ext_invalidate_cache(inode);
4252 ext4_discard_preallocations(inode);
4253
4254 /*
4255 * Loop over all the blocks and identify blocks
4256 * that need to be punched out
4257 */
4258 iblock = first_block;
4259 blocks_released = 0;
4260 while (iblock < last_block) {
4261 max_blocks = last_block - iblock;
4262 num_blocks = 1;
4263 memset(&map, 0, sizeof(map));
4264 map.m_lblk = iblock;
4265 map.m_len = max_blocks;
4266 ret = ext4_ext_map_blocks(handle, inode, &map,
4267 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4268
4269 if (ret > 0) {
4270 blocks_released += ret;
4271 num_blocks = ret;
4272 } else if (ret == 0) {
4273 /*
4274 * If map blocks could not find the block,
4275 * then it is in a hole. If the hole was
4276 * not already cached, then map blocks should
4277 * put it in the cache, so we can read the hole
4278 * back out of the cache.
4279 */
4280 memset(&cache_ex, 0, sizeof(cache_ex));
4281 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4282 !cache_ex.ec_start) {
4283
4284 /* The hole is cached */
4285 num_blocks = cache_ex.ec_block +
4286 cache_ex.ec_len - iblock;
4287
4288 } else {
4289 /* The block could not be identified */
4290 err = -EIO;
4291 break;
4292 }
4293 } else {
4294 /* Map blocks error */
4295 err = ret;
4296 break;
4297 }
4298
4299 if (num_blocks == 0) {
4300 /* This condition should never happen */
4301 ext_debug("Block lookup failed");
4302 err = -EIO;
4303 break;
4304 }
4305
4306 iblock += num_blocks;
4307 }
4308
4309 if (blocks_released > 0) {
4310 ext4_ext_invalidate_cache(inode);
4311 ext4_discard_preallocations(inode);
4312 }
4313
4314 if (IS_SYNC(inode))
4315 ext4_handle_sync(handle);
4316
4317 up_write(&EXT4_I(inode)->i_data_sem);
4318
4319out:
4320 ext4_orphan_del(handle, inode);
4321 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4322 ext4_mark_inode_dirty(handle, inode);
4323 ext4_journal_stop(handle);
4324 return err;
4325}
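The first_block/last_block rounding in ext4_ext_punch_hole() means only whole blocks inside the hole are removed; partial edges are zeroed in place. A worked sketch of the arithmetic, assuming 4 KiB blocks and arbitrary example values:

#include <stdio.h>

int main(void)
{
	/* Illustrative only: 4 KiB blocks, punch offset=1000, length=8192. */
	unsigned long long blocksize = 4096, offset = 1000, length = 8192;
	unsigned int bits = 12;	/* log2(blocksize) */

	/* first block fully inside the hole: round the start up */
	unsigned long long first_block = (offset + blocksize - 1) >> bits;
	/* first block past the hole: round the end down */
	unsigned long long last_block = (offset + length) >> bits;

	printf("zero head [%llu, %llu), remove blocks [%llu, %llu), zero tail [%llu, %llu)\n",
	       offset, first_block << bits,	/* bytes 1000..4096 */
	       first_block, last_block,		/* block 1 only */
	       last_block << bits, offset + length); /* bytes 8192..9192 */
	return 0;
}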
3929int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4326int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3930 __u64 start, __u64 len) 4327 __u64 start, __u64 len)
3931{ 4328{
@@ -3948,8 +4345,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3948 4345
3949 start_blk = start >> inode->i_sb->s_blocksize_bits; 4346 start_blk = start >> inode->i_sb->s_blocksize_bits;
3950 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 4347 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3951 if (last_blk >= EXT_MAX_BLOCK) 4348 if (last_blk >= EXT_MAX_BLOCKS)
3952 last_blk = EXT_MAX_BLOCK-1; 4349 last_blk = EXT_MAX_BLOCKS-1;
3953 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4350 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3954 4351
3955 /* 4352 /*
@@ -3962,4 +4359,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3962 4359
3963 return error; 4360 return error;
3964} 4361}
3965
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
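The test above flags i/o only when an endpoint falls inside a filesystem block and the write targets already-allocated file range. A standalone restatement of the mask arithmetic (4 KiB block size assumed):

#include <stdio.h>

/* Standalone restatement of the test above: nonzero when either end
 * of [pos, pos + count) falls inside a filesystem block. */
static int is_unaligned(long long pos, long long count, long blocksize)
{
	long blockmask = blocksize - 1;

	return ((pos & blockmask) || ((pos + count) & blockmask)) ? 1 : 0;
}

int main(void)
{
	printf("%d\n", is_unaligned(4096, 8192, 4096)); /* 0: both ends aligned */
	printf("%d\n", is_unaligned(512, 4096, 4096));  /* 1: starts mid-block */
	printf("%d\n", is_unaligned(0, 6000, 4096));    /* 1: ends mid-block */
	return 0;
}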
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
120 }
121
122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
81 } 134 }
82 135
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
@@ -104,6 +162,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
104{ 162{
105 struct super_block *sb = inode->i_sb; 163 struct super_block *sb = inode->i_sb;
106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 164 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
165 struct ext4_inode_info *ei = EXT4_I(inode);
107 struct vfsmount *mnt = filp->f_path.mnt; 166 struct vfsmount *mnt = filp->f_path.mnt;
108 struct path path; 167 struct path path;
109 char buf[64], *cp; 168 char buf[64], *cp;
@@ -127,11 +186,74 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
127 ext4_mark_super_dirty(sb); 186 ext4_mark_super_dirty(sb);
128 } 187 }
129 } 188 }
189 /*
190 * Set up the jbd2_inode if we are opening the inode for
191 * writing and the journal is present
192 */
193 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
194 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
195
196 spin_lock(&inode->i_lock);
197 if (!ei->jinode) {
198 if (!jinode) {
199 spin_unlock(&inode->i_lock);
200 return -ENOMEM;
201 }
202 ei->jinode = jinode;
203 jbd2_journal_init_jbd_inode(ei->jinode, inode);
204 jinode = NULL;
205 }
206 spin_unlock(&inode->i_lock);
207 if (unlikely(jinode != NULL))
208 jbd2_free_inode(jinode);
209 }
130 return dquot_file_open(inode, filp); 210 return dquot_file_open(inode, filp);
131} 211}
132 212
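The jinode setup above is the usual allocate-then-install idiom: allocate outside the spinlock (allocation may sleep), install under the lock only if the field is still unset, and free the losing copy. A generic userspace sketch of the same pattern; a pthread mutex stands in for inode->i_lock and all names are illustrative:

#include <pthread.h>
#include <stdlib.h>

/* struct obj stands in for struct jbd2_inode; not kernel code. */
struct obj { int dummy; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *cached;	/* stands in for ei->jinode */

static struct obj *get_obj(void)
{
	/* allocate before taking the lock: allocation may block */
	struct obj *o = malloc(sizeof(*o));

	pthread_mutex_lock(&lock);
	if (!cached) {
		if (!o) {
			pthread_mutex_unlock(&lock);
			return NULL;	/* the kernel path returns -ENOMEM */
		}
		cached = o;	/* we won the race: install ours */
		o = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(o);	/* no-op if installed; frees the loser's copy */
	return cached;
}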
213/*
214 * ext4_llseek() copied from generic_file_llseek() to handle both
215 * block-mapped and extent-mapped maxbytes values. This should
216 * otherwise be identical to generic_file_llseek().
217 */
218loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
219{
220 struct inode *inode = file->f_mapping->host;
221 loff_t maxbytes;
222
223 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
224 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
225 else
226 maxbytes = inode->i_sb->s_maxbytes;
227 mutex_lock(&inode->i_mutex);
228 switch (origin) {
229 case SEEK_END:
230 offset += inode->i_size;
231 break;
232 case SEEK_CUR:
233 if (offset == 0) {
234 mutex_unlock(&inode->i_mutex);
235 return file->f_pos;
236 }
237 offset += file->f_pos;
238 break;
239 }
240
241 if (offset < 0 || offset > maxbytes) {
242 mutex_unlock(&inode->i_mutex);
243 return -EINVAL;
244 }
245
246 if (offset != file->f_pos) {
247 file->f_pos = offset;
248 file->f_version = 0;
249 }
250 mutex_unlock(&inode->i_mutex);
251
252 return offset;
253}
254
133const struct file_operations ext4_file_operations = { 255const struct file_operations ext4_file_operations = {
134 .llseek = generic_file_llseek, 256 .llseek = ext4_llseek,
135 .read = do_sync_read, 257 .read = do_sync_read,
136 .write = do_sync_write, 258 .write = do_sync_write,
137 .aio_read = generic_file_aio_read, 259 .aio_read = generic_file_aio_read,
@@ -146,10 +268,10 @@ const struct file_operations ext4_file_operations = {
146 .fsync = ext4_sync_file, 268 .fsync = ext4_sync_file,
147 .splice_read = generic_file_splice_read, 269 .splice_read = generic_file_splice_read,
148 .splice_write = generic_file_splice_write, 270 .splice_write = generic_file_splice_write,
271 .fallocate = ext4_fallocate,
149}; 272};
150 273
151const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
152 .truncate = ext4_truncate,
153 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
154 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
155#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
@@ -159,7 +281,6 @@ const struct inode_operations ext4_file_inode_operations = {
159 .removexattr = generic_removexattr, 281 .removexattr = generic_removexattr,
160#endif 282#endif
161 .check_acl = ext4_check_acl, 283 .check_acl = ext4_check_acl,
162 .fallocate = ext4_fallocate,
163 .fiemap = ext4_fiemap, 284 .fiemap = ext4_fiemap,
164}; 285};
165 286
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
69 * written is queued on a workqueue but may not get scheduled
70 * immediately. When fsync is called, we need to ensure the
71 * conversion is complete before fsync returns.
72 * The inode keeps track of a list of pending/completed IO that
73 * might need the conversion. This function walks through
74 * the list and converts the related unwritten extents for completed IO
75 * to written.
76 * Returns 0 on success, or a negative error code on failure.
77 */
78extern int ext4_flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)) {
92 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list);
94 /*
95 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written.
97 *
98 * When ext4_sync_file() is called, run_queue() may already be
99 * about to flush the work corresponding to this io structure.
100 * It would be a problem if it found that the io structure
101 * related to the work to be scheduled had been freed.
102 *
103 * Thus we need to keep the io structure valid here even after
104 * the conversion has finished. The io structure has a flag to
105 * avoid double conversion from both fsync and the background
106 * workqueue.
107 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0)
112 ret2 = ret;
113 else
114 list_del_init(&io->list);
115 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0;
118}
119
37/* 120/*
38 * If we're not journaling and this is a just-created file, we have to 121 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since 122 * sync our parent directory (if it was freshly created) since
@@ -42,9 +125,11 @@
42 * the parent directory's parent as well, and so on recursively, if 125 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created. 126 * they are also freshly created.
44 */ 127 */
45static void ext4_sync_parent(struct inode *inode) 128static int ext4_sync_parent(struct inode *inode)
46{ 129{
130 struct writeback_control wbc;
47 struct dentry *dentry = NULL; 131 struct dentry *dentry = NULL;
132 int ret = 0;
48 133
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { 134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); 135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -53,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) 138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break; 139 break;
55 inode = dentry->d_parent->d_inode; 140 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping); 141 ret = sync_mapping_buffers(inode->i_mapping);
142 if (ret)
143 break;
144 memset(&wbc, 0, sizeof(wbc));
145 wbc.sync_mode = WB_SYNC_ALL;
146 wbc.nr_to_write = 0; /* only write out the inode */
147 ret = sync_inode(inode, &wbc);
148 if (ret)
149 break;
57 } 150 }
151 return ret;
58} 152}
59 153
60/* 154/*
@@ -78,23 +172,24 @@ int ext4_sync_file(struct file *file, int datasync)
78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
79 int ret; 173 int ret;
80 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
81 176
82 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
83 178
84 trace_ext4_sync_file(file, datasync); 179 trace_ext4_sync_file_enter(file, datasync);
85 180
86 if (inode->i_sb->s_flags & MS_RDONLY) 181 if (inode->i_sb->s_flags & MS_RDONLY)
87 return 0; 182 return 0;
88 183
89 ret = flush_completed_IO(inode); 184 ret = ext4_flush_completed_IO(inode);
90 if (ret < 0) 185 if (ret < 0)
91 return ret; 186 goto out;
92 187
93 if (!journal) { 188 if (!journal) {
94 ret = generic_file_fsync(file, datasync); 189 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry)) 190 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode); 191 ret = ext4_sync_parent(inode);
97 return ret; 192 goto out;
98 } 193 }
99 194
100 /* 195 /*
@@ -111,27 +206,20 @@ int ext4_sync_file(struct file *file, int datasync)
111 * (they were dirtied by commit). But that's OK - the blocks are 206 * (they were dirtied by commit). But that's OK - the blocks are
112 * safe in-journal, which is all fsync() needs to ensure. 207 * safe in-journal, which is all fsync() needs to ensure.
113 */ 208 */
114 if (ext4_should_journal_data(inode)) 209 if (ext4_should_journal_data(inode)) {
115 return ext4_force_commit(inode->i_sb); 210 ret = ext4_force_commit(inode->i_sb);
211 goto out;
212 }
116 213
117 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
118 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
119 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
120 * When the journal is on a different device than the 217 needs_barrier = true;
121 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
122 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
123 * will take care of issuing the barrier. In 220 if (needs_barrier)
124 * data=journal, all of the data blocks are written to 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
125 * the journal device.) 222 out:
126 */ 223 trace_ext4_sync_file_exit(inode, ret);
127 if (ext4_should_writeback_data(inode) &&
128 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
136 return ret; 224 return ret;
137} 225}
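Since the rewritten fsync path distinguishes a datasync (i_datasync_tid) from a full sync (i_sync_tid), userspace can request the cheaper commit when only data needs to be durable. A small usage sketch (file name arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "record\n", 7) != 7)
		perror("write");
	/* fdatasync: waits on i_datasync_tid; skips pure timestamp updates */
	if (fdatasync(fd))
		perror("fdatasync");
	/* fsync: waits on i_sync_tid, covering inode metadata as well */
	if (fsync(fd))
		perror("fsync");
	close(fd);
	return 0;
}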
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
50 * need to use it within a single byte (to ensure we get endianness right). 50 * need to use it within a single byte (to ensure we get endianness right).
51 * We can use memset for the rest of the bitmap as there are no other users. 51 * We can use memset for the rest of the bitmap as there are no other users.
52 */ 52 */
53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) 53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
54{ 54{
55 int i; 55 int i;
56 56
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
65} 65}
66 66
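ext4_mark_bitmap_end() pads the bitmap tail as the comment above describes: the partial byte is set bit by bit, the rest with memset. A sketch of that strategy, assuming end_bit falls on a byte boundary (true for a blocksize*8 bitmap); the kernel uses ext4_set_bit() on the partial byte to keep the on-disk little-endian bit order, and the plain shifts here are for illustration only:

#include <string.h>

/* Illustrative model of ext4_mark_bitmap_end(): mark [start_bit, end_bit)
 * as in-use. Bit order here is native, not ext4's little-endian order. */
static void mark_bitmap_end_sketch(int start_bit, int end_bit, unsigned char *bitmap)
{
	int i = start_bit;

	if (start_bit >= end_bit)
		return;
	/* finish the partial byte one bit at a time */
	for (; i < end_bit && (i & 7); i++)
		bitmap[i >> 3] |= 1u << (i & 7);
	/* whole bytes: memset, as the comment above notes */
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}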
67/* Initializes an uninitialized inode bitmap */ 67/* Initializes an uninitialized inode bitmap */
68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, 68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
69 ext4_group_t block_group, 69 struct buffer_head *bh,
70 struct ext4_group_desc *gdp) 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp)
71{ 72{
72 struct ext4_sb_info *sbi = EXT4_SB(sb); 73 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 74
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
85 } 86 }
86 87
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
89 bh->b_data); 90 bh->b_data);
90 91
91 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 108 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 109 if (!desc)
109 return NULL; 110 return NULL;
111
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 112 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 113 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 114 if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
128
126 ext4_lock_group(sb, block_group); 129 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 130 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 131 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 136 return bh;
134 } 137 }
135 ext4_unlock_group(sb, block_group); 138 ext4_unlock_group(sb, block_group);
139
136 if (buffer_uptodate(bh)) { 140 if (buffer_uptodate(bh)) {
137 /* 141 /*
138 * if not uninit if bh is uptodate, 142 * if not uninit if bh is uptodate,
@@ -148,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
148 * We do it here so the bitmap uptodate bit 152 * We do it here so the bitmap uptodate bit
149 * get set with buffer lock held. 153 * get set with buffer lock held.
150 */ 154 */
155 trace_ext4_load_inode_bitmap(sb, block_group);
151 set_bitmap_uptodate(bh); 156 set_bitmap_uptodate(bh);
152 if (bh_submit_read(bh) < 0) { 157 if (bh_submit_read(bh) < 0) {
153 put_bh(bh); 158 put_bh(bh);
@@ -411,8 +416,8 @@ struct orlov_stats {
411 * for a particular block group or flex_bg. If flex_size is 1, then g 416 * for a particular block group or flex_bg. If flex_size is 1, then g
412 * is a block group number; otherwise it is flex_bg number. 417 * is a block group number; otherwise it is flex_bg number.
413 */ 418 */
414void get_orlov_stats(struct super_block *sb, ext4_group_t g, 419static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
415 int flex_size, struct orlov_stats *stats) 420 int flex_size, struct orlov_stats *stats)
416{ 421{
417 struct ext4_group_desc *desc; 422 struct ext4_group_desc *desc;
418 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; 423 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -645,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
645 *group = parent_group + flex_size; 650 *group = parent_group + flex_size;
646 if (*group > ngroups) 651 if (*group > ngroups)
647 *group = 0; 652 *group = 0;
648 return find_group_orlov(sb, parent, group, mode, 0); 653 return find_group_orlov(sb, parent, group, mode, NULL);
649 } 654 }
650 655
651 /* 656 /*
@@ -712,8 +717,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 717{
713 int free = 0, retval = 0, count; 718 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 719 struct ext4_sb_info *sbi = EXT4_SB(sb);
720 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 721 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 722
723 /*
724 * We have to be sure that new inode allocation does not race with
725 * inode table initialization, because otherwise we may end up
726 * allocating and writing a new inode right before sb_issue_zeroout
727 * takes place, overwriting our new inode with zeroes. So we
728 * take alloc_sem to prevent it.
729 */
730 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 731 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 732 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 733 /* not a free inode */
@@ -724,6 +738,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 738 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 739 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 740 ext4_unlock_group(sb, group);
741 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 742 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 743 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 744 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +787,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 787 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 788err_ret:
774 ext4_unlock_group(sb, group); 789 ext4_unlock_group(sb, group);
790 up_read(&grp->alloc_sem);
775 return retval; 791 return retval;
776} 792}
777 793
@@ -1012,7 +1028,7 @@ got:
1012 inode->i_generation = sbi->s_next_generation++; 1028 inode->i_generation = sbi->s_next_generation++;
1013 spin_unlock(&sbi->s_next_gen_lock); 1029 spin_unlock(&sbi->s_next_gen_lock);
1014 1030
1015 ei->i_state_flags = 0; 1031 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
1016 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1032 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1017 1033
1018 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
@@ -1027,7 +1043,7 @@ got:
1027 if (err) 1043 if (err)
1028 goto fail_free_drop; 1044 goto fail_free_drop;
1029 1045
1030 err = ext4_init_security(handle, inode, dir); 1046 err = ext4_init_security(handle, inode, dir, qstr);
1031 if (err) 1047 if (err)
1032 goto fail_free_drop; 1048 goto fail_free_drop;
1033 1049
@@ -1039,6 +1055,11 @@ got:
1039 } 1055 }
1040 } 1056 }
1041 1057
1058 if (ext4_handle_valid(handle)) {
1059 ei->i_sync_tid = handle->h_transaction->t_tid;
1060 ei->i_datasync_tid = handle->h_transaction->t_tid;
1061 }
1062
1042 err = ext4_mark_inode_dirty(handle, inode); 1063 err = ext4_mark_inode_dirty(handle, inode);
1043 if (err) { 1064 if (err) {
1044 ext4_std_error(sb, err); 1065 ext4_std_error(sb, err);
@@ -1205,3 +1226,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1226 }
1206 return count; 1227 return count;
1207} 1228}
1229
1230/*
1231 * Zeroes a not-yet-zeroed inode table - just writes zeroes through the whole
1232 * inode table. Must be called without any spinlock held. The only place
1233 * this is called from on an active filesystem is the ext4lazyinit
1234 * thread, so we do not need any special locks; however, we have to prevent
1235 * inode allocation from the current group, so we take the alloc_sem lock to
1236 * block ext4_claim_inode until we are finished.
1237 */
1238extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1239 int barrier)
1240{
1241 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1242 struct ext4_sb_info *sbi = EXT4_SB(sb);
1243 struct ext4_group_desc *gdp = NULL;
1244 struct buffer_head *group_desc_bh;
1245 handle_t *handle;
1246 ext4_fsblk_t blk;
1247 int num, ret = 0, used_blks = 0;
1248
1249 /* This should not happen, but just to be sure check this */
1250 if (sb->s_flags & MS_RDONLY) {
1251 ret = 1;
1252 goto out;
1253 }
1254
1255 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1256 if (!gdp)
1257 goto out;
1258
1259 /*
1260 * We do not need to lock this, because we are the only one
1261 * handling this flag.
1262 */
1263 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1264 goto out;
1265
1266 handle = ext4_journal_start_sb(sb, 1);
1267 if (IS_ERR(handle)) {
1268 ret = PTR_ERR(handle);
1269 goto out;
1270 }
1271
1272 down_write(&grp->alloc_sem);
1273 /*
1274 * If the inode bitmap was already initialized, there may be some
1275 * used inodes, so we need to skip the blocks with used inodes in
1276 * the inode table.
1277 */
1278 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1279 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1280 ext4_itable_unused_count(sb, gdp)),
1281 sbi->s_inodes_per_block);
1282
1283 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1284 ext4_error(sb, "Something is wrong with group %u\n"
1285 "Used itable blocks: %d"
1286 "itable unused count: %u\n",
1287 group, used_blks,
1288 ext4_itable_unused_count(sb, gdp));
1289 ret = 1;
1290 goto out;
1291 }
1292
1293 blk = ext4_inode_table(sb, gdp) + used_blks;
1294 num = sbi->s_itb_per_group - used_blks;
1295
1296 BUFFER_TRACE(group_desc_bh, "get_write_access");
1297 ret = ext4_journal_get_write_access(handle,
1298 group_desc_bh);
1299 if (ret)
1300 goto err_out;
1301
1302 /*
1303 * Skip zeroout if the inode table is full. But we set the ZEROED
1304 * flag anyway, because obviously, when it is full it does not need
1305 * further zeroing.
1306 */
1307 if (unlikely(num == 0))
1308 goto skip_zeroout;
1309
1310 ext4_debug("going to zero out inode table in group %d\n",
1311 group);
1312 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1313 if (ret < 0)
1314 goto err_out;
1315 if (barrier)
1316 blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
1317
1318skip_zeroout:
1319 ext4_lock_group(sb, group);
1320 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1321 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1322 ext4_unlock_group(sb, group);
1323
1324 BUFFER_TRACE(group_desc_bh,
1325 "call ext4_handle_dirty_metadata");
1326 ret = ext4_handle_dirty_metadata(handle, NULL,
1327 group_desc_bh);
1328
1329err_out:
1330 up_write(&grp->alloc_sem);
1331 ext4_journal_stop(handle);
1332out:
1333 return ret;
1334}
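To make the used_blks computation in ext4_init_inode_table() concrete, a worked instance with common mke2fs defaults (8192 inodes per group, 16 inodes per 4 KiB block; all values illustrative):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Illustrative values: 8192 inodes/group, 512 still unused,
	 * 16 inodes per 4 KiB block -> 512 itable blocks per group. */
	unsigned int inodes_per_group = 8192, itable_unused = 512;
	unsigned int inodes_per_block = 16;
	unsigned int used_blks =
		DIV_ROUND_UP(inodes_per_group - itable_unused, inodes_per_block);

	/* zeroing starts used_blks into the table and covers the rest */
	printf("skip %u blocks, zero %u blocks\n",
	       used_blks, 512 - used_blks);	/* skip 480, zero 32 */
	return 0;
}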
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..e3126c051006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,13 +55,27 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create);
75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 79
64/* 80/*
65 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
@@ -157,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
157 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
158 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
159 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
160 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 176 ret = ext4_journal_restart(handle, nblocks);
161 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
162 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
163 179
@@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 188 handle_t *handle;
173 int err; 189 int err;
174 190
191 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 192 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 194 goto no_delete;
@@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
544} 561}
545 562
546/** 563/**
547 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
548 * of direct blocks that need to be allocated for the given branch. 565 * of direct blocks that need to be allocated for the given branch.
549 * 566 *
550 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
583 600
584/** 601/**
585 * ext4_alloc_blocks: allocate multiple blocks needed for a branch 602 * ext4_alloc_blocks: allocate multiple blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocating at
606 * @goal: preferred physical block of allocation
586 * @indirect_blks: the number of blocks needed to allocate for indirect 607 * @indirect_blks: the number of blocks needed to allocate for indirect
587 * blocks 608 * blocks
588 * 609 * @blks: number of desired blocks
589 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
590 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
591 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
592 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
593 */ 616 */
594static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -616,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
616 while (target > 0) { 639 while (target > 0) {
617 count = target; 640 count = target;
618 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
619 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
620 goal, &count, err); 643 0, &count, err);
621 if (*err) 644 if (*err)
622 goto failed_out; 645 goto failed_out;
623 646
@@ -697,15 +720,17 @@ allocated:
697 return ret; 720 return ret;
698failed_out: 721failed_out:
699 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
700 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
701 return ret; 724 return ret;
702} 725}
703 726
704/** 727/**
705 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
706 * @inode: owner 730 * @inode: owner
707 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
708 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
709 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
710 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
711 * 736 *
@@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 780 * parent to disk.
756 */ 781 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
758 branch[n].bh = bh; 788 branch[n].bh = bh;
759 lock_buffer(bh); 789 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
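The added sb_getblk() check closes a NULL dereference: sb_getblk() can fail under memory pressure, and the old code went straight to lock_buffer(bh). Isolated, the guard is:

        bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
        if (unlikely(!bh)) {
                err = -EIO;     /* surface the failure as an I/O error */
                goto failed;    /* unwind the blocks already allocated */
        }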
@@ -793,26 +823,27 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
793 return err; 823 return err;
794failed: 824failed:
795 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
796 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
797 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
798 /* 828 /*
799 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
800 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
801 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
802 */ 832 */
803 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
804 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
805 } 835 }
806 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
807 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
808 838
809 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
810 840
811 return err; 841 return err;
812} 842}
813 843
814/** 844/**
815 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
816 * @inode: owner 847 * @inode: owner
817 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
818 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -893,7 +924,7 @@ err_out:
893 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
894 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
895 } 926 }
896 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
897 blks, 0); 928 blks, 0);
898 929
899 return err; 930 return err;
@@ -942,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
942 int count = 0; 973 int count = 0;
943 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
944 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
945 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
946 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
947 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1027,6 +1059,8 @@ cleanup:
1027 partial--; 1059 partial--;
1028 } 1060 }
1029out: 1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1030 return err; 1064 return err;
1031} 1065}
1032 1066
@@ -1068,7 +1102,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1068 * Calculate the number of metadata blocks that need to be reserved 1102 * Calculate the number of metadata blocks that need to be reserved
1069 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1070 */ 1104 */
1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1072{ 1106{
1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1074 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1207,8 +1241,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1241 break;
1208 idx++; 1242 idx++;
1209 num++; 1243 num++;
1210 if (num >= max_pages) 1244 if (num >= max_pages) {
1245 done = 1;
1211 break; 1246 break;
1247 }
1212 } 1248 }
1213 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1214 } 1250 }
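Without the added done = 1, breaking out of the inner for loop at max_pages left the enclosing while loop running, so ext4_num_dirty_pages() could keep pulling pagevecs and overshoot its budget. A condensed sketch of the corrected loop (per-page contiguity checks elided):

        pagevec_init(&pvec, 0);
        while (!done) {
                nr_pages = pagevec_lookup_tag(&pvec, mapping, &idx,
                                              PAGECACHE_TAG_DIRTY,
                                              (pgoff_t)PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        /* ... skip non-contiguous or unsuitable pages ... */
                        if (++num >= max_pages) {
                                done = 1;       /* stop the outer loop too */
                                break;
                        }
                }
                pagevec_release(&pvec);
        }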
@@ -1305,7 +1341,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1305 * avoid double accounting 1341 * avoid double accounting
1306 */ 1342 */
1307 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1308 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1309 /* 1345 /*
1310 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1311 * could have changed the inode type in between 1347 * could have changed the inode type in between
@@ -1335,7 +1371,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1335 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1336 } 1372 }
1337 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1339 1375
1340 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
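The two hunks above retire the int field EXT4_I(inode)->i_delalloc_reserved_flag in favour of one bit, EXT4_STATE_DELALLOC_RESERVED, in the inode's dynamic-state mask. The helpers are assumed to be thin wrappers over atomic bitops, roughly as below (the real versions in fs/ext4/ext4.h also cope with 32/64-bit packing); an atomic bit avoids a racy read-modify-write on a plain int and keeps all transient inode state in one word.

        static inline void ext4_set_inode_state(struct inode *inode, int bit)
        {
                set_bit(bit, &EXT4_I(inode)->i_state_flags);
        }

        static inline void ext4_clear_inode_state(struct inode *inode, int bit)
        {
                clear_bit(bit, &EXT4_I(inode)->i_state_flags);
        }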
@@ -1538,10 +1574,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1575 return 0;
1540 /* 1576 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1582 * ever write the buffer.
1547 */ 1583 */
@@ -1863,7 +1899,7 @@ static int ext4_journalled_write_end(struct file *file,
1863/* 1899/*
1864 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1865 */ 1901 */
1866static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1902static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1867{ 1903{
1868 int retries = 0; 1904 int retries = 0;
1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1894,7 +1930,7 @@ repeat:
1894 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1895 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1896 */ 1932 */
1897 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1898 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1899 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1900 yield(); 1936 yield();
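As with ext4_new_meta_blocks() earlier, ext4_claim_free_blocks() now takes a trailing flags argument, and the 0 passed here keeps the old reservation semantics. Assumed prototype:

        extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
                                          s64 nblocks, unsigned int flags);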
@@ -1995,16 +2031,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2031 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2033 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2034static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map)
1999{ 2036{
2000 long pages_skipped;
2001 struct pagevec pvec; 2037 struct pagevec pvec;
2002 unsigned long index, end; 2038 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit;
2006 2048
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2051 /*
2009 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,124 +2063,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2063 if (nr_pages == 0)
2021 break; 2064 break;
2022 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0;
2023 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2024 2068
2025 index = page->index; 2069 index = page->index;
2026 if (index > end) 2070 if (index > end)
2027 break; 2071 break;
2072
2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK;
2075 else
2076 len = PAGE_CACHE_SIZE;
2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk);
2082 }
2028 index++; 2083 index++;
2029 2084
2030 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2032 2087
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2088 /*
2042 * In error case, we have to continue because 2089 * If the page does not have buffers (for
2043 * remaining pages are still locked 2090 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on.
2045 */ 2093 */
2046 if (ret == 0) 2094 if (!page_has_buffers(page)) {
2047 ret = err; 2095 if (__block_write_begin(page, 0, len,
2048 } 2096 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2097 skip_page:
2050 } 2098 unlock_page(page);
2051 return ret; 2099 continue;
2052} 2100 }
2053 2101 commit_write = 1;
2054/* 2102 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2056 *
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2103
2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0;
2105 do { 2106 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2107 if (!bh)
2107 break; 2108 goto skip_page;
2108 2109 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2110 (cur_logical <= (map->m_lblk +
2110 2111 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2115 }
2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh);
2122 }
2124 2123
2125 } else if (buffer_mapped(bh)) 2124 /* skip page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2126 skip_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2127 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2128 block_start += bh->b_size;
2130 cur_logical++; 2129 cur_logical++;
2131 pblock++; 2130 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2131 } while (bh != page_bufs);
2132
2133 if (skip_page)
2134 goto skip_page;
2135
2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len);
2139
2140 clear_page_dirty_for_io(page);
2141 /*
2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this
2144 * restriction.
2145 */
2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc);
2151 else
2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc);
2154
2155 if (!err)
2156 mpd->pages_written++;
2157 /*
2158 * In error case, we have to continue because
2159 * remaining pages are still locked
2160 */
2161 if (ret == 0)
2162 ret = err;
2133 } 2163 }
2134 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2135 } 2165 }
2166 ext4_io_submit(&io_submit);
2167 return ret;
2136} 2168}
2137 2169
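The rewritten mpage_da_submit_io() above absorbs both the deleted mpage_put_bnr_to_bhs() and the per-page ->writepage() call: in a single pass it attaches buffers where they are missing, stamps the freshly allocated block numbers into delayed buffers, and submits the I/O itself. A condensed, illustrative flow for one page (the helper name and error handling are ours, not the kernel's):

        static int da_submit_one_page(struct mpage_da_data *mpd,
                                      struct ext4_map_blocks *map,
                                      struct page *page, unsigned int len,
                                      struct ext4_io_submit *io)
        {
                struct inode *inode = mpd->inode;

                /* No buffers yet? Attach them without allocating blocks. */
                if (!page_has_buffers(page) &&
                    __block_write_begin(page, 0, len, noalloc_get_block_write)) {
                        unlock_page(page);      /* stays dirty, retried later */
                        return 0;
                }
                /* ... walk page_buffers(page): clear BH_Delay and install
                 *     map->m_pblk-based block numbers, as in the hunk above ... */
                block_commit_write(page, 0, len);       /* dirty + uptodate */
                clear_page_dirty_for_io(page);
                if (ext4_should_journal_data(inode) && PageChecked(page))
                        return __ext4_journalled_writepage(page, len);
                if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
                        return ext4_bio_write_page(io, page, len, mpd->wbc);
                return block_write_full_page(page, noalloc_get_block_write,
                                             mpd->wbc);
        }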
2138 2170static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt)
2141{ 2171{
2142 int nr_pages, i; 2172 int nr_pages, i;
2143 pgoff_t index, end; 2173 pgoff_t index, end;
@@ -2145,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2145 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2146 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2147 2177
2148 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2178 index = mpd->first_page;
2149 end = (logical + blk_cnt - 1) >> 2179 end = mpd->next_page - 1;
2150 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2151 while (index <= end) { 2180 while (index <= end) {
2152 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2153 if (nr_pages == 0) 2182 if (nr_pages == 0)
@@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2216}
2188 2217
2189/* 2218/*
2190 * mpage_da_map_blocks - go through given space 2219 * mpage_da_map_and_submit - go through given space, map them
2220 * if necessary, and then submit them for I/O
2191 * 2221 *
2192 * @mpd - bh describing space 2222 * @mpd - bh describing space
2193 * 2223 *
2194 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2195 * 2225 *
2196 */ 2226 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2228{
2199 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2230 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2205 2235
2206 /* 2236 /*
2207 * We consider only non-mapped and non-allocated blocks 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage.
2208 */ 2239 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2240 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2244 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2245
2220 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2247 BUG_ON(!handle);
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2232 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2233 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2234 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2235 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2236 * 2262 *
2237 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
@@ -2252,17 +2278,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2278
2253 err = blks; 2279 err = blks;
2254 /* 2280 /*
2255 * If get block returns with error we simply 2281 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2282 * appear to be free blocks, we will just let
2257 * writepages will find the dirty page again 2283 * mpage_da_submit_io() unlock all of the pages.
2258 */ 2284 */
2259 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2260 return 0; 2286 goto submit_io;
2261 2287
2262 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2290 mpd->retval = err;
2265 return 0; 2291 goto submit_io;
2266 } 2292 }
2267 2293
2268 /* 2294 /*
@@ -2285,12 +2311,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2285 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2286 } 2312 }
2287 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2314 ext4_da_block_invalidatepages(mpd);
2289 mpd->b_size >> mpd->inode->i_blkbits); 2315
2290 return err; 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1;
2318 return;
2291 } 2319 }
2292 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2293 2321
2322 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2325 int i;
@@ -2299,18 +2328,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2329 }
2301 2330
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2333 if (err)
2313 return err; 2334 /* This only happens if the journal is aborted */
2335 return;
2314 } 2336 }
2315 2337
2316 /* 2338 /*
@@ -2321,10 +2343,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err)
2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino);
2325 } 2351 }
2326 2352
2327 return 0; 2353submit_io:
2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1;
2328} 2356}
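The rename from mpage_da_map_blocks() to mpage_da_map_and_submit() reflects a new contract: instead of returning 0/err and leaving the caller to invoke mpage_da_submit_io(), the function always ends by either submitting or (on hard allocation failure) invalidating the accumulated extent, and sets mpd->io_done itself. In outline (nothing_to_map() and map_the_extent() are our shorthand for the inline logic above, not real kernel helpers):

        static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
        {
                struct ext4_map_blocks map, *mapp = NULL;

                if (nothing_to_map(mpd))
                        goto submit_io;         /* already mapped, or empty */
                if (map_the_extent(mpd, &map) < 0) {
                        ext4_da_block_invalidatepages(mpd);
                        mpd->io_done = 1;       /* range has been dealt with */
                        return;
                }
                mapp = &map;
        submit_io:
                mpage_da_submit_io(mpd, mapp);  /* writes and unlocks pages */
                mpd->io_done = 1;
        }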
2329 2357
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2429,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2403 */ 2431 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2432 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2433 return;
2408} 2434}
2409 2435
@@ -2413,104 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2413} 2439}
2414 2440
2415/* 2441/*
2416 * __mpage_da_writepage - finds extent of pages and blocks
2417 *
2418 * @page: page to consider
2419 * @wbc: not used, we just follow rules
2420 * @data: context
2421 *
2422 * The function finds extents of pages and scan them for all blocks.
2423 */
2424static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data)
2426{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head;
2430 sector_t logical;
2431
2432 /*
2433 * Can we merge this page to current extent?
2434 */
2435 if (mpd->next_page != page->index) {
2436 /*
2437 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage()
2439 */
2440 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0)
2442 mpage_da_submit_io(mpd);
2443 /*
2444 * skip rest of the page in the page_vec
2445 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL;
2450 }
2451
2452 /*
2453 * Start next extent of pages ...
2454 */
2455 mpd->first_page = page->index;
2456
2457 /*
2458 * ... and blocks
2459 */
2460 mpd->b_size = 0;
2461 mpd->b_state = 0;
2462 mpd->b_blocknr = 0;
2463 }
2464
2465 mpd->next_page = page->index + 1;
2466 logical = (sector_t) page->index <<
2467 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2468
2469 if (!page_has_buffers(page)) {
2470 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2471 (1 << BH_Dirty) | (1 << BH_Uptodate));
2472 if (mpd->io_done)
2473 return MPAGE_DA_EXTENT_TAIL;
2474 } else {
2475 /*
2476 * Page with regular buffer heads, just add all dirty ones
2477 */
2478 head = page_buffers(page);
2479 bh = head;
2480 do {
2481 BUG_ON(buffer_locked(bh));
2482 /*
2483 * We need to try to allocate
2484 * unmapped blocks in the same page.
2485 * Otherwise we won't make progress
2486 * with the page in ext4_writepage
2487 */
2488 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2489 mpage_add_bh_to_extent(mpd, logical,
2490 bh->b_size,
2491 bh->b_state);
2492 if (mpd->io_done)
2493 return MPAGE_DA_EXTENT_TAIL;
2494 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2495 /*
2496 * mapped dirty buffer. We need to update
2497 * the b_state because we look at
2498 * b_state in mpage_da_map_blocks. We don't
2499 * update b_size because if we find an
2500 * unmapped buffer_head later we need to
2501 * use the b_state flag of that buffer_head.
2502 */
2503 if (mpd->b_size == 0)
2504 mpd->b_state = bh->b_state & BH_FLAGS;
2505 }
2506 logical++;
2507 } while ((bh = bh->b_this_page) != head);
2508 }
2509
2510 return 0;
2511}
2512
2513/*
2514 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2515 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2516 * reserve space for a single block. 2444 * reserve space for a single block.
@@ -2550,8 +2478,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2552 /* 2480 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2482 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2484 if (ret)
@@ -2583,7 +2510,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2510/*
2584 * This function is used as a standard get_block_t callback function 2511 * This function is used as a standard get_block_t callback function
2585 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2588 * 2515 *
2589 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2550,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2550 int ret = 0;
2624 int err; 2551 int err;
2625 2552
2553 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
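Moving ClearPageChecked() into __ext4_journalled_writepage() means every caller, both ext4_writepage() below and the reworked delalloc submit path above, gets the flag transition for free. The caller-side contrast, taken from this diff:

        /* before: each caller cleared the flag itself */
        if (PageChecked(page) && ext4_should_journal_data(inode)) {
                ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
        }

        /* after: the helper owns the flag transition */
        if (PageChecked(page) && ext4_should_journal_data(inode))
                return __ext4_journalled_writepage(page, len);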
@@ -2661,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2661 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2662 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2663 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2664 * we are writing back data modified via mmap(), noone guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2665 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2666 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2667 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
@@ -2700,84 +2628,57 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2628static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2629 struct writeback_control *wbc)
2702{ 2630{
2703 int ret = 0; 2631 int ret = 0, commit_write = 0;
2704 loff_t size; 2632 loff_t size;
2705 unsigned int len; 2633 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2634 struct buffer_head *page_bufs = NULL;
2707 struct inode *inode = page->mapping->host; 2635 struct inode *inode = page->mapping->host;
2708 2636
2709 trace_ext4_writepage(inode, page); 2637 trace_ext4_writepage(page);
2710 size = i_size_read(inode); 2638 size = i_size_read(inode);
2711 if (page->index == size >> PAGE_CACHE_SHIFT) 2639 if (page->index == size >> PAGE_CACHE_SHIFT)
2712 len = size & ~PAGE_CACHE_MASK; 2640 len = size & ~PAGE_CACHE_MASK;
2713 else 2641 else
2714 len = PAGE_CACHE_SIZE; 2642 len = PAGE_CACHE_SIZE;
2715 2643
2716 if (page_has_buffers(page)) { 2644 /*
2717 page_bufs = page_buffers(page); 2645 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2646 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2647 * fails, redirty the page and move on.
2720 /* 2648 */
2721 * We don't want to do block allocation 2649 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2650 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2651 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2652 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2653 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2654 unlock_page(page);
2730 return 0; 2655 return 0;
2731 } 2656 }
2732 } else { 2657 commit_write = 1;
2658 }
2659 page_bufs = page_buffers(page);
2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2661 ext4_bh_delay_or_unwritten)) {
2733 /* 2662 /*
2734 * The test for page_has_buffers() is subtle: 2663 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2664 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2665 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2666 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2667 */
2746 ret = block_prepare_write(page, 0, len, 2668 goto redirty_page;
2747 noalloc_get_block_write); 2669 }
2748 if (!ret) { 2670 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redirty the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2671 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2672 block_commit_write(page, 0, len);
2769 }
2770 2673
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2674 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2675 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2676 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2677 * doesn't seem much point in redirtying the page here.
2775 */ 2678 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2679 return __ext4_journalled_writepage(page, len);
2778 }
2779 2680
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2681 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2682 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2683 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2684 wbc, ext4_end_io_buffer_write);
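One subtlety in the rewritten ext4_writepage(): the redirty_page label sits inside the if (!page_has_buffers(page)) block, and the later delay/unwritten check jumps back into it. Jumping into a block is legal C (no variable-length-array scope is crossed) and keeps the redirty/unlock/return sequence in one place:

        if (!page_has_buffers(page)) {
                if (__block_write_begin(page, 0, len, noalloc_get_block_write)) {
        redirty_page:           /* also reached by the goto below */
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
                commit_write = 1;
        }
        page_bufs = page_buffers(page);
        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                              ext4_bh_delay_or_unwritten))
                goto redirty_page;      /* never allocate from writepage */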
@@ -2790,7 +2691,7 @@ static int ext4_writepage(struct page *page,
2790 2691
2791/* 2692/*
2792 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2793 * calulate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2794 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2795 * ext4_da_writepages() will loop calling this before 2696 * ext4_da_writepages() will loop calling this before
2796 * the block allocation. 2697 * the block allocation.
@@ -2815,37 +2716,42 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2815 2716
2816/* 2717/*
2817 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2818 * address space and call the callback function (which usually writes 2719 * address space and accumulate pages that need writing, and call
2819 * the pages). 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2820 * 2721 * and then write them.
2821 * This is a forked version of write_cache_pages(). Differences:
2822 * Range cyclic is ignored.
2823 * no_nrwrite_index_update is always presumed true
2824 */ 2722 */
2825static int write_cache_pages_da(struct address_space *mapping, 2723static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2725 struct mpage_da_data *mpd,
2726 pgoff_t *done_index)
2828{ 2727{
2829 int ret = 0; 2728 struct buffer_head *bh, *head;
2830 int done = 0; 2729 struct inode *inode = mapping->host;
2831 struct pagevec pvec; 2730 struct pagevec pvec;
2832 int nr_pages; 2731 unsigned int nr_pages;
2833 pgoff_t index; 2732 sector_t logical;
2834 pgoff_t end; /* Inclusive */ 2733 pgoff_t index, end;
2835 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2836 2735 int i, tag, ret = 0;
2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc;
2739 mpd->inode = inode;
2837 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2743
2841 while (!done && (index <= end)) { 2744 if (wbc->sync_mode == WB_SYNC_ALL)
2842 int i; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else
2747 tag = PAGECACHE_TAG_DIRTY;
2843 2748
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2749 *done_index = index;
2845 PAGECACHE_TAG_DIRTY, 2750 while (index <= end) {
2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2753 if (nr_pages == 0)
2848 break; 2754 return 0;
2849 2755
2850 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2851 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
@@ -2857,58 +2763,98 @@ static int write_cache_pages_da(struct address_space *mapping,
2857 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2858 * because we have a reference on the page. 2764 * because we have a reference on the page.
2859 */ 2765 */
2860 if (page->index > end) { 2766 if (page->index > end)
2861 done = 1; 2767 goto out;
2862 break; 2768
2769 *done_index = page->index + 1;
2770
2771 /*
2772 * If we can't merge this page, and we have
 2773 * accumulated a contiguous region, write it
2774 */
2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail;
2863 } 2779 }
2864 2780
2865 lock_page(page); 2781 lock_page(page);
2866 2782
2867 /* 2783 /*
2868 * Page truncated or invalidated. We can freely skip it 2784 * If the page is no longer dirty, or its
2869 * then, even for data integrity operations: the page 2785 * mapping no longer corresponds to the inode we
2870 * has disappeared concurrently, so there could be no 2786 * are writing (which means it has been
2871 * real expectation of this data integrity operation 2787 * truncated or invalidated), or the page is
2872 * even if there is now a new, dirty page at the same 2788 * already under writeback and we are not
2873 * pagecache address. 2789 * doing a data integrity writeback, skip the page
2874 */ 2790 */
2875 if (unlikely(page->mapping != mapping)) { 2791 if (!PageDirty(page) ||
2876continue_unlock: 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) {
2877 unlock_page(page); 2795 unlock_page(page);
2878 continue; 2796 continue;
2879 } 2797 }
2880 2798
2881 if (!PageDirty(page)) { 2799 wait_on_page_writeback(page);
2882 /* someone wrote it for us */
2883 goto continue_unlock;
2884 }
2885
2886 if (PageWriteback(page)) {
2887 if (wbc->sync_mode != WB_SYNC_NONE)
2888 wait_on_page_writeback(page);
2889 else
2890 goto continue_unlock;
2891 }
2892
2893 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2894 if (!clear_page_dirty_for_io(page))
2895 goto continue_unlock;
2896 2801
2897 ret = __mpage_da_writepage(page, wbc, mpd); 2802 if (mpd->next_page != page->index)
2898 if (unlikely(ret)) { 2803 mpd->first_page = page->index;
2899 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2804 mpd->next_page = page->index + 1;
2900 unlock_page(page); 2805 logical = (sector_t) page->index <<
2901 ret = 0; 2806 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2902 } else { 2807
2903 done = 1; 2808 if (!page_has_buffers(page)) {
2904 break; 2809 mpage_add_bh_to_extent(mpd, logical,
2905 } 2810 PAGE_CACHE_SIZE,
2811 (1 << BH_Dirty) | (1 << BH_Uptodate));
2812 if (mpd->io_done)
2813 goto ret_extent_tail;
2814 } else {
2815 /*
2816 * Page with regular buffer heads,
2817 * just add all dirty ones
2818 */
2819 head = page_buffers(page);
2820 bh = head;
2821 do {
2822 BUG_ON(buffer_locked(bh));
2823 /*
2824 * We need to try to allocate
2825 * unmapped blocks in the same page.
2826 * Otherwise we won't make progress
2827 * with the page in ext4_writepage
2828 */
2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2830 mpage_add_bh_to_extent(mpd, logical,
2831 bh->b_size,
2832 bh->b_state);
2833 if (mpd->io_done)
2834 goto ret_extent_tail;
2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2836 /*
2837 * mapped dirty buffer. We need
2838 * to update the b_state
2839 * because we look at b_state
 2840 * in mpage_da_map_and_submit(). We
2841 * don't update b_size because
2842 * if we find an unmapped
2843 * buffer_head later we need to
2844 * use the b_state flag of that
2845 * buffer_head.
2846 */
2847 if (mpd->b_size == 0)
2848 mpd->b_state = bh->b_state & BH_FLAGS;
2849 }
2850 logical++;
2851 } while ((bh = bh->b_this_page) != head);
2906 } 2852 }
2907 2853
2908 if (nr_to_write > 0) { 2854 if (nr_to_write > 0) {
2909 nr_to_write--; 2855 nr_to_write--;
2910 if (nr_to_write == 0 && 2856 if (nr_to_write == 0 &&
2911 wbc->sync_mode == WB_SYNC_NONE) { 2857 wbc->sync_mode == WB_SYNC_NONE)
2912 /* 2858 /*
2913 * We stop writing back only if we are 2859 * We stop writing back only if we are
2914 * not doing integrity sync. In case of 2860 * not doing integrity sync. In case of
@@ -2919,14 +2865,18 @@ continue_unlock:
2919 * pages, but have not synced all of the 2865 * pages, but have not synced all of the
2920 * old dirty pages. 2866 * old dirty pages.
2921 */ 2867 */
2922 done = 1; 2868 goto out;
2923 break;
2924 }
2925 } 2869 }
2926 } 2870 }
2927 pagevec_release(&pvec); 2871 pagevec_release(&pvec);
2928 cond_resched(); 2872 cond_resched();
2929 } 2873 }
2874 return 0;
2875ret_extent_tail:
2876 ret = MPAGE_DA_EXTENT_TAIL;
2877out:
2878 pagevec_release(&pvec);
2879 cond_resched();
2930 return ret; 2880 return ret;
2931} 2881}
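write_cache_pages_da() now follows the tagged-writeback protocol: for WB_SYNC_ALL the pages in the range are first stamped with PAGECACHE_TAG_TOWRITE (via the tag_pages_for_writeback() call added in ext4_da_writepages() below) and the lookup walks that tag instead of TAG_DIRTY, so pages dirtied after the sync started cannot make an integrity writeback livelock. The protocol in miniature (index, end, pvec as declared in the function above):

        int tag;

        if (wbc->sync_mode == WB_SYNC_ALL) {
                tag_pages_for_writeback(mapping, index, end);   /* snapshot */
                tag = PAGECACHE_TAG_TOWRITE;
        } else {
                tag = PAGECACHE_TAG_DIRTY;
        }
        nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);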
2932 2882
@@ -2940,13 +2890,14 @@ static int ext4_da_writepages(struct address_space *mapping,
2940 struct mpage_da_data mpd; 2890 struct mpage_da_data mpd;
2941 struct inode *inode = mapping->host; 2891 struct inode *inode = mapping->host;
2942 int pages_written = 0; 2892 int pages_written = 0;
2943 long pages_skipped;
2944 unsigned int max_pages; 2893 unsigned int max_pages;
2945 int range_cyclic, cycled = 1, io_done = 0; 2894 int range_cyclic, cycled = 1, io_done = 0;
2946 int needed_blocks, ret = 0; 2895 int needed_blocks, ret = 0;
2947 long desired_nr_to_write, nr_to_writebump = 0; 2896 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2897 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2899 pgoff_t done_index = 0;
2900 pgoff_t end;
2950 2901
2951 trace_ext4_da_writepages(inode, wbc); 2902 trace_ext4_da_writepages(inode, wbc);
2952 2903
@@ -2982,8 +2933,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2933 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2934 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2935 wbc->range_cyclic = 0;
2985 } else 2936 end = -1;
2937 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2938 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2939 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2940 }
2987 2941
2988 /* 2942 /*
2989 * This works around two forms of stupidity. The first is in 2943 * This works around two forms of stupidity. The first is in
@@ -3002,9 +2956,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 2956 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 2957 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 2959 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 2960 if (wbc->nr_to_write == LONG_MAX)
3007 else 2961 desired_nr_to_write = wbc->nr_to_write;
2962 else
2963 desired_nr_to_write = wbc->nr_to_write * 8;
2964 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 2966 max_pages);
3010 if (desired_nr_to_write > max_pages) 2967 if (desired_nr_to_write > max_pages)
@@ -3015,12 +2972,10 @@ static int ext4_da_writepages(struct address_space *mapping,
3015 wbc->nr_to_write = desired_nr_to_write; 2972 wbc->nr_to_write = desired_nr_to_write;
3016 } 2973 }
3017 2974
3018 mpd.wbc = wbc;
3019 mpd.inode = mapping->host;
3020
3021 pages_skipped = wbc->pages_skipped;
3022
3023retry: 2975retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL)
2977 tag_pages_for_writeback(mapping, index, end);
2978
3024 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
3025 2980
3026 /* 2981 /*
@@ -3043,32 +2998,18 @@ retry:
3043 } 2998 }
3044 2999
3045 /* 3000 /*
3046 * Now call __mpage_da_writepage to find the next 3001 * Now call write_cache_pages_da() to find the next
3047 * contiguous region of logical blocks that need 3002 * contiguous region of logical blocks that need
3048 * blocks to be allocated by ext4. We don't actually 3003 * blocks to be allocated by ext4 and submit them.
3049 * submit the blocks for I/O here, even though
3050 * write_cache_pages thinks it will, and will set the
3051 * pages as clean for write before calling
3052 * __mpage_da_writepage().
3053 */ 3004 */
3054 mpd.b_size = 0; 3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3055 mpd.b_state = 0;
3056 mpd.b_blocknr = 0;
3057 mpd.first_page = 0;
3058 mpd.next_page = 0;
3059 mpd.io_done = 0;
3060 mpd.pages_written = 0;
3061 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd);
3063 /* 3006 /*
3064 * If we have a contiguous extent of pages and we 3007 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3008 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3009 * them for I/O.
3067 */ 3010 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3012 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3013 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3014 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3015 trace_ext4_da_write_pages(inode, &mpd);
@@ -3082,7 +3023,6 @@ retry:
3082 * and try again 3023 * and try again
3083 */ 3024 */
3084 jbd2_journal_force_commit_nested(sbi->s_journal); 3025 jbd2_journal_force_commit_nested(sbi->s_journal);
3085 wbc->pages_skipped = pages_skipped;
3086 ret = 0; 3026 ret = 0;
3087 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3088 /* 3028 /*
@@ -3090,7 +3030,6 @@ retry:
3090 * rest of the pages 3030 * rest of the pages
3091 */ 3031 */
3092 pages_written += mpd.pages_written; 3032 pages_written += mpd.pages_written;
3093 wbc->pages_skipped = pages_skipped;
3094 ret = 0; 3033 ret = 0;
3095 io_done = 1; 3034 io_done = 1;
3096 } else if (wbc->nr_to_write) 3035 } else if (wbc->nr_to_write)
@@ -3108,21 +3047,15 @@ retry:
3108 wbc->range_end = mapping->writeback_index - 1; 3047 wbc->range_end = mapping->writeback_index - 1;
3109 goto retry; 3048 goto retry;
3110 } 3049 }
3111 if (pages_skipped != wbc->pages_skipped)
3112 ext4_msg(inode->i_sb, KERN_CRIT,
3113 "This should not happen leaving %s "
3114 "with nr_to_write = %ld ret = %d",
3115 __func__, wbc->nr_to_write, ret);
3116 3050
3117 /* Update index */ 3051 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3052 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3054 /*
3122 * set the writeback_index so that range_cyclic 3055 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3056 * mode will write it back later
3124 */ 3057 */
3125 mapping->writeback_index = index; 3058 mapping->writeback_index = done_index;
3126 3059
3127out_writepages: 3060out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3061 wbc->nr_to_write -= nr_to_writebump;
@@ -3367,10 +3300,10 @@ int ext4_alloc_da_blocks(struct inode *inode)
3367 * doing I/O at all. 3300 * doing I/O at all.
3368 * 3301 *
3369 * We could call write_cache_pages(), and then redirty all of 3302 * We could call write_cache_pages(), and then redirty all of
3370 * the pages by calling redirty_page_for_writeback() but that 3303 * the pages by calling redirty_page_for_writepage() but that
3371 * would be ugly in the extreme. So instead we would need to 3304 * would be ugly in the extreme. So instead we would need to
3372 * replicate parts of the code in the above functions, 3305 * replicate parts of the code in the above functions,
3373 * simplifying them becuase we wouldn't actually intend to 3306 * simplifying them because we wouldn't actually intend to
3374 * write out the pages, but rather only collect contiguous 3307 * write out the pages, but rather only collect contiguous
3375 * logical block extents, call the multi-block allocator, and 3308 * logical block extents, call the multi-block allocator, and
3376 * then update the buffer heads with the block allocations. 3309 * then update the buffer heads with the block allocations.
@@ -3447,6 +3380,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3447 3380
3448static int ext4_readpage(struct file *file, struct page *page) 3381static int ext4_readpage(struct file *file, struct page *page)
3449{ 3382{
3383 trace_ext4_readpage(page);
3450 return mpage_readpage(page, ext4_get_block); 3384 return mpage_readpage(page, ext4_get_block);
3451} 3385}
3452 3386
@@ -3457,15 +3391,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3392}
3459 3393
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3394static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3395{
3471 struct buffer_head *head, *bh; 3396 struct buffer_head *head, *bh;
@@ -3490,6 +3415,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3490{ 3415{
3491 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3492 3417
3418 trace_ext4_invalidatepage(page, offset);
3419
3493 /* 3420 /*
3494 * free any io_end structure allocated for buffers to be discarded 3421 * free any io_end structure allocated for buffers to be discarded
3495 */ 3422 */
@@ -3511,6 +3438,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3511{ 3438{
3512 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3513 3440
3441 trace_ext4_releasepage(page);
3442
3514 WARN_ON(PageChecked(page)); 3443 WARN_ON(PageChecked(page));
3515 if (!page_has_buffers(page)) 3444 if (!page_has_buffers(page))
3516 return 0; 3445 return 0;
@@ -3582,7 +3511,7 @@ retry:
3582 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3583 3512
3584 if (end > isize) 3513 if (end > isize)
3585 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3586 } 3515 }
3587 } 3516 }
3588 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
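Replacing vmtruncate() with ext4_truncate_failed_write() on the short-direct-write path keeps the cleanup of a failed extension inside ext4. We would expect the helper to look roughly like the sketch below, first dropping the pagecache beyond i_size and then the on-disk blocks; the authoritative version lives elsewhere in this file.

        static void ext4_truncate_failed_write(struct inode *inode)
        {
                truncate_inode_pages(inode->i_mapping, inode->i_size);
                ext4_truncate(inode);
        }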
@@ -3642,173 +3571,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3571 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3572}
3644 3573
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten"
3696 "extents to written extents, error is %d"
3697 " io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might need to do the conversion. This function walks through
3744 * the list and converts the related unwritten extents for completed IO
3745 * to written.
3746 * The function returns the number of pending IOs on success.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already be
3769 * about to flush the work corresponding to this io structure.
3770 * It will be upset if it finds that the io structure related
3771 * to the work to be scheduled has been freed.
3772 *
3773 * Thus we need to keep the io structure valid here after the
3774 * conversion has finished. The io structure has a flag to
3775 * avoid double conversion from both fsync and the background
3776 * workqueue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
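
The comment above describes a classic shape: the spinlock cannot be held across ext4_end_io_nolock(), so the walk drops the lock around the call, retakes it, and re-reads the list head. The same shape, recast as a self-contained userspace toy with a mutex standing in for the spinlock (illustration only, not ext4 code):

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void convert(struct node *n)     /* stand-in for ext4_end_io_nolock() */
{
    printf("converting io %d\n", n->id);
}

static void drain(void)
{
    pthread_mutex_lock(&lock);
    while (head) {
        struct node *n = head;

        pthread_mutex_unlock(&lock);    /* the callback may sleep */
        convert(n);
        pthread_mutex_lock(&lock);      /* re-read head after relocking */
        if (head == n)
            head = n->next;             /* dequeue only once converted */
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    struct node b = { NULL, 2 }, a = { &b, 1 };

    head = &a;
    drain();
    return 0;
}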
3789
3790static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3574static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3575 ssize_t size, void *private, int ret,
3814 bool is_async) 3576 bool is_async)
@@ -3828,7 +3590,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3590 size);
3829 3591
3830 /* if not aio dio with unwritten extents, just free io and return */ 3592 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3594 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3595 iocb->private = NULL;
3834out: 3596out:
@@ -3845,14 +3607,14 @@ out:
3845 } 3607 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3609
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list*/ 3610 /* Add the io_end to per-inode completed aio dio list*/
3852 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615
3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3618 iocb->private = NULL;
3857} 3619}
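
The hunk above moves queue_work() to after the list_add_tail(): once queued, the work can run immediately on another CPU, and ext4_end_io_nolock() bails out when the io_end is not yet on the per-inode list, so the insertion has to be published first. The ordering requirement as a runnable userspace toy (a thread stands in for the workqueue; illustration only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *published;                  /* stand-in for the completed_io list */

static void *worker(void *arg)
{
    pthread_mutex_lock(&lock);
    if (published)
        printf("worker found io_end %d\n", *published);
    else
        printf("worker ran before publication -- the race\n");
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    int io_end = 42;

    pthread_mutex_lock(&lock);          /* publish first ... */
    published = &io_end;
    pthread_mutex_unlock(&lock);

    pthread_create(&t, NULL, worker, NULL);     /* ... then schedule */
    pthread_join(t, NULL);
    return 0;
}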
3858 3620
@@ -3873,7 +3635,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3635 goto out;
3874 } 3636 }
3875 3637
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3638 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3639 inode = io_end->inode;
3878 3640
3879 /* Add the io_end to per-inode completed io list*/ 3641 /* Add the io_end to per-inode completed io list*/
@@ -3901,8 +3663,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3901retry: 3663retry:
3902 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3903 if (!io_end) { 3665 if (!io_end) {
3904 if (printk_ratelimit()) 3666 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3905 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3906 schedule(); 3667 schedule();
3907 goto retry; 3668 goto retry;
3908 } 3669 }
@@ -3926,13 +3687,13 @@ retry:
3926 * preallocated extents, and those writes extend the file, no need to 3687 * preallocated extents, and those writes extend the file, no need to
3927 * fall back to buffered IO. 3688 * fall back to buffered IO.
3928 * 3689 *
3929 * For holes, we fallocate those blocks, mark them as unintialized 3690 * For holes, we fallocate those blocks, mark them as uninitialized
3930 * If those blocks were preallocated, we make sure they are split, but 3691 * If those blocks were preallocated, we make sure they are split, but
3931 * still keep the range to write as unintialized. 3692 * still keep the range to write as uninitialized.
3932 * 3693 *
3933 * The unwritten extents will be converted to written when DIO is completed. 3694 * The unwritten extents will be converted to written when DIO is completed.
3934 * For async direct IO, since the IO may still be pending when we return, we 3695 * For async direct IO, since the IO may still be pending when we return, we
3935 * set up an end_io call back function, which will do the convertion 3696 * set up an end_io call back function, which will do the conversion
3936 * when async direct IO completed. 3697 * when async direct IO completed.
3937 * 3698 *
3938 * If the O_DIRECT write will extend the file then add this inode to the 3699 * If the O_DIRECT write will extend the file then add this inode to the
@@ -3955,7 +3716,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3955 * We could direct write to holes and fallocate. 3716 * We could direct write to holes and fallocate.
3956 * 3717 *
3957 * Allocated blocks to fill the hole are marked as uninitialized 3718 * Allocated blocks to fill the hole are marked as uninitialized
3958 * to prevent paralel buffered read to expose the stale data 3719 * to prevent parallel buffered read to expose the stale data
3959 * before DIO complete the data IO. 3720 * before DIO complete the data IO.
3960 * 3721 *
3961 * As to previously fallocated extents, ext4 get_block 3722 * As to previously fallocated extents, ext4 get_block
@@ -4016,7 +3777,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
4016 int err; 3777 int err;
4017 /* 3778 /*
4018 * for non AIO case, since the IO is already 3779 * for non AIO case, since the IO is already
4019 * completed, we could do the convertion right here 3780 * completed, we could do the conversion right here
4020 */ 3781 */
4021 err = ext4_convert_unwritten_extents(inode, 3782 err = ext4_convert_unwritten_extents(inode,
4022 offset, ret); 3783 offset, ret);
@@ -4037,11 +3798,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
4037{ 3798{
4038 struct file *file = iocb->ki_filp; 3799 struct file *file = iocb->ki_filp;
4039 struct inode *inode = file->f_mapping->host; 3800 struct inode *inode = file->f_mapping->host;
3801 ssize_t ret;
4040 3802
3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
4041 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4042 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
4043 3806 else
4044 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3808 trace_ext4_direct_IO_exit(inode, offset,
3809 iov_length(iov, nr_segs), rw, ret);
3810 return ret;
4045} 3811}
4046 3812
4047/* 3813/*
@@ -4067,7 +3833,6 @@ static const struct address_space_operations ext4_ordered_aops = {
4067 .readpage = ext4_readpage, 3833 .readpage = ext4_readpage,
4068 .readpages = ext4_readpages, 3834 .readpages = ext4_readpages,
4069 .writepage = ext4_writepage, 3835 .writepage = ext4_writepage,
4070 .sync_page = block_sync_page,
4071 .write_begin = ext4_write_begin, 3836 .write_begin = ext4_write_begin,
4072 .write_end = ext4_ordered_write_end, 3837 .write_end = ext4_ordered_write_end,
4073 .bmap = ext4_bmap, 3838 .bmap = ext4_bmap,
@@ -4083,7 +3848,6 @@ static const struct address_space_operations ext4_writeback_aops = {
4083 .readpage = ext4_readpage, 3848 .readpage = ext4_readpage,
4084 .readpages = ext4_readpages, 3849 .readpages = ext4_readpages,
4085 .writepage = ext4_writepage, 3850 .writepage = ext4_writepage,
4086 .sync_page = block_sync_page,
4087 .write_begin = ext4_write_begin, 3851 .write_begin = ext4_write_begin,
4088 .write_end = ext4_writeback_write_end, 3852 .write_end = ext4_writeback_write_end,
4089 .bmap = ext4_bmap, 3853 .bmap = ext4_bmap,
@@ -4099,7 +3863,6 @@ static const struct address_space_operations ext4_journalled_aops = {
4099 .readpage = ext4_readpage, 3863 .readpage = ext4_readpage,
4100 .readpages = ext4_readpages, 3864 .readpages = ext4_readpages,
4101 .writepage = ext4_writepage, 3865 .writepage = ext4_writepage,
4102 .sync_page = block_sync_page,
4103 .write_begin = ext4_write_begin, 3866 .write_begin = ext4_write_begin,
4104 .write_end = ext4_journalled_write_end, 3867 .write_end = ext4_journalled_write_end,
4105 .set_page_dirty = ext4_journalled_set_page_dirty, 3868 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -4115,7 +3878,6 @@ static const struct address_space_operations ext4_da_aops = {
4115 .readpages = ext4_readpages, 3878 .readpages = ext4_readpages,
4116 .writepage = ext4_writepage, 3879 .writepage = ext4_writepage,
4117 .writepages = ext4_da_writepages, 3880 .writepages = ext4_da_writepages,
4118 .sync_page = block_sync_page,
4119 .write_begin = ext4_da_write_begin, 3881 .write_begin = ext4_da_write_begin,
4120 .write_end = ext4_da_write_end, 3882 .write_end = ext4_da_write_end,
4121 .bmap = ext4_bmap, 3883 .bmap = ext4_bmap,
@@ -4152,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
4152int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
4153 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
4154{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
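
The wrapper's length computation deserves a worked example: with 4096-byte blocks and from = 10000, from & (blocksize - 1) is 1808, so 4096 - 1808 = 2288 bytes are zeroed, i.e. everything from 'from' to the end of its block (for 4 KiB pages and blocks the page-mask step above yields the same offset). A standalone check, values illustrative:

#include <stdio.h>

int main(void)
{
    unsigned long long from = 10000;
    unsigned blocksize = 4096;
    unsigned offset = from & (blocksize - 1);               /* 1808 */
    unsigned length = blocksize - (offset & (blocksize - 1));

    printf("zero %u bytes at offset %llu\n", length, from); /* 2288 */
    return 0;
}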
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block it will be shortened to the end of the block
3933 * that corresponds to 'from'.
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
4155 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4156 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
4157 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
4158 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
4159 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
4160 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -4167,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
4167 return -EINVAL; 3950 return -EINVAL;
4168 3951
4169 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
4170 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
4171 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4172 3963
4173 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4226,7 +4017,7 @@ int ext4_block_truncate_page(handle_t *handle,
4226 if (ext4_should_journal_data(inode)) { 4017 if (ext4_should_journal_data(inode)) {
4227 err = ext4_handle_dirty_metadata(handle, inode, bh); 4018 err = ext4_handle_dirty_metadata(handle, inode, bh);
4228 } else { 4019 } else {
4229 if (ext4_should_order_data(inode)) 4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4230 err = ext4_jbd2_file_inode(handle, inode); 4021 err = ext4_jbd2_file_inode(handle, inode);
4231 mark_buffer_dirty(bh); 4022 mark_buffer_dirty(bh);
4232 } 4023 }
@@ -4262,7 +4053,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4262 * 4053 *
4263 * When we do truncate() we may have to clean the ends of several 4054 * When we do truncate() we may have to clean the ends of several
4264 * indirect blocks but leave the blocks themselves alive. Block is 4055 * indirect blocks but leave the blocks themselves alive. Block is
4265 * partially truncated if some data below the new i_size is refered 4056 * partially truncated if some data below the new i_size is referred
4266 * from it (and it is on the path to the first completely truncated 4057 * from it (and it is on the path to the first completely truncated
4267 * data block, indeed). We have to free the top of that path along 4058 * data block, indeed). We have to free the top of that path along
4268 * with everything to the right of the path. Since no allocation 4059 * with everything to the right of the path. Since no allocation
@@ -4341,6 +4132,9 @@ no_top:
4341 * 4132 *
4342 * We release `count' blocks on disk, but (last - first) may be greater 4133 * We release `count' blocks on disk, but (last - first) may be greater
4343 * than `count' because there can be holes in there. 4134 * than `count' because there can be holes in there.
4135 *
4136 * Return 0 on success, 1 on invalid block range
4137 * and < 0 on fatal error.
4344 */ 4138 */
4345static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4139static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4346 struct buffer_head *bh, 4140 struct buffer_head *bh,
@@ -4350,6 +4144,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4350{ 4144{
4351 __le32 *p; 4145 __le32 *p;
4352 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4147 int err;
4353 4148
4354 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4355 flags |= EXT4_FREE_BLOCKS_METADATA; 4150 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4365,22 +4160,33 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4365 if (try_to_extend_transaction(handle, inode)) { 4160 if (try_to_extend_transaction(handle, inode)) {
4366 if (bh) { 4161 if (bh) {
4367 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4368 ext4_handle_dirty_metadata(handle, inode, bh); 4163 err = ext4_handle_dirty_metadata(handle, inode, bh);
4164 if (unlikely(err))
4165 goto out_err;
4369 } 4166 }
4370 ext4_mark_inode_dirty(handle, inode); 4167 err = ext4_mark_inode_dirty(handle, inode);
4371 ext4_truncate_restart_trans(handle, inode, 4168 if (unlikely(err))
4372 blocks_for_truncate(inode)); 4169 goto out_err;
4170 err = ext4_truncate_restart_trans(handle, inode,
4171 blocks_for_truncate(inode));
4172 if (unlikely(err))
4173 goto out_err;
4373 if (bh) { 4174 if (bh) {
4374 BUFFER_TRACE(bh, "retaking write access"); 4175 BUFFER_TRACE(bh, "retaking write access");
4375 ext4_journal_get_write_access(handle, bh); 4176 err = ext4_journal_get_write_access(handle, bh);
4177 if (unlikely(err))
4178 goto out_err;
4376 } 4179 }
4377 } 4180 }
4378 4181
4379 for (p = first; p < last; p++) 4182 for (p = first; p < last; p++)
4380 *p = 0; 4183 *p = 0;
4381 4184
4382 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4383 return 0; 4186 return 0;
4187out_err:
4188 ext4_std_error(inode->i_sb, err);
4189 return err;
4384} 4190}
4385 4191
4386/** 4192/**
@@ -4391,7 +4197,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4391 * @first: array of block numbers 4197 * @first: array of block numbers
4392 * @last: points immediately past the end of array 4198 * @last: points immediately past the end of array
4393 * 4199 *
4394 * We are freeing all blocks refered from that array (numbers are stored as 4200 * We are freeing all blocks referred from that array (numbers are stored as
4395 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4396 * 4202 *
4397 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4414,7 +4220,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4414 ext4_fsblk_t nr; /* Current block # */ 4220 ext4_fsblk_t nr; /* Current block # */
4415 __le32 *p; /* Pointer into inode/ind 4221 __le32 *p; /* Pointer into inode/ind
4416 for current block */ 4222 for current block */
4417 int err; 4223 int err = 0;
4418 4224
4419 if (this_bh) { /* For indirect block */ 4225 if (this_bh) { /* For indirect block */
4420 BUFFER_TRACE(this_bh, "get_write_access"); 4226 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4436,9 +4242,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4436 } else if (nr == block_to_free + count) { 4242 } else if (nr == block_to_free + count) {
4437 count++; 4243 count++;
4438 } else { 4244 } else {
4439 if (ext4_clear_blocks(handle, inode, this_bh, 4245 err = ext4_clear_blocks(handle, inode, this_bh,
4440 block_to_free, count, 4246 block_to_free, count,
4441 block_to_free_p, p)) 4247 block_to_free_p, p);
4248 if (err)
4442 break; 4249 break;
4443 block_to_free = nr; 4250 block_to_free = nr;
4444 block_to_free_p = p; 4251 block_to_free_p = p;
@@ -4447,9 +4254,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4447 } 4254 }
4448 } 4255 }
4449 4256
4450 if (count > 0) 4257 if (!err && count > 0)
4451 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4452 count, block_to_free_p, p); 4259 count, block_to_free_p, p);
4260 if (err < 0)
4261 /* fatal error */
4262 return;
4453 4263
4454 if (this_bh) { 4264 if (this_bh) {
4455 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4479,7 +4289,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4479 * @last: pointer immediately past the end of array 4289 * @last: pointer immediately past the end of array
4480 * @depth: depth of the branches to free 4290 * @depth: depth of the branches to free
4481 * 4291 *
4482 * We are freeing all blocks refered from these branches (numbers are 4292 * We are freeing all blocks referred from these branches (numbers are
4483 * stored as little-endian 32-bit) and updating @inode->i_blocks 4293 * stored as little-endian 32-bit) and updating @inode->i_blocks
4484 * appropriately. 4294 * appropriately.
4485 */ 4295 */
@@ -4530,6 +4340,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4530 (__le32 *) bh->b_data, 4340 (__le32 *) bh->b_data,
4531 (__le32 *) bh->b_data + addr_per_block, 4341 (__le32 *) bh->b_data + addr_per_block,
4532 depth); 4342 depth);
4343 brelse(bh);
4533 4344
4534 /* 4345 /*
4535 * Everything below this pointer has been 4346 * Everything below this pointer has been
@@ -4566,7 +4377,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4566 * transaction where the data blocks are 4377 * transaction where the data blocks are
4567 * actually freed. 4378 * actually freed.
4568 */ 4379 */
4569 ext4_free_blocks(handle, inode, 0, nr, 1, 4380 ext4_free_blocks(handle, inode, NULL, nr, 1,
4570 EXT4_FREE_BLOCKS_METADATA| 4381 EXT4_FREE_BLOCKS_METADATA|
4571 EXT4_FREE_BLOCKS_FORGET); 4382 EXT4_FREE_BLOCKS_FORGET);
4572 4383
@@ -4596,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4596 4407
4597int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4598{ 4409{
4599 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4600 return 0;
4601 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4602 return 1; 4411 return 1;
4603 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4608,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4608} 4417}
4609 4418
4610/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
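
This is the backend of hole punching through fallocate(2); the wiring from the syscall down to ext4_punch_hole() is outside this hunk, so take the call chain as read. From userspace the operation looks roughly like this (path illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/tmp/example", O_RDWR);

    if (fd < 0)
        return 1;
    /* punch a 1 MiB hole at offset 4096; non-extent files hit the
     * ENOTSUPP branch above */
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  4096, 1 << 20) != 0)
        return 1;
    close(fd);
    return 0;
}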
4443
4444/*
4611 * ext4_truncate() 4445 * ext4_truncate()
4612 * 4446 *
4613 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4646,10 +4480,12 @@ void ext4_truncate(struct inode *inode)
4646 Indirect chain[4]; 4480 Indirect chain[4];
4647 Indirect *partial; 4481 Indirect *partial;
4648 __le32 nr = 0; 4482 __le32 nr = 0;
4649 int n; 4483 int n = 0;
4650 ext4_lblk_t last_block; 4484 ext4_lblk_t last_block, max_block;
4651 unsigned blocksize = inode->i_sb->s_blocksize; 4485 unsigned blocksize = inode->i_sb->s_blocksize;
4652 4486
4487 trace_ext4_truncate_enter(inode);
4488
4653 if (!ext4_can_truncate(inode)) 4489 if (!ext4_can_truncate(inode))
4654 return; 4490 return;
4655 4491
@@ -4660,6 +4496,7 @@ void ext4_truncate(struct inode *inode)
4660 4496
4661 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4662 ext4_ext_truncate(inode); 4498 ext4_ext_truncate(inode);
4499 trace_ext4_truncate_exit(inode);
4663 return; 4500 return;
4664 } 4501 }
4665 4502
@@ -4669,14 +4506,18 @@ void ext4_truncate(struct inode *inode)
4669 4506
4670 last_block = (inode->i_size + blocksize-1) 4507 last_block = (inode->i_size + blocksize-1)
4671 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4672 4511
4673 if (inode->i_size & (blocksize - 1)) 4512 if (inode->i_size & (blocksize - 1))
4674 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4675 goto out_stop; 4514 goto out_stop;
4676 4515
4677 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4516 if (last_block != max_block) {
4678 if (n == 0) 4517 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4679 goto out_stop; /* error */ 4518 if (n == 0)
4519 goto out_stop; /* error */
4520 }
4680 4521
4681 /* 4522 /*
4682 * OK. This truncate is going to happen. We add the inode to the 4523 * OK. This truncate is going to happen. We add the inode to the
@@ -4707,7 +4548,13 @@ void ext4_truncate(struct inode *inode)
4707 */ 4548 */
4708 ei->i_disksize = inode->i_size; 4549 ei->i_disksize = inode->i_size;
4709 4550
4710 if (n == 1) { /* direct blocks */ 4551 if (last_block == max_block) {
4552 /*
4553 * It is unnecessary to free any data blocks if last_block is
4554 * equal to the indirect block limit.
4555 */
4556 goto out_unlock;
4557 } else if (n == 1) { /* direct blocks */
4711 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4712 i_data + EXT4_NDIR_BLOCKS); 4559 i_data + EXT4_NDIR_BLOCKS);
4713 goto do_indirects; 4560 goto do_indirects;
@@ -4767,6 +4614,7 @@ do_indirects:
4767 ; 4614 ;
4768 } 4615 }
4769 4616
4617out_unlock:
4770 up_write(&ei->i_data_sem); 4618 up_write(&ei->i_data_sem);
4771 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4772 ext4_mark_inode_dirty(handle, inode); 4620 ext4_mark_inode_dirty(handle, inode);
@@ -4789,6 +4637,7 @@ out_stop:
4789 ext4_orphan_del(handle, inode); 4637 ext4_orphan_del(handle, inode);
4790 4638
4791 ext4_journal_stop(handle); 4639 ext4_journal_stop(handle);
4640 trace_ext4_truncate_exit(inode);
4792} 4641}
4793 4642
4794/* 4643/*
@@ -4818,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4818 /* 4667 /*
4819 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4820 */ 4669 */
4821 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4822 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4823 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4824 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -4920,6 +4769,7 @@ make_io:
4920 * has in-inode xattrs, or we don't have this inode in memory. 4769 * has in-inode xattrs, or we don't have this inode in memory.
4921 * Read the block from disk. 4770 * Read the block from disk.
4922 */ 4771 */
4772 trace_ext4_load_inode(inode);
4923 get_bh(bh); 4773 get_bh(bh);
4924 bh->b_end_io = end_buffer_read_sync; 4774 bh->b_end_io = end_buffer_read_sync;
4925 submit_bh(READ_META, bh); 4775 submit_bh(READ_META, bh);
@@ -5025,7 +4875,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5025 return inode; 4875 return inode;
5026 4876
5027 ei = EXT4_I(inode); 4877 ei = EXT4_I(inode);
5028 iloc.bh = 0; 4878 iloc.bh = NULL;
5029 4879
5030 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4880 ret = __ext4_get_inode_loc(inode, &iloc, 0);
5031 if (ret < 0) 4881 if (ret < 0)
@@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5040 } 4890 }
5041 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
5042 4892
5043 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
5044 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
5045 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
5046 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5299 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5300 goto out_brelse; 5150 goto out_brelse;
5301 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5302 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5303 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5304 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5305 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5314{
5465 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5468 5319
5469 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
@@ -5510,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5510 5361
5511 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5512 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5513 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5514 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5515 handle_t *handle; 5365 handle_t *handle;
5516 5366
5517 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5519,8 +5369,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5369 error = PTR_ERR(handle);
5520 goto err_out; 5370 goto err_out;
5521 } 5371 }
5522 5372 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5373 error = ext4_orphan_add(handle, inode);
5374 orphan = 1;
5375 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5376 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5377 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5378 if (!error)
@@ -5538,18 +5390,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5390 goto err_out;
5539 } 5391 }
5540 ext4_orphan_del(handle, inode); 5392 ext4_orphan_del(handle, inode);
5393 orphan = 0;
5541 ext4_journal_stop(handle); 5394 ext4_journal_stop(handle);
5542 goto err_out; 5395 goto err_out;
5543 } 5396 }
5544 } 5397 }
5545 /* ext4_truncate will clear the flag */
5546 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5547 ext4_truncate(inode);
5548 } 5398 }
5549 5399
5550 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5551 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5552 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5553 5407
5554 if (!rc) { 5408 if (!rc) {
5555 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5414 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5415 * all, we need to clean up the in-core orphan list manually.
5562 */ 5416 */
5563 if (inode->i_nlink) 5417 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5418 ext4_orphan_del(NULL, inode);
5565 5419
5566 if (!rc && (ia_valid & ATTR_MODE)) 5420 if (!rc && (ia_valid & ATTR_MODE))
@@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5447 * blocks for this file.
5594 */ 5448 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5450
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5452 return 0;
@@ -5608,13 +5460,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5608 /* if nrblocks are contiguous */ 5460 /* if nrblocks are contiguous */
5609 if (chunk) { 5461 if (chunk) {
5610 /* 5462 /*
5611 * With N contiguous data blocks, it need at most 5463 * With N contiguous data blocks, we need at most
5612 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5613 * 2 dindirect blocks 5465 * 2 dindirect blocks, and 1 tindirect block
5614 * 1 tindirect block
5615 */ 5466 */
5616 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5467 return DIV_ROUND_UP(nrblocks,
5617 return indirects + 3; 5468 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5618 } 5469 }
5619 /* 5470 /*
5620 * if nrblocks are not contiguous, worst case, each block touches 5471 * if nrblocks are not contiguous, worst case, each block touches
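
The contiguous case is easy to sanity-check numerically: with 4 KiB blocks an indirect block holds 4096/4 = 1024 addresses (the EXT4_ADDR_PER_BLOCK value, assumed here), so 2048 contiguous blocks cost DIV_ROUND_UP(2048, 1024) + 4 = 6 credits under the new formula. A quick standalone check:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    int addr_per_block = 4096 / 4;      /* one __le32 per mapped block */
    int nrblocks = 2048;
    int credits = DIV_ROUND_UP(nrblocks, addr_per_block) + 4;

    printf("%d metadata credits for %d contiguous blocks\n",
           credits, nrblocks);          /* prints 6 */
    return 0;
}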
@@ -5643,7 +5494,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5494 *
5644 * Also account for superblock, inode, quota and xattr blocks 5495 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5496 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5497static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5498{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5499 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5500 int gdpblocks;
@@ -5688,7 +5539,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5688} 5539}
5689 5540
5690/* 5541/*
5691 * Calulate the total number of credits to reserve to fit 5542 * Calculate the total number of credits to reserve to fit
5692 * the modification of a single page into a single transaction, 5543 * the modification of a single page into a single transaction,
5693 * which may include multiple chunks of block allocations. 5544 * which may include multiple chunks of block allocations.
5694 * 5545 *
@@ -5831,6 +5682,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5682 int err, ret;
5832 5683
5833 might_sleep(); 5684 might_sleep();
5685 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5686 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5687 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5688 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
@@ -5881,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5881 * so would cause a commit on atime updates, which we don't bother doing. 5733 * so would cause a commit on atime updates, which we don't bother doing.
5882 * We handle synchronous inodes at the highest possible level. 5734 * We handle synchronous inodes at the highest possible level.
5883 */ 5735 */
5884void ext4_dirty_inode(struct inode *inode) 5736void ext4_dirty_inode(struct inode *inode, int flags)
5885{ 5737{
5886 handle_t *handle; 5738 handle_t *handle;
5887 5739
@@ -6009,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6009 goto out_unlock; 5861 goto out_unlock;
6010 } 5862 }
6011 ret = 0; 5863 ret = 0;
6012 if (PageMappedToDisk(page)) 5864
6013 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
6014 5871
6015 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
6016 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
6017 else 5874 else
6018 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
6019 5876
6020 lock_page(page);
6021 /* 5877 /*
6022 * return if we have all the buffers mapped. This avoids 5878 * return if we have all the buffers mapped. This avoids
6023 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -6027,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6027 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
6028 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
6029 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
6030 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
6031 goto out_unlock; 5887 return VM_FAULT_LOCKED;
6032 } 5888 }
6033 } 5889 }
6034 unlock_page(page); 5890 unlock_page(page);
@@ -6048,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6048 if (ret < 0) 5904 if (ret < 0)
6049 goto out_unlock; 5905 goto out_unlock;
6050 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
6051out_unlock: 5917out_unlock:
6052 if (ret) 5918 if (ret)
6053 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1bd..808c554e773f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
@@ -331,6 +331,36 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct request_queue *q = bdev_get_queue(sb->s_bdev);
338 struct fstrim_range range;
339 int ret = 0;
340
341 if (!capable(CAP_SYS_ADMIN))
342 return -EPERM;
343
344 if (!blk_queue_discard(q))
345 return -EOPNOTSUPP;
346
347 if (copy_from_user(&range, (struct fstrim_range *)arg,
348 sizeof(range)))
349 return -EFAULT;
350
351 range.minlen = max((unsigned int)range.minlen,
352 q->limits.discard_granularity);
353 ret = ext4_trim_fs(sb, &range);
354 if (ret < 0)
355 return ret;
356
357 if (copy_to_user((struct fstrim_range *)arg, &range,
358 sizeof(range)))
359 return -EFAULT;
360
361 return 0;
362 }
363
334 default: 364 default:
335 return -ENOTTY; 365 return -ENOTTY;
336 } 366 }
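
The new FITRIM case is the kernel half of the interface fstrim(8)-style tools use; a minimal userspace invocation looks like this (mount point illustrative, error handling abbreviated):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    struct fstrim_range range = {
        .start = 0,
        .len = UINT64_MAX,      /* whole filesystem */
        .minlen = 0,            /* raised to the discard granularity above */
    };
    int fd = open("/mnt", O_RDONLY);

    if (fd < 0 || ioctl(fd, FITRIM, &range) != 0)
        return 1;
    /* on return the kernel has updated range.len to the bytes trimmed */
    printf("trimmed %llu bytes\n", (unsigned long long)range.len);
    close(fd);
    return 0;
}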
@@ -397,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
397 return err; 427 return err;
398 } 428 }
399 case EXT4_IOC_MOVE_EXT: 429 case EXT4_IOC_MOVE_EXT:
430 case FITRIM:
400 break; 431 break;
401 default: 432 default:
402 return -ENOIOCTLCMD; 433 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..6ed859d56850 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
92 * between CPUs. It is possible to get scheduled at this point. 92 * between CPUs. It is possible to get scheduled at this point.
93 * 93 *
94 * The locality group prealloc space is used looking at whether we have 94 * The locality group prealloc space is used looking at whether we have
95 * enough free space (pa_free) withing the prealloc space. 95 * enough free space (pa_free) within the prealloc space.
96 * 96 *
97 * If we can't allocate blocks via inode prealloc or/and locality group 97 * If we can't allocate blocks via inode prealloc or/and locality group
98 * prealloc then we look at the buddy cache. The buddy cache is represented 98 * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -338,6 +338,19 @@
338static struct kmem_cache *ext4_pspace_cachep; 338static struct kmem_cache *ext4_pspace_cachep;
339static struct kmem_cache *ext4_ac_cachep; 339static struct kmem_cache *ext4_ac_cachep;
340static struct kmem_cache *ext4_free_ext_cachep; 340static struct kmem_cache *ext4_free_ext_cachep;
341
342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES 8
346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
341static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
342 ext4_group_t group); 355 ext4_group_t group);
343static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -419,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
419 } 432 }
420 433
421 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
422 *max = 1 << (e4b->bd_blkbits + 3); 435 if (order == 0) {
423 if (order == 0) 436 *max = 1 << (e4b->bd_blkbits + 3);
424 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 }
425 439
426 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
427 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -603,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
603 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
604 618
605 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
606 buddy = mb_find_buddy(e4b, 0, &max);
607 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
608 ext4_group_t groupnr; 621 ext4_group_t groupnr;
609 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
@@ -622,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622#define mb_check_buddy(e4b) 635#define mb_check_buddy(e4b)
623#endif 636#endif
624 637
625/* FIXME!! need more doc */ 638/*
639 * Divide the blocks starting from @first with length @len into
640 * smaller chunks with power-of-2 block counts.
641 * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
642 * then increase bb_counters[] for each corresponding chunk size.
643 */
626static void ext4_mb_mark_free_simple(struct super_block *sb, 644static void ext4_mb_mark_free_simple(struct super_block *sb,
627 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
628 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
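
The split the new comment describes bounds every chunk by two things: the alignment of the starting block (ffs) and the remaining length (fls). A free run of 13 blocks starting at block 5 therefore becomes chunks of 1, 2, 8 and 2. A standalone model of just the split; the real function additionally clears buddy bitmap bits and bumps bb_counters[] under the group lock:

#include <stdio.h>
#include <strings.h>            /* ffs() */

static int fls_(unsigned v)     /* highest set bit, 1-based */
{
    int r = 0;

    while (v) {
        r++;
        v >>= 1;
    }
    return r;
}

int main(void)
{
    unsigned first = 5, len = 13;

    while (len > 0) {
        int max = ffs(first) - 1;       /* alignment limit */
        int min = fls_(len) - 1;        /* size limit */
        unsigned chunk;

        if (max >= 0 && max < min)
            min = max;
        chunk = 1u << min;
        printf("chunk of %u at %u\n", chunk, first);
        first += chunk;
        len -= chunk;
    }
    return 0;
}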
@@ -769,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
769 struct inode *inode; 787 struct inode *inode;
770 char *data; 788 char *data;
771 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
772 791
773 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
774 793
@@ -801,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
801 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
802 break; 821 break;
803 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
804 err = -EIO; 835 err = -EIO;
805 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
806 if (desc == NULL) 837 if (desc == NULL)
@@ -853,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
853 } 884 }
854 885
855 /* wait for I/O completion */ 886 /* wait for I/O completion */
856 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
857 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
858 890
859 err = -EIO; 891 err = -EIO;
860 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
861 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
862 goto out; 894 goto out;
863 895
864 err = 0; 896 err = 0;
865 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
866 /* init the page */
867 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
868 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
869 int group; 899 int group;
870 struct ext4_group_info *grinfo;
871 900
872 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
873 if (group >= ngroups) 902 if (group >= ngroups)
874 break; 903 break;
875 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
876 /* 909 /*
877 * data carry information regarding this 910 * data carry information regarding this
878 * particular group in the format specified 911 * particular group in the format specified
@@ -901,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
901 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
902 */ 935 */
903 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
904 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
905 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
906 incore = NULL; 941 incore = NULL;
@@ -930,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
930 965
931out: 966out:
932 if (bh) { 967 if (bh) {
933 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
934 brelse(bh[i]); 969 brelse(bh[i]);
935 if (bh != &bhs) 970 if (bh != &bhs)
936 kfree(bh); 971 kfree(bh);
@@ -939,6 +974,67 @@ out:
939} 974}
940 975
941/* 976/*
977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
978 * on the same buddy page doesn't happen while holding the buddy page lock.
979 * The locked buddy and bitmap pages are returned on the e4b struct. If buddy
980 * and bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
981 */
982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
983 ext4_group_t group, struct ext4_buddy *e4b)
984{
985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
986 int block, pnum, poff;
987 int blocks_per_page;
988 struct page *page;
989
990 e4b->bd_buddy_page = NULL;
991 e4b->bd_bitmap_page = NULL;
992
993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
994 /*
995 * the buddy cache inode stores the block bitmap
996 * and buddy information in consecutive blocks.
997 * So for each group we need two blocks.
998 */
999 block = group * 2;
1000 pnum = block / blocks_per_page;
1001 poff = block % blocks_per_page;
1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1003 if (!page)
1004 return -EIO;
1005 BUG_ON(page->mapping != inode->i_mapping);
1006 e4b->bd_bitmap_page = page;
1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1008
1009 if (blocks_per_page >= 2) {
1010 /* buddy and bitmap are on the same page */
1011 return 0;
1012 }
1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1023}
1024
1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1026{
1027 if (e4b->bd_bitmap_page) {
1028 unlock_page(e4b->bd_bitmap_page);
1029 page_cache_release(e4b->bd_bitmap_page);
1030 }
1031 if (e4b->bd_buddy_page) {
1032 unlock_page(e4b->bd_buddy_page);
1033 page_cache_release(e4b->bd_buddy_page);
1034 }
1035}
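
The indexing above follows from the "two blocks per group" layout: group g's block bitmap is logical block 2g of the buddy-cache inode and its buddy is block 2g+1, so whether the two share a page depends only on blocks_per_page. A worked example (4 KiB page size assumed):

#include <stdio.h>

static void locate(unsigned group, unsigned blocks_per_page)
{
    unsigned block = group * 2;         /* the group's bitmap block */

    printf("group %u: bitmap page %u off %u, buddy page %u off %u\n",
           group,
           block / blocks_per_page, block % blocks_per_page,
           (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
}

int main(void)
{
    locate(3, 4);   /* 1 KiB blocks: both halves land on page 1 */
    locate(3, 1);   /* 4 KiB blocks: bitmap on page 6, buddy on page 7 */
    return 0;
}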
1036
1037/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1038 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when 1039 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine! 1040 * calling this routine!
@@ -947,93 +1043,60 @@ static noinline_for_stack
947int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
948{ 1044{
949 1045
950 int ret = 0;
951 void *bitmap;
952 int blocks_per_page;
953 int block, pnum, poff;
954 int num_grp_locked = 0;
955 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
956 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
957 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
958 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
959 1050
960 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
961 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
962 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
963 /* 1053 /*
964 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
965 * page which map to the group from which we are already 1055 * page which map to the group from which we are already
966 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
967 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
968 * would have taken the alloc_sem lock. 1058 * would have pinned buddy page to page cache.
969 */ 1059 */
970 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
971 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
972 /* 1062 /*
973 * somebody initialized the group 1063 * somebody initialized the group
974 * return without doing anything 1064 * return without doing anything
975 */ 1065 */
976 ret = 0;
977 goto err; 1066 goto err;
978 } 1067 }
979 /* 1068
980 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
981 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
982 * So for each group we need two blocks. 1071 if (ret)
983 */ 1072 goto err;
984 block = group * 2; 1073 if (!PageUptodate(page)) {
985 pnum = block / blocks_per_page;
986 poff = block % blocks_per_page;
987 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
988 if (page) {
989 BUG_ON(page->mapping != inode->i_mapping);
990 ret = ext4_mb_init_cache(page, NULL);
991 if (ret) {
992 unlock_page(page);
993 goto err;
994 }
995 unlock_page(page);
996 }
997 if (page == NULL || !PageUptodate(page)) {
998 ret = -EIO; 1074 ret = -EIO;
999 goto err; 1075 goto err;
1000 } 1076 }
1001 mark_page_accessed(page); 1077 mark_page_accessed(page);
1002 bitmap_page = page;
1003 bitmap = page_address(page) + (poff * sb->s_blocksize);
1004 1078
1005 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1006 block++;
1007 pnum = block / blocks_per_page;
1008 poff = block % blocks_per_page;
1009 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1010 if (page == bitmap_page) {
1011 /* 1080 /*
1012 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1013 * the same page we don't need to force 1082 * the same page we don't need to force
1014 * init the buddy 1083 * init the buddy
1015 */ 1084 */
1016 unlock_page(page); 1085 ret = 0;
1017 } else if (page) { 1086 goto err;
1018 BUG_ON(page->mapping != inode->i_mapping);
1019 ret = ext4_mb_init_cache(page, bitmap);
1020 if (ret) {
1021 unlock_page(page);
1022 goto err;
1023 }
1024 unlock_page(page);
1025 } 1087 }
1026 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1027 ret = -EIO; 1094 ret = -EIO;
1028 goto err; 1095 goto err;
1029 } 1096 }
1030 mark_page_accessed(page); 1097 mark_page_accessed(page);
1031err: 1098err:
1032 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1033 if (bitmap_page)
1034 page_cache_release(bitmap_page);
1035 if (page)
1036 page_cache_release(page);
1037 return ret; 1100 return ret;
1038} 1101}
1039 1102
@@ -1067,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1067 e4b->bd_group = group; 1130 e4b->bd_group = group;
1068 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1069 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1070 e4b->alloc_semp = &grp->alloc_sem;
1071
1072 /* Take the read lock on the group alloc
1073 * sem. This would make sure a parallel
1074 * ext4_mb_init_group happening on other
1075 * groups mapped by the page is blocked
1076 * till we are done with allocation
1077 */
1078repeat_load_buddy:
1079 down_read(e4b->alloc_semp);
1080 1133
1081 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1082 /* we need to check for group need init flag
1083 * with alloc_semp held so that we can be sure
1084 * that new blocks didn't get added to the group
1085 * when we are loading the buddy cache
1086 */
1087 up_read(e4b->alloc_semp);
1088 /* 1135 /*
1089 * we need full data about the group 1136 * we need full data about the group
1090 * to make a good selection 1137 * to make a good selection
@@ -1092,7 +1139,6 @@ repeat_load_buddy:
1092 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1093 if (ret) 1140 if (ret)
1094 return ret; 1141 return ret;
1095 goto repeat_load_buddy;
1096 } 1142 }
1097 1143
1098 /* 1144 /*
@@ -1176,15 +1222,14 @@ repeat_load_buddy:
1176 return 0; 1222 return 0;
1177 1223
1178err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1179 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1180 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1181 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1182 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1183 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1184 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1185
1186 /* Done with the buddy cache */
1187 up_read(e4b->alloc_semp);
1188 return ret; 1233 return ret;
1189} 1234}
1190 1235
@@ -1194,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1194 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1195 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1196 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1197 /* Done with the buddy cache */
1198 if (e4b->alloc_semp)
1199 up_read(e4b->alloc_semp);
1200} 1242}
1201 1243
1202 1244
@@ -1509,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1509 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1510 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1511 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1512 /* on allocation we use ac to track the held semaphore */
1513 ac->alloc_semp = e4b->alloc_semp;
1514 e4b->alloc_semp = NULL;
1515 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1516 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1517 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -1915,84 +1954,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1915 return 0; 1954 return 0;
1916} 1955}
1917 1956
1918/*
1919 * lock the group_info alloc_sem of all the groups
1920 * belonging to the same buddy cache page. This
1921 * makes sure other parallel operations on the buddy
1922 * cache don't happen while holding the buddy cache
1923 * lock
1924 */
1925int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1926{
1927 int i;
1928 int block, pnum;
1929 int blocks_per_page;
1930 int groups_per_page;
1931 ext4_group_t ngroups = ext4_get_groups_count(sb);
1932 ext4_group_t first_group;
1933 struct ext4_group_info *grp;
1934
1935 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1936 /*
1937 * the buddy cache inode stores the block bitmap
1938 * and buddy information in consecutive blocks.
1939 * So for each group we need two blocks.
1940 */
1941 block = group * 2;
1942 pnum = block / blocks_per_page;
1943 first_group = pnum * blocks_per_page / 2;
1944
1945 groups_per_page = blocks_per_page >> 1;
1946 if (groups_per_page == 0)
1947 groups_per_page = 1;
1948 /* read all groups the page covers into the cache */
1949 for (i = 0; i < groups_per_page; i++) {
1950
1951 if ((first_group + i) >= ngroups)
1952 break;
1953 grp = ext4_get_group_info(sb, first_group + i);
1954 /* take all groups' write allocation
1955 * semaphores. This makes sure there is
1956 * no block allocation going on in any
1957 * of those groups
1958 */
1959 down_write_nested(&grp->alloc_sem, i);
1960 }
1961 return i;
1962}
1963
1964void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1965 ext4_group_t group, int locked_group)
1966{
1967 int i;
1968 int block, pnum;
1969 int blocks_per_page;
1970 ext4_group_t first_group;
1971 struct ext4_group_info *grp;
1972
1973 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1974 /*
1975 * the buddy cache inode stores the block bitmap
1976 * and buddy information in consecutive blocks.
1977 * So for each group we need two blocks.
1978 */
1979 block = group * 2;
1980 pnum = block / blocks_per_page;
1981 first_group = pnum * blocks_per_page / 2;
1982 /* release locks on all the groups */
1983 for (i = 0; i < locked_group; i++) {
1984
1985 grp = ext4_get_group_info(sb, first_group + i);
1986 /* release all groups' write allocation
1987 * semaphores. This makes sure there is
1988 * no block allocation going on in any
1989 * of those groups
1990 */
1991 up_write(&grp->alloc_sem);
1992 }
1993
1994}
1995
1996static noinline_for_stack int 1957static noinline_for_stack int
1997ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1958ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1998{ 1959{
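The two helpers deleted above encoded the buddy cache's geometry: every group stores its block bitmap and buddy data as two consecutive blocks in the buddy-cache inode, so one page cache page covers blocks_per_page / 2 groups, and all of them had to be write-locked together. A standalone sketch of that mapping (illustrative only; assumes a 4 KB page):

#include <stdio.h>

/* For a given group, compute the first group sharing its buddy-cache
 * page and how many groups that page covers, mirroring the arithmetic
 * in the removed ext4_mb_get_buddy_cache_lock(). */
static void buddy_page_span(unsigned group, unsigned blocksize,
                            unsigned *first, unsigned *count)
{
        unsigned blocks_per_page = 4096 / blocksize;
        unsigned block = group * 2;             /* two blocks per group */
        unsigned pnum = block / blocks_per_page;
        unsigned per_page = blocks_per_page / 2;

        *first = pnum * blocks_per_page / 2;
        *count = per_page ? per_page : 1;
}

int main(void)
{
        unsigned first, count;

        buddy_page_span(5, 1024, &first, &count);       /* 1 KB blocks */
        printf("groups %u..%u share one buddy-cache page\n",
               first, first + count - 1);
        return 0;
}

With the replacement scheme, locking the one or two pages backing a group's bitmap and buddy blocks serializes initialization against allocation without touching every sibling group's semaphore.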
@@ -2233,15 +2194,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2233 .release = seq_release, 2194 .release = seq_release,
2234}; 2195};
2235 2196
2197static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2198{
2199 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2200 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2201
2202 BUG_ON(!cachep);
2203 return cachep;
2204}
2236 2205
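get_groupinfo_cache() added above selects one of NR_GRPINFO_CACHES slab caches keyed purely by block size. The index arithmetic is trivial but worth spelling out; a sketch, assuming EXT4_MIN_BLOCK_LOG_SIZE == 10 (the 1 KB minimum) as defined in ext4.h:

#include <stdio.h>

/* Map an ext4 block size to its groupinfo slab-cache index. */
static int grpinfo_cache_index(unsigned blocksize)
{
        int bits = 0;

        while ((1u << bits) < blocksize)
                bits++;                 /* order_base_2(blocksize) */
        return bits - 10;               /* 1024 -> 0, 2048 -> 1, 4096 -> 2 */
}

int main(void)
{
        printf("4096-byte blocks use cache index %d\n",
               grpinfo_cache_index(4096));
        return 0;
}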
2237/* Create and initialize ext4_group_info data for the given group. */ 2206/* Create and initialize ext4_group_info data for the given group. */
2238int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2207int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2239 struct ext4_group_desc *desc) 2208 struct ext4_group_desc *desc)
2240{ 2209{
2241 int i, len; 2210 int i;
2242 int metalen = 0; 2211 int metalen = 0;
2243 struct ext4_sb_info *sbi = EXT4_SB(sb); 2212 struct ext4_sb_info *sbi = EXT4_SB(sb);
2244 struct ext4_group_info **meta_group_info; 2213 struct ext4_group_info **meta_group_info;
2214 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2245 2215
2246 /* 2216 /*
2247 * First check if this group is the first of a reserved block. 2217 * First check if this group is the first of a reserved block.
@@ -2261,22 +2231,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2261 meta_group_info; 2231 meta_group_info;
2262 } 2232 }
2263 2233
2264 /*
2265 * calculate the needed size. if you change bb_counters' size,
2266 * don't forget about ext4_mb_generate_buddy()
2267 */
2268 len = offsetof(typeof(**meta_group_info),
2269 bb_counters[sb->s_blocksize_bits + 2]);
2270
2271 meta_group_info = 2234 meta_group_info =
2272 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2235 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2273 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2236 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2274 2237
2275 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2238 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2276 if (meta_group_info[i] == NULL) { 2239 if (meta_group_info[i] == NULL) {
2277 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2240 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2278 goto exit_group_info; 2241 goto exit_group_info;
2279 } 2242 }
2243 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2280 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2244 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2281 &(meta_group_info[i]->bb_state)); 2245 &(meta_group_info[i]->bb_state));
2282 2246
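This hunk trades a kzalloc() of a variable-length ext4_group_info for an allocation from the per-blocksize slab, with memset() preserving the zeroed-memory behaviour. The removed sizing trick, offsetof() over a flexible array member, is a compact way to size a struct plus N trailing counters; a self-contained sketch with a stand-in struct:

#include <stddef.h>
#include <stdio.h>

struct groupinfo_sketch {
        unsigned long bb_state;
        unsigned short bb_counters[];   /* one counter per buddy order */
};

int main(void)
{
        int blocksize_bits = 12;        /* 4 KB blocks */
        /* orders 0 .. blocksize_bits + 1, as in the removed kzalloc() */
        size_t len = offsetof(struct groupinfo_sketch,
                              bb_counters[blocksize_bits + 2]);

        printf("allocation size: %zu bytes\n", len);
        return 0;
}

The same size now lives in ext4_groupinfo_create_slab(), so every group of a given block size draws from one fixed-size cache instead of individually sized kmalloc blocks.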
@@ -2331,6 +2295,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2331 int num_meta_group_infos_max; 2295 int num_meta_group_infos_max;
2332 int array_size; 2296 int array_size;
2333 struct ext4_group_desc *desc; 2297 struct ext4_group_desc *desc;
2298 struct kmem_cache *cachep;
2334 2299
2335 /* This is the number of blocks used by GDT */ 2300 /* This is the number of blocks used by GDT */
2336 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2301 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2363,7 +2328,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2363 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2328 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2364 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2329 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2365 * So a two level scheme suffices for now. */ 2330 * So a two level scheme suffices for now. */
2366 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2331 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2367 if (sbi->s_group_info == NULL) { 2332 if (sbi->s_group_info == NULL) {
2368 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2333 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2369 return -ENOMEM; 2334 return -ENOMEM;
@@ -2373,6 +2338,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2338 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2374 goto err_freesgi; 2339 goto err_freesgi;
2375 } 2340 }
2341 sbi->s_buddy_cache->i_ino = get_next_ino();
2376 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2342 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2377 for (i = 0; i < ngroups; i++) { 2343 for (i = 0; i < ngroups; i++) {
2378 desc = ext4_get_group_desc(sb, i, NULL); 2344 desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2354,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
2388 return 0; 2354 return 0;
2389 2355
2390err_freebuddy: 2356err_freebuddy:
2357 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2391 while (i-- > 0) 2358 while (i-- > 0)
2392 kfree(ext4_get_group_info(sb, i)); 2359 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2393 i = num_meta_group_infos; 2360 i = num_meta_group_infos;
2394 while (i-- > 0) 2361 while (i-- > 0)
2395 kfree(sbi->s_group_info[i]); 2362 kfree(sbi->s_group_info[i]);
@@ -2399,6 +2366,55 @@ err_freesgi:
2399 return -ENOMEM; 2366 return -ENOMEM;
2400} 2367}
2401 2368
2369static void ext4_groupinfo_destroy_slabs(void)
2370{
2371 int i;
2372
2373 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2374 if (ext4_groupinfo_caches[i])
2375 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2376 ext4_groupinfo_caches[i] = NULL;
2377 }
2378}
2379
2380static int ext4_groupinfo_create_slab(size_t size)
2381{
2382 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2383 int slab_size;
2384 int blocksize_bits = order_base_2(size);
2385 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2386 struct kmem_cache *cachep;
2387
2388 if (cache_index >= NR_GRPINFO_CACHES)
2389 return -EINVAL;
2390
2391 if (unlikely(cache_index < 0))
2392 cache_index = 0;
2393
2394 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2395 if (ext4_groupinfo_caches[cache_index]) {
2396 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2397 return 0; /* Already created */
2398 }
2399
2400 slab_size = offsetof(struct ext4_group_info,
2401 bb_counters[blocksize_bits + 2]);
2402
2403 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2404 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2405 NULL);
2406
2407 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2408 if (!cachep) {
2409 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2410 return -ENOMEM;
2411 }
2412
2413 ext4_groupinfo_caches[cache_index] = cachep;
2414
2415 return 0;
2416}
2417
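ext4_groupinfo_create_slab() above uses a check-under-mutex pattern so concurrent mounts needing the same block size create each cache at most once. The shape of the idiom, sketched with a pthreads stand-in (malloc() standing in for kmem_cache_create()):

#include <pthread.h>
#include <stdlib.h>

static void *caches[8];
static pthread_mutex_t create_lock = PTHREAD_MUTEX_INITIALIZER;

/* Create caches[idx] exactly once, even if several callers race. */
static int create_once(int idx, size_t size)
{
        int ret = 0;

        pthread_mutex_lock(&create_lock);
        if (!caches[idx]) {                     /* not created yet */
                caches[idx] = malloc(size);
                if (!caches[idx])
                        ret = -1;
        }
        pthread_mutex_unlock(&create_lock);
        return ret;
}

int main(void)
{
        return create_once(2, 64);
}

In the sketch the mutex makes the existence check and the creation atomic with respect to other callers, which is the property the kernel function needs across simultaneous mounts.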
2402int ext4_mb_init(struct super_block *sb, int needs_recovery) 2418int ext4_mb_init(struct super_block *sb, int needs_recovery)
2403{ 2419{
2404 struct ext4_sb_info *sbi = EXT4_SB(sb); 2420 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2411,16 +2427,21 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2411 2427
2412 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2428 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2413 if (sbi->s_mb_offsets == NULL) { 2429 if (sbi->s_mb_offsets == NULL) {
2414 return -ENOMEM; 2430 ret = -ENOMEM;
2431 goto out;
2415 } 2432 }
2416 2433
2417 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2434 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2418 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2435 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2419 if (sbi->s_mb_maxs == NULL) { 2436 if (sbi->s_mb_maxs == NULL) {
2420 kfree(sbi->s_mb_offsets); 2437 ret = -ENOMEM;
2421 return -ENOMEM; 2438 goto out;
2422 } 2439 }
2423 2440
2441 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2442 if (ret < 0)
2443 goto out;
2444
2424 /* order 0 is regular bitmap */ 2445 /* order 0 is regular bitmap */
2425 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2446 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2426 sbi->s_mb_offsets[0] = 0; 2447 sbi->s_mb_offsets[0] = 0;
@@ -2439,9 +2460,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2439 /* init file for buddy data */ 2460 /* init file for buddy data */
2440 ret = ext4_mb_init_backend(sb); 2461 ret = ext4_mb_init_backend(sb);
2441 if (ret != 0) { 2462 if (ret != 0) {
2442 kfree(sbi->s_mb_offsets); 2463 goto out;
2443 kfree(sbi->s_mb_maxs);
2444 return ret;
2445 } 2464 }
2446 2465
2447 spin_lock_init(&sbi->s_md_lock); 2466 spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2475,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2456 2475
2457 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2476 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2458 if (sbi->s_locality_groups == NULL) { 2477 if (sbi->s_locality_groups == NULL) {
2459 kfree(sbi->s_mb_offsets); 2478 ret = -ENOMEM;
2460 kfree(sbi->s_mb_maxs); 2479 goto out;
2461 return -ENOMEM;
2462 } 2480 }
2463 for_each_possible_cpu(i) { 2481 for_each_possible_cpu(i) {
2464 struct ext4_locality_group *lg; 2482 struct ext4_locality_group *lg;
@@ -2475,7 +2493,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2475 2493
2476 if (sbi->s_journal) 2494 if (sbi->s_journal)
2477 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2495 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2478 return 0; 2496out:
2497 if (ret) {
2498 kfree(sbi->s_mb_offsets);
2499 kfree(sbi->s_mb_maxs);
2500 }
2501 return ret;
2479} 2502}
2480 2503
2481/* needs to be called with the ext4 group lock held */ 2504
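With the out: label in place, every failure in ext4_mb_init() funnels through one unwind site that frees s_mb_offsets and s_mb_maxs, rather than each branch duplicating the kfree() pairs. The idiom in isolation (a sketch; on success the buffers are deliberately kept, as in the kernel function):

#include <stdlib.h>

static int init_sketch(void)
{
        void *offsets = NULL, *maxs = NULL;
        int ret = 0;

        offsets = malloc(64);
        if (!offsets) {
                ret = -1;
                goto out;
        }
        maxs = malloc(64);
        if (!maxs) {
                ret = -1;
                goto out;
        }
        /* ... later fallible steps just set ret and jump to out ... */
out:
        if (ret) {              /* single unwind site for every failure */
                free(offsets);
                free(maxs);
        }
        return ret;
}

int main(void)
{
        return init_sketch();
}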
@@ -2503,6 +2526,7 @@ int ext4_mb_release(struct super_block *sb)
2503 int num_meta_group_infos; 2526 int num_meta_group_infos;
2504 struct ext4_group_info *grinfo; 2527 struct ext4_group_info *grinfo;
2505 struct ext4_sb_info *sbi = EXT4_SB(sb); 2528 struct ext4_sb_info *sbi = EXT4_SB(sb);
2529 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2506 2530
2507 if (sbi->s_group_info) { 2531 if (sbi->s_group_info) {
2508 for (i = 0; i < ngroups; i++) { 2532 for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2537,7 @@ int ext4_mb_release(struct super_block *sb)
2513 ext4_lock_group(sb, i); 2537 ext4_lock_group(sb, i);
2514 ext4_mb_cleanup_pa(grinfo); 2538 ext4_mb_cleanup_pa(grinfo);
2515 ext4_unlock_group(sb, i); 2539 ext4_unlock_group(sb, i);
2516 kfree(grinfo); 2540 kmem_cache_free(cachep, grinfo);
2517 } 2541 }
2518 num_meta_group_infos = (ngroups + 2542 num_meta_group_infos = (ngroups +
2519 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2543 EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,20 +2581,15 @@ int ext4_mb_release(struct super_block *sb)
2557 return 0; 2581 return 0;
2558} 2582}
2559 2583
2560static inline void ext4_issue_discard(struct super_block *sb, 2584static inline int ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count) 2585 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{ 2586{
2563 int ret;
2564 ext4_fsblk_t discard_block; 2587 ext4_fsblk_t discard_block;
2565 2588
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2589 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2590 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2591 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2592 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574} 2593}
2575 2594
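ext4_issue_discard() is reduced to translating a group-relative block into a filesystem-wide one and propagating sb_issue_discard()'s result, so policy for unsupported devices now lives with the callers (sb_issue_discard() itself gained GFP and flag arguments in this kernel). The translation, sketched:

#include <stdio.h>

/* Group-relative to filesystem-wide block number; mirrors
 * block + ext4_group_first_block_no(sb, group). */
static unsigned long long fs_block(unsigned long long first_data_block,
                                   unsigned group, unsigned blocks_per_group,
                                   unsigned offset)
{
        return first_data_block +
               (unsigned long long)group * blocks_per_group + offset;
}

int main(void)
{
        /* 1 KB blocks: first data block is 1, 8192 blocks per group */
        printf("group 3, offset 10 -> block %llu\n",
               fs_block(1, 3, 8192, 10));
        return 0;
}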
2576/* 2595/*
@@ -2594,7 +2613,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2594 2613
2595 if (test_opt(sb, DISCARD)) 2614 if (test_opt(sb, DISCARD))
2596 ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2597 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2598 2617
2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2600 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -2658,28 +2677,22 @@ static void ext4_remove_debugfs_entry(void)
2658 2677
2659#endif 2678#endif
2660 2679
2661int __init init_ext4_mballoc(void) 2680int __init ext4_init_mballoc(void)
2662{ 2681{
2663 ext4_pspace_cachep = 2682 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2664 kmem_cache_create("ext4_prealloc_space", 2683 SLAB_RECLAIM_ACCOUNT);
2665 sizeof(struct ext4_prealloc_space),
2666 0, SLAB_RECLAIM_ACCOUNT, NULL);
2667 if (ext4_pspace_cachep == NULL) 2684 if (ext4_pspace_cachep == NULL)
2668 return -ENOMEM; 2685 return -ENOMEM;
2669 2686
2670 ext4_ac_cachep = 2687 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2671 kmem_cache_create("ext4_alloc_context", 2688 SLAB_RECLAIM_ACCOUNT);
2672 sizeof(struct ext4_allocation_context),
2673 0, SLAB_RECLAIM_ACCOUNT, NULL);
2674 if (ext4_ac_cachep == NULL) { 2689 if (ext4_ac_cachep == NULL) {
2675 kmem_cache_destroy(ext4_pspace_cachep); 2690 kmem_cache_destroy(ext4_pspace_cachep);
2676 return -ENOMEM; 2691 return -ENOMEM;
2677 } 2692 }
2678 2693
2679 ext4_free_ext_cachep = 2694 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2680 kmem_cache_create("ext4_free_block_extents", 2695 SLAB_RECLAIM_ACCOUNT);
2681 sizeof(struct ext4_free_data),
2682 0, SLAB_RECLAIM_ACCOUNT, NULL);
2683 if (ext4_free_ext_cachep == NULL) { 2696 if (ext4_free_ext_cachep == NULL) {
2684 kmem_cache_destroy(ext4_pspace_cachep); 2697 kmem_cache_destroy(ext4_pspace_cachep);
2685 kmem_cache_destroy(ext4_ac_cachep); 2698 kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,7 +2702,7 @@ int __init init_ext4_mballoc(void)
2689 return 0; 2702 return 0;
2690} 2703}
2691 2704
2692void exit_ext4_mballoc(void) 2705void ext4_exit_mballoc(void)
2693{ 2706{
2694 /* 2707 /*
2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2708 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
@@ -2699,6 +2712,7 @@ void exit_ext4_mballoc(void)
2699 kmem_cache_destroy(ext4_pspace_cachep); 2712 kmem_cache_destroy(ext4_pspace_cachep);
2700 kmem_cache_destroy(ext4_ac_cachep); 2713 kmem_cache_destroy(ext4_ac_cachep);
2701 kmem_cache_destroy(ext4_free_ext_cachep); 2714 kmem_cache_destroy(ext4_free_ext_cachep);
2715 ext4_groupinfo_destroy_slabs();
2702 ext4_remove_debugfs_entry(); 2716 ext4_remove_debugfs_entry();
2703} 2717}
2704 2718
@@ -3135,7 +3149,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3135 cur_distance = abs(goal_block - cpa->pa_pstart); 3149 cur_distance = abs(goal_block - cpa->pa_pstart);
3136 new_distance = abs(goal_block - pa->pa_pstart); 3150 new_distance = abs(goal_block - pa->pa_pstart);
3137 3151
3138 if (cur_distance < new_distance) 3152 if (cur_distance <= new_distance)
3139 return cpa; 3153 return cpa;
3140 3154
3141 /* drop the previous reference */ 3155 /* drop the previous reference */
@@ -3535,8 +3549,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3535 */ 3549 */
3536static noinline_for_stack int 3550static noinline_for_stack int
3537ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3551ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3538 struct ext4_prealloc_space *pa, 3552 struct ext4_prealloc_space *pa)
3539 struct ext4_allocation_context *ac)
3540{ 3553{
3541 struct super_block *sb = e4b->bd_sb; 3554 struct super_block *sb = e4b->bd_sb;
3542 struct ext4_sb_info *sbi = EXT4_SB(sb); 3555 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3567,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3554 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3567 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3555 end = bit + pa->pa_len; 3568 end = bit + pa->pa_len;
3556 3569
3557 if (ac) {
3558 ac->ac_sb = sb;
3559 ac->ac_inode = pa->pa_inode;
3560 }
3561
3562 while (bit < end) { 3570 while (bit < end) {
3563 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3571 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3564 if (bit >= end) 3572 if (bit >= end)
@@ -3569,15 +3577,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3569 (unsigned) next - bit, (unsigned) group); 3577 (unsigned) next - bit, (unsigned) group);
3570 free += next - bit; 3578 free += next - bit;
3571 3579
3572 if (ac) { 3580 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3573 ac->ac_b_ex.fe_group = group; 3581 trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
3574 ac->ac_b_ex.fe_start = bit;
3575 ac->ac_b_ex.fe_len = next - bit;
3576 ac->ac_b_ex.fe_logical = 0;
3577 trace_ext4_mballoc_discard(ac);
3578 }
3579
3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3581 next - bit); 3582 next - bit);
3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3583 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3583 bit = next + 1; 3584 bit = next + 1;
@@ -3601,29 +3602,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3601 3602
3602static noinline_for_stack int 3603static noinline_for_stack int
3603ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3604ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3604 struct ext4_prealloc_space *pa, 3605 struct ext4_prealloc_space *pa)
3605 struct ext4_allocation_context *ac)
3606{ 3606{
3607 struct super_block *sb = e4b->bd_sb; 3607 struct super_block *sb = e4b->bd_sb;
3608 ext4_group_t group; 3608 ext4_group_t group;
3609 ext4_grpblk_t bit; 3609 ext4_grpblk_t bit;
3610 3610
3611 trace_ext4_mb_release_group_pa(sb, ac, pa); 3611 trace_ext4_mb_release_group_pa(pa);
3612 BUG_ON(pa->pa_deleted == 0); 3612 BUG_ON(pa->pa_deleted == 0);
3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3617 3617 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3618 if (ac) {
3619 ac->ac_sb = sb;
3620 ac->ac_inode = NULL;
3621 ac->ac_b_ex.fe_group = group;
3622 ac->ac_b_ex.fe_start = bit;
3623 ac->ac_b_ex.fe_len = pa->pa_len;
3624 ac->ac_b_ex.fe_logical = 0;
3625 trace_ext4_mballoc_discard(ac);
3626 }
3627 3618
3628 return 0; 3619 return 0;
3629} 3620}
@@ -3644,7 +3635,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3644 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3635 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3645 struct buffer_head *bitmap_bh = NULL; 3636 struct buffer_head *bitmap_bh = NULL;
3646 struct ext4_prealloc_space *pa, *tmp; 3637 struct ext4_prealloc_space *pa, *tmp;
3647 struct ext4_allocation_context *ac;
3648 struct list_head list; 3638 struct list_head list;
3649 struct ext4_buddy e4b; 3639 struct ext4_buddy e4b;
3650 int err; 3640 int err;
@@ -3673,9 +3663,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3673 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3663 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3674 3664
3675 INIT_LIST_HEAD(&list); 3665 INIT_LIST_HEAD(&list);
3676 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3677 if (ac)
3678 ac->ac_sb = sb;
3679repeat: 3666repeat:
3680 ext4_lock_group(sb, group); 3667 ext4_lock_group(sb, group);
3681 list_for_each_entry_safe(pa, tmp, 3668 list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3717,9 @@ repeat:
3730 spin_unlock(pa->pa_obj_lock); 3717 spin_unlock(pa->pa_obj_lock);
3731 3718
3732 if (pa->pa_type == MB_GROUP_PA) 3719 if (pa->pa_type == MB_GROUP_PA)
3733 ext4_mb_release_group_pa(&e4b, pa, ac); 3720 ext4_mb_release_group_pa(&e4b, pa);
3734 else 3721 else
3735 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3722 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3736 3723
3737 list_del(&pa->u.pa_tmp_list); 3724 list_del(&pa->u.pa_tmp_list);
3738 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3725 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3727,6 @@ repeat:
3740 3727
3741out: 3728out:
3742 ext4_unlock_group(sb, group); 3729 ext4_unlock_group(sb, group);
3743 if (ac)
3744 kmem_cache_free(ext4_ac_cachep, ac);
3745 ext4_mb_unload_buddy(&e4b); 3730 ext4_mb_unload_buddy(&e4b);
3746 put_bh(bitmap_bh); 3731 put_bh(bitmap_bh);
3747 return free; 3732 return free;
@@ -3762,7 +3747,6 @@ void ext4_discard_preallocations(struct inode *inode)
3762 struct super_block *sb = inode->i_sb; 3747 struct super_block *sb = inode->i_sb;
3763 struct buffer_head *bitmap_bh = NULL; 3748 struct buffer_head *bitmap_bh = NULL;
3764 struct ext4_prealloc_space *pa, *tmp; 3749 struct ext4_prealloc_space *pa, *tmp;
3765 struct ext4_allocation_context *ac;
3766 ext4_group_t group = 0; 3750 ext4_group_t group = 0;
3767 struct list_head list; 3751 struct list_head list;
3768 struct ext4_buddy e4b; 3752 struct ext4_buddy e4b;
@@ -3778,11 +3762,6 @@ void ext4_discard_preallocations(struct inode *inode)
3778 3762
3779 INIT_LIST_HEAD(&list); 3763 INIT_LIST_HEAD(&list);
3780 3764
3781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3782 if (ac) {
3783 ac->ac_sb = sb;
3784 ac->ac_inode = inode;
3785 }
3786repeat: 3765repeat:
3787 /* first, collect all pa's in the inode */ 3766 /* first, collect all pa's in the inode */
3788 spin_lock(&ei->i_prealloc_lock); 3767 spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3831,7 @@ repeat:
3852 3831
3853 ext4_lock_group(sb, group); 3832 ext4_lock_group(sb, group);
3854 list_del(&pa->pa_group_list); 3833 list_del(&pa->pa_group_list);
3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3834 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3856 ext4_unlock_group(sb, group); 3835 ext4_unlock_group(sb, group);
3857 3836
3858 ext4_mb_unload_buddy(&e4b); 3837 ext4_mb_unload_buddy(&e4b);
@@ -3861,30 +3840,16 @@ repeat:
3861 list_del(&pa->u.pa_tmp_list); 3840 list_del(&pa->u.pa_tmp_list);
3862 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3841 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3863 } 3842 }
3864 if (ac)
3865 kmem_cache_free(ext4_ac_cachep, ac);
3866} 3843}
3867 3844
3868/*
3869 * finds all preallocated spaces and return blocks being freed to them
3870 * if preallocated space becomes full (no block is used from the space)
3871 * then the function frees space in buddy
3872 * XXX: at the moment, truncate (which is the only way to free blocks)
3873 * discards all preallocations
3874 */
3875static void ext4_mb_return_to_preallocation(struct inode *inode,
3876 struct ext4_buddy *e4b,
3877 sector_t block, int count)
3878{
3879 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
3880}
3881#ifdef CONFIG_EXT4_DEBUG 3845#ifdef CONFIG_EXT4_DEBUG
3882static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3846static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3883{ 3847{
3884 struct super_block *sb = ac->ac_sb; 3848 struct super_block *sb = ac->ac_sb;
3885 ext4_group_t ngroups, i; 3849 ext4_group_t ngroups, i;
3886 3850
3887 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 3851 if (!mb_enable_debug ||
3852 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3888 return; 3853 return;
3889 3854
3890 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3855 printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4060,14 +4025,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 struct ext4_buddy e4b; 4025 struct ext4_buddy e4b;
4061 struct list_head discard_list; 4026 struct list_head discard_list;
4062 struct ext4_prealloc_space *pa, *tmp; 4027 struct ext4_prealloc_space *pa, *tmp;
4063 struct ext4_allocation_context *ac;
4064 4028
4065 mb_debug(1, "discard locality group preallocation\n"); 4029 mb_debug(1, "discard locality group preallocation\n");
4066 4030
4067 INIT_LIST_HEAD(&discard_list); 4031 INIT_LIST_HEAD(&discard_list);
4068 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4069 if (ac)
4070 ac->ac_sb = sb;
4071 4032
4072 spin_lock(&lg->lg_prealloc_lock); 4033 spin_lock(&lg->lg_prealloc_lock);
4073 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4034 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4080,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4119 } 4080 }
4120 ext4_lock_group(sb, group); 4081 ext4_lock_group(sb, group);
4121 list_del(&pa->pa_group_list); 4082 list_del(&pa->pa_group_list);
4122 ext4_mb_release_group_pa(&e4b, pa, ac); 4083 ext4_mb_release_group_pa(&e4b, pa);
4123 ext4_unlock_group(sb, group); 4084 ext4_unlock_group(sb, group);
4124 4085
4125 ext4_mb_unload_buddy(&e4b); 4086 ext4_mb_unload_buddy(&e4b);
4126 list_del(&pa->u.pa_tmp_list); 4087 list_del(&pa->u.pa_tmp_list);
4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4088 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4128 } 4089 }
4129 if (ac)
4130 kmem_cache_free(ext4_ac_cachep, ac);
4131} 4090}
4132 4091
4133/* 4092/*
@@ -4203,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4203 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4204 } 4163 }
4205 } 4164 }
4206 if (ac->alloc_semp)
4207 up_read(ac->alloc_semp);
4208 if (pa) { 4165 if (pa) {
4209 /* 4166 /*
4210 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4211 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4212 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4213 * doesn't grow big. We need to release 4170 * doesn't grow big.
4214 * alloc_semp before calling ext4_mb_add_n_trim()
4215 */ 4171 */
4216 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4217 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4273,14 +4229,16 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4273 * EDQUOT check, as blocks and quotas have been already 4229 * EDQUOT check, as blocks and quotas have been already
4274 * reserved when data being copied into pagecache. 4230 * reserved when data being copied into pagecache.
4275 */ 4231 */
4276 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4232 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4277 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4233 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4278 else { 4234 else {
4279 /* Without delayed allocation we need to verify 4235 /* Without delayed allocation we need to verify
4280 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4281 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4282 */ 4238 */
4283 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4284 /* let others to free the space */ 4242 /* let others to free the space */
4285 yield(); 4243 yield();
4286 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4290,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4290 return 0; 4248 return 0;
4291 } 4249 }
4292 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4293 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4294 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4295 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4296 } 4260 }
4297 inquota = ar->len; 4261 inquota = ar->len;
4298 if (ar->len == 0) { 4262 if (ar->len == 0) {
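This hunk splits quota charging: allocations flagged EXT4_MB_USE_ROOT_BLOCKS charge unconditionally via dquot_alloc_block_nofail(), while ordinary requests shrink one block at a time until the quota admits them, echoing the earlier loop that halves ar->len while the free-block reservation fails. The shrink-until-granted shape, generically (try_reserve() and the cap are stand-ins):

/* Sketch: shrink a request until a fallible reservation admits it.
 * try_reserve() returns 0 on success, like the quota calls above. */
static unsigned reserve_shrinking(unsigned want,
                                  int (*try_reserve)(unsigned))
{
        while (want && try_reserve(want) != 0)
                want--;         /* ar->len-- here; ar->len >>= 1 earlier */
        return want;            /* 0 means not even one unit was granted */
}

static int admit_up_to_8(unsigned want)
{
        return want > 8 ? -1 : 0;       /* stand-in quota limit */
}

int main(void)
{
        return reserve_shrinking(100, admit_up_to_8) == 8 ? 0 : 1;
}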
@@ -4370,7 +4334,8 @@ out:
4370 if (inquota && ar->len < inquota) 4334 if (inquota && ar->len < inquota)
4371 dquot_free_block(ar->inode, inquota - ar->len); 4335 dquot_free_block(ar->inode, inquota - ar->len);
4372 if (!ar->len) { 4336 if (!ar->len) {
4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4337 if (!ext4_test_inode_state(ar->inode,
4338 EXT4_STATE_DELALLOC_RESERVED))
4374 /* release all the reserved blocks if non delalloc */ 4339 /* release all the reserved blocks if non delalloc */
4375 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4340 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4376 reserv_blks); 4341 reserv_blks);
@@ -4483,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 * @inode: inode 4448 * @inode: inode
4484 * @block: start physical block to free 4449 * @block: start physical block to free
4485 * @count: number of blocks to free 4450 * @count: number of blocks to free
4486 * @metadata: Are these metadata blocks 4451 * @flags: flags used by ext4_free_blocks
4487 */ 4452 */
4488void ext4_free_blocks(handle_t *handle, struct inode *inode, 4453void ext4_free_blocks(handle_t *handle, struct inode *inode,
4489 struct buffer_head *bh, ext4_fsblk_t block, 4454 struct buffer_head *bh, ext4_fsblk_t block,
@@ -4491,7 +4456,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4491{ 4456{
4492 struct buffer_head *bitmap_bh = NULL; 4457 struct buffer_head *bitmap_bh = NULL;
4493 struct super_block *sb = inode->i_sb; 4458 struct super_block *sb = inode->i_sb;
4494 struct ext4_allocation_context *ac = NULL;
4495 struct ext4_group_desc *gdp; 4459 struct ext4_group_desc *gdp;
4496 unsigned long freed = 0; 4460 unsigned long freed = 0;
4497 unsigned int overflow; 4461 unsigned int overflow;
@@ -4531,6 +4495,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4531 if (!bh) 4495 if (!bh)
4532 tbh = sb_find_get_block(inode->i_sb, 4496 tbh = sb_find_get_block(inode->i_sb,
4533 block + i); 4497 block + i);
4498 if (unlikely(!tbh))
4499 continue;
4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4500 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4535 inode, tbh, block + i); 4501 inode, tbh, block + i);
4536 } 4502 }
@@ -4546,12 +4512,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4546 if (!ext4_should_writeback_data(inode)) 4512 if (!ext4_should_writeback_data(inode))
4547 flags |= EXT4_FREE_BLOCKS_METADATA; 4513 flags |= EXT4_FREE_BLOCKS_METADATA;
4548 4514
4549 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4550 if (ac) {
4551 ac->ac_inode = inode;
4552 ac->ac_sb = sb;
4553 }
4554
4555do_more: 4515do_more:
4556 overflow = 0; 4516 overflow = 0;
4557 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4517 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4569,7 @@ do_more:
4609 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4569 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4610 } 4570 }
4611#endif 4571#endif
4612 if (ac) { 4572 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4613 ac->ac_b_ex.fe_group = block_group;
4614 ac->ac_b_ex.fe_start = bit;
4615 ac->ac_b_ex.fe_len = count;
4616 trace_ext4_mballoc_free(ac);
4617 }
4618 4573
4619 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4574 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4620 if (err) 4575 if (err)
@@ -4626,7 +4581,11 @@ do_more:
4626 * blocks being freed are metadata. these blocks shouldn't 4581 * blocks being freed are metadata. these blocks shouldn't
4627 * be used until this transaction is committed 4582 * be used until this transaction is committed
4628 */ 4583 */
4629 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4584 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4585 if (!new_entry) {
4586 err = -ENOMEM;
4587 goto error_return;
4588 }
4630 new_entry->start_blk = bit; 4589 new_entry->start_blk = bit;
4631 new_entry->group = block_group; 4590 new_entry->group = block_group;
4632 new_entry->count = count; 4591 new_entry->count = count;
@@ -4643,9 +4602,6 @@ do_more:
4643 ext4_lock_group(sb, block_group); 4602 ext4_lock_group(sb, block_group);
4644 mb_clear_bits(bitmap_bh->b_data, bit, count); 4603 mb_clear_bits(bitmap_bh->b_data, bit, count);
4645 mb_free_blocks(inode, &e4b, bit, count); 4604 mb_free_blocks(inode, &e4b, bit, count);
4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4649 } 4605 }
4650 4606
4651 ret = ext4_free_blks_count(sb, gdp) + count; 4607 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4641,316 @@ error_return:
4685 dquot_free_block(inode, freed); 4641 dquot_free_block(inode, freed);
4686 brelse(bitmap_bh); 4642 brelse(bitmap_bh);
4687 ext4_std_error(sb, err); 4643 ext4_std_error(sb, err);
4688 if (ac)
4689 kmem_cache_free(ext4_ac_cachep, ac);
4690 return; 4644 return;
4691} 4645}
4646
4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
4651 * @block: start physical block to add to the block group
4652 * @count: number of blocks to add
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
4731 * need to update group_info->bb_free and bitmap
4732 * with group lock held. generate_buddy looks at
4733 * them with the group lock held
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
4767
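ext4_add_groupblocks() above relies on ext4_get_group_no_and_offset() to split an absolute block number into a (group, offset) pair before touching the bitmap. The decomposition, sketched with assumed geometry:

#include <stdio.h>

/* Sketch of ext4_get_group_no_and_offset()'s arithmetic. */
static void group_and_offset(unsigned long long block,
                             unsigned long long first_data_block,
                             unsigned blocks_per_group,
                             unsigned *group, unsigned *offset)
{
        unsigned long long rel = block - first_data_block;

        *group = rel / blocks_per_group;
        *offset = rel % blocks_per_group;
}

int main(void)
{
        unsigned group, offset;

        /* 4 KB blocks: first data block 0, 32768 blocks per group */
        group_and_offset(100000ULL, 0, 32768, &group, &offset);
        printf("block 100000 -> group %u, offset %u\n", group, offset);
        return 0;
}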
4768/**
4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4770 * @sb: super block for the file system
4771 * @start: starting block of the free extent in the alloc. group
4772 * @count: number of blocks to TRIM
4773 * @group: alloc. group we are working with
4774 * @e4b: ext4 buddy for the group
4775 *
4776 * Trim "count" blocks starting at "start" in the "group". To ensure that no
4777 * one will allocate those blocks, mark them as used in the buddy bitmap. This
4778 * must be called under the group lock.
4779 */
4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4781 ext4_group_t group, struct ext4_buddy *e4b)
4782{
4783 struct ext4_free_extent ex;
4784
4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4786
4787 ex.fe_start = start;
4788 ex.fe_group = group;
4789 ex.fe_len = count;
4790
4791 /*
4792 * Mark blocks used, so no one can reuse them while
4793 * being trimmed.
4794 */
4795 mb_mark_used(e4b, &ex);
4796 ext4_unlock_group(sb, group);
4797 ext4_issue_discard(sb, group, start, count);
4798 ext4_lock_group(sb, group);
4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4800}
4801
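ext4_trim_extent() above shows the choreography that keeps TRIM safe without sleeping under the group spinlock: mark the run allocated in the buddy, drop the lock, issue the potentially slow discard, retake the lock, and free the run again. The caller holds the lock at entry and exit; a generic sketch of the reserve/unlock/slow-op/relock/release pattern, with pthreads standing in for the group lock:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static int reserved;

static void trim_pattern(void)
{
        pthread_mutex_lock(&group_lock);
        reserved = 1;                   /* mb_mark_used(): blocks unavailable */
        pthread_mutex_unlock(&group_lock);

        usleep(1000);                   /* ext4_issue_discard(): may sleep */

        pthread_mutex_lock(&group_lock);
        reserved = 0;                   /* mb_free_blocks(): available again */
        pthread_mutex_unlock(&group_lock);
}

int main(void)
{
        trim_pattern();
        return 0;
}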
4802/**
4803 * ext4_trim_all_free -- function to trim all free space in alloc. group
4804 * @sb: super block for file system
4805 * @group: allocation group to trim
4806 * @start: first group block to examine
4807 * @max: last group block to examine
4808 * @minblocks: minimum extent block count
4809 *
4810 * ext4_trim_all_free walks through the group's block bitmap searching for
4811 * free extents. When a free extent is found, it is marked as used in the
4812 * group's buddy bitmap and ext4_trim_extent is called to TRIM it; the blocks
4813 * are then freed again in the buddy bitmap. This repeats until the whole
4814 * group has been scanned.
4819 */
4820static ext4_grpblk_t
4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4824{
4825 void *bitmap;
4826 ext4_grpblk_t next, count = 0;
4827 struct ext4_buddy e4b;
4828 int ret;
4829
4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4837
4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4841
4842 while (start < max) {
4843 start = mb_find_next_zero_bit(bitmap, max, start);
4844 if (start >= max)
4845 break;
4846 next = mb_find_next_bit(bitmap, max, start);
4847
4848 if ((next - start) >= minblocks) {
4849 ext4_trim_extent(sb, start,
4850 next - start, group, &e4b);
4851 count += next - start;
4852 }
4853 start = next + 1;
4854
4855 if (fatal_signal_pending(current)) {
4856 count = -ERESTARTSYS;
4857 break;
4858 }
4859
4860 if (need_resched()) {
4861 ext4_unlock_group(sb, group);
4862 cond_resched();
4863 ext4_lock_group(sb, group);
4864 }
4865
4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4867 break;
4868 }
4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4871
4872 ext4_debug("trimmed %d blocks in the group %d\n",
4873 count, group);
4874
4875 return count;
4876}
4877
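The scan loop above alternates mb_find_next_zero_bit() and mb_find_next_bit() to enumerate maximal free runs, trims only runs of at least minblocks, and stops early once the group's remaining free count cannot contain another qualifying run. The same enumeration over a plain byte-per-bit array, as a sketch:

#include <stdio.h>

/* Enumerate maximal runs of zeros in bitmap[0..max). */
static void scan_free_runs(const unsigned char *bitmap, int max,
                           int minblocks)
{
        int start = 0, next;

        while (start < max) {
                while (start < max && bitmap[start])
                        start++;                /* find next zero bit */
                if (start >= max)
                        break;
                next = start;
                while (next < max && !bitmap[next])
                        next++;                 /* find next set bit */
                if (next - start >= minblocks)
                        printf("trim run [%d, %d)\n", start, next);
                start = next + 1;
        }
}

int main(void)
{
        unsigned char bm[] = { 1, 0, 0, 0, 1, 1, 0, 0, 1 };

        scan_free_runs(bm, 9, 2);       /* prints [1, 4) and [6, 8) */
        return 0;
}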
4878/**
4879 * ext4_trim_fs() -- trim ioctl handle function
4880 * @sb: superblock for filesystem
4881 * @range: fstrim_range structure
4882 *
4883 * start: first byte to trim
4884 * len: number of bytes to trim from start
4885 * minlen: minimum extent length in bytes
4886 * ext4_trim_fs goes through all the allocation groups containing bytes from
4887 * start to start+len. For each such group the ext4_trim_all_free function
4888 * is invoked to trim all free space.
4889 */
4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4891{
4892 struct ext4_group_info *grp;
4893 ext4_group_t first_group, last_group;
4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4896 uint64_t start, len, minlen, trimmed = 0;
4897 ext4_fsblk_t first_data_blk =
4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4899 int ret = 0;
4900
4901 start = range->start >> sb->s_blocksize_bits;
4902 len = range->len >> sb->s_blocksize_bits;
4903 minlen = range->minlen >> sb->s_blocksize_bits;
4904
4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4906 return -EINVAL;
4907 if (start < first_data_blk) {
4908 len -= first_data_blk - start;
4909 start = first_data_blk;
4910 }
4911
4912 /* Determine first and last group to examine based on start and len */
4913 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4914 &first_group, &first_block);
4915 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4916 &last_group, &last_block);
4917 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4918 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4919
4920 if (first_group > last_group)
4921 return -EINVAL;
4922
4923 for (group = first_group; group <= last_group; group++) {
4924 grp = ext4_get_group_info(sb, group);
4925 /* We only do this if the grp has never been initialized */
4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4927 ret = ext4_mb_init_group(sb, group);
4928 if (ret)
4929 break;
4930 }
4931
4932 /*
4933 * For all the groups except the last one, last block will
4934 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4935 * change it for the last group in which case start +
4936 * len < EXT4_BLOCKS_PER_GROUP(sb).
4937 */
4938 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4939 last_block = first_block + len;
4940 len -= last_block - first_block;
4941
4942 if (grp->bb_free >= minlen) {
4943 cnt = ext4_trim_all_free(sb, group, first_block,
4944 last_block, minlen);
4945 if (cnt < 0) {
4946 ret = cnt;
4947 break;
4948 }
4949 }
4950 trimmed += cnt;
4951 first_block = 0;
4952 }
4953 range->len = trimmed * sb->s_blocksize;
4954
4955 return ret;
4956}
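ext4_trim_fs() is the filesystem backend for the FITRIM ioctl: user space supplies an fstrim_range in bytes, the kernel converts to blocks, walks the affected groups, and writes the trimmed total back into range->len. A minimal caller (this is the stock FITRIM interface from linux/fs.h, not something this patch invents):

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

        if (fd < 0)
                return 1;
        memset(&range, 0, sizeof(range));
        range.len = ULLONG_MAX;         /* whole filesystem */
        range.minlen = 0;               /* trim every free extent */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n",
                       (unsigned long long)range.len);
        return 0;
}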
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f0..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -169,7 +169,7 @@ struct ext4_allocation_context {
169 /* original request */ 169 /* original request */
170 struct ext4_free_extent ac_o_ex; 170 struct ext4_free_extent ac_o_ex;
171 171
172 /* goal request (after normalization) */ 172 /* goal request (normalized ac_o_ex) */
173 struct ext4_free_extent ac_g_ex; 173 struct ext4_free_extent ac_g_ex;
174 174
175 /* the best found extent */ 175 /* the best found extent */
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle,
263 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
264 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
265 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
266 ext4_free_blocks(handle, inode, 0, 266 ext4_free_blocks(handle, inode, NULL,
267 le32_to_cpu(tmp_idata[i]), 1, 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA | 268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET); 269 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle,
271 } 271 }
272 put_bh(bh); 272 put_bh(bh);
273 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA | 275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET); 276 EXT4_FREE_BLOCKS_FORGET);
277 return 0; 277 return 0;
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle,
302 } 302 }
303 put_bh(bh); 303 put_bh(bh);
304 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 305 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA | 306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET); 307 EXT4_FREE_BLOCKS_FORGET);
308 return 0; 308 return 0;
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
315 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
316 if (i_data[0]) { 316 if (i_data[0]) {
317 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
318 ext4_free_blocks(handle, inode, 0, 318 ext4_free_blocks(handle, inode, NULL,
319 le32_to_cpu(i_data[0]), 1, 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA | 320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET); 321 EXT4_FREE_BLOCKS_FORGET);
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
412 struct buffer_head *bh; 412 struct buffer_head *bh;
413 struct ext4_extent_header *eh; 413 struct ext4_extent_header *eh;
414 414
415 block = idx_pblock(ix); 415 block = ext4_idx_pblock(ix);
416 bh = sb_bread(inode->i_sb, block); 416 bh = sb_bread(inode->i_sb, block);
417 if (!bh) 417 if (!bh)
418 return -EIO; 418 return -EIO;
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
428 } 428 }
429 put_bh(bh); 429 put_bh(bh);
430 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
431 ext4_free_blocks(handle, inode, 0, block, 1, 431 ext4_free_blocks(handle, inode, NULL, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
433 return retval; 433 return retval;
434} 434}
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, 0, goal); 499 S_IFREG, NULL, goal);
500 if (IS_ERR(tmp_inode)) { 500 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 501 retval = -ENOMEM;
502 ext4_journal_stop(handle); 502 ext4_journal_stop(handle);
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
517 * start with one credit accounted for 517 * start with one credit accounted for
518 * superblock modification. 518 * superblock modification.
519 * 519 *
520 * For the tmp_inode we already have commited the 520 * For the tmp_inode we already have committed the
521 * transaction that created the inode. Later, as and 521 * transaction that created the inode. Later, as and
522 * when we add extents we extend the journal 522 * when we add extents we extend the journal
523 */ 523 */
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
74 __ext4_warning(sb, function, line, msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
128 if (retval && (failed_writes % 60) == 0) {
129 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++;
131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
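
The last statement of the loop above is a clamp: the next check interval is EXT4_MMP_CHECK_MULT times the observed write latency, bounded by the MMP minimum and maximum. A standalone sketch of that arithmetic (the constants mirror ext4's defaults but are assumptions here, not read from this tree's headers):

/* Sketch of kmmpd's adaptive check-interval clamp. The constants are
 * assumed (2x multiplier, 5s floor, 300s ceiling), illustrative only. */
#include <stdio.h>

#define MMP_CHECK_MULT          2
#define MMP_MIN_CHECK_INTERVAL  5
#define MMP_MAX_CHECK_INTERVAL  300

static unsigned long clamp_check_interval(unsigned long write_secs)
{
	unsigned long v = MMP_CHECK_MULT * write_secs;

	if (v > MMP_MAX_CHECK_INTERVAL)
		v = MMP_MAX_CHECK_INTERVAL;
	if (v < MMP_MIN_CHECK_INTERVAL)
		v = MMP_MIN_CHECK_INTERVAL;
	return v;
}

int main(void)
{
	/* A fast 1s write keeps the floor; a 400s stall hits the ceiling. */
	printf("%lu\n", clamp_check_interval(1));   /* -> 5 */
	printf("%lu\n", clamp_check_interval(400)); /* -> 300 */
	return 0;
}
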
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
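
mmp_new_seq() is rejection sampling: draw 32 random bits and retry while the value exceeds the bound, which keeps the accepted values uniform. With EXT4_MMP_SEQ_MAX around 0xE24D4D4F (an assumption from the ext4.h of this era), roughly 88% of draws are accepted, so the loop almost never runs twice. A userspace sketch using getrandom(2) in place of get_random_bytes():

/* Rejection sampling for a uniform value in [0, SEQ_MAX]; a userspace
 * sketch of mmp_new_seq(). SEQ_MAX is an assumed value. */
#include <stdint.h>
#include <stdio.h>
#include <sys/random.h>

#define SEQ_MAX 0xE24D4D4Fu

static uint32_t new_seq(void)
{
	uint32_t v;

	do {
		if (getrandom(&v, sizeof(v), 0) != sizeof(v))
			return 1; /* fall back to a fixed in-range value */
	} while (v > SEQ_MAX); /* out of range: redraw, stays uniform */
	return v;
}

int main(void)
{
	printf("seq = %u\n", new_seq());
	return 0;
}
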
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
259 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
260 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * Write a new random sequence number.
297 */
298 mmp->mmp_seq = cpu_to_le32(seq = mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
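Condensed, ext4_multi_mount_protect() is a two-phase handshake: if the block is not marked clean, wait one check interval and fail if the sequence moved; then write a fresh random sequence, wait again, and fail if anyone overwrote it. A standalone sketch of that handshake (the fsck marker and wait-time scaling are omitted; read_seq()/write_seq() are stand-ins for the buffer-head I/O, and the on-disk state is faked with a variable):

/* Sketch of the MMP mount handshake; SEQ_CLEAN is an assumed value. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define SEQ_CLEAN 0xFF4D4D50u

static uint32_t disk_seq = SEQ_CLEAN;        /* the "MMP block" */
static uint32_t read_seq(void)        { return disk_seq; }
static void     write_seq(uint32_t s) { disk_seq = s; }

static int mmp_handshake(unsigned int check_interval)
{
	uint32_t seq = read_seq();

	if (seq != SEQ_CLEAN) {
		/* Not marked clean: wait one interval, watch for a bump. */
		sleep(check_interval);
		if (read_seq() != seq)
			return -1; /* another node is alive on this device */
	}
	/* Claim the device, then confirm nobody overwrote our claim. */
	seq = (uint32_t)random();
	write_seq(seq);
	sleep(check_interval);
	if (read_seq() != seq)
		return -1; /* lost the race */
	return 0; /* safe to mount; kmmpd keeps bumping seq from here */
}

int main(void)
{
	printf("handshake: %d\n", mmp_handshake(1));
	return 0;
}
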
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..f57455a1b1b2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
86 /* leaf block */ 86 /* leaf block */
87 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
89 return 0; 89 return 0;
90 } 90 }
91 91
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
96 96
97 /* index block */ 97 /* index block */
98 path[ppos].p_idx++; 98 path[ppos].p_idx++;
99 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 99 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
100 if (path[ppos+1].p_bh) 100 if (path[ppos+1].p_bh)
101 brelse(path[ppos+1].p_bh); 101 brelse(path[ppos+1].p_bh);
102 path[ppos+1].p_bh = 102 path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
111 path[cur_ppos].p_idx = 111 path[cur_ppos].p_idx =
112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
113 path[cur_ppos].p_block = 113 path[cur_ppos].p_block =
114 idx_pblock(path[cur_ppos].p_idx); 114 ext4_idx_pblock(path[cur_ppos].p_idx);
115 if (path[cur_ppos+1].p_bh) 115 if (path[cur_ppos+1].p_bh)
116 brelse(path[cur_ppos+1].p_bh); 116 brelse(path[cur_ppos+1].p_bh);
117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
133 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block = 135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext); 136 ext4_ext_pblock(path[leaf_ppos].p_ext);
137 return 0; 137 return 0;
138 } 138 }
139 } 139 }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
249 */ 249 */
250 o_end->ee_block = end_ext->ee_block; 250 o_end->ee_block = end_ext->ee_block;
251 o_end->ee_len = end_ext->ee_len; 251 o_end->ee_len = end_ext->ee_len;
252 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 252 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
253 } 253 }
254 254
255 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
276 */ 276 */
277 o_end->ee_block = end_ext->ee_block; 277 o_end->ee_block = end_ext->ee_block;
278 o_end->ee_len = end_ext->ee_len; 278 o_end->ee_len = end_ext->ee_len;
279 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 279 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
280 280
281 /* 281 /*
282 * Set 0 to the extent block if new_ext was 282 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
361 /* Insert new entry */ 361 /* Insert new entry */
362 if (new_ext->ee_len) { 362 if (new_ext->ee_len) {
363 o_start[i] = *new_ext; 363 o_start[i] = *new_ext;
364 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); 364 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
365 } 365 }
366 366
367 /* Insert end entry */ 367 /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
488 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
489 489
490 new_ext.ee_block = cpu_to_le32(*from); 490 new_ext.ee_block = cpu_to_le32(*from);
491 ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); 491 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
492 new_ext.ee_len = dext->ee_len; 492 new_ext.ee_len = dext->ee_len;
493 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 493 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
553 copy_extent_status(oext, &end_ext); 553 copy_extent_status(oext, &end_ext);
554 end_ext_alen = ext4_ext_get_actual_len(&end_ext); 554 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
555 ext4_ext_store_pblock(&end_ext, 555 ext4_ext_store_pblock(&end_ext,
556 (ext_pblock(o_end) + oext_alen - end_ext_alen)); 556 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
557 end_ext.ee_block = 557 end_ext.ee_block =
558 cpu_to_le32(le32_to_cpu(o_end->ee_block) + 558 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
559 oext_alen - end_ext_alen); 559 oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 /* When tmp_dext is too large, pick up the target range. */ 604 /* When tmp_dext is too large, pick up the target range. */
605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
606 606
607 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); 607 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
608 tmp_dext->ee_block = 608 tmp_dext->ee_block =
609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
613 tmp_dext->ee_len = cpu_to_le16(max_count); 613 tmp_dext->ee_len = cpu_to_le16(max_count);
614 614
615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
616 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); 616 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
617 617
618 /* Adjust extent length if donor extent is larger than orig */ 618 /* Adjust extent length if donor extent is larger than orig */
619 if (ext4_ext_get_actual_len(tmp_dext) > 619 if (ext4_ext_get_actual_len(tmp_dext) >
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
@@ -1003,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode,
1003 return -EINVAL; 1002 return -EINVAL;
1004 } 1003 }
1005 1004
1006 if ((orig_start > EXT_MAX_BLOCK) || 1005 if ((orig_start >= EXT_MAX_BLOCKS) ||
1007 (donor_start > EXT_MAX_BLOCK) || 1006 (donor_start >= EXT_MAX_BLOCKS) ||
1008 (*len > EXT_MAX_BLOCK) || 1007 (*len > EXT_MAX_BLOCKS) ||
1009 (orig_start + *len > EXT_MAX_BLOCK)) { 1008 (orig_start + *len >= EXT_MAX_BLOCKS)) {
1010 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 1009 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
1011 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, 1010 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
1012 orig_inode->i_ino, donor_inode->i_ino); 1011 orig_inode->i_ino, donor_inode->i_ino);
1013 return -EINVAL; 1012 return -EINVAL;
1014 } 1013 }
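
Besides the rename, the hunk above tightens > to >= because EXT_MAX_BLOCKS is a block count, so the last addressable logical block is EXT_MAX_BLOCKS - 1. A standalone sketch of the corrected range check (the constant is assumed from this era's ext4_extents.h):

/* Off-by-one in range checks: EXT_MAX_BLOCKS is a *count*, so valid
 * logical blocks are 0 .. EXT_MAX_BLOCKS - 1 and the tests use >=. */
#include <stdint.h>
#include <stdio.h>

#define EXT_MAX_BLOCKS 0xffffffffu

static int range_ok(uint32_t start, uint64_t len)
{
	if (start >= EXT_MAX_BLOCKS)                 /* start addressable? */
		return 0;
	if (len > EXT_MAX_BLOCKS)                    /* no more than exist */
		return 0;
	if ((uint64_t)start + len >= EXT_MAX_BLOCKS) /* end must fit too */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", range_ok(0, 16));                 /* 1: fine */
	printf("%d\n", range_ok(EXT_MAX_BLOCKS - 1, 1)); /* 0: past the end */
	return 0;
}
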
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,6 +40,7 @@
40#include "xattr.h" 40#include "xattr.h"
41#include "acl.h" 41#include "acl.h"
42 42
43#include <trace/events/ext4.h>
43/* 44/*
44 * define how far ahead to read directories while searching them. 45 * define how far ahead to read directories while searching them.
45 */ 46 */
@@ -581,9 +582,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
581 dir->i_sb->s_blocksize - 582 dir->i_sb->s_blocksize -
582 EXT4_DIR_REC_LEN(0)); 583 EXT4_DIR_REC_LEN(0));
583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 584 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
584 if (!ext4_check_dir_entry(dir, de, bh, 585 if (ext4_check_dir_entry(dir, NULL, de, bh,
585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 586 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
586 +((char *)de - bh->b_data))) { 587 + ((char *)de - bh->b_data))) {
587 /* On error, skip the f_pos to the next block. */ 588 /* On error, skip the f_pos to the next block. */
588 dir_file->f_pos = (dir_file->f_pos | 589 dir_file->f_pos = (dir_file->f_pos |
589 (dir->i_sb->s_blocksize - 1)) + 1; 590 (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +821,7 @@ static inline int search_dirblock(struct buffer_head *bh,
820 if ((char *) de + namelen <= dlimit && 821 if ((char *) de + namelen <= dlimit &&
821 ext4_match (namelen, name, de)) { 822 ext4_match (namelen, name, de)) {
822 /* found a match - just to be sure, do a full check */ 823 /* found a match - just to be sure, do a full check */
823 if (!ext4_check_dir_entry(dir, de, bh, offset)) 824 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
824 return -1; 825 return -1;
825 *res_dir = de; 826 *res_dir = de;
826 return 1; 827 return 1;
@@ -856,6 +857,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
856 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 857 struct buffer_head *bh_use[NAMEI_RA_SIZE];
857 struct buffer_head *bh, *ret = NULL; 858 struct buffer_head *bh, *ret = NULL;
858 ext4_lblk_t start, block, b; 859 ext4_lblk_t start, block, b;
860 const u8 *name = d_name->name;
859 int ra_max = 0; /* Number of bh's in the readahead 861 int ra_max = 0; /* Number of bh's in the readahead
860 buffer, bh_use[] */ 862 buffer, bh_use[] */
861 int ra_ptr = 0; /* Current index into readahead 863 int ra_ptr = 0; /* Current index into readahead
@@ -870,6 +872,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
870 namelen = d_name->len; 872 namelen = d_name->len;
871 if (namelen > EXT4_NAME_LEN) 873 if (namelen > EXT4_NAME_LEN)
872 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == '\0')) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
873 if (is_dx(dir)) { 885 if (is_dx(dir)) {
874 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 886 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
875 /* 887 /*
@@ -960,55 +972,35 @@ cleanup_and_exit:
960static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 972static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
961 struct ext4_dir_entry_2 **res_dir, int *err) 973 struct ext4_dir_entry_2 **res_dir, int *err)
962{ 974{
963 struct super_block * sb; 975 struct super_block * sb = dir->i_sb;
964 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
965 u32 hash;
966 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
967 struct ext4_dir_entry_2 *de, *top;
968 struct buffer_head *bh; 978 struct buffer_head *bh;
969 ext4_lblk_t block; 979 ext4_lblk_t block;
970 int retval; 980 int retval;
971 int namelen = d_name->len;
972 const u8 *name = d_name->name;
973 981
974 sb = dir->i_sb; 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
975 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
976 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
977 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
978 return NULL;
979 } else {
980 frame = frames;
981 frame->bh = NULL; /* for dx_release() */
982 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
983 dx_set_block(frame->at, 0); /* dx_root block is 0 */
984 }
985 hash = hinfo.hash;
986 do { 984 do {
987 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
988 if (!(bh = ext4_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
989 goto errout; 987 goto errout;
990 de = (struct ext4_dir_entry_2 *) bh->b_data;
991 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
992 EXT4_DIR_REC_LEN(0));
993 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
995 + ((char *) de - bh->b_data);
996
997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
998 brelse(bh);
999 *err = ERR_BAD_DX_DIR;
1000 goto errout;
1001 }
1002 988
1003 if (ext4_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, d_name,
1004 *res_dir = de; 990 block << EXT4_BLOCK_SIZE_BITS(sb),
1005 dx_release(frames); 991 res_dir);
1006 return bh; 992 if (retval == 1) { /* Success! */
1007 } 993 dx_release(frames);
994 return bh;
1008 } 995 }
1009 brelse(bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1010 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1011 retval = ext4_htree_next_block(dir, hash, frame, 1003 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1012 frames, NULL); 1004 frames, NULL);
1013 if (retval < 0) { 1005 if (retval < 0) {
1014 ext4_warning(sb, 1006 ext4_warning(sb,
@@ -1045,7 +1037,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1045 return ERR_PTR(-EIO); 1037 return ERR_PTR(-EIO);
1046 } 1038 }
1047 inode = ext4_iget(dir->i_sb, ino); 1039 inode = ext4_iget(dir->i_sb, ino);
1048 if (unlikely(IS_ERR(inode))) { 1040 if (IS_ERR(inode)) {
1049 if (PTR_ERR(inode) == -ESTALE) { 1041 if (PTR_ERR(inode) == -ESTALE) {
1050 EXT4_ERROR_INODE(dir, 1042 EXT4_ERROR_INODE(dir,
1051 "deleted inode referenced: %u", 1043 "deleted inode referenced: %u",
@@ -1278,7 +1270,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1278 de = (struct ext4_dir_entry_2 *)bh->b_data; 1270 de = (struct ext4_dir_entry_2 *)bh->b_data;
1279 top = bh->b_data + blocksize - reclen; 1271 top = bh->b_data + blocksize - reclen;
1280 while ((char *) de <= top) { 1272 while ((char *) de <= top) {
1281 if (!ext4_check_dir_entry(dir, de, bh, offset)) 1273 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1282 return -EIO; 1274 return -EIO;
1283 if (ext4_match(namelen, name, de)) 1275 if (ext4_match(namelen, name, de))
1284 return -EEXIST; 1276 return -EEXIST;
@@ -1421,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1421 frame->at = entries; 1413 frame->at = entries;
1422 frame->bh = bh; 1414 frame->bh = bh;
1423 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1424 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1425 dx_release (frames); 1421 if (!de) {
1426 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
1425 * with corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1427 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1428 1432
1429 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1430 brelse(bh); 1434 brelse(bh);
@@ -1611,7 +1615,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1611 if (err) 1615 if (err)
1612 goto journal_error; 1616 goto journal_error;
1613 } 1617 }
1614 ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1618 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1619 if (err) {
1620 ext4_std_error(inode->i_sb, err);
1621 goto cleanup;
1622 }
1615 } 1623 }
1616 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1624 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1617 if (!de) 1625 if (!de)
@@ -1639,17 +1647,21 @@ static int ext4_delete_entry(handle_t *handle,
1639{ 1647{
1640 struct ext4_dir_entry_2 *de, *pde; 1648 struct ext4_dir_entry_2 *de, *pde;
1641 unsigned int blocksize = dir->i_sb->s_blocksize; 1649 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1650 int i, err;
1643 1651
1644 i = 0; 1652 i = 0;
1645 pde = NULL; 1653 pde = NULL;
1646 de = (struct ext4_dir_entry_2 *) bh->b_data; 1654 de = (struct ext4_dir_entry_2 *) bh->b_data;
1647 while (i < bh->b_size) { 1655 while (i < bh->b_size) {
1648 if (!ext4_check_dir_entry(dir, de, bh, i)) 1656 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
1649 return -EIO; 1657 return -EIO;
1650 if (de == de_del) { 1658 if (de == de_del) {
1651 BUFFER_TRACE(bh, "get_write_access"); 1659 BUFFER_TRACE(bh, "get_write_access");
1652 ext4_journal_get_write_access(handle, bh); 1660 err = ext4_journal_get_write_access(handle, bh);
1661 if (unlikely(err)) {
1662 ext4_std_error(dir->i_sb, err);
1663 return err;
1664 }
1653 if (pde) 1665 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1666 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len, 1667 ext4_rec_len_from_disk(pde->rec_len,
@@ -1661,7 +1673,11 @@ static int ext4_delete_entry(handle_t *handle,
1661 de->inode = 0; 1673 de->inode = 0;
1662 dir->i_version++; 1674 dir->i_version++;
1663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1675 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1664 ext4_handle_dirty_metadata(handle, dir, bh); 1676 err = ext4_handle_dirty_metadata(handle, dir, bh);
1677 if (unlikely(err)) {
1678 ext4_std_error(dir->i_sb, err);
1679 return err;
1680 }
1665 return 0; 1681 return 0;
1666 } 1682 }
1667 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 1683 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1798,7 +1814,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1798{ 1814{
1799 handle_t *handle; 1815 handle_t *handle;
1800 struct inode *inode; 1816 struct inode *inode;
1801 struct buffer_head *dir_block; 1817 struct buffer_head *dir_block = NULL;
1802 struct ext4_dir_entry_2 *de; 1818 struct ext4_dir_entry_2 *de;
1803 unsigned int blocksize = dir->i_sb->s_blocksize; 1819 unsigned int blocksize = dir->i_sb->s_blocksize;
1804 int err, retries = 0; 1820 int err, retries = 0;
@@ -1831,7 +1847,9 @@ retry:
1831 if (!dir_block) 1847 if (!dir_block)
1832 goto out_clear_inode; 1848 goto out_clear_inode;
1833 BUFFER_TRACE(dir_block, "get_write_access"); 1849 BUFFER_TRACE(dir_block, "get_write_access");
1834 ext4_journal_get_write_access(handle, dir_block); 1850 err = ext4_journal_get_write_access(handle, dir_block);
1851 if (err)
1852 goto out_clear_inode;
1835 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1853 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1836 de->inode = cpu_to_le32(inode->i_ino); 1854 de->inode = cpu_to_le32(inode->i_ino);
1837 de->name_len = 1; 1855 de->name_len = 1;
@@ -1848,10 +1866,12 @@ retry:
1848 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1866 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1849 inode->i_nlink = 2; 1867 inode->i_nlink = 2;
1850 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1868 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1851 ext4_handle_dirty_metadata(handle, dir, dir_block); 1869 err = ext4_handle_dirty_metadata(handle, dir, dir_block);
1852 brelse(dir_block); 1870 if (err)
1853 ext4_mark_inode_dirty(handle, inode); 1871 goto out_clear_inode;
1854 err = ext4_add_entry(handle, dentry, inode); 1872 err = ext4_mark_inode_dirty(handle, inode);
1873 if (!err)
1874 err = ext4_add_entry(handle, dentry, inode);
1855 if (err) { 1875 if (err) {
1856out_clear_inode: 1876out_clear_inode:
1857 clear_nlink(inode); 1877 clear_nlink(inode);
@@ -1862,10 +1882,13 @@ out_clear_inode:
1862 } 1882 }
1863 ext4_inc_count(handle, dir); 1883 ext4_inc_count(handle, dir);
1864 ext4_update_dx_flag(dir); 1884 ext4_update_dx_flag(dir);
1865 ext4_mark_inode_dirty(handle, dir); 1885 err = ext4_mark_inode_dirty(handle, dir);
1886 if (err)
1887 goto out_clear_inode;
1866 d_instantiate(dentry, inode); 1888 d_instantiate(dentry, inode);
1867 unlock_new_inode(inode); 1889 unlock_new_inode(inode);
1868out_stop: 1890out_stop:
1891 brelse(dir_block);
1869 ext4_journal_stop(handle); 1892 ext4_journal_stop(handle);
1870 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1893 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1871 goto retry; 1894 goto retry;
@@ -1928,7 +1951,7 @@ static int empty_dir(struct inode *inode)
1928 } 1951 }
1929 de = (struct ext4_dir_entry_2 *) bh->b_data; 1952 de = (struct ext4_dir_entry_2 *) bh->b_data;
1930 } 1953 }
1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1954 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1932 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1955 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1933 sb->s_blocksize); 1956 sb->s_blocksize);
1934 offset = (offset | (sb->s_blocksize - 1)) + 1; 1957 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2173,6 +2196,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2173 struct ext4_dir_entry_2 *de; 2196 struct ext4_dir_entry_2 *de;
2174 handle_t *handle; 2197 handle_t *handle;
2175 2198
2199 trace_ext4_unlink_enter(dir, dentry);
2176 /* Initialize quotas before so that eventual writes go 2200 /* Initialize quotas before so that eventual writes go
2177 * in separate transaction */ 2201 * in separate transaction */
2178 dquot_initialize(dir); 2202 dquot_initialize(dir);
@@ -2218,6 +2242,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2218end_unlink: 2242end_unlink:
2219 ext4_journal_stop(handle); 2243 ext4_journal_stop(handle);
2220 brelse(bh); 2244 brelse(bh);
2245 trace_ext4_unlink_exit(dentry, retval);
2221 return retval; 2246 return retval;
2222} 2247}
2223 2248
@@ -2227,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2227 handle_t *handle; 2252 handle_t *handle;
2228 struct inode *inode; 2253 struct inode *inode;
2229 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2230 2256
2231 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2232 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2234,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2234 2260
2235 dquot_initialize(dir); 2261 dquot_initialize(dir);
2236 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
2265 * For non-fast symlinks, we just allocate inode and put it on
2266 * orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
2237retry: 2281retry:
2238 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2239 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2240 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2241 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2242 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2243 2285
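
The two credit branches above can be made concrete with sample numbers; with quotas disabled, for instance, the slow-symlink transaction needs only the four metadata blocks named in the comment. A sketch of the arithmetic (all macro values are illustrative placeholders; the real ones depend on blocksize, quota and extents configuration):

/* Worked example of the symlink journal-credit split. Macro values
 * are placeholders, not the real configuration-dependent ones. */
#include <stdio.h>

#define DATA_TRANS_BLOCKS      27 /* assumed EXT4_DATA_TRANS_BLOCKS(sb) */
#define INDEX_EXTRA_TRANS      12 /* assumed EXT4_INDEX_EXTRA_TRANS_BLOCKS */
#define MAXQUOTAS_INIT_BLOCKS   0 /* assumed: quotas disabled */
#define N_BLOCKS               15 /* EXT4_N_BLOCKS: 12 direct + 3 indirect */

int main(void)
{
	int l = 200; /* symlink target length in bytes */
	int credits;

	if (l > N_BLOCKS * 4) /* > 60 bytes: slow symlink, data written later */
		credits = 4 + MAXQUOTAS_INIT_BLOCKS;
	else                  /* fast symlink: target stored in the inode */
		credits = DATA_TRANS_BLOCKS + INDEX_EXTRA_TRANS + 3 +
			  MAXQUOTAS_INIT_BLOCKS;
	printf("journal credits: %d\n", credits); /* -> 4 for l = 200 */
	return 0;
}
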
@@ -2250,21 +2292,44 @@ retry:
2250 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2251 goto out_stop; 2293 goto out_stop;
2252 2294
2253 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2254 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2255 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2256 /* 2298 /*
2257 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2258 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2259 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
2302 * and thus we deadlock. So we have to stop the transaction now
2303 * and restart it once the symlink contents have been written.
2304 *
2305 * To keep the fs consistent in case of a crash, we have to put the
2306 * inode on the orphan list in the meantime.
2260 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2261 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
2317 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2262 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2263 clear_nlink(inode); 2331 clear_nlink(inode);
2264 unlock_new_inode(inode); 2332 goto err_drop_inode;
2265 ext4_mark_inode_dirty(handle, inode);
2266 iput(inode);
2267 goto out_stop;
2268 } 2333 }
2269 } else { 2334 } else {
2270 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2280,6 +2345,10 @@ out_stop:
2280 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2281 goto retry; 2346 goto retry;
2282 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2283} 2352}
2284 2353
2285static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
@@ -2294,13 +2363,6 @@ static int ext4_link(struct dentry *old_dentry,
2294 2363
2295 dquot_initialize(dir); 2364 dquot_initialize(dir);
2296 2365
2297 /*
2298 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2299 * otherwise has the potential to corrupt the orphan inode list.
2300 */
2301 if (inode->i_nlink == 0)
2302 return -ENOENT;
2303
2304retry: 2366retry:
2305 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2367 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2306 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2368 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
@@ -2312,7 +2374,7 @@ retry:
2312 2374
2313 inode->i_ctime = ext4_current_time(inode); 2375 inode->i_ctime = ext4_current_time(inode);
2314 ext4_inc_count(handle, inode); 2376 ext4_inc_count(handle, inode);
2315 atomic_inc(&inode->i_count); 2377 ihold(inode);
2316 2378
2317 err = ext4_add_entry(handle, dentry, inode); 2379 err = ext4_add_entry(handle, dentry, inode);
2318 if (!err) { 2380 if (!err) {
@@ -2399,6 +2461,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2399 if (!new_inode && new_dir != old_dir && 2461 if (!new_inode && new_dir != old_dir &&
2400 EXT4_DIR_LINK_MAX(new_dir)) 2462 EXT4_DIR_LINK_MAX(new_dir))
2401 goto end_rename; 2463 goto end_rename;
2464 BUFFER_TRACE(dir_bh, "get_write_access");
2465 retval = ext4_journal_get_write_access(handle, dir_bh);
2466 if (retval)
2467 goto end_rename;
2402 } 2468 }
2403 if (!new_bh) { 2469 if (!new_bh) {
2404 retval = ext4_add_entry(handle, new_dentry, old_inode); 2470 retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2406,7 +2472,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2406 goto end_rename; 2472 goto end_rename;
2407 } else { 2473 } else {
2408 BUFFER_TRACE(new_bh, "get write access"); 2474 BUFFER_TRACE(new_bh, "get write access");
2409 ext4_journal_get_write_access(handle, new_bh); 2475 retval = ext4_journal_get_write_access(handle, new_bh);
2476 if (retval)
2477 goto end_rename;
2410 new_de->inode = cpu_to_le32(old_inode->i_ino); 2478 new_de->inode = cpu_to_le32(old_inode->i_ino);
2411 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2479 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2412 EXT4_FEATURE_INCOMPAT_FILETYPE)) 2480 EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2416,7 +2484,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2416 ext4_current_time(new_dir); 2484 ext4_current_time(new_dir);
2417 ext4_mark_inode_dirty(handle, new_dir); 2485 ext4_mark_inode_dirty(handle, new_dir);
2418 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2486 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2419 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2487 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2488 if (unlikely(retval)) {
2489 ext4_std_error(new_dir->i_sb, retval);
2490 goto end_rename;
2491 }
2420 brelse(new_bh); 2492 brelse(new_bh);
2421 new_bh = NULL; 2493 new_bh = NULL;
2422 } 2494 }
@@ -2463,12 +2535,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2463 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 2535 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2464 ext4_update_dx_flag(old_dir); 2536 ext4_update_dx_flag(old_dir);
2465 if (dir_bh) { 2537 if (dir_bh) {
2466 BUFFER_TRACE(dir_bh, "get_write_access");
2467 ext4_journal_get_write_access(handle, dir_bh);
2468 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2538 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2469 cpu_to_le32(new_dir->i_ino); 2539 cpu_to_le32(new_dir->i_ino);
2470 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2540 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2471 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2541 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2542 if (retval) {
2543 ext4_std_error(old_dir->i_sb, retval);
2544 goto end_rename;
2545 }
2472 ext4_dec_count(handle, old_dir); 2546 ext4_dec_count(handle, old_dir);
2473 if (new_inode) { 2547 if (new_inode) {
2474 /* checked empty_dir above, can't have another parent, 2548 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..7bb8f76d470a
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,417 @@
1/*
2 * linux/fs/ext4/page-io.c
3 *
4 * This contains the new page_io functions for ext4
5 *
6 * Written by Theodore Ts'o, 2010.
7 */
8
9#include <linux/module.h>
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/jbd2.h>
13#include <linux/highuid.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/string.h>
17#include <linux/buffer_head.h>
18#include <linux/writeback.h>
19#include <linux/pagevec.h>
20#include <linux/mpage.h>
21#include <linux/namei.h>
22#include <linux/uio.h>
23#include <linux/bio.h>
24#include <linux/workqueue.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27
28#include "ext4_jbd2.h"
29#include "xattr.h"
30#include "acl.h"
31#include "ext4_extents.h"
32
33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34
35int __init ext4_init_pageio(void)
36{
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL)
39 return -ENOMEM;
40 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
41 if (io_end_cachep == NULL) {
42 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM;
44 }
45 return 0;
46}
47
48void ext4_exit_pageio(void)
49{
50 kmem_cache_destroy(io_end_cachep);
51 kmem_cache_destroy(io_page_cachep);
52}
53
54void ext4_ioend_wait(struct inode *inode)
55{
56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
57
58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
59}
60
61static void put_io_page(struct ext4_io_page *io_page)
62{
63 if (atomic_dec_and_test(&io_page->p_count)) {
64 end_page_writeback(io_page->p_page);
65 put_page(io_page->p_page);
66 kmem_cache_free(io_page_cachep, io_page);
67 }
68}
69
70void ext4_free_io_end(ext4_io_end_t *io)
71{
72 int i;
73 wait_queue_head_t *wq;
74
75 BUG_ON(!io);
76 if (io->page)
77 put_page(io->page);
78 for (i = 0; i < io->num_io_pages; i++)
79 put_io_page(io->pages[i]);
80 io->num_io_pages = 0;
81 wq = ext4_ioend_wq(io->inode);
82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
83 waitqueue_active(wq))
84 wake_up_all(wq);
85 kmem_cache_free(io_end_cachep, io);
86}
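
ext4_ioend_wait() and ext4_free_io_end() form a drain pattern: an atomic count of in-flight io_end structures plus a waitqueue kicked when the count reaches zero. A minimal pthread rendering of the same idiom (names are mine, not ext4 API):

/* The i_ioend_count drain idiom in userspace terms: a counter plus a
 * condition variable signalled when it reaches zero. */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  zero = PTHREAD_COND_INITIALIZER;
static int ioend_count;

static void ioend_get(void)
{
	pthread_mutex_lock(&lock);
	ioend_count++;                         /* atomic_inc(&i_ioend_count) */
	pthread_mutex_unlock(&lock);
}

static void ioend_put(void)
{
	pthread_mutex_lock(&lock);
	if (--ioend_count == 0)                /* atomic_dec_and_test(...) */
		pthread_cond_broadcast(&zero); /* wake_up_all(wq) */
	pthread_mutex_unlock(&lock);
}

static void ioend_wait(void)               /* ext4_ioend_wait(inode) */
{
	pthread_mutex_lock(&lock);
	while (ioend_count != 0)
		pthread_cond_wait(&zero, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	ioend_get();
	ioend_put();  /* count back to zero, waiters released */
	ioend_wait(); /* returns immediately */
	return 0;
}
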
87
88/*
89 * check a range of space and convert unwritten extents to written.
90 */
91int ext4_end_io_nolock(ext4_io_end_t *io)
92{
93 struct inode *inode = io->inode;
94 loff_t offset = io->offset;
95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
97 int ret = 0;
98
99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
100 "list->prev 0x%p\n",
101 io, inode->i_ino, io->list.next, io->list.prev);
102
103 if (list_empty(&io->list))
104 return ret;
105
106 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
107 return ret;
108
109 ret = ext4_convert_unwritten_extents(inode, offset, size);
110 if (ret < 0) {
111 printk(KERN_EMERG "%s: failed to convert unwritten "
112 "extents to written extents, error is %d "
113 "io is still on inode %lu aio dio list\n",
114 __func__, ret, inode->i_ino);
115 return ret;
116 }
117
118 if (io->iocb)
119 aio_complete(io->iocb, io->result, 0);
120 /* clear the DIO AIO unwritten flag */
121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
131 return ret;
132}
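
ext4_end_io_nolock() is the tail end of extent-based AIO/DIO: preallocated extents stay flagged unwritten so stale block contents are never exposed, and only after the data I/O completes are they converted to written. The invariant is visible from userspace, assuming Linux fallocate(2) on an ext4 mount:

/* Userspace view of unwritten extents: preallocated blocks read back
 * as zeros until real data is written and the extent is converted,
 * which is what ext4_end_io_nolock() finishes for AIO/DIO writes. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[16];
	int fd = open("/tmp/unwritten-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || fallocate(fd, 0, 0, 4096) != 0)
		return 1; /* needs a supporting fs such as ext4 */

	/* The extent exists but is unwritten: reads see zeros, never
	 * whatever the underlying blocks previously contained. */
	if (pread(fd, buf, sizeof(buf), 0) != sizeof(buf))
		return 1;
	printf("preallocated byte 0 = %d\n", buf[0]); /* prints 0 */

	pwrite(fd, "data", 4, 0); /* conversion happens on writeback */
	close(fd);
	return 0;
}
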
133
134/*
135 * Work on completed AIO DIO requests, converting unwritten extents to written extents
136 */
137static void ext4_end_io_work(struct work_struct *work)
138{
139 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
140 struct inode *inode = io->inode;
141 struct ext4_inode_info *ei = EXT4_I(inode);
142 unsigned long flags;
143 int ret;
144
145 mutex_lock(&inode->i_mutex);
146 ret = ext4_end_io_nolock(io);
147 if (ret < 0) {
148 mutex_unlock(&inode->i_mutex);
149 return;
150 }
151
152 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
153 if (!list_empty(&io->list))
154 list_del_init(&io->list);
155 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
156 mutex_unlock(&inode->i_mutex);
157 ext4_free_io_end(io);
158}
159
160ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
161{
162 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
163 if (io) {
164 atomic_inc(&EXT4_I(inode)->i_ioend_count);
165 io->inode = inode;
166 INIT_WORK(&io->work, ext4_end_io_work);
167 INIT_LIST_HEAD(&io->list);
168 }
169 return io;
170}
171
172/*
173 * Print a buffer I/O error in the same format as fs/buffer.c. This
174 * provides compatibility with dmesg scrapers that look for a specific
175 * buffer I/O error message. We really need a unified error reporting
176 * structure to userspace ala Digital Unix's uerf system, but it's
177 * probably not going to happen in my lifetime, due to LKML politics...
178 */
179static void buffer_io_error(struct buffer_head *bh)
180{
181 char b[BDEVNAME_SIZE];
182 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
183 bdevname(bh->b_bdev, b),
184 (unsigned long long)bh->b_blocknr);
185}
186
187static void ext4_end_bio(struct bio *bio, int error)
188{
189 ext4_io_end_t *io_end = bio->bi_private;
190 struct workqueue_struct *wq;
191 struct inode *inode;
192 unsigned long flags;
193 int i;
194 sector_t bi_sector = bio->bi_sector;
195
196 BUG_ON(!io_end);
197 bio->bi_private = NULL;
198 bio->bi_end_io = NULL;
199 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
200 error = 0;
201 bio_put(bio);
202
203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head;
206 loff_t offset;
207 loff_t io_end_offset;
208
209 if (error) {
210 SetPageError(page);
211 set_bit(AS_EIO, &page->mapping->flags);
212 head = page_buffers(page);
213 BUG_ON(!head);
214
215 io_end_offset = io_end->offset + io_end->size;
216
217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
218 bh = head;
219 do {
220 if ((offset >= io_end->offset) &&
221 (offset+bh->b_size <= io_end_offset))
222 buffer_io_error(bh);
223
224 offset += bh->b_size;
225 bh = bh->b_this_page;
226 } while (bh != head);
227 }
228
229 put_io_page(io_end->pages[i]);
230 }
231 io_end->num_io_pages = 0;
232 inode = io_end->inode;
233
234 if (error) {
235 io_end->flag |= EXT4_IO_END_ERROR;
236 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
237 "(offset %llu size %ld starting block %llu)",
238 inode->i_ino,
239 (unsigned long long) io_end->offset,
240 (long) io_end->size,
241 (unsigned long long)
242 bi_sector >> (inode->i_blkbits - 9));
243 }
244
245 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
246 ext4_free_io_end(io_end);
247 return;
248 }
249
250 /* Add the io_end to the per-inode completed io list */
251 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
252 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
253 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
254
255 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
256 /* queue the work to convert unwritten extents to written */
257 queue_work(wq, &io_end->work);
258}
259
260void ext4_io_submit(struct ext4_io_submit *io)
261{
262 struct bio *bio = io->io_bio;
263
264 if (bio) {
265 bio_get(io->io_bio);
266 submit_bio(io->io_op, io->io_bio);
267 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
268 bio_put(io->io_bio);
269 }
270 io->io_bio = NULL;
271 io->io_op = 0;
272 io->io_end = NULL;
273}
274
275static int io_submit_init(struct ext4_io_submit *io,
276 struct inode *inode,
277 struct writeback_control *wbc,
278 struct buffer_head *bh)
279{
280 ext4_io_end_t *io_end;
281 struct page *page = bh->b_page;
282 int nvecs = bio_get_nr_vecs(bh->b_bdev);
283 struct bio *bio;
284
285 io_end = ext4_init_io_end(inode, GFP_NOFS);
286 if (!io_end)
287 return -ENOMEM;
288 do {
289 bio = bio_alloc(GFP_NOIO, nvecs);
290 nvecs >>= 1;
291 } while (bio == NULL);
292
293 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
294 bio->bi_bdev = bh->b_bdev;
295 bio->bi_private = io->io_end = io_end;
296 bio->bi_end_io = ext4_end_bio;
297
298 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
299
300 io->io_bio = bio;
301 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
302 io->io_next_block = bh->b_blocknr;
303 return 0;
304}
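
io_submit_init() asks bio_alloc() for bio_get_nr_vecs() slots and halves the request each time allocation fails, so a smaller bio can still be built under memory pressure; the kernel loop relies on the GFP_NOIO mempool eventually succeeding. The same degrade-and-retry shape in standalone C (illustrative, not kernel API; unlike the kernel, this version bails out at zero):

/* Degrade-and-retry allocation: ask for the ideal number of slots,
 * halve on failure. */
#include <stdio.h>
#include <stdlib.h>

struct vecbuf {
	int nvecs;
	/* payload would follow */
};

static struct vecbuf *alloc_vecbuf(int nvecs)
{
	struct vecbuf *b;

	do {
		b = malloc(sizeof(*b) + (size_t)nvecs * 64);
		if (b)
			b->nvecs = nvecs;
		nvecs >>= 1; /* ask for half as much next time */
	} while (b == NULL && nvecs > 0);
	return b;
}

int main(void)
{
	struct vecbuf *b = alloc_vecbuf(256);

	if (b)
		printf("got %d vecs\n", b->nvecs);
	free(b);
	return 0;
}
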
305
306static int io_submit_add_bh(struct ext4_io_submit *io,
307 struct ext4_io_page *io_page,
308 struct inode *inode,
309 struct writeback_control *wbc,
310 struct buffer_head *bh)
311{
312 ext4_io_end_t *io_end;
313 int ret;
314
315 if (buffer_new(bh)) {
316 clear_buffer_new(bh);
317 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
318 }
319
320 if (!buffer_mapped(bh) || buffer_delay(bh)) {
321 if (!buffer_mapped(bh))
322 clear_buffer_dirty(bh);
323 if (io->io_bio)
324 ext4_io_submit(io);
325 return 0;
326 }
327
328 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
329submit_and_retry:
330 ext4_io_submit(io);
331 }
332 if (io->io_bio == NULL) {
333 ret = io_submit_init(io, inode, wbc, bh);
334 if (ret)
335 return ret;
336 }
337 io_end = io->io_end;
338 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
339 (io_end->pages[io_end->num_io_pages-1] != io_page))
340 goto submit_and_retry;
341 if (buffer_uninit(bh))
342 io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
343 io->io_end->size += bh->b_size;
344 io->io_next_block++;
345 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
346 if (ret != bh->b_size)
347 goto submit_and_retry;
348 if ((io_end->num_io_pages == 0) ||
349 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
350 io_end->pages[io_end->num_io_pages++] = io_page;
351 atomic_inc(&io_page->p_count);
352 }
353 return 0;
354}
355
356int ext4_bio_write_page(struct ext4_io_submit *io,
357 struct page *page,
358 int len,
359 struct writeback_control *wbc)
360{
361 struct inode *inode = page->mapping->host;
362 unsigned block_start, block_end, blocksize;
363 struct ext4_io_page *io_page;
364 struct buffer_head *bh, *head;
365 int ret = 0;
366
367 blocksize = 1 << inode->i_blkbits;
368
369 BUG_ON(!PageLocked(page));
370 BUG_ON(PageWriteback(page));
371
372 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
373 if (!io_page) {
374 set_page_dirty(page);
375 unlock_page(page);
376 return -ENOMEM;
377 }
378 io_page->p_page = page;
379 atomic_set(&io_page->p_count, 1);
380 get_page(page);
381 set_page_writeback(page);
382 ClearPageError(page);
383
384 for (bh = head = page_buffers(page), block_start = 0;
385 bh != head || !block_start;
386 block_start = block_end, bh = bh->b_this_page) {
387
388 block_end = block_start + blocksize;
389 if (block_start >= len) {
390 clear_buffer_dirty(bh);
391 set_buffer_uptodate(bh);
392 continue;
393 }
394 clear_buffer_dirty(bh);
395 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
396 if (ret) {
397 /*
398 * We only get here on ENOMEM. Not much else
399 * we can do but mark the page as dirty, and
400 * better luck next time.
401 */
402 set_page_dirty(page);
403 break;
404 }
405 }
406 unlock_page(page);
407 /*
408 * If the page was truncated before we could do the writeback,
409 * or we had a memory allocation error while trying to write
410 * the first buffer head, we won't have submitted any pages for
411 * I/O. In that case we need to make sure we've cleared the
412 * PageWriteback bit from the page to prevent the system from
413 * wedging later on.
414 */
415 put_io_page(io_page);
416 return ret;
417}
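The io_page reference count is what makes the closing comment work: the submitter takes the initial reference, each bio covering part of the page takes another, and whoever drops the last one ends writeback. So a page with nothing submitted (truncated, or ENOMEM on the first buffer) gets its writeback bit cleared by the final put_io_page() right here. A compact standalone sketch of that lifecycle (the harness is mine; only the counting mirrors the code above):

/* Sketch of the ext4_io_page lifecycle: submitter holds one reference,
 * each in-flight bio holds one more, the last put ends writeback. */
#include <stdio.h>

struct io_page {
	int p_count;
	int writeback; /* stands in for PageWriteback */
};

static void put_io_page(struct io_page *p)
{
	if (--p->p_count == 0) {
		p->writeback = 0; /* end_page_writeback() */
		printf("writeback ended\n");
	}
}

int main(void)
{
	struct io_page page = { .p_count = 1, .writeback = 1 };
	int bios_submitted = 0; /* e.g. page truncated: no buffers queued */
	int i;

	for (i = 0; i < bios_submitted; i++)
		page.p_count++;     /* io_submit_add_bh() takes a reference */

	put_io_page(&page);         /* submitter's put: last ref here */
	for (i = 0; i < bios_submitted; i++)
		put_io_page(&page); /* bio completion path */
	return 0;
}
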
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..80bbc9c60c24 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,29 +220,25 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
227 231
228 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
229 for (i = 0, bit = gdblocks + 1, block = start + bit; 233 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
230 i < reserved_gdb; i++, block++, bit++) { 234 block, sbi->s_itb_per_group);
231 struct buffer_head *gdb; 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
232 236 GFP_NOFS);
233 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); 237 if (err)
234 238 goto exit_bh;
235 if ((err = extend_or_restart_transaction(handle, 1, bh))) 239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
236 goto exit_bh;
237
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239 err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_handle_dirty_metadata(handle, NULL, gdb);
243 ext4_set_bit(bit, bh->b_data); 240 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb); 241
245 }
246 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
247 input->block_bitmap - start); 243 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data); 244 ext4_set_bit(input->block_bitmap - start, bh->b_data);
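
The rewritten loops above replace per-block bclean() plus journalled metadata writes with one sb_issue_zeroout() call per region, pushing a single zeroing request down to the block layer instead of journalling pages of zeroes; only the bitmap bits are still set one at a time. Schematically (the names below are stand-ins, not kernel API):

/* Contrast of the two zeroing strategies. zero_range() stands in for
 * sb_issue_zeroout(); write_block() for the old per-block path. */
#include <string.h>

#define BLKSZ 4096

static unsigned char disk[1024 * BLKSZ]; /* toy "device" */

static void write_block(unsigned long blk) /* old: one write per block */
{
	memset(disk + blk * BLKSZ, 0, BLKSZ);
}

static void zero_range(unsigned long blk, unsigned long n) /* new path */
{
	memset(disk + blk * BLKSZ, 0, n * BLKSZ);
}

int main(void)
{
	unsigned long i;

	for (i = 0; i < 128; i++) /* old: 128 journalled block writes */
		write_block(i);
	zero_range(0, 128);       /* new: a single range operation */
	return 0;
}
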
@@ -251,29 +247,26 @@ static int setup_new_group_blocks(struct super_block *sb,
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 247 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252 248
253 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start; 250 block = input->inode_table;
255 i < sbi->s_itb_per_group; i++, bit++, block++) { 251 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
256 struct buffer_head *it; 252 block, sbi->s_itb_per_group);
257 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
258 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); 254 if (err)
259 255 goto exit_bh;
260 if ((err = extend_or_restart_transaction(handle, 1, bh))) 256 for (i = 0, bit = input->inode_table - start;
261 goto exit_bh; 257 i < sbi->s_itb_per_group; i++, bit++)
262
263 if (IS_ERR(it = bclean(handle, sb, block))) {
264 err = PTR_ERR(it);
265 goto exit_bh;
266 }
267 ext4_handle_dirty_metadata(handle, NULL, it);
268 brelse(it);
269 ext4_set_bit(bit, bh->b_data); 258 ext4_set_bit(bit, bh->b_data);
270 }
271 259
272 if ((err = extend_or_restart_transaction(handle, 2, bh))) 260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
273 goto exit_bh; 261 goto exit_bh;
274 262
275 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
276 ext4_handle_dirty_metadata(handle, NULL, bh); 264 bh->b_data);
265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
277 brelse(bh); 270 brelse(bh);
278 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
279 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -283,9 +276,11 @@ static int setup_new_group_blocks(struct super_block *sb,
283 goto exit_journal; 276 goto exit_journal;
284 } 277 }
285 278
286 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
287 bh->b_data); 280 bh->b_data);
288 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
289exit_bh: 284exit_bh:
290 brelse(bh); 285 brelse(bh);
291 286
@@ -437,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
437 goto exit_dind; 432 goto exit_dind;
438 } 433 }
439 434
440 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
441 goto exit_dind; 437 goto exit_dind;
442 438
443 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
444 goto exit_sbh; 441 goto exit_sbh;
445 442
446 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
447 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
448 446
449 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
450 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
451 goto exit_dindj; 450 goto exit_dindj;
452 451
453 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -469,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
469 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
470 */ 469 */
471 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
472 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
473 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
474 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
475 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
476 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
477 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
478 485
479 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
480 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -485,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
485 kfree(o_group_desc); 492 kfree(o_group_desc);
486 493
487 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
488 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
489 498
490 return 0; 499 return err;
491 500
492exit_inode: 501exit_inode:
493 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_handle_release_buffer(handle, iloc.bh); */
494 brelse(iloc.bh); 503 brelse(iloc.bh);
495exit_dindj: 504exit_dindj:
496 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_handle_release_buffer(handle, dind); */
497exit_primary:
498 /* ext4_journal_release_buffer(handle, *primary); */
499exit_sbh: 506exit_sbh:
500 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
501exit_dind: 508exit_dind:
502 brelse(dind); 509 brelse(dind);
503exit_bh: 510exit_bh:
@@ -579,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
579 /* 586 /*
580 int j; 587 int j;
581 for (j = 0; j < i; j++) 588 for (j = 0; j < i; j++)
582 ext4_journal_release_buffer(handle, primary[j]); 589 ext4_handle_release_buffer(handle, primary[j]);
583 */ 590 */
584 goto exit_bh; 591 goto exit_bh;
585 } 592 }
@@ -680,7 +687,9 @@ static void update_backups(struct super_block *sb,
680 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
681 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
682 unlock_buffer(bh); 689 unlock_buffer(bh);
683 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
684 brelse(bh); 693 brelse(bh);
685 } 694 }
686 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -898,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
898 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
899 sbi->s_groups_count++; 908 sbi->s_groups_count++;
900 909
901 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
902 915
903 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
904 * active. */ 917 * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..9ea71aa864b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -39,8 +38,12 @@
39#include <linux/ctype.h> 38#include <linux/ctype.h>
40#include <linux/log2.h> 39#include <linux/log2.h>
41#include <linux/crc16.h> 40#include <linux/crc16.h>
41#include <linux/cleancache.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43 43
44#include <linux/kthread.h>
45#include <linux/freezer.h>
46
44#include "ext4.h" 47#include "ext4.h"
45#include "ext4_jbd2.h" 48#include "ext4_jbd2.h"
46#include "xattr.h" 49#include "xattr.h"
@@ -50,8 +53,11 @@
50#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 54#include <trace/events/ext4.h>
52 55
53struct proc_dir_entry *ext4_proc_root; 56static struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 57static struct kset *ext4_kset;
58static struct ext4_lazy_init *ext4_li_info;
59static struct mutex ext4_li_mtx;
60static struct ext4_features *ext4_feat;
55 61
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 62static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 63 unsigned long journal_devnum);
@@ -68,14 +74,34 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 74static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 75static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 76static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 78 const char *dev_name, void *data);
79static inline int ext2_feature_set_ok(struct super_block *sb);
80static inline int ext3_feature_set_ok(struct super_block *sb);
81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
82static void ext4_destroy_lazyinit_thread(void);
83static void ext4_unregister_li_request(struct super_block *sb);
84static void ext4_clear_request_list(void);
85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88 .owner = THIS_MODULE,
89 .name = "ext2",
90 .mount = ext4_mount,
91 .kill_sb = kill_block_super,
92 .fs_flags = FS_REQUIRES_DEV,
93};
94#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
95#else
96#define IS_EXT2_SB(sb) (0)
97#endif
98
73 99
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 100#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 101static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE, 102 .owner = THIS_MODULE,
77 .name = "ext3", 103 .name = "ext3",
78 .get_sb = ext4_get_sb, 104 .mount = ext4_mount,
79 .kill_sb = kill_block_super, 105 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV, 106 .fs_flags = FS_REQUIRES_DEV,
81}; 107};
@@ -233,27 +259,44 @@ static void ext4_put_nojournal(handle_t *handle)
233 * journal_end calls result in the superblock being marked dirty, so 259 * journal_end calls result in the superblock being marked dirty, so
234 * that sync() will call the filesystem's write_super callback if 260 * that sync() will call the filesystem's write_super callback if
235 * appropriate. 261 * appropriate.
262 *
263 * To avoid j_barrier being held while userspace has the filesystem
264 * frozen via freeze(), ext4 prevents new handles from being started
265 * through s_frozen, which is maintained in an upper (VFS) layer.
236 */ 266 */
237handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 267handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
238{ 268{
239 journal_t *journal; 269 journal_t *journal;
270 handle_t *handle;
240 271
241 if (sb->s_flags & MS_RDONLY) 272 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 273 return ERR_PTR(-EROFS);
243 274
244 vfs_check_frozen(sb, SB_FREEZE_TRANS);
245 /* Special case here: if the journal has aborted behind our
246 * backs (eg. EIO in the commit thread), then we still need to
247 * take the FS itself readonly cleanly. */
248 journal = EXT4_SB(sb)->s_journal; 275 journal = EXT4_SB(sb)->s_journal;
249 if (journal) { 276 handle = ext4_journal_current_handle();
250 if (is_journal_aborted(journal)) { 277
251 ext4_abort(sb, "Detected aborted journal"); 278 /*
252 return ERR_PTR(-EROFS); 279 * If a handle has been started, it should be allowed to
253 } 280 * finish, otherwise deadlock could happen between freeze
254 return jbd2_journal_start(journal, nblocks); 281 * and others (e.g. truncate) due to the restart of the
282 * journal handle if the filesystem is frozen and active
283 * handles are not stopped.
284 */
285 if (!handle)
286 vfs_check_frozen(sb, SB_FREEZE_TRANS);
287
288 if (!journal)
289 return ext4_get_nojournal();
290 /*
291 * Special case here: if the journal has aborted behind our
292 * backs (eg. EIO in the commit thread), then we still need to
293 * take the FS itself readonly cleanly.
294 */
295 if (is_journal_aborted(journal)) {
296 ext4_abort(sb, "Detected aborted journal");
297 return ERR_PTR(-EROFS);
255 } 298 }
256 return ext4_get_nojournal(); 299 return jbd2_journal_start(journal, nblocks);
257} 300}
258 301
259/* 302/*
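
The reordering in ext4_journal_start_sb() fixes a freeze deadlock: the vfs_check_frozen() wait is now skipped when the task already owns a handle, because blocking a nested start while freeze() waits for the journal to drain could deadlock against operations such as truncate. A sketch of the caller pattern being protected, with the credit count and error handling purely illustrative:

	handle_t *handle = ext4_journal_start_sb(sb, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* metadata updates; a nested ext4_journal_start_sb() issued
	 * from here must not block on s_frozen */
	err = ext4_journal_stop(handle);
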
@@ -381,13 +424,14 @@ static void ext4_handle_error(struct super_block *sb)
381void __ext4_error(struct super_block *sb, const char *function, 424void __ext4_error(struct super_block *sb, const char *function,
382 unsigned int line, const char *fmt, ...) 425 unsigned int line, const char *fmt, ...)
383{ 426{
427 struct va_format vaf;
384 va_list args; 428 va_list args;
385 429
386 va_start(args, fmt); 430 va_start(args, fmt);
387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 431 vaf.fmt = fmt;
388 sb->s_id, function, line, current->comm); 432 vaf.va = &args;
389 vprintk(fmt, args); 433 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
390 printk("\n"); 434 sb->s_id, function, line, current->comm, &vaf);
391 va_end(args); 435 va_end(args);
392 436
393 ext4_handle_error(sb); 437 ext4_handle_error(sb);
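
The struct va_format/%pV conversion collapses what used to be two or three printk() calls into a single one, so messages from concurrent CPUs can no longer interleave mid-line. A minimal sketch of the same pattern, with my_warn() and its prefix as placeholder names:

	static void my_warn(const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* %pV expands the embedded format string and va_list */
		printk(KERN_WARNING "mydev: %pV\n", &vaf);
		va_end(args);
	}
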
@@ -398,28 +442,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
398 const char *fmt, ...) 442 const char *fmt, ...)
399{ 443{
400 va_list args; 444 va_list args;
445 struct va_format vaf;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 446 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
402 447
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 448 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block); 449 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line); 450 save_error_info(inode->i_sb, function, line);
406 va_start(args, fmt); 451 va_start(args, fmt);
452 vaf.fmt = fmt;
453 vaf.va = &args;
407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 454 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
408 inode->i_sb->s_id, function, line, inode->i_ino); 455 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block) 456 if (block)
410 printk("block %llu: ", block); 457 printk(KERN_CONT "block %llu: ", block);
411 printk("comm %s: ", current->comm); 458 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
412 vprintk(fmt, args);
413 printk("\n");
414 va_end(args); 459 va_end(args);
415 460
416 ext4_handle_error(inode->i_sb); 461 ext4_handle_error(inode->i_sb);
417} 462}
418 463
419void ext4_error_file(struct file *file, const char *function, 464void ext4_error_file(struct file *file, const char *function,
420 unsigned int line, const char *fmt, ...) 465 unsigned int line, ext4_fsblk_t block,
466 const char *fmt, ...)
421{ 467{
422 va_list args; 468 va_list args;
469 struct va_format vaf;
423 struct ext4_super_block *es; 470 struct ext4_super_block *es;
424 struct inode *inode = file->f_dentry->d_inode; 471 struct inode *inode = file->f_dentry->d_inode;
425 char pathname[80], *path; 472 char pathname[80], *path;
@@ -427,17 +474,18 @@ void ext4_error_file(struct file *file, const char *function,
427 es = EXT4_SB(inode->i_sb)->s_es; 474 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 475 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line); 476 save_error_info(inode->i_sb, function, line);
430 va_start(args, fmt);
431 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 477 path = d_path(&(file->f_path), pathname, sizeof(pathname));
432 if (!path) 478 if (IS_ERR(path))
433 path = "(unknown)"; 479 path = "(unknown)";
434 printk(KERN_CRIT 480 printk(KERN_CRIT
435 "EXT4-fs error (device %s): %s:%d: inode #%lu " 481 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
436 "(comm %s path %s): ", 482 inode->i_sb->s_id, function, line, inode->i_ino);
437 inode->i_sb->s_id, function, line, inode->i_ino, 483 if (block)
438 current->comm, path); 484 printk(KERN_CONT "block %llu: ", block);
439 vprintk(fmt, args); 485 va_start(args, fmt);
440 printk("\n"); 486 vaf.fmt = fmt;
487 vaf.va = &args;
488 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
441 va_end(args); 489 va_end(args);
442 490
443 ext4_handle_error(inode->i_sb); 491 ext4_handle_error(inode->i_sb);
@@ -536,28 +584,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
536 panic("EXT4-fs panic from previous error\n"); 584 panic("EXT4-fs panic from previous error\n");
537} 585}
538 586
539void ext4_msg (struct super_block * sb, const char *prefix, 587void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
540 const char *fmt, ...)
541{ 588{
589 struct va_format vaf;
542 va_list args; 590 va_list args;
543 591
544 va_start(args, fmt); 592 va_start(args, fmt);
545 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 593 vaf.fmt = fmt;
546 vprintk(fmt, args); 594 vaf.va = &args;
547 printk("\n"); 595 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
548 va_end(args); 596 va_end(args);
549} 597}
550 598
551void __ext4_warning(struct super_block *sb, const char *function, 599void __ext4_warning(struct super_block *sb, const char *function,
552 unsigned int line, const char *fmt, ...) 600 unsigned int line, const char *fmt, ...)
553{ 601{
602 struct va_format vaf;
554 va_list args; 603 va_list args;
555 604
556 va_start(args, fmt); 605 va_start(args, fmt);
557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 606 vaf.fmt = fmt;
558 sb->s_id, function, line); 607 vaf.va = &args;
559 vprintk(fmt, args); 608 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
560 printk("\n"); 609 sb->s_id, function, line, &vaf);
561 va_end(args); 610 va_end(args);
562} 611}
563 612
@@ -568,21 +617,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
568__releases(bitlock) 617__releases(bitlock)
569__acquires(bitlock) 618__acquires(bitlock)
570{ 619{
620 struct va_format vaf;
571 va_list args; 621 va_list args;
572 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 622 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
573 623
574 es->s_last_error_ino = cpu_to_le32(ino); 624 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block); 625 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line); 626 __save_error_info(sb, function, line);
627
577 va_start(args, fmt); 628 va_start(args, fmt);
578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 629
630 vaf.fmt = fmt;
631 vaf.va = &args;
632 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
579 sb->s_id, function, line, grp); 633 sb->s_id, function, line, grp);
580 if (ino) 634 if (ino)
581 printk("inode %lu: ", ino); 635 printk(KERN_CONT "inode %lu: ", ino);
582 if (block) 636 if (block)
583 printk("block %llu:", (unsigned long long) block); 637 printk(KERN_CONT "block %llu:", (unsigned long long) block);
584 vprintk(fmt, args); 638 printk(KERN_CONT "%pV\n", &vaf);
585 printk("\n");
586 va_end(args); 639 va_end(args);
587 640
588 if (test_opt(sb, ERRORS_CONT)) { 641 if (test_opt(sb, ERRORS_CONT)) {
@@ -598,7 +651,7 @@ __acquires(bitlock)
598 * filesystem will have already been marked read/only and the 651 * filesystem will have already been marked read/only and the
599 * journal has been aborted. We return 1 as a hint to callers 652 * journal has been aborted. We return 1 as a hint to callers
600 * who might want to use the return value from 653 * who might want to use the return value from
601 * ext4_grp_locked_error() to distinguish beween the 654 * ext4_grp_locked_error() to distinguish between the
602 * ERRORS_CONT and ERRORS_RO case, and perhaps return more 655 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
603 * aggressively from the ext4 function in question, with a 656 * aggressively from the ext4 function in question, with a
604 * more appropriate error code. 657 * more appropriate error code.
@@ -640,7 +693,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
640 struct block_device *bdev; 693 struct block_device *bdev;
641 char b[BDEVNAME_SIZE]; 694 char b[BDEVNAME_SIZE];
642 695
643 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 696 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
644 if (IS_ERR(bdev)) 697 if (IS_ERR(bdev))
645 goto fail; 698 goto fail;
646 return bdev; 699 return bdev;
@@ -656,8 +709,7 @@ fail:
656 */ 709 */
657static int ext4_blkdev_put(struct block_device *bdev) 710static int ext4_blkdev_put(struct block_device *bdev)
658{ 711{
659 bd_release(bdev); 712 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
660 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
661} 713}
662 714
663static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 715static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -702,13 +754,13 @@ static void ext4_put_super(struct super_block *sb)
702 struct ext4_super_block *es = sbi->s_es; 754 struct ext4_super_block *es = sbi->s_es;
703 int i, err; 755 int i, err;
704 756
757 ext4_unregister_li_request(sb);
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 758 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706 759
707 flush_workqueue(sbi->dio_unwritten_wq); 760 flush_workqueue(sbi->dio_unwritten_wq);
708 destroy_workqueue(sbi->dio_unwritten_wq); 761 destroy_workqueue(sbi->dio_unwritten_wq);
709 762
710 lock_super(sb); 763 lock_super(sb);
711 lock_kernel();
712 if (sb->s_dirt) 764 if (sb->s_dirt)
713 ext4_commit_super(sb, 1); 765 ext4_commit_super(sb, 1);
714 766
@@ -719,6 +771,7 @@ static void ext4_put_super(struct super_block *sb)
719 ext4_abort(sb, "Couldn't clean up the journal"); 771 ext4_abort(sb, "Couldn't clean up the journal");
720 } 772 }
721 773
774 del_timer(&sbi->s_err_report);
722 ext4_release_system_zone(sb); 775 ext4_release_system_zone(sb);
723 ext4_mb_release(sb); 776 ext4_mb_release(sb);
724 ext4_ext_release(sb); 777 ext4_ext_release(sb);
@@ -770,12 +823,13 @@ static void ext4_put_super(struct super_block *sb)
770 invalidate_bdev(sbi->journal_bdev); 823 invalidate_bdev(sbi->journal_bdev);
771 ext4_blkdev_remove(sbi); 824 ext4_blkdev_remove(sbi);
772 } 825 }
826 if (sbi->s_mmp_tsk)
827 kthread_stop(sbi->s_mmp_tsk);
773 sb->s_fs_info = NULL; 828 sb->s_fs_info = NULL;
774 /* 829 /*
775 * Now that we are completely done shutting down the 830 * Now that we are completely done shutting down the
776 * superblock, we need to actually destroy the kobject. 831 * superblock, we need to actually destroy the kobject.
777 */ 832 */
778 unlock_kernel();
779 unlock_super(sb); 833 unlock_super(sb);
780 kobject_put(&sbi->s_kobj); 834 kobject_put(&sbi->s_kobj);
781 wait_for_completion(&sbi->s_kobj_unregister); 835 wait_for_completion(&sbi->s_kobj_unregister);
@@ -801,32 +855,44 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
801 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 855 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
802 INIT_LIST_HEAD(&ei->i_prealloc_list); 856 INIT_LIST_HEAD(&ei->i_prealloc_list);
803 spin_lock_init(&ei->i_prealloc_lock); 857 spin_lock_init(&ei->i_prealloc_lock);
804 /*
805 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
806 * therefore it can be null here. Don't check it, just initialize
807 * jinode.
808 */
809 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
810 ei->i_reserved_data_blocks = 0; 858 ei->i_reserved_data_blocks = 0;
811 ei->i_reserved_meta_blocks = 0; 859 ei->i_reserved_meta_blocks = 0;
812 ei->i_allocated_meta_blocks = 0; 860 ei->i_allocated_meta_blocks = 0;
813 ei->i_da_metadata_calc_len = 0; 861 ei->i_da_metadata_calc_len = 0;
814 ei->i_delalloc_reserved_flag = 0;
815 spin_lock_init(&(ei->i_block_reservation_lock)); 862 spin_lock_init(&(ei->i_block_reservation_lock));
816#ifdef CONFIG_QUOTA 863#ifdef CONFIG_QUOTA
817 ei->i_reserved_quota = 0; 864 ei->i_reserved_quota = 0;
818#endif 865#endif
866 ei->jinode = NULL;
819 INIT_LIST_HEAD(&ei->i_completed_io_list); 867 INIT_LIST_HEAD(&ei->i_completed_io_list);
820 spin_lock_init(&ei->i_completed_io_lock); 868 spin_lock_init(&ei->i_completed_io_lock);
821 ei->cur_aio_dio = NULL; 869 ei->cur_aio_dio = NULL;
822 ei->i_sync_tid = 0; 870 ei->i_sync_tid = 0;
823 ei->i_datasync_tid = 0; 871 ei->i_datasync_tid = 0;
872 atomic_set(&ei->i_ioend_count, 0);
873 atomic_set(&ei->i_aiodio_unwritten, 0);
824 874
825 return &ei->vfs_inode; 875 return &ei->vfs_inode;
826} 876}
827 877
878static int ext4_drop_inode(struct inode *inode)
879{
880 int drop = generic_drop_inode(inode);
881
882 trace_ext4_drop_inode(inode, drop);
883 return drop;
884}
885
886static void ext4_i_callback(struct rcu_head *head)
887{
888 struct inode *inode = container_of(head, struct inode, i_rcu);
889 INIT_LIST_HEAD(&inode->i_dentry);
890 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
891}
892
828static void ext4_destroy_inode(struct inode *inode) 893static void ext4_destroy_inode(struct inode *inode)
829{ 894{
895 ext4_ioend_wait(inode);
830 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 896 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
831 ext4_msg(inode->i_sb, KERN_ERR, 897 ext4_msg(inode->i_sb, KERN_ERR,
832 "Inode %lu (%p): orphan list check failed!", 898 "Inode %lu (%p): orphan list check failed!",
@@ -836,7 +902,7 @@ static void ext4_destroy_inode(struct inode *inode)
836 true); 902 true);
837 dump_stack(); 903 dump_stack();
838 } 904 }
839 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 905 call_rcu(&inode->i_rcu, ext4_i_callback);
840} 906}
841 907
842static void init_once(void *foo) 908static void init_once(void *foo)
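
Freeing the inode via call_rcu() instead of an immediate kmem_cache_free() defers the release past an RCU grace period, which the VFS's lock-free RCU-walk path lookup depends on. The same pattern for a hypothetical filesystem (the myfs names are placeholders):

	static void myfs_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);
		kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
	}

	static void myfs_destroy_inode(struct inode *inode)
	{
		/* the object is reused only after current RCU readers finish */
		call_rcu(&inode->i_rcu, myfs_i_callback);
	}
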
@@ -874,9 +940,12 @@ void ext4_clear_inode(struct inode *inode)
874 end_writeback(inode); 940 end_writeback(inode);
875 dquot_drop(inode); 941 dquot_drop(inode);
876 ext4_discard_preallocations(inode); 942 ext4_discard_preallocations(inode);
877 if (EXT4_JOURNAL(inode)) 943 if (EXT4_I(inode)->jinode) {
878 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 944 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
879 &EXT4_I(inode)->jinode); 945 EXT4_I(inode)->jinode);
946 jbd2_free_inode(EXT4_I(inode)->jinode);
947 EXT4_I(inode)->jinode = NULL;
948 }
880} 949}
881 950
882static inline void ext4_show_quota_options(struct seq_file *seq, 951static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -965,13 +1034,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
965 if (test_opt(sb, OLDALLOC)) 1034 if (test_opt(sb, OLDALLOC))
966 seq_puts(seq, ",oldalloc"); 1035 seq_puts(seq, ",oldalloc");
967#ifdef CONFIG_EXT4_FS_XATTR 1036#ifdef CONFIG_EXT4_FS_XATTR
968 if (test_opt(sb, XATTR_USER) && 1037 if (test_opt(sb, XATTR_USER))
969 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
970 seq_puts(seq, ",user_xattr"); 1038 seq_puts(seq, ",user_xattr");
971 if (!test_opt(sb, XATTR_USER) && 1039 if (!test_opt(sb, XATTR_USER))
972 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
973 seq_puts(seq, ",nouser_xattr"); 1040 seq_puts(seq, ",nouser_xattr");
974 }
975#endif 1041#endif
976#ifdef CONFIG_EXT4_FS_POSIX_ACL 1042#ifdef CONFIG_EXT4_FS_POSIX_ACL
977 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 1043 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1009,6 +1075,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1009 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1075 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1010 seq_puts(seq, ",nodelalloc"); 1076 seq_puts(seq, ",nodelalloc");
1011 1077
1078 if (!test_opt(sb, MBLK_IO_SUBMIT))
1079 seq_puts(seq, ",nomblk_io_submit");
1012 if (sbi->s_stripe) 1080 if (sbi->s_stripe)
1013 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1081 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1014 /* 1082 /*
@@ -1045,6 +1113,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) 1113 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity"); 1114 seq_puts(seq, ",block_validity");
1047 1115
1116 if (!test_opt(sb, INIT_INODE_TABLE))
1117 seq_puts(seq, ",noinit_inode_table");
1118 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1119 seq_printf(seq, ",init_inode_table=%u",
1120 (unsigned) sbi->s_li_wait_mult);
1121
1048 ext4_show_quota_options(seq, sb); 1122 ext4_show_quota_options(seq, sb);
1049 1123
1050 return 0; 1124 return 0;
@@ -1123,7 +1197,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1123static int ext4_mark_dquot_dirty(struct dquot *dquot); 1197static int ext4_mark_dquot_dirty(struct dquot *dquot);
1124static int ext4_write_info(struct super_block *sb, int type); 1198static int ext4_write_info(struct super_block *sb, int type);
1125static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1199static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1126 char *path); 1200 struct path *path);
1127static int ext4_quota_off(struct super_block *sb, int type); 1201static int ext4_quota_off(struct super_block *sb, int type);
1128static int ext4_quota_on_mount(struct super_block *sb, int type); 1202static int ext4_quota_on_mount(struct super_block *sb, int type);
1129static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1203static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1132,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1132 const char *data, size_t len, loff_t off); 1206 const char *data, size_t len, loff_t off);
1133 1207
1134static const struct dquot_operations ext4_quota_operations = { 1208static const struct dquot_operations ext4_quota_operations = {
1135#ifdef CONFIG_QUOTA
1136 .get_reserved_space = ext4_get_reserved_space, 1209 .get_reserved_space = ext4_get_reserved_space,
1137#endif
1138 .write_dquot = ext4_write_dquot, 1210 .write_dquot = ext4_write_dquot,
1139 .acquire_dquot = ext4_acquire_dquot, 1211 .acquire_dquot = ext4_acquire_dquot,
1140 .release_dquot = ext4_release_dquot, 1212 .release_dquot = ext4_release_dquot,
@@ -1160,6 +1232,7 @@ static const struct super_operations ext4_sops = {
1160 .destroy_inode = ext4_destroy_inode, 1232 .destroy_inode = ext4_destroy_inode,
1161 .write_inode = ext4_write_inode, 1233 .write_inode = ext4_write_inode,
1162 .dirty_inode = ext4_dirty_inode, 1234 .dirty_inode = ext4_dirty_inode,
1235 .drop_inode = ext4_drop_inode,
1163 .evict_inode = ext4_evict_inode, 1236 .evict_inode = ext4_evict_inode,
1164 .put_super = ext4_put_super, 1237 .put_super = ext4_put_super,
1165 .sync_fs = ext4_sync_fs, 1238 .sync_fs = ext4_sync_fs,
@@ -1180,6 +1253,7 @@ static const struct super_operations ext4_nojournal_sops = {
1180 .destroy_inode = ext4_destroy_inode, 1253 .destroy_inode = ext4_destroy_inode,
1181 .write_inode = ext4_write_inode, 1254 .write_inode = ext4_write_inode,
1182 .dirty_inode = ext4_dirty_inode, 1255 .dirty_inode = ext4_dirty_inode,
1256 .drop_inode = ext4_drop_inode,
1183 .evict_inode = ext4_evict_inode, 1257 .evict_inode = ext4_evict_inode,
1184 .write_super = ext4_write_super, 1258 .write_super = ext4_write_super,
1185 .put_super = ext4_put_super, 1259 .put_super = ext4_put_super,
@@ -1214,11 +1288,12 @@ enum {
1214 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1288 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1215 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1289 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1216 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1290 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1217 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1291 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1218 Opt_block_validity, Opt_noblock_validity, 1292 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1219 Opt_inode_readahead_blks, Opt_journal_ioprio, 1293 Opt_inode_readahead_blks, Opt_journal_ioprio,
1220 Opt_dioread_nolock, Opt_dioread_lock, 1294 Opt_dioread_nolock, Opt_dioread_lock,
1221 Opt_discard, Opt_nodiscard, 1295 Opt_discard, Opt_nodiscard,
1296 Opt_init_inode_table, Opt_noinit_inode_table,
1222}; 1297};
1223 1298
1224static const match_table_t tokens = { 1299static const match_table_t tokens = {
@@ -1278,6 +1353,8 @@ static const match_table_t tokens = {
1278 {Opt_resize, "resize"}, 1353 {Opt_resize, "resize"},
1279 {Opt_delalloc, "delalloc"}, 1354 {Opt_delalloc, "delalloc"},
1280 {Opt_nodelalloc, "nodelalloc"}, 1355 {Opt_nodelalloc, "nodelalloc"},
1356 {Opt_mblk_io_submit, "mblk_io_submit"},
1357 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1281 {Opt_block_validity, "block_validity"}, 1358 {Opt_block_validity, "block_validity"},
1282 {Opt_noblock_validity, "noblock_validity"}, 1359 {Opt_noblock_validity, "noblock_validity"},
1283 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1360 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1289,6 +1366,9 @@ static const match_table_t tokens = {
1289 {Opt_dioread_lock, "dioread_lock"}, 1366 {Opt_dioread_lock, "dioread_lock"},
1290 {Opt_discard, "discard"}, 1367 {Opt_discard, "discard"},
1291 {Opt_nodiscard, "nodiscard"}, 1368 {Opt_nodiscard, "nodiscard"},
1369 {Opt_init_inode_table, "init_itable=%u"},
1370 {Opt_init_inode_table, "init_itable"},
1371 {Opt_noinit_inode_table, "noinit_itable"},
1292 {Opt_err, NULL}, 1372 {Opt_err, NULL},
1293}; 1373};
1294 1374
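
With the token table extended as above, the lazy inode-table initialization can be tuned at mount time; the device path and multiplier below are illustrative:

	mount -o init_itable=10 /dev/sdb1 /mnt    # wait 10x the zeroing time between groups
	mount -o noinit_itable /dev/sdb1 /mnt     # do not start the zeroing thread

Both "init_itable=%u" and bare "init_itable" map to Opt_init_inode_table; the parser falls back to EXT4_DEF_LI_WAIT_MULT when no argument is given (see the case further down).
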
@@ -1353,7 +1433,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1353 sbi->s_qf_names[qtype] = NULL; 1433 sbi->s_qf_names[qtype] = NULL;
1354 return 0; 1434 return 0;
1355 } 1435 }
1356 set_opt(sbi->s_mount_opt, QUOTA); 1436 set_opt(sb, QUOTA);
1357 return 1; 1437 return 1;
1358} 1438}
1359 1439
@@ -1403,26 +1483,26 @@ static int parse_options(char *options, struct super_block *sb,
1403 * Initialize args struct so we know whether arg was 1483 * Initialize args struct so we know whether arg was
1404 * found; some options take optional arguments. 1484 * found; some options take optional arguments.
1405 */ 1485 */
1406 args[0].to = args[0].from = 0; 1486 args[0].to = args[0].from = NULL;
1407 token = match_token(p, tokens, args); 1487 token = match_token(p, tokens, args);
1408 switch (token) { 1488 switch (token) {
1409 case Opt_bsd_df: 1489 case Opt_bsd_df:
1410 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1490 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1411 clear_opt(sbi->s_mount_opt, MINIX_DF); 1491 clear_opt(sb, MINIX_DF);
1412 break; 1492 break;
1413 case Opt_minix_df: 1493 case Opt_minix_df:
1414 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1494 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1415 set_opt(sbi->s_mount_opt, MINIX_DF); 1495 set_opt(sb, MINIX_DF);
1416 1496
1417 break; 1497 break;
1418 case Opt_grpid: 1498 case Opt_grpid:
1419 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1499 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1420 set_opt(sbi->s_mount_opt, GRPID); 1500 set_opt(sb, GRPID);
1421 1501
1422 break; 1502 break;
1423 case Opt_nogrpid: 1503 case Opt_nogrpid:
1424 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1504 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1425 clear_opt(sbi->s_mount_opt, GRPID); 1505 clear_opt(sb, GRPID);
1426 1506
1427 break; 1507 break;
1428 case Opt_resuid: 1508 case Opt_resuid:
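
Every set_opt()/clear_opt() call in parse_options() now takes the superblock rather than the raw s_mount_opt word. The matching macro change lives in ext4.h and is not part of this hunk; a plausible shape consistent with the call sites here would be:

	#define set_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
	#define clear_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)

Routing through the sb also lets the helpers reach the new s_mount_opt2 word (visible in the mo2=%04x debug output later in this patch) without touching every call site again.
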
@@ -1440,38 +1520,38 @@ static int parse_options(char *options, struct super_block *sb,
1440 /* *sb_block = match_int(&args[0]); */ 1520 /* *sb_block = match_int(&args[0]); */
1441 break; 1521 break;
1442 case Opt_err_panic: 1522 case Opt_err_panic:
1443 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1523 clear_opt(sb, ERRORS_CONT);
1444 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1524 clear_opt(sb, ERRORS_RO);
1445 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1525 set_opt(sb, ERRORS_PANIC);
1446 break; 1526 break;
1447 case Opt_err_ro: 1527 case Opt_err_ro:
1448 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1528 clear_opt(sb, ERRORS_CONT);
1449 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1529 clear_opt(sb, ERRORS_PANIC);
1450 set_opt(sbi->s_mount_opt, ERRORS_RO); 1530 set_opt(sb, ERRORS_RO);
1451 break; 1531 break;
1452 case Opt_err_cont: 1532 case Opt_err_cont:
1453 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1533 clear_opt(sb, ERRORS_RO);
1454 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1534 clear_opt(sb, ERRORS_PANIC);
1455 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1535 set_opt(sb, ERRORS_CONT);
1456 break; 1536 break;
1457 case Opt_nouid32: 1537 case Opt_nouid32:
1458 set_opt(sbi->s_mount_opt, NO_UID32); 1538 set_opt(sb, NO_UID32);
1459 break; 1539 break;
1460 case Opt_debug: 1540 case Opt_debug:
1461 set_opt(sbi->s_mount_opt, DEBUG); 1541 set_opt(sb, DEBUG);
1462 break; 1542 break;
1463 case Opt_oldalloc: 1543 case Opt_oldalloc:
1464 set_opt(sbi->s_mount_opt, OLDALLOC); 1544 set_opt(sb, OLDALLOC);
1465 break; 1545 break;
1466 case Opt_orlov: 1546 case Opt_orlov:
1467 clear_opt(sbi->s_mount_opt, OLDALLOC); 1547 clear_opt(sb, OLDALLOC);
1468 break; 1548 break;
1469#ifdef CONFIG_EXT4_FS_XATTR 1549#ifdef CONFIG_EXT4_FS_XATTR
1470 case Opt_user_xattr: 1550 case Opt_user_xattr:
1471 set_opt(sbi->s_mount_opt, XATTR_USER); 1551 set_opt(sb, XATTR_USER);
1472 break; 1552 break;
1473 case Opt_nouser_xattr: 1553 case Opt_nouser_xattr:
1474 clear_opt(sbi->s_mount_opt, XATTR_USER); 1554 clear_opt(sb, XATTR_USER);
1475 break; 1555 break;
1476#else 1556#else
1477 case Opt_user_xattr: 1557 case Opt_user_xattr:
@@ -1481,10 +1561,10 @@ static int parse_options(char *options, struct super_block *sb,
1481#endif 1561#endif
1482#ifdef CONFIG_EXT4_FS_POSIX_ACL 1562#ifdef CONFIG_EXT4_FS_POSIX_ACL
1483 case Opt_acl: 1563 case Opt_acl:
1484 set_opt(sbi->s_mount_opt, POSIX_ACL); 1564 set_opt(sb, POSIX_ACL);
1485 break; 1565 break;
1486 case Opt_noacl: 1566 case Opt_noacl:
1487 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1567 clear_opt(sb, POSIX_ACL);
1488 break; 1568 break;
1489#else 1569#else
1490 case Opt_acl: 1570 case Opt_acl:
@@ -1503,7 +1583,7 @@ static int parse_options(char *options, struct super_block *sb,
1503 "Cannot specify journal on remount"); 1583 "Cannot specify journal on remount");
1504 return 0; 1584 return 0;
1505 } 1585 }
1506 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1586 set_opt(sb, UPDATE_JOURNAL);
1507 break; 1587 break;
1508 case Opt_journal_dev: 1588 case Opt_journal_dev:
1509 if (is_remount) { 1589 if (is_remount) {
@@ -1516,14 +1596,14 @@ static int parse_options(char *options, struct super_block *sb,
1516 *journal_devnum = option; 1596 *journal_devnum = option;
1517 break; 1597 break;
1518 case Opt_journal_checksum: 1598 case Opt_journal_checksum:
1519 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1599 set_opt(sb, JOURNAL_CHECKSUM);
1520 break; 1600 break;
1521 case Opt_journal_async_commit: 1601 case Opt_journal_async_commit:
1522 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1602 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1523 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1603 set_opt(sb, JOURNAL_CHECKSUM);
1524 break; 1604 break;
1525 case Opt_noload: 1605 case Opt_noload:
1526 set_opt(sbi->s_mount_opt, NOLOAD); 1606 set_opt(sb, NOLOAD);
1527 break; 1607 break;
1528 case Opt_commit: 1608 case Opt_commit:
1529 if (match_int(&args[0], &option)) 1609 if (match_int(&args[0], &option))
@@ -1566,15 +1646,15 @@ static int parse_options(char *options, struct super_block *sb,
1566 return 0; 1646 return 0;
1567 } 1647 }
1568 } else { 1648 } else {
1569 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1649 clear_opt(sb, DATA_FLAGS);
1570 sbi->s_mount_opt |= data_opt; 1650 sbi->s_mount_opt |= data_opt;
1571 } 1651 }
1572 break; 1652 break;
1573 case Opt_data_err_abort: 1653 case Opt_data_err_abort:
1574 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1654 set_opt(sb, DATA_ERR_ABORT);
1575 break; 1655 break;
1576 case Opt_data_err_ignore: 1656 case Opt_data_err_ignore:
1577 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1657 clear_opt(sb, DATA_ERR_ABORT);
1578 break; 1658 break;
1579#ifdef CONFIG_QUOTA 1659#ifdef CONFIG_QUOTA
1580 case Opt_usrjquota: 1660 case Opt_usrjquota:
@@ -1614,12 +1694,12 @@ set_qf_format:
1614 break; 1694 break;
1615 case Opt_quota: 1695 case Opt_quota:
1616 case Opt_usrquota: 1696 case Opt_usrquota:
1617 set_opt(sbi->s_mount_opt, QUOTA); 1697 set_opt(sb, QUOTA);
1618 set_opt(sbi->s_mount_opt, USRQUOTA); 1698 set_opt(sb, USRQUOTA);
1619 break; 1699 break;
1620 case Opt_grpquota: 1700 case Opt_grpquota:
1621 set_opt(sbi->s_mount_opt, QUOTA); 1701 set_opt(sb, QUOTA);
1622 set_opt(sbi->s_mount_opt, GRPQUOTA); 1702 set_opt(sb, GRPQUOTA);
1623 break; 1703 break;
1624 case Opt_noquota: 1704 case Opt_noquota:
1625 if (sb_any_quota_loaded(sb)) { 1705 if (sb_any_quota_loaded(sb)) {
@@ -1627,9 +1707,9 @@ set_qf_format:
1627 "options when quota turned on"); 1707 "options when quota turned on");
1628 return 0; 1708 return 0;
1629 } 1709 }
1630 clear_opt(sbi->s_mount_opt, QUOTA); 1710 clear_opt(sb, QUOTA);
1631 clear_opt(sbi->s_mount_opt, USRQUOTA); 1711 clear_opt(sb, USRQUOTA);
1632 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1712 clear_opt(sb, GRPQUOTA);
1633 break; 1713 break;
1634#else 1714#else
1635 case Opt_quota: 1715 case Opt_quota:
@@ -1655,7 +1735,7 @@ set_qf_format:
1655 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1735 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1656 break; 1736 break;
1657 case Opt_nobarrier: 1737 case Opt_nobarrier:
1658 clear_opt(sbi->s_mount_opt, BARRIER); 1738 clear_opt(sb, BARRIER);
1659 break; 1739 break;
1660 case Opt_barrier: 1740 case Opt_barrier:
1661 if (args[0].from) { 1741 if (args[0].from) {
@@ -1664,9 +1744,9 @@ set_qf_format:
1664 } else 1744 } else
1665 option = 1; /* No argument, default to 1 */ 1745 option = 1; /* No argument, default to 1 */
1666 if (option) 1746 if (option)
1667 set_opt(sbi->s_mount_opt, BARRIER); 1747 set_opt(sb, BARRIER);
1668 else 1748 else
1669 clear_opt(sbi->s_mount_opt, BARRIER); 1749 clear_opt(sb, BARRIER);
1670 break; 1750 break;
1671 case Opt_ignore: 1751 case Opt_ignore:
1672 break; 1752 break;
@@ -1690,11 +1770,17 @@ set_qf_format:
1690 "Ignoring deprecated bh option"); 1770 "Ignoring deprecated bh option");
1691 break; 1771 break;
1692 case Opt_i_version: 1772 case Opt_i_version:
1693 set_opt(sbi->s_mount_opt, I_VERSION); 1773 set_opt(sb, I_VERSION);
1694 sb->s_flags |= MS_I_VERSION; 1774 sb->s_flags |= MS_I_VERSION;
1695 break; 1775 break;
1696 case Opt_nodelalloc: 1776 case Opt_nodelalloc:
1697 clear_opt(sbi->s_mount_opt, DELALLOC); 1777 clear_opt(sb, DELALLOC);
1778 break;
1779 case Opt_mblk_io_submit:
1780 set_opt(sb, MBLK_IO_SUBMIT);
1781 break;
1782 case Opt_nomblk_io_submit:
1783 clear_opt(sb, MBLK_IO_SUBMIT);
1698 break; 1784 break;
1699 case Opt_stripe: 1785 case Opt_stripe:
1700 if (match_int(&args[0], &option)) 1786 if (match_int(&args[0], &option))
@@ -1704,20 +1790,20 @@ set_qf_format:
1704 sbi->s_stripe = option; 1790 sbi->s_stripe = option;
1705 break; 1791 break;
1706 case Opt_delalloc: 1792 case Opt_delalloc:
1707 set_opt(sbi->s_mount_opt, DELALLOC); 1793 set_opt(sb, DELALLOC);
1708 break; 1794 break;
1709 case Opt_block_validity: 1795 case Opt_block_validity:
1710 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1796 set_opt(sb, BLOCK_VALIDITY);
1711 break; 1797 break;
1712 case Opt_noblock_validity: 1798 case Opt_noblock_validity:
1713 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1799 clear_opt(sb, BLOCK_VALIDITY);
1714 break; 1800 break;
1715 case Opt_inode_readahead_blks: 1801 case Opt_inode_readahead_blks:
1716 if (match_int(&args[0], &option)) 1802 if (match_int(&args[0], &option))
1717 return 0; 1803 return 0;
1718 if (option < 0 || option > (1 << 30)) 1804 if (option < 0 || option > (1 << 30))
1719 return 0; 1805 return 0;
1720 if (!is_power_of_2(option)) { 1806 if (option && !is_power_of_2(option)) {
1721 ext4_msg(sb, KERN_ERR, 1807 ext4_msg(sb, KERN_ERR,
1722 "EXT4-fs: inode_readahead_blks" 1808 "EXT4-fs: inode_readahead_blks"
1723 " must be a power of 2"); 1809 " must be a power of 2");
@@ -1734,7 +1820,7 @@ set_qf_format:
1734 option); 1820 option);
1735 break; 1821 break;
1736 case Opt_noauto_da_alloc: 1822 case Opt_noauto_da_alloc:
1737 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1823 set_opt(sb, NO_AUTO_DA_ALLOC);
1738 break; 1824 break;
1739 case Opt_auto_da_alloc: 1825 case Opt_auto_da_alloc:
1740 if (args[0].from) { 1826 if (args[0].from) {
@@ -1743,21 +1829,35 @@ set_qf_format:
1743 } else 1829 } else
1744 option = 1; /* No argument, default to 1 */ 1830 option = 1; /* No argument, default to 1 */
1745 if (option) 1831 if (option)
1746 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1832 clear_opt(sb, NO_AUTO_DA_ALLOC);
1747 else 1833 else
1748 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1834 set_opt(sb, NO_AUTO_DA_ALLOC);
1749 break; 1835 break;
1750 case Opt_discard: 1836 case Opt_discard:
1751 set_opt(sbi->s_mount_opt, DISCARD); 1837 set_opt(sb, DISCARD);
1752 break; 1838 break;
1753 case Opt_nodiscard: 1839 case Opt_nodiscard:
1754 clear_opt(sbi->s_mount_opt, DISCARD); 1840 clear_opt(sb, DISCARD);
1755 break; 1841 break;
1756 case Opt_dioread_nolock: 1842 case Opt_dioread_nolock:
1757 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1843 set_opt(sb, DIOREAD_NOLOCK);
1758 break; 1844 break;
1759 case Opt_dioread_lock: 1845 case Opt_dioread_lock:
1760 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1846 clear_opt(sb, DIOREAD_NOLOCK);
1847 break;
1848 case Opt_init_inode_table:
1849 set_opt(sb, INIT_INODE_TABLE);
1850 if (args[0].from) {
1851 if (match_int(&args[0], &option))
1852 return 0;
1853 } else
1854 option = EXT4_DEF_LI_WAIT_MULT;
1855 if (option < 0)
1856 return 0;
1857 sbi->s_li_wait_mult = option;
1858 break;
1859 case Opt_noinit_inode_table:
1860 clear_opt(sb, INIT_INODE_TABLE);
1761 break; 1861 break;
1762 default: 1862 default:
1763 ext4_msg(sb, KERN_ERR, 1863 ext4_msg(sb, KERN_ERR,
@@ -1769,10 +1869,10 @@ set_qf_format:
1769#ifdef CONFIG_QUOTA 1869#ifdef CONFIG_QUOTA
1770 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1870 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1771 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1871 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1772 clear_opt(sbi->s_mount_opt, USRQUOTA); 1872 clear_opt(sb, USRQUOTA);
1773 1873
1774 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1874 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1775 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1875 clear_opt(sb, GRPQUOTA);
1776 1876
1777 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1877 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1778 ext4_msg(sb, KERN_ERR, "old and new quota " 1878 ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1817,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1817 ext4_msg(sb, KERN_WARNING, 1917 ext4_msg(sb, KERN_WARNING,
1818 "warning: mounting fs with errors, " 1918 "warning: mounting fs with errors, "
1819 "running e2fsck is recommended"); 1919 "running e2fsck is recommended");
1820 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1920 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1821 le16_to_cpu(es->s_mnt_count) >= 1921 le16_to_cpu(es->s_mnt_count) >=
1822 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1922 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1823 ext4_msg(sb, KERN_WARNING, 1923 ext4_msg(sb, KERN_WARNING,
@@ -1842,13 +1942,14 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1842 ext4_commit_super(sb, 1); 1942 ext4_commit_super(sb, 1);
1843 if (test_opt(sb, DEBUG)) 1943 if (test_opt(sb, DEBUG))
1844 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1944 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1845 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1945 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1846 sb->s_blocksize, 1946 sb->s_blocksize,
1847 sbi->s_groups_count, 1947 sbi->s_groups_count,
1848 EXT4_BLOCKS_PER_GROUP(sb), 1948 EXT4_BLOCKS_PER_GROUP(sb),
1849 EXT4_INODES_PER_GROUP(sb), 1949 EXT4_INODES_PER_GROUP(sb),
1850 sbi->s_mount_opt); 1950 sbi->s_mount_opt, sbi->s_mount_opt2);
1851 1951
1952 cleancache_init_fs(sb);
1852 return res; 1953 return res;
1853} 1954}
1854 1955
@@ -1877,14 +1978,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1877 size = flex_group_count * sizeof(struct flex_groups); 1978 size = flex_group_count * sizeof(struct flex_groups);
1878 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1979 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1879 if (sbi->s_flex_groups == NULL) { 1980 if (sbi->s_flex_groups == NULL) {
1880 sbi->s_flex_groups = vmalloc(size); 1981 sbi->s_flex_groups = vzalloc(size);
1881 if (sbi->s_flex_groups) 1982 if (sbi->s_flex_groups == NULL) {
1882 memset(sbi->s_flex_groups, 0, size); 1983 ext4_msg(sb, KERN_ERR,
1883 } 1984 "not enough memory for %u flex groups",
1884 if (sbi->s_flex_groups == NULL) { 1985 flex_group_count);
1885 ext4_msg(sb, KERN_ERR, "not enough memory for " 1986 goto failed;
1886 "%u flex groups", flex_group_count); 1987 }
1887 goto failed;
1888 } 1988 }
1889 1989
1890 for (i = 0; i < sbi->s_groups_count; i++) { 1990 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -1942,7 +2042,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1942} 2042}
1943 2043
1944/* Called at mount-time, super-block is locked */ 2044/* Called at mount-time, super-block is locked */
1945static int ext4_check_descriptors(struct super_block *sb) 2045static int ext4_check_descriptors(struct super_block *sb,
2046 ext4_group_t *first_not_zeroed)
1946{ 2047{
1947 struct ext4_sb_info *sbi = EXT4_SB(sb); 2048 struct ext4_sb_info *sbi = EXT4_SB(sb);
1948 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); 2049 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1951,7 +2052,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1951 ext4_fsblk_t inode_bitmap; 2052 ext4_fsblk_t inode_bitmap;
1952 ext4_fsblk_t inode_table; 2053 ext4_fsblk_t inode_table;
1953 int flexbg_flag = 0; 2054 int flexbg_flag = 0;
1954 ext4_group_t i; 2055 ext4_group_t i, grp = sbi->s_groups_count;
1955 2056
1956 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2057 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1957 flexbg_flag = 1; 2058 flexbg_flag = 1;
@@ -1967,6 +2068,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1967 last_block = first_block + 2068 last_block = first_block +
1968 (EXT4_BLOCKS_PER_GROUP(sb) - 1); 2069 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1969 2070
2071 if ((grp == sbi->s_groups_count) &&
2072 !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2073 grp = i;
2074
1970 block_bitmap = ext4_block_bitmap(sb, gdp); 2075 block_bitmap = ext4_block_bitmap(sb, gdp);
1971 if (block_bitmap < first_block || block_bitmap > last_block) { 2076 if (block_bitmap < first_block || block_bitmap > last_block) {
1972 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2077 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2004,6 +2109,8 @@ static int ext4_check_descriptors(struct super_block *sb)
2004 if (!flexbg_flag) 2109 if (!flexbg_flag)
2005 first_block += EXT4_BLOCKS_PER_GROUP(sb); 2110 first_block += EXT4_BLOCKS_PER_GROUP(sb);
2006 } 2111 }
2112 if (NULL != first_not_zeroed)
2113 *first_not_zeroed = grp;
2007 2114
2008 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2115 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2009 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2116 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2046,6 +2153,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2046 return; 2153 return;
2047 } 2154 }
2048 2155
2156 /* Check if the feature set would not allow a r/w mount */
2157 if (!ext4_feature_set_ok(sb, 0)) {
2158 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2159 "unknown ROCOMPAT features");
2160 return;
2161 }
2162
2049 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2163 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2050 if (es->s_last_orphan) 2164 if (es->s_last_orphan)
2051 jbd_debug(1, "Errors on filesystem, " 2165 jbd_debug(1, "Errors on filesystem, "
@@ -2129,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2129 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, 2243 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
2130 * so that won't be a limiting factor. 2244 * so that won't be a limiting factor.
2131 * 2245 *
2246 * However, there is another limiting factor: we store extents as a
2247 * starting block plus a length, so the length of the extent covering
2248 * the maximum file size must also fit into the on-disk containers.
2249 * Since a length is always one unit bigger than the largest offset it
2250 * covers (offsets count from 0), we must lower s_maxbytes by one fs block.
2251 *
2132 * Note, this does *not* consider any metadata overhead for vfs i_blocks. 2252 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2133 */ 2253 */
2134static loff_t ext4_max_size(int blkbits, int has_huge_files) 2254static loff_t ext4_max_size(int blkbits, int has_huge_files)
@@ -2150,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
2150 upper_limit <<= blkbits; 2270 upper_limit <<= blkbits;
2151 } 2271 }
2152 2272
2153 /* 32-bit extent-start container, ee_block */ 2273 /*
2154 res = 1LL << 32; 2274 * 32-bit extent-start container, ee_block. We lower the maxbytes
2275 * by one fs block, so ee_len can cover the extent of maximum file
2276 * size
2277 */
2278 res = (1LL << 32) - 1;
2155 res <<= blkbits; 2279 res <<= blkbits;
2156 res -= 1;
2157 2280
2158 /* Sanity check against vm- & vfs- imposed limits */ 2281 /* Sanity check against vm- & vfs- imposed limits */
2159 if (res > upper_limit) 2282 if (res > upper_limit)
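
Worked example for the common 4 KiB block size (blkbits = 12): the old code computed res = (1 << 32 << 12) - 1 = 2^44 - 1, i.e. 16 TiB minus one byte; the new code computes res = (2^32 - 1) << 12 = 2^44 - 2^12, i.e. 16 TiB minus one 4 KiB block, so the length of an extent covering the largest possible file still fits its on-disk container.
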
@@ -2329,6 +2452,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2329 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2452 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2330} 2453}
2331 2454
2455static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2456 struct ext4_sb_info *sbi, char *buf)
2457{
2458 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2459}
2460
2461static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2462 struct ext4_sb_info *sbi, char *buf)
2463{
2464 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2465}
2466
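
Both counters appear as read-only files in the filesystem's sysfs directory, so the effectiveness of the single-extent cache can be sampled from userspace (device name illustrative):

	cat /sys/fs/ext4/sda1/extent_cache_hits
	cat /sys/fs/ext4/sda1/extent_cache_misses
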
2332static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2467static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2333 struct ext4_sb_info *sbi, 2468 struct ext4_sb_info *sbi,
2334 const char *buf, size_t count) 2469 const char *buf, size_t count)
@@ -2338,7 +2473,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2338 if (parse_strtoul(buf, 0x40000000, &t)) 2473 if (parse_strtoul(buf, 0x40000000, &t))
2339 return -EINVAL; 2474 return -EINVAL;
2340 2475
2341 if (!is_power_of_2(t)) 2476 if (t && !is_power_of_2(t))
2342 return -EINVAL; 2477 return -EINVAL;
2343 2478
2344 sbi->s_inode_readahead_blks = t; 2479 sbi->s_inode_readahead_blks = t;
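
Together with the matching "option && !is_power_of_2(option)" change in parse_options() above, this accepts a value of 0, so inode readahead can now be disabled outright, e.g. by writing 0 to /sys/fs/ext4/<disk>/inode_readahead_blks.
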
@@ -2376,6 +2511,7 @@ static struct ext4_attr ext4_attr_##_name = { \
2376#define EXT4_ATTR(name, mode, show, store) \ 2511#define EXT4_ATTR(name, mode, show, store) \
2377static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2512static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2378 2513
2514#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2379#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2515#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2380#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2516#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2381#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2517#define EXT4_RW_ATTR_SBI_UI(name, elname) \
@@ -2385,6 +2521,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2385EXT4_RO_ATTR(delayed_allocation_blocks); 2521EXT4_RO_ATTR(delayed_allocation_blocks);
2386EXT4_RO_ATTR(session_write_kbytes); 2522EXT4_RO_ATTR(session_write_kbytes);
2387EXT4_RO_ATTR(lifetime_write_kbytes); 2523EXT4_RO_ATTR(lifetime_write_kbytes);
2524EXT4_RO_ATTR(extent_cache_hits);
2525EXT4_RO_ATTR(extent_cache_misses);
2388EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2526EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2389 inode_readahead_blks_store, s_inode_readahead_blks); 2527 inode_readahead_blks_store, s_inode_readahead_blks);
2390EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2528EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2400,6 +2538,8 @@ static struct attribute *ext4_attrs[] = {
2400 ATTR_LIST(delayed_allocation_blocks), 2538 ATTR_LIST(delayed_allocation_blocks),
2401 ATTR_LIST(session_write_kbytes), 2539 ATTR_LIST(session_write_kbytes),
2402 ATTR_LIST(lifetime_write_kbytes), 2540 ATTR_LIST(lifetime_write_kbytes),
2541 ATTR_LIST(extent_cache_hits),
2542 ATTR_LIST(extent_cache_misses),
2403 ATTR_LIST(inode_readahead_blks), 2543 ATTR_LIST(inode_readahead_blks),
2404 ATTR_LIST(inode_goal), 2544 ATTR_LIST(inode_goal),
2405 ATTR_LIST(mb_stats), 2545 ATTR_LIST(mb_stats),
@@ -2412,6 +2552,16 @@ static struct attribute *ext4_attrs[] = {
2412 NULL, 2552 NULL,
2413}; 2553};
2414 2554
2555/* Features this copy of ext4 supports */
2556EXT4_INFO_ATTR(lazy_itable_init);
2557EXT4_INFO_ATTR(batched_discard);
2558
2559static struct attribute *ext4_feat_attrs[] = {
2560 ATTR_LIST(lazy_itable_init),
2561 ATTR_LIST(batched_discard),
2562 NULL,
2563};
2564
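
The feature attributes are deliberately content-free: each one materializes as an empty file in a features directory under /sys/fs/ext4/ (the kobject registration appears later in this patch), so userspace tools such as mke2fs can probe for lazy_itable_init or batched_discard support simply by testing whether the file exists.
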
2415static ssize_t ext4_attr_show(struct kobject *kobj, 2565static ssize_t ext4_attr_show(struct kobject *kobj,
2416 struct attribute *attr, char *buf) 2566 struct attribute *attr, char *buf)
2417{ 2567{
@@ -2440,7 +2590,6 @@ static void ext4_sb_release(struct kobject *kobj)
2440 complete(&sbi->s_kobj_unregister); 2590 complete(&sbi->s_kobj_unregister);
2441} 2591}
2442 2592
2443
2444static const struct sysfs_ops ext4_attr_ops = { 2593static const struct sysfs_ops ext4_attr_ops = {
2445 .show = ext4_attr_show, 2594 .show = ext4_attr_show,
2446 .store = ext4_attr_store, 2595 .store = ext4_attr_store,
@@ -2452,6 +2601,17 @@ static struct kobj_type ext4_ktype = {
2452 .release = ext4_sb_release, 2601 .release = ext4_sb_release,
2453}; 2602};
2454 2603
2604static void ext4_feat_release(struct kobject *kobj)
2605{
2606 complete(&ext4_feat->f_kobj_unregister);
2607}
2608
2609static struct kobj_type ext4_feat_ktype = {
2610 .default_attrs = ext4_feat_attrs,
2611 .sysfs_ops = &ext4_attr_ops,
2612 .release = ext4_feat_release,
2613};
2614
2455/* 2615/*
2456 * Check whether this filesystem can be mounted based on 2616 * Check whether this filesystem can be mounted based on
2457 * the features present and the RDONLY/RDWR mount requested. 2617 * the features present and the RDONLY/RDWR mount requested.
@@ -2542,6 +2702,343 @@ static void print_daily_error_info(unsigned long arg)
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2702 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543} 2703}
2544 2704
2705/* Find next suitable group and run ext4_init_inode_table */
2706static int ext4_run_li_request(struct ext4_li_request *elr)
2707{
2708 struct ext4_group_desc *gdp = NULL;
2709 ext4_group_t group, ngroups;
2710 struct super_block *sb;
2711 unsigned long timeout = 0;
2712 int ret = 0;
2713
2714 sb = elr->lr_super;
2715 ngroups = EXT4_SB(sb)->s_groups_count;
2716
2717 for (group = elr->lr_next_group; group < ngroups; group++) {
2718 gdp = ext4_get_group_desc(sb, group, NULL);
2719 if (!gdp) {
2720 ret = 1;
2721 break;
2722 }
2723
2724 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2725 break;
2726 }
2727
2728 if (group == ngroups)
2729 ret = 1;
2730
2731 if (!ret) {
2732 timeout = jiffies;
2733 ret = ext4_init_inode_table(sb, group,
2734 elr->lr_timeout ? 0 : 1);
2735 if (elr->lr_timeout == 0) {
2736 timeout = (jiffies - timeout) *
2737 elr->lr_sbi->s_li_wait_mult;
2738 elr->lr_timeout = timeout;
2739 }
2740 elr->lr_next_sched = jiffies + elr->lr_timeout;
2741 elr->lr_next_group = group + 1;
2742 }
2743
2744 return ret;
2745}
2746
2747/*
2748 * Remove lr_request from the list_request and free the
2749 * request structure. Should be called with li_list_mtx held
2750 */
2751static void ext4_remove_li_request(struct ext4_li_request *elr)
2752{
2753 struct ext4_sb_info *sbi;
2754
2755 if (!elr)
2756 return;
2757
2758 sbi = elr->lr_sbi;
2759
2760 list_del(&elr->lr_request);
2761 sbi->s_li_request = NULL;
2762 kfree(elr);
2763}
2764
2765static void ext4_unregister_li_request(struct super_block *sb)
2766{
2767 mutex_lock(&ext4_li_mtx);
2768 if (!ext4_li_info) {
2769 mutex_unlock(&ext4_li_mtx);
2770 return;
2771 }
2772
2773 mutex_lock(&ext4_li_info->li_list_mtx);
2774 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2775 mutex_unlock(&ext4_li_info->li_list_mtx);
2776 mutex_unlock(&ext4_li_mtx);
2777}
2778
2779static struct task_struct *ext4_lazyinit_task;
2780
2781/*
2782 * This is the function where the ext4lazyinit thread lives. It walks
2783 * through the request list searching for the next scheduled filesystem.
2784 * When such a fs is found, it runs the lazy initialization request
2785 * (ext4_run_li_request) and keeps track of the time spent in this
2786 * function. Based on that time we compute the next schedule time of
2787 * the request. When the walk through the list is complete, we compute
2788 * the next wakeup time and put the thread to sleep.
2789 */
2790static int ext4_lazyinit_thread(void *arg)
2791{
2792 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2793 struct list_head *pos, *n;
2794 struct ext4_li_request *elr;
2795 unsigned long next_wakeup, cur;
2796
2797 BUG_ON(NULL == eli);
2798
2799cont_thread:
2800 while (true) {
2801 next_wakeup = MAX_JIFFY_OFFSET;
2802
2803 mutex_lock(&eli->li_list_mtx);
2804 if (list_empty(&eli->li_request_list)) {
2805 mutex_unlock(&eli->li_list_mtx);
2806 goto exit_thread;
2807 }
2808
2809 list_for_each_safe(pos, n, &eli->li_request_list) {
2810 elr = list_entry(pos, struct ext4_li_request,
2811 lr_request);
2812
2813 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2814 if (ext4_run_li_request(elr) != 0) {
2815 /* error, remove the lazy_init job */
2816 ext4_remove_li_request(elr);
2817 continue;
2818 }
2819 }
2820
2821 if (time_before(elr->lr_next_sched, next_wakeup))
2822 next_wakeup = elr->lr_next_sched;
2823 }
2824 mutex_unlock(&eli->li_list_mtx);
2825
2826 if (freezing(current))
2827 refrigerator();
2828
2829 cur = jiffies;
2830 if ((time_after_eq(cur, next_wakeup)) ||
2831 (MAX_JIFFY_OFFSET == next_wakeup)) {
2832 cond_resched();
2833 continue;
2834 }
2835
2836 schedule_timeout_interruptible(next_wakeup - cur);
2837
2838 if (kthread_should_stop()) {
2839 ext4_clear_request_list();
2840 goto exit_thread;
2841 }
2842 }
2843
2844exit_thread:
2845 /*
2846 * It looks like the request list is empty, but we need
2847 * to check it under the li_list_mtx lock, to prevent any
2848 * additions to it, and of course we must hold ext4_li_mtx
2849 * to atomically free the list and ext4_li_info, because at
2850 * this point another ext4 filesystem could be registering
2851 * a new one.
2852 */
2853 mutex_lock(&ext4_li_mtx);
2854 mutex_lock(&eli->li_list_mtx);
2855 if (!list_empty(&eli->li_request_list)) {
2856 mutex_unlock(&eli->li_list_mtx);
2857 mutex_unlock(&ext4_li_mtx);
2858 goto cont_thread;
2859 }
2860 mutex_unlock(&eli->li_list_mtx);
2861 kfree(ext4_li_info);
2862 ext4_li_info = NULL;
2863 mutex_unlock(&ext4_li_mtx);
2864
2865 return 0;
2866}
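
The loop above is a classic min-next-deadline scheduler: run every request that is due, remember the earliest lr_next_sched among the survivors, and sleep exactly until then. A compressed user-space sketch of the same pattern (plain counters model jiffies; this is an illustration, not the kernel code):

    #include <stdio.h>

    #define NREQ 3
    #define MAX_OFFSET (~0UL)   /* like MAX_JIFFY_OFFSET */

    int main(void)
    {
        unsigned long now = 0;
        unsigned long next_sched[NREQ] = { 5, 12, 9 };
        int alive = NREQ;

        while (alive) {
            unsigned long next_wakeup = MAX_OFFSET;
            for (int i = 0; i < NREQ; i++) {
                if (next_sched[i] == 0)
                    continue;              /* request already removed */
                if (now >= next_sched[i]) {
                    printf("t=%lu run request %d\n", now, i);
                    next_sched[i] = 0;     /* done: drop from the list */
                    alive--;
                    continue;
                }
                if (next_sched[i] < next_wakeup)
                    next_wakeup = next_sched[i];
            }
            if (next_wakeup != MAX_OFFSET)
                now = next_wakeup;         /* schedule_timeout(next - cur) */
        }
        return 0;
    }
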
2867
2868static void ext4_clear_request_list(void)
2869{
2870 struct list_head *pos, *n;
2871 struct ext4_li_request *elr;
2872
2873 mutex_lock(&ext4_li_info->li_list_mtx);
2874 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2875 elr = list_entry(pos, struct ext4_li_request,
2876 lr_request);
2877 ext4_remove_li_request(elr);
2878 }
2879 mutex_unlock(&ext4_li_info->li_list_mtx);
2880}
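
ext4_clear_request_list relies on list_for_each_safe because each node is freed mid-walk. A stand-alone C sketch of why the "safe" variant matters, with a hand-rolled singly linked list standing in for list_head:

    #include <stdlib.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };

    static void clear_all(struct node **head)
    {
        struct node *pos = *head, *n;

        while (pos) {
            n = pos->next;    /* save before freeing, like list_for_each_safe */
            printf("removing %d\n", pos->id);
            free(pos);
            pos = n;
        }
        *head = NULL;
    }

    int main(void)
    {
        struct node *head = NULL;
        for (int i = 0; i < 3; i++) {
            struct node *e = malloc(sizeof(*e));
            e->id = i; e->next = head; head = e;
        }
        clear_all(&head);
        return 0;
    }
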
2881
2882static int ext4_run_lazyinit_thread(void)
2883{
2884 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2885 ext4_li_info, "ext4lazyinit");
2886 if (IS_ERR(ext4_lazyinit_task)) {
2887 int err = PTR_ERR(ext4_lazyinit_task);
2888 ext4_clear_request_list();
2889 kfree(ext4_li_info);
2890 ext4_li_info = NULL;
2891 printk(KERN_CRIT "EXT4: error %d creating inode table "
2892 "initialization thread\n",
2893 err);
2894 return err;
2895 }
2896 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2897 return 0;
2898}
2899
2900/*
2901 * Check whether it makes sense to run the itable init thread or not.
2902 * If there is at least one uninitialized inode table, return the
2903 * corresponding group number; otherwise the loop goes through all
2904 * groups and returns the total number of groups.
2905 */
2906static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2907{
2908 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2909 struct ext4_group_desc *gdp = NULL;
2910
2911 for (group = 0; group < ngroups; group++) {
2912 gdp = ext4_get_group_desc(sb, group, NULL);
2913 if (!gdp)
2914 continue;
2915
2916 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2917 break;
2918 }
2919
2920 return group;
2921}
2922
2923static int ext4_li_info_new(void)
2924{
2925 struct ext4_lazy_init *eli = NULL;
2926
2927 eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2928 if (!eli)
2929 return -ENOMEM;
2930
2931 INIT_LIST_HEAD(&eli->li_request_list);
2932 mutex_init(&eli->li_list_mtx);
2933
2934 eli->li_state |= EXT4_LAZYINIT_QUIT;
2935
2936 ext4_li_info = eli;
2937
2938 return 0;
2939}
2940
2941static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2942 ext4_group_t start)
2943{
2944 struct ext4_sb_info *sbi = EXT4_SB(sb);
2945 struct ext4_li_request *elr;
2946 unsigned long rnd;
2947
2948 elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2949 if (!elr)
2950 return NULL;
2951
2952 elr->lr_super = sb;
2953 elr->lr_sbi = sbi;
2954 elr->lr_next_group = start;
2955
2956 /*
2957 * Randomize the first schedule time of the request to
2958 * spread the inode table initialization requests out
2959 * more evenly.
2960 */
2961 get_random_bytes(&rnd, sizeof(rnd));
2962 elr->lr_next_sched = jiffies + (unsigned long)rnd %
2963 (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2964
2965 return elr;
2966}
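
The randomized first wakeup keeps many filesystems mounted at once from waking their lazyinit requests in lockstep. A hedged user-space sketch of the same computation (HZ and the delay window here are made-up stand-ins for the kernel constants):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define HZ 100
    #define MAX_START_DELAY 5   /* seconds, like EXT4_DEF_LI_MAX_START_DELAY */

    int main(void)
    {
        srand((unsigned)time(NULL));
        unsigned long jiffies = 1000;           /* pretend current time */
        unsigned long rnd = (unsigned long)rand();
        /* first wakeup lands somewhere in the next MAX_START_DELAY seconds */
        unsigned long first = jiffies + rnd % (MAX_START_DELAY * HZ);

        printf("first wakeup at jiffy %lu (now %lu)\n", first, jiffies);
        return 0;
    }
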
2967
2968static int ext4_register_li_request(struct super_block *sb,
2969 ext4_group_t first_not_zeroed)
2970{
2971 struct ext4_sb_info *sbi = EXT4_SB(sb);
2972 struct ext4_li_request *elr;
2973 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2974 int ret = 0;
2975
2976 if (sbi->s_li_request != NULL) {
2977 /*
2978 * Reset timeout so it can be computed again, because
2979 * s_li_wait_mult might have changed.
2980 */
2981 sbi->s_li_request->lr_timeout = 0;
2982 return 0;
2983 }
2984
2985 if (first_not_zeroed == ngroups ||
2986 (sb->s_flags & MS_RDONLY) ||
2987 !test_opt(sb, INIT_INODE_TABLE))
2988 return 0;
2989
2990 elr = ext4_li_request_new(sb, first_not_zeroed);
2991 if (!elr)
2992 return -ENOMEM;
2993
2994 mutex_lock(&ext4_li_mtx);
2995
2996 if (NULL == ext4_li_info) {
2997 ret = ext4_li_info_new();
2998 if (ret)
2999 goto out;
3000 }
3001
3002 mutex_lock(&ext4_li_info->li_list_mtx);
3003 list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3004 mutex_unlock(&ext4_li_info->li_list_mtx);
3005
3006 sbi->s_li_request = elr;
3007 /*
3008 * Set elr to NULL here since it has been inserted into
3009 * the request list; its removal and freeing are handled
3010 * by ext4_clear_request_list from now on.
3011 */
3012 elr = NULL;
3013
3014 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3015 ret = ext4_run_lazyinit_thread();
3016 if (ret)
3017 goto out;
3018 }
3019out:
3020 mutex_unlock(&ext4_li_mtx);
3021 if (ret)
3022 kfree(elr);
3023 return ret;
3024}
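
ext4_register_li_request allocates the request outside the lock, creates the shared ext4_li_info once under ext4_li_mtx, and clears the local pointer after the list takes ownership so the shared error path cannot double-free. A small C sketch of that create-if-absent-and-hand-off shape (a pthread mutex models ext4_li_mtx; names are illustrative):

    #include <pthread.h>
    #include <stdlib.h>
    #include <stdio.h>

    static pthread_mutex_t reg_mtx = PTHREAD_MUTEX_INITIALIZER;
    static struct state { int nreq; } *global_state;

    static int register_request(void)
    {
        int *req = malloc(sizeof(*req));    /* allocated outside the lock */
        int ret = 0;

        if (!req)
            return -1;

        pthread_mutex_lock(&reg_mtx);
        if (!global_state) {                /* create the singleton once */
            global_state = calloc(1, sizeof(*global_state));
            if (!global_state) { ret = -1; goto out; }
        }
        global_state->nreq++;               /* list_add(...) in the real code */
        req = NULL;                         /* ownership transferred */
    out:
        pthread_mutex_unlock(&reg_mtx);
        free(req);                          /* frees only on the error path */
        return ret;
    }

    int main(void) { printf("%d\n", register_request()); return 0; }
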
3025
3026/*
3027 * We do not need to lock anything since this is called on
3028 * module unload.
3029 */
3030static void ext4_destroy_lazyinit_thread(void)
3031{
3032 /*
3033 * If thread exited earlier
3034 * there's nothing to be done.
3035 */
3036 if (!ext4_li_info || !ext4_lazyinit_task)
3037 return;
3038
3039 kthread_stop(ext4_lazyinit_task);
3040}
3041
2545static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3042static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2546 __releases(kernel_lock) 3043 __releases(kernel_lock)
2547 __acquires(kernel_lock) 3044 __acquires(kernel_lock)
@@ -2567,6 +3064,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2567 __u64 blocks_count; 3064 __u64 blocks_count;
2568 int err; 3065 int err;
2569 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3066 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3067 ext4_group_t first_not_zeroed;
2570 3068
2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 3069 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2572 if (!sbi) 3070 if (!sbi)
@@ -2588,8 +3086,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2588 sbi->s_sectors_written_start = 3086 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3087 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2590 3088
2591 unlock_kernel();
2592
2593 /* Cleanup superblock name */ 3089 /* Cleanup superblock name */
2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3090 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2595 *cp = '!'; 3091 *cp = '!';
@@ -2629,40 +3125,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2629 3125
2630 /* Set defaults before we parse the mount options */ 3126 /* Set defaults before we parse the mount options */
2631 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3127 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3128 set_opt(sb, INIT_INODE_TABLE);
2632 if (def_mount_opts & EXT4_DEFM_DEBUG) 3129 if (def_mount_opts & EXT4_DEFM_DEBUG)
2633 set_opt(sbi->s_mount_opt, DEBUG); 3130 set_opt(sb, DEBUG);
2634 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3131 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2635 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3132 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2636 "2.6.38"); 3133 "2.6.38");
2637 set_opt(sbi->s_mount_opt, GRPID); 3134 set_opt(sb, GRPID);
2638 } 3135 }
2639 if (def_mount_opts & EXT4_DEFM_UID16) 3136 if (def_mount_opts & EXT4_DEFM_UID16)
2640 set_opt(sbi->s_mount_opt, NO_UID32); 3137 set_opt(sb, NO_UID32);
3138 /* xattr user namespace & acls are now defaulted on */
2641#ifdef CONFIG_EXT4_FS_XATTR 3139#ifdef CONFIG_EXT4_FS_XATTR
2642 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3140 set_opt(sb, XATTR_USER);
2643 set_opt(sbi->s_mount_opt, XATTR_USER);
2644#endif 3141#endif
2645#ifdef CONFIG_EXT4_FS_POSIX_ACL 3142#ifdef CONFIG_EXT4_FS_POSIX_ACL
2646 if (def_mount_opts & EXT4_DEFM_ACL) 3143 set_opt(sb, POSIX_ACL);
2647 set_opt(sbi->s_mount_opt, POSIX_ACL);
2648#endif 3144#endif
3145 set_opt(sb, MBLK_IO_SUBMIT);
2649 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3146 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2650 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3147 set_opt(sb, JOURNAL_DATA);
2651 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3148 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2652 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3149 set_opt(sb, ORDERED_DATA);
2653 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3150 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2654 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3151 set_opt(sb, WRITEBACK_DATA);
2655 3152
2656 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3153 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2657 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3154 set_opt(sb, ERRORS_PANIC);
2658 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3155 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
2659 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3156 set_opt(sb, ERRORS_CONT);
2660 else 3157 else
2661 set_opt(sbi->s_mount_opt, ERRORS_RO); 3158 set_opt(sb, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3159 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3160 set_opt(sb, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD) 3161 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD); 3162 set_opt(sb, DISCARD);
2666 3163
2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3164 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3165 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2671,7 +3168,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3168 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2672 3169
2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3170 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER); 3171 set_opt(sb, BARRIER);
2675 3172
2676 /* 3173 /*
2677 * enable delayed allocation by default 3174 * enable delayed allocation by default
@@ -2679,7 +3176,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 */ 3176 */
2680 if (!IS_EXT3_SB(sb) && 3177 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3178 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2682 set_opt(sbi->s_mount_opt, DELALLOC); 3179 set_opt(sb, DELALLOC);
3180
3181 /*
3182 * Set the default s_li_wait_mult for lazyinit, in case there is
3183 * no mount option specified.
3184 */
3185 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
2683 3186
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3187 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) { 3188 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -2702,6 +3205,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2702 "feature flags set on rev 0 fs, " 3205 "feature flags set on rev 0 fs, "
2703 "running e2fsck is recommended"); 3206 "running e2fsck is recommended");
2704 3207
3208 if (IS_EXT2_SB(sb)) {
3209 if (ext2_feature_set_ok(sb))
3210 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3211 "using the ext4 subsystem");
3212 else {
3213 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3214 "to feature incompatibilities");
3215 goto failed_mount;
3216 }
3217 }
3218
3219 if (IS_EXT3_SB(sb)) {
3220 if (ext3_feature_set_ok(sb))
3221 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3222 "using the ext4 subsystem");
3223 else {
3224 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3225 "to feature incompatibilities");
3226 goto failed_mount;
3227 }
3228 }
3229
2705 /* 3230 /*
2706 * Check feature flags regardless of the revision level, since we 3231 * Check feature flags regardless of the revision level, since we
2707 * previously didn't change the revision level when setting the flags, 3232 * previously didn't change the revision level when setting the flags,
@@ -2831,15 +3356,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2831 * Test whether we have more sectors than will fit in sector_t, 3356 * Test whether we have more sectors than will fit in sector_t,
2832 * and whether the max offset is addressable by the page cache. 3357 * and whether the max offset is addressable by the page cache.
2833 */ 3358 */
2834 if ((ext4_blocks_count(es) > 3359 err = generic_check_addressable(sb->s_blocksize_bits,
2835 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || 3360 ext4_blocks_count(es));
2836 (ext4_blocks_count(es) > 3361 if (err) {
2837 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2838 ext4_msg(sb, KERN_ERR, "filesystem" 3362 ext4_msg(sb, KERN_ERR, "filesystem"
2839 " too large to mount safely on this system"); 3363 " too large to mount safely on this system");
2840 if (sizeof(sector_t) < 8) 3364 if (sizeof(sector_t) < 8)
2841 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3365 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2842 ret = -EFBIG; 3366 ret = err;
2843 goto failed_mount; 3367 goto failed_mount;
2844 } 3368 }
2845 3369
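
The open-coded size test above is replaced by generic_check_addressable, which folds together two limits: the last block must be reachable as a 512-byte sector through sector_t, and as an index through the page cache. A user-space sketch of that arithmetic under an assumed 32-bit worst case (this models the check, it is not the kernel implementation):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint32_t sector32_t;   /* sector_t without CONFIG_LBDAF */
    typedef uint32_t pgoff32_t;    /* 32-bit page cache index */
    #define PAGE_SHIFT 12

    static int check_addressable(unsigned blocksize_bits, uint64_t num_blocks)
    {
        uint64_t last_block = num_blocks - 1;
        uint64_t last_page  = last_block >> (PAGE_SHIFT - blocksize_bits);

        if (num_blocks == 0)
            return 0;
        if (last_block > ((uint64_t)(sector32_t)~0u >> (blocksize_bits - 9)))
            return -1;             /* -EFBIG: sector_t would overflow */
        if (last_page > (pgoff32_t)~0u)
            return -1;             /* -EFBIG: page index would overflow */
        return 0;
    }

    int main(void)
    {
        /* 4 KiB blocks: 2^31 blocks = 8 TiB, too big for 32-bit sector_t */
        printf("%d\n", check_addressable(12, 1ULL << 31));
        printf("%d\n", check_addressable(12, 1ULL << 20));  /* 4 GiB: fine */
        return 0;
    }
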
@@ -2908,7 +3432,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2908 goto failed_mount2; 3432 goto failed_mount2;
2909 } 3433 }
2910 } 3434 }
2911 if (!ext4_check_descriptors(sb)) { 3435 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
2912 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3436 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2913 goto failed_mount2; 3437 goto failed_mount2;
2914 } 3438 }
@@ -2924,6 +3448,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2924 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3448 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2925 spin_lock_init(&sbi->s_next_gen_lock); 3449 spin_lock_init(&sbi->s_next_gen_lock);
2926 3450
3451 init_timer(&sbi->s_err_report);
3452 sbi->s_err_report.function = print_daily_error_info;
3453 sbi->s_err_report.data = (unsigned long) sb;
3454
3455 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3456 ext4_count_free_blocks(sb));
3457 if (!err) {
3458 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3459 ext4_count_free_inodes(sb));
3460 }
3461 if (!err) {
3462 err = percpu_counter_init(&sbi->s_dirs_counter,
3463 ext4_count_dirs(sb));
3464 }
3465 if (!err) {
3466 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3467 }
3468 if (err) {
3469 ext4_msg(sb, KERN_ERR, "insufficient memory");
3470 goto failed_mount3;
3471 }
3472
2927 sbi->s_stripe = ext4_get_stripe_size(sbi); 3473 sbi->s_stripe = ext4_get_stripe_size(sbi);
2928 sbi->s_max_writeback_mb_bump = 128; 3474 sbi->s_max_writeback_mb_bump = 128;
2929 3475
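
The counter setup moved here uses the chained if (!err) idiom: each init runs only if all previous ones succeeded, and one test at the end routes every failure to a single cleanup label. A tiny sketch of the same control flow with stand-in init functions:

    #include <stdio.h>

    static int init_a(void) { return 0; }
    static int init_b(void) { return 0; }
    static int init_c(void) { return -1; }  /* pretend this one fails */

    int main(void)
    {
        int err;

        err = init_a();
        if (!err)
            err = init_b();
        if (!err)
            err = init_c();
        if (err) {
            fprintf(stderr, "insufficient memory\n");
            goto fail;                      /* like goto failed_mount3 */
        }
        return 0;
    fail:
        /* unwind whatever init_a()/init_b() set up */
        return 1;
    }
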
@@ -2941,6 +3487,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2941 sb->s_qcop = &ext4_qctl_operations; 3487 sb->s_qcop = &ext4_qctl_operations;
2942 sb->dq_op = &ext4_quota_operations; 3488 sb->dq_op = &ext4_quota_operations;
2943#endif 3489#endif
3490 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3491
2944 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3492 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2945 mutex_init(&sbi->s_orphan_lock); 3493 mutex_init(&sbi->s_orphan_lock);
2946 mutex_init(&sbi->s_resize_lock); 3494 mutex_init(&sbi->s_resize_lock);
@@ -2951,6 +3499,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2951 EXT4_HAS_INCOMPAT_FEATURE(sb, 3499 EXT4_HAS_INCOMPAT_FEATURE(sb,
2952 EXT4_FEATURE_INCOMPAT_RECOVER)); 3500 EXT4_FEATURE_INCOMPAT_RECOVER));
2953 3501
3502 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3503 !(sb->s_flags & MS_RDONLY))
3504 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3505 goto failed_mount3;
3506
2954 /* 3507 /*
2955 * The first inode we look at is the journal inode. Don't try 3508 * The first inode we look at is the journal inode. Don't try
2956 * root first: it may be modified in the journal! 3509 * root first: it may be modified in the journal!
@@ -2965,8 +3518,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2965 "suppressed and not mounted read-only"); 3518 "suppressed and not mounted read-only");
2966 goto failed_mount_wq; 3519 goto failed_mount_wq;
2967 } else { 3520 } else {
2968 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3521 clear_opt(sb, DATA_FLAGS);
2969 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2970 sbi->s_journal = NULL; 3522 sbi->s_journal = NULL;
2971 needs_recovery = 0; 3523 needs_recovery = 0;
2972 goto no_journal; 3524 goto no_journal;
@@ -3004,9 +3556,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3004 */ 3556 */
3005 if (jbd2_journal_check_available_features 3557 if (jbd2_journal_check_available_features
3006 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3558 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3007 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3559 set_opt(sb, ORDERED_DATA);
3008 else 3560 else
3009 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3561 set_opt(sb, JOURNAL_DATA);
3010 break; 3562 break;
3011 3563
3012 case EXT4_MOUNT_ORDERED_DATA: 3564 case EXT4_MOUNT_ORDERED_DATA:
@@ -3022,23 +3574,25 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3022 } 3574 }
3023 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3575 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3024 3576
3025no_journal: 3577 /*
3026 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3578 * The journal may have updated the bg summary counts, so we
3027 ext4_count_free_blocks(sb)); 3579 * need to update the global counters.
3028 if (!err) 3580 */
3029 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3581 percpu_counter_set(&sbi->s_freeblocks_counter,
3030 ext4_count_free_inodes(sb)); 3582 ext4_count_free_blocks(sb));
3031 if (!err) 3583 percpu_counter_set(&sbi->s_freeinodes_counter,
3032 err = percpu_counter_init(&sbi->s_dirs_counter, 3584 ext4_count_free_inodes(sb));
3033 ext4_count_dirs(sb)); 3585 percpu_counter_set(&sbi->s_dirs_counter,
3034 if (!err) 3586 ext4_count_dirs(sb));
3035 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3587 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3036 if (err) {
3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
3038 goto failed_mount_wq;
3039 }
3040 3588
3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3589no_journal:
3590 /*
3591 * The maximum number of concurrent works can be high and
3592 * concurrency isn't really necessary. Limit it to 1.
3593 */
3594 EXT4_SB(sb)->dio_unwritten_wq =
3595 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3042 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3596 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3597 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3044 goto failed_mount_wq; 3598 goto failed_mount_wq;
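
The switch to alloc_workqueue with max_active limited to 1 means conversion jobs are processed one at a time, in order. A pthread analogue of that single-worker queue, purely illustrative:

    #include <pthread.h>
    #include <stdio.h>

    #define NJOBS 4
    static int queue[NJOBS], head, tail;
    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

    static void *worker(void *arg)
    {
        for (int done = 0; done < NJOBS; done++) {
            pthread_mutex_lock(&mtx);
            while (head == tail)
                pthread_cond_wait(&cond, &mtx);
            int job = queue[head++ % NJOBS];
            pthread_mutex_unlock(&mtx);
            /* one job at a time: the max_active = 1 guarantee */
            printf("convert unwritten extent for job %d\n", job);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        for (int i = 0; i < NJOBS; i++) {
            pthread_mutex_lock(&mtx);
            queue[tail++ % NJOBS] = i;      /* queue_work() analogue */
            pthread_cond_signal(&cond);
            pthread_mutex_unlock(&mtx);
        }
        pthread_join(t, NULL);
        return 0;
    }
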
@@ -3053,17 +3607,16 @@ no_journal:
3053 if (IS_ERR(root)) { 3607 if (IS_ERR(root)) {
3054 ext4_msg(sb, KERN_ERR, "get root inode failed"); 3608 ext4_msg(sb, KERN_ERR, "get root inode failed");
3055 ret = PTR_ERR(root); 3609 ret = PTR_ERR(root);
3610 root = NULL;
3056 goto failed_mount4; 3611 goto failed_mount4;
3057 } 3612 }
3058 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3613 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3059 iput(root);
3060 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3614 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3061 goto failed_mount4; 3615 goto failed_mount4;
3062 } 3616 }
3063 sb->s_root = d_alloc_root(root); 3617 sb->s_root = d_alloc_root(root);
3064 if (!sb->s_root) { 3618 if (!sb->s_root) {
3065 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3619 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3066 iput(root);
3067 ret = -ENOMEM; 3620 ret = -ENOMEM;
3068 goto failed_mount4; 3621 goto failed_mount4;
3069 } 3622 }
@@ -3099,18 +3652,18 @@ no_journal:
3099 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3652 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3100 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3653 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3101 "requested data journaling mode"); 3654 "requested data journaling mode");
3102 clear_opt(sbi->s_mount_opt, DELALLOC); 3655 clear_opt(sb, DELALLOC);
3103 } 3656 }
3104 if (test_opt(sb, DIOREAD_NOLOCK)) { 3657 if (test_opt(sb, DIOREAD_NOLOCK)) {
3105 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3658 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3106 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3659 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3107 "option - requested data journaling mode"); 3660 "option - requested data journaling mode");
3108 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3661 clear_opt(sb, DIOREAD_NOLOCK);
3109 } 3662 }
3110 if (sb->s_blocksize < PAGE_SIZE) { 3663 if (sb->s_blocksize < PAGE_SIZE) {
3111 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3664 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3112 "option - block size is too small"); 3665 "option - block size is too small");
3113 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3666 clear_opt(sb, DIOREAD_NOLOCK);
3114 } 3667 }
3115 } 3668 }
3116 3669
@@ -3129,6 +3682,10 @@ no_journal:
3129 goto failed_mount4; 3682 goto failed_mount4;
3130 } 3683 }
3131 3684
3685 err = ext4_register_li_request(sb, first_not_zeroed);
3686 if (err)
3687 goto failed_mount4;
3688
3132 sbi->s_kobj.kset = ext4_kset; 3689 sbi->s_kobj.kset = ext4_kset;
3133 init_completion(&sbi->s_kobj_unregister); 3690 init_completion(&sbi->s_kobj_unregister);
3134 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3691 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3160,13 +3717,9 @@ no_journal:
3160 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 3717 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3161 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 3718 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3162 3719
3163 init_timer(&sbi->s_err_report);
3164 sbi->s_err_report.function = print_daily_error_info;
3165 sbi->s_err_report.data = (unsigned long) sb;
3166 if (es->s_error_count) 3720 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3721 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3168 3722
3169 lock_kernel();
3170 kfree(orig_data); 3723 kfree(orig_data);
3171 return 0; 3724 return 0;
3172 3725
@@ -3176,6 +3729,8 @@ cantfind_ext4:
3176 goto failed_mount; 3729 goto failed_mount;
3177 3730
3178failed_mount4: 3731failed_mount4:
3732 iput(root);
3733 sb->s_root = NULL;
3179 ext4_msg(sb, KERN_ERR, "mount failed"); 3734 ext4_msg(sb, KERN_ERR, "mount failed");
3180 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3735 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3181failed_mount_wq: 3736failed_mount_wq:
@@ -3184,17 +3739,20 @@ failed_mount_wq:
3184 jbd2_journal_destroy(sbi->s_journal); 3739 jbd2_journal_destroy(sbi->s_journal);
3185 sbi->s_journal = NULL; 3740 sbi->s_journal = NULL;
3186 } 3741 }
3187 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3188 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3189 percpu_counter_destroy(&sbi->s_dirs_counter);
3190 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3191failed_mount3: 3742failed_mount3:
3743 del_timer(&sbi->s_err_report);
3192 if (sbi->s_flex_groups) { 3744 if (sbi->s_flex_groups) {
3193 if (is_vmalloc_addr(sbi->s_flex_groups)) 3745 if (is_vmalloc_addr(sbi->s_flex_groups))
3194 vfree(sbi->s_flex_groups); 3746 vfree(sbi->s_flex_groups);
3195 else 3747 else
3196 kfree(sbi->s_flex_groups); 3748 kfree(sbi->s_flex_groups);
3197 } 3749 }
3750 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3751 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3752 percpu_counter_destroy(&sbi->s_dirs_counter);
3753 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3754 if (sbi->s_mmp_tsk)
3755 kthread_stop(sbi->s_mmp_tsk);
3198failed_mount2: 3756failed_mount2:
3199 for (i = 0; i < db_count; i++) 3757 for (i = 0; i < db_count; i++)
3200 brelse(sbi->s_group_desc[i]); 3758 brelse(sbi->s_group_desc[i]);
@@ -3213,7 +3771,6 @@ out_fail:
3213 sb->s_fs_info = NULL; 3771 sb->s_fs_info = NULL;
3214 kfree(sbi->s_blockgroup_lock); 3772 kfree(sbi->s_blockgroup_lock);
3215 kfree(sbi); 3773 kfree(sbi);
3216 lock_kernel();
3217out_free_orig: 3774out_free_orig:
3218 kfree(orig_data); 3775 kfree(orig_data);
3219 return ret; 3776 return ret;
@@ -3306,13 +3863,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3306 if (bdev == NULL) 3863 if (bdev == NULL)
3307 return NULL; 3864 return NULL;
3308 3865
3309 if (bd_claim(bdev, sb)) {
3310 ext4_msg(sb, KERN_ERR,
3311 "failed to claim external journal device");
3312 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3313 return NULL;
3314 }
3315
3316 blocksize = sb->s_blocksize; 3866 blocksize = sb->s_blocksize;
3317 hblock = bdev_logical_block_size(bdev); 3867 hblock = bdev_logical_block_size(bdev);
3318 if (blocksize < hblock) { 3868 if (blocksize < hblock) {
@@ -3470,7 +4020,7 @@ static int ext4_load_journal(struct super_block *sb,
3470 EXT4_SB(sb)->s_journal = journal; 4020 EXT4_SB(sb)->s_journal = journal;
3471 ext4_clear_journal_err(sb, es); 4021 ext4_clear_journal_err(sb, es);
3472 4022
3473 if (journal_devnum && 4023 if (!really_read_only && journal_devnum &&
3474 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 4024 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3475 es->s_journal_dev = cpu_to_le32(journal_devnum); 4025 es->s_journal_dev = cpu_to_le32(journal_devnum);
3476 4026
@@ -3524,9 +4074,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3524 es->s_kbytes_written = 4074 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4075 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 4076 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3527 &EXT4_SB(sb)->s_freeblocks_counter)); 4077 &EXT4_SB(sb)->s_freeblocks_counter));
3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 4078 es->s_free_inodes_count =
3529 &EXT4_SB(sb)->s_freeinodes_counter)); 4079 cpu_to_le32(percpu_counter_sum_positive(
4080 &EXT4_SB(sb)->s_freeinodes_counter));
3530 sb->s_dirt = 0; 4081 sb->s_dirt = 0;
3531 BUFFER_TRACE(sbh, "marking dirty"); 4082 BUFFER_TRACE(sbh, "marking dirty");
3532 mark_buffer_dirty(sbh); 4083 mark_buffer_dirty(sbh);
@@ -3658,6 +4209,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3658/* 4209/*
3659 * LVM calls this function before a (read-only) snapshot is created. This 4210 * LVM calls this function before a (read-only) snapshot is created. This
3660 * gives us a chance to flush the journal completely and mark the fs clean. 4211 * gives us a chance to flush the journal completely and mark the fs clean.
4212 *
4213 * Note that this function alone cannot bring a filesystem into a clean
4214 * state, because ext4 relies on @sb->s_frozen, which lives in an upper
4215 * layer, to prevent new handles from being started. It thus needs help
4216 * from that upper layer.
3661 */ 4217 */
3662static int ext4_freeze(struct super_block *sb) 4218static int ext4_freeze(struct super_block *sb)
3663{ 4219{
@@ -3706,6 +4262,22 @@ static int ext4_unfreeze(struct super_block *sb)
3706 return 0; 4262 return 0;
3707} 4263}
3708 4264
4265/*
4266 * Structure to save mount options for ext4_remount's benefit
4267 */
4268struct ext4_mount_options {
4269 unsigned long s_mount_opt;
4270 unsigned long s_mount_opt2;
4271 uid_t s_resuid;
4272 gid_t s_resgid;
4273 unsigned long s_commit_interval;
4274 u32 s_min_batch_time, s_max_batch_time;
4275#ifdef CONFIG_QUOTA
4276 int s_jquota_fmt;
4277 char *s_qf_names[MAXQUOTAS];
4278#endif
4279};
4280
3709static int ext4_remount(struct super_block *sb, int *flags, char *data) 4281static int ext4_remount(struct super_block *sb, int *flags, char *data)
3710{ 4282{
3711 struct ext4_super_block *es; 4283 struct ext4_super_block *es;
@@ -3716,18 +4288,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3716 int enable_quota = 0; 4288 int enable_quota = 0;
3717 ext4_group_t g; 4289 ext4_group_t g;
3718 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4290 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3719 int err; 4291 int err = 0;
3720#ifdef CONFIG_QUOTA 4292#ifdef CONFIG_QUOTA
3721 int i; 4293 int i;
3722#endif 4294#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL); 4295 char *orig_data = kstrdup(data, GFP_KERNEL);
3724 4296
3725 lock_kernel();
3726
3727 /* Store the original options */ 4297 /* Store the original options */
3728 lock_super(sb); 4298 lock_super(sb);
3729 old_sb_flags = sb->s_flags; 4299 old_sb_flags = sb->s_flags;
3730 old_opts.s_mount_opt = sbi->s_mount_opt; 4300 old_opts.s_mount_opt = sbi->s_mount_opt;
4301 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
3731 old_opts.s_resuid = sbi->s_resuid; 4302 old_opts.s_resuid = sbi->s_resuid;
3732 old_opts.s_resgid = sbi->s_resgid; 4303 old_opts.s_resgid = sbi->s_resgid;
3733 old_opts.s_commit_interval = sbi->s_commit_interval; 4304 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -3843,9 +4414,29 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3843 goto restore_opts; 4414 goto restore_opts;
3844 if (!ext4_setup_super(sb, es, 0)) 4415 if (!ext4_setup_super(sb, es, 0))
3845 sb->s_flags &= ~MS_RDONLY; 4416 sb->s_flags &= ~MS_RDONLY;
4417 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4418 EXT4_FEATURE_INCOMPAT_MMP))
4419 if (ext4_multi_mount_protect(sb,
4420 le64_to_cpu(es->s_mmp_block))) {
4421 err = -EROFS;
4422 goto restore_opts;
4423 }
3846 enable_quota = 1; 4424 enable_quota = 1;
3847 } 4425 }
3848 } 4426 }
4427
4428 /*
4429 * Reinitialize lazy itable initialization thread based on
4430 * current settings
4431 */
4432 if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4433 ext4_unregister_li_request(sb);
4434 else {
4435 ext4_group_t first_not_zeroed;
4436 first_not_zeroed = ext4_has_uninit_itable(sb);
4437 ext4_register_li_request(sb, first_not_zeroed);
4438 }
4439
3849 ext4_setup_system_zone(sb); 4440 ext4_setup_system_zone(sb);
3850 if (sbi->s_journal == NULL) 4441 if (sbi->s_journal == NULL)
3851 ext4_commit_super(sb, 1); 4442 ext4_commit_super(sb, 1);
@@ -3858,7 +4449,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3858 kfree(old_opts.s_qf_names[i]); 4449 kfree(old_opts.s_qf_names[i]);
3859#endif 4450#endif
3860 unlock_super(sb); 4451 unlock_super(sb);
3861 unlock_kernel();
3862 if (enable_quota) 4452 if (enable_quota)
3863 dquot_resume(sb, -1); 4453 dquot_resume(sb, -1);
3864 4454
@@ -3869,6 +4459,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3869restore_opts: 4459restore_opts:
3870 sb->s_flags = old_sb_flags; 4460 sb->s_flags = old_sb_flags;
3871 sbi->s_mount_opt = old_opts.s_mount_opt; 4461 sbi->s_mount_opt = old_opts.s_mount_opt;
4462 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
3872 sbi->s_resuid = old_opts.s_resuid; 4463 sbi->s_resuid = old_opts.s_resuid;
3873 sbi->s_resgid = old_opts.s_resgid; 4464 sbi->s_resgid = old_opts.s_resgid;
3874 sbi->s_commit_interval = old_opts.s_commit_interval; 4465 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -3884,7 +4475,6 @@ restore_opts:
3884 } 4475 }
3885#endif 4476#endif
3886 unlock_super(sb); 4477 unlock_super(sb);
3887 unlock_kernel();
3888 kfree(orig_data); 4478 kfree(orig_data);
3889 return err; 4479 return err;
3890} 4480}
@@ -3895,6 +4485,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3895 struct ext4_sb_info *sbi = EXT4_SB(sb); 4485 struct ext4_sb_info *sbi = EXT4_SB(sb);
3896 struct ext4_super_block *es = sbi->s_es; 4486 struct ext4_super_block *es = sbi->s_es;
3897 u64 fsid; 4487 u64 fsid;
4488 s64 bfree;
3898 4489
3899 if (test_opt(sb, MINIX_DF)) { 4490 if (test_opt(sb, MINIX_DF)) {
3900 sbi->s_overhead_last = 0; 4491 sbi->s_overhead_last = 0;
@@ -3938,8 +4529,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3938 buf->f_type = EXT4_SUPER_MAGIC; 4529 buf->f_type = EXT4_SUPER_MAGIC;
3939 buf->f_bsize = sb->s_blocksize; 4530 buf->f_bsize = sb->s_blocksize;
3940 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4531 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3941 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4532 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3942 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4533 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4534 /* prevent underflow in case little free space is available */
4535 buf->f_bfree = max_t(s64, bfree, 0);
3943 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4536 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3944 if (buf->f_bfree < ext4_r_blocks_count(es)) 4537 if (buf->f_bfree < ext4_r_blocks_count(es))
3945 buf->f_bavail = 0; 4538 buf->f_bavail = 0;
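
The clamp matters because the free and dirty counters are sampled separately, so their difference can transiently go negative. A trivial sketch of the guard:

    #include <stdio.h>
    #include <stdint.h>

    static int64_t max_s64(int64_t a, int64_t b) { return a > b ? a : b; }

    int main(void)
    {
        int64_t freeblocks = 100;    /* sampled first */
        int64_t dirtyblocks = 130;   /* may have grown meanwhile */
        int64_t bfree = max_s64(freeblocks - dirtyblocks, 0);

        printf("f_bfree = %lld\n", (long long)bfree);  /* 0, not -30 */
        return 0;
    }
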
@@ -4066,27 +4659,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4066 * Standard function to be called on quota_on 4659 * Standard function to be called on quota_on
4067 */ 4660 */
4068static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4661static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4069 char *name) 4662 struct path *path)
4070{ 4663{
4071 int err; 4664 int err;
4072 struct path path;
4073 4665
4074 if (!test_opt(sb, QUOTA)) 4666 if (!test_opt(sb, QUOTA))
4075 return -EINVAL; 4667 return -EINVAL;
4076 4668
4077 err = kern_path(name, LOOKUP_FOLLOW, &path);
4078 if (err)
4079 return err;
4080
4081 /* Quotafile not on the same filesystem? */ 4669 /* Quotafile not on the same filesystem? */
4082 if (path.mnt->mnt_sb != sb) { 4670 if (path->mnt->mnt_sb != sb)
4083 path_put(&path);
4084 return -EXDEV; 4671 return -EXDEV;
4085 }
4086 /* Journaling quota? */ 4672 /* Journaling quota? */
4087 if (EXT4_SB(sb)->s_qf_names[type]) { 4673 if (EXT4_SB(sb)->s_qf_names[type]) {
4088 /* Quotafile not in fs root? */ 4674 /* Quotafile not in fs root? */
4089 if (path.dentry->d_parent != sb->s_root) 4675 if (path->dentry->d_parent != sb->s_root)
4090 ext4_msg(sb, KERN_WARNING, 4676 ext4_msg(sb, KERN_WARNING,
4091 "Quota file not on filesystem root. " 4677 "Quota file not on filesystem root. "
4092 "Journaled quota will not work"); 4678 "Journaled quota will not work");
@@ -4097,7 +4683,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4097 * all updates to the file when we bypass pagecache... 4683 * all updates to the file when we bypass pagecache...
4098 */ 4684 */
4099 if (EXT4_SB(sb)->s_journal && 4685 if (EXT4_SB(sb)->s_journal &&
4100 ext4_should_journal_data(path.dentry->d_inode)) { 4686 ext4_should_journal_data(path->dentry->d_inode)) {
4101 /* 4687 /*
4102 * We don't need to lock updates but journal_flush() could 4688 * We don't need to lock updates but journal_flush() could
4103 * otherwise be livelocked... 4689 * otherwise be livelocked...
@@ -4105,32 +4691,42 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4105 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4691 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4106 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4692 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4107 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4693 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4108 if (err) { 4694 if (err)
4109 path_put(&path);
4110 return err; 4695 return err;
4111 }
4112 } 4696 }
4113 4697
4114 err = dquot_quota_on_path(sb, type, format_id, &path); 4698 return dquot_quota_on(sb, type, format_id, path);
4115 path_put(&path);
4116 return err;
4117} 4699}
4118 4700
4119static int ext4_quota_off(struct super_block *sb, int type) 4701static int ext4_quota_off(struct super_block *sb, int type)
4120{ 4702{
4121 /* Force all delayed allocation blocks to be allocated */ 4703 struct inode *inode = sb_dqopt(sb)->files[type];
4122 if (test_opt(sb, DELALLOC)) { 4704 handle_t *handle;
4123 down_read(&sb->s_umount); 4705
4706 /* Force all delayed allocation blocks to be allocated.
4707 * Caller already holds s_umount sem */
4708 if (test_opt(sb, DELALLOC))
4124 sync_filesystem(sb); 4709 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127 4710
4711 if (!inode)
4712 goto out;
4713
4714 /* Update modification times of quota files when userspace can
4715 * start looking at them */
4716 handle = ext4_journal_start(inode, 1);
4717 if (IS_ERR(handle))
4718 goto out;
4719 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4720 ext4_mark_inode_dirty(handle, inode);
4721 ext4_journal_stop(handle);
4722
4723out:
4128 return dquot_quota_off(sb, type); 4724 return dquot_quota_off(sb, type);
4129} 4725}
4130 4726
4131/* Read data from quotafile - avoid pagecache and such because we cannot afford 4727/* Read data from quotafile - avoid pagecache and such because we cannot afford
4132 * acquiring the locks... As quota files are never truncated and quota code 4728 * acquiring the locks... As quota files are never truncated and quota code
4133 * itself serializes the operations (and noone else should touch the files) 4729 * itself serializes the operations (and no one else should touch the files)
4134 * we don't have to be afraid of races */ 4730 * we don't have to be afraid of races */
4135static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 4731static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
4136 size_t len, loff_t off) 4732 size_t len, loff_t off)
@@ -4220,30 +4816,21 @@ out:
4220 if (inode->i_size < off + len) { 4816 if (inode->i_size < off + len) {
4221 i_size_write(inode, off + len); 4817 i_size_write(inode, off + len);
4222 EXT4_I(inode)->i_disksize = inode->i_size; 4818 EXT4_I(inode)->i_disksize = inode->i_size;
4819 ext4_mark_inode_dirty(handle, inode);
4223 } 4820 }
4224 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4225 ext4_mark_inode_dirty(handle, inode);
4226 mutex_unlock(&inode->i_mutex); 4821 mutex_unlock(&inode->i_mutex);
4227 return len; 4822 return len;
4228} 4823}
4229 4824
4230#endif 4825#endif
4231 4826
4232static int ext4_get_sb(struct file_system_type *fs_type, int flags, 4827static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4233 const char *dev_name, void *data, struct vfsmount *mnt) 4828 const char *dev_name, void *data)
4234{ 4829{
4235 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4830 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4236} 4831}
4237 4832
4238#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4833#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4239static struct file_system_type ext2_fs_type = {
4240 .owner = THIS_MODULE,
4241 .name = "ext2",
4242 .get_sb = ext4_get_sb,
4243 .kill_sb = kill_block_super,
4244 .fs_flags = FS_REQUIRES_DEV,
4245};
4246
4247static inline void register_as_ext2(void) 4834static inline void register_as_ext2(void)
4248{ 4835{
4249 int err = register_filesystem(&ext2_fs_type); 4836 int err = register_filesystem(&ext2_fs_type);
@@ -4256,10 +4843,22 @@ static inline void unregister_as_ext2(void)
4256{ 4843{
4257 unregister_filesystem(&ext2_fs_type); 4844 unregister_filesystem(&ext2_fs_type);
4258} 4845}
4846
4847static inline int ext2_feature_set_ok(struct super_block *sb)
4848{
4849 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4850 return 0;
4851 if (sb->s_flags & MS_RDONLY)
4852 return 1;
4853 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4854 return 0;
4855 return 1;
4856}
4259MODULE_ALIAS("ext2"); 4857MODULE_ALIAS("ext2");
4260#else 4858#else
4261static inline void register_as_ext2(void) { } 4859static inline void register_as_ext2(void) { }
4262static inline void unregister_as_ext2(void) { } 4860static inline void unregister_as_ext2(void) { }
4861static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4263#endif 4862#endif
4264 4863
4265#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4864#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4275,79 +4874,155 @@ static inline void unregister_as_ext3(void)
4275{ 4874{
4276 unregister_filesystem(&ext3_fs_type); 4875 unregister_filesystem(&ext3_fs_type);
4277} 4876}
4877
4878static inline int ext3_feature_set_ok(struct super_block *sb)
4879{
4880 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4881 return 0;
4882 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4883 return 0;
4884 if (sb->s_flags & MS_RDONLY)
4885 return 1;
4886 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4887 return 0;
4888 return 1;
4889}
4278MODULE_ALIAS("ext3"); 4890MODULE_ALIAS("ext3");
4279#else 4891#else
4280static inline void register_as_ext3(void) { } 4892static inline void register_as_ext3(void) { }
4281static inline void unregister_as_ext3(void) { } 4893static inline void unregister_as_ext3(void) { }
4894static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4282#endif 4895#endif
4283 4896
4284static struct file_system_type ext4_fs_type = { 4897static struct file_system_type ext4_fs_type = {
4285 .owner = THIS_MODULE, 4898 .owner = THIS_MODULE,
4286 .name = "ext4", 4899 .name = "ext4",
4287 .get_sb = ext4_get_sb, 4900 .mount = ext4_mount,
4288 .kill_sb = kill_block_super, 4901 .kill_sb = kill_block_super,
4289 .fs_flags = FS_REQUIRES_DEV, 4902 .fs_flags = FS_REQUIRES_DEV,
4290}; 4903};
4291 4904
4292static int __init init_ext4_fs(void) 4905static int __init ext4_init_feat_adverts(void)
4293{ 4906{
4294 int err; 4907 struct ext4_features *ef;
4908 int ret = -ENOMEM;
4909
4910 ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4911 if (!ef)
4912 goto out;
4913
4914 ef->f_kobj.kset = ext4_kset;
4915 init_completion(&ef->f_kobj_unregister);
4916 ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4917 "features");
4918 if (ret) {
4919 kfree(ef);
4920 goto out;
4921 }
4922
4923 ext4_feat = ef;
4924 ret = 0;
4925out:
4926 return ret;
4927}
4928
4929static void ext4_exit_feat_adverts(void)
4930{
4931 kobject_put(&ext4_feat->f_kobj);
4932 wait_for_completion(&ext4_feat->f_kobj_unregister);
4933 kfree(ext4_feat);
4934}
4935
4936/* Shared across all ext4 file systems */
4937wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4938struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4939
4940static int __init ext4_init_fs(void)
4941{
4942 int i, err;
4295 4943
4296 ext4_check_flag_values(); 4944 ext4_check_flag_values();
4297 err = init_ext4_system_zone(); 4945
4946 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4947 mutex_init(&ext4__aio_mutex[i]);
4948 init_waitqueue_head(&ext4__ioend_wq[i]);
4949 }
4950
4951 err = ext4_init_pageio();
4298 if (err) 4952 if (err)
4299 return err; 4953 return err;
4954 err = ext4_init_system_zone();
4955 if (err)
4956 goto out7;
4300 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4957 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4301 if (!ext4_kset) 4958 if (!ext4_kset)
4302 goto out4; 4959 goto out6;
4303 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4960 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4304 err = init_ext4_mballoc(); 4961 if (!ext4_proc_root)
4962 goto out5;
4963
4964 err = ext4_init_feat_adverts();
4965 if (err)
4966 goto out4;
4967
4968 err = ext4_init_mballoc();
4305 if (err) 4969 if (err)
4306 goto out3; 4970 goto out3;
4307 4971
4308 err = init_ext4_xattr(); 4972 err = ext4_init_xattr();
4309 if (err) 4973 if (err)
4310 goto out2; 4974 goto out2;
4311 err = init_inodecache(); 4975 err = init_inodecache();
4312 if (err) 4976 if (err)
4313 goto out1; 4977 goto out1;
4314 register_as_ext2();
4315 register_as_ext3(); 4978 register_as_ext3();
4979 register_as_ext2();
4316 err = register_filesystem(&ext4_fs_type); 4980 err = register_filesystem(&ext4_fs_type);
4317 if (err) 4981 if (err)
4318 goto out; 4982 goto out;
4983
4984 ext4_li_info = NULL;
4985 mutex_init(&ext4_li_mtx);
4319 return 0; 4986 return 0;
4320out: 4987out:
4321 unregister_as_ext2(); 4988 unregister_as_ext2();
4322 unregister_as_ext3(); 4989 unregister_as_ext3();
4323 destroy_inodecache(); 4990 destroy_inodecache();
4324out1: 4991out1:
4325 exit_ext4_xattr(); 4992 ext4_exit_xattr();
4326out2: 4993out2:
4327 exit_ext4_mballoc(); 4994 ext4_exit_mballoc();
4328out3: 4995out3:
4996 ext4_exit_feat_adverts();
4997out4:
4329 remove_proc_entry("fs/ext4", NULL); 4998 remove_proc_entry("fs/ext4", NULL);
4999out5:
4330 kset_unregister(ext4_kset); 5000 kset_unregister(ext4_kset);
4331out4: 5001out6:
4332 exit_ext4_system_zone(); 5002 ext4_exit_system_zone();
5003out7:
5004 ext4_exit_pageio();
4333 return err; 5005 return err;
4334} 5006}
4335 5007
4336static void __exit exit_ext4_fs(void) 5008static void __exit ext4_exit_fs(void)
4337{ 5009{
5010 ext4_destroy_lazyinit_thread();
4338 unregister_as_ext2(); 5011 unregister_as_ext2();
4339 unregister_as_ext3(); 5012 unregister_as_ext3();
4340 unregister_filesystem(&ext4_fs_type); 5013 unregister_filesystem(&ext4_fs_type);
4341 destroy_inodecache(); 5014 destroy_inodecache();
4342 exit_ext4_xattr(); 5015 ext4_exit_xattr();
4343 exit_ext4_mballoc(); 5016 ext4_exit_mballoc();
5017 ext4_exit_feat_adverts();
4344 remove_proc_entry("fs/ext4", NULL); 5018 remove_proc_entry("fs/ext4", NULL);
4345 kset_unregister(ext4_kset); 5019 kset_unregister(ext4_kset);
4346 exit_ext4_system_zone(); 5020 ext4_exit_system_zone();
5021 ext4_exit_pageio();
4347} 5022}
4348 5023
4349MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 5024MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
4350MODULE_DESCRIPTION("Fourth Extended Filesystem"); 5025MODULE_DESCRIPTION("Fourth Extended Filesystem");
4351MODULE_LICENSE("GPL"); 5026MODULE_LICENSE("GPL");
4352module_init(init_ext4_fs) 5027module_init(ext4_init_fs)
4353module_exit(exit_ext4_fs) 5028module_exit(ext4_exit_fs)
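
ext4_init_fs now unwinds through a ladder of labels (out7 down to out): a failure at step N jumps to the label that tears down steps N-1 and below, in reverse order of setup. A compact sketch of the idiom with stand-in steps:

    #include <stdio.h>

    static int step(int n) { return n == 3 ? -1 : 0; }  /* step 3 fails */
    static void undo(int n) { printf("undo step %d\n", n); }

    int main(void)
    {
        int err;

        if ((err = step(1))) goto out0;
        if ((err = step(2))) goto out1;
        if ((err = step(3))) goto out2;
        if ((err = step(4))) goto out3;
        return 0;
    out3: undo(3);
    out2: undo(2);
    out1: undo(1);
    out0:
        return err;
    }
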
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
448 448
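
The rewritten ext4_xattr_list lists in-inode attributes first, advances the caller's buffer by that length, lists block attributes into the remainder, and returns the sum (or the first error). A self-contained sketch of that two-stage listing, with illustrative helpers in place of the real ibody/block routines:

    #include <stdio.h>
    #include <string.h>

    static int list_ibody(char *buf, size_t size)
    {
        static const char dat[] = "user.a\0";   /* name plus its NUL: 7 bytes */
        if (buf) {
            if (size < sizeof(dat) - 1)
                return -34;                     /* -ERANGE */
            memcpy(buf, dat, sizeof(dat) - 1);
        }
        return (int)(sizeof(dat) - 1);
    }

    static int list_block(char *buf, size_t size)
    {
        static const char dat[] = "user.b\0";
        if (buf) {
            if (size < sizeof(dat) - 1)
                return -34;
            memcpy(buf, dat, sizeof(dat) - 1);
        }
        return (int)(sizeof(dat) - 1);
    }

    static int xattr_list(char *buffer, size_t buffer_size)
    {
        int ret, ret2;

        ret = ret2 = list_ibody(buffer, buffer_size);
        if (ret < 0)
            return ret;
        if (buffer) {            /* advance past the in-inode names */
            buffer += ret;
            buffer_size -= ret;
        }
        ret = list_block(buffer, buffer_size);
        if (ret < 0)
            return ret;
        return ret + ret2;       /* combined length of both listings */
    }

    int main(void)
    {
        char buf[64];
        printf("total = %d\n", xattr_list(buf, sizeof(buf)));
        return 0;
    }
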
449/* 449/*
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
735 int offset = (char *)s->here - bs->bh->b_data; 735 int offset = (char *)s->here - bs->bh->b_data;
736 736
737 unlock_buffer(bs->bh); 737 unlock_buffer(bs->bh);
738 jbd2_journal_release_buffer(handle, bs->bh); 738 ext4_handle_release_buffer(handle, bs->bh);
739 if (ce) { 739 if (ce) {
740 mb_cache_entry_release(ce); 740 mb_cache_entry_release(ce);
741 ce = NULL; 741 ce = NULL;
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827
@@ -833,7 +833,7 @@ inserted:
833 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
834 if (!new_bh) { 834 if (!new_bh) {
835getblk_failed: 835getblk_failed:
836 ext4_free_blocks(handle, inode, 0, block, 1, 836 ext4_free_blocks(handle, inode, NULL, block, 1,
837 EXT4_FREE_BLOCKS_METADATA); 837 EXT4_FREE_BLOCKS_METADATA);
838 error = -EIO; 838 error = -EIO;
839 goto cleanup; 839 goto cleanup;
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1588#undef BLOCK_HASH_SHIFT 1588#undef BLOCK_HASH_SHIFT
1589 1589
1590int __init 1590int __init
1591init_ext4_xattr(void) 1591ext4_init_xattr(void)
1592{ 1592{
1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1594 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
1597} 1597}
1598 1598
1599void 1599void
1600exit_ext4_xattr(void) 1600ext4_exit_xattr(void)
1601{ 1601{
1602 if (ext4_xattr_cache) 1602 if (ext4_xattr_cache)
1603 mb_cache_destroy(ext4_xattr_cache); 1603 mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
84 struct ext4_inode *raw_inode, handle_t *handle); 84 struct ext4_inode *raw_inode, handle_t *handle);
85 85
86extern int init_ext4_xattr(void); 86extern int __init ext4_init_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void ext4_exit_xattr(void);
88 88
89extern const struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
121{ 121{
122} 122}
123 123
124static inline int 124static __init inline int
125init_ext4_xattr(void) 125ext4_init_xattr(void)
126{ 126{
127 return 0; 127 return 0;
128} 128}
129 129
130static inline void 130static inline void
131exit_ext4_xattr(void) 131ext4_exit_xattr(void)
132{ 132{
133} 133}
134 134
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;