 34 files changed, 1900 insertions(+), 1419 deletions(-)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 34ea4f1fa6ea..f7cbf574a875 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
 session_write_kbytes    This file is read-only and shows the number of
                         kilobytes of data that have been written to this
                         filesystem since it was mounted.
+
+reserved_clusters       This is a read-write file and contains the number
+                        of reserved clusters in the file system, which are
+                        used in specific situations to avoid costly
+                        zeroout, unexpected ENOSPC, or possible data
+                        loss. The default is 2% of the file system or 4096
+                        clusters, whichever is smaller, and it can be
+                        changed, but it can never exceed the number of
+                        clusters in the file system. If there is not
+                        enough space for the reserved space when mounting,
+                        the mount will _not_ fail.
 ..............................................................................
 
 Ioctls
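
As a usage illustration (not part of the patch), the new sysfs file could be driven from user space as below; the device name "sda1" and the value 8192 are made-up examples, and the write must stay within the number of clusters in the file system:

#include <stdio.h>

int main(void)
{
    /* sysfs name is the devname of the mounted ext4 file system */
    FILE *f = fopen("/sys/fs/ext4/sda1/reserved_clusters", "w");

    if (!f) {
        perror("fopen");
        return 1;
    }
    fprintf(f, "%u\n", 8192);   /* example value, in clusters */
    fclose(f);
    return 0;
}
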
@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
                               bitmaps and inode table, the userspace tool thus
                               just passes the new number of blocks.
 
+ EXT4_IOC_SWAP_BOOT           Swap the data blocks and associated
+                              attributes (i_blocks, i_size, i_flags, ...)
+                              between the specified inode and inode
+                              EXT4_BOOT_LOADER_INO (#5). This is typically
+                              used to store a boot loader in a secure part
+                              of the filesystem, where it can't be changed
+                              by a normal user by accident.
+                              The data blocks of the previous boot loader
+                              will be associated with the given inode.
+
 ..............................................................................
 
 References
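
A hedged sketch of how user space might invoke the new ioctl; it takes no argument, the fallback define simply mirrors the EXT4_IOC_SWAP_BOOT hunk later in this series, and running it requires appropriate privileges on an ext4 file:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#ifndef EXT4_IOC_SWAP_BOOT
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)  /* matches the ext4.h hunk below */
#endif

int main(int argc, char **argv)
{
    int fd;

    if (argc < 2)
        return 1;
    fd = open(argv[1], O_RDWR);  /* file holding the new boot loader */
    if (fd < 0 || ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
        perror("EXT4_IOC_SWAP_BOOT");
        return 1;
    }
    close(fd);
    return 0;
}
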
diff --git a/fs/buffer.c b/fs/buffer.c
index 10ef81e10b20..bc1fe14aaa3e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
     /* Take care of bh's that straddle the end of the device */
     guard_bh_eod(rw, bio, bh);
 
+    if (buffer_meta(bh))
+        rw |= REQ_META;
+    if (buffer_prio(bh))
+        rw |= REQ_PRIO;
+
     bio_get(bio);
     submit_bio(rw, bio);
 
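
For context, _submit_bh() only honors flags a filesystem has set on the buffer beforehand. A minimal sketch of the producer side, matching the set_buffer_meta()/set_buffer_prio() calls added to ext4_jbd2.c later in this patch (the wrapper function itself is hypothetical):

/* Sketch: tag a metadata buffer so _submit_bh() adds REQ_META | REQ_PRIO */
static void fs_write_metadata_bh(struct buffer_head *bh)
{
    set_buffer_meta(bh);    /* mark as metadata for the block layer */
    set_buffer_prio(bh);    /* request elevated I/O priority */
    mark_buffer_dirty(bh);
    sync_dirty_buffer(bh);  /* write ends up going through submit_bh() */
}
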
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb9..efea5d5c44ce 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
       Enables run-time debugging support for the ext4 filesystem.
 
       If you select Y here, then you will be able to turn on debugging
-      with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+      with a command such as:
+        echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92e68b33fffd..d0f13eada0ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  */
 
 /*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+                   ext4_fsblk_t block)
+{
+    ext4_group_t group;
+
+    if (test_opt2(sb, STD_GROUP_SIZE))
+        group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+             block) >>
+            (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+    else
+        ext4_get_group_no_and_offset(sb, block, &group, NULL);
+    return group;
+}
+
+/*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
  */
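
For intuition, the STD_GROUP_SIZE fast path above reduces to a single shift. A standalone sketch with assumed values (4 KiB blocks, no bigalloc, so s_first_data_block is 0 and each group holds blocksize * 8 = 32768 blocks):

#include <stdio.h>

int main(void)
{
    unsigned long long block = 100000;  /* example block number */
    unsigned block_size_bits = 12;      /* 4 KiB blocks */
    unsigned cluster_bits = 0;          /* no bigalloc */
    unsigned first_data_block = 0;      /* 0 for 4 KiB blocks */

    /* group size is 1 << (12 + 0 + 3) == 32768 blocks */
    unsigned group = (first_data_block + block) >>
                     (block_size_bits + cluster_bits + 3);

    printf("block %llu is in group %u\n", block, group);  /* prints 3 */
    return 0;
}
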
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-            ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+                      ext4_fsblk_t block,
+                      ext4_group_t block_group)
 {
     ext4_group_t actual_group;
-    ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-    if (actual_group == block_group)
-        return 1;
-    return 0;
+
+    actual_group = ext4_get_group_number(sb, block);
+    return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
     trace_ext4_read_block_bitmap_load(sb, block_group);
     bh->b_end_io = ext4_end_bitmap_read;
     get_bh(bh);
-    submit_bh(READ, bh);
+    submit_bh(READ | REQ_META | REQ_PRIO, bh);
     return bh;
 verify:
     ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
                   s64 nclusters, unsigned int flags)
 {
-    s64 free_clusters, dirty_clusters, root_clusters;
+    s64 free_clusters, dirty_clusters, rsv, resv_clusters;
     struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
     struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
     free_clusters  = percpu_counter_read_positive(fcc);
     dirty_clusters = percpu_counter_read_positive(dcc);
+    resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
     /*
      * r_blocks_count should always be multiple of the cluster ratio so
      * we are safe to do a plane bit shift only.
      */
-    root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+    rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+          resv_clusters;
 
-    if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+    if (free_clusters - (nclusters + rsv + dirty_clusters) <
                     EXT4_FREECLUSTERS_WATERMARK) {
         free_clusters  = percpu_counter_sum_positive(fcc);
         dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
     /* Check whether we have space after accounting for current
      * dirty clusters & root reserved clusters.
      */
-    if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+    if (free_clusters >= (rsv + nclusters + dirty_clusters))
         return 1;
 
     /* Hm, nope. Are (enough) root reserved clusters available? */
     if (uid_eq(sbi->s_resuid, current_fsuid()) ||
         (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
         capable(CAP_SYS_RESOURCE) ||
         (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+        if (free_clusters >= (nclusters + dirty_clusters +
+                      resv_clusters))
+            return 1;
+    }
+    /* No free blocks. Let's see if we can dip into reserved pool */
+    if (flags & EXT4_MB_USE_RESERVED) {
         if (free_clusters >= (nclusters + dirty_clusters))
             return 1;
     }
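
To make the three-tier check concrete, a standalone sketch with invented numbers (5000 free, 500 dirty, a 4000-cluster root reserve, and the new 1000-cluster s_resv_clusters pool):

#include <stdio.h>

int main(void)
{
    long long free_clusters = 5000, dirty_clusters = 500;
    long long root_rsv = 4000;      /* from r_blocks_count */
    long long resv_clusters = 1000; /* new s_resv_clusters pool */
    long long nclusters = 600;      /* this allocation request */

    /* Ordinary allocations must leave both reserves untouched. */
    if (free_clusters >= root_rsv + resv_clusters + nclusters + dirty_clusters)
        printf("ordinary allocation succeeds\n");
    /* Root/resuid/resgid may eat the root reserve, not s_resv_clusters. */
    else if (free_clusters >= nclusters + dirty_clusters + resv_clusters)
        printf("allowed only for root/resuid/resgid\n");   /* this case */
    /* EXT4_MB_USE_RESERVED may dip into the last-resort pool too. */
    else if (free_clusters >= nclusters + dirty_clusters)
        printf("allowed only with EXT4_MB_USE_RESERVED\n");
    else
        printf("ENOSPC\n");
    return 0;
}
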
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d8cd1f0f4661..f8d56e4254e0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
     if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
              EXT4_FEATURE_COMPAT_DIR_INDEX) &&
         ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-         ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+         ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+         ext4_has_inline_data(inode)))
         return 1;
 
     return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
     int ret = 0;
     int dir_has_error = 0;
 
-    if (ext4_has_inline_data(inode)) {
-        int has_inline_data = 1;
-        ret = ext4_read_inline_dir(filp, dirent, filldir,
-                       &has_inline_data);
-        if (has_inline_data)
-            return ret;
-    }
-
     if (is_dx_dir(inode)) {
         err = ext4_dx_readdir(filp, dirent, filldir);
         if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
             ext4_clear_inode_flag(file_inode(filp),
                           EXT4_INODE_INDEX);
     }
+
+    if (ext4_has_inline_data(inode)) {
+        int has_inline_data = 1;
+        ret = ext4_read_inline_dir(filp, dirent, filldir,
+                       &has_inline_data);
+        if (has_inline_data)
+            return ret;
+    }
+
     stored = 0;
     offset = filp->f_pos & (sb->s_blocksize - 1);
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b83cd604796..0aabb344b02e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_STREAM_ALLOC		0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS		0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED		0x2000
 
 struct ext4_allocation_request {
     /* target inode for block we're allocating */
@@ -196,19 +198,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_DIRECT	0x0004
 
-struct ext4_io_page {
-    struct page	*p_page;
-    atomic_t	p_count;
-};
-
-#define MAX_IO_PAGES 128
-
 /*
  * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place. For AIO/DIO, neither field is filled in.
  */
 typedef struct ext4_io_end {
     struct list_head	list;		/* per-file finished IO list */
@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
     ssize_t		size;		/* size of the extent */
     struct kiocb	*iocb;		/* iocb struct for AIO */
     int			result;		/* error value for AIO */
-    int			num_io_pages;   /* for writepages() */
-    struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
+    atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
     int			io_op;
     struct bio		*io_bio;
     ext4_io_end_t	*io_end;
-    struct ext4_io_page	*io_page;
     sector_t		io_next_block;
 };
 
@@ -403,7 +392,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +546,8 @@ enum {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-    /* Caller is from the delayed allocation writeout path,
-       so set the magic i_delalloc_reserve_flag after taking the
-       inode allocation semaphore for */
+    /* Caller is from the delayed allocation writeout path
+     * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
     /* caller is from the direct IO path, request to creation of an
        unitialized extents if not allocated, split the uninitialized
@@ -571,8 +559,9 @@ enum {
     /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 						 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-    /* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+    /* Eventual metadata allocation (due to growing extent tree)
+     * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
     /* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
     /* Request will not result in inode size update (user for fallocate) */
@@ -616,6 +605,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -949,7 +939,7 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
 
 /*
- * Mount flags
+ * Mount flags set via mount options or defaults
  */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
@@ -981,8 +971,16 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
 #define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
 						      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
+						      size of blocksize * 8
+						      blocks */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
     unsigned int s_mount_flags;
     unsigned int s_def_mount_opt;
     ext4_fsblk_t s_sb_block;
+    atomic64_t s_resv_clusters;
     kuid_t s_resuid;
     kgid_t s_resgid;
     unsigned short s_mount_state;
@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
     return ino == EXT4_ROOT_INO ||
         ino == EXT4_USR_QUOTA_INO ||
         ino == EXT4_GRP_QUOTA_INO ||
+        ino == EXT4_BOOT_LOADER_INO ||
         ino == EXT4_JOURNAL_INO ||
         ino == EXT4_RESIZE_INO ||
         (ino >= EXT4_FIRST_INO(sb) &&
@@ -1374,6 +1374,7 @@ enum {
     EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
                        nolocking */
     EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
+    EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)			\
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
  */
 #define ERR_BAD_DX_DIR	-75000
 
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-            ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                   struct buffer_head *bh);
 
 /* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+                     ext4_fsblk_t blocknr,
+                     ext4_group_t *blockgrpp,
+                     ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+                      ext4_fsblk_t block);
+
 extern void ext4_validate_block_bitmap(struct super_block *sb,
                        struct ext4_group_desc *desc,
                        unsigned int block_group,
@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
                   unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+                 ext4_lblk_t first, ext4_lblk_t stop);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
 extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 extern int ext4_read_inline_dir(struct file *filp,
                 void *dirent, filldir_t filldir,
                 int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+                   struct inode *dir, ext4_lblk_t block,
+                   struct dx_hash_info *hinfo,
+                   __u32 start_hash, __u32 start_minor_hash,
+                   int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                     const struct qstr *d_name,
                     struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
 extern int ext4_handle_dirty_dirent_node(handle_t *handle,
                      struct inode *inode,
                      struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+    [S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+    [S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+    [S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+    [S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+    [S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+    [S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+    [S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+                struct ext4_dir_entry_2 *de,
+                umode_t mode) {
+    if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+        de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
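
A usage sketch for the newly exported helper; the surrounding fill_dirent() wrapper is hypothetical, but the final call mirrors how add_dirent_to_buf() in fs/ext4/namei.c records the directory-entry file type:

/* Sketch: populating a directory entry for a new inode */
static void fill_dirent(struct inode *dir, struct inode *inode,
            struct ext4_dir_entry_2 *de, const char *name, int len)
{
    de->inode = cpu_to_le32(inode->i_ino);
    de->name_len = len;
    memcpy(de->name, name, len);
    /* Maps S_IFDIR to EXT4_FT_DIR, S_IFREG to EXT4_FT_REG_FILE, etc.,
     * but only when the FILETYPE incompat feature is enabled. */
    ext4_set_de_type(dir->i_sb, de, inode->i_mode);
}
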
@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                        int chunk);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
-                loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+                 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+                        struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+                      struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                  __u64 start_orig, __u64 start_donor,
                  __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+                struct writeback_control *wbc);
 extern void ext4_end_io_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
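
The new atomic_t count field and the get/put prototypes suggest a plain reference-count pattern for io_end objects; a sketch of that pattern under stated assumptions (ext4_end_io_finish() is a hypothetical stand-in for the final conversion-and-free step, not the kernel's actual function):

/* Sketch of the implied refcount discipline, not the real implementation */
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
    atomic_inc(&io_end->count);     /* one more holder, e.g. a bio */
    return io_end;
}

int ext4_put_io_end(ext4_io_end_t *io_end)
{
    if (atomic_dec_and_test(&io_end->count))
        /* last reference: convert unwritten extents, then free */
        return ext4_end_io_finish(io_end);  /* hypothetical helper */
    return 0;
}
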
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 8643ff5bbeb7..51bc821ade90 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
                      0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+        __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+             struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7058975e3a55..451eb4045330 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
 {
     journal_t *journal;
 
+    might_sleep();
+
     trace_ext4_journal_start(sb, nblocks, _RET_IP_);
     if (sb->s_flags & MS_RDONLY)
         return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 {
     int err = 0;
 
+    might_sleep();
+
     if (ext4_handle_valid(handle)) {
         err = jbd2_journal_get_write_access(handle, bh);
         if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
     int err = 0;
 
+    might_sleep();
+
+    set_buffer_meta(bh);
+    set_buffer_prio(bh);
     if (ext4_handle_valid(handle)) {
         err = jbd2_journal_dirty_metadata(handle, bh);
         if (err) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4c216b1bf20c..c8c6885406db 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
  * block to complete the transaction.
  *
  * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
 	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-	 ? 27U : 8U)
+	 ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
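
One plausible reading of the new 20U figure, following the updated comment (this breakdown is an interpretation, not text from the patch): up to 5 tree levels plus the data block make 6 blocks, each of which may also touch its block bitmap and group descriptor, with the inode-resident root and the superblock accounting for the rest:

/* Illustrative credit arithmetic; all names here are invented */
enum {
    TREE_LEVELS    = 5,
    BLOCKS_TOUCHED = TREE_LEVELS + 1,   /* + the data block itself */
    PER_BLOCK_COST = 3,                 /* block + bitmap + group desc */
    SINGLEDATA     = BLOCKS_TOUCHED * PER_BLOCK_COST + 1 + 1, /* = 20 */
};
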
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
  */
-static inline void ext4_journal_callback_del(handle_t *handle,
-                         struct ext4_journal_cb_entry *jce)
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
+                         struct ext4_journal_cb_entry *jce)
 {
+    bool deleted;
     struct ext4_sb_info *sbi =
             EXT4_SB(handle->h_transaction->t_journal->j_private);
 
     spin_lock(&sbi->s_md_lock);
+    deleted = !list_empty(&jce->jce_list);
     list_del_init(&jce->jce_list);
     spin_unlock(&sbi->s_md_lock);
+    return deleted;
 }
 
 int
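
The boolean return lets racing contexts decide who owns the entry: only the caller that actually unlinked it from the list should free it. A sketch of the intended caller pattern (struct my_cb_entry is hypothetical and simply embeds the jce member):

/* Hypothetical wrapper embedding the journal callback entry */
struct my_cb_entry {
    struct ext4_journal_cb_entry jce;
    /* ... private payload ... */
};

static void cancel_pending_callback(handle_t *handle, struct my_cb_entry *e)
{
    if (ext4_journal_callback_try_del(handle, &e->jce))
        kfree(e);   /* we unlinked it; the commit callback won't run it */
    /* otherwise the commit callback still owns 'e' and frees it */
}
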
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c6d06dcef8b..107936db244e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-#define ext4_ext_dirty(handle, inode, path) \
-        __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
-                handle_t *handle, struct inode *inode,
-                struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+             struct inode *inode, struct ext4_ext_path *path)
 {
     int err;
     if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
     }
     depth = ext_depth(inode);
     ex = path[depth].p_ext;
+    eh = path[depth].p_hdr;
     if (unlikely(path[depth].p_hdr == NULL)) {
         EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
         return -EIO;
     }
 
     /* try to insert block into found extent and return */
-    if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
-        && ext4_can_extents_be_merged(inode, ex, newext)) {
-        ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
-              ext4_ext_is_uninitialized(newext),
-              ext4_ext_get_actual_len(newext),
-              le32_to_cpu(ex->ee_block),
-              ext4_ext_is_uninitialized(ex),
-              ext4_ext_get_actual_len(ex),
-              ext4_ext_pblock(ex));
-        err = ext4_ext_get_access(handle, inode, path + depth);
-        if (err)
-            return err;
+    if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
 
         /*
-         * ext4_can_extents_be_merged should have checked that either
-         * both extents are uninitialized, or both aren't. Thus we
-         * need to check only one of them here.
+         * Try to see whether we should rather test the extent on
+         * right from ex, or from the left of ex. This is because
+         * ext4_ext_find_extent() can return either extent on the
+         * left, or on the right from the searched position. This
+         * will make merging more effective.
          */
-        if (ext4_ext_is_uninitialized(ex))
-            uninitialized = 1;
-        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+        if (ex < EXT_LAST_EXTENT(eh) &&
+            (le32_to_cpu(ex->ee_block) +
+            ext4_ext_get_actual_len(ex) <
+            le32_to_cpu(newext->ee_block))) {
+            ex += 1;
+            goto prepend;
+        } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+               (le32_to_cpu(newext->ee_block) +
+               ext4_ext_get_actual_len(newext) <
+               le32_to_cpu(ex->ee_block)))
+            ex -= 1;
+
+        /* Try to append newex to the ex */
+        if (ext4_can_extents_be_merged(inode, ex, newext)) {
+            ext_debug("append [%d]%d block to %u:[%d]%d"
+                  "(from %llu)\n",
+                  ext4_ext_is_uninitialized(newext),
+                  ext4_ext_get_actual_len(newext),
+                  le32_to_cpu(ex->ee_block),
+                  ext4_ext_is_uninitialized(ex),
+                  ext4_ext_get_actual_len(ex),
+                  ext4_ext_pblock(ex));
+            err = ext4_ext_get_access(handle, inode,
+                          path + depth);
+            if (err)
+                return err;
+
+            /*
+             * ext4_can_extents_be_merged should have checked
+             * that either both extents are uninitialized, or
+             * both aren't. Thus we need to check only one of
+             * them here.
+             */
+            if (ext4_ext_is_uninitialized(ex))
+                uninitialized = 1;
+            ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                 + ext4_ext_get_actual_len(newext));
-        if (uninitialized)
-            ext4_ext_mark_uninitialized(ex);
-        eh = path[depth].p_hdr;
-        nearex = ex;
-        goto merge;
+            if (uninitialized)
+                ext4_ext_mark_uninitialized(ex);
+            eh = path[depth].p_hdr;
+            nearex = ex;
+            goto merge;
+        }
+
+prepend:
+        /* Try to prepend newex to the ex */
+        if (ext4_can_extents_be_merged(inode, newext, ex)) {
+            ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+                  "(from %llu)\n",
+                  le32_to_cpu(newext->ee_block),
+                  ext4_ext_is_uninitialized(newext),
+                  ext4_ext_get_actual_len(newext),
+                  le32_to_cpu(ex->ee_block),
+                  ext4_ext_is_uninitialized(ex),
+                  ext4_ext_get_actual_len(ex),
+                  ext4_ext_pblock(ex));
+            err = ext4_ext_get_access(handle, inode,
+                          path + depth);
+            if (err)
+                return err;
+
+            /*
+             * ext4_can_extents_be_merged should have checked
+             * that either both extents are uninitialized, or
+             * both aren't. Thus we need to check only one of
+             * them here.
+             */
+            if (ext4_ext_is_uninitialized(ex))
+                uninitialized = 1;
+            ex->ee_block = newext->ee_block;
+            ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+            ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+                + ext4_ext_get_actual_len(newext));
+            if (uninitialized)
+                ext4_ext_mark_uninitialized(ex);
+            eh = path[depth].p_hdr;
+            nearex = ex;
+            goto merge;
+        }
     }
 
     depth = ext_depth(inode);
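
To see what the new neighbor-selection step buys, here is the rule restated with plain integers in a self-contained sketch (the two-extent leaf and the block numbers are invented):

#include <stdio.h>

struct ext { unsigned start, len; };   /* extent covers [start, start+len) */

int main(void)
{
    struct ext leaf[2] = { { 0, 8 }, { 20, 10 } };  /* gap at blocks 8..19 */
    struct ext newext = { 10, 5 };                  /* lands in the gap */
    int found = 0;          /* the search returned leaf[0], the left extent */

    /* found extent ends before newext starts: try the right neighbor */
    if (found < 1 && leaf[found].start + leaf[found].len < newext.start)
        found++;            /* then attempt to prepend newext to it */
    else if (found > 0 && newext.start + newext.len < leaf[found].start)
        found--;            /* or attempt to append to the left neighbor */

    /* Here newext ends at 15 < 20, so no actual merge would follow. */
    printf("try merging with extent %d starting at %u\n",
           found, leaf[found].start);
    return 0;
}
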
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
      * There is no free space in the found leaf.
      * We're gonna add a new leaf in the tree.
      */
-    if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-        flags = EXT4_MB_USE_ROOT_BLOCKS;
+    if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+        flags = EXT4_MB_USE_RESERVED;
     err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
     if (err)
         goto cleanup;
@@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
     return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-                 ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+              ext4_lblk_t end)
 {
     struct super_block *sb = inode->i_sb;
     int depth = ext_depth(inode);
@@ -2667,12 +2726,14 @@ again:
 
         /*
          * Split the extent in two so that 'end' is the last
-         * block in the first new extent
+         * block in the first new extent. Also we should not
+         * fail removing space due to ENOSPC so try to use
+         * reserved block if that happens.
          */
         err = ext4_split_extent_at(handle, inode, path,
                 end + 1, split_flag,
                 EXT4_GET_BLOCKS_PRE_IO |
-                EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+                EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
         if (err < 0)
             goto out;
@@ -3147,35 +3208,35 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                        struct inode *inode,
                        struct ext4_map_blocks *map,
-                       struct ext4_ext_path *path)
+                       struct ext4_ext_path *path,
+                       int flags)
 {
     struct ext4_sb_info *sbi;
     struct ext4_extent_header *eh;
     struct ext4_map_blocks split_map;
     struct ext4_extent zero_ex;
-    struct ext4_extent *ex;
+    struct ext4_extent *ex, *abut_ex;
     ext4_lblk_t ee_block, eof_block;
-    unsigned int ee_len, depth;
-    int allocated, max_zeroout = 0;
+    unsigned int ee_len, depth, map_len = map->m_len;
+    int allocated = 0, max_zeroout = 0;
     int err = 0;
     int split_flag = 0;
 
     ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
         "block %llu, max_blocks %u\n", inode->i_ino,
-        (unsigned long long)map->m_lblk, map->m_len);
+        (unsigned long long)map->m_lblk, map_len);
 
     sbi = EXT4_SB(inode->i_sb);
     eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
         inode->i_sb->s_blocksize_bits;
-    if (eof_block < map->m_lblk + map->m_len)
-        eof_block = map->m_lblk + map->m_len;
+    if (eof_block < map->m_lblk + map_len)
+        eof_block = map->m_lblk + map_len;
 
     depth = ext_depth(inode);
     eh = path[depth].p_hdr;
     ex = path[depth].p_ext;
     ee_block = le32_to_cpu(ex->ee_block);
     ee_len = ext4_ext_get_actual_len(ex);
-    allocated = ee_len - (map->m_lblk - ee_block);
     zero_ex.ee_len = 0;
 
     trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
     /*
      * Attempt to transfer newly initialized blocks from the currently
-     * uninitialized extent to its left neighbor. This is much cheaper
+     * uninitialized extent to its neighbor. This is much cheaper
      * than an insertion followed by a merge as those involve costly
-     * memmove() calls. This is the common case in steady state for
-     * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-     * writes.
+     * memmove() calls. Transferring to the left is the common case in
+     * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+     * followed by append writes.
      *
      * Limitations of the current logic:
-     * - L1: we only deal with writes at the start of the extent.
-     *   The approach could be extended to writes at the end
-     *   of the extent but this scenario was deemed less common.
-     * - L2: we do not deal with writes covering the whole extent.
+     * - L1: we do not deal with writes covering the whole extent.
      *   This would require removing the extent if the transfer
      *   is possible.
-     * - L3: we only attempt to merge with an extent stored in the
+     * - L2: we only attempt to merge with an extent stored in the
      *   same extent tree node.
      */
-    if ((map->m_lblk == ee_block) &&	/*L1*/
-        (map->m_len < ee_len) &&	/*L2*/
-        (ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
-        struct ext4_extent *prev_ex;
+    if ((map->m_lblk == ee_block) &&
+        /* See if we can merge left */
+        (map_len < ee_len) &&		/*L1*/
+        (ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
         ext4_lblk_t prev_lblk;
         ext4_fsblk_t prev_pblk, ee_pblk;
-        unsigned int prev_len, write_len;
+        unsigned int prev_len;
 
-        prev_ex = ex - 1;
-        prev_lblk = le32_to_cpu(prev_ex->ee_block);
-        prev_len = ext4_ext_get_actual_len(prev_ex);
-        prev_pblk = ext4_ext_pblock(prev_ex);
+        abut_ex = ex - 1;
+        prev_lblk = le32_to_cpu(abut_ex->ee_block);
+        prev_len = ext4_ext_get_actual_len(abut_ex);
+        prev_pblk = ext4_ext_pblock(abut_ex);
         ee_pblk = ext4_ext_pblock(ex);
-        write_len = map->m_len;
 
         /*
-         * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+         * A transfer of blocks from 'ex' to 'abut_ex' is allowed
          * upon those conditions:
-         * - C1: prev_ex is initialized,
-         * - C2: prev_ex is logically abutting ex,
-         * - C3: prev_ex is physically abutting ex,
-         * - C4: prev_ex can receive the additional blocks without
+         * - C1: abut_ex is initialized,
+         * - C2: abut_ex is logically abutting ex,
+         * - C3: abut_ex is physically abutting ex,
+         * - C4: abut_ex can receive the additional blocks without
          *   overflowing the (initialized) length limit.
          */
-        if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+        if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
             ((prev_lblk + prev_len) == ee_block) &&		/*C2*/
             ((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
-            (prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+            (prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
             err = ext4_ext_get_access(handle, inode, path + depth);
             if (err)
                 goto out;
 
             trace_ext4_ext_convert_to_initialized_fastpath(inode,
-                map, ex, prev_ex);
+                map, ex, abut_ex);
 
-            /* Shift the start of ex by 'write_len' blocks */
-            ex->ee_block = cpu_to_le32(ee_block + write_len);
-            ext4_ext_store_pblock(ex, ee_pblk + write_len);
-            ex->ee_len = cpu_to_le16(ee_len - write_len);
+            /* Shift the start of ex by 'map_len' blocks */
+            ex->ee_block = cpu_to_le32(ee_block + map_len);
+            ext4_ext_store_pblock(ex, ee_pblk + map_len);
+            ex->ee_len = cpu_to_le16(ee_len - map_len);
             ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-            /* Extend prev_ex by 'write_len' blocks */
-            prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+            /* Extend abut_ex by 'map_len' blocks */
+            abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-            /* Mark the block containing both extents as dirty */
-            ext4_ext_dirty(handle, inode, path + depth);
+            /* Result: number of initialized blocks past m_lblk */
+            allocated = map_len;
+        }
+    } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+           (map_len < ee_len) &&	/*L1*/
+           ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
+        /* See if we can merge right */
+        ext4_lblk_t next_lblk;
+        ext4_fsblk_t next_pblk, ee_pblk;
+        unsigned int next_len;
+
+        abut_ex = ex + 1;
+        next_lblk = le32_to_cpu(abut_ex->ee_block);
+        next_len = ext4_ext_get_actual_len(abut_ex);
+        next_pblk = ext4_ext_pblock(abut_ex);
+        ee_pblk = ext4_ext_pblock(ex);
 
-            /* Update path to point to the right extent */
-            path[depth].p_ext = prev_ex;
+        /*
+         * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+         * upon those conditions:
+         * - C1: abut_ex is initialized,
+         * - C2: abut_ex is logically abutting ex,
+         * - C3: abut_ex is physically abutting ex,
+         * - C4: abut_ex can receive the additional blocks without
+         *   overflowing the (initialized) length limit.
+         */
+        if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+            ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
+            ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
+            (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+            err = ext4_ext_get_access(handle, inode, path + depth);
+            if (err)
+                goto out;
+
+            trace_ext4_ext_convert_to_initialized_fastpath(inode,
+                map, ex, abut_ex);
+
+            /* Shift the start of abut_ex by 'map_len' blocks */
+            abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+            ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+            ex->ee_len = cpu_to_le16(ee_len - map_len);
+            ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+            /* Extend abut_ex by 'map_len' blocks */
+            abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
             /* Result: number of initialized blocks past m_lblk */
-            allocated = write_len;
-            goto out;
+            allocated = map_len;
         }
     }
+    if (allocated) {
+        /* Mark the block containing both extents as dirty */
+        ext4_ext_dirty(handle, inode, path + depth);
+
+        /* Update path to point to the right extent */
+        path[depth].p_ext = abut_ex;
+        goto out;
+    } else
+        allocated = ee_len - (map->m_lblk - ee_block);
 
     WARN_ON(map->m_lblk < ee_block);
     /*
@@ -3330,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
     }
 
     allocated = ext4_split_extent(handle, inode, path,
-                      &split_map, split_flag, 0);
+                      &split_map, split_flag, flags);
     if (allocated < 0)
         err = allocated;
 
@@ -3650,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
           flags, allocated);
     ext4_ext_show_leaf(inode, path);
 
+    /*
+     * When writing into uninitialized space, we should not fail to
+     * allocate metadata blocks for the new extent block if needed.
+     */
+    flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
     trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
                             allocated, newblock);
 
@@ -3713,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
     }
 
     /* buffered write, writepage time, convert*/
-    ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+    ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
     if (ret >= 0)
         ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -4257,48 +4368,13 @@ out3: | |||
4257 | return err ? err : allocated; | 4368 | return err ? err : allocated; |
4258 | } | 4369 | } |
4259 | 4370 | ||
4260 | void ext4_ext_truncate(struct inode *inode) | 4371 | void ext4_ext_truncate(handle_t *handle, struct inode *inode) |
4261 | { | 4372 | { |
4262 | struct address_space *mapping = inode->i_mapping; | ||
4263 | struct super_block *sb = inode->i_sb; | 4373 | struct super_block *sb = inode->i_sb; |
4264 | ext4_lblk_t last_block; | 4374 | ext4_lblk_t last_block; |
4265 | handle_t *handle; | ||
4266 | loff_t page_len; | ||
4267 | int err = 0; | 4375 | int err = 0; |
4268 | 4376 | ||
4269 | /* | 4377 | /* |
4270 | * finish any pending end_io work so we won't run the risk of | ||
4271 | * converting any truncated blocks to initialized later | ||
4272 | */ | ||
4273 | ext4_flush_unwritten_io(inode); | ||
4274 | |||
4275 | /* | ||
4276 | * probably first extent we're gonna free will be last in block | ||
4277 | */ | ||
4278 | err = ext4_writepage_trans_blocks(inode); | ||
4279 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); | ||
4280 | if (IS_ERR(handle)) | ||
4281 | return; | ||
4282 | |||
4283 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
4284 | page_len = PAGE_CACHE_SIZE - | ||
4285 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
4286 | |||
4287 | err = ext4_discard_partial_page_buffers(handle, | ||
4288 | mapping, inode->i_size, page_len, 0); | ||
4289 | |||
4290 | if (err) | ||
4291 | goto out_stop; | ||
4292 | } | ||
4293 | |||
4294 | if (ext4_orphan_add(handle, inode)) | ||
4295 | goto out_stop; | ||
4296 | |||
4297 | down_write(&EXT4_I(inode)->i_data_sem); | ||
4298 | |||
4299 | ext4_discard_preallocations(inode); | ||
4300 | |||
4301 | /* | ||
4302 | * TODO: optimization is possible here. | 4378 | * TODO: optimization is possible here. |
4303 | * Probably we need not scan at all, | 4379 | * Probably we need not scan at all, |
4304 | * because page truncation is enough. | 4380 | * because page truncation is enough. |
@@ -4313,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode) | |||
4313 | err = ext4_es_remove_extent(inode, last_block, | 4389 | err = ext4_es_remove_extent(inode, last_block, |
4314 | EXT_MAX_BLOCKS - last_block); | 4390 | EXT_MAX_BLOCKS - last_block); |
4315 | err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); | 4391 | err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); |
4316 | |||
4317 | /* In a multi-transaction truncate, we only make the final | ||
4318 | * transaction synchronous. | ||
4319 | */ | ||
4320 | if (IS_SYNC(inode)) | ||
4321 | ext4_handle_sync(handle); | ||
4322 | |||
4323 | up_write(&EXT4_I(inode)->i_data_sem); | ||
4324 | |||
4325 | out_stop: | ||
4326 | /* | ||
4327 | * If this was a simple ftruncate() and the file will remain alive, | ||
4328 | * then we need to clear up the orphan record which we created above. | ||
4329 | * However, if this was a real unlink then we were called by | ||
4330 | * ext4_delete_inode(), and we allow that function to clean up the | ||
4331 | * orphan info for us. | ||
4332 | */ | ||
4333 | if (inode->i_nlink) | ||
4334 | ext4_orphan_del(handle, inode); | ||
4335 | |||
4336 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4337 | ext4_mark_inode_dirty(handle, inode); | ||
4338 | ext4_journal_stop(handle); | ||
4339 | } | 4392 | } |
4340 | 4393 | ||
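With the handle_t parameter added, ext4_ext_truncate() no longer starts or stops the transaction, manages the orphan list, or takes i_data_sem itself; that bookkeeping moves to a shared caller so the extent and indirect truncate paths use one setup and teardown sequence. Roughly (a sketch with stubbed types; the helper split shown here is illustrative, not the kernel's exact code):

    struct inode;
    typedef struct journal_handle handle_t;

    void ext4_ext_truncate(handle_t *handle, struct inode *inode);
    void ext4_ind_truncate(handle_t *handle, struct inode *inode);
    int  uses_extents(struct inode *inode);  /* stand-in for the flag test */

    static void truncate_blocks(handle_t *handle, struct inode *inode)
    {
        /* By this point the caller has started the handle, handled the
         * partial tail page, added the inode to the orphan list and
         * taken i_data_sem. */
        if (uses_extents(inode))
            ext4_ext_truncate(handle, inode);
        else
            ext4_ind_truncate(handle, inode);
        /* Afterwards it drops i_data_sem, syncs the handle if IS_SYNC,
         * removes the orphan record and stops the handle. */
    }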
4341 | static void ext4_falloc_update_inode(struct inode *inode, | 4394 | static void ext4_falloc_update_inode(struct inode *inode, |
@@ -4623,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode, | |||
4623 | return (error < 0 ? error : 0); | 4676 | return (error < 0 ? error : 0); |
4624 | } | 4677 | } |
4625 | 4678 | ||
4626 | /* | ||
4627 | * ext4_ext_punch_hole | ||
4628 | * | ||
4629 | * Punches a hole of "length" bytes in a file starting | ||
4630 | * at byte "offset" | ||
4631 | * | ||
4632 | * @inode: The inode of the file to punch a hole in | ||
4633 | * @offset: The starting byte offset of the hole | ||
4634 | * @length: The length of the hole | ||
4635 | * | ||
4636 | * Returns the number of blocks removed or negative on err | ||
4637 | */ | ||
4638 | int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | ||
4639 | { | ||
4640 | struct inode *inode = file_inode(file); | ||
4641 | struct super_block *sb = inode->i_sb; | ||
4642 | ext4_lblk_t first_block, stop_block; | ||
4643 | struct address_space *mapping = inode->i_mapping; | ||
4644 | handle_t *handle; | ||
4645 | loff_t first_page, last_page, page_len; | ||
4646 | loff_t first_page_offset, last_page_offset; | ||
4647 | int credits, err = 0; | ||
4648 | |||
4649 | /* | ||
4650 | * Write out all dirty pages to avoid race conditions | ||
4651 | * Then release them. | ||
4652 | */ | ||
4653 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
4654 | err = filemap_write_and_wait_range(mapping, | ||
4655 | offset, offset + length - 1); | ||
4656 | |||
4657 | if (err) | ||
4658 | return err; | ||
4659 | } | ||
4660 | |||
4661 | mutex_lock(&inode->i_mutex); | ||
4662 | /* It's not possible to punch a hole in an append-only file */ | ||
4663 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { | ||
4664 | err = -EPERM; | ||
4665 | goto out_mutex; | ||
4666 | } | ||
4667 | if (IS_SWAPFILE(inode)) { | ||
4668 | err = -ETXTBSY; | ||
4669 | goto out_mutex; | ||
4670 | } | ||
4671 | |||
4672 | /* No need to punch hole beyond i_size */ | ||
4673 | if (offset >= inode->i_size) | ||
4674 | goto out_mutex; | ||
4675 | |||
4676 | /* | ||
4677 | * If the hole extends beyond i_size, set the hole | ||
4678 | * to end after the page that contains i_size | ||
4679 | */ | ||
4680 | if (offset + length > inode->i_size) { | ||
4681 | length = inode->i_size + | ||
4682 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | ||
4683 | offset; | ||
4684 | } | ||
4685 | |||
4686 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
4687 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | ||
4688 | |||
4689 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | ||
4690 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | ||
4691 | |||
4692 | /* Now release the pages */ | ||
4693 | if (last_page_offset > first_page_offset) { | ||
4694 | truncate_pagecache_range(inode, first_page_offset, | ||
4695 | last_page_offset - 1); | ||
4696 | } | ||
4697 | |||
4698 | /* Wait all existing dio workers, newcomers will block on i_mutex */ | ||
4699 | ext4_inode_block_unlocked_dio(inode); | ||
4700 | err = ext4_flush_unwritten_io(inode); | ||
4701 | if (err) | ||
4702 | goto out_dio; | ||
4703 | inode_dio_wait(inode); | ||
4704 | |||
4705 | credits = ext4_writepage_trans_blocks(inode); | ||
4706 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||
4707 | if (IS_ERR(handle)) { | ||
4708 | err = PTR_ERR(handle); | ||
4709 | goto out_dio; | ||
4710 | } | ||
4711 | |||
4712 | |||
4713 | /* | ||
4714 | * Now we need to zero out the non-page-aligned data in the | ||
4715 | * pages at the start and tail of the hole, and unmap the buffer | ||
4716 | * heads for the block aligned regions of the page that were | ||
4717 | * completely zeroed. | ||
4718 | */ | ||
4719 | if (first_page > last_page) { | ||
4720 | /* | ||
4721 | * If the file space being truncated is contained within a page | ||
4722 | * just zero out and unmap the middle of that page | ||
4723 | */ | ||
4724 | err = ext4_discard_partial_page_buffers(handle, | ||
4725 | mapping, offset, length, 0); | ||
4726 | |||
4727 | if (err) | ||
4728 | goto out; | ||
4729 | } else { | ||
4730 | /* | ||
4731 | * zero out and unmap the partial page that contains | ||
4732 | * the start of the hole | ||
4733 | */ | ||
4734 | page_len = first_page_offset - offset; | ||
4735 | if (page_len > 0) { | ||
4736 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
4737 | offset, page_len, 0); | ||
4738 | if (err) | ||
4739 | goto out; | ||
4740 | } | ||
4741 | |||
4742 | /* | ||
4743 | * zero out and unmap the partial page that contains | ||
4744 | * the end of the hole | ||
4745 | */ | ||
4746 | page_len = offset + length - last_page_offset; | ||
4747 | if (page_len > 0) { | ||
4748 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
4749 | last_page_offset, page_len, 0); | ||
4750 | if (err) | ||
4751 | goto out; | ||
4752 | } | ||
4753 | } | ||
4754 | |||
4755 | /* | ||
4756 | * If i_size is contained in the last page, we need to | ||
4757 | * unmap and zero the partial page after i_size | ||
4758 | */ | ||
4759 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
4760 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
4761 | |||
4762 | page_len = PAGE_CACHE_SIZE - | ||
4763 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
4764 | |||
4765 | if (page_len > 0) { | ||
4766 | err = ext4_discard_partial_page_buffers(handle, | ||
4767 | mapping, inode->i_size, page_len, 0); | ||
4768 | |||
4769 | if (err) | ||
4770 | goto out; | ||
4771 | } | ||
4772 | } | ||
4773 | |||
4774 | first_block = (offset + sb->s_blocksize - 1) >> | ||
4775 | EXT4_BLOCK_SIZE_BITS(sb); | ||
4776 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
4777 | |||
4778 | /* If there are no blocks to remove, return now */ | ||
4779 | if (first_block >= stop_block) | ||
4780 | goto out; | ||
4781 | |||
4782 | down_write(&EXT4_I(inode)->i_data_sem); | ||
4783 | ext4_discard_preallocations(inode); | ||
4784 | |||
4785 | err = ext4_es_remove_extent(inode, first_block, | ||
4786 | stop_block - first_block); | ||
4787 | err = ext4_ext_remove_space(inode, first_block, stop_block - 1); | ||
4788 | |||
4789 | ext4_discard_preallocations(inode); | ||
4790 | |||
4791 | if (IS_SYNC(inode)) | ||
4792 | ext4_handle_sync(handle); | ||
4793 | |||
4794 | up_write(&EXT4_I(inode)->i_data_sem); | ||
4795 | |||
4796 | out: | ||
4797 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4798 | ext4_mark_inode_dirty(handle, inode); | ||
4799 | ext4_journal_stop(handle); | ||
4800 | out_dio: | ||
4801 | ext4_inode_resume_unlocked_dio(inode); | ||
4802 | out_mutex: | ||
4803 | mutex_unlock(&inode->i_mutex); | ||
4804 | return err; | ||
4805 | } | ||
4806 | |||
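The extents-specific punch-hole implementation is removed in favour of a common one, but its core block-range arithmetic carries over: the first block to free is rounded up from offset, the stop block is rounded down from offset + length, and a hole contained within a single block frees nothing. A standalone rendering of that computation:

    #include <stdint.h>
    #include <stdio.h>

    /* Convert a byte range to the whole blocks it covers, as the
     * punch-hole code does: round the start up, the end down. */
    static void hole_to_blocks(uint64_t offset, uint64_t length,
                               unsigned blocksize_bits)
    {
        uint64_t bs = 1ULL << blocksize_bits;
        uint64_t first = (offset + bs - 1) >> blocksize_bits;
        uint64_t stop  = (offset + length) >> blocksize_bits;

        if (first >= stop)
            printf("no whole block inside the hole; nothing to free\n");
        else
            printf("free blocks [%llu, %llu)\n",
                   (unsigned long long)first, (unsigned long long)stop);
    }

    int main(void)
    {
        hole_to_blocks(1000, 10000, 12);   /* 4 KiB blocks */
        hole_to_blocks(100, 200, 12);      /* hole inside one block */
        return 0;
    }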
4807 | int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 4679 | int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
4808 | __u64 start, __u64 len) | 4680 | __u64 start, __u64 len) |
4809 | { | 4681 | { |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..e0ba8a408def 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
166 | if (journal->j_flags & JBD2_BARRIER && | 166 | if (journal->j_flags & JBD2_BARRIER && |
167 | !jbd2_trans_will_send_data_barrier(journal, commit_tid)) | 167 | !jbd2_trans_will_send_data_barrier(journal, commit_tid)) |
168 | needs_barrier = true; | 168 | needs_barrier = true; |
169 | jbd2_log_start_commit(journal, commit_tid); | 169 | ret = jbd2_complete_transaction(journal, commit_tid); |
170 | ret = jbd2_log_wait_commit(journal, commit_tid); | ||
171 | if (needs_barrier) { | 170 | if (needs_barrier) { |
172 | err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); | 171 | err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); |
173 | if (!ret) | 172 | if (!ret) |
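jbd2_complete_transaction() collapses the start-commit/wait-commit pair and, unlike an unconditional jbd2_log_start_commit(), does nothing when the tid has already committed, sparing fsync() an unneeded commit cycle. A toy model of that semantics (not jbd2's real state machine):

    #include <stdbool.h>

    /* Toy journal: tids at or below 'committed' are already durable. */
    struct toy_journal {
        unsigned long committed;       /* highest committed tid */
        bool commit_requested;
    };

    static void toy_complete_transaction(struct toy_journal *j,
                                         unsigned long tid)
    {
        if (tid <= j->committed)
            return;                    /* already on disk: no new commit */
        j->commit_requested = true;    /* the jbd2_log_start_commit() part */
        /* ... block until the commit thread advances j->committed past
         * tid: the jbd2_log_wait_commit() part ... */
        j->committed = tid;            /* stand-in for that wakeup */
    }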
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6c5bb8d993fe..00a818d67b54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
166 | trace_ext4_load_inode_bitmap(sb, block_group); | 166 | trace_ext4_load_inode_bitmap(sb, block_group); |
167 | bh->b_end_io = ext4_end_bitmap_read; | 167 | bh->b_end_io = ext4_end_bitmap_read; |
168 | get_bh(bh); | 168 | get_bh(bh); |
169 | submit_bh(READ, bh); | 169 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
170 | wait_on_buffer(bh); | 170 | wait_on_buffer(bh); |
171 | if (!buffer_uptodate(bh)) { | 171 | if (!buffer_uptodate(bh)) { |
172 | put_bh(bh); | 172 | put_bh(bh); |
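Tagging the inode-bitmap read with REQ_META | REQ_PRIO lets the block layer and I/O scheduler treat metadata reads preferentially; writes get the same promotion from per-buffer state folded into the request flags at submission time. A sketch of that flag folding, with invented bit values standing in for the kernel's:

    #include <stdbool.h>

    #define TOY_REQ_META 0x1   /* assumed values, for illustration only */
    #define TOY_REQ_PRIO 0x2

    struct toy_bh { bool meta; bool prio; };

    /* Fold per-buffer state into the request flags just before
     * submission, the counterpart of passing READ | REQ_META | REQ_PRIO
     * explicitly at the call site. */
    static int toy_build_rw(int rw, const struct toy_bh *bh)
    {
        if (bh->meta)
            rw |= TOY_REQ_META;
        if (bh->prio)
            rw |= TOY_REQ_PRIO;
        return rw;
    }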
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, | |||
666 | ei = EXT4_I(inode); | 666 | ei = EXT4_I(inode); |
667 | sbi = EXT4_SB(sb); | 667 | sbi = EXT4_SB(sb); |
668 | 668 | ||
669 | /* | ||
670 | * Initialize owners and quota early so that we don't have to account | ||
671 | * for the quota-initialization worst case in the standard inode-creation | ||
672 | * transaction. | ||
673 | */ | ||
674 | if (owner) { | ||
675 | inode->i_mode = mode; | ||
676 | i_uid_write(inode, owner[0]); | ||
677 | i_gid_write(inode, owner[1]); | ||
678 | } else if (test_opt(sb, GRPID)) { | ||
679 | inode->i_mode = mode; | ||
680 | inode->i_uid = current_fsuid(); | ||
681 | inode->i_gid = dir->i_gid; | ||
682 | } else | ||
683 | inode_init_owner(inode, dir, mode); | ||
684 | dquot_initialize(inode); | ||
685 | |||
669 | if (!goal) | 686 | if (!goal) |
670 | goal = sbi->s_inode_goal; | 687 | goal = sbi->s_inode_goal; |
671 | 688 | ||
@@ -697,7 +714,7 @@ got_group: | |||
697 | 714 | ||
698 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); | 715 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); |
699 | if (!gdp) | 716 | if (!gdp) |
700 | goto fail; | 717 | goto out; |
701 | 718 | ||
702 | /* | 719 | /* |
703 | * Check free inodes count before loading bitmap. | 720 | * Check free inodes count before loading bitmap. |
@@ -711,7 +728,7 @@ got_group: | |||
711 | brelse(inode_bitmap_bh); | 728 | brelse(inode_bitmap_bh); |
712 | inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); | 729 | inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); |
713 | if (!inode_bitmap_bh) | 730 | if (!inode_bitmap_bh) |
714 | goto fail; | 731 | goto out; |
715 | 732 | ||
716 | repeat_in_this_group: | 733 | repeat_in_this_group: |
717 | ino = ext4_find_next_zero_bit((unsigned long *) | 734 | ino = ext4_find_next_zero_bit((unsigned long *) |
@@ -733,13 +750,16 @@ repeat_in_this_group: | |||
733 | handle_type, nblocks); | 750 | handle_type, nblocks); |
734 | if (IS_ERR(handle)) { | 751 | if (IS_ERR(handle)) { |
735 | err = PTR_ERR(handle); | 752 | err = PTR_ERR(handle); |
736 | goto fail; | 753 | ext4_std_error(sb, err); |
754 | goto out; | ||
737 | } | 755 | } |
738 | } | 756 | } |
739 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); | 757 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); |
740 | err = ext4_journal_get_write_access(handle, inode_bitmap_bh); | 758 | err = ext4_journal_get_write_access(handle, inode_bitmap_bh); |
741 | if (err) | 759 | if (err) { |
742 | goto fail; | 760 | ext4_std_error(sb, err); |
761 | goto out; | ||
762 | } | ||
743 | ext4_lock_group(sb, group); | 763 | ext4_lock_group(sb, group); |
744 | ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); | 764 | ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); |
745 | ext4_unlock_group(sb, group); | 765 | ext4_unlock_group(sb, group); |
@@ -755,8 +775,10 @@ repeat_in_this_group: | |||
755 | got: | 775 | got: |
756 | BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); | 776 | BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); |
757 | err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); | 777 | err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); |
758 | if (err) | 778 | if (err) { |
759 | goto fail; | 779 | ext4_std_error(sb, err); |
780 | goto out; | ||
781 | } | ||
760 | 782 | ||
761 | /* We may have to initialize the block bitmap if it isn't already */ | 783 | /* We may have to initialize the block bitmap if it isn't already */ |
762 | if (ext4_has_group_desc_csum(sb) && | 784 | if (ext4_has_group_desc_csum(sb) && |
@@ -768,7 +790,8 @@ got: | |||
768 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); | 790 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); |
769 | if (err) { | 791 | if (err) { |
770 | brelse(block_bitmap_bh); | 792 | brelse(block_bitmap_bh); |
771 | goto fail; | 793 | ext4_std_error(sb, err); |
794 | goto out; | ||
772 | } | 795 | } |
773 | 796 | ||
774 | BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); | 797 | BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); |
@@ -787,14 +810,18 @@ got: | |||
787 | ext4_unlock_group(sb, group); | 810 | ext4_unlock_group(sb, group); |
788 | brelse(block_bitmap_bh); | 811 | brelse(block_bitmap_bh); |
789 | 812 | ||
790 | if (err) | 813 | if (err) { |
791 | goto fail; | 814 | ext4_std_error(sb, err); |
815 | goto out; | ||
816 | } | ||
792 | } | 817 | } |
793 | 818 | ||
794 | BUFFER_TRACE(group_desc_bh, "get_write_access"); | 819 | BUFFER_TRACE(group_desc_bh, "get_write_access"); |
795 | err = ext4_journal_get_write_access(handle, group_desc_bh); | 820 | err = ext4_journal_get_write_access(handle, group_desc_bh); |
796 | if (err) | 821 | if (err) { |
797 | goto fail; | 822 | ext4_std_error(sb, err); |
823 | goto out; | ||
824 | } | ||
798 | 825 | ||
799 | /* Update the relevant bg descriptor fields */ | 826 | /* Update the relevant bg descriptor fields */ |
800 | if (ext4_has_group_desc_csum(sb)) { | 827 | if (ext4_has_group_desc_csum(sb)) { |
@@ -840,8 +867,10 @@ got: | |||
840 | 867 | ||
841 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); | 868 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); |
842 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); | 869 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); |
843 | if (err) | 870 | if (err) { |
844 | goto fail; | 871 | ext4_std_error(sb, err); |
872 | goto out; | ||
873 | } | ||
845 | 874 | ||
846 | percpu_counter_dec(&sbi->s_freeinodes_counter); | 875 | percpu_counter_dec(&sbi->s_freeinodes_counter); |
847 | if (S_ISDIR(mode)) | 876 | if (S_ISDIR(mode)) |
@@ -851,16 +880,6 @@ got: | |||
851 | flex_group = ext4_flex_group(sbi, group); | 880 | flex_group = ext4_flex_group(sbi, group); |
852 | atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); | 881 | atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); |
853 | } | 882 | } |
854 | if (owner) { | ||
855 | inode->i_mode = mode; | ||
856 | i_uid_write(inode, owner[0]); | ||
857 | i_gid_write(inode, owner[1]); | ||
858 | } else if (test_opt(sb, GRPID)) { | ||
859 | inode->i_mode = mode; | ||
860 | inode->i_uid = current_fsuid(); | ||
861 | inode->i_gid = dir->i_gid; | ||
862 | } else | ||
863 | inode_init_owner(inode, dir, mode); | ||
864 | 883 | ||
865 | inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); | 884 | inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); |
866 | /* This is the optimal IO size (for stat), not the fs block size */ | 885 | /* This is the optimal IO size (for stat), not the fs block size */ |
@@ -889,7 +908,9 @@ got: | |||
889 | * twice. | 908 | * twice. |
890 | */ | 909 | */ |
891 | err = -EIO; | 910 | err = -EIO; |
892 | goto fail; | 911 | ext4_error(sb, "failed to insert inode %lu: doubly allocated?", |
912 | inode->i_ino); | ||
913 | goto out; | ||
893 | } | 914 | } |
894 | spin_lock(&sbi->s_next_gen_lock); | 915 | spin_lock(&sbi->s_next_gen_lock); |
895 | inode->i_generation = sbi->s_next_generation++; | 916 | inode->i_generation = sbi->s_next_generation++; |
@@ -899,7 +920,6 @@ got: | |||
899 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 920 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
900 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | 921 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { |
901 | __u32 csum; | 922 | __u32 csum; |
902 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
903 | __le32 inum = cpu_to_le32(inode->i_ino); | 923 | __le32 inum = cpu_to_le32(inode->i_ino); |
904 | __le32 gen = cpu_to_le32(inode->i_generation); | 924 | __le32 gen = cpu_to_le32(inode->i_generation); |
905 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, | 925 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, |
@@ -918,7 +938,6 @@ got: | |||
918 | ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); | 938 | ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); |
919 | 939 | ||
920 | ret = inode; | 940 | ret = inode; |
921 | dquot_initialize(inode); | ||
922 | err = dquot_alloc_inode(inode); | 941 | err = dquot_alloc_inode(inode); |
923 | if (err) | 942 | if (err) |
924 | goto fail_drop; | 943 | goto fail_drop; |
@@ -952,24 +971,17 @@ got: | |||
952 | 971 | ||
953 | ext4_debug("allocating inode %lu\n", inode->i_ino); | 972 | ext4_debug("allocating inode %lu\n", inode->i_ino); |
954 | trace_ext4_allocate_inode(inode, dir, mode); | 973 | trace_ext4_allocate_inode(inode, dir, mode); |
955 | goto really_out; | ||
956 | fail: | ||
957 | ext4_std_error(sb, err); | ||
958 | out: | ||
959 | iput(inode); | ||
960 | ret = ERR_PTR(err); | ||
961 | really_out: | ||
962 | brelse(inode_bitmap_bh); | 974 | brelse(inode_bitmap_bh); |
963 | return ret; | 975 | return ret; |
964 | 976 | ||
965 | fail_free_drop: | 977 | fail_free_drop: |
966 | dquot_free_inode(inode); | 978 | dquot_free_inode(inode); |
967 | |||
968 | fail_drop: | 979 | fail_drop: |
969 | dquot_drop(inode); | ||
970 | inode->i_flags |= S_NOQUOTA; | ||
971 | clear_nlink(inode); | 980 | clear_nlink(inode); |
972 | unlock_new_inode(inode); | 981 | unlock_new_inode(inode); |
982 | out: | ||
983 | dquot_drop(inode); | ||
984 | inode->i_flags |= S_NOQUOTA; | ||
973 | iput(inode); | 985 | iput(inode); |
974 | brelse(inode_bitmap_bh); | 986 | brelse(inode_bitmap_bh); |
975 | return ERR_PTR(err); | 987 | return ERR_PTR(err); |
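The conversion above replaces the tangle of fail/out/really_out labels with one exit sequence: each failure site calls ext4_std_error() itself, and every path funnels through out:, which drops quota state and releases the inode and bitmap buffer exactly once. The shape, reduced to a standalone pattern:

    #include <stdio.h>

    static int do_step(int n)
    {
        return n == 3 ? -5 : 0;        /* pretend step 3 hits -EIO */
    }

    /* Report at the failure site, then funnel every failure through one
     * teardown label, mirroring the converted __ext4_new_inode(). */
    static int create_thing(void)
    {
        int err = 0;

        for (int step = 1; step <= 4; step++) {
            err = do_step(step);
            if (err) {
                /* ext4_std_error() analogue: log where it failed */
                fprintf(stderr, "step %d failed: %d\n", step, err);
                goto out;
            }
        }
        return 0;
    out:
        /* shared unwind: dquot_drop(), iput(), brelse() in the original */
        return err;
    }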
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a04183127ef0..98be6f697463 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -292,131 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | |||
292 | } | 292 | } |
293 | 293 | ||
294 | /** | 294 | /** |
295 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | ||
296 | * @handle: handle for this transaction | ||
297 | * @inode: inode which needs allocated blocks | ||
298 | * @iblock: the logical block to start allocated at | ||
299 | * @goal: preferred physical block of allocation | ||
300 | * @indirect_blks: the number of blocks need to allocate for indirect | ||
301 | * blocks | ||
302 | * @blks: number of desired blocks | ||
303 | * @new_blocks: on return it will store the new block numbers for | ||
304 | * the indirect blocks(if needed) and the first direct block, | ||
305 | * @err: on return it will store the error code | ||
306 | * | ||
307 | * This function will return the number of blocks allocated as | ||
308 | * requested by the passed-in parameters. | ||
309 | */ | ||
310 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
311 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
312 | int indirect_blks, int blks, | ||
313 | ext4_fsblk_t new_blocks[4], int *err) | ||
314 | { | ||
315 | struct ext4_allocation_request ar; | ||
316 | int target, i; | ||
317 | unsigned long count = 0, blk_allocated = 0; | ||
318 | int index = 0; | ||
319 | ext4_fsblk_t current_block = 0; | ||
320 | int ret = 0; | ||
321 | |||
322 | /* | ||
323 | * Here we try to allocate the requested multiple blocks at once, | ||
324 | * on a best-effort basis. | ||
325 | * To build a branch, we should allocate blocks for | ||
326 | * the indirect blocks(if not allocated yet), and at least | ||
327 | * the first direct block of this branch. That's the | ||
328 | * minimum number of blocks need to allocate(required) | ||
329 | */ | ||
330 | /* first we try to allocate the indirect blocks */ | ||
331 | target = indirect_blks; | ||
332 | while (target > 0) { | ||
333 | count = target; | ||
334 | /* allocating blocks for indirect blocks and direct blocks */ | ||
335 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
336 | 0, &count, err); | ||
337 | if (*err) | ||
338 | goto failed_out; | ||
339 | |||
340 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
341 | EXT4_ERROR_INODE(inode, | ||
342 | "current_block %llu + count %lu > %d!", | ||
343 | current_block, count, | ||
344 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
345 | *err = -EIO; | ||
346 | goto failed_out; | ||
347 | } | ||
348 | |||
349 | target -= count; | ||
350 | /* allocate blocks for indirect blocks */ | ||
351 | while (index < indirect_blks && count) { | ||
352 | new_blocks[index++] = current_block++; | ||
353 | count--; | ||
354 | } | ||
355 | if (count > 0) { | ||
356 | /* | ||
357 | * save the new block number | ||
358 | * for the first direct block | ||
359 | */ | ||
360 | new_blocks[index] = current_block; | ||
361 | WARN(1, KERN_INFO "%s returned more blocks than " | ||
362 | "requested\n", __func__); | ||
363 | break; | ||
364 | } | ||
365 | } | ||
366 | |||
367 | target = blks - count ; | ||
368 | blk_allocated = count; | ||
369 | if (!target) | ||
370 | goto allocated; | ||
371 | /* Now allocate data blocks */ | ||
372 | memset(&ar, 0, sizeof(ar)); | ||
373 | ar.inode = inode; | ||
374 | ar.goal = goal; | ||
375 | ar.len = target; | ||
376 | ar.logical = iblock; | ||
377 | if (S_ISREG(inode->i_mode)) | ||
378 | /* enable in-core preallocation only for regular files */ | ||
379 | ar.flags = EXT4_MB_HINT_DATA; | ||
380 | |||
381 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
382 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
383 | EXT4_ERROR_INODE(inode, | ||
384 | "current_block %llu + ar.len %d > %d!", | ||
385 | current_block, ar.len, | ||
386 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
387 | *err = -EIO; | ||
388 | goto failed_out; | ||
389 | } | ||
390 | |||
391 | if (*err && (target == blks)) { | ||
392 | /* | ||
393 | * if the allocation failed and we didn't allocate | ||
394 | * any blocks before | ||
395 | */ | ||
396 | goto failed_out; | ||
397 | } | ||
398 | if (!*err) { | ||
399 | if (target == blks) { | ||
400 | /* | ||
401 | * save the new block number | ||
402 | * for the first direct block | ||
403 | */ | ||
404 | new_blocks[index] = current_block; | ||
405 | } | ||
406 | blk_allocated += ar.len; | ||
407 | } | ||
408 | allocated: | ||
409 | /* total number of blocks allocated for direct blocks */ | ||
410 | ret = blk_allocated; | ||
411 | *err = 0; | ||
412 | return ret; | ||
413 | failed_out: | ||
414 | for (i = 0; i < index; i++) | ||
415 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | /** | ||
420 | * ext4_alloc_branch - allocate and set up a chain of blocks. | 295 | * ext4_alloc_branch - allocate and set up a chain of blocks. |
421 | * @handle: handle for this transaction | 296 | * @handle: handle for this transaction |
422 | * @inode: owner | 297 | * @inode: owner |
@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
448 | int *blks, ext4_fsblk_t goal, | 323 | int *blks, ext4_fsblk_t goal, |
449 | ext4_lblk_t *offsets, Indirect *branch) | 324 | ext4_lblk_t *offsets, Indirect *branch) |
450 | { | 325 | { |
451 | int blocksize = inode->i_sb->s_blocksize; | 326 | struct ext4_allocation_request ar; |
452 | int i, n = 0; | 327 | struct buffer_head * bh; |
453 | int err = 0; | 328 | ext4_fsblk_t b, new_blocks[4]; |
454 | struct buffer_head *bh; | 329 | __le32 *p; |
455 | int num; | 330 | int i, j, err, len = 1; |
456 | ext4_fsblk_t new_blocks[4]; | ||
457 | ext4_fsblk_t current_block; | ||
458 | |||
459 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
460 | *blks, new_blocks, &err); | ||
461 | if (err) | ||
462 | return err; | ||
463 | 331 | ||
464 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
465 | /* | 332 | /* |
466 | * metadata blocks and data blocks are allocated. | 333 | * Set up for the direct block allocation |
467 | */ | 334 | */ |
468 | for (n = 1; n <= indirect_blks; n++) { | 335 | memset(&ar, 0, sizeof(ar)); |
469 | /* | 336 | ar.inode = inode; |
470 | * Get buffer_head for parent block, zero it out | 337 | ar.len = *blks; |
471 | * and set the pointer to new one, then send | 338 | ar.logical = iblock; |
472 | * parent to disk. | 339 | if (S_ISREG(inode->i_mode)) |
473 | */ | 340 | ar.flags = EXT4_MB_HINT_DATA; |
474 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | 341 | |
342 | for (i = 0; i <= indirect_blks; i++) { | ||
343 | if (i == indirect_blks) { | ||
344 | ar.goal = goal; | ||
345 | new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); | ||
346 | } else | ||
347 | goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, | ||
348 | goal, 0, NULL, &err); | ||
349 | if (err) { | ||
350 | i--; | ||
351 | goto failed; | ||
352 | } | ||
353 | branch[i].key = cpu_to_le32(new_blocks[i]); | ||
354 | if (i == 0) | ||
355 | continue; | ||
356 | |||
357 | bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); | ||
475 | if (unlikely(!bh)) { | 358 | if (unlikely(!bh)) { |
476 | err = -ENOMEM; | 359 | err = -ENOMEM; |
477 | goto failed; | 360 | goto failed; |
478 | } | 361 | } |
479 | |||
480 | branch[n].bh = bh; | ||
481 | lock_buffer(bh); | 362 | lock_buffer(bh); |
482 | BUFFER_TRACE(bh, "call get_create_access"); | 363 | BUFFER_TRACE(bh, "call get_create_access"); |
483 | err = ext4_journal_get_create_access(handle, bh); | 364 | err = ext4_journal_get_create_access(handle, bh); |
484 | if (err) { | 365 | if (err) { |
485 | /* Don't brelse(bh) here; it's done in | ||
486 | * ext4_journal_forget() below */ | ||
487 | unlock_buffer(bh); | 366 | unlock_buffer(bh); |
488 | goto failed; | 367 | goto failed; |
489 | } | 368 | } |
490 | 369 | ||
491 | memset(bh->b_data, 0, blocksize); | 370 | memset(bh->b_data, 0, bh->b_size); |
492 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | 371 | p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; |
493 | branch[n].key = cpu_to_le32(new_blocks[n]); | 372 | b = new_blocks[i]; |
494 | *branch[n].p = branch[n].key; | 373 | |
495 | if (n == indirect_blks) { | 374 | if (i == indirect_blks) |
496 | current_block = new_blocks[n]; | 375 | len = ar.len; |
497 | /* | 376 | for (j = 0; j < len; j++) |
498 | * End of chain, update the last new metablock of | 377 | *p++ = cpu_to_le32(b++); |
499 | * the chain to point to the new allocated | 378 | |
500 | * data blocks numbers | ||
501 | */ | ||
502 | for (i = 1; i < num; i++) | ||
503 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
504 | } | ||
505 | BUFFER_TRACE(bh, "marking uptodate"); | 379 | BUFFER_TRACE(bh, "marking uptodate"); |
506 | set_buffer_uptodate(bh); | 380 | set_buffer_uptodate(bh); |
507 | unlock_buffer(bh); | 381 | unlock_buffer(bh); |
@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
511 | if (err) | 385 | if (err) |
512 | goto failed; | 386 | goto failed; |
513 | } | 387 | } |
514 | *blks = num; | 388 | *blks = ar.len; |
515 | return err; | 389 | return 0; |
516 | failed: | 390 | failed: |
517 | /* Allocation failed, free what we already allocated */ | 391 | for (; i >= 0; i--) { |
518 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | 392 | if (i != indirect_blks && branch[i].bh) |
519 | for (i = 1; i <= n ; i++) { | 393 | ext4_forget(handle, 1, inode, branch[i].bh, |
520 | /* | 394 | branch[i].bh->b_blocknr); |
521 | * branch[i].bh is newly allocated, so there is no | 395 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], |
522 | * need to revoke the block, which is why we don't | 396 | (i == indirect_blks) ? ar.len : 1, 0); |
523 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
524 | */ | ||
525 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
526 | EXT4_FREE_BLOCKS_FORGET); | ||
527 | } | 397 | } |
528 | for (i = n+1; i < indirect_blks; i++) | ||
529 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
530 | |||
531 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
532 | |||
533 | return err; | 398 | return err; |
534 | } | 399 | } |
535 | 400 | ||
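The old two-phase helper (ext4_alloc_blocks() plus a separate wiring loop) is folded into one pass: iterations 0 through indirect_blks - 1 each allocate a single metadata block, with each result seeding the goal for the next level; the final iteration allocates the run of data blocks through mballoc; and each new metadata block is zeroed and pointed at its child. On failure, i counts back down, forgetting journaled metadata buffers and freeing whatever was allocated. A compact model of the allocation order (block numbers fabricated):

    #include <stdio.h>

    static unsigned long alloc_near(unsigned long goal) { return goal + 1; }

    /* One metadata block per tree level, then a run of data blocks,
     * each allocation seeded by the previous one as the goal. */
    static void alloc_branch_model(int indirect_blks, int data_len)
    {
        unsigned long goal = 1000, blocks[4];

        for (int i = 0; i <= indirect_blks; i++) {
            blocks[i] = alloc_near(goal);   /* meta, or data on last pass */
            goal = blocks[i];
            printf("%s block(s) at %lu%s\n",
                   i == indirect_blks ? "data" : "meta", blocks[i],
                   i == indirect_blks && data_len > 1 ? " (first of run)" : "");
            /* parent blocks[i-1] would now be zeroed and pointed here */
        }
    }

    int main(void) { alloc_branch_model(2, 8); return 0; }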
@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
941 | * be able to restart the transaction at a convenient checkpoint to make | 806 | * be able to restart the transaction at a convenient checkpoint to make |
942 | * sure we don't overflow the journal. | 807 | * sure we don't overflow the journal. |
943 | * | 808 | * |
944 | * start_transaction gets us a new handle for a truncate transaction, | 809 | * Try to extend this transaction for the purposes of truncation. If |
945 | * and extend_transaction tries to extend the existing one a bit. If | ||
946 | * extend fails, we need to propagate the failure up and restart the | 810 | * extend fails, we need to propagate the failure up and restart the |
947 | * transaction in the top-level truncate loop. --sct | 811 | * transaction in the top-level truncate loop. --sct |
948 | */ | ||
949 | static handle_t *start_transaction(struct inode *inode) | ||
950 | { | ||
951 | handle_t *result; | ||
952 | |||
953 | result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, | ||
954 | ext4_blocks_for_truncate(inode)); | ||
955 | if (!IS_ERR(result)) | ||
956 | return result; | ||
957 | |||
958 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
959 | return result; | ||
960 | } | ||
961 | |||
962 | /* | ||
963 | * Try to extend this transaction for the purposes of truncation. | ||
964 | * | 812 | * |
965 | * Returns 0 if we managed to create more room. If we can't create more | 813 | * Returns 0 if we managed to create more room. If we can't create more |
966 | * room, and the transaction must be restarted we return 1. | 814 | * room, and the transaction must be restarted we return 1. |
@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
1353 | } | 1201 | } |
1354 | } | 1202 | } |
1355 | 1203 | ||
1356 | void ext4_ind_truncate(struct inode *inode) | 1204 | void ext4_ind_truncate(handle_t *handle, struct inode *inode) |
1357 | { | 1205 | { |
1358 | handle_t *handle; | ||
1359 | struct ext4_inode_info *ei = EXT4_I(inode); | 1206 | struct ext4_inode_info *ei = EXT4_I(inode); |
1360 | __le32 *i_data = ei->i_data; | 1207 | __le32 *i_data = ei->i_data; |
1361 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 1208 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
1362 | struct address_space *mapping = inode->i_mapping; | ||
1363 | ext4_lblk_t offsets[4]; | 1209 | ext4_lblk_t offsets[4]; |
1364 | Indirect chain[4]; | 1210 | Indirect chain[4]; |
1365 | Indirect *partial; | 1211 | Indirect *partial; |
1366 | __le32 nr = 0; | 1212 | __le32 nr = 0; |
1367 | int n = 0; | 1213 | int n = 0; |
1368 | ext4_lblk_t last_block, max_block; | 1214 | ext4_lblk_t last_block, max_block; |
1369 | loff_t page_len; | ||
1370 | unsigned blocksize = inode->i_sb->s_blocksize; | 1215 | unsigned blocksize = inode->i_sb->s_blocksize; |
1371 | int err; | ||
1372 | |||
1373 | handle = start_transaction(inode); | ||
1374 | if (IS_ERR(handle)) | ||
1375 | return; /* AKPM: return what? */ | ||
1376 | 1216 | ||
1377 | last_block = (inode->i_size + blocksize-1) | 1217 | last_block = (inode->i_size + blocksize-1) |
1378 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 1218 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
1379 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | 1219 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) |
1380 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 1220 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
1381 | 1221 | ||
1382 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
1383 | page_len = PAGE_CACHE_SIZE - | ||
1384 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
1385 | |||
1386 | err = ext4_discard_partial_page_buffers(handle, | ||
1387 | mapping, inode->i_size, page_len, 0); | ||
1388 | |||
1389 | if (err) | ||
1390 | goto out_stop; | ||
1391 | } | ||
1392 | |||
1393 | if (last_block != max_block) { | 1222 | if (last_block != max_block) { |
1394 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | 1223 | n = ext4_block_to_path(inode, last_block, offsets, NULL); |
1395 | if (n == 0) | 1224 | if (n == 0) |
1396 | goto out_stop; /* error */ | 1225 | return; |
1397 | } | 1226 | } |
1398 | 1227 | ||
1399 | /* | ||
1400 | * OK. This truncate is going to happen. We add the inode to the | ||
1401 | * orphan list, so that if this truncate spans multiple transactions, | ||
1402 | * and we crash, we will resume the truncate when the filesystem | ||
1403 | * recovers. It also marks the inode dirty, to catch the new size. | ||
1404 | * | ||
1405 | * Implication: the file must always be in a sane, consistent | ||
1406 | * truncatable state while each transaction commits. | ||
1407 | */ | ||
1408 | if (ext4_orphan_add(handle, inode)) | ||
1409 | goto out_stop; | ||
1410 | |||
1411 | /* | ||
1412 | * From here we block out all ext4_get_block() callers who want to | ||
1413 | * modify the block allocation tree. | ||
1414 | */ | ||
1415 | down_write(&ei->i_data_sem); | ||
1416 | |||
1417 | ext4_discard_preallocations(inode); | ||
1418 | ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); | 1228 | ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); |
1419 | 1229 | ||
1420 | /* | 1230 | /* |
@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode) | |||
1431 | * It is unnecessary to free any data blocks if last_block is | 1241 | * It is unnecessary to free any data blocks if last_block is |
1432 | * equal to the indirect block limit. | 1242 | * equal to the indirect block limit. |
1433 | */ | 1243 | */ |
1434 | goto out_unlock; | 1244 | return; |
1435 | } else if (n == 1) { /* direct blocks */ | 1245 | } else if (n == 1) { /* direct blocks */ |
1436 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 1246 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
1437 | i_data + EXT4_NDIR_BLOCKS); | 1247 | i_data + EXT4_NDIR_BLOCKS); |
@@ -1491,31 +1301,6 @@ do_indirects: | |||
1491 | case EXT4_TIND_BLOCK: | 1301 | case EXT4_TIND_BLOCK: |
1492 | ; | 1302 | ; |
1493 | } | 1303 | } |
1494 | |||
1495 | out_unlock: | ||
1496 | up_write(&ei->i_data_sem); | ||
1497 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1498 | ext4_mark_inode_dirty(handle, inode); | ||
1499 | |||
1500 | /* | ||
1501 | * In a multi-transaction truncate, we only make the final transaction | ||
1502 | * synchronous | ||
1503 | */ | ||
1504 | if (IS_SYNC(inode)) | ||
1505 | ext4_handle_sync(handle); | ||
1506 | out_stop: | ||
1507 | /* | ||
1508 | * If this was a simple ftruncate(), and the file will remain alive | ||
1509 | * then we need to clear up the orphan record which we created above. | ||
1510 | * However, if this was a real unlink then we were called by | ||
1511 | * ext4_delete_inode(), and we allow that function to clean up the | ||
1512 | * orphan info for us. | ||
1513 | */ | ||
1514 | if (inode->i_nlink) | ||
1515 | ext4_orphan_del(handle, inode); | ||
1516 | |||
1517 | ext4_journal_stop(handle); | ||
1518 | trace_ext4_truncate_exit(inode); | ||
1519 | } | 1304 | } |
1520 | 1305 | ||
1521 | static int free_hole_blocks(handle_t *handle, struct inode *inode, | 1306 | static int free_hole_blocks(handle_t *handle, struct inode *inode, |
@@ -1569,8 +1354,8 @@ err: | |||
1569 | return ret; | 1354 | return ret; |
1570 | } | 1355 | } |
1571 | 1356 | ||
1572 | static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, | 1357 | int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, |
1573 | ext4_lblk_t first, ext4_lblk_t stop) | 1358 | ext4_lblk_t first, ext4_lblk_t stop) |
1574 | { | 1359 | { |
1575 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 1360 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
1576 | int level, ret = 0; | 1361 | int level, ret = 0; |
@@ -1604,157 +1389,3 @@ err: | |||
1604 | return ret; | 1389 | return ret; |
1605 | } | 1390 | } |
1606 | 1391 | ||
1607 | int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) | ||
1608 | { | ||
1609 | struct inode *inode = file_inode(file); | ||
1610 | struct super_block *sb = inode->i_sb; | ||
1611 | ext4_lblk_t first_block, stop_block; | ||
1612 | struct address_space *mapping = inode->i_mapping; | ||
1613 | handle_t *handle = NULL; | ||
1614 | loff_t first_page, last_page, page_len; | ||
1615 | loff_t first_page_offset, last_page_offset; | ||
1616 | int err = 0; | ||
1617 | |||
1618 | /* | ||
1619 | * Write out all dirty pages to avoid race conditions | ||
1620 | * Then release them. | ||
1621 | */ | ||
1622 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
1623 | err = filemap_write_and_wait_range(mapping, | ||
1624 | offset, offset + length - 1); | ||
1625 | if (err) | ||
1626 | return err; | ||
1627 | } | ||
1628 | |||
1629 | mutex_lock(&inode->i_mutex); | ||
1630 | /* It's not possible to punch a hole in an append-only file */ | ||
1631 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { | ||
1632 | err = -EPERM; | ||
1633 | goto out_mutex; | ||
1634 | } | ||
1635 | if (IS_SWAPFILE(inode)) { | ||
1636 | err = -ETXTBSY; | ||
1637 | goto out_mutex; | ||
1638 | } | ||
1639 | |||
1640 | /* No need to punch hole beyond i_size */ | ||
1641 | if (offset >= inode->i_size) | ||
1642 | goto out_mutex; | ||
1643 | |||
1644 | /* | ||
1645 | * If the hole extends beyond i_size, set the hole | ||
1646 | * to end after the page that contains i_size | ||
1647 | */ | ||
1648 | if (offset + length > inode->i_size) { | ||
1649 | length = inode->i_size + | ||
1650 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | ||
1651 | offset; | ||
1652 | } | ||
1653 | |||
1654 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1655 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | ||
1656 | |||
1657 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | ||
1658 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | ||
1659 | |||
1660 | /* Now release the pages */ | ||
1661 | if (last_page_offset > first_page_offset) { | ||
1662 | truncate_pagecache_range(inode, first_page_offset, | ||
1663 | last_page_offset - 1); | ||
1664 | } | ||
1665 | |||
1666 | /* Wait for all existing dio workers; newcomers will block on i_mutex */ | ||
1667 | inode_dio_wait(inode); | ||
1668 | |||
1669 | handle = start_transaction(inode); | ||
1670 | if (IS_ERR(handle)) | ||
1671 | goto out_mutex; | ||
1672 | |||
1673 | /* | ||
1674 | * Now we need to zero out the non-page-aligned data in the | ||
1675 | * pages at the start and tail of the hole, and unmap the buffer | ||
1676 | * heads for the block aligned regions of the page that were | ||
1677 | * completely zeroed. | ||
1678 | */ | ||
1679 | if (first_page > last_page) { | ||
1680 | /* | ||
1681 | * If the file space being truncated is contained within a page | ||
1682 | * just zero out and unmap the middle of that page | ||
1683 | */ | ||
1684 | err = ext4_discard_partial_page_buffers(handle, | ||
1685 | mapping, offset, length, 0); | ||
1686 | if (err) | ||
1687 | goto out; | ||
1688 | } else { | ||
1689 | /* | ||
1690 | * Zero out and unmap the partial page that contains | ||
1691 | * the start of the hole | ||
1692 | */ | ||
1693 | page_len = first_page_offset - offset; | ||
1694 | if (page_len > 0) { | ||
1695 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
1696 | offset, page_len, 0); | ||
1697 | if (err) | ||
1698 | goto out; | ||
1699 | } | ||
1700 | |||
1701 | /* | ||
1702 | * Zero out and unmap the partial page that contains | ||
1703 | * the end of the hole | ||
1704 | */ | ||
1705 | page_len = offset + length - last_page_offset; | ||
1706 | if (page_len > 0) { | ||
1707 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
1708 | last_page_offset, page_len, 0); | ||
1709 | if (err) | ||
1710 | goto out; | ||
1711 | } | ||
1712 | } | ||
1713 | |||
1714 | /* | ||
1715 | * If i_size is contained in the last page, we need to | ||
1716 | * unmap and zero the partial page after i_size | ||
1717 | */ | ||
1718 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
1719 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
1720 | page_len = PAGE_CACHE_SIZE - | ||
1721 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
1722 | if (page_len > 0) { | ||
1723 | err = ext4_discard_partial_page_buffers(handle, | ||
1724 | mapping, inode->i_size, page_len, 0); | ||
1725 | if (err) | ||
1726 | goto out; | ||
1727 | } | ||
1728 | } | ||
1729 | |||
1730 | first_block = (offset + sb->s_blocksize - 1) >> | ||
1731 | EXT4_BLOCK_SIZE_BITS(sb); | ||
1732 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
1733 | |||
1734 | if (first_block >= stop_block) | ||
1735 | goto out; | ||
1736 | |||
1737 | down_write(&EXT4_I(inode)->i_data_sem); | ||
1738 | ext4_discard_preallocations(inode); | ||
1739 | |||
1740 | err = ext4_es_remove_extent(inode, first_block, | ||
1741 | stop_block - first_block); | ||
1742 | err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); | ||
1743 | |||
1744 | ext4_discard_preallocations(inode); | ||
1745 | |||
1746 | if (IS_SYNC(inode)) | ||
1747 | ext4_handle_sync(handle); | ||
1748 | |||
1749 | up_write(&EXT4_I(inode)->i_data_sem); | ||
1750 | |||
1751 | out: | ||
1752 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1753 | ext4_mark_inode_dirty(handle, inode); | ||
1754 | ext4_journal_stop(handle); | ||
1755 | |||
1756 | out_mutex: | ||
1757 | mutex_unlock(&inode->i_mutex); | ||
1758 | |||
1759 | return err; | ||
1760 | } | ||
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0fd1a123f7d..3e2bf873e8a8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
@@ -19,7 +19,8 @@ | |||
19 | 19 | ||
20 | #define EXT4_XATTR_SYSTEM_DATA "data" | 20 | #define EXT4_XATTR_SYSTEM_DATA "data" |
21 | #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) | 21 | #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) |
22 | #define EXT4_INLINE_DOTDOT_SIZE 4 | 22 | #define EXT4_INLINE_DOTDOT_OFFSET 2 |
23 | #define EXT4_INLINE_DOTDOT_SIZE 4 | ||
23 | 24 | ||
24 | int ext4_get_inline_size(struct inode *inode) | 25 | int ext4_get_inline_size(struct inode *inode) |
25 | { | 26 | { |
@@ -1289,6 +1290,120 @@ out: | |||
1289 | return ret; | 1290 | return ret; |
1290 | } | 1291 | } |
1291 | 1292 | ||
1293 | /* | ||
1294 | * This function fills a red-black tree with information from an | ||
1295 | * inlined dir. It returns the number of directory entries loaded | ||
1296 | * into the tree; on failure, an error code is returned instead. | ||
1297 | */ | ||
1298 | int htree_inlinedir_to_tree(struct file *dir_file, | ||
1299 | struct inode *dir, ext4_lblk_t block, | ||
1300 | struct dx_hash_info *hinfo, | ||
1301 | __u32 start_hash, __u32 start_minor_hash, | ||
1302 | int *has_inline_data) | ||
1303 | { | ||
1304 | int err = 0, count = 0; | ||
1305 | unsigned int parent_ino; | ||
1306 | int pos; | ||
1307 | struct ext4_dir_entry_2 *de; | ||
1308 | struct inode *inode = file_inode(dir_file); | ||
1309 | int ret, inline_size = 0; | ||
1310 | struct ext4_iloc iloc; | ||
1311 | void *dir_buf = NULL; | ||
1312 | struct ext4_dir_entry_2 fake; | ||
1313 | |||
1314 | ret = ext4_get_inode_loc(inode, &iloc); | ||
1315 | if (ret) | ||
1316 | return ret; | ||
1317 | |||
1318 | down_read(&EXT4_I(inode)->xattr_sem); | ||
1319 | if (!ext4_has_inline_data(inode)) { | ||
1320 | up_read(&EXT4_I(inode)->xattr_sem); | ||
1321 | *has_inline_data = 0; | ||
1322 | goto out; | ||
1323 | } | ||
1324 | |||
1325 | inline_size = ext4_get_inline_size(inode); | ||
1326 | dir_buf = kmalloc(inline_size, GFP_NOFS); | ||
1327 | if (!dir_buf) { | ||
1328 | ret = -ENOMEM; | ||
1329 | up_read(&EXT4_I(inode)->xattr_sem); | ||
1330 | goto out; | ||
1331 | } | ||
1332 | |||
1333 | ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); | ||
1334 | up_read(&EXT4_I(inode)->xattr_sem); | ||
1335 | if (ret < 0) | ||
1336 | goto out; | ||
1337 | |||
1338 | pos = 0; | ||
1339 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); | ||
1340 | while (pos < inline_size) { | ||
1341 | /* | ||
1342 | * As an inlined dir doesn't store any information about '.' and | ||
1343 | * only the inode number of '..' is stored, we have to handle | ||
1344 | * them differently. | ||
1345 | */ | ||
1346 | if (pos == 0) { | ||
1347 | fake.inode = cpu_to_le32(inode->i_ino); | ||
1348 | fake.name_len = 1; | ||
1349 | strcpy(fake.name, "."); | ||
1350 | fake.rec_len = ext4_rec_len_to_disk( | ||
1351 | EXT4_DIR_REC_LEN(fake.name_len), | ||
1352 | inline_size); | ||
1353 | ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); | ||
1354 | de = &fake; | ||
1355 | pos = EXT4_INLINE_DOTDOT_OFFSET; | ||
1356 | } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { | ||
1357 | fake.inode = cpu_to_le32(parent_ino); | ||
1358 | fake.name_len = 2; | ||
1359 | strcpy(fake.name, ".."); | ||
1360 | fake.rec_len = ext4_rec_len_to_disk( | ||
1361 | EXT4_DIR_REC_LEN(fake.name_len), | ||
1362 | inline_size); | ||
1363 | ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); | ||
1364 | de = &fake; | ||
1365 | pos = EXT4_INLINE_DOTDOT_SIZE; | ||
1366 | } else { | ||
1367 | de = (struct ext4_dir_entry_2 *)(dir_buf + pos); | ||
1368 | pos += ext4_rec_len_from_disk(de->rec_len, inline_size); | ||
1369 | if (ext4_check_dir_entry(inode, dir_file, de, | ||
1370 | iloc.bh, dir_buf, | ||
1371 | inline_size, pos)) { | ||
1372 | ret = count; | ||
1373 | goto out; | ||
1374 | } | ||
1375 | } | ||
1376 | |||
1377 | ext4fs_dirhash(de->name, de->name_len, hinfo); | ||
1378 | if ((hinfo->hash < start_hash) || | ||
1379 | ((hinfo->hash == start_hash) && | ||
1380 | (hinfo->minor_hash < start_minor_hash))) | ||
1381 | continue; | ||
1382 | if (de->inode == 0) | ||
1383 | continue; | ||
1384 | err = ext4_htree_store_dirent(dir_file, | ||
1385 | hinfo->hash, hinfo->minor_hash, de); | ||
1386 | if (err) { | ||
1387 | count = err; | ||
1388 | goto out; | ||
1389 | } | ||
1390 | count++; | ||
1391 | } | ||
1392 | ret = count; | ||
1393 | out: | ||
1394 | kfree(dir_buf); | ||
1395 | brelse(iloc.bh); | ||
1396 | return ret; | ||
1397 | } | ||
1398 | |||
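Because the inline format stores no '.' entry and only the parent inode number for '..', htree_inlinedir_to_tree() manufactures both as a stack-local fake entry before hashing. A minimal standalone sketch of that synthesis (struct reduced to the fields used; the rec_len value is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Reduced stand-in for ext4_dir_entry_2; the real struct also
     * carries a file-type byte and a longer name field. */
    struct toy_dirent {
        uint32_t inode;
        uint16_t rec_len;
        uint8_t  name_len;
        char     name[4];
    };

    /* Synthesize the '.' / '..' entries the inline format omits, as the
     * function's stack-local 'fake' does before hashing each name. */
    static struct toy_dirent make_fake(uint32_t ino, const char *name)
    {
        struct toy_dirent de = {
            .inode    = ino,
            .name_len = (uint8_t)strlen(name),
        };

        memcpy(de.name, name, de.name_len);
        de.rec_len = 12;    /* EXT4_DIR_REC_LEN(name_len) in the kernel */
        return de;
    }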
1399 | /* | ||
1400 | * This function is called when the volume is mkfsed with | ||
1401 | * dir_index disabled. In order to keep f_pos persistent | ||
1402 | * after we convert an inlined dir to a block-based one, | ||
1403 | * we just pretend that we are a normal dir and return offsets | ||
1404 | * as if the '.' and '..' entries were really present. | ||
1405 | * | ||
1406 | */ | ||
1292 | int ext4_read_inline_dir(struct file *filp, | 1407 | int ext4_read_inline_dir(struct file *filp, |
1293 | void *dirent, filldir_t filldir, | 1408 | void *dirent, filldir_t filldir, |
1294 | int *has_inline_data) | 1409 | int *has_inline_data) |
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp, | |||
1302 | int ret, inline_size = 0; | 1417 | int ret, inline_size = 0; |
1303 | struct ext4_iloc iloc; | 1418 | struct ext4_iloc iloc; |
1304 | void *dir_buf = NULL; | 1419 | void *dir_buf = NULL; |
1420 | int dotdot_offset, dotdot_size, extra_offset, extra_size; | ||
1305 | 1421 | ||
1306 | ret = ext4_get_inode_loc(inode, &iloc); | 1422 | ret = ext4_get_inode_loc(inode, &iloc); |
1307 | if (ret) | 1423 | if (ret) |
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp, | |||
1330 | sb = inode->i_sb; | 1446 | sb = inode->i_sb; |
1331 | stored = 0; | 1447 | stored = 0; |
1332 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); | 1448 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); |
1449 | offset = filp->f_pos; | ||
1333 | 1450 | ||
1334 | while (!error && !stored && filp->f_pos < inode->i_size) { | 1451 | /* |
1452 | * dotdot_offset and dotdot_size are the offset and size that | ||
1453 | * "." and ".." would have if the dir were block based, while | ||
1454 | * their real size inline is only EXT4_INLINE_DOTDOT_SIZE. | ||
1455 | * So we use extra_offset and extra_size to translate positions | ||
1456 | * during the inline dir iteration. | ||
1457 | */ | ||
1458 | dotdot_offset = EXT4_DIR_REC_LEN(1); | ||
1459 | dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); | ||
1460 | extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; | ||
1461 | extra_size = extra_offset + inline_size; | ||
1462 | |||
1463 | while (!error && !stored && filp->f_pos < extra_size) { | ||
1335 | revalidate: | 1464 | revalidate: |
1336 | /* | 1465 | /* |
1337 | * If the version has changed since the last call to | 1466 | * If the version has changed since the last call to |
@@ -1340,15 +1469,23 @@ revalidate: | |||
1340 | * dir to make sure. | 1469 | * dir to make sure. |
1341 | */ | 1470 | */ |
1342 | if (filp->f_version != inode->i_version) { | 1471 | if (filp->f_version != inode->i_version) { |
1343 | for (i = 0; | 1472 | for (i = 0; i < extra_size && i < offset;) { |
1344 | i < inode->i_size && i < offset;) { | 1473 | /* |
1474 | * "." is with offset 0 and | ||
1475 | * ".." is dotdot_offset. | ||
1476 | */ | ||
1345 | if (!i) { | 1477 | if (!i) { |
1346 | /* skip "." and ".." if needed. */ | 1478 | i = dotdot_offset; |
1347 | i += EXT4_INLINE_DOTDOT_SIZE; | 1479 | continue; |
1480 | } else if (i == dotdot_offset) { | ||
1481 | i = dotdot_size; | ||
1348 | continue; | 1482 | continue; |
1349 | } | 1483 | } |
1484 | /* for other entries, the real offset in | ||
1485 | * the buf has to be adjusted accordingly. | ||
1486 | */ | ||
1350 | de = (struct ext4_dir_entry_2 *) | 1487 | de = (struct ext4_dir_entry_2 *) |
1351 | (dir_buf + i); | 1488 | (dir_buf + i - extra_offset); |
1352 | /* It's too expensive to do a full | 1489 | /* It's too expensive to do a full |
1353 | * dirent test each time round this | 1490 | * dirent test each time round this |
1354 | * loop, but we do have to test at | 1491 | * loop, but we do have to test at |
@@ -1356,43 +1493,47 @@ revalidate: | |||
1356 | * failure will be detected in the | 1493 | * failure will be detected in the |
1357 | * dirent test below. */ | 1494 | * dirent test below. */ |
1358 | if (ext4_rec_len_from_disk(de->rec_len, | 1495 | if (ext4_rec_len_from_disk(de->rec_len, |
1359 | inline_size) < EXT4_DIR_REC_LEN(1)) | 1496 | extra_size) < EXT4_DIR_REC_LEN(1)) |
1360 | break; | 1497 | break; |
1361 | i += ext4_rec_len_from_disk(de->rec_len, | 1498 | i += ext4_rec_len_from_disk(de->rec_len, |
1362 | inline_size); | 1499 | extra_size); |
1363 | } | 1500 | } |
1364 | offset = i; | 1501 | offset = i; |
1365 | filp->f_pos = offset; | 1502 | filp->f_pos = offset; |
1366 | filp->f_version = inode->i_version; | 1503 | filp->f_version = inode->i_version; |
1367 | } | 1504 | } |
1368 | 1505 | ||
1369 | while (!error && filp->f_pos < inode->i_size) { | 1506 | while (!error && filp->f_pos < extra_size) { |
1370 | if (filp->f_pos == 0) { | 1507 | if (filp->f_pos == 0) { |
1371 | error = filldir(dirent, ".", 1, 0, inode->i_ino, | 1508 | error = filldir(dirent, ".", 1, 0, inode->i_ino, |
1372 | DT_DIR); | 1509 | DT_DIR); |
1373 | if (error) | 1510 | if (error) |
1374 | break; | 1511 | break; |
1375 | stored++; | 1512 | stored++; |
1513 | filp->f_pos = dotdot_offset; | ||
1514 | continue; | ||
1515 | } | ||
1376 | 1516 | ||
1377 | error = filldir(dirent, "..", 2, 0, parent_ino, | 1517 | if (filp->f_pos == dotdot_offset) { |
1378 | DT_DIR); | 1518 | error = filldir(dirent, "..", 2, |
1519 | dotdot_offset, | ||
1520 | parent_ino, DT_DIR); | ||
1379 | if (error) | 1521 | if (error) |
1380 | break; | 1522 | break; |
1381 | stored++; | 1523 | stored++; |
1382 | 1524 | ||
1383 | filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; | 1525 | filp->f_pos = dotdot_size; |
1384 | continue; | 1526 | continue; |
1385 | } | 1527 | } |
1386 | 1528 | ||
1387 | de = (struct ext4_dir_entry_2 *)(dir_buf + offset); | 1529 | de = (struct ext4_dir_entry_2 *) |
1530 | (dir_buf + filp->f_pos - extra_offset); | ||
1388 | if (ext4_check_dir_entry(inode, filp, de, | 1531 | if (ext4_check_dir_entry(inode, filp, de, |
1389 | iloc.bh, dir_buf, | 1532 | iloc.bh, dir_buf, |
1390 | inline_size, offset)) { | 1533 | extra_size, filp->f_pos)) { |
1391 | ret = stored; | 1534 | ret = stored; |
1392 | goto out; | 1535 | goto out; |
1393 | } | 1536 | } |
1394 | offset += ext4_rec_len_from_disk(de->rec_len, | ||
1395 | inline_size); | ||
1396 | if (le32_to_cpu(de->inode)) { | 1537 | if (le32_to_cpu(de->inode)) { |
1397 | /* We might block in the next section | 1538 | /* We might block in the next section |
1398 | * if the data destination is | 1539 | * if the data destination is |
@@ -1415,9 +1556,8 @@ revalidate: | |||
1415 | stored++; | 1556 | stored++; |
1416 | } | 1557 | } |
1417 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len, | 1558 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len, |
1418 | inline_size); | 1559 | extra_size); |
1419 | } | 1560 | } |
1420 | offset = 0; | ||
1421 | } | 1561 | } |
1422 | out: | 1562 | out: |
1423 | kfree(dir_buf); | 1563 | kfree(dir_buf); |
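
The hunk above remaps readdir file positions for inline directories: "." and ".." are synthesized at fixed virtual offsets, and each real entry's f_pos is translated into a dir_buf index by subtracting extra_offset. A minimal standalone sketch of that translation; the sizes below (an EXT4_INLINE_DOTDOT_SIZE of 4 and 12-byte "." and ".." records) are illustrative assumptions, not values taken from this patch:

#include <assert.h>
#include <stdio.h>

/* Assumed sizes, for illustration only (see note above): */
#define INLINE_DOTDOT_SIZE 4     /* inline bytes that encode ".."         */
#define DOTDOT_OFFSET      12    /* virtual f_pos at which ".." appears   */
#define DOTDOT_SIZE        24    /* virtual f_pos of the first real entry */

int main(void)
{
	long inline_size  = 60;                         /* bytes of inline data */
	long extra_offset = DOTDOT_SIZE - INLINE_DOTDOT_SIZE;          /* = 20 */
	long extra_size   = extra_offset + inline_size; /* f_pos loop bound    */
	long f_pos;

	/* The first real entry sits just past the inline ".." record: */
	assert(DOTDOT_SIZE - extra_offset == INLINE_DOTDOT_SIZE);

	/* Every later position maps into dir_buf the same way
	 * (12-byte records assumed, purely for the demo): */
	for (f_pos = DOTDOT_SIZE; f_pos < extra_size; f_pos += 12)
		printf("f_pos %ld -> dir_buf[%ld]\n", f_pos, f_pos - extra_offset);
	return 0;
}
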
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3a5213bc73e..793d44b84d7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, | |||
55 | __u16 csum_hi = 0; | 55 | __u16 csum_hi = 0; |
56 | __u32 csum; | 56 | __u32 csum; |
57 | 57 | ||
58 | csum_lo = raw->i_checksum_lo; | 58 | csum_lo = le16_to_cpu(raw->i_checksum_lo); |
59 | raw->i_checksum_lo = 0; | 59 | raw->i_checksum_lo = 0; |
60 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 60 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
61 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { | 61 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { |
62 | csum_hi = raw->i_checksum_hi; | 62 | csum_hi = le16_to_cpu(raw->i_checksum_hi); |
63 | raw->i_checksum_hi = 0; | 63 | raw->i_checksum_hi = 0; |
64 | } | 64 | } |
65 | 65 | ||
66 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, | 66 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, |
67 | EXT4_INODE_SIZE(inode->i_sb)); | 67 | EXT4_INODE_SIZE(inode->i_sb)); |
68 | 68 | ||
69 | raw->i_checksum_lo = csum_lo; | 69 | raw->i_checksum_lo = cpu_to_le16(csum_lo); |
70 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 70 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
71 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) | 71 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) |
72 | raw->i_checksum_hi = csum_hi; | 72 | raw->i_checksum_hi = cpu_to_le16(csum_hi); |
73 | 73 | ||
74 | return csum; | 74 | return csum; |
75 | } | 75 | } |
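
The checksum hunk above is an endianness fix: i_checksum_lo/hi are little-endian on-disk fields, so they must pass through le16_to_cpu()/cpu_to_le16() instead of being copied raw. A small portable illustration (le16_to_cpu is modeled by a helper here; the kernel macro compiles to a no-op on little-endian machines):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Portable model of le16_to_cpu(): assemble the value from byte lanes. */
static uint16_t le16_to_cpu_model(const uint8_t b[2])
{
	return (uint16_t)(b[0] | (b[1] << 8));
}

int main(void)
{
	uint8_t disk[2] = { 0x34, 0x12 };  /* 0x1234 stored little-endian */
	uint16_t raw;

	memcpy(&raw, disk, sizeof(raw));   /* what the old code effectively did */
	printf("raw load:    0x%04x (only correct on little-endian CPUs)\n", raw);
	printf("le16_to_cpu: 0x%04x (correct on any host)\n",
	       le16_to_cpu_model(disk));
	return 0;
}
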
@@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode) | |||
210 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 210 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
211 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; | 211 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; |
212 | 212 | ||
213 | jbd2_log_start_commit(journal, commit_tid); | 213 | jbd2_complete_transaction(journal, commit_tid); |
214 | jbd2_log_wait_commit(journal, commit_tid); | ||
215 | filemap_write_and_wait(&inode->i_data); | 214 | filemap_write_and_wait(&inode->i_data); |
216 | } | 215 | } |
217 | truncate_inode_pages(&inode->i_data, 0); | 216 | truncate_inode_pages(&inode->i_data, 0); |
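
jbd2_complete_transaction() replaces the start-commit-then-wait pair with one call that does only the work each journal state requires. A toy state machine capturing that reading of the helper; this is an illustrative model, not the jbd2 implementation (which inspects the journal under j_state_lock):

#include <stdio.h>

enum tid_state { TID_RUNNING, TID_COMMITTING, TID_COMMITTED };

/* Hypothetical model of what completing a transaction entails. */
static const char *complete_transaction(enum tid_state s)
{
	switch (s) {
	case TID_RUNNING:
		return "start the commit, then wait";  /* the old two-call path */
	case TID_COMMITTING:
		return "just wait for the commit";     /* no need to start one  */
	case TID_COMMITTED:
		return "return immediately";           /* nothing left to do    */
	}
	return "?";
}

int main(void)
{
	printf("running:    %s\n", complete_transaction(TID_RUNNING));
	printf("committing: %s\n", complete_transaction(TID_COMMITTING));
	printf("committed:  %s\n", complete_transaction(TID_COMMITTED));
	return 0;
}
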
@@ -1081,20 +1080,42 @@ retry_journal: | |||
1081 | /* For write_end() in data=journal mode */ | 1080 | /* For write_end() in data=journal mode */ |
1082 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | 1081 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) |
1083 | { | 1082 | { |
1083 | int ret; | ||
1084 | if (!buffer_mapped(bh) || buffer_freed(bh)) | 1084 | if (!buffer_mapped(bh) || buffer_freed(bh)) |
1085 | return 0; | 1085 | return 0; |
1086 | set_buffer_uptodate(bh); | 1086 | set_buffer_uptodate(bh); |
1087 | return ext4_handle_dirty_metadata(handle, NULL, bh); | 1087 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); |
1088 | clear_buffer_meta(bh); | ||
1089 | clear_buffer_prio(bh); | ||
1090 | return ret; | ||
1088 | } | 1091 | } |
1089 | 1092 | ||
1090 | static int ext4_generic_write_end(struct file *file, | 1093 | /* |
1091 | struct address_space *mapping, | 1094 | * We need to pick up the new inode size which generic_commit_write gave us |
1092 | loff_t pos, unsigned len, unsigned copied, | 1095 | * `file' can be NULL - eg, when called from page_symlink(). |
1093 | struct page *page, void *fsdata) | 1096 | * |
1097 | * ext4 never places buffers on inode->i_mapping->private_list. metadata | ||
1098 | * buffers are managed internally. | ||
1099 | */ | ||
1100 | static int ext4_write_end(struct file *file, | ||
1101 | struct address_space *mapping, | ||
1102 | loff_t pos, unsigned len, unsigned copied, | ||
1103 | struct page *page, void *fsdata) | ||
1094 | { | 1104 | { |
1095 | int i_size_changed = 0; | ||
1096 | struct inode *inode = mapping->host; | ||
1097 | handle_t *handle = ext4_journal_current_handle(); | 1105 | handle_t *handle = ext4_journal_current_handle(); |
1106 | struct inode *inode = mapping->host; | ||
1107 | int ret = 0, ret2; | ||
1108 | int i_size_changed = 0; | ||
1109 | |||
1110 | trace_ext4_write_end(inode, pos, len, copied); | ||
1111 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { | ||
1112 | ret = ext4_jbd2_file_inode(handle, inode); | ||
1113 | if (ret) { | ||
1114 | unlock_page(page); | ||
1115 | page_cache_release(page); | ||
1116 | goto errout; | ||
1117 | } | ||
1118 | } | ||
1098 | 1119 | ||
1099 | if (ext4_has_inline_data(inode)) | 1120 | if (ext4_has_inline_data(inode)) |
1100 | copied = ext4_write_inline_data_end(inode, pos, len, | 1121 | copied = ext4_write_inline_data_end(inode, pos, len, |
@@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file, | |||
1105 | 1126 | ||
1106 | /* | 1127 | /* |
1107 | * No need to use i_size_read() here, the i_size | 1128 | * No need to use i_size_read() here, the i_size |
1108 | * cannot change under us because we hold i_mutex. | 1129 | * cannot change under us because we hold i_mutex. |
1109 | * | 1130 | * |
1110 | * But it's important to update i_size while still holding page lock: | 1131 | * But it's important to update i_size while still holding page lock: |
1111 | * page writeout could otherwise come in and zero beyond i_size. | 1132 | * page writeout could otherwise come in and zero beyond i_size. |
@@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file, | |||
1115 | i_size_changed = 1; | 1136 | i_size_changed = 1; |
1116 | } | 1137 | } |
1117 | 1138 | ||
1118 | if (pos + copied > EXT4_I(inode)->i_disksize) { | 1139 | if (pos + copied > EXT4_I(inode)->i_disksize) { |
1119 | /* We need to mark inode dirty even if | 1140 | /* We need to mark inode dirty even if |
1120 | * new_i_size is less than inode->i_size | 1141 | * new_i_size is less than inode->i_size |
1121 | * bu greater than i_disksize.(hint delalloc) | 1142 | * but greater than i_disksize. (hint delalloc) |
1122 | */ | 1143 | */ |
1123 | ext4_update_i_disksize(inode, (pos + copied)); | 1144 | ext4_update_i_disksize(inode, (pos + copied)); |
1124 | i_size_changed = 1; | 1145 | i_size_changed = 1; |
@@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file, | |||
1135 | if (i_size_changed) | 1156 | if (i_size_changed) |
1136 | ext4_mark_inode_dirty(handle, inode); | 1157 | ext4_mark_inode_dirty(handle, inode); |
1137 | 1158 | ||
1138 | return copied; | 1159 | if (copied < 0) |
1139 | } | 1160 | ret = copied; |
1140 | |||
1141 | /* | ||
1142 | * We need to pick up the new inode size which generic_commit_write gave us | ||
1143 | * `file' can be NULL - eg, when called from page_symlink(). | ||
1144 | * | ||
1145 | * ext4 never places buffers on inode->i_mapping->private_list. metadata | ||
1146 | * buffers are managed internally. | ||
1147 | */ | ||
1148 | static int ext4_ordered_write_end(struct file *file, | ||
1149 | struct address_space *mapping, | ||
1150 | loff_t pos, unsigned len, unsigned copied, | ||
1151 | struct page *page, void *fsdata) | ||
1152 | { | ||
1153 | handle_t *handle = ext4_journal_current_handle(); | ||
1154 | struct inode *inode = mapping->host; | ||
1155 | int ret = 0, ret2; | ||
1156 | |||
1157 | trace_ext4_ordered_write_end(inode, pos, len, copied); | ||
1158 | ret = ext4_jbd2_file_inode(handle, inode); | ||
1159 | |||
1160 | if (ret == 0) { | ||
1161 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | ||
1162 | page, fsdata); | ||
1163 | copied = ret2; | ||
1164 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | ||
1165 | /* if we have allocated more blocks and copied | ||
1166 | * less. We will have blocks allocated outside | ||
1167 | * inode->i_size. So truncate them | ||
1168 | */ | ||
1169 | ext4_orphan_add(handle, inode); | ||
1170 | if (ret2 < 0) | ||
1171 | ret = ret2; | ||
1172 | } else { | ||
1173 | unlock_page(page); | ||
1174 | page_cache_release(page); | ||
1175 | } | ||
1176 | |||
1177 | ret2 = ext4_journal_stop(handle); | ||
1178 | if (!ret) | ||
1179 | ret = ret2; | ||
1180 | |||
1181 | if (pos + len > inode->i_size) { | ||
1182 | ext4_truncate_failed_write(inode); | ||
1183 | /* | ||
1184 | * If truncate failed early the inode might still be | ||
1185 | * on the orphan list; we need to make sure the inode | ||
1186 | * is removed from the orphan list in that case. | ||
1187 | */ | ||
1188 | if (inode->i_nlink) | ||
1189 | ext4_orphan_del(NULL, inode); | ||
1190 | } | ||
1191 | |||
1192 | |||
1193 | return ret ? ret : copied; | ||
1194 | } | ||
1195 | |||
1196 | static int ext4_writeback_write_end(struct file *file, | ||
1197 | struct address_space *mapping, | ||
1198 | loff_t pos, unsigned len, unsigned copied, | ||
1199 | struct page *page, void *fsdata) | ||
1200 | { | ||
1201 | handle_t *handle = ext4_journal_current_handle(); | ||
1202 | struct inode *inode = mapping->host; | ||
1203 | int ret = 0, ret2; | ||
1204 | |||
1205 | trace_ext4_writeback_write_end(inode, pos, len, copied); | ||
1206 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | ||
1207 | page, fsdata); | ||
1208 | copied = ret2; | ||
1209 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1161 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1210 | /* if we have allocated more blocks and copied | 1162 | /* if we have allocated more blocks and copied |
1211 | * less. We will have blocks allocated outside | 1163 | * less. We will have blocks allocated outside |
1212 | * inode->i_size. So truncate them | 1164 | * inode->i_size. So truncate them |
1213 | */ | 1165 | */ |
1214 | ext4_orphan_add(handle, inode); | 1166 | ext4_orphan_add(handle, inode); |
1215 | 1167 | errout: | |
1216 | if (ret2 < 0) | ||
1217 | ret = ret2; | ||
1218 | |||
1219 | ret2 = ext4_journal_stop(handle); | 1168 | ret2 = ext4_journal_stop(handle); |
1220 | if (!ret) | 1169 | if (!ret) |
1221 | ret = ret2; | 1170 | ret = ret2; |
@@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1538 | struct ext4_io_submit io_submit; | 1487 | struct ext4_io_submit io_submit; |
1539 | 1488 | ||
1540 | BUG_ON(mpd->next_page <= mpd->first_page); | 1489 | BUG_ON(mpd->next_page <= mpd->first_page); |
1541 | memset(&io_submit, 0, sizeof(io_submit)); | 1490 | ext4_io_submit_init(&io_submit, mpd->wbc); |
1491 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
1492 | if (!io_submit.io_end) | ||
1493 | return -ENOMEM; | ||
1542 | /* | 1494 | /* |
1543 | * We need to start from the first_page to the next_page - 1 | 1495 | * We need to start from the first_page to the next_page - 1 |
1544 | * to make sure we also write the mapped dirty buffer_heads. | 1496 | * to make sure we also write the mapped dirty buffer_heads. |
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1626 | pagevec_release(&pvec); | 1578 | pagevec_release(&pvec); |
1627 | } | 1579 | } |
1628 | ext4_io_submit(&io_submit); | 1580 | ext4_io_submit(&io_submit); |
1581 | /* Drop io_end reference we got from init */ | ||
1582 | ext4_put_io_end_defer(io_submit.io_end); | ||
1629 | return ret; | 1583 | return ret; |
1630 | } | 1584 | } |
1631 | 1585 | ||
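
ext4_io_submit_init()/ext4_init_io_end() plus ext4_put_io_end_defer() turn io_end into a reference-counted object: the submitter holds one reference and each outstanding I/O another, and the final put triggers unwritten-extent conversion (the _defer variant punts that work to a workqueue). A toy model of the lifetime rule; the names mirror the patch but the bodies are illustrative, and a plain int stands in for atomic_t:

#include <stdio.h>
#include <stdlib.h>

struct toy_io_end {
	int count;      /* the kernel uses atomic_t     */
	int unwritten;  /* models EXT4_IO_END_UNWRITTEN */
};

static struct toy_io_end *init_io_end(int unwritten)
{
	struct toy_io_end *io = calloc(1, sizeof(*io));

	if (!io)
		abort();
	io->count = 1;  /* reference owned by the submitter */
	io->unwritten = unwritten;
	return io;
}

static struct toy_io_end *get_io_end(struct toy_io_end *io)
{
	io->count++;    /* e.g. one reference per submitted bio, or for DIO */
	return io;
}

static void put_io_end(struct toy_io_end *io)
{
	if (--io->count == 0) {
		if (io->unwritten)
			printf("last put: convert unwritten extents\n");
		free(io);
	}
}

int main(void)
{
	struct toy_io_end *io = init_io_end(1);

	get_io_end(io);  /* bio goes in flight                      */
	put_io_end(io);  /* submitter drops its init reference      */
	put_io_end(io);  /* bio completion: conversion happens here */
	return 0;
}
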
@@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
1670 | { | 1624 | { |
1671 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1625 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1672 | struct super_block *sb = inode->i_sb; | 1626 | struct super_block *sb = inode->i_sb; |
1627 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1673 | 1628 | ||
1674 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", | 1629 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", |
1675 | EXT4_C2B(EXT4_SB(inode->i_sb), | 1630 | EXT4_C2B(EXT4_SB(inode->i_sb), |
1676 | ext4_count_free_clusters(inode->i_sb))); | 1631 | ext4_count_free_clusters(sb))); |
1677 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); | 1632 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); |
1678 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", | 1633 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", |
1679 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), | 1634 | (long long) EXT4_C2B(EXT4_SB(sb), |
1680 | percpu_counter_sum(&sbi->s_freeclusters_counter))); | 1635 | percpu_counter_sum(&sbi->s_freeclusters_counter))); |
1681 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", | 1636 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", |
1682 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), | 1637 | (long long) EXT4_C2B(EXT4_SB(sb), |
1683 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); | 1638 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); |
1684 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); | 1639 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); |
1685 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", | 1640 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", |
1686 | EXT4_I(inode)->i_reserved_data_blocks); | 1641 | ei->i_reserved_data_blocks); |
1687 | ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", | 1642 | ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", |
1688 | EXT4_I(inode)->i_reserved_meta_blocks); | 1643 | ei->i_reserved_meta_blocks); |
1644 | ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", | ||
1645 | ei->i_allocated_meta_blocks); | ||
1689 | return; | 1646 | return; |
1690 | } | 1647 | } |
1691 | 1648 | ||
@@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | |||
1740 | */ | 1697 | */ |
1741 | map.m_lblk = next; | 1698 | map.m_lblk = next; |
1742 | map.m_len = max_blocks; | 1699 | map.m_len = max_blocks; |
1743 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE; | 1700 | /* |
1701 | * We're in the delalloc path and it is possible that we're going to | ||
1702 | * need more metadata blocks than previously reserved. However, | ||
1703 | * we must not fail because we're in writeback; there is nothing | ||
1704 | * we can do about a failure here, and it could mean data loss. | ||
1705 | * So use reserved blocks to allocate metadata if possible. | ||
1706 | */ | ||
1707 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | ||
1708 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | ||
1744 | if (ext4_should_dioread_nolock(mpd->inode)) | 1709 | if (ext4_should_dioread_nolock(mpd->inode)) |
1745 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | 1710 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; |
1746 | if (mpd->b_state & (1 << BH_Delay)) | 1711 | if (mpd->b_state & (1 << BH_Delay)) |
1747 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | 1712 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; |
1748 | 1713 | ||
1714 | |||
1749 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); | 1715 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); |
1750 | if (blks < 0) { | 1716 | if (blks < 0) { |
1751 | struct super_block *sb = mpd->inode->i_sb; | 1717 | struct super_block *sb = mpd->inode->i_sb; |
@@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page, | |||
2272 | */ | 2238 | */ |
2273 | return __ext4_journalled_writepage(page, len); | 2239 | return __ext4_journalled_writepage(page, len); |
2274 | 2240 | ||
2275 | memset(&io_submit, 0, sizeof(io_submit)); | 2241 | ext4_io_submit_init(&io_submit, wbc); |
2242 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
2243 | if (!io_submit.io_end) { | ||
2244 | redirty_page_for_writepage(wbc, page); | ||
2245 | return -ENOMEM; | ||
2246 | } | ||
2276 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); | 2247 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); |
2277 | ext4_io_submit(&io_submit); | 2248 | ext4_io_submit(&io_submit); |
2249 | /* Drop io_end reference we got from init */ | ||
2250 | ext4_put_io_end_defer(io_submit.io_end); | ||
2278 | return ret; | 2251 | return ret; |
2279 | } | 2252 | } |
2280 | 2253 | ||
@@ -2661,7 +2634,7 @@ out_writepages: | |||
2661 | 2634 | ||
2662 | static int ext4_nonda_switch(struct super_block *sb) | 2635 | static int ext4_nonda_switch(struct super_block *sb) |
2663 | { | 2636 | { |
2664 | s64 free_blocks, dirty_blocks; | 2637 | s64 free_clusters, dirty_clusters; |
2665 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2638 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2666 | 2639 | ||
2667 | /* | 2640 | /* |
@@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2672 | * Delalloc need an accurate free block accounting. So switch | 2645 | * Delalloc need an accurate free block accounting. So switch |
2673 | * to non delalloc when we are near to error range. | 2646 | * to non delalloc when we are near to error range. |
2674 | */ | 2647 | */ |
2675 | free_blocks = EXT4_C2B(sbi, | 2648 | free_clusters = |
2676 | percpu_counter_read_positive(&sbi->s_freeclusters_counter)); | 2649 | percpu_counter_read_positive(&sbi->s_freeclusters_counter); |
2677 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | 2650 | dirty_clusters = |
2651 | percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | ||
2678 | /* | 2652 | /* |
2679 | * Start pushing delalloc when 1/2 of free blocks are dirty. | 2653 | * Start pushing delalloc when 1/2 of free blocks are dirty. |
2680 | */ | 2654 | */ |
2681 | if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) | 2655 | if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) |
2682 | try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); | 2656 | try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); |
2683 | 2657 | ||
2684 | if (2 * free_blocks < 3 * dirty_blocks || | 2658 | if (2 * free_clusters < 3 * dirty_clusters || |
2685 | free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { | 2659 | free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { |
2686 | /* | 2660 | /* |
2687 | * free block count is less than 150% of dirty blocks | 2661 | * free block count is less than 150% of dirty blocks |
2688 | * or free blocks is less than watermark | 2662 | * or free blocks is less than watermark |
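
With the variables renamed to cluster units, the two thresholds in ext4_nonda_switch() are easy to sanity-check numerically. A runnable sketch; EXT4_FREECLUSTERS_WATERMARK is given an arbitrary stand-in value here (the real one derives from the per-CPU counter batch size):

#include <stdio.h>

#define FREECLUSTERS_WATERMARK 1024  /* stand-in value, see note above */

static void classify(long long free_clusters, long long dirty_clusters)
{
	int push  = dirty_clusters && free_clusters < 2 * dirty_clusters;
	int nonda = 2 * free_clusters < 3 * dirty_clusters ||
		    free_clusters < dirty_clusters + FREECLUSTERS_WATERMARK;

	printf("free=%-6lld dirty=%-6lld push-writeback=%d non-delalloc=%d\n",
	       free_clusters, dirty_clusters, push, nonda);
}

int main(void)
{
	classify(10000, 1000);  /* plenty of room: stay in delalloc      */
	classify(10000, 6000);  /* over half dirty: push writeback       */
	classify(8000,  6000);  /* free < 150% of dirty: switch to nonda */
	return 0;
}
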
@@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file, | |||
2818 | unsigned long start, end; | 2792 | unsigned long start, end; |
2819 | int write_mode = (int)(unsigned long)fsdata; | 2793 | int write_mode = (int)(unsigned long)fsdata; |
2820 | 2794 | ||
2821 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 2795 | if (write_mode == FALL_BACK_TO_NONDELALLOC) |
2822 | switch (ext4_inode_journal_mode(inode)) { | 2796 | return ext4_write_end(file, mapping, pos, |
2823 | case EXT4_INODE_ORDERED_DATA_MODE: | 2797 | len, copied, page, fsdata); |
2824 | return ext4_ordered_write_end(file, mapping, pos, | ||
2825 | len, copied, page, fsdata); | ||
2826 | case EXT4_INODE_WRITEBACK_DATA_MODE: | ||
2827 | return ext4_writeback_write_end(file, mapping, pos, | ||
2828 | len, copied, page, fsdata); | ||
2829 | default: | ||
2830 | BUG(); | ||
2831 | } | ||
2832 | } | ||
2833 | 2798 | ||
2834 | trace_ext4_da_write_end(inode, pos, len, copied); | 2799 | trace_ext4_da_write_end(inode, pos, len, copied); |
2835 | start = pos & (PAGE_CACHE_SIZE - 1); | 2800 | start = pos & (PAGE_CACHE_SIZE - 1); |
@@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3113 | struct inode *inode = file_inode(iocb->ki_filp); | 3078 | struct inode *inode = file_inode(iocb->ki_filp); |
3114 | ext4_io_end_t *io_end = iocb->private; | 3079 | ext4_io_end_t *io_end = iocb->private; |
3115 | 3080 | ||
3116 | /* if not async direct IO or dio with 0 bytes write, just return */ | 3081 | /* if not async direct IO just return */ |
3117 | if (!io_end || !size) | 3082 | if (!io_end) { |
3118 | goto out; | 3083 | inode_dio_done(inode); |
3084 | if (is_async) | ||
3085 | aio_complete(iocb, ret, 0); | ||
3086 | return; | ||
3087 | } | ||
3119 | 3088 | ||
3120 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3089 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
3121 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 3090 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
@@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3123 | size); | 3092 | size); |
3124 | 3093 | ||
3125 | iocb->private = NULL; | 3094 | iocb->private = NULL; |
3126 | |||
3127 | /* if not aio dio with unwritten extents, just free io and return */ | ||
3128 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
3129 | ext4_free_io_end(io_end); | ||
3130 | out: | ||
3131 | inode_dio_done(inode); | ||
3132 | if (is_async) | ||
3133 | aio_complete(iocb, ret, 0); | ||
3134 | return; | ||
3135 | } | ||
3136 | |||
3137 | io_end->offset = offset; | 3095 | io_end->offset = offset; |
3138 | io_end->size = size; | 3096 | io_end->size = size; |
3139 | if (is_async) { | 3097 | if (is_async) { |
3140 | io_end->iocb = iocb; | 3098 | io_end->iocb = iocb; |
3141 | io_end->result = ret; | 3099 | io_end->result = ret; |
3142 | } | 3100 | } |
3143 | 3101 | ext4_put_io_end_defer(io_end); | |
3144 | ext4_add_complete_io(io_end); | ||
3145 | } | 3102 | } |
3146 | 3103 | ||
3147 | /* | 3104 | /* |
@@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3175 | get_block_t *get_block_func = NULL; | 3132 | get_block_t *get_block_func = NULL; |
3176 | int dio_flags = 0; | 3133 | int dio_flags = 0; |
3177 | loff_t final_size = offset + count; | 3134 | loff_t final_size = offset + count; |
3135 | ext4_io_end_t *io_end = NULL; | ||
3178 | 3136 | ||
3179 | /* Use the old path for reads and writes beyond i_size. */ | 3137 | /* Use the old path for reads and writes beyond i_size. */ |
3180 | if (rw != WRITE || final_size > inode->i_size) | 3138 | if (rw != WRITE || final_size > inode->i_size) |
@@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3213 | iocb->private = NULL; | 3171 | iocb->private = NULL; |
3214 | ext4_inode_aio_set(inode, NULL); | 3172 | ext4_inode_aio_set(inode, NULL); |
3215 | if (!is_sync_kiocb(iocb)) { | 3173 | if (!is_sync_kiocb(iocb)) { |
3216 | ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); | 3174 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
3217 | if (!io_end) { | 3175 | if (!io_end) { |
3218 | ret = -ENOMEM; | 3176 | ret = -ENOMEM; |
3219 | goto retake_lock; | 3177 | goto retake_lock; |
3220 | } | 3178 | } |
3221 | io_end->flag |= EXT4_IO_END_DIRECT; | 3179 | io_end->flag |= EXT4_IO_END_DIRECT; |
3222 | iocb->private = io_end; | 3180 | /* |
3181 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() | ||
3182 | */ | ||
3183 | iocb->private = ext4_get_io_end(io_end); | ||
3223 | /* | 3184 | /* |
3224 | * we save the io structure for current async direct | 3185 | * we save the io structure for current async direct |
3225 | * IO, so that later ext4_map_blocks() could flag the | 3186 | * IO, so that later ext4_map_blocks() could flag the |
@@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3243 | NULL, | 3204 | NULL, |
3244 | dio_flags); | 3205 | dio_flags); |
3245 | 3206 | ||
3246 | if (iocb->private) | ||
3247 | ext4_inode_aio_set(inode, NULL); | ||
3248 | /* | 3207 | /* |
3249 | * The io_end structure takes a reference to the inode, that | 3208 | * Put our reference to io_end. This can free the io_end structure e.g. |
3250 | * structure needs to be destroyed and the reference to the | 3209 | * in sync IO case or in case of error. It can even perform extent |
3251 | * inode need to be dropped, when IO is complete, even with 0 | 3210 | * conversion if all bios we submitted finished before we got here. |
3252 | * byte write, or failed. | 3211 | * Note that in that case iocb->private can be already set to NULL |
3253 | * | 3212 | * here. |
3254 | * In the successful AIO DIO case, the io_end structure will | ||
3255 | * be destroyed and the reference to the inode will be dropped | ||
3256 | * after the end_io call back function is called. | ||
3257 | * | ||
3258 | * In the case there is 0 byte write, or error case, since VFS | ||
3259 | * direct IO won't invoke the end_io call back function, we | ||
3260 | * need to free the end_io structure here. | ||
3261 | */ | 3213 | */ |
3262 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | 3214 | if (io_end) { |
3263 | ext4_free_io_end(iocb->private); | 3215 | ext4_inode_aio_set(inode, NULL); |
3264 | iocb->private = NULL; | 3216 | ext4_put_io_end(io_end); |
3265 | } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3217 | /* |
3218 | * In case of an error or no write, ext4_end_io_dio() was not | ||
3219 | * called, so we have to drop the iocb's reference ourselves. | ||
3220 | */ | ||
3221 | if (ret <= 0 && ret != -EIOCBQUEUED) { | ||
3222 | WARN_ON(iocb->private != io_end); | ||
3223 | ext4_put_io_end(io_end); | ||
3224 | iocb->private = NULL; | ||
3225 | } | ||
3226 | } | ||
3227 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | ||
3266 | EXT4_STATE_DIO_UNWRITTEN)) { | 3228 | EXT4_STATE_DIO_UNWRITTEN)) { |
3267 | int err; | 3229 | int err; |
3268 | /* | 3230 | /* |
@@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
3334 | return __set_page_dirty_nobuffers(page); | 3296 | return __set_page_dirty_nobuffers(page); |
3335 | } | 3297 | } |
3336 | 3298 | ||
3337 | static const struct address_space_operations ext4_ordered_aops = { | 3299 | static const struct address_space_operations ext4_aops = { |
3338 | .readpage = ext4_readpage, | 3300 | .readpage = ext4_readpage, |
3339 | .readpages = ext4_readpages, | 3301 | .readpages = ext4_readpages, |
3340 | .writepage = ext4_writepage, | 3302 | .writepage = ext4_writepage, |
3341 | .write_begin = ext4_write_begin, | 3303 | .write_begin = ext4_write_begin, |
3342 | .write_end = ext4_ordered_write_end, | 3304 | .write_end = ext4_write_end, |
3343 | .bmap = ext4_bmap, | ||
3344 | .invalidatepage = ext4_invalidatepage, | ||
3345 | .releasepage = ext4_releasepage, | ||
3346 | .direct_IO = ext4_direct_IO, | ||
3347 | .migratepage = buffer_migrate_page, | ||
3348 | .is_partially_uptodate = block_is_partially_uptodate, | ||
3349 | .error_remove_page = generic_error_remove_page, | ||
3350 | }; | ||
3351 | |||
3352 | static const struct address_space_operations ext4_writeback_aops = { | ||
3353 | .readpage = ext4_readpage, | ||
3354 | .readpages = ext4_readpages, | ||
3355 | .writepage = ext4_writepage, | ||
3356 | .write_begin = ext4_write_begin, | ||
3357 | .write_end = ext4_writeback_write_end, | ||
3358 | .bmap = ext4_bmap, | 3305 | .bmap = ext4_bmap, |
3359 | .invalidatepage = ext4_invalidatepage, | 3306 | .invalidatepage = ext4_invalidatepage, |
3360 | .releasepage = ext4_releasepage, | 3307 | .releasepage = ext4_releasepage, |
@@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode) | |||
3399 | { | 3346 | { |
3400 | switch (ext4_inode_journal_mode(inode)) { | 3347 | switch (ext4_inode_journal_mode(inode)) { |
3401 | case EXT4_INODE_ORDERED_DATA_MODE: | 3348 | case EXT4_INODE_ORDERED_DATA_MODE: |
3402 | if (test_opt(inode->i_sb, DELALLOC)) | 3349 | ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); |
3403 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
3404 | else | ||
3405 | inode->i_mapping->a_ops = &ext4_ordered_aops; | ||
3406 | break; | 3350 | break; |
3407 | case EXT4_INODE_WRITEBACK_DATA_MODE: | 3351 | case EXT4_INODE_WRITEBACK_DATA_MODE: |
3408 | if (test_opt(inode->i_sb, DELALLOC)) | 3352 | ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); |
3409 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
3410 | else | ||
3411 | inode->i_mapping->a_ops = &ext4_writeback_aops; | ||
3412 | break; | 3353 | break; |
3413 | case EXT4_INODE_JOURNAL_DATA_MODE: | 3354 | case EXT4_INODE_JOURNAL_DATA_MODE: |
3414 | inode->i_mapping->a_ops = &ext4_journalled_aops; | 3355 | inode->i_mapping->a_ops = &ext4_journalled_aops; |
3415 | break; | 3356 | return; |
3416 | default: | 3357 | default: |
3417 | BUG(); | 3358 | BUG(); |
3418 | } | 3359 | } |
3360 | if (test_opt(inode->i_sb, DELALLOC)) | ||
3361 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
3362 | else | ||
3363 | inode->i_mapping->a_ops = &ext4_aops; | ||
3419 | } | 3364 | } |
3420 | 3365 | ||
3421 | 3366 | ||
@@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode) | |||
3646 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | 3591 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) |
3647 | { | 3592 | { |
3648 | struct inode *inode = file_inode(file); | 3593 | struct inode *inode = file_inode(file); |
3594 | struct super_block *sb = inode->i_sb; | ||
3595 | ext4_lblk_t first_block, stop_block; | ||
3596 | struct address_space *mapping = inode->i_mapping; | ||
3597 | loff_t first_page, last_page, page_len; | ||
3598 | loff_t first_page_offset, last_page_offset; | ||
3599 | handle_t *handle; | ||
3600 | unsigned int credits; | ||
3601 | int ret = 0; | ||
3602 | |||
3649 | if (!S_ISREG(inode->i_mode)) | 3603 | if (!S_ISREG(inode->i_mode)) |
3650 | return -EOPNOTSUPP; | 3604 | return -EOPNOTSUPP; |
3651 | 3605 | ||
3652 | if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3606 | if (EXT4_SB(sb)->s_cluster_ratio > 1) { |
3653 | return ext4_ind_punch_hole(file, offset, length); | ||
3654 | |||
3655 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { | ||
3656 | /* TODO: Add support for bigalloc file systems */ | 3607 | /* TODO: Add support for bigalloc file systems */ |
3657 | return -EOPNOTSUPP; | 3608 | return -EOPNOTSUPP; |
3658 | } | 3609 | } |
3659 | 3610 | ||
3660 | trace_ext4_punch_hole(inode, offset, length); | 3611 | trace_ext4_punch_hole(inode, offset, length); |
3661 | 3612 | ||
3662 | return ext4_ext_punch_hole(file, offset, length); | 3613 | /* |
3614 | * Write out all dirty pages to avoid race conditions | ||
3615 | * Then release them. | ||
3616 | */ | ||
3617 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
3618 | ret = filemap_write_and_wait_range(mapping, offset, | ||
3619 | offset + length - 1); | ||
3620 | if (ret) | ||
3621 | return ret; | ||
3622 | } | ||
3623 | |||
3624 | mutex_lock(&inode->i_mutex); | ||
3625 | /* It's not possible to punch a hole in an append-only file */ | ||
3626 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { | ||
3627 | ret = -EPERM; | ||
3628 | goto out_mutex; | ||
3629 | } | ||
3630 | if (IS_SWAPFILE(inode)) { | ||
3631 | ret = -ETXTBSY; | ||
3632 | goto out_mutex; | ||
3633 | } | ||
3634 | |||
3635 | /* No need to punch hole beyond i_size */ | ||
3636 | if (offset >= inode->i_size) | ||
3637 | goto out_mutex; | ||
3638 | |||
3639 | /* | ||
3640 | * If the hole extends beyond i_size, set the hole | ||
3641 | * to end after the page that contains i_size | ||
3642 | */ | ||
3643 | if (offset + length > inode->i_size) { | ||
3644 | length = inode->i_size + | ||
3645 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | ||
3646 | offset; | ||
3647 | } | ||
3648 | |||
3649 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
3650 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | ||
3651 | |||
3652 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | ||
3653 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | ||
3654 | |||
3655 | /* Now release the pages */ | ||
3656 | if (last_page_offset > first_page_offset) { | ||
3657 | truncate_pagecache_range(inode, first_page_offset, | ||
3658 | last_page_offset - 1); | ||
3659 | } | ||
3660 | |||
3661 | /* Wait for all existing dio workers; newcomers will block on i_mutex */ | ||
3662 | ext4_inode_block_unlocked_dio(inode); | ||
3663 | ret = ext4_flush_unwritten_io(inode); | ||
3664 | if (ret) | ||
3665 | goto out_dio; | ||
3666 | inode_dio_wait(inode); | ||
3667 | |||
3668 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3669 | credits = ext4_writepage_trans_blocks(inode); | ||
3670 | else | ||
3671 | credits = ext4_blocks_for_truncate(inode); | ||
3672 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||
3673 | if (IS_ERR(handle)) { | ||
3674 | ret = PTR_ERR(handle); | ||
3675 | ext4_std_error(sb, ret); | ||
3676 | goto out_dio; | ||
3677 | } | ||
3678 | |||
3679 | /* | ||
3680 | * Now we need to zero out the non-page-aligned data in the | ||
3681 | * pages at the start and tail of the hole, and unmap the | ||
3682 | * buffer heads for the block aligned regions of the page that | ||
3683 | * were completely zeroed. | ||
3684 | */ | ||
3685 | if (first_page > last_page) { | ||
3686 | /* | ||
3687 | * If the file space being truncated is contained | ||
3688 | * within a page, just zero out and unmap the middle of | ||
3689 | * that page | ||
3690 | */ | ||
3691 | ret = ext4_discard_partial_page_buffers(handle, | ||
3692 | mapping, offset, length, 0); | ||
3693 | |||
3694 | if (ret) | ||
3695 | goto out_stop; | ||
3696 | } else { | ||
3697 | /* | ||
3698 | * zero out and unmap the partial page that contains | ||
3699 | * the start of the hole | ||
3700 | */ | ||
3701 | page_len = first_page_offset - offset; | ||
3702 | if (page_len > 0) { | ||
3703 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3704 | offset, page_len, 0); | ||
3705 | if (ret) | ||
3706 | goto out_stop; | ||
3707 | } | ||
3708 | |||
3709 | /* | ||
3710 | * zero out and unmap the partial page that contains | ||
3711 | * the end of the hole | ||
3712 | */ | ||
3713 | page_len = offset + length - last_page_offset; | ||
3714 | if (page_len > 0) { | ||
3715 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3716 | last_page_offset, page_len, 0); | ||
3717 | if (ret) | ||
3718 | goto out_stop; | ||
3719 | } | ||
3720 | } | ||
3721 | |||
3722 | /* | ||
3723 | * If i_size is contained in the last page, we need to | ||
3724 | * unmap and zero the partial page after i_size | ||
3725 | */ | ||
3726 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
3727 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
3728 | page_len = PAGE_CACHE_SIZE - | ||
3729 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3730 | |||
3731 | if (page_len > 0) { | ||
3732 | ret = ext4_discard_partial_page_buffers(handle, | ||
3733 | mapping, inode->i_size, page_len, 0); | ||
3734 | |||
3735 | if (ret) | ||
3736 | goto out_stop; | ||
3737 | } | ||
3738 | } | ||
3739 | |||
3740 | first_block = (offset + sb->s_blocksize - 1) >> | ||
3741 | EXT4_BLOCK_SIZE_BITS(sb); | ||
3742 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
3743 | |||
3744 | /* If there are no blocks to remove, return now */ | ||
3745 | if (first_block >= stop_block) | ||
3746 | goto out_stop; | ||
3747 | |||
3748 | down_write(&EXT4_I(inode)->i_data_sem); | ||
3749 | ext4_discard_preallocations(inode); | ||
3750 | |||
3751 | ret = ext4_es_remove_extent(inode, first_block, | ||
3752 | stop_block - first_block); | ||
3753 | if (ret) { | ||
3754 | up_write(&EXT4_I(inode)->i_data_sem); | ||
3755 | goto out_stop; | ||
3756 | } | ||
3757 | |||
3758 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3759 | ret = ext4_ext_remove_space(inode, first_block, | ||
3760 | stop_block - 1); | ||
3761 | else | ||
3762 | ret = ext4_free_hole_blocks(handle, inode, first_block, | ||
3763 | stop_block); | ||
3764 | |||
3765 | ext4_discard_preallocations(inode); | ||
3766 | up_write(&EXT4_I(inode)->i_data_sem); | ||
3767 | if (IS_SYNC(inode)) | ||
3768 | ext4_handle_sync(handle); | ||
3769 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
3770 | ext4_mark_inode_dirty(handle, inode); | ||
3771 | out_stop: | ||
3772 | ext4_journal_stop(handle); | ||
3773 | out_dio: | ||
3774 | ext4_inode_resume_unlocked_dio(inode); | ||
3775 | out_mutex: | ||
3776 | mutex_unlock(&inode->i_mutex); | ||
3777 | return ret; | ||
3663 | } | 3778 | } |
3664 | 3779 | ||
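
The page and block rounding in the new ext4_punch_hole() can be verified by hand. The same arithmetic as a runnable program, assuming 4 KiB pages and 4 KiB filesystem blocks: only whole pages are dropped from the page cache, the partial head and tail are merely zeroed via ext4_discard_partial_page_buffers(), and only whole blocks land in [first_block, stop_block):

#include <stdio.h>

#define PGSIZE  4096LL   /* assumed PAGE_CACHE_SIZE */
#define PGSHIFT 12
#define BLKSIZE 4096LL   /* assumed sb->s_blocksize */
#define BLKBITS 12

int main(void)
{
	long long offset = 5000, length = 10000;  /* sample hole [5000, 15000) */

	long long first_page = (offset + PGSIZE - 1) >> PGSHIFT;        /* 2 */
	long long last_page  = (offset + length) >> PGSHIFT;            /* 3 */
	long long first_page_offset = first_page << PGSHIFT;        /*  8192 */
	long long last_page_offset  = last_page  << PGSHIFT;        /* 12288 */

	/* Partial head [5000, 8192) and tail [12288, 15000) are only zeroed: */
	long long head = first_page_offset - offset;                 /* 3192 */
	long long tail = offset + length - last_page_offset;         /* 2712 */

	/* Whole blocks actually deallocated: */
	long long first_block = (offset + BLKSIZE - 1) >> BLKBITS;      /* 2 */
	long long stop_block  = (offset + length) >> BLKBITS;           /* 3 */

	printf("drop pages [%lld,%lld), zero head %lld B, tail %lld B, "
	       "free blocks [%lld,%lld)\n",
	       first_page, last_page, head, tail, first_block, stop_block);
	return 0;
}
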
3665 | /* | 3780 | /* |
@@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3692 | */ | 3807 | */ |
3693 | void ext4_truncate(struct inode *inode) | 3808 | void ext4_truncate(struct inode *inode) |
3694 | { | 3809 | { |
3810 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3811 | unsigned int credits; | ||
3812 | handle_t *handle; | ||
3813 | struct address_space *mapping = inode->i_mapping; | ||
3814 | loff_t page_len; | ||
3815 | |||
3816 | /* | ||
3817 | * There is a possibility that we're either freeing the inode | ||
3818 | * or it's a completely new inode. In those cases we might not | ||
3819 | * have i_mutex locked because it's not necessary. | ||
3820 | */ | ||
3821 | if (!(inode->i_state & (I_NEW|I_FREEING))) | ||
3822 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); | ||
3695 | trace_ext4_truncate_enter(inode); | 3823 | trace_ext4_truncate_enter(inode); |
3696 | 3824 | ||
3697 | if (!ext4_can_truncate(inode)) | 3825 | if (!ext4_can_truncate(inode)) |
@@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode) | |||
3710 | return; | 3838 | return; |
3711 | } | 3839 | } |
3712 | 3840 | ||
3841 | /* | ||
3842 | * finish any pending end_io work so we won't run the risk of | ||
3843 | * converting any truncated blocks to initialized later | ||
3844 | */ | ||
3845 | ext4_flush_unwritten_io(inode); | ||
3846 | |||
3847 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3848 | credits = ext4_writepage_trans_blocks(inode); | ||
3849 | else | ||
3850 | credits = ext4_blocks_for_truncate(inode); | ||
3851 | |||
3852 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||
3853 | if (IS_ERR(handle)) { | ||
3854 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | ||
3855 | return; | ||
3856 | } | ||
3857 | |||
3858 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
3859 | page_len = PAGE_CACHE_SIZE - | ||
3860 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3861 | |||
3862 | if (ext4_discard_partial_page_buffers(handle, | ||
3863 | mapping, inode->i_size, page_len, 0)) | ||
3864 | goto out_stop; | ||
3865 | } | ||
3866 | |||
3867 | /* | ||
3868 | * We add the inode to the orphan list, so that if this | ||
3869 | * truncate spans multiple transactions, and we crash, we will | ||
3870 | * resume the truncate when the filesystem recovers. It also | ||
3871 | * marks the inode dirty, to catch the new size. | ||
3872 | * | ||
3873 | * Implication: the file must always be in a sane, consistent | ||
3874 | * truncatable state while each transaction commits. | ||
3875 | */ | ||
3876 | if (ext4_orphan_add(handle, inode)) | ||
3877 | goto out_stop; | ||
3878 | |||
3879 | down_write(&EXT4_I(inode)->i_data_sem); | ||
3880 | |||
3881 | ext4_discard_preallocations(inode); | ||
3882 | |||
3713 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3883 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3714 | ext4_ext_truncate(inode); | 3884 | ext4_ext_truncate(handle, inode); |
3715 | else | 3885 | else |
3716 | ext4_ind_truncate(inode); | 3886 | ext4_ind_truncate(handle, inode); |
3887 | |||
3888 | up_write(&ei->i_data_sem); | ||
3889 | |||
3890 | if (IS_SYNC(inode)) | ||
3891 | ext4_handle_sync(handle); | ||
3892 | |||
3893 | out_stop: | ||
3894 | /* | ||
3895 | * If this was a simple ftruncate() and the file will remain alive, | ||
3896 | * then we need to clear up the orphan record which we created above. | ||
3897 | * However, if this was a real unlink then we were called by | ||
3898 | * ext4_delete_inode(), and we allow that function to clean up the | ||
3899 | * orphan info for us. | ||
3900 | */ | ||
3901 | if (inode->i_nlink) | ||
3902 | ext4_orphan_del(handle, inode); | ||
3903 | |||
3904 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
3905 | ext4_mark_inode_dirty(handle, inode); | ||
3906 | ext4_journal_stop(handle); | ||
3717 | 3907 | ||
3718 | trace_ext4_truncate_exit(inode); | 3908 | trace_ext4_truncate_exit(inode); |
3719 | } | 3909 | } |
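
The restructured ext4_truncate() now owns its journal handle and uses the orphan list as crash insurance: the inode is orphan-added before blocks are freed, so recovery can resume an interrupted truncate, and the record is removed afterwards only if the file stays linked. A toy rendering of that protocol (illustrative structure, not kernel API):

#include <stdio.h>

/* Toy model of the orphan-list protocol; not kernel API. */
struct toy_inode { int i_nlink; int on_orphan_list; };

static void toy_truncate(struct toy_inode *inode)
{
	/* Handle started; credits depend on extent vs. indirect mapping. */
	inode->on_orphan_list = 1;   /* crash here => recovery resumes    */
	printf("freeing blocks under orphan protection\n");
	/* ... blocks freed, possibly across several transactions ...     */
	if (inode->i_nlink)            /* plain ftruncate(): clean up;    */
		inode->on_orphan_list = 0; /* unlink path does it otherwise */
}

int main(void)
{
	struct toy_inode ino = { .i_nlink = 1 };

	toy_truncate(&ino);
	printf("orphan record left behind: %d\n", ino.on_orphan_list);
	return 0;
}
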
@@ -3821,13 +4011,14 @@ make_io: | |||
3821 | if (EXT4_SB(sb)->s_inode_readahead_blks) { | 4011 | if (EXT4_SB(sb)->s_inode_readahead_blks) { |
3822 | ext4_fsblk_t b, end, table; | 4012 | ext4_fsblk_t b, end, table; |
3823 | unsigned num; | 4013 | unsigned num; |
4014 | __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; | ||
3824 | 4015 | ||
3825 | table = ext4_inode_table(sb, gdp); | 4016 | table = ext4_inode_table(sb, gdp); |
3826 | /* s_inode_readahead_blks is always a power of 2 */ | 4017 | /* s_inode_readahead_blks is always a power of 2 */ |
3827 | b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); | 4018 | b = block & ~((ext4_fsblk_t) ra_blks - 1); |
3828 | if (table > b) | 4019 | if (table > b) |
3829 | b = table; | 4020 | b = table; |
3830 | end = b + EXT4_SB(sb)->s_inode_readahead_blks; | 4021 | end = b + ra_blks; |
3831 | num = EXT4_INODES_PER_GROUP(sb); | 4022 | num = EXT4_INODES_PER_GROUP(sb); |
3832 | if (ext4_has_group_desc_csum(sb)) | 4023 | if (ext4_has_group_desc_csum(sb)) |
3833 | num -= ext4_itable_unused_count(sb, gdp); | 4024 | num -= ext4_itable_unused_count(sb, gdp); |
@@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4024 | * NeilBrown 1999oct15 | 4215 | * NeilBrown 1999oct15 |
4025 | */ | 4216 | */ |
4026 | if (inode->i_nlink == 0) { | 4217 | if (inode->i_nlink == 0) { |
4027 | if (inode->i_mode == 0 || | 4218 | if ((inode->i_mode == 0 || |
4028 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { | 4219 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && |
4220 | ino != EXT4_BOOT_LOADER_INO) { | ||
4029 | /* this inode is deleted */ | 4221 | /* this inode is deleted */ |
4030 | ret = -ESTALE; | 4222 | ret = -ESTALE; |
4031 | goto bad_inode; | 4223 | goto bad_inode; |
@@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4033 | /* The only unlinked inodes we let through here have | 4225 | /* The only unlinked inodes we let through here have |
4034 | * valid i_mode and are being read by the orphan | 4226 | * valid i_mode and are being read by the orphan |
4035 | * recovery code: that's fine, we're about to complete | 4227 | * recovery code: that's fine, we're about to complete |
4036 | * the process of deleting those. */ | 4228 | * the process of deleting those. |
4229 | * OR it is the EXT4_BOOT_LOADER_INO which is | ||
4230 | * not initialized on a new filesystem. */ | ||
4037 | } | 4231 | } |
4038 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); | 4232 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); |
4039 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); | 4233 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); |
@@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4153 | else | 4347 | else |
4154 | init_special_inode(inode, inode->i_mode, | 4348 | init_special_inode(inode, inode->i_mode, |
4155 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | 4349 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); |
4350 | } else if (ino == EXT4_BOOT_LOADER_INO) { | ||
4351 | make_bad_inode(inode); | ||
4156 | } else { | 4352 | } else { |
4157 | ret = -EIO; | 4353 | ret = -EIO; |
4158 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); | 4354 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 721f4d33e148..9491ac0590f7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -17,9 +17,201 @@ | |||
17 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
18 | #include "ext4_jbd2.h" | 18 | #include "ext4_jbd2.h" |
19 | #include "ext4.h" | 19 | #include "ext4.h" |
20 | #include "ext4_extents.h" | ||
20 | 21 | ||
21 | #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) | 22 | #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) |
22 | 23 | ||
24 | /** | ||
25 | * Swap memory between @a and @b for @len bytes. | ||
26 | * | ||
27 | * @a: pointer to first memory area | ||
28 | * @b: pointer to second memory area | ||
29 | * @len: number of bytes to swap | ||
30 | * | ||
31 | */ | ||
32 | static void memswap(void *a, void *b, size_t len) | ||
33 | { | ||
34 | unsigned char *ap, *bp; | ||
35 | unsigned char tmp; | ||
36 | |||
37 | ap = (unsigned char *)a; | ||
38 | bp = (unsigned char *)b; | ||
39 | while (len-- > 0) { | ||
40 | tmp = *ap; | ||
41 | *ap = *bp; | ||
42 | *bp = tmp; | ||
43 | ap++; | ||
44 | bp++; | ||
45 | } | ||
46 | } | ||
47 | |||
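
memswap() is a generic byte-wise exchange; a quick standalone check of its behavior (the helper is copied here so the snippet compiles on its own):

#include <assert.h>
#include <stddef.h>

/* Local copy of the patch's byte-wise memswap(). */
static void memswap(void *a, void *b, size_t len)
{
	unsigned char *ap = a, *bp = b, tmp;

	while (len-- > 0) {
		tmp = *ap;
		*ap++ = *bp;
		*bp++ = tmp;
	}
}

int main(void)
{
	int x = 1, y = 2;

	memswap(&x, &y, sizeof(x));
	assert(x == 2 && y == 1);
	return 0;
}
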
48 | /** | ||
49 | * Swap i_data and associated attributes between @inode1 and @inode2. | ||
50 | * This function is used for the primary swap between inode1 and inode2 | ||
51 | * and also to revert this primary swap in case of errors. | ||
52 | * | ||
53 | * Therefore you have to make sure that calling this method twice | ||
54 | * will revert all changes. | ||
55 | * | ||
56 | * @inode1: pointer to first inode | ||
57 | * @inode2: pointer to second inode | ||
58 | */ | ||
59 | static void swap_inode_data(struct inode *inode1, struct inode *inode2) | ||
60 | { | ||
61 | loff_t isize; | ||
62 | struct ext4_inode_info *ei1; | ||
63 | struct ext4_inode_info *ei2; | ||
64 | |||
65 | ei1 = EXT4_I(inode1); | ||
66 | ei2 = EXT4_I(inode2); | ||
67 | |||
68 | memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); | ||
69 | memswap(&inode1->i_version, &inode2->i_version, | ||
70 | sizeof(inode1->i_version)); | ||
71 | memswap(&inode1->i_blocks, &inode2->i_blocks, | ||
72 | sizeof(inode1->i_blocks)); | ||
73 | memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); | ||
74 | memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); | ||
75 | memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); | ||
76 | |||
77 | memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); | ||
78 | memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); | ||
79 | memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); | ||
80 | memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); | ||
81 | memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); | ||
82 | |||
83 | isize = i_size_read(inode1); | ||
84 | i_size_write(inode1, i_size_read(inode2)); | ||
85 | i_size_write(inode2, isize); | ||
86 | } | ||
87 | |||
88 | /** | ||
89 | * Swap the information between the given @inode and the inode | ||
90 | * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other | ||
91 | * important fields of the inodes. | ||
92 | * | ||
93 | * @sb: the super block of the filesystem | ||
94 | * @inode: the inode to swap with EXT4_BOOT_LOADER_INO | ||
95 | * | ||
96 | */ | ||
97 | static long swap_inode_boot_loader(struct super_block *sb, | ||
98 | struct inode *inode) | ||
99 | { | ||
100 | handle_t *handle; | ||
101 | int err; | ||
102 | struct inode *inode_bl; | ||
103 | struct ext4_inode_info *ei; | ||
104 | struct ext4_inode_info *ei_bl; | ||
105 | struct ext4_sb_info *sbi; | ||
106 | |||
107 | if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { | ||
108 | err = -EINVAL; | ||
109 | goto swap_boot_out; | ||
110 | } | ||
111 | |||
112 | if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { | ||
113 | err = -EPERM; | ||
114 | goto swap_boot_out; | ||
115 | } | ||
116 | |||
117 | sbi = EXT4_SB(sb); | ||
118 | ei = EXT4_I(inode); | ||
119 | |||
120 | inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); | ||
121 | if (IS_ERR(inode_bl)) { | ||
122 | err = PTR_ERR(inode_bl); | ||
123 | goto swap_boot_out; | ||
124 | } | ||
125 | ei_bl = EXT4_I(inode_bl); | ||
126 | |||
127 | filemap_flush(inode->i_mapping); | ||
128 | filemap_flush(inode_bl->i_mapping); | ||
129 | |||
130 | /* Protect the original inodes against truncate and make sure | ||
131 | * that only one swap_inode_boot_loader is running. */ | ||
132 | ext4_inode_double_lock(inode, inode_bl); | ||
133 | |||
134 | truncate_inode_pages(&inode->i_data, 0); | ||
135 | truncate_inode_pages(&inode_bl->i_data, 0); | ||
136 | |||
137 | /* Wait for all existing dio workers */ | ||
138 | ext4_inode_block_unlocked_dio(inode); | ||
139 | ext4_inode_block_unlocked_dio(inode_bl); | ||
140 | inode_dio_wait(inode); | ||
141 | inode_dio_wait(inode_bl); | ||
142 | |||
143 | handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); | ||
144 | if (IS_ERR(handle)) { | ||
145 | err = -EINVAL; | ||
146 | goto swap_boot_out; | ||
147 | } | ||
148 | |||
149 | /* Protect extent tree against block allocations via delalloc */ | ||
150 | ext4_double_down_write_data_sem(inode, inode_bl); | ||
151 | |||
152 | if (inode_bl->i_nlink == 0) { | ||
153 | /* this inode has never been used as a BOOT_LOADER */ | ||
154 | set_nlink(inode_bl, 1); | ||
155 | i_uid_write(inode_bl, 0); | ||
156 | i_gid_write(inode_bl, 0); | ||
157 | inode_bl->i_flags = 0; | ||
158 | ei_bl->i_flags = 0; | ||
159 | inode_bl->i_version = 1; | ||
160 | i_size_write(inode_bl, 0); | ||
161 | inode_bl->i_mode = S_IFREG; | ||
162 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, | ||
163 | EXT4_FEATURE_INCOMPAT_EXTENTS)) { | ||
164 | ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); | ||
165 | ext4_ext_tree_init(handle, inode_bl); | ||
166 | } else | ||
167 | memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); | ||
168 | } | ||
169 | |||
170 | swap_inode_data(inode, inode_bl); | ||
171 | |||
172 | inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); | ||
173 | |||
174 | spin_lock(&sbi->s_next_gen_lock); | ||
175 | inode->i_generation = sbi->s_next_generation++; | ||
176 | inode_bl->i_generation = sbi->s_next_generation++; | ||
177 | spin_unlock(&sbi->s_next_gen_lock); | ||
178 | |||
179 | ext4_discard_preallocations(inode); | ||
180 | |||
181 | err = ext4_mark_inode_dirty(handle, inode); | ||
182 | if (err < 0) { | ||
183 | ext4_warning(inode->i_sb, | ||
184 | "couldn't mark inode #%lu dirty (err %d)", | ||
185 | inode->i_ino, err); | ||
186 | /* Revert all changes: */ | ||
187 | swap_inode_data(inode, inode_bl); | ||
188 | } else { | ||
189 | err = ext4_mark_inode_dirty(handle, inode_bl); | ||
190 | if (err < 0) { | ||
191 | ext4_warning(inode_bl->i_sb, | ||
192 | "couldn't mark inode #%lu dirty (err %d)", | ||
193 | inode_bl->i_ino, err); | ||
194 | /* Revert all changes: */ | ||
195 | swap_inode_data(inode, inode_bl); | ||
196 | ext4_mark_inode_dirty(handle, inode); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | ext4_journal_stop(handle); | ||
201 | |||
202 | ext4_double_up_write_data_sem(inode, inode_bl); | ||
203 | |||
204 | ext4_inode_resume_unlocked_dio(inode); | ||
205 | ext4_inode_resume_unlocked_dio(inode_bl); | ||
206 | |||
207 | ext4_inode_double_unlock(inode, inode_bl); | ||
208 | |||
209 | iput(inode_bl); | ||
210 | |||
211 | swap_boot_out: | ||
212 | return err; | ||
213 | } | ||
214 | |||
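
From userspace the new ioctl takes no argument and requires a writable descriptor. A hedged example: EXT4_IOC_SWAP_BOOT lives in fs/ext4/ext4.h rather than a uapi header, so it is defined locally here, and the _IO('f', 17) value is an assumption to verify against your tree:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#ifndef EXT4_IOC_SWAP_BOOT
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)  /* assumption; see fs/ext4/ext4.h */
#endif

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
		return 1;
	}
	/* Must be writable; the handler also demands CAP_SYS_ADMIN. */
	int fd = open(argv[1], O_RDWR);
	if (fd < 0) { perror("open"); return 1; }

	if (ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0)
		perror("EXT4_IOC_SWAP_BOOT");
	else
		printf("swapped %s with the boot loader inode (#5)\n", argv[1]);
	close(fd);
	return 0;
}
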
23 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 215 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
24 | { | 216 | { |
25 | struct inode *inode = file_inode(filp); | 217 | struct inode *inode = file_inode(filp); |
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
83 | if (!capable(CAP_SYS_RESOURCE)) | 275 | if (!capable(CAP_SYS_RESOURCE)) |
84 | goto flags_out; | 276 | goto flags_out; |
85 | } | 277 | } |
86 | if (oldflags & EXT4_EXTENTS_FL) { | 278 | if ((flags ^ oldflags) & EXT4_EXTENTS_FL) |
87 | /* We don't support clearing extent flags */ | ||
88 | if (!(flags & EXT4_EXTENTS_FL)) { | ||
89 | err = -EOPNOTSUPP; | ||
90 | goto flags_out; | ||
91 | } | ||
92 | } else if (flags & EXT4_EXTENTS_FL) { | ||
93 | /* migrate the file */ | ||
94 | migrate = 1; | 279 | migrate = 1; |
95 | flags &= ~EXT4_EXTENTS_FL; | ||
96 | } | ||
97 | 280 | ||
98 | if (flags & EXT4_EOFBLOCKS_FL) { | 281 | if (flags & EXT4_EOFBLOCKS_FL) { |
99 | /* we don't support adding EOFBLOCKS flag */ | 282 | /* we don't support adding EOFBLOCKS flag */ |
@@ -137,8 +320,13 @@ flags_err: | |||
137 | err = ext4_change_inode_journal_flag(inode, jflag); | 320 | err = ext4_change_inode_journal_flag(inode, jflag); |
138 | if (err) | 321 | if (err) |
139 | goto flags_out; | 322 | goto flags_out; |
140 | if (migrate) | 323 | if (migrate) { |
141 | err = ext4_ext_migrate(inode); | 324 | if (flags & EXT4_EXTENTS_FL) |
325 | err = ext4_ext_migrate(inode); | ||
326 | else | ||
327 | err = ext4_ind_migrate(inode); | ||
328 | } | ||
329 | |||
142 | flags_out: | 330 | flags_out: |
143 | mutex_unlock(&inode->i_mutex); | 331 | mutex_unlock(&inode->i_mutex); |
144 | mnt_drop_write_file(filp); | 332 | mnt_drop_write_file(filp); |
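
Since clearing EXT4_EXTENTS_FL is now supported (via the new ext4_ind_migrate()), the flag can be flipped through the generic flags ioctl; the kernel migrates data only when (flags ^ oldflags) & EXT4_EXTENTS_FL is set. A sketch using the uapi names FS_IOC_GETFLAGS/FS_IOC_SETFLAGS and FS_EXTENT_FL from <linux/fs.h>:

#include <fcntl.h>
#include <linux/fs.h>   /* FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_EXTENT_FL */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	long flags;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}

	flags ^= FS_EXTENT_FL;  /* flip the extents flag in either direction */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS"); /* pre-patch kernels: EOPNOTSUPP on clear */
	close(fd);
	return 0;
}
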
@@ -357,9 +545,13 @@ group_add_out: | |||
357 | return err; | 545 | return err; |
358 | } | 546 | } |
359 | 547 | ||
548 | case EXT4_IOC_SWAP_BOOT: | ||
549 | if (!(filp->f_mode & FMODE_WRITE)) | ||
550 | return -EBADF; | ||
551 | return swap_inode_boot_loader(sb, inode); | ||
552 | |||
360 | case EXT4_IOC_RESIZE_FS: { | 553 | case EXT4_IOC_RESIZE_FS: { |
361 | ext4_fsblk_t n_blocks_count; | 554 | ext4_fsblk_t n_blocks_count; |
362 | struct super_block *sb = inode->i_sb; | ||
363 | int err = 0, err2 = 0; | 555 | int err = 0, err2 = 0; |
364 | ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; | 556 | ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; |
365 | 557 | ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ee6614bdb639..a11ea4d6164c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) | |||
405 | ext4_clear_bit(bit, addr); | 405 | ext4_clear_bit(bit, addr); |
406 | } | 406 | } |
407 | 407 | ||
408 | static inline int mb_test_and_clear_bit(int bit, void *addr) | ||
409 | { | ||
410 | addr = mb_correct_addr_and_bit(&bit, addr); | ||
411 | return ext4_test_and_clear_bit(bit, addr); | ||
412 | } | ||
413 | |||
408 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) | 414 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) |
409 | { | 415 | { |
410 | int fix = 0, ret, tmpmax; | 416 | int fix = 0, ret, tmpmax; |
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, | |||
764 | spin_unlock(&EXT4_SB(sb)->s_bal_lock); | 770 | spin_unlock(&EXT4_SB(sb)->s_bal_lock); |
765 | } | 771 | } |
766 | 772 | ||
773 | static void mb_regenerate_buddy(struct ext4_buddy *e4b) | ||
774 | { | ||
775 | int count; | ||
776 | int order = 1; | ||
777 | void *buddy; | ||
778 | |||
779 | while ((buddy = mb_find_buddy(e4b, order++, &count))) { | ||
780 | ext4_set_bits(buddy, 0, count); | ||
781 | } | ||
782 | e4b->bd_info->bb_fragments = 0; | ||
783 | memset(e4b->bd_info->bb_counters, 0, | ||
784 | sizeof(*e4b->bd_info->bb_counters) * | ||
785 | (e4b->bd_sb->s_blocksize_bits + 2)); | ||
786 | |||
787 | ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, | ||
788 | e4b->bd_bitmap, e4b->bd_group); | ||
789 | } | ||
790 | |||
767 | /* The buddy information is attached the buddy cache inode | 791 | /* The buddy information is attached the buddy cache inode |
768 | * for convenience. The information regarding each group | 792 | * for convenience. The information regarding each group |
769 | * is loaded via ext4_mb_load_buddy. The information involve | 793 | * is loaded via ext4_mb_load_buddy. The information involve |
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
860 | 884 | ||
861 | first_block = page->index * blocks_per_page; | 885 | first_block = page->index * blocks_per_page; |
862 | for (i = 0; i < blocks_per_page; i++) { | 886 | for (i = 0; i < blocks_per_page; i++) { |
863 | int group; | ||
864 | |||
865 | group = (first_block + i) >> 1; | 887 | group = (first_block + i) >> 1; |
866 | if (group >= ngroups) | 888 | if (group >= ngroups) |
867 | break; | 889 | break; |
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1011 | struct page *page; | 1033 | struct page *page; |
1012 | int ret = 0; | 1034 | int ret = 0; |
1013 | 1035 | ||
1036 | might_sleep(); | ||
1014 | mb_debug(1, "init group %u\n", group); | 1037 | mb_debug(1, "init group %u\n", group); |
1015 | this_grp = ext4_get_group_info(sb, group); | 1038 | this_grp = ext4_get_group_info(sb, group); |
1016 | /* | 1039 | /* |
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1082 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1105 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1083 | struct inode *inode = sbi->s_buddy_cache; | 1106 | struct inode *inode = sbi->s_buddy_cache; |
1084 | 1107 | ||
1108 | might_sleep(); | ||
1085 | mb_debug(1, "load group %u\n", group); | 1109 | mb_debug(1, "load group %u\n", group); |
1086 | 1110 | ||
1087 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | 1111 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; |
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len) | |||
1244 | } | 1268 | } |
1245 | } | 1269 | } |
1246 | 1270 | ||
1271 | /* Clear bits in the given range; return the position of the | ||
1272 | * first bit that was already zero, if any, -1 otherwise. | ||
1273 | */ | ||
1274 | static int mb_test_and_clear_bits(void *bm, int cur, int len) | ||
1275 | { | ||
1276 | __u32 *addr; | ||
1277 | int zero_bit = -1; | ||
1278 | |||
1279 | len = cur + len; | ||
1280 | while (cur < len) { | ||
1281 | if ((cur & 31) == 0 && (len - cur) >= 32) { | ||
1282 | /* fast path: clear whole word at once */ | ||
1283 | addr = bm + (cur >> 3); | ||
1284 | if (*addr != (__u32)(-1) && zero_bit == -1) | ||
1285 | zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); | ||
1286 | *addr = 0; | ||
1287 | cur += 32; | ||
1288 | continue; | ||
1289 | } | ||
1290 | if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) | ||
1291 | zero_bit = cur; | ||
1292 | cur++; | ||
1293 | } | ||
1294 | |||
1295 | return zero_bit; | ||
1296 | } | ||
1297 | |||
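The fast path in mb_test_and_clear_bits() clears a whole 32-bit word per iteration and only does per-bit work at unaligned edges, while remembering the first bit that was already zero — mb_free_blocks() below uses that return value to detect a double free in a single pass over the range. A standalone userspace rendering of the same loop (no atomics; bit 0 is the LSB of word 0, a simplification of the kernel's bitmap helpers):

#include <stdio.h>
#include <stdint.h>

/* Clear bits [cur, cur + len); return the first bit that was already
 * zero, or -1 if every bit in the range was set. */
static int test_and_clear_bits(uint32_t *bm, int cur, int len)
{
        int zero_bit = -1;

        len = cur + len;
        while (cur < len) {
                if ((cur & 31) == 0 && len - cur >= 32) {
                        /* fast path: one whole word at a time */
                        uint32_t *addr = bm + (cur >> 5);

                        if (*addr != UINT32_MAX && zero_bit == -1)
                                /* ctz of the complement finds the
                                 * lowest zero bit in the word */
                                zero_bit = cur + __builtin_ctz(~*addr);
                        *addr = 0;
                        cur += 32;
                        continue;
                }
                if (!((bm[cur >> 5] >> (cur & 31)) & 1) && zero_bit == -1)
                        zero_bit = cur;
                bm[cur >> 5] &= ~(1u << (cur & 31));
                cur++;
        }
        return zero_bit;
}

int main(void)
{
        uint32_t bm[2] = { 0xffffffff, 0xffffffff };

        bm[1] &= ~(1u << 3);                    /* bit 35 already clear */
        printf("%d\n", test_and_clear_bits(bm, 30, 34));    /* prints 35 */
        return 0;
}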
1247 | void ext4_set_bits(void *bm, int cur, int len) | 1298 | void ext4_set_bits(void *bm, int cur, int len) |
1248 | { | 1299 | { |
1249 | __u32 *addr; | 1300 | __u32 *addr; |
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len) | |||
1262 | } | 1313 | } |
1263 | } | 1314 | } |
1264 | 1315 | ||
1316 | /* | ||
1317 | * _________________________________________________________________ */ | ||
1318 | |||
1319 | static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) | ||
1320 | { | ||
1321 | if (mb_test_bit(*bit + side, bitmap)) { | ||
1322 | mb_clear_bit(*bit, bitmap); | ||
1323 | (*bit) -= side; | ||
1324 | return 1; | ||
1325 | } | ||
1326 | else { | ||
1327 | (*bit) += side; | ||
1328 | mb_set_bit(*bit, bitmap); | ||
1329 | return -1; | ||
1330 | } | ||
1331 | } | ||
1332 | |||
1333 | static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) | ||
1334 | { | ||
1335 | int max; | ||
1336 | int order = 1; | ||
1337 | void *buddy = mb_find_buddy(e4b, order, &max); | ||
1338 | |||
1339 | while (buddy) { | ||
1340 | void *buddy2; | ||
1341 | |||
1342 | /* Bits in range [first; last] are known to be set since | ||
1343 | * the corresponding blocks were allocated. Bits in range | ||
1344 | * (first; last) will stay set because they form buddies on | ||
1345 | * the upper layer. We only deal with the borders if they | ||
1346 | * don't align with the upper layer, and then go up. | ||
1347 | * Releasing an entire group comes down to clearing a | ||
1348 | * single bit of the highest-order buddy. | ||
1349 | */ | ||
1350 | |||
1351 | /* Example: | ||
1352 | * --------------------------------- | ||
1353 | * | 1 | 1 | 1 | 1 | | ||
1354 | * --------------------------------- | ||
1355 | * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | ||
1356 | * --------------------------------- | ||
1357 | * 0 1 2 3 4 5 6 7 | ||
1358 | * \_____________________/ | ||
1359 | * | ||
1360 | * Neither [1] nor [6] is aligned to the layer above. | ||
1361 | * Left neighbour [0] is free, so mark it busy, | ||
1362 | * decrease bb_counters and extend the range to | ||
1363 | * [0; 6]. | ||
1364 | * Right neighbour [7] is busy. It can't be coalesced with [6], so | ||
1365 | * mark [6] free, increase bb_counters and shrink the range to | ||
1366 | * [0; 5]. | ||
1367 | * Then shift range to [0; 2], go up and do the same. | ||
1368 | */ | ||
1369 | |||
1370 | |||
1371 | if (first & 1) | ||
1372 | e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); | ||
1373 | if (!(last & 1)) | ||
1374 | e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); | ||
1375 | if (first > last) | ||
1376 | break; | ||
1377 | order++; | ||
1378 | |||
1379 | if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { | ||
1380 | mb_clear_bits(buddy, first, last - first + 1); | ||
1381 | e4b->bd_info->bb_counters[order - 1] += last - first + 1; | ||
1382 | break; | ||
1383 | } | ||
1384 | first >>= 1; | ||
1385 | last >>= 1; | ||
1386 | buddy = buddy2; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
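mb_buddy_adjust_border() resolves one unaligned edge per call: if the outside neighbour is busy, the edge chunk stays free at this order (its bit is cleared, the range shrinks, the counter gains one); if the neighbour is free, the pair merges upward (the bit is set, the range extends, the counter loses one). Re-running the worked example from the comment in plain userspace C, with an int array standing in for the buddy bitmap:

#include <stdio.h>

/* 0 == chunk free at this order, 1 == busy (or merged upward) */
static int adjust_border(int *bit, int *layer, int side)
{
        if (layer[*bit + side]) {       /* neighbour busy: no merge */
                layer[*bit] = 0;        /* edge chunk stays free here */
                *bit -= side;           /* shrink the range */
                return 1;               /* this order gains a chunk */
        }
        *bit += side;                   /* neighbour free: extend over it */
        layer[*bit] = 1;                /* it merges toward the upper order */
        return -1;                      /* this order loses a chunk */
}

int main(void)
{
        /* the comment's example: freeing [1; 6], bit 0 free, bit 7 busy */
        int layer[8] = { 0, 1, 1, 1, 1, 1, 1, 1 };
        int first = 1, last = 6, counter = 0;

        if (first & 1)
                counter += adjust_border(&first, layer, -1);
        if (!(last & 1))
                counter += adjust_border(&last, layer, 1);

        /* prints "range [0; 5], counter +0"; the range then shifts to
         * [0; 2] for the next order up, exactly as the comment says */
        printf("range [%d; %d], counter %+d\n", first, last, counter);
        return 0;
}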
1265 | static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | 1390 | static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, |
1266 | int first, int count) | 1391 | int first, int count) |
1267 | { | 1392 | { |
1268 | int block = 0; | 1393 | int left_is_free = 0; |
1269 | int max = 0; | 1394 | int right_is_free = 0; |
1270 | int order; | 1395 | int block; |
1271 | void *buddy; | 1396 | int last = first + count - 1; |
1272 | void *buddy2; | ||
1273 | struct super_block *sb = e4b->bd_sb; | 1397 | struct super_block *sb = e4b->bd_sb; |
1274 | 1398 | ||
1275 | BUG_ON(first + count > (sb->s_blocksize << 3)); | 1399 | BUG_ON(last >= (sb->s_blocksize << 3)); |
1276 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); | 1400 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); |
1277 | mb_check_buddy(e4b); | 1401 | mb_check_buddy(e4b); |
1278 | mb_free_blocks_double(inode, e4b, first, count); | 1402 | mb_free_blocks_double(inode, e4b, first, count); |
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1281 | if (first < e4b->bd_info->bb_first_free) | 1405 | if (first < e4b->bd_info->bb_first_free) |
1282 | e4b->bd_info->bb_first_free = first; | 1406 | e4b->bd_info->bb_first_free = first; |
1283 | 1407 | ||
1284 | /* let's maintain fragments counter */ | 1408 | /* access memory sequentially: check the left neighbour, |
1409 | * clear the range, then check the right neighbour | ||
1410 | */ | ||
1285 | if (first != 0) | 1411 | if (first != 0) |
1286 | block = !mb_test_bit(first - 1, e4b->bd_bitmap); | 1412 | left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); |
1287 | if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) | 1413 | block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); |
1288 | max = !mb_test_bit(first + count, e4b->bd_bitmap); | 1414 | if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) |
1289 | if (block && max) | 1415 | right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); |
1290 | e4b->bd_info->bb_fragments--; | ||
1291 | else if (!block && !max) | ||
1292 | e4b->bd_info->bb_fragments++; | ||
1293 | 1416 | ||
1294 | /* let's maintain buddy itself */ | 1417 | if (unlikely(block != -1)) { |
1295 | while (count-- > 0) { | 1418 | ext4_fsblk_t blocknr; |
1296 | block = first++; | ||
1297 | order = 0; | ||
1298 | 1419 | ||
1299 | if (!mb_test_bit(block, e4b->bd_bitmap)) { | 1420 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); |
1300 | ext4_fsblk_t blocknr; | 1421 | blocknr += EXT4_C2B(EXT4_SB(sb), block); |
1301 | 1422 | ext4_grp_locked_error(sb, e4b->bd_group, | |
1302 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); | 1423 | inode ? inode->i_ino : 0, |
1303 | blocknr += EXT4_C2B(EXT4_SB(sb), block); | 1424 | blocknr, |
1304 | ext4_grp_locked_error(sb, e4b->bd_group, | 1425 | "freeing already freed block " |
1305 | inode ? inode->i_ino : 0, | 1426 | "(bit %u)", block); |
1306 | blocknr, | 1427 | mb_regenerate_buddy(e4b); |
1307 | "freeing already freed block " | 1428 | goto done; |
1308 | "(bit %u)", block); | 1429 | } |
1309 | } | ||
1310 | mb_clear_bit(block, e4b->bd_bitmap); | ||
1311 | e4b->bd_info->bb_counters[order]++; | ||
1312 | |||
1313 | /* start of the buddy */ | ||
1314 | buddy = mb_find_buddy(e4b, order, &max); | ||
1315 | |||
1316 | do { | ||
1317 | block &= ~1UL; | ||
1318 | if (mb_test_bit(block, buddy) || | ||
1319 | mb_test_bit(block + 1, buddy)) | ||
1320 | break; | ||
1321 | |||
1322 | /* both the buddies are free, try to coalesce them */ | ||
1323 | buddy2 = mb_find_buddy(e4b, order + 1, &max); | ||
1324 | 1430 | ||
1325 | if (!buddy2) | 1431 | /* let's maintain fragments counter */ |
1326 | break; | 1432 | if (left_is_free && right_is_free) |
1433 | e4b->bd_info->bb_fragments--; | ||
1434 | else if (!left_is_free && !right_is_free) | ||
1435 | e4b->bd_info->bb_fragments++; | ||
1327 | 1436 | ||
1328 | if (order > 0) { | 1437 | /* buddy[0] == bd_bitmap is a special case, so handle |
1329 | /* for special purposes, we don't set | 1438 | * it right away and let mb_buddy_mark_free stay free of |
1330 | * free bits in bitmap */ | 1439 | * zero order checks. |
1331 | mb_set_bit(block, buddy); | 1440 | * Check if neighbours are to be coaleasced, |
1332 | mb_set_bit(block + 1, buddy); | 1441 | * adjust bitmap bb_counters and borders appropriately. |
1333 | } | 1442 | */ |
1334 | e4b->bd_info->bb_counters[order]--; | 1443 | if (first & 1) { |
1335 | e4b->bd_info->bb_counters[order]--; | 1444 | first += !left_is_free; |
1445 | e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; | ||
1446 | } | ||
1447 | if (!(last & 1)) { | ||
1448 | last -= !right_is_free; | ||
1449 | e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; | ||
1450 | } | ||
1336 | 1451 | ||
1337 | block = block >> 1; | 1452 | if (first <= last) |
1338 | order++; | 1453 | mb_buddy_mark_free(e4b, first >> 1, last >> 1); |
1339 | e4b->bd_info->bb_counters[order]++; | ||
1340 | 1454 | ||
1341 | mb_clear_bit(block, buddy2); | 1455 | done: |
1342 | buddy = buddy2; | ||
1343 | } while (1); | ||
1344 | } | ||
1345 | mb_set_largest_free_order(sb, e4b->bd_info); | 1456 | mb_set_largest_free_order(sb, e4b->bd_info); |
1346 | mb_check_buddy(e4b); | 1457 | mb_check_buddy(e4b); |
1347 | } | 1458 | } |
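The bb_fragments bookkeeping in the rewritten mb_free_blocks() reduces to a two-neighbour rule worth stating on its own: freeing a range bridges two existing fragments into one when both neighbours are free, creates a brand-new fragment when both are busy, and merely extends a fragment otherwise. As an illustrative helper (not kernel code):

/* Delta to bb_fragments when freeing a range whose left and right
 * neighbours have the given free status. */
static int fragments_delta(int left_is_free, int right_is_free)
{
        if (left_is_free && right_is_free)
                return -1;      /* bridged two fragments into one */
        if (!left_is_free && !right_is_free)
                return 1;       /* created an isolated new fragment */
        return 0;               /* extended an existing fragment */
}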
@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, | |||
3342 | if (pa->pa_type == MB_GROUP_PA) | 3453 | if (pa->pa_type == MB_GROUP_PA) |
3343 | grp_blk--; | 3454 | grp_blk--; |
3344 | 3455 | ||
3345 | ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); | 3456 | grp = ext4_get_group_number(sb, grp_blk); |
3346 | 3457 | ||
3347 | /* | 3458 | /* |
3348 | * possible race: | 3459 | * possible race: |
@@ -3807,7 +3918,7 @@ repeat: | |||
3807 | 3918 | ||
3808 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { | 3919 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { |
3809 | BUG_ON(pa->pa_type != MB_INODE_PA); | 3920 | BUG_ON(pa->pa_type != MB_INODE_PA); |
3810 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | 3921 | group = ext4_get_group_number(sb, pa->pa_pstart); |
3811 | 3922 | ||
3812 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3923 | err = ext4_mb_load_buddy(sb, group, &e4b); |
3813 | if (err) { | 3924 | if (err) { |
@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, | |||
4069 | 4180 | ||
4070 | list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { | 4181 | list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { |
4071 | 4182 | ||
4072 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | 4183 | group = ext4_get_group_number(sb, pa->pa_pstart); |
4073 | if (ext4_mb_load_buddy(sb, group, &e4b)) { | 4184 | if (ext4_mb_load_buddy(sb, group, &e4b)) { |
4074 | ext4_error(sb, "Error loading buddy information for %u", | 4185 | ext4_error(sb, "Error loading buddy information for %u", |
4075 | group); | 4186 | group); |
@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4217 | unsigned int inquota = 0; | 4328 | unsigned int inquota = 0; |
4218 | unsigned int reserv_clstrs = 0; | 4329 | unsigned int reserv_clstrs = 0; |
4219 | 4330 | ||
4331 | might_sleep(); | ||
4220 | sb = ar->inode->i_sb; | 4332 | sb = ar->inode->i_sb; |
4221 | sbi = EXT4_SB(sb); | 4333 | sbi = EXT4_SB(sb); |
4222 | 4334 | ||
@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4420 | node = rb_prev(new_node); | 4532 | node = rb_prev(new_node); |
4421 | if (node) { | 4533 | if (node) { |
4422 | entry = rb_entry(node, struct ext4_free_data, efd_node); | 4534 | entry = rb_entry(node, struct ext4_free_data, efd_node); |
4423 | if (can_merge(entry, new_entry)) { | 4535 | if (can_merge(entry, new_entry) && |
4536 | ext4_journal_callback_try_del(handle, &entry->efd_jce)) { | ||
4424 | new_entry->efd_start_cluster = entry->efd_start_cluster; | 4537 | new_entry->efd_start_cluster = entry->efd_start_cluster; |
4425 | new_entry->efd_count += entry->efd_count; | 4538 | new_entry->efd_count += entry->efd_count; |
4426 | rb_erase(node, &(db->bb_free_root)); | 4539 | rb_erase(node, &(db->bb_free_root)); |
4427 | ext4_journal_callback_del(handle, &entry->efd_jce); | ||
4428 | kmem_cache_free(ext4_free_data_cachep, entry); | 4540 | kmem_cache_free(ext4_free_data_cachep, entry); |
4429 | } | 4541 | } |
4430 | } | 4542 | } |
@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4432 | node = rb_next(new_node); | 4544 | node = rb_next(new_node); |
4433 | if (node) { | 4545 | if (node) { |
4434 | entry = rb_entry(node, struct ext4_free_data, efd_node); | 4546 | entry = rb_entry(node, struct ext4_free_data, efd_node); |
4435 | if (can_merge(new_entry, entry)) { | 4547 | if (can_merge(new_entry, entry) && |
4548 | ext4_journal_callback_try_del(handle, &entry->efd_jce)) { | ||
4436 | new_entry->efd_count += entry->efd_count; | 4549 | new_entry->efd_count += entry->efd_count; |
4437 | rb_erase(node, &(db->bb_free_root)); | 4550 | rb_erase(node, &(db->bb_free_root)); |
4438 | ext4_journal_callback_del(handle, &entry->efd_jce); | ||
4439 | kmem_cache_free(ext4_free_data_cachep, entry); | 4551 | kmem_cache_free(ext4_free_data_cachep, entry); |
4440 | } | 4552 | } |
4441 | } | 4553 | } |
@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
4470 | int err = 0; | 4582 | int err = 0; |
4471 | int ret; | 4583 | int ret; |
4472 | 4584 | ||
4585 | might_sleep(); | ||
4473 | if (bh) { | 4586 | if (bh) { |
4474 | if (block) | 4587 | if (block) |
4475 | BUG_ON(block != bh->b_blocknr); | 4588 | BUG_ON(block != bh->b_blocknr); |
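The might_sleep() calls added to ext4_mb_init_group(), ext4_mb_load_buddy(), ext4_mb_new_blocks() and ext4_free_blocks() are pure debug aids: with CONFIG_DEBUG_ATOMIC_SLEEP enabled they warn the moment one of these paths is entered from atomic context, even on runs where the path happens not to block. The shape of the annotation, as a kernel-context fragment (not a standalone program; the caller name is hypothetical):

#include <linux/kernel.h>

int some_alloc_path(void)
{
        might_sleep();  /* warn now if the caller is atomic, rather
                         * than deadlock later on the rare run where
                         * the buddy cache load actually blocks */
        /* ... may take mutexes and block on buddy cache IO ... */
        return 0;
}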
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 480acf4a085f..49e8bdff9163 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) | |||
426 | return retval; | 426 | return retval; |
427 | } | 427 | } |
428 | return retval; | 428 | return retval; |
429 | |||
430 | } | 429 | } |
431 | 430 | ||
432 | int ext4_ext_migrate(struct inode *inode) | 431 | int ext4_ext_migrate(struct inode *inode) |
@@ -606,3 +605,64 @@ out: | |||
606 | 605 | ||
607 | return retval; | 606 | return retval; |
608 | } | 607 | } |
608 | |||
609 | /* | ||
610 | * Migrate a simple extent-based inode to use the i_blocks[] array | ||
611 | */ | ||
612 | int ext4_ind_migrate(struct inode *inode) | ||
613 | { | ||
614 | struct ext4_extent_header *eh; | ||
615 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
616 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
617 | struct ext4_extent *ex; | ||
618 | unsigned int i, len; | ||
619 | ext4_fsblk_t blk; | ||
620 | handle_t *handle; | ||
621 | int ret; | ||
622 | |||
623 | if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, | ||
624 | EXT4_FEATURE_INCOMPAT_EXTENTS) || | ||
625 | (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | ||
626 | return -EINVAL; | ||
627 | |||
628 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | ||
629 | EXT4_FEATURE_RO_COMPAT_BIGALLOC)) | ||
630 | return -EOPNOTSUPP; | ||
631 | |||
632 | handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); | ||
633 | if (IS_ERR(handle)) | ||
634 | return PTR_ERR(handle); | ||
635 | |||
636 | down_write(&EXT4_I(inode)->i_data_sem); | ||
637 | ret = ext4_ext_check_inode(inode); | ||
638 | if (ret) | ||
639 | goto errout; | ||
640 | |||
641 | eh = ext_inode_hdr(inode); | ||
642 | ex = EXT_FIRST_EXTENT(eh); | ||
643 | if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || | ||
644 | eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { | ||
645 | ret = -EOPNOTSUPP; | ||
646 | goto errout; | ||
647 | } | ||
648 | if (eh->eh_entries == 0) | ||
649 | blk = len = 0; | ||
650 | else { | ||
651 | len = le16_to_cpu(ex->ee_len); | ||
652 | blk = ext4_ext_pblock(ex); | ||
653 | if (len > EXT4_NDIR_BLOCKS) { | ||
654 | ret = -EOPNOTSUPP; | ||
655 | goto errout; | ||
656 | } | ||
657 | } | ||
658 | |||
659 | ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); | ||
660 | memset(ei->i_data, 0, sizeof(ei->i_data)); | ||
661 | for (i=0; i < len; i++) | ||
662 | ei->i_data[i] = cpu_to_le32(blk++); | ||
663 | ext4_mark_inode_dirty(handle, inode); | ||
664 | errout: | ||
665 | ext4_journal_stop(handle); | ||
666 | up_write(&EXT4_I(inode)->i_data_sem); | ||
667 | return ret; | ||
668 | } | ||
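ext4_ind_migrate() deliberately handles only the degenerate case — no deeper tree, at most one extent, and a length that fits entirely in the direct slots — so the conversion is just unrolling one (start, len) pair into consecutive block pointers. A userspace model of that unrolling (EXT4_NDIR_BLOCKS is 12 in ext4; everything else here is a simplified stand-in):

#include <stdio.h>
#include <stdint.h>

#define NDIR_BLOCKS 12

/* Unroll a single contiguous extent into direct block slots; returns
 * -1 when the extent does not fit the direct area (the kernel path
 * returns -EOPNOTSUPP there). */
static int extent_to_direct(uint32_t start, uint32_t len,
                            uint32_t i_data[NDIR_BLOCKS])
{
        uint32_t i;

        if (len > NDIR_BLOCKS)
                return -1;
        for (i = 0; i < NDIR_BLOCKS; i++)
                i_data[i] = i < len ? start + i : 0;
        return 0;
}

int main(void)
{
        uint32_t i_data[NDIR_BLOCKS];
        int i;

        if (extent_to_direct(1000, 5, i_data) == 0)
                for (i = 0; i < NDIR_BLOCKS; i++)
                        printf("i_data[%d] = %u\n", i, i_data[i]);
        return 0;
}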
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f9b551561d2c..214461e42a05 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c | |||
@@ -7,7 +7,7 @@ | |||
7 | #include "ext4.h" | 7 | #include "ext4.h" |
8 | 8 | ||
9 | /* Checksumming functions */ | 9 | /* Checksumming functions */ |
10 | static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) | 10 | static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) |
11 | { | 11 | { |
12 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 12 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
13 | int offset = offsetof(struct mmp_struct, mmp_checksum); | 13 | int offset = offsetof(struct mmp_struct, mmp_checksum); |
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) | |||
54 | lock_buffer(bh); | 54 | lock_buffer(bh); |
55 | bh->b_end_io = end_buffer_write_sync; | 55 | bh->b_end_io = end_buffer_write_sync; |
56 | get_bh(bh); | 56 | get_bh(bh); |
57 | submit_bh(WRITE_SYNC, bh); | 57 | submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); |
58 | wait_on_buffer(bh); | 58 | wait_on_buffer(bh); |
59 | sb_end_write(sb); | 59 | sb_end_write(sb); |
60 | if (unlikely(!buffer_uptodate(bh))) | 60 | if (unlikely(!buffer_uptodate(bh))) |
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, | |||
86 | get_bh(*bh); | 86 | get_bh(*bh); |
87 | lock_buffer(*bh); | 87 | lock_buffer(*bh); |
88 | (*bh)->b_end_io = end_buffer_read_sync; | 88 | (*bh)->b_end_io = end_buffer_read_sync; |
89 | submit_bh(READ_SYNC, *bh); | 89 | submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); |
90 | wait_on_buffer(*bh); | 90 | wait_on_buffer(*bh); |
91 | if (!buffer_uptodate(*bh)) { | 91 | if (!buffer_uptodate(*bh)) { |
92 | brelse(*bh); | 92 | brelse(*bh); |
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 33e1c086858b..3dcbf364022f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | |||
144 | } | 144 | } |
145 | 145 | ||
146 | /** | 146 | /** |
147 | * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem | 147 | * ext4_double_down_write_data_sem - Acquire two inodes' write lock |
148 | * of i_data_sem | ||
148 | * | 149 | * |
149 | * Acquire write lock of i_data_sem of the two inodes | 150 | * Acquire write lock of i_data_sem of the two inodes |
150 | */ | 151 | */ |
151 | static void | 152 | void |
152 | double_down_write_data_sem(struct inode *first, struct inode *second) | 153 | ext4_double_down_write_data_sem(struct inode *first, struct inode *second) |
153 | { | 154 | { |
154 | if (first < second) { | 155 | if (first < second) { |
155 | down_write(&EXT4_I(first)->i_data_sem); | 156 | down_write(&EXT4_I(first)->i_data_sem); |
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second) | |||
162 | } | 163 | } |
163 | 164 | ||
164 | /** | 165 | /** |
165 | * double_up_write_data_sem - Release two inodes' write lock of i_data_sem | 166 | * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem |
166 | * | 167 | * |
167 | * @orig_inode: original inode structure whose lock is released first | 168 |
168 | * @donor_inode: donor inode structure whose lock is released second | 169 |
169 | * Release write lock of i_data_sem of two inodes (orig and donor). | 170 | * Release write lock of i_data_sem of two inodes (orig and donor). |
170 | */ | 171 | */ |
171 | static void | 172 | void |
172 | double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) | 173 | ext4_double_up_write_data_sem(struct inode *orig_inode, |
174 | struct inode *donor_inode) | ||
173 | { | 175 | { |
174 | up_write(&EXT4_I(orig_inode)->i_data_sem); | 176 | up_write(&EXT4_I(orig_inode)->i_data_sem); |
175 | up_write(&EXT4_I(donor_inode)->i_data_sem); | 177 | up_write(&EXT4_I(donor_inode)->i_data_sem); |
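Both helpers, now exported under ext4_ names, avoid ABBA deadlocks the standard way: whenever two locks of the same class must be held, every path acquires them in one globally consistent order, here derived from the inode pointers. The same pattern in plain pthreads (userspace sketch; unlock order is immaterial, only the acquisition order matters):

#include <pthread.h>

struct obj {
        pthread_mutex_t lock;
};

/* Always lock the lower-addressed object first, so two threads
 * locking the same pair can never hold one lock each and wait
 * forever for the other. */
static void double_lock(struct obj *a, struct obj *b)
{
        if (a < b) {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void double_unlock(struct obj *a, struct obj *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct obj a = { PTHREAD_MUTEX_INITIALIZER };
        struct obj b = { PTHREAD_MUTEX_INITIALIZER };

        double_lock(&a, &b);
        double_unlock(&a, &b);
        return 0;
}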
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, | |||
407 | mext_insert_inside_block(o_start, o_end, start_ext, new_ext, | 409 | mext_insert_inside_block(o_start, o_end, start_ext, new_ext, |
408 | end_ext, eh, range_to_move); | 410 | end_ext, eh, range_to_move); |
409 | 411 | ||
410 | if (depth) { | 412 | return ext4_ext_dirty(handle, orig_inode, orig_path); |
411 | ret = ext4_handle_dirty_metadata(handle, orig_inode, | ||
412 | orig_path->p_bh); | ||
413 | if (ret) | ||
414 | return ret; | ||
415 | } else { | ||
416 | ret = ext4_mark_inode_dirty(handle, orig_inode); | ||
417 | if (ret < 0) | ||
418 | return ret; | ||
419 | } | ||
420 | |||
421 | return 0; | ||
422 | } | 413 | } |
423 | 414 | ||
424 | /** | 415 | /** |
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
737 | donor_off += dext_alen; | 728 | donor_off += dext_alen; |
738 | orig_off += dext_alen; | 729 | orig_off += dext_alen; |
739 | 730 | ||
731 | BUG_ON(replaced_count > count); | ||
740 | /* Already moved the expected blocks */ | 732 | /* Already moved the expected blocks */ |
741 | if (replaced_count >= count) | 733 | if (replaced_count >= count) |
742 | break; | 734 | break; |
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, | |||
814 | page_cache_release(page[0]); | 806 | page_cache_release(page[0]); |
815 | return -ENOMEM; | 807 | return -ENOMEM; |
816 | } | 808 | } |
817 | 809 | /* | |
810 | * grab_cache_page_write_begin() may not wait on the page's writeback if | ||
811 | * the BDI does not demand it. But it is reasonable to be very conservative | ||
812 | * here and explicitly wait on the page's writeback | ||
813 | */ | ||
814 | wait_on_page_writeback(page[0]); | ||
815 | wait_on_page_writeback(page[1]); | ||
818 | if (inode1 > inode2) { | 816 | if (inode1 > inode2) { |
819 | struct page *tmp; | 817 | struct page *tmp; |
820 | tmp = page[0]; | 818 | tmp = page[0]; |
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) | |||
856 | if (buffer_uptodate(bh)) | 854 | if (buffer_uptodate(bh)) |
857 | continue; | 855 | continue; |
858 | if (!buffer_mapped(bh)) { | 856 | if (!buffer_mapped(bh)) { |
859 | int err = 0; | ||
860 | err = ext4_get_block(inode, block, bh, 0); | 857 | err = ext4_get_block(inode, block, bh, 0); |
861 | if (err) { | 858 | if (err) { |
862 | SetPageError(page); | 859 | SetPageError(page); |
@@ -976,7 +973,7 @@ again: | |||
976 | * necessary, just swap data blocks between orig and donor. | 973 | * necessary, just swap data blocks between orig and donor. |
977 | */ | 974 | */ |
978 | if (uninit) { | 975 | if (uninit) { |
979 | double_down_write_data_sem(orig_inode, donor_inode); | 976 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
980 | /* If any of extents in range became initialized we have to | 977 | /* If any of extents in range became initialized we have to |
981 | * fallback to data copying */ | 978 | * fallback to data copying */ |
982 | uninit = mext_check_coverage(orig_inode, orig_blk_offset, | 979 | uninit = mext_check_coverage(orig_inode, orig_blk_offset, |
@@ -990,7 +987,7 @@ again: | |||
990 | goto drop_data_sem; | 987 | goto drop_data_sem; |
991 | 988 | ||
992 | if (!uninit) { | 989 | if (!uninit) { |
993 | double_up_write_data_sem(orig_inode, donor_inode); | 990 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
994 | goto data_copy; | 991 | goto data_copy; |
995 | } | 992 | } |
996 | if ((page_has_private(pagep[0]) && | 993 | if ((page_has_private(pagep[0]) && |
@@ -1004,7 +1001,7 @@ again: | |||
1004 | donor_inode, orig_blk_offset, | 1001 | donor_inode, orig_blk_offset, |
1005 | block_len_in_page, err); | 1002 | block_len_in_page, err); |
1006 | drop_data_sem: | 1003 | drop_data_sem: |
1007 | double_up_write_data_sem(orig_inode, donor_inode); | 1004 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1008 | goto unlock_pages; | 1005 | goto unlock_pages; |
1009 | } | 1006 | } |
1010 | data_copy: | 1007 | data_copy: |
@@ -1033,7 +1030,7 @@ data_copy: | |||
1033 | } | 1030 | } |
1034 | /* Perform all necessary steps similar write_begin()/write_end() | 1031 | /* Perform all necessary steps similar write_begin()/write_end() |
1035 | * but keeping in mind that i_size will not change */ | 1032 | * but keeping in mind that i_size will not change */ |
1036 | *err = __block_write_begin(pagep[0], from, from + replaced_size, | 1033 | *err = __block_write_begin(pagep[0], from, replaced_size, |
1037 | ext4_get_block); | 1034 | ext4_get_block); |
1038 | if (!*err) | 1035 | if (!*err) |
1039 | *err = block_commit_write(pagep[0], from, from + replaced_size); | 1036 | *err = block_commit_write(pagep[0], from, from + replaced_size); |
@@ -1065,11 +1062,11 @@ repair_branches: | |||
1065 | * Extents are swapped already, but we are not able to copy data. | 1062 | * Extents are swapped already, but we are not able to copy data. |
1066 | * Try to swap extents back to their original places | 1063 |
1067 | */ | 1064 | */ |
1068 | double_down_write_data_sem(orig_inode, donor_inode); | 1065 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
1069 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, | 1066 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, |
1070 | orig_blk_offset, | 1067 | orig_blk_offset, |
1071 | block_len_in_page, &err2); | 1068 | block_len_in_page, &err2); |
1072 | double_up_write_data_sem(orig_inode, donor_inode); | 1069 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1073 | if (replaced_count != block_len_in_page) { | 1070 | if (replaced_count != block_len_in_page) { |
1074 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), | 1071 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), |
1075 | "Unable to copy data block," | 1072 | "Unable to copy data block," |
@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode, | |||
1209 | } | 1206 | } |
1210 | 1207 | ||
1211 | /** | 1208 | /** |
1212 | * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 | 1209 | * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 |
1213 | * | 1210 | * |
1214 | * @inode1: the inode structure | 1211 | * @inode1: the inode structure |
1215 | * @inode2: the inode structure | 1212 | * @inode2: the inode structure |
1216 | * | 1213 | * |
1217 | * Lock two inodes' i_mutex | 1214 | * Lock two inodes' i_mutex |
1218 | */ | 1215 | */ |
1219 | static void | 1216 | void |
1220 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | 1217 | ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) |
1221 | { | 1218 | { |
1222 | BUG_ON(inode1 == inode2); | 1219 | BUG_ON(inode1 == inode2); |
1223 | if (inode1 < inode2) { | 1220 | if (inode1 < inode2) { |
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | |||
1230 | } | 1227 | } |
1231 | 1228 | ||
1232 | /** | 1229 | /** |
1233 | * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 | 1230 | * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 |
1234 | * | 1231 | * |
1235 | * @inode1: the inode that is released first | 1232 | * @inode1: the inode that is released first |
1236 | * @inode2: the inode that is released second | 1233 | * @inode2: the inode that is released second |
1237 | * | 1234 | * |
1238 | */ | 1235 | */ |
1239 | 1236 | ||
1240 | static void | 1237 | void |
1241 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) | 1238 | ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) |
1242 | { | 1239 | { |
1243 | mutex_unlock(&inode1->i_mutex); | 1240 | mutex_unlock(&inode1->i_mutex); |
1244 | mutex_unlock(&inode2->i_mutex); | 1241 | mutex_unlock(&inode2->i_mutex); |
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1333 | return -EINVAL; | 1330 | return -EINVAL; |
1334 | } | 1331 | } |
1335 | /* Protect orig and donor inodes against a truncate */ | 1332 | /* Protect orig and donor inodes against a truncate */ |
1336 | mext_inode_double_lock(orig_inode, donor_inode); | 1333 | ext4_inode_double_lock(orig_inode, donor_inode); |
1337 | 1334 | ||
1338 | /* Wait for all existing dio workers */ | 1335 | /* Wait for all existing dio workers */ |
1339 | ext4_inode_block_unlocked_dio(orig_inode); | 1336 | ext4_inode_block_unlocked_dio(orig_inode); |
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1342 | inode_dio_wait(donor_inode); | 1339 | inode_dio_wait(donor_inode); |
1343 | 1340 | ||
1344 | /* Protect extent tree against block allocations via delalloc */ | 1341 | /* Protect extent tree against block allocations via delalloc */ |
1345 | double_down_write_data_sem(orig_inode, donor_inode); | 1342 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
1346 | /* Check the filesystem environment whether move_extent can be done */ | 1343 | /* Check the filesystem environment whether move_extent can be done */ |
1347 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, | 1344 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, |
1348 | donor_start, &len); | 1345 | donor_start, &len); |
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1466 | * b. racing with ->readpage, ->write_begin, and ext4_get_block | 1463 | * b. racing with ->readpage, ->write_begin, and ext4_get_block |
1467 | * in move_extent_per_page | 1464 | * in move_extent_per_page |
1468 | */ | 1465 | */ |
1469 | double_up_write_data_sem(orig_inode, donor_inode); | 1466 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1470 | 1467 | ||
1471 | while (orig_page_offset <= seq_end_page) { | 1468 | while (orig_page_offset <= seq_end_page) { |
1472 | 1469 | ||
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1500 | block_len_in_page = rest_blocks; | 1497 | block_len_in_page = rest_blocks; |
1501 | } | 1498 | } |
1502 | 1499 | ||
1503 | double_down_write_data_sem(orig_inode, donor_inode); | 1500 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
1504 | if (ret < 0) | 1501 | if (ret < 0) |
1505 | break; | 1502 | break; |
1506 | 1503 | ||
@@ -1538,10 +1535,10 @@ out: | |||
1538 | ext4_ext_drop_refs(holecheck_path); | 1535 | ext4_ext_drop_refs(holecheck_path); |
1539 | kfree(holecheck_path); | 1536 | kfree(holecheck_path); |
1540 | } | 1537 | } |
1541 | double_up_write_data_sem(orig_inode, donor_inode); | 1538 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1542 | ext4_inode_resume_unlocked_dio(orig_inode); | 1539 | ext4_inode_resume_unlocked_dio(orig_inode); |
1543 | ext4_inode_resume_unlocked_dio(donor_inode); | 1540 | ext4_inode_resume_unlocked_dio(donor_inode); |
1544 | mext_inode_double_unlock(orig_inode, donor_inode); | 1541 | ext4_inode_double_unlock(orig_inode, donor_inode); |
1545 | 1542 | ||
1546 | return ret; | 1543 | return ret; |
1547 | } | 1544 | } |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3825d6aa8336..6653fc35ecb7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, | |||
416 | { | 416 | { |
417 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 417 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
418 | struct ext4_inode_info *ei = EXT4_I(inode); | 418 | struct ext4_inode_info *ei = EXT4_I(inode); |
419 | __u32 csum, old_csum; | 419 | __u32 csum; |
420 | __le32 save_csum; | ||
420 | int size; | 421 | int size; |
421 | 422 | ||
422 | size = count_offset + (count * sizeof(struct dx_entry)); | 423 | size = count_offset + (count * sizeof(struct dx_entry)); |
423 | old_csum = t->dt_checksum; | 424 | save_csum = t->dt_checksum; |
424 | t->dt_checksum = 0; | 425 | t->dt_checksum = 0; |
425 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); | 426 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); |
426 | csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); | 427 | csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); |
427 | t->dt_checksum = old_csum; | 428 | t->dt_checksum = save_csum; |
428 | 429 | ||
429 | return cpu_to_le32(csum); | 430 | return cpu_to_le32(csum); |
430 | } | 431 | } |
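The dx_csum change is purely a type fix: the checksum field lives on disk in little-endian form, so the temporary that preserves it across the zero-the-field-and-compute dance must be __le32 rather than host-order __u32 (sparse flags the mismatch, and it matters on big-endian machines). The save/zero/compute/restore idiom itself, shown in userspace with a toy checksum standing in for ext4_chksum()/crc32c:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct tail {
        uint32_t reserved;
        uint32_t checksum;      /* stored little-endian on disk */
};

/* toy stand-in for the real checksum; any function of the bytes
 * illustrates the idiom */
static uint32_t toy_csum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len--)
                sum = sum * 31 + *p++;
        return sum;
}

static uint32_t compute_tail_csum(struct tail *t)
{
        uint32_t save_csum = t->checksum;  /* keep the on-disk value */
        uint32_t csum;

        t->checksum = 0;                   /* field must not checksum itself */
        csum = toy_csum(t, sizeof(*t));
        t->checksum = save_csum;           /* restore before anyone looks */
        return csum;
}

int main(void)
{
        struct tail t = { 7, 0xdeadbeef };

        printf("csum %08" PRIx32 ", field intact: %08" PRIx32 "\n",
               compute_tail_csum(&t), t.checksum);
        return 0;
}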
@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | |||
971 | hinfo.hash_version += | 972 | hinfo.hash_version += |
972 | EXT4_SB(dir->i_sb)->s_hash_unsigned; | 973 | EXT4_SB(dir->i_sb)->s_hash_unsigned; |
973 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | 974 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; |
975 | if (ext4_has_inline_data(dir)) { | ||
976 | int has_inline_data = 1; | ||
977 | count = htree_inlinedir_to_tree(dir_file, dir, 0, | ||
978 | &hinfo, start_hash, | ||
979 | start_minor_hash, | ||
980 | &has_inline_data); | ||
981 | if (has_inline_data) { | ||
982 | *next_hash = ~0; | ||
983 | return count; | ||
984 | } | ||
985 | } | ||
974 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, | 986 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, |
975 | start_hash, start_minor_hash); | 987 | start_hash, start_minor_hash); |
976 | *next_hash = ~0; | 988 | *next_hash = ~0; |
@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child) | |||
1455 | return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); | 1467 | return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); |
1456 | } | 1468 | } |
1457 | 1469 | ||
1458 | #define S_SHIFT 12 | ||
1459 | static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { | ||
1460 | [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, | ||
1461 | [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, | ||
1462 | [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, | ||
1463 | [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, | ||
1464 | [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, | ||
1465 | [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, | ||
1466 | [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, | ||
1467 | }; | ||
1468 | |||
1469 | static inline void ext4_set_de_type(struct super_block *sb, | ||
1470 | struct ext4_dir_entry_2 *de, | ||
1471 | umode_t mode) { | ||
1472 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) | ||
1473 | de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; | ||
1474 | } | ||
1475 | |||
1476 | /* | 1470 | /* |
1477 | * Move count entries from end of map between two memory locations. | 1471 | * Move count entries from end of map between two memory locations. |
1478 | * Returns pointer to last entry moved. | 1472 | * Returns pointer to last entry moved. |
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
2251 | dquot_initialize(dir); | 2245 | dquot_initialize(dir); |
2252 | 2246 | ||
2253 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2247 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2254 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2248 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); |
2255 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
2256 | retry: | 2249 | retry: |
2257 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, | 2250 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, |
2258 | NULL, EXT4_HT_DIR, credits); | 2251 | NULL, EXT4_HT_DIR, credits); |
@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, | |||
2286 | dquot_initialize(dir); | 2279 | dquot_initialize(dir); |
2287 | 2280 | ||
2288 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2281 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2289 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2282 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); |
2290 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
2291 | retry: | 2283 | retry: |
2292 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, | 2284 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, |
2293 | NULL, EXT4_HT_DIR, credits); | 2285 | NULL, EXT4_HT_DIR, credits); |
@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
2396 | dquot_initialize(dir); | 2388 | dquot_initialize(dir); |
2397 | 2389 | ||
2398 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2390 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2399 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2391 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); |
2400 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
2401 | retry: | 2392 | retry: |
2402 | inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, | 2393 | inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, |
2403 | &dentry->d_name, | 2394 | &dentry->d_name, |
@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir, | |||
2826 | * quota blocks, sb is already counted in previous macros). | 2817 | * quota blocks, sb is already counted in previous macros). |
2827 | */ | 2818 | */ |
2828 | credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2819 | credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2829 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2820 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; |
2830 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | ||
2831 | } | 2821 | } |
2832 | retry: | 2822 | retry: |
2833 | inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, | 2823 | inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 047a6de04a0a..5929cd0baa20 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -29,25 +29,19 @@ | |||
29 | #include "xattr.h" | 29 | #include "xattr.h" |
30 | #include "acl.h" | 30 | #include "acl.h" |
31 | 31 | ||
32 | static struct kmem_cache *io_page_cachep, *io_end_cachep; | 32 | static struct kmem_cache *io_end_cachep; |
33 | 33 | ||
34 | int __init ext4_init_pageio(void) | 34 | int __init ext4_init_pageio(void) |
35 | { | 35 | { |
36 | io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); | ||
37 | if (io_page_cachep == NULL) | ||
38 | return -ENOMEM; | ||
39 | io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); | 36 | io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); |
40 | if (io_end_cachep == NULL) { | 37 | if (io_end_cachep == NULL) |
41 | kmem_cache_destroy(io_page_cachep); | ||
42 | return -ENOMEM; | 38 | return -ENOMEM; |
43 | } | ||
44 | return 0; | 39 | return 0; |
45 | } | 40 | } |
46 | 41 | ||
47 | void ext4_exit_pageio(void) | 42 | void ext4_exit_pageio(void) |
48 | { | 43 | { |
49 | kmem_cache_destroy(io_end_cachep); | 44 | kmem_cache_destroy(io_end_cachep); |
50 | kmem_cache_destroy(io_page_cachep); | ||
51 | } | 45 | } |
52 | 46 | ||
53 | /* | 47 | /* |
@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode) | |||
67 | cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); | 61 | cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); |
68 | } | 62 | } |
69 | 63 | ||
70 | static void put_io_page(struct ext4_io_page *io_page) | 64 | static void ext4_release_io_end(ext4_io_end_t *io_end) |
71 | { | 65 | { |
72 | if (atomic_dec_and_test(&io_page->p_count)) { | 66 | BUG_ON(!list_empty(&io_end->list)); |
73 | end_page_writeback(io_page->p_page); | 67 | BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); |
74 | put_page(io_page->p_page); | 68 | |
75 | kmem_cache_free(io_page_cachep, io_page); | 69 | if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) |
76 | } | 70 | wake_up_all(ext4_ioend_wq(io_end->inode)); |
71 | if (io_end->flag & EXT4_IO_END_DIRECT) | ||
72 | inode_dio_done(io_end->inode); | ||
73 | if (io_end->iocb) | ||
74 | aio_complete(io_end->iocb, io_end->result, 0); | ||
75 | kmem_cache_free(io_end_cachep, io_end); | ||
77 | } | 76 | } |
78 | 77 | ||
79 | void ext4_free_io_end(ext4_io_end_t *io) | 78 | static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) |
80 | { | 79 | { |
81 | int i; | 80 | struct inode *inode = io_end->inode; |
82 | |||
83 | BUG_ON(!io); | ||
84 | BUG_ON(!list_empty(&io->list)); | ||
85 | BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); | ||
86 | 81 | ||
87 | for (i = 0; i < io->num_io_pages; i++) | 82 | io_end->flag &= ~EXT4_IO_END_UNWRITTEN; |
88 | put_io_page(io->pages[i]); | 83 | /* Wake up anyone waiting on unwritten extent conversion */ |
89 | io->num_io_pages = 0; | 84 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) |
90 | if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) | 85 | wake_up_all(ext4_ioend_wq(inode)); |
91 | wake_up_all(ext4_ioend_wq(io->inode)); | ||
92 | kmem_cache_free(io_end_cachep, io); | ||
93 | } | 86 | } |
94 | 87 | ||
95 | /* check a range of space and convert unwritten extents to written. */ | 88 | /* check a range of space and convert unwritten extents to written. */ |
@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io) | |||
112 | "(inode %lu, offset %llu, size %zd, error %d)", | 105 | "(inode %lu, offset %llu, size %zd, error %d)", |
113 | inode->i_ino, offset, size, ret); | 106 | inode->i_ino, offset, size, ret); |
114 | } | 107 | } |
115 | /* Wake up anyone waiting on unwritten extent conversion */ | 108 | ext4_clear_io_unwritten_flag(io); |
116 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) | 109 | ext4_release_io_end(io); |
117 | wake_up_all(ext4_ioend_wq(inode)); | ||
118 | if (io->flag & EXT4_IO_END_DIRECT) | ||
119 | inode_dio_done(inode); | ||
120 | if (io->iocb) | ||
121 | aio_complete(io->iocb, io->result, 0); | ||
122 | return ret; | 110 | return ret; |
123 | } | 111 | } |
124 | 112 | ||
@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode) | |||
149 | } | 137 | } |
150 | 138 | ||
151 | /* Add the io_end to per-inode completed end_io list. */ | 139 | /* Add the io_end to per-inode completed end_io list. */ |
152 | void ext4_add_complete_io(ext4_io_end_t *io_end) | 140 | static void ext4_add_complete_io(ext4_io_end_t *io_end) |
153 | { | 141 | { |
154 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); | 142 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); |
155 | struct workqueue_struct *wq; | 143 | struct workqueue_struct *wq; |
@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) | |||
186 | err = ext4_end_io(io); | 174 | err = ext4_end_io(io); |
187 | if (unlikely(!ret && err)) | 175 | if (unlikely(!ret && err)) |
188 | ret = err; | 176 | ret = err; |
189 | io->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
190 | ext4_free_io_end(io); | ||
191 | } | 177 | } |
192 | return ret; | 178 | return ret; |
193 | } | 179 | } |
@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | |||
219 | atomic_inc(&EXT4_I(inode)->i_ioend_count); | 205 | atomic_inc(&EXT4_I(inode)->i_ioend_count); |
220 | io->inode = inode; | 206 | io->inode = inode; |
221 | INIT_LIST_HEAD(&io->list); | 207 | INIT_LIST_HEAD(&io->list); |
208 | atomic_set(&io->count, 1); | ||
222 | } | 209 | } |
223 | return io; | 210 | return io; |
224 | } | 211 | } |
225 | 212 | ||
213 | void ext4_put_io_end_defer(ext4_io_end_t *io_end) | ||
214 | { | ||
215 | if (atomic_dec_and_test(&io_end->count)) { | ||
216 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { | ||
217 | ext4_release_io_end(io_end); | ||
218 | return; | ||
219 | } | ||
220 | ext4_add_complete_io(io_end); | ||
221 | } | ||
222 | } | ||
223 | |||
224 | int ext4_put_io_end(ext4_io_end_t *io_end) | ||
225 | { | ||
226 | int err = 0; | ||
227 | |||
228 | if (atomic_dec_and_test(&io_end->count)) { | ||
229 | if (io_end->flag & EXT4_IO_END_UNWRITTEN) { | ||
230 | err = ext4_convert_unwritten_extents(io_end->inode, | ||
231 | io_end->offset, io_end->size); | ||
232 | ext4_clear_io_unwritten_flag(io_end); | ||
233 | } | ||
234 | ext4_release_io_end(io_end); | ||
235 | } | ||
236 | return err; | ||
237 | } | ||
238 | |||
239 | ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) | ||
240 | { | ||
241 | atomic_inc(&io_end->count); | ||
242 | return io_end; | ||
243 | } | ||
244 | |||
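The new ext4_get_io_end()/ext4_put_io_end() pair turns the io_end into an ordinary reference-counted object: the submitter holds one reference, each bio holds another, and whoever drops the last one converts or releases it. The lifecycle in miniature, with C11 atomics standing in for the kernel's atomic_t (a sketch of the counting scheme only, not of the conversion work):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end {
        atomic_int count;
        int unwritten;          /* models EXT4_IO_END_UNWRITTEN */
};

static struct io_end *io_end_alloc(void)
{
        struct io_end *io = calloc(1, sizeof(*io));

        atomic_init(&io->count, 1);     /* the submitter's reference */
        return io;
}

static struct io_end *io_end_get(struct io_end *io)
{
        atomic_fetch_add(&io->count, 1);        /* e.g. one per bio */
        return io;
}

static void io_end_put(struct io_end *io)
{
        if (atomic_fetch_sub(&io->count, 1) == 1) {     /* last reference */
                if (io->unwritten)
                        puts("last ref: convert unwritten extents");
                puts("last ref: release io_end");
                free(io);
        }
}

int main(void)
{
        struct io_end *io = io_end_alloc();

        io_end_get(io);         /* the bio takes a reference at submit */
        io_end_put(io);         /* bio completion drops its reference */
        io_end_put(io);         /* the submitter drops the last one */
        return 0;
}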
226 | /* | 245 | /* |
227 | * Print a buffer I/O error compatible with fs/buffer.c. This | 246 |
228 | * provides compatibility with dmesg scrapers that look for a specific | 247 | * provides compatibility with dmesg scrapers that look for a specific |
@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
243 | ext4_io_end_t *io_end = bio->bi_private; | 262 | ext4_io_end_t *io_end = bio->bi_private; |
244 | struct inode *inode; | 263 | struct inode *inode; |
245 | int i; | 264 | int i; |
265 | int blocksize; | ||
246 | sector_t bi_sector = bio->bi_sector; | 266 | sector_t bi_sector = bio->bi_sector; |
247 | 267 | ||
248 | BUG_ON(!io_end); | 268 | BUG_ON(!io_end); |
269 | inode = io_end->inode; | ||
270 | blocksize = 1 << inode->i_blkbits; | ||
249 | bio->bi_private = NULL; | 271 | bio->bi_private = NULL; |
250 | bio->bi_end_io = NULL; | 272 | bio->bi_end_io = NULL; |
251 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 273 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
252 | error = 0; | 274 | error = 0; |
253 | bio_put(bio); | 275 | for (i = 0; i < bio->bi_vcnt; i++) { |
254 | 276 | struct bio_vec *bvec = &bio->bi_io_vec[i]; | |
255 | for (i = 0; i < io_end->num_io_pages; i++) { | 277 | struct page *page = bvec->bv_page; |
256 | struct page *page = io_end->pages[i]->p_page; | ||
257 | struct buffer_head *bh, *head; | 278 | struct buffer_head *bh, *head; |
258 | loff_t offset; | 279 | unsigned bio_start = bvec->bv_offset; |
259 | loff_t io_end_offset; | 280 | unsigned bio_end = bio_start + bvec->bv_len; |
281 | unsigned under_io = 0; | ||
282 | unsigned long flags; | ||
283 | |||
284 | if (!page) | ||
285 | continue; | ||
260 | 286 | ||
261 | if (error) { | 287 | if (error) { |
262 | SetPageError(page); | 288 | SetPageError(page); |
263 | set_bit(AS_EIO, &page->mapping->flags); | 289 | set_bit(AS_EIO, &page->mapping->flags); |
264 | head = page_buffers(page); | ||
265 | BUG_ON(!head); | ||
266 | |||
267 | io_end_offset = io_end->offset + io_end->size; | ||
268 | |||
269 | offset = (sector_t) page->index << PAGE_CACHE_SHIFT; | ||
270 | bh = head; | ||
271 | do { | ||
272 | if ((offset >= io_end->offset) && | ||
273 | (offset+bh->b_size <= io_end_offset)) | ||
274 | buffer_io_error(bh); | ||
275 | |||
276 | offset += bh->b_size; | ||
277 | bh = bh->b_this_page; | ||
278 | } while (bh != head); | ||
279 | } | 290 | } |
280 | 291 | bh = head = page_buffers(page); | |
281 | put_io_page(io_end->pages[i]); | 292 | /* |
293 | * We check all buffers in the page under BH_Uptodate_Lock | ||
294 | * to avoid races with other end_io handlers clearing async_write flags | ||
295 | */ | ||
296 | local_irq_save(flags); | ||
297 | bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | ||
298 | do { | ||
299 | if (bh_offset(bh) < bio_start || | ||
300 | bh_offset(bh) + blocksize > bio_end) { | ||
301 | if (buffer_async_write(bh)) | ||
302 | under_io++; | ||
303 | continue; | ||
304 | } | ||
305 | clear_buffer_async_write(bh); | ||
306 | if (error) | ||
307 | buffer_io_error(bh); | ||
308 | } while ((bh = bh->b_this_page) != head); | ||
309 | bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
310 | local_irq_restore(flags); | ||
311 | if (!under_io) | ||
312 | end_page_writeback(page); | ||
282 | } | 313 | } |
283 | io_end->num_io_pages = 0; | 314 | bio_put(bio); |
284 | inode = io_end->inode; | ||
285 | 315 | ||
286 | if (error) { | 316 | if (error) { |
287 | io_end->flag |= EXT4_IO_END_ERROR; | 317 | io_end->flag |= EXT4_IO_END_ERROR; |
@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
294 | bi_sector >> (inode->i_blkbits - 9)); | 324 | bi_sector >> (inode->i_blkbits - 9)); |
295 | } | 325 | } |
296 | 326 | ||
297 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 327 | ext4_put_io_end_defer(io_end); |
298 | ext4_free_io_end(io_end); | ||
299 | return; | ||
300 | } | ||
301 | |||
302 | ext4_add_complete_io(io_end); | ||
303 | } | 328 | } |
304 | 329 | ||
305 | void ext4_io_submit(struct ext4_io_submit *io) | 330 | void ext4_io_submit(struct ext4_io_submit *io) |
@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io) | |||
313 | bio_put(io->io_bio); | 338 | bio_put(io->io_bio); |
314 | } | 339 | } |
315 | io->io_bio = NULL; | 340 | io->io_bio = NULL; |
316 | io->io_op = 0; | 341 | } |
342 | |||
343 | void ext4_io_submit_init(struct ext4_io_submit *io, | ||
344 | struct writeback_control *wbc) | ||
345 | { | ||
346 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
347 | io->io_bio = NULL; | ||
317 | io->io_end = NULL; | 348 | io->io_end = NULL; |
318 | } | 349 | } |
319 | 350 | ||
320 | static int io_submit_init(struct ext4_io_submit *io, | 351 | static int io_submit_init_bio(struct ext4_io_submit *io, |
321 | struct inode *inode, | 352 | struct buffer_head *bh) |
322 | struct writeback_control *wbc, | ||
323 | struct buffer_head *bh) | ||
324 | { | 353 | { |
325 | ext4_io_end_t *io_end; | ||
326 | struct page *page = bh->b_page; | ||
327 | int nvecs = bio_get_nr_vecs(bh->b_bdev); | 354 | int nvecs = bio_get_nr_vecs(bh->b_bdev); |
328 | struct bio *bio; | 355 | struct bio *bio; |
329 | 356 | ||
330 | io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
331 | if (!io_end) | ||
332 | return -ENOMEM; | ||
333 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); | 357 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
334 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 358 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
335 | bio->bi_bdev = bh->b_bdev; | 359 | bio->bi_bdev = bh->b_bdev; |
336 | bio->bi_private = io->io_end = io_end; | ||
337 | bio->bi_end_io = ext4_end_bio; | 360 | bio->bi_end_io = ext4_end_bio; |
338 | 361 | bio->bi_private = ext4_get_io_end(io->io_end); | |
339 | io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); | 362 | if (!io->io_end->size) |
340 | 363 | io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) | |
364 | + bh_offset(bh); | ||
341 | io->io_bio = bio; | 365 | io->io_bio = bio; |
342 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
343 | io->io_next_block = bh->b_blocknr; | 366 | io->io_next_block = bh->b_blocknr; |
344 | return 0; | 367 | return 0; |
345 | } | 368 | } |
346 | 369 | ||
347 | static int io_submit_add_bh(struct ext4_io_submit *io, | 370 | static int io_submit_add_bh(struct ext4_io_submit *io, |
348 | struct ext4_io_page *io_page, | ||
349 | struct inode *inode, | 371 | struct inode *inode, |
350 | struct writeback_control *wbc, | ||
351 | struct buffer_head *bh) | 372 | struct buffer_head *bh) |
352 | { | 373 | { |
353 | ext4_io_end_t *io_end; | 374 | ext4_io_end_t *io_end; |
354 | int ret; | 375 | int ret; |
355 | 376 | ||
356 | if (buffer_new(bh)) { | ||
357 | clear_buffer_new(bh); | ||
358 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
359 | } | ||
360 | |||
361 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { | 377 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { |
362 | submit_and_retry: | 378 | submit_and_retry: |
363 | ext4_io_submit(io); | 379 | ext4_io_submit(io); |
364 | } | 380 | } |
365 | if (io->io_bio == NULL) { | 381 | if (io->io_bio == NULL) { |
366 | ret = io_submit_init(io, inode, wbc, bh); | 382 | ret = io_submit_init_bio(io, bh); |
367 | if (ret) | 383 | if (ret) |
368 | return ret; | 384 | return ret; |
369 | } | 385 | } |
370 | io_end = io->io_end; | ||
371 | if ((io_end->num_io_pages >= MAX_IO_PAGES) && | ||
372 | (io_end->pages[io_end->num_io_pages-1] != io_page)) | ||
373 | goto submit_and_retry; | ||
374 | if (buffer_uninit(bh)) | ||
375 | ext4_set_io_unwritten_flag(inode, io_end); | ||
376 | io->io_end->size += bh->b_size; | ||
377 | io->io_next_block++; | ||
378 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); | 386 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); |
379 | if (ret != bh->b_size) | 387 | if (ret != bh->b_size) |
380 | goto submit_and_retry; | 388 | goto submit_and_retry; |
381 | if ((io_end->num_io_pages == 0) || | 389 | io_end = io->io_end; |
382 | (io_end->pages[io_end->num_io_pages-1] != io_page)) { | 390 | if (test_clear_buffer_uninit(bh)) |
383 | io_end->pages[io_end->num_io_pages++] = io_page; | 391 | ext4_set_io_unwritten_flag(inode, io_end); |
384 | atomic_inc(&io_page->p_count); | 392 | io_end->size += bh->b_size; |
385 | } | 393 | io->io_next_block++; |
386 | return 0; | 394 | return 0; |
387 | } | 395 | } |
388 | 396 | ||
@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
392 | struct writeback_control *wbc) | 400 | struct writeback_control *wbc) |
393 | { | 401 | { |
394 | struct inode *inode = page->mapping->host; | 402 | struct inode *inode = page->mapping->host; |
395 | unsigned block_start, block_end, blocksize; | 403 | unsigned block_start, blocksize; |
396 | struct ext4_io_page *io_page; | ||
397 | struct buffer_head *bh, *head; | 404 | struct buffer_head *bh, *head; |
398 | int ret = 0; | 405 | int ret = 0; |
406 | int nr_submitted = 0; | ||
399 | 407 | ||
400 | blocksize = 1 << inode->i_blkbits; | 408 | blocksize = 1 << inode->i_blkbits; |
401 | 409 | ||
402 | BUG_ON(!PageLocked(page)); | 410 | BUG_ON(!PageLocked(page)); |
403 | BUG_ON(PageWriteback(page)); | 411 | BUG_ON(PageWriteback(page)); |
404 | 412 | ||
405 | io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); | ||
406 | if (!io_page) { | ||
407 | redirty_page_for_writepage(wbc, page); | ||
408 | unlock_page(page); | ||
409 | return -ENOMEM; | ||
410 | } | ||
411 | io_page->p_page = page; | ||
412 | atomic_set(&io_page->p_count, 1); | ||
413 | get_page(page); | ||
414 | set_page_writeback(page); | 413 | set_page_writeback(page); |
415 | ClearPageError(page); | 414 | ClearPageError(page); |
416 | 415 | ||
417 | for (bh = head = page_buffers(page), block_start = 0; | 416 | /* |
418 | bh != head || !block_start; | 417 | * In the first loop we prepare and mark buffers to submit. We have to |
419 | block_start = block_end, bh = bh->b_this_page) { | 418 | * mark all buffers in the page before submitting so that |
419 | * end_page_writeback() cannot be called from ext4_end_bio() when IO | ||
421 | block_end = block_start + blocksize; | 420 | * on the first buffer finishes and we are still working on submitting |
421 | * the second buffer. | ||
422 | */ | ||
423 | bh = head = page_buffers(page); | ||
424 | do { | ||
425 | block_start = bh_offset(bh); | ||
422 | if (block_start >= len) { | 426 | if (block_start >= len) { |
423 | /* | 427 | /* |
424 | * Comments copied from block_write_full_page_endio: | 428 | * Comments copied from block_write_full_page_endio: |
@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
431 | * mapped, and writes to that region are not written | 435 | * mapped, and writes to that region are not written |
432 | * out to the file." | 436 | * out to the file." |
433 | */ | 437 | */ |
434 | zero_user_segment(page, block_start, block_end); | 438 | zero_user_segment(page, block_start, |
439 | block_start + blocksize); | ||
435 | clear_buffer_dirty(bh); | 440 | clear_buffer_dirty(bh); |
436 | set_buffer_uptodate(bh); | 441 | set_buffer_uptodate(bh); |
437 | continue; | 442 | continue; |
@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
445 | ext4_io_submit(io); | 450 | ext4_io_submit(io); |
446 | continue; | 451 | continue; |
447 | } | 452 | } |
448 | ret = io_submit_add_bh(io, io_page, inode, wbc, bh); | 453 | if (buffer_new(bh)) { |
454 | clear_buffer_new(bh); | ||
455 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
456 | } | ||
457 | set_buffer_async_write(bh); | ||
458 | } while ((bh = bh->b_this_page) != head); | ||
459 | |||
460 | /* Now submit buffers to write */ | ||
461 | bh = head = page_buffers(page); | ||
462 | do { | ||
463 | if (!buffer_async_write(bh)) | ||
464 | continue; | ||
465 | ret = io_submit_add_bh(io, inode, bh); | ||
449 | if (ret) { | 466 | if (ret) { |
450 | /* | 467 | /* |
451 | * We only get here on ENOMEM. Not much else | 468 | * We only get here on ENOMEM. Not much else |
@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
455 | redirty_page_for_writepage(wbc, page); | 472 | redirty_page_for_writepage(wbc, page); |
456 | break; | 473 | break; |
457 | } | 474 | } |
475 | nr_submitted++; | ||
458 | clear_buffer_dirty(bh); | 476 | clear_buffer_dirty(bh); |
477 | } while ((bh = bh->b_this_page) != head); | ||
478 | |||
479 | /* Error stopped previous loop? Clean up buffers... */ | ||
480 | if (ret) { | ||
481 | do { | ||
482 | clear_buffer_async_write(bh); | ||
483 | bh = bh->b_this_page; | ||
484 | } while (bh != head); | ||
459 | } | 485 | } |
460 | unlock_page(page); | 486 | unlock_page(page); |
461 | /* | 487 | /* Nothing submitted - we have to end page writeback */ |
462 | * If the page was truncated before we could do the writeback, | 488 | if (!nr_submitted) |
463 | * or we had a memory allocation error while trying to write | 489 | end_page_writeback(page); |
464 | * the first buffer head, we won't have submitted any pages for | ||
465 | * I/O. In that case we need to make sure we've cleared the | ||
466 | * PageWriteback bit from the page to prevent the system from | ||
467 | * wedging later on. | ||
468 | */ | ||
469 | put_io_page(io_page); | ||
470 | return ret; | 490 | return ret; |
471 | } | 491 | } |
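
The comment at the top of this rewrite carries the key invariant: every buffer on the page is tagged before any IO is issued, so a completion firing mid-submission can never conclude the page is done. A minimal userspace sketch of that invariant, with hypothetical names standing in for the buffer-head machinery (nothing below is kernel API; the real completion path walks the page's buffers rather than a counter):

#include <stdbool.h>
#include <stdio.h>

#define NBUFS 4

struct buf { bool dirty; bool marked; };

static int in_flight;                 /* stands in for the per-page tally */

static void end_io(struct buf *b)     /* models ext4_bio_end_io() */
{
	b->marked = false;
	if (--in_flight == 0)
		printf("end_page_writeback()\n");
}

int main(void)
{
	struct buf bufs[NBUFS] = { {true}, {true}, {false}, {true} };
	int i;

	/* pass 1: mark every buffer we intend to write */
	for (i = 0; i < NBUFS; i++)
		if (bufs[i].dirty) {
			bufs[i].marked = true;
			in_flight++;
		}

	/*
	 * pass 2: submit. Completion here is synchronous, the worst
	 * case; because in_flight was pre-counted in pass 1, the first
	 * completion sees 3 -> 2 rather than 1 -> 0, so writeback ends
	 * exactly once, after the last marked buffer.
	 */
	for (i = 0; i < NBUFS; i++)
		if (bufs[i].marked)
			end_io(&bufs[i]);

	return 0;
}

Had each buffer been counted only at submission time, the synchronous completion of the first buffer would have driven the count to zero and ended page writeback with two buffers still unsubmitted, which is exactly the race the two-pass structure closes.
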
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c169477a62c9..b27c96d01965 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -272,7 +272,7 @@ next_group: | |||
272 | if (start_blk >= last_blk) | 272 | if (start_blk >= last_blk) |
273 | goto next_group; | 273 | goto next_group; |
274 | group_data[bb_index].block_bitmap = start_blk++; | 274 | group_data[bb_index].block_bitmap = start_blk++; |
275 | ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); | 275 | group = ext4_get_group_number(sb, start_blk - 1); |
276 | group -= group_data[0].group; | 276 | group -= group_data[0].group; |
277 | group_data[group].free_blocks_count--; | 277 | group_data[group].free_blocks_count--; |
278 | if (flexbg_size > 1) | 278 | if (flexbg_size > 1) |
@@ -284,7 +284,7 @@ next_group: | |||
284 | if (start_blk >= last_blk) | 284 | if (start_blk >= last_blk) |
285 | goto next_group; | 285 | goto next_group; |
286 | group_data[ib_index].inode_bitmap = start_blk++; | 286 | group_data[ib_index].inode_bitmap = start_blk++; |
287 | ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); | 287 | group = ext4_get_group_number(sb, start_blk - 1); |
288 | group -= group_data[0].group; | 288 | group -= group_data[0].group; |
289 | group_data[group].free_blocks_count--; | 289 | group_data[group].free_blocks_count--; |
290 | if (flexbg_size > 1) | 290 | if (flexbg_size > 1) |
@@ -296,7 +296,7 @@ next_group: | |||
296 | if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) | 296 | if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) |
297 | goto next_group; | 297 | goto next_group; |
298 | group_data[it_index].inode_table = start_blk; | 298 | group_data[it_index].inode_table = start_blk; |
299 | ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); | 299 | group = ext4_get_group_number(sb, start_blk - 1); |
300 | group -= group_data[0].group; | 300 | group -= group_data[0].group; |
301 | group_data[group].free_blocks_count -= | 301 | group_data[group].free_blocks_count -= |
302 | EXT4_SB(sb)->s_itb_per_group; | 302 | EXT4_SB(sb)->s_itb_per_group; |
@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, | |||
392 | ext4_group_t group; | 392 | ext4_group_t group; |
393 | int err; | 393 | int err; |
394 | 394 | ||
395 | ext4_get_group_no_and_offset(sb, block, &group, NULL); | 395 | group = ext4_get_group_number(sb, block); |
396 | start = ext4_group_first_block_no(sb, group); | 396 | start = ext4_group_first_block_no(sb, group); |
397 | group -= flex_gd->groups[0].group; | 397 | group -= flex_gd->groups[0].group; |
398 | 398 | ||
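
Throughout this file ext4_get_group_number() is a terser replacement for ext4_get_group_no_and_offset() called with a NULL offset. Its arithmetic, reconstructed from that usage (a sketch under that assumption, not the kernel implementation):

#include <stdint.h>

typedef uint64_t ext4_fsblk_t;
typedef uint32_t ext4_group_t;

struct geom {
	ext4_fsblk_t first_data_block;  /* 1 on 1k-block filesystems, else 0 */
	uint32_t     blocks_per_group;
};

/* block number -> block group, ignoring the within-group offset */
static ext4_group_t get_group_number(const struct geom *g, ext4_fsblk_t block)
{
	return (block - g->first_data_block) / g->blocks_per_group;
}

When the group size is the standard blocksize * 8 blocks (see the STD_GROUP_SIZE option set in super.c further down), the division can presumably be strength-reduced to a shift, which would explain the dedicated helper.
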
@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb, | |||
1341 | 1341 | ||
1342 | /* Update the global fs size fields */ | 1342 | /* Update the global fs size fields */ |
1343 | sbi->s_groups_count += flex_gd->count; | 1343 | sbi->s_groups_count += flex_gd->count; |
1344 | sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, | ||
1345 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | ||
1344 | 1346 | ||
1345 | /* Update the reserved block counts only once the new group is | 1347 | /* Update the reserved block counts only once the new group is |
1346 | * active. */ | 1348 | * active. */ |
@@ -1879,7 +1881,11 @@ retry: | |||
1879 | /* Nothing need to do */ | 1881 | /* Nothing need to do */ |
1880 | return 0; | 1882 | return 0; |
1881 | 1883 | ||
1882 | ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); | 1884 | n_group = ext4_get_group_number(sb, n_blocks_count - 1); |
1885 | if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { | ||
1886 | ext4_warning(sb, "resize would cause inodes_count overflow"); | ||
1887 | return -EINVAL; | ||
1888 | } | ||
1883 | ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); | 1889 | ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); |
1884 | 1890 | ||
1885 | n_desc_blocks = num_desc_blocks(sb, n_group + 1); | 1891 | n_desc_blocks = num_desc_blocks(sb, n_group + 1); |
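
The new guard refuses any resize whose last group number would push the 32-bit on-disk inode count past 2^32 - 1. The check in isolation, with an illustrative inode count per group:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t inodes_per_group = 8192;   /* illustrative, a common default */
	uint64_t n_group = 600000;          /* group of the last new block */

	if (n_group > 0xFFFFFFFFUL / inodes_per_group)
		puts("resize would cause inodes_count overflow");  /* -EINVAL */
	else
		puts("inode count still fits in 32 bits");
	return 0;
}

With 8192 inodes per group the guard trips once the last group number exceeds 524287, on the order of a 64 TiB filesystem at 4 KiB blocks and 32768 blocks per group.
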
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5d6d53578124..dbc7c090c13a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); | |||
81 | static void ext4_destroy_lazyinit_thread(void); | 81 | static void ext4_destroy_lazyinit_thread(void); |
82 | static void ext4_unregister_li_request(struct super_block *sb); | 82 | static void ext4_unregister_li_request(struct super_block *sb); |
83 | static void ext4_clear_request_list(void); | 83 | static void ext4_clear_request_list(void); |
84 | static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); | ||
84 | 85 | ||
85 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) | 86 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) |
86 | static struct file_system_type ext2_fs_type = { | 87 | static struct file_system_type ext2_fs_type = { |
@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) | |||
353 | struct super_block *sb = journal->j_private; | 354 | struct super_block *sb = journal->j_private; |
354 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 355 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
355 | int error = is_journal_aborted(journal); | 356 | int error = is_journal_aborted(journal); |
356 | struct ext4_journal_cb_entry *jce, *tmp; | 357 | struct ext4_journal_cb_entry *jce; |
357 | 358 | ||
359 | BUG_ON(txn->t_state == T_FINISHED); | ||
358 | spin_lock(&sbi->s_md_lock); | 360 | spin_lock(&sbi->s_md_lock); |
359 | list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { | 361 | while (!list_empty(&txn->t_private_list)) { |
362 | jce = list_entry(txn->t_private_list.next, | ||
363 | struct ext4_journal_cb_entry, jce_list); | ||
360 | list_del_init(&jce->jce_list); | 364 | list_del_init(&jce->jce_list); |
361 | spin_unlock(&sbi->s_md_lock); | 365 | spin_unlock(&sbi->s_md_lock); |
362 | jce->jce_func(sb, jce, error); | 366 | jce->jce_func(sb, jce, error); |
@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | |||
1948 | if ((sbi->s_es->s_feature_ro_compat & | 1952 | if ((sbi->s_es->s_feature_ro_compat & |
1949 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { | 1953 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { |
1950 | /* Use new metadata_csum algorithm */ | 1954 | /* Use new metadata_csum algorithm */ |
1951 | __u16 old_csum; | 1955 | __le16 save_csum; |
1952 | __u32 csum32; | 1956 | __u32 csum32; |
1953 | 1957 | ||
1954 | old_csum = gdp->bg_checksum; | 1958 | save_csum = gdp->bg_checksum; |
1955 | gdp->bg_checksum = 0; | 1959 | gdp->bg_checksum = 0; |
1956 | csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, | 1960 | csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, |
1957 | sizeof(le_group)); | 1961 | sizeof(le_group)); |
1958 | csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, | 1962 | csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, |
1959 | sbi->s_desc_size); | 1963 | sbi->s_desc_size); |
1960 | gdp->bg_checksum = old_csum; | 1964 | gdp->bg_checksum = save_csum; |
1961 | 1965 | ||
1962 | crc = csum32 & 0xFFFF; | 1966 | crc = csum32 & 0xFFFF; |
1963 | goto out; | 1967 | goto out; |
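
The behaviour here is unchanged: the on-disk checksum field is parked in a local, zeroed for the computation, and restored verbatim. Declaring the local __le16 rather than __u16 keeps the raw little-endian bytes in a little-endian-annotated type, which is what sparse endianness checking wants; the same cleanup appears in the xattr block checksum further down. The idiom in miniature (toy CRC and hypothetical names, not the ext4 functions):

#include <stdint.h>
#include <stddef.h>

typedef uint16_t le16;                  /* models the kernel's __le16 */

struct gdesc { le16 bg_checksum; le16 bg_flags; };

static uint32_t toy_crc(const void *p, size_t n)  /* stand-in for crc32c */
{
	const unsigned char *b = p;
	uint32_t c = 0;
	while (n--)
		c = c * 31 + *b++;
	return c;
}

static uint16_t desc_csum(struct gdesc *g)
{
	le16 save_csum = g->bg_checksum;  /* raw disk bytes, never swapped */
	uint32_t csum32;

	g->bg_checksum = 0;               /* checksum computed over zeroed field */
	csum32 = toy_crc(g, sizeof(*g));
	g->bg_checksum = save_csum;       /* restore verbatim */
	return csum32 & 0xFFFF;
}
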
@@ -2379,17 +2383,15 @@ struct ext4_attr { | |||
2379 | int offset; | 2383 | int offset; |
2380 | }; | 2384 | }; |
2381 | 2385 | ||
2382 | static int parse_strtoul(const char *buf, | 2386 | static int parse_strtoull(const char *buf, |
2383 | unsigned long max, unsigned long *value) | 2387 | unsigned long long max, unsigned long long *value) |
2384 | { | 2388 | { |
2385 | char *endp; | 2389 | int ret; |
2386 | |||
2387 | *value = simple_strtoul(skip_spaces(buf), &endp, 0); | ||
2388 | endp = skip_spaces(endp); | ||
2389 | if (*endp || *value > max) | ||
2390 | return -EINVAL; | ||
2391 | 2390 | ||
2392 | return 0; | 2391 | ret = kstrtoull(skip_spaces(buf), 0, value); |
2392 | if (!ret && *value > max) | ||
2393 | ret = -EINVAL; | ||
2394 | return ret; | ||
2393 | } | 2395 | } |
2394 | 2396 | ||
2395 | static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, | 2397 | static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, |
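
kstrtoull() subsumes everything the removed endp dance did by hand: it fails on trailing garbage and on overflow (and tolerates the single trailing newline that sysfs writes usually carry), leaving only the range check to the caller. A userspace analogue built on strtoull:

#include <errno.h>
#include <stdlib.h>

static int parse_ull(const char *buf, unsigned long long max,
		     unsigned long long *value)
{
	char *end;

	errno = 0;
	*value = strtoull(buf, &end, 0);
	if (errno || end == buf)
		return -EINVAL;            /* overflow, or no digits at all */
	if (*end == '\n')
		end++;                     /* allow the newline sysfs appends */
	if (*end || *value > max)
		return -EINVAL;
	return 0;
}
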
@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
2431 | const char *buf, size_t count) | 2433 | const char *buf, size_t count) |
2432 | { | 2434 | { |
2433 | unsigned long t; | 2435 | unsigned long t; |
2436 | int ret; | ||
2434 | 2437 | ||
2435 | if (parse_strtoul(buf, 0x40000000, &t)) | 2438 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
2436 | return -EINVAL; | 2439 | if (ret) |
2440 | return ret; | ||
2437 | 2441 | ||
2438 | if (t && !is_power_of_2(t)) | 2442 | if (t && (!is_power_of_2(t) || t > 0x40000000)) |
2439 | return -EINVAL; | 2443 | return -EINVAL; |
2440 | 2444 | ||
2441 | sbi->s_inode_readahead_blks = t; | 2445 | sbi->s_inode_readahead_blks = t; |
@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, | |||
2456 | { | 2460 | { |
2457 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2461 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); |
2458 | unsigned long t; | 2462 | unsigned long t; |
2463 | int ret; | ||
2459 | 2464 | ||
2460 | if (parse_strtoul(buf, 0xffffffff, &t)) | 2465 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
2461 | return -EINVAL; | 2466 | if (ret) |
2467 | return ret; | ||
2462 | *ui = t; | 2468 | *ui = t; |
2463 | return count; | 2469 | return count; |
2464 | } | 2470 | } |
2465 | 2471 | ||
2472 | static ssize_t reserved_clusters_show(struct ext4_attr *a, | ||
2473 | struct ext4_sb_info *sbi, char *buf) | ||
2474 | { | ||
2475 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
2476 | (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); | ||
2477 | } | ||
2478 | |||
2479 | static ssize_t reserved_clusters_store(struct ext4_attr *a, | ||
2480 | struct ext4_sb_info *sbi, | ||
2481 | const char *buf, size_t count) | ||
2482 | { | ||
2483 | unsigned long long val; | ||
2484 | int ret; | ||
2485 | |||
2486 | if (parse_strtoull(buf, -1ULL, &val)) | ||
2487 | return -EINVAL; | ||
2488 | ret = ext4_reserve_clusters(sbi, val); | ||
2489 | |||
2490 | return ret ? ret : count; | ||
2491 | } | ||
2492 | |||
2466 | static ssize_t trigger_test_error(struct ext4_attr *a, | 2493 | static ssize_t trigger_test_error(struct ext4_attr *a, |
2467 | struct ext4_sb_info *sbi, | 2494 | struct ext4_sb_info *sbi, |
2468 | const char *buf, size_t count) | 2495 | const char *buf, size_t count) |
@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | |||
2500 | EXT4_RO_ATTR(delayed_allocation_blocks); | 2527 | EXT4_RO_ATTR(delayed_allocation_blocks); |
2501 | EXT4_RO_ATTR(session_write_kbytes); | 2528 | EXT4_RO_ATTR(session_write_kbytes); |
2502 | EXT4_RO_ATTR(lifetime_write_kbytes); | 2529 | EXT4_RO_ATTR(lifetime_write_kbytes); |
2530 | EXT4_RW_ATTR(reserved_clusters); | ||
2503 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, | 2531 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, |
2504 | inode_readahead_blks_store, s_inode_readahead_blks); | 2532 | inode_readahead_blks_store, s_inode_readahead_blks); |
2505 | EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); | 2533 | EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); |
@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = { | |||
2517 | ATTR_LIST(delayed_allocation_blocks), | 2545 | ATTR_LIST(delayed_allocation_blocks), |
2518 | ATTR_LIST(session_write_kbytes), | 2546 | ATTR_LIST(session_write_kbytes), |
2519 | ATTR_LIST(lifetime_write_kbytes), | 2547 | ATTR_LIST(lifetime_write_kbytes), |
2548 | ATTR_LIST(reserved_clusters), | ||
2520 | ATTR_LIST(inode_readahead_blks), | 2549 | ATTR_LIST(inode_readahead_blks), |
2521 | ATTR_LIST(inode_goal), | 2550 | ATTR_LIST(inode_goal), |
2522 | ATTR_LIST(mb_stats), | 2551 | ATTR_LIST(mb_stats), |
@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb) | |||
3192 | return 0; | 3221 | return 0; |
3193 | } | 3222 | } |
3194 | 3223 | ||
3224 | |||
3225 | static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) | ||
3226 | { | ||
3227 | ext4_fsblk_t resv_clusters; | ||
3228 | |||
3229 | /* | ||
3230 | * By default we reserve 2% or 4096 clusters, whichever is smaller. | ||
3231 | * This should cover the situations where we cannot afford to run | ||
3232 | * out of space, such as punching a hole or converting | ||
3233 | * uninitialized extents in the delalloc path. In most cases such | ||
3234 | * an allocation requires only one or two blocks; higher numbers are | ||
3235 | * very rare. | ||
3236 | */ | ||
3237 | resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; | ||
3238 | |||
3239 | do_div(resv_clusters, 50); | ||
3240 | resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); | ||
3241 | |||
3242 | return resv_clusters; | ||
3243 | } | ||
3244 | |||
3245 | |||
3246 | static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) | ||
3247 | { | ||
3248 | ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> | ||
3249 | sbi->s_cluster_bits; | ||
3250 | |||
3251 | if (count >= clusters) | ||
3252 | return -EINVAL; | ||
3253 | |||
3254 | atomic64_set(&sbi->s_resv_clusters, count); | ||
3255 | return 0; | ||
3256 | } | ||
3257 | |||
3195 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 3258 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
3196 | { | 3259 | { |
3197 | char *orig_data = kstrdup(data, GFP_KERNEL); | 3260 | char *orig_data = kstrdup(data, GFP_KERNEL); |
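
The default pool is two percent of the cluster count, capped at 4096 clusters; on a 4 KiB-cluster filesystem the cap is 16 MiB and is reached at roughly 800 MiB of filesystem size. The arithmetic stands alone:

#include <stdint.h>
#include <stdio.h>

static uint64_t calc_resv_clusters(uint64_t clusters)
{
	uint64_t resv = clusters / 50;        /* 2% */
	return resv < 4096 ? resv : 4096;     /* min(2%, 4096) */
}

int main(void)
{
	/* 100 MiB of 4 KiB clusters -> 512 reserved; a large fs hits the cap */
	printf("%llu\n", (unsigned long long)calc_resv_clusters(25600));
	printf("%llu\n", (unsigned long long)calc_resv_clusters(1ULL << 30));
	return 0;
}

ext4_reserve_clusters() itself only insists that the requested count stays below the filesystem's cluster total, so the sysfs knob can raise or lower the pool freely within that bound.
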
@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3526 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); | 3589 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); |
3527 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); | 3590 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); |
3528 | 3591 | ||
3592 | /* Do we have the standard group size of blocksize * 8 blocks? */ | ||
3593 | if (sbi->s_blocks_per_group == blocksize << 3) | ||
3594 | set_opt2(sb, STD_GROUP_SIZE); | ||
3595 | |||
3529 | for (i = 0; i < 4; i++) | 3596 | for (i = 0; i < 4; i++) |
3530 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 3597 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
3531 | sbi->s_def_hash_version = es->s_def_hash_version; | 3598 | sbi->s_def_hash_version = es->s_def_hash_version; |
@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3698 | sbi->s_err_report.function = print_daily_error_info; | 3765 | sbi->s_err_report.function = print_daily_error_info; |
3699 | sbi->s_err_report.data = (unsigned long) sb; | 3766 | sbi->s_err_report.data = (unsigned long) sb; |
3700 | 3767 | ||
3768 | /* Register extent status tree shrinker */ | ||
3769 | ext4_es_register_shrinker(sb); | ||
3770 | |||
3701 | err = percpu_counter_init(&sbi->s_freeclusters_counter, | 3771 | err = percpu_counter_init(&sbi->s_freeclusters_counter, |
3702 | ext4_count_free_clusters(sb)); | 3772 | ext4_count_free_clusters(sb)); |
3703 | if (!err) { | 3773 | if (!err) { |
@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3723 | sbi->s_max_writeback_mb_bump = 128; | 3793 | sbi->s_max_writeback_mb_bump = 128; |
3724 | sbi->s_extent_max_zeroout_kb = 32; | 3794 | sbi->s_extent_max_zeroout_kb = 32; |
3725 | 3795 | ||
3726 | /* Register extent status tree shrinker */ | ||
3727 | ext4_es_register_shrinker(sb); | ||
3728 | |||
3729 | /* | 3796 | /* |
3730 | * set up enough so that it can read an inode | 3797 | * set up enough so that it can read an inode |
3731 | */ | 3798 | */ |
@@ -3911,6 +3978,13 @@ no_journal: | |||
3911 | "available"); | 3978 | "available"); |
3912 | } | 3979 | } |
3913 | 3980 | ||
3981 | err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); | ||
3982 | if (err) { | ||
3983 | ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " | ||
3984 | "reserved pool", ext4_calculate_resv_clusters(sbi)); | ||
3985 | goto failed_mount4a; | ||
3986 | } | ||
3987 | |||
3914 | err = ext4_setup_system_zone(sb); | 3988 | err = ext4_setup_system_zone(sb); |
3915 | if (err) { | 3989 | if (err) { |
3916 | ext4_msg(sb, KERN_ERR, "failed to initialize system " | 3990 | ext4_msg(sb, KERN_ERR, "failed to initialize system " |
@@ -4010,6 +4084,7 @@ failed_mount_wq: | |||
4010 | sbi->s_journal = NULL; | 4084 | sbi->s_journal = NULL; |
4011 | } | 4085 | } |
4012 | failed_mount3: | 4086 | failed_mount3: |
4087 | ext4_es_unregister_shrinker(sb); | ||
4013 | del_timer(&sbi->s_err_report); | 4088 | del_timer(&sbi->s_err_report); |
4014 | if (sbi->s_flex_groups) | 4089 | if (sbi->s_flex_groups) |
4015 | ext4_kvfree(sbi->s_flex_groups); | 4090 | ext4_kvfree(sbi->s_flex_groups); |
@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
4177 | goto out_bdev; | 4252 | goto out_bdev; |
4178 | } | 4253 | } |
4179 | journal->j_private = sb; | 4254 | journal->j_private = sb; |
4180 | ll_rw_block(READ, 1, &journal->j_sb_buffer); | 4255 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); |
4181 | wait_on_buffer(journal->j_sb_buffer); | 4256 | wait_on_buffer(journal->j_sb_buffer); |
4182 | if (!buffer_uptodate(journal->j_sb_buffer)) { | 4257 | if (!buffer_uptodate(journal->j_sb_buffer)) { |
4183 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); | 4258 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); |
@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
4742 | struct super_block *sb = dentry->d_sb; | 4817 | struct super_block *sb = dentry->d_sb; |
4743 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4818 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4744 | struct ext4_super_block *es = sbi->s_es; | 4819 | struct ext4_super_block *es = sbi->s_es; |
4745 | ext4_fsblk_t overhead = 0; | 4820 | ext4_fsblk_t overhead = 0, resv_blocks; |
4746 | u64 fsid; | 4821 | u64 fsid; |
4747 | s64 bfree; | 4822 | s64 bfree; |
4823 | resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); | ||
4748 | 4824 | ||
4749 | if (!test_opt(sb, MINIX_DF)) | 4825 | if (!test_opt(sb, MINIX_DF)) |
4750 | overhead = sbi->s_overhead; | 4826 | overhead = sbi->s_overhead; |
@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
4756 | percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); | 4832 | percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); |
4757 | /* prevent underflow in case that few free space is available */ | 4833 | /* prevent underflow in case that few free space is available */ |
4758 | buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); | 4834 | buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); |
4759 | buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); | 4835 | buf->f_bavail = buf->f_bfree - |
4760 | if (buf->f_bfree < ext4_r_blocks_count(es)) | 4836 | (ext4_r_blocks_count(es) + resv_blocks); |
4837 | if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) | ||
4761 | buf->f_bavail = 0; | 4838 | buf->f_bavail = 0; |
4762 | buf->f_files = le32_to_cpu(es->s_inodes_count); | 4839 | buf->f_files = le32_to_cpu(es->s_inodes_count); |
4763 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); | 4840 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); |
@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, | |||
4945 | return PTR_ERR(qf_inode); | 5022 | return PTR_ERR(qf_inode); |
4946 | } | 5023 | } |
4947 | 5024 | ||
5025 | /* Don't account quota for quota files to avoid recursion */ | ||
5026 | qf_inode->i_flags |= S_NOQUOTA; | ||
4948 | err = dquot_enable(qf_inode, type, format_id, flags); | 5027 | err = dquot_enable(qf_inode, type, format_id, flags); |
4949 | iput(qf_inode); | 5028 | iput(qf_inode); |
4950 | 5029 | ||
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a120b277240..c081e34f717f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, | |||
122 | struct ext4_xattr_header *hdr) | 122 | struct ext4_xattr_header *hdr) |
123 | { | 123 | { |
124 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 124 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
125 | __u32 csum, old; | 125 | __u32 csum; |
126 | __le32 save_csum; | ||
127 | __le64 dsk_block_nr = cpu_to_le64(block_nr); | ||
126 | 128 | ||
127 | old = hdr->h_checksum; | 129 | save_csum = hdr->h_checksum; |
128 | hdr->h_checksum = 0; | 130 | hdr->h_checksum = 0; |
129 | block_nr = cpu_to_le64(block_nr); | 131 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, |
130 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, | 132 | sizeof(dsk_block_nr)); |
131 | sizeof(block_nr)); | ||
132 | csum = ext4_chksum(sbi, csum, (__u8 *)hdr, | 133 | csum = ext4_chksum(sbi, csum, (__u8 *)hdr, |
133 | EXT4_BLOCK_SIZE(inode->i_sb)); | 134 | EXT4_BLOCK_SIZE(inode->i_sb)); |
134 | 135 | ||
135 | hdr->h_checksum = old; | 136 | hdr->h_checksum = save_csum; |
136 | return cpu_to_le32(csum); | 137 | return cpu_to_le32(csum); |
137 | } | 138 | } |
138 | 139 | ||
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index aa25deb5c6cd..c767dbdd7fc4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #define EXT4_XATTR_INDEX_LUSTRE 5 | 22 | #define EXT4_XATTR_INDEX_LUSTRE 5 |
23 | #define EXT4_XATTR_INDEX_SECURITY 6 | 23 | #define EXT4_XATTR_INDEX_SECURITY 6 |
24 | #define EXT4_XATTR_INDEX_SYSTEM 7 | 24 | #define EXT4_XATTR_INDEX_SYSTEM 7 |
25 | #define EXT4_XATTR_INDEX_RICHACL 8 | ||
25 | 26 | ||
26 | struct ext4_xattr_header { | 27 | struct ext4_xattr_header { |
27 | __le32 h_magic; /* magic number for identification */ | 28 | __le32 h_magic; /* magic number for identification */ |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
382 | int space_left = 0; | 382 | int space_left = 0; |
383 | int first_tag = 0; | 383 | int first_tag = 0; |
384 | int tag_flag; | 384 | int tag_flag; |
385 | int i, to_free = 0; | 385 | int i; |
386 | int tag_bytes = journal_tag_bytes(journal); | 386 | int tag_bytes = journal_tag_bytes(journal); |
387 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | 387 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
388 | __u32 crc32_sum = ~0; | 388 | __u32 crc32_sum = ~0; |
@@ -1134,7 +1134,7 @@ restart_loop: | |||
1134 | journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; | 1134 | journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; |
1135 | spin_unlock(&journal->j_history_lock); | 1135 | spin_unlock(&journal->j_history_lock); |
1136 | 1136 | ||
1137 | commit_transaction->t_state = T_FINISHED; | 1137 | commit_transaction->t_state = T_COMMIT_CALLBACK; |
1138 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 1138 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
1139 | journal->j_commit_sequence = commit_transaction->t_tid; | 1139 | journal->j_commit_sequence = commit_transaction->t_tid; |
1140 | journal->j_committing_transaction = NULL; | 1140 | journal->j_committing_transaction = NULL; |
@@ -1149,38 +1149,44 @@ restart_loop: | |||
1149 | journal->j_average_commit_time*3) / 4; | 1149 | journal->j_average_commit_time*3) / 4; |
1150 | else | 1150 | else |
1151 | journal->j_average_commit_time = commit_time; | 1151 | journal->j_average_commit_time = commit_time; |
1152 | |||
1152 | write_unlock(&journal->j_state_lock); | 1153 | write_unlock(&journal->j_state_lock); |
1153 | 1154 | ||
1154 | if (commit_transaction->t_checkpoint_list == NULL && | 1155 | if (journal->j_checkpoint_transactions == NULL) { |
1155 | commit_transaction->t_checkpoint_io_list == NULL) { | 1156 | journal->j_checkpoint_transactions = commit_transaction; |
1156 | __jbd2_journal_drop_transaction(journal, commit_transaction); | 1157 | commit_transaction->t_cpnext = commit_transaction; |
1157 | to_free = 1; | 1158 | commit_transaction->t_cpprev = commit_transaction; |
1158 | } else { | 1159 | } else { |
1159 | if (journal->j_checkpoint_transactions == NULL) { | 1160 | commit_transaction->t_cpnext = |
1160 | journal->j_checkpoint_transactions = commit_transaction; | 1161 | journal->j_checkpoint_transactions; |
1161 | commit_transaction->t_cpnext = commit_transaction; | 1162 | commit_transaction->t_cpprev = |
1162 | commit_transaction->t_cpprev = commit_transaction; | 1163 | commit_transaction->t_cpnext->t_cpprev; |
1163 | } else { | 1164 | commit_transaction->t_cpnext->t_cpprev = |
1164 | commit_transaction->t_cpnext = | 1165 | commit_transaction; |
1165 | journal->j_checkpoint_transactions; | 1166 | commit_transaction->t_cpprev->t_cpnext = |
1166 | commit_transaction->t_cpprev = | ||
1167 | commit_transaction->t_cpnext->t_cpprev; | ||
1168 | commit_transaction->t_cpnext->t_cpprev = | ||
1169 | commit_transaction; | ||
1170 | commit_transaction->t_cpprev->t_cpnext = | ||
1171 | commit_transaction; | 1167 | commit_transaction; |
1172 | } | ||
1173 | } | 1168 | } |
1174 | spin_unlock(&journal->j_list_lock); | 1169 | spin_unlock(&journal->j_list_lock); |
1175 | 1170 | /* Drop all spin_locks because commit_callback may block. |
1171 | * __journal_remove_checkpoint() cannot destroy the transaction | ||
1172 | * under us because it is not yet marked T_FINISHED */ | ||
1176 | if (journal->j_commit_callback) | 1173 | if (journal->j_commit_callback) |
1177 | journal->j_commit_callback(journal, commit_transaction); | 1174 | journal->j_commit_callback(journal, commit_transaction); |
1178 | 1175 | ||
1179 | trace_jbd2_end_commit(journal, commit_transaction); | 1176 | trace_jbd2_end_commit(journal, commit_transaction); |
1180 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", | 1177 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", |
1181 | journal->j_commit_sequence, journal->j_tail_sequence); | 1178 | journal->j_commit_sequence, journal->j_tail_sequence); |
1182 | if (to_free) | ||
1183 | jbd2_journal_free_transaction(commit_transaction); | ||
1184 | 1179 | ||
1180 | write_lock(&journal->j_state_lock); | ||
1181 | spin_lock(&journal->j_list_lock); | ||
1182 | commit_transaction->t_state = T_FINISHED; | ||
1183 | /* Recheck checkpoint lists after j_list_lock was dropped */ | ||
1184 | if (commit_transaction->t_checkpoint_list == NULL && | ||
1185 | commit_transaction->t_checkpoint_io_list == NULL) { | ||
1186 | __jbd2_journal_drop_transaction(journal, commit_transaction); | ||
1187 | jbd2_journal_free_transaction(commit_transaction); | ||
1188 | } | ||
1189 | spin_unlock(&journal->j_list_lock); | ||
1190 | write_unlock(&journal->j_state_lock); | ||
1185 | wake_up(&journal->j_wait_done_commit); | 1191 | wake_up(&journal->j_wait_done_commit); |
1186 | } | 1192 | } |
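
The new T_COMMIT_CALLBACK state exists so the commit callback can run with every lock dropped while the checkpoint code is still forbidden from freeing the transaction; only T_FINISHED grants that permission, and it is set, and the checkpoint lists rechecked, after the callback returns. The ordering reduced to a sketch (abridged names, not the jbd2 API):

#include <stdbool.h>
#include <stdlib.h>

enum t_state { T_COMMIT_CALLBACK, T_FINISHED };

struct txn {
	enum t_state state;
	bool on_checkpoint_lists;
};

/* checkpoint side: only a T_FINISHED transaction may be destroyed */
static bool can_drop(const struct txn *t)
{
	return t->state == T_FINISHED && !t->on_checkpoint_lists;
}

static void finish_commit(struct txn *t, void (*callback)(struct txn *))
{
	t->state = T_COMMIT_CALLBACK;   /* parked: callback may block safely */
	if (callback)
		callback(t);            /* runs with no locks held */

	/* the real code retakes j_state_lock and j_list_lock here, then: */
	t->state = T_FINISHED;
	if (can_drop(t))                /* recheck after the unlocked window */
		free(t);
}
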
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8b220f1ab54f..f6c5ba027f4f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -708,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) | |||
708 | } | 708 | } |
709 | 709 | ||
710 | /* | 710 | /* |
711 | * When this function returns, the transaction corresponding to tid | ||
712 | * will be completed. If the transaction is currently running, start | ||
713 | * committing that transaction before waiting for it to complete. If | ||
714 | * the transaction id is stale, it is by definition already completed, | ||
715 | * so just return SUCCESS. | ||
716 | */ | ||
717 | int jbd2_complete_transaction(journal_t *journal, tid_t tid) | ||
718 | { | ||
719 | int need_to_wait = 1; | ||
720 | |||
721 | read_lock(&journal->j_state_lock); | ||
722 | if (journal->j_running_transaction && | ||
723 | journal->j_running_transaction->t_tid == tid) { | ||
724 | if (journal->j_commit_request != tid) { | ||
725 | /* transaction not yet started, so request it */ | ||
726 | read_unlock(&journal->j_state_lock); | ||
727 | jbd2_log_start_commit(journal, tid); | ||
728 | goto wait_commit; | ||
729 | } | ||
730 | } else if (!(journal->j_committing_transaction && | ||
731 | journal->j_committing_transaction->t_tid == tid)) | ||
732 | need_to_wait = 0; | ||
733 | read_unlock(&journal->j_state_lock); | ||
734 | if (!need_to_wait) | ||
735 | return 0; | ||
736 | wait_commit: | ||
737 | return jbd2_log_wait_commit(journal, tid); | ||
738 | } | ||
739 | EXPORT_SYMBOL(jbd2_complete_transaction); | ||
740 | |||
741 | /* | ||
711 | * Log buffer allocation routines: | 742 | * Log buffer allocation routines: |
712 | */ | 743 | */ |
713 | 744 | ||
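
The helper folds three situations into one call: the tid still belongs to the running transaction (start a commit if nobody has, then wait), it belongs to the transaction already committing (just wait), or it is stale (already durable, return success). Its branch structure as a standalone decision table:

#include <stdbool.h>

enum action { START_THEN_WAIT, WAIT_ONLY, ALREADY_DONE };

/* models the branches of jbd2_complete_transaction() above */
static enum action decide(bool tid_is_running, bool commit_requested,
			  bool tid_is_committing)
{
	if (tid_is_running)
		return commit_requested ? WAIT_ONLY : START_THEN_WAIT;
	if (tid_is_committing)
		return WAIT_ONLY;
	return ALREADY_DONE;            /* stale tid: nothing to wait for */
}
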
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 325bc019ed88..10f524c59ea8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) | |||
332 | handle_t *handle = jbd2_alloc_handle(GFP_NOFS); | 332 | handle_t *handle = jbd2_alloc_handle(GFP_NOFS); |
333 | if (!handle) | 333 | if (!handle) |
334 | return NULL; | 334 | return NULL; |
335 | memset(handle, 0, sizeof(*handle)); | ||
336 | handle->h_buffer_credits = nblocks; | 335 | handle->h_buffer_credits = nblocks; |
337 | handle->h_ref = 1; | 336 | handle->h_ref = 1; |
338 | 337 | ||
@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, | |||
640 | int error; | 639 | int error; |
641 | char *frozen_buffer = NULL; | 640 | char *frozen_buffer = NULL; |
642 | int need_copy = 0; | 641 | int need_copy = 0; |
642 | unsigned long start_lock, time_lock; | ||
643 | 643 | ||
644 | if (is_handle_aborted(handle)) | 644 | if (is_handle_aborted(handle)) |
645 | return -EROFS; | 645 | return -EROFS; |
@@ -655,9 +655,16 @@ repeat: | |||
655 | 655 | ||
656 | /* @@@ Need to check for errors here at some point. */ | 656 | /* @@@ Need to check for errors here at some point. */ |
657 | 657 | ||
658 | start_lock = jiffies; | ||
658 | lock_buffer(bh); | 659 | lock_buffer(bh); |
659 | jbd_lock_bh_state(bh); | 660 | jbd_lock_bh_state(bh); |
660 | 661 | ||
662 | /* If it takes too long to lock the buffer, trace it */ | ||
663 | time_lock = jbd2_time_diff(start_lock, jiffies); | ||
664 | if (time_lock > HZ/10) | ||
665 | trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, | ||
666 | jiffies_to_msecs(time_lock)); | ||
667 | |||
661 | /* We now hold the buffer lock so it is safe to query the buffer | 668 | /* We now hold the buffer lock so it is safe to query the buffer |
662 | * state. Is the buffer dirty? | 669 | * state. Is the buffer dirty? |
663 | * | 670 | * |
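
The instrumentation brackets only the lock acquisition and fires the new tracepoint when the wait exceeds HZ/10 jiffies, i.e. 100 ms at any HZ, so the uncontended fast path stays silent. The same shape in userspace terms (pthread mutex and CLOCK_MONOTONIC standing in for the buffer lock and jiffies):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static long ms_since(const struct timespec *t0)
{
	struct timespec t1;

	clock_gettime(CLOCK_MONOTONIC, &t1);
	return (t1.tv_sec - t0->tv_sec) * 1000L +
	       (t1.tv_nsec - t0->tv_nsec) / 1000000L;
}

static void timed_lock(pthread_mutex_t *m)
{
	struct timespec t0;
	long stall;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	pthread_mutex_lock(m);
	stall = ms_since(&t0);
	if (stall > 100)                /* HZ/10 jiffies is always 100 ms */
		fprintf(stderr, "lock stall: %ld ms\n", stall);
}
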
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 4c16c4a88d47..9e52b0626b39 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -34,6 +34,8 @@ enum bh_state_bits { | |||
34 | BH_Write_EIO, /* I/O error on write */ | 34 | BH_Write_EIO, /* I/O error on write */ |
35 | BH_Unwritten, /* Buffer is allocated on disk but not written */ | 35 | BH_Unwritten, /* Buffer is allocated on disk but not written */ |
36 | BH_Quiet, /* Buffer error printks to be quiet */ | 36 | BH_Quiet, /* Buffer error printks to be quiet */ |
37 | BH_Meta, /* Buffer contains metadata */ | ||
38 | BH_Prio, /* Buffer should be submitted with REQ_PRIO */ | ||
37 | 39 | ||
38 | BH_PrivateStart,/* not a state bit, but the first bit available | 40 | BH_PrivateStart,/* not a state bit, but the first bit available |
39 | * for private allocation by other entities | 41 | * for private allocation by other entities |
@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay) | |||
124 | BUFFER_FNS(Boundary, boundary) | 126 | BUFFER_FNS(Boundary, boundary) |
125 | BUFFER_FNS(Write_EIO, write_io_error) | 127 | BUFFER_FNS(Write_EIO, write_io_error) |
126 | BUFFER_FNS(Unwritten, unwritten) | 128 | BUFFER_FNS(Unwritten, unwritten) |
129 | BUFFER_FNS(Meta, meta) | ||
130 | BUFFER_FNS(Prio, prio) | ||
127 | 131 | ||
128 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) | 132 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) |
129 | 133 | ||
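
BUFFER_FNS(Meta, meta) and BUFFER_FNS(Prio, prio) stamp out set_buffer_meta(), clear_buffer_meta(), buffer_meta() and the prio equivalents. A simplified, non-atomic rendering of what such a macro expands to (the kernel versions use atomic bitops on b_state, and the real macro token-pastes the BH_ prefix itself):

struct buffer_head_sketch { unsigned long b_state; };

enum { BH_Meta_sketch, BH_Prio_sketch };   /* illustrative bit positions */

#define BUFFER_FNS_SKETCH(bit, name)                                   \
static inline void set_buffer_##name(struct buffer_head_sketch *bh)   \
{ bh->b_state |= 1UL << (bit); }                                       \
static inline void clear_buffer_##name(struct buffer_head_sketch *bh) \
{ bh->b_state &= ~(1UL << (bit)); }                                    \
static inline int buffer_##name(const struct buffer_head_sketch *bh)  \
{ return (bh->b_state >> (bit)) & 1; }

BUFFER_FNS_SKETCH(BH_Meta_sketch, meta)
BUFFER_FNS_SKETCH(BH_Prio_sketch, prio)
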
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50e5a5e6a712..6e051f472edb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -480,6 +480,7 @@ struct transaction_s | |||
480 | T_COMMIT, | 480 | T_COMMIT, |
481 | T_COMMIT_DFLUSH, | 481 | T_COMMIT_DFLUSH, |
482 | T_COMMIT_JFLUSH, | 482 | T_COMMIT_JFLUSH, |
483 | T_COMMIT_CALLBACK, | ||
483 | T_FINISHED | 484 | T_FINISHED |
484 | } t_state; | 485 | } t_state; |
485 | 486 | ||
@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache; | |||
1144 | 1145 | ||
1145 | static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) | 1146 | static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) |
1146 | { | 1147 | { |
1147 | return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); | 1148 | return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); |
1148 | } | 1149 | } |
1149 | 1150 | ||
1150 | static inline void jbd2_free_handle(handle_t *handle) | 1151 | static inline void jbd2_free_handle(handle_t *handle) |
@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid); | |||
1200 | int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); | 1201 | int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); |
1201 | int jbd2_journal_force_commit_nested(journal_t *journal); | 1202 | int jbd2_journal_force_commit_nested(journal_t *journal); |
1202 | int jbd2_log_wait_commit(journal_t *journal, tid_t tid); | 1203 | int jbd2_log_wait_commit(journal_t *journal, tid_t tid); |
1204 | int jbd2_complete_transaction(journal_t *journal, tid_t tid); | ||
1203 | int jbd2_log_do_checkpoint(journal_t *journal); | 1205 | int jbd2_log_do_checkpoint(journal_t *journal); |
1204 | int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); | 1206 | int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); |
1205 | 1207 | ||
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index c18b46f8aeeb..13a3da25ff07 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h | |||
@@ -31,21 +31,14 @@ struct journal_head { | |||
31 | /* | 31 | /* |
32 | * Journalling list for this buffer [jbd_lock_bh_state()] | 32 | * Journalling list for this buffer [jbd_lock_bh_state()] |
33 | */ | 33 | */ |
34 | unsigned b_jlist; | 34 | unsigned b_jlist:4; |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * This flag signals the buffer has been modified by | 37 | * This flag signals the buffer has been modified by |
38 | * the currently running transaction | 38 | * the currently running transaction |
39 | * [jbd_lock_bh_state()] | 39 | * [jbd_lock_bh_state()] |
40 | */ | 40 | */ |
41 | unsigned b_modified; | 41 | unsigned b_modified:1; |
42 | |||
43 | /* | ||
44 | * This field tracks the last transaction id in which this buffer | ||
45 | * has been cowed | ||
46 | * [jbd_lock_bh_state()] | ||
47 | */ | ||
48 | tid_t b_cow_tid; | ||
49 | 42 | ||
50 | /* | 43 | /* |
51 | * Copy of the buffer data frozen for writing to the log. | 44 | * Copy of the buffer data frozen for writing to the log. |
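
Two of the fields shrink to bitfields (4 bits cover the journalling list values, 1 bit the modified flag) and the b_cow_tid field goes away entirely. A quick standalone measurement of the first effect (illustrative layouts; the exact saving is ABI-dependent and the removed tid_t adds to it):

#include <stdio.h>

struct before { unsigned b_jlist; unsigned b_modified; };
struct after  { unsigned b_jlist:4; unsigned b_modified:1; };

int main(void)
{
	/* typically prints "8 vs 4": both flags now share one word */
	printf("%zu vs %zu\n", sizeof(struct before), sizeof(struct after));
	return 0;
}
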
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 4ee471003859..d0e686402df8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h | |||
@@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, | |||
257 | __entry->pos, __entry->len, __entry->copied) | 257 | __entry->pos, __entry->len, __entry->copied) |
258 | ); | 258 | ); |
259 | 259 | ||
260 | DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, | 260 | DEFINE_EVENT(ext4__write_end, ext4_write_end, |
261 | |||
262 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | ||
263 | unsigned int copied), | ||
264 | |||
265 | TP_ARGS(inode, pos, len, copied) | ||
266 | ); | ||
267 | |||
268 | DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, | ||
269 | 261 | ||
270 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, | 262 | TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, |
271 | unsigned int copied), | 263 | unsigned int copied), |
@@ -1956,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks, | |||
1956 | __entry->to = to; | 1948 | __entry->to = to; |
1957 | __entry->partial = partial_cluster; | 1949 | __entry->partial = partial_cluster; |
1958 | __entry->ee_pblk = ext4_ext_pblock(ex); | 1950 | __entry->ee_pblk = ext4_ext_pblock(ex); |
1959 | __entry->ee_lblk = cpu_to_le32(ex->ee_block); | 1951 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); |
1960 | __entry->ee_len = ext4_ext_get_actual_len(ex); | 1952 | __entry->ee_len = ext4_ext_get_actual_len(ex); |
1961 | ), | 1953 | ), |
1962 | 1954 | ||
@@ -2060,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space, | |||
2060 | 2052 | ||
2061 | TRACE_EVENT(ext4_ext_remove_space_done, | 2053 | TRACE_EVENT(ext4_ext_remove_space_done, |
2062 | TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, | 2054 | TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, |
2063 | ext4_lblk_t partial, unsigned short eh_entries), | 2055 | ext4_lblk_t partial, __le16 eh_entries), |
2064 | 2056 | ||
2065 | TP_ARGS(inode, start, depth, partial, eh_entries), | 2057 | TP_ARGS(inode, start, depth, partial, eh_entries), |
2066 | 2058 | ||
@@ -2079,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done, | |||
2079 | __entry->start = start; | 2071 | __entry->start = start; |
2080 | __entry->depth = depth; | 2072 | __entry->depth = depth; |
2081 | __entry->partial = partial; | 2073 | __entry->partial = partial; |
2082 | __entry->eh_entries = eh_entries; | 2074 | __entry->eh_entries = le16_to_cpu(eh_entries); |
2083 | ), | 2075 | ), |
2084 | 2076 | ||
2085 | TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " | 2077 | TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " |
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 070df49e4a1d..c1d1f3eb242d 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h | |||
@@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock, | |||
358 | MINOR(__entry->dev), __entry->write_op) | 358 | MINOR(__entry->dev), __entry->write_op) |
359 | ); | 359 | ); |
360 | 360 | ||
361 | TRACE_EVENT(jbd2_lock_buffer_stall, | ||
362 | |||
363 | TP_PROTO(dev_t dev, unsigned long stall_ms), | ||
364 | |||
365 | TP_ARGS(dev, stall_ms), | ||
366 | |||
367 | TP_STRUCT__entry( | ||
368 | __field( dev_t, dev ) | ||
369 | __field(unsigned long, stall_ms ) | ||
370 | ), | ||
371 | |||
372 | TP_fast_assign( | ||
373 | __entry->dev = dev; | ||
374 | __entry->stall_ms = stall_ms; | ||
375 | ), | ||
376 | |||
377 | TP_printk("dev %d,%d stall_ms %lu", | ||
378 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
379 | __entry->stall_ms) | ||
380 | ); | ||
381 | |||
361 | #endif /* _TRACE_JBD2_H */ | 382 | #endif /* _TRACE_JBD2_H */ |
362 | 383 | ||
363 | /* This part must be outside protection */ | 384 | /* This part must be outside protection */ |