Diffstat (limited to 'fs/ext4')
 fs/ext4/Kconfig        |  14
 fs/ext4/ext4.h         |  54
 fs/ext4/ext4_extents.h |   7
 fs/ext4/ext4_jbd2.h    |   6
 fs/ext4/extents.c      | 444
 fs/ext4/fsync.c        |   5
 fs/ext4/inode.c        | 578
 fs/ext4/mballoc.c      | 305
 fs/ext4/mballoc.h      |  35
 fs/ext4/migrate.c      |   2
 fs/ext4/move_extent.c  |  20
 fs/ext4/namei.c        |   3
 fs/ext4/super.c        | 130
 13 files changed, 1048 insertions(+), 555 deletions(-)
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
34 renamed from ext4dev to ext4. Unfortunately there are some
35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..984ca0cb38c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
72 * flag field is exported via the tracepoint interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 0x0001 76#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
127 int pages_written; 133 int pages_written;
128 int retval; 134 int retval;
129}; 135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
130 146
131/* 147/*
132 * Special inodes numbers 148 * Special inodes numbers
@@ -347,7 +363,16 @@ struct ext4_new_group_data {
347 /* Call ext4_da_update_reserve_space() after successfully 363 /* Call ext4_da_update_reserve_space() after successfully
348 allocating the blocks */ 364 allocating the blocks */
349#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 365#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
350 366 /* caller is from the direct IO path; request creation of an
367 uninitialized extent if not allocated, and split the uninitialized
368 extent if blocks have been preallocated already */
369#define EXT4_GET_BLOCKS_DIO 0x0010
370#define EXT4_GET_BLOCKS_CONVERT 0x0020
371#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
372 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
373 /* Convert extent to initialized after direct IO completes */
374#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
375 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
351 376
352/* 377/*
353 * ioctl commands 378 * ioctl commands
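To see how the new composite direct IO modes decompose into base flags, here is a minimal user-space sketch. Only EXT4_GET_BLOCKS_DIO and EXT4_GET_BLOCKS_CONVERT come from the hunk above; the base flag values (CREATE, UNINIT_EXT and their combination) are assumed from the surrounding ext4.h of this era.

#include <stdio.h>

#define EXT4_GET_BLOCKS_CREATE			0x0001	/* assumed base value */
#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002	/* assumed base value */
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT | \
						 EXT4_GET_BLOCKS_CREATE)
#define EXT4_GET_BLOCKS_DIO			0x0010	/* from the hunk above */
#define EXT4_GET_BLOCKS_CONVERT			0x0020	/* from the hunk above */
#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO | \
						 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT | \
						 EXT4_GET_BLOCKS_DIO_CREATE_EXT)

int main(void)
{
	/* composed modes, not single bits */
	printf("DIO_CREATE_EXT  = 0x%04x\n", EXT4_GET_BLOCKS_DIO_CREATE_EXT);
	printf("DIO_CONVERT_EXT = 0x%04x\n", EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
	return 0;
}

Because the composite values are full modes rather than single bits, the extents code later in this patch compares flags with == against EXT4_GET_BLOCKS_DIO_CREATE_EXT instead of testing an individual bit.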
@@ -500,8 +525,8 @@ struct move_extent {
500static inline __le32 ext4_encode_extra_time(struct timespec *time) 525static inline __le32 ext4_encode_extra_time(struct timespec *time)
501{ 526{
502 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 527 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
503 time->tv_sec >> 32 : 0) | 528 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
504 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 529 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
505} 530}
506 531
507static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 532static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
509 if (sizeof(time->tv_sec) > 4) 534 if (sizeof(time->tv_sec) > 4)
510 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 535 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
511 << 32; 536 << 32;
512 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 537 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
513} 538}
514 539
515#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 540#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
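The change above matters for post-2038 timestamps: without the EXT4_EPOCH_MASK masking, high bits of tv_sec >> 32 could spill into the nanosecond field, and the literal shift of 2 is now spelled EXT4_EPOCH_BITS. A self-contained round-trip sketch, assuming this era's values (EXT4_EPOCH_BITS = 2, so the low two bits of the extra field extend the epoch and the remaining bits hold nanoseconds):

#include <stdio.h>
#include <stdint.h>

#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK  (~0U << EXT4_EPOCH_BITS)

static uint32_t encode_extra(int64_t sec, uint32_t nsec)
{
	/* epoch bits 0-1 extend tv_sec beyond 2038; nsec lives in bits 2-31 */
	return (uint32_t)((sec >> 32) & EXT4_EPOCH_MASK) |
	       ((nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK);
}

static void decode_extra(uint32_t extra, int64_t *sec, uint32_t *nsec)
{
	*sec |= (int64_t)(extra & EXT4_EPOCH_MASK) << 32;
	*nsec = (extra & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}

int main(void)
{
	int64_t sec = ((int64_t)1 << 32) + 12345;  /* a post-2038 second count */
	uint32_t extra = encode_extra(sec, 999999999);
	int64_t out_sec = (uint32_t)sec;           /* low 32 bits, as stored on disk */
	uint32_t out_nsec;

	decode_extra(extra, &out_sec, &out_nsec);
	printf("sec: %lld -> %lld, nsec: %u\n",
	       (long long)sec, (long long)out_sec, out_nsec);
	return 0;
}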
@@ -672,6 +697,11 @@ struct ext4_inode_info {
672 __u16 i_extra_isize; 697 __u16 i_extra_isize;
673 698
674 spinlock_t i_block_reservation_lock; 699 spinlock_t i_block_reservation_lock;
700
701 /* completed async DIOs that might need unwritten extents handling */
702 struct list_head i_aio_dio_complete_list;
703 /* current io_end structure for async DIO write*/
704 ext4_io_end_t *cur_aio_dio;
675}; 705};
676 706
677/* 707/*
@@ -942,18 +972,11 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 972 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 973 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 974 unsigned int s_mb_group_prealloc;
975 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 976 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 977 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 978 unsigned long s_mb_last_start;
948 979
949 /* history to debug policy */
950 struct ext4_mb_history *s_mb_history;
951 int s_mb_history_cur;
952 int s_mb_history_max;
953 int s_mb_history_num;
954 spinlock_t s_mb_history_lock;
955 int s_mb_history_filter;
956
957 /* stats for buddy allocator */ 980 /* stats for buddy allocator */
958 spinlock_t s_mb_pa_lock; 981 spinlock_t s_mb_pa_lock;
959 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 982 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1003,9 @@ struct ext4_sb_info {
980 1003
981 unsigned int s_log_groups_per_flex; 1004 unsigned int s_log_groups_per_flex;
982 struct flex_groups *s_flex_groups; 1005 struct flex_groups *s_flex_groups;
1006
1007 /* workqueue for dio unwritten */
1008 struct workqueue_struct *dio_unwritten_wq;
983}; 1009};
984 1010
985static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1011static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1397 struct address_space *mapping, loff_t from); 1423 struct address_space *mapping, loff_t from);
1398extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1424extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1399extern qsize_t ext4_get_reserved_space(struct inode *inode); 1425extern qsize_t ext4_get_reserved_space(struct inode *inode);
1400 1426extern int flush_aio_dio_completed_IO(struct inode *inode);
1401/* ioctl.c */ 1427/* ioctl.c */
1402extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1428extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1403extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1429extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
1699extern void ext4_ext_release(struct super_block *); 1725extern void ext4_ext_release(struct super_block *);
1700extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1726extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1701 loff_t len); 1727 loff_t len);
1728extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1729 loff_t len);
1702extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1730extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1703 sector_t block, unsigned int max_blocks, 1731 sector_t block, unsigned int max_blocks,
1704 struct buffer_head *bh, int flags); 1732 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
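The single sentinel handle is replaced by a range check so that the no-journal path can encode a nesting reference count directly in the fake handle value; the refcounting itself lives in a companion change not shown in this hunk. A hypothetical user-space sketch of the idea:

#include <stdio.h>
#include <stdint.h>

typedef struct handle_s handle_t;	/* opaque, as in the kernel */

#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

static int ext4_handle_valid(handle_t *handle)
{
	/* any value below the ceiling is a fake no-journal handle */
	if ((uintptr_t)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
		return 0;
	return 1;
}

/* hypothetical helper: nested "starts" just bump the fake handle value */
static handle_t *nojournal_start(handle_t *h)
{
	return (handle_t *)((uintptr_t)h + 1);	/* NULL -> 1 -> 2 -> ... */
}

int main(void)
{
	handle_t *h = nojournal_start(NULL);	/* outermost start */
	h = nojournal_start(h);			/* nested start */
	printf("journaled? %d\n", ext4_handle_valid(h));	/* prints 0 */
	return 0;
}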
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..10539e364283 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
723 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
725 */ 725 */
726static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
728 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
729{ 729{
@@ -1586,7 +1586,7 @@ out:
1586 */ 1586 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1589 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1590{ 1590{
1591 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1592 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1602 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1603 1603
1604 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext), 1608 ext4_ext_is_uninitialized(newext),
1608 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
1722 1723
1723merge: 1724merge:
1724 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1725 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1726 1728
1727 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1728 1730
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2378 */ 2380 */
2379 2381
2380 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2381 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2382#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2383 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2389 printk(", stats"); 2392 printk(", stats");
2390#endif 2393#endif
2391 printk("\n"); 2394 printk("\n");
2395#endif
2392#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2393 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2490} 2494}
2491 2495
2492#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2493
2494/* 2497/*
2495 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2496 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2583 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2584 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2585 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2586 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2587 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2588 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2589 if (err) 2593 if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2639 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2642 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err) 2649 if (err)
@@ -2757,7 +2761,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2757 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2758 goto out; 2762 goto out;
2759insert: 2763insert:
2760 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2761 if (err == -ENOSPC) { 2765 if (err == -ENOSPC) {
2762 err = ext4_ext_zeroout(inode, &orig_ex); 2766 err = ext4_ext_zeroout(inode, &orig_ex);
2763 if (err) 2767 if (err)
@@ -2785,6 +2789,324 @@ fix_extent_len:
2785} 2789}
2786 2790
2787/* 2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_block_dio_write() when a direct IO write lands in
2794 * an uninitialized extent.
2795 *
2796 * Writing to an uninitialized extent may result in splitting it
2797 * into multiple uninitialized extents (up to three).
2798 * There are three possibilities:
2799 * a> No split is required: the entire extent stays uninitialized
2800 * b> Split into two extents: the write hits either end of the extent
2801 * c> Split into three extents: someone is writing to the middle of the extent
2802 *
2803 * One or more index blocks may be needed if the extent tree grows after
2804 * the uninitialized extent is split. To prevent an ENOSPC at IO
2805 * completion time, we split the uninitialized extent before the IO is
2806 * submitted. The extent will be split into (at most) three
2807 * uninitialized extents. After IO completes, the part that was
2808 * actually written is converted to initialized by the end_io callback,
2809 * via ext4_convert_unwritten_extents().
2810 */
2811static int ext4_split_unwritten_extents(handle_t *handle,
2812 struct inode *inode,
2813 struct ext4_ext_path *path,
2814 ext4_lblk_t iblock,
2815 unsigned int max_blocks,
2816 int flags)
2817{
2818 struct ext4_extent *ex, newex, orig_ex;
2819 struct ext4_extent *ex1 = NULL;
2820 struct ext4_extent *ex2 = NULL;
2821 struct ext4_extent *ex3 = NULL;
2822 struct ext4_extent_header *eh;
2823 ext4_lblk_t ee_block;
2824 unsigned int allocated, ee_len, depth;
2825 ext4_fsblk_t newblock;
2826 int err = 0;
2827 int ret = 0;
2828
2829 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2830 "iblock %llu, max_blocks %u\n", inode->i_ino,
2831 (unsigned long long)iblock, max_blocks);
2832 depth = ext_depth(inode);
2833 eh = path[depth].p_hdr;
2834 ex = path[depth].p_ext;
2835 ee_block = le32_to_cpu(ex->ee_block);
2836 ee_len = ext4_ext_get_actual_len(ex);
2837 allocated = ee_len - (iblock - ee_block);
2838 newblock = iblock - ee_block + ext_pblock(ex);
2839 ex2 = ex;
2840 orig_ex.ee_block = ex->ee_block;
2841 orig_ex.ee_len = cpu_to_le16(ee_len);
2842 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2843
2844 /*
2845 * If the entire uninitialized extent from the write offset to its
2846 * end fits within the range being written, there is no need to
2847 * split the extent.
2848 */
2849 if (allocated <= max_blocks)
2850 return ret;
2851
2852 err = ext4_ext_get_access(handle, inode, path + depth);
2853 if (err)
2854 goto out;
2855 /* ex1: ee_block to iblock - 1 : uninitialized */
2856 if (iblock > ee_block) {
2857 ex1 = ex;
2858 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2859 ext4_ext_mark_uninitialized(ex1);
2860 ex2 = &newex;
2861 }
2862 /*
2863 * for sanity, update the length of the ex2 extent before
2864 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2865 * overlap of blocks.
2866 */
2867 if (!ex1 && allocated > max_blocks)
2868 ex2->ee_len = cpu_to_le16(max_blocks);
2869 /* ex3: to ee_block + ee_len : uninitialised */
2870 if (allocated > max_blocks) {
2871 unsigned int newdepth;
2872 ex3 = &newex;
2873 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2874 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2875 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2876 ext4_ext_mark_uninitialized(ex3);
2877 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2878 if (err == -ENOSPC) {
2879 err = ext4_ext_zeroout(inode, &orig_ex);
2880 if (err)
2881 goto fix_extent_len;
2882 /* update the extent length and mark as initialized */
2883 ex->ee_block = orig_ex.ee_block;
2884 ex->ee_len = orig_ex.ee_len;
2885 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2886 ext4_ext_dirty(handle, inode, path + depth);
2887 /* zeroed the full extent */
2888 /* blocks available from iblock */
2889 return allocated;
2890
2891 } else if (err)
2892 goto fix_extent_len;
2893 /*
2894 * The depth, and hence eh & ex might change
2895 * as part of the insert above.
2896 */
2897 newdepth = ext_depth(inode);
2898 /*
2899 * update the extent length after successful insert of the
2900 * split extent
2901 */
2902 orig_ex.ee_len = cpu_to_le16(ee_len -
2903 ext4_ext_get_actual_len(ex3));
2904 depth = newdepth;
2905 ext4_ext_drop_refs(path);
2906 path = ext4_ext_find_extent(inode, iblock, path);
2907 if (IS_ERR(path)) {
2908 err = PTR_ERR(path);
2909 goto out;
2910 }
2911 eh = path[depth].p_hdr;
2912 ex = path[depth].p_ext;
2913 if (ex2 != &newex)
2914 ex2 = ex;
2915
2916 err = ext4_ext_get_access(handle, inode, path + depth);
2917 if (err)
2918 goto out;
2919
2920 allocated = max_blocks;
2921 }
2922 /*
2923 * If there was a change of depth as part of the
2924 * insertion of ex3 above, we need to update the length
2925 * of the ex1 extent again here
2926 */
2927 if (ex1 && ex1 != ex) {
2928 ex1 = ex;
2929 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2930 ext4_ext_mark_uninitialized(ex1);
2931 ex2 = &newex;
2932 }
2933 /*
2934 * ex2: iblock to iblock + max_blocks - 1 : to be written by direct IO,
2935 * still uninitialized.
2936 */
2937 ex2->ee_block = cpu_to_le32(iblock);
2938 ext4_ext_store_pblock(ex2, newblock);
2939 ex2->ee_len = cpu_to_le16(allocated);
2940 ext4_ext_mark_uninitialized(ex2);
2941 if (ex2 != ex)
2942 goto insert;
2943 /* Mark modified extent as dirty */
2944 err = ext4_ext_dirty(handle, inode, path + depth);
2945 ext_debug("out here\n");
2946 goto out;
2947insert:
2948 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2949 if (err == -ENOSPC) {
2950 err = ext4_ext_zeroout(inode, &orig_ex);
2951 if (err)
2952 goto fix_extent_len;
2953 /* update the extent length and mark as initialized */
2954 ex->ee_block = orig_ex.ee_block;
2955 ex->ee_len = orig_ex.ee_len;
2956 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2957 ext4_ext_dirty(handle, inode, path + depth);
2958 /* zeroed the full extent */
2959 return allocated;
2960 } else if (err)
2961 goto fix_extent_len;
2962out:
2963 ext4_ext_show_leaf(inode, path);
2964 return err ? err : allocated;
2965
2966fix_extent_len:
2967 ex->ee_block = orig_ex.ee_block;
2968 ex->ee_len = orig_ex.ee_len;
2969 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2970 ext4_ext_mark_uninitialized(ex);
2971 ext4_ext_dirty(handle, inode, path + depth);
2972 return err;
2973}
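A pure-arithmetic sketch of the three-way split described in the comment above ext4_split_unwritten_extents(): an uninitialized extent [ee_block, ee_block + ee_len) is cut around a direct IO write of [iblock, iblock + max_blocks). The numbers are illustrative only.

#include <stdio.h>

int main(void)
{
	unsigned ee_block = 100, ee_len = 50;	/* extent covers blocks 100..149 */
	unsigned iblock = 120, max_blocks = 10;	/* write covers blocks 120..129 */
	unsigned allocated = ee_len - (iblock - ee_block);

	if (iblock > ee_block)			/* ex1: head, stays unwritten */
		printf("ex1: [%u, %u)\n", ee_block, iblock);
	/* ex2: the range the DIO will fill, still unwritten until end_io */
	printf("ex2: [%u, %u)\n", iblock, iblock + max_blocks);
	if (allocated > max_blocks)		/* ex3: tail, stays unwritten */
		printf("ex3: [%u, %u)\n", iblock + max_blocks, ee_block + ee_len);
	return 0;
}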
2974static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2975 struct inode *inode,
2976 struct ext4_ext_path *path)
2977{
2978 struct ext4_extent *ex;
2979 struct ext4_extent_header *eh;
2980 int depth;
2981 int err = 0;
2982 int ret = 0;
2983
2984 depth = ext_depth(inode);
2985 eh = path[depth].p_hdr;
2986 ex = path[depth].p_ext;
2987
2988 err = ext4_ext_get_access(handle, inode, path + depth);
2989 if (err)
2990 goto out;
2991 /* first mark the extent as initialized */
2992 ext4_ext_mark_initialized(ex);
2993
2994 /*
2995 * We have to see if it can be merged with the extent
2996 * on the left.
2997 */
2998 if (ex > EXT_FIRST_EXTENT(eh)) {
2999 /*
3000 * To merge left, pass "ex - 1" to try_to_merge(),
3001 * since it merges towards right _only_.
3002 */
3003 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3004 if (ret) {
3005 err = ext4_ext_correct_indexes(handle, inode, path);
3006 if (err)
3007 goto out;
3008 depth = ext_depth(inode);
3009 ex--;
3010 }
3011 }
3012 /*
3013 * Try to merge towards the right.
3014 */
3015 ret = ext4_ext_try_to_merge(inode, path, ex);
3016 if (ret) {
3017 err = ext4_ext_correct_indexes(handle, inode, path);
3018 if (err)
3019 goto out;
3020 depth = ext_depth(inode);
3021 }
3022 /* Mark modified extent as dirty */
3023 err = ext4_ext_dirty(handle, inode, path + depth);
3024out:
3025 ext4_ext_show_leaf(inode, path);
3026 return err;
3027}
3028
3029static int
3030ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3031 ext4_lblk_t iblock, unsigned int max_blocks,
3032 struct ext4_ext_path *path, int flags,
3033 unsigned int allocated, struct buffer_head *bh_result,
3034 ext4_fsblk_t newblock)
3035{
3036 int ret = 0;
3037 int err = 0;
3038 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3039
3040 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3041 "block %llu, max_blocks %u, flags %d, allocated %u",
3042 inode->i_ino, (unsigned long long)iblock, max_blocks,
3043 flags, allocated);
3044 ext4_ext_show_leaf(inode, path);
3045
3046 /* DIO get_block() before submit the IO, split the extent */
3047 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3048 ret = ext4_split_unwritten_extents(handle,
3049 inode, path, iblock,
3050 max_blocks, flags);
3051 /* flag the io_end struct: conversion is needed when IO completes */
3052 if (io)
3053 io->flag = DIO_AIO_UNWRITTEN;
3054 goto out;
3055 }
3056 /* DIO end_io complete, convert the filled extent to written */
3057 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3058 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3059 path);
3060 goto out2;
3061 }
3062 /* buffered IO case */
3063 /*
3064 * repeat fallocate creation request
3065 * we already have an unwritten extent
3066 */
3067 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3068 goto map_out;
3069
3070 /* buffered READ or buffered write_begin() lookup */
3071 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3072 /*
3073 * We have blocks reserved already. We
3074 * return allocated blocks so that delalloc
3075 * won't do block reservation for us. But
3076 * the buffer head will be unmapped so that
3077 * a read from the block returns 0s.
3078 */
3079 set_buffer_unwritten(bh_result);
3080 goto out1;
3081 }
3082
3083 /* buffered write, writepage time, convert*/
3084 ret = ext4_ext_convert_to_initialized(handle, inode,
3085 path, iblock,
3086 max_blocks);
3087out:
3088 if (ret <= 0) {
3089 err = ret;
3090 goto out2;
3091 } else
3092 allocated = ret;
3093 set_buffer_new(bh_result);
3094map_out:
3095 set_buffer_mapped(bh_result);
3096out1:
3097 if (allocated > max_blocks)
3098 allocated = max_blocks;
3099 ext4_ext_show_leaf(inode, path);
3100 bh_result->b_bdev = inode->i_sb->s_bdev;
3101 bh_result->b_blocknr = newblock;
3102out2:
3103 if (path) {
3104 ext4_ext_drop_refs(path);
3105 kfree(path);
3106 }
3107 return err ? err : allocated;
3108}
3109/*
2788 * Block allocation/map/preallocation routine for extents based files 3110 * Block allocation/map/preallocation routine for extents based files
2789 * 3111 *
2790 * 3112 *
@@ -2814,6 +3136,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2814 int err = 0, depth, ret, cache_type; 3136 int err = 0, depth, ret, cache_type;
2815 unsigned int allocated = 0; 3137 unsigned int allocated = 0;
2816 struct ext4_allocation_request ar; 3138 struct ext4_allocation_request ar;
3139 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2817 3140
2818 __clear_bit(BH_New, &bh_result->b_state); 3141 __clear_bit(BH_New, &bh_result->b_state);
2819 ext_debug("blocks %u/%u requested for inode %lu\n", 3142 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2889 EXT4_EXT_CACHE_EXTENT); 3212 EXT4_EXT_CACHE_EXTENT);
2890 goto out; 3213 goto out;
2891 } 3214 }
2892 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3215 ret = ext4_ext_handle_uninitialized_extents(handle,
2893 goto out; 3216 inode, iblock, max_blocks, path,
2894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3217 flags, allocated, bh_result, newblock);
2895 if (allocated > max_blocks) 3218 return ret;
2896 allocated = max_blocks;
2897 /*
2898 * We have blocks reserved already. We
2899 * return allocated blocks so that delalloc
2900 * won't do block reservation for us. But
2901 * the buffer head will be unmapped so that
2902 * a read from the block returns 0s.
2903 */
2904 set_buffer_unwritten(bh_result);
2905 bh_result->b_bdev = inode->i_sb->s_bdev;
2906 bh_result->b_blocknr = newblock;
2907 goto out2;
2908 }
2909
2910 ret = ext4_ext_convert_to_initialized(handle, inode,
2911 path, iblock,
2912 max_blocks);
2913 if (ret <= 0) {
2914 err = ret;
2915 goto out2;
2916 } else
2917 allocated = ret;
2918 goto outnew;
2919 } 3219 }
2920 } 3220 }
2921 3221
@@ -2986,9 +3286,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2986 /* try to insert new extent into found leaf and return */ 3286 /* try to insert new extent into found leaf and return */
2987 ext4_ext_store_pblock(&newex, newblock); 3287 ext4_ext_store_pblock(&newex, newblock);
2988 newex.ee_len = cpu_to_le16(ar.len); 3288 newex.ee_len = cpu_to_le16(ar.len);
2989 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3289 /* Mark uninitialized */
3290 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
2990 ext4_ext_mark_uninitialized(&newex); 3291 ext4_ext_mark_uninitialized(&newex);
2991 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3292 /*
3293 * An io_end structure is created for every async
3294 * direct IO write to the middle of the file.
3295 * To avoid an unnecessary conversion for every AIO DIO
3296 * rewrite to the middle of the file, here we flag only the IO
3297 * that really needs the conversion.
3298 *
3299 */
3300 if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
3301 io->flag = DIO_AIO_UNWRITTEN;
3302 }
3303 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2992 if (err) { 3304 if (err) {
2993 /* free data blocks we just allocated */ 3305 /* free data blocks we just allocated */
2994 /* not a good idea to call discard here directly, 3306 /* not a good idea to call discard here directly,
@@ -3002,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3002 /* previous routine could use block we allocated */ 3314 /* previous routine could use block we allocated */
3003 newblock = ext_pblock(&newex); 3315 newblock = ext_pblock(&newex);
3004 allocated = ext4_ext_get_actual_len(&newex); 3316 allocated = ext4_ext_get_actual_len(&newex);
3005outnew:
3006 set_buffer_new(bh_result); 3317 set_buffer_new(bh_result);
3007 3318
3008 /* Cache only when it is _not_ an uninitialized extent */ 3319 /* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3512,63 @@ retry:
3201} 3512}
3202 3513
3203/* 3514/*
3515 * This function converts a range of blocks to written extents.
3516 * The caller passes the start offset and the size; all unwritten
3517 * extents within this range will be converted to
3518 * written extents.
3519 *
3520 * This function is called from the direct IO end_io callback
3521 * function, to convert the fallocated extents after IO is completed.
3522 */
3523int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3524 loff_t len)
3525{
3526 handle_t *handle;
3527 ext4_lblk_t block;
3528 unsigned int max_blocks;
3529 int ret = 0;
3530 int ret2 = 0;
3531 struct buffer_head map_bh;
3532 unsigned int credits, blkbits = inode->i_blkbits;
3533
3534 block = offset >> blkbits;
3535 /*
3536 * We can't just convert len to max_blocks, because the range may
3537 * not be block aligned: e.g. blocksize = 4096, offset = 3072, len = 2048
3538 */
3539 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3540 - block;
3541 /*
3542 * credits to insert 1 extent into extent tree
3543 */
3544 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3545 while (ret >= 0 && ret < max_blocks) {
3546 block = block + ret;
3547 max_blocks = max_blocks - ret;
3548 handle = ext4_journal_start(inode, credits);
3549 if (IS_ERR(handle)) {
3550 ret = PTR_ERR(handle);
3551 break;
3552 }
3553 map_bh.b_state = 0;
3554 ret = ext4_get_blocks(handle, inode, block,
3555 max_blocks, &map_bh,
3556 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3557 if (ret <= 0) {
3558 WARN_ON(ret <= 0);
3559 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3560 "returned error inode#%lu, block=%u, "
3561 "max_blocks=%u", __func__,
3562 inode->i_ino, block, max_blocks);
3563 }
3564 ext4_mark_inode_dirty(handle, inode);
3565 ret2 = ext4_journal_stop(handle);
3566 if (ret <= 0 || ret2 )
3567 break;
3568 }
3569 return ret > 0 ? ret2 : ret;
3570}
3571/*
3204 * Callback function called for each extent to gather FIEMAP information. 3572 * Callback function called for each extent to gather FIEMAP information.
3205 */ 3573 */
3206static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3574static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
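A worked example of the block-range computation in ext4_convert_unwritten_extents() above: with a 4096-byte block size, offset = 3072 and len = 2048 span bytes [3072, 5120), which touches two blocks even though len is only half a block. The manual round-up below stands in for EXT4_BLOCK_ALIGN().

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned blkbits = 12;			/* 4096-byte blocks */
	uint64_t offset = 3072, len = 2048;
	uint64_t blocksize = 1ULL << blkbits;
	uint64_t block = offset >> blkbits;	/* first block: 0 */
	/* round the end of the byte range up to a block boundary, then count */
	uint64_t max_blocks = ((offset + len + blocksize - 1) >> blkbits) - block;

	/* a naive len >> blkbits would give 0 and convert nothing */
	printf("block=%llu max_blocks=%llu\n",
	       (unsigned long long)block, (unsigned long long)max_blocks);
	return 0;
}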
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -1145,6 +1146,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1146}
1146 1147
1147/* 1148/*
1149 * Return the number of contiguous dirty pages in a given inode
1150 * starting at page frame idx.
1151 */
1152static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1153 unsigned int max_pages)
1154{
1155 struct address_space *mapping = inode->i_mapping;
1156 pgoff_t index;
1157 struct pagevec pvec;
1158 pgoff_t num = 0;
1159 int i, nr_pages, done = 0;
1160
1161 if (max_pages == 0)
1162 return 0;
1163 pagevec_init(&pvec, 0);
1164 while (!done) {
1165 index = idx;
1166 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1167 PAGECACHE_TAG_DIRTY,
1168 (pgoff_t)PAGEVEC_SIZE);
1169 if (nr_pages == 0)
1170 break;
1171 for (i = 0; i < nr_pages; i++) {
1172 struct page *page = pvec.pages[i];
1173 struct buffer_head *bh, *head;
1174
1175 lock_page(page);
1176 if (unlikely(page->mapping != mapping) ||
1177 !PageDirty(page) ||
1178 PageWriteback(page) ||
1179 page->index != idx) {
1180 done = 1;
1181 unlock_page(page);
1182 break;
1183 }
1184 if (page_has_buffers(page)) {
1185 bh = head = page_buffers(page);
1186 do {
1187 if (!buffer_delay(bh) &&
1188 !buffer_unwritten(bh))
1189 done = 1;
1190 bh = bh->b_this_page;
1191 } while (!done && (bh != head));
1192 }
1193 unlock_page(page);
1194 if (done)
1195 break;
1196 idx++;
1197 num++;
1198 if (num >= max_pages)
1199 break;
1200 }
1201 pagevec_release(&pvec);
1202 }
1203 return num;
1204}
1205
1206/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1207 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1208 * and returns if the blocks are already mapped.
1150 * 1209 *
@@ -1175,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1234 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1235 clear_buffer_unwritten(bh);
1177 1236
1237 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1238 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1239 (unsigned long)block);
1178 /* 1240 /*
1179 * Try to see if we can get the block without requesting a new 1241 * Try to see if we can get the block without requesting a new
1180 * file system block. 1242 * file system block.
@@ -1796,11 +1858,11 @@ repeat:
1796 1858
1797 if (ext4_claim_free_blocks(sbi, total)) { 1859 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1860 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1861 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1862 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1863 yield();
1801 goto repeat; 1864 goto repeat;
1802 } 1865 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1866 return -ENOSPC;
1805 } 1867 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1868 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2154static void ext4_print_free_blocks(struct inode *inode)
2093{ 2155{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2156 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2157 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2158 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2159 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2160 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2161 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2162 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2163 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2164 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2165 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2166 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2167 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2168 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2169 return;
2108} 2170}
2109 2171
@@ -2189,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2251 * writepage and writepages will again try to write
2190 * the same. 2252 * the same.
2191 */ 2253 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2254 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2255 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2256 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2257 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2258 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2259 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2260 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2261 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2262 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2263 ext4_print_free_blocks(mpd->inode);
2202 } 2264 }
@@ -2743,8 +2805,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2805 int no_nrwrite_index_update;
2744 int pages_written = 0; 2806 int pages_written = 0;
2745 long pages_skipped; 2807 long pages_skipped;
2808 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2809 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2810 int needed_blocks, ret = 0;
2811 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2812 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2813 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2814
@@ -2771,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2835 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2836 return -EROFS;
2773 2837
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2838 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2839 range_whole = 1;
2786 2840
@@ -2795,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2849 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2850 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2851
2852 /*
2853 * This works around two forms of stupidity. The first is in
2854 * the writeback code, which caps the maximum number of pages
2855 * written to be 1024 pages. This is wrong on multiple
2856 * levels; different architectures have a different page size,
2857 * which changes the maximum amount of data which gets
2858 * written. Secondly, 4 megabytes is way too small. XFS
2859 * forces this value to be 16 megabytes by multiplying
2860 * nr_to_write parameter by four, and then relies on its
2861 * allocator to allocate larger extents to make them
2862 * contiguous. Unfortunately this brings us to the second
2863 * stupidity, which is that ext4's mballoc code only allocates
2864 * at most 2048 blocks. So we force contiguous writes up to
2865 * the number of dirty blocks in the inode, or
2866 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2867 */
2868 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2869 if (!range_cyclic && range_whole)
2870 desired_nr_to_write = wbc->nr_to_write * 8;
2871 else
2872 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2873 max_pages);
2874 if (desired_nr_to_write > max_pages)
2875 desired_nr_to_write = max_pages;
2876
2877 if (wbc->nr_to_write < desired_nr_to_write) {
2878 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2879 wbc->nr_to_write = desired_nr_to_write;
2880 }
2881
2798 mpd.wbc = wbc; 2882 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2883 mpd.inode = mapping->host;
2800 2884
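Back-of-envelope numbers for the bump above, assuming 4 KiB pages and an s_max_writeback_mb_bump of 128 (the default this patch appears to set in super.c per the diffstat; treated as an assumption here):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;		/* 4 KiB pages */
	unsigned max_writeback_mb_bump = 128;	/* assumed default */
	long nr_to_write = 1024;		/* the old writeback cap */
	long max_pages = (long)max_writeback_mb_bump << (20 - page_shift);
	long desired = nr_to_write * 8;		/* whole-file, non-cyclic case */

	if (desired > max_pages)
		desired = max_pages;
	if (nr_to_write < desired)
		printf("bump nr_to_write: %ld -> %ld pages (cap %ld)\n",
		       nr_to_write, desired, max_pages);
	return 0;
}

With these numbers the cap works out to 32768 pages (128 MiB), and a default 1024-page request is bumped to 8192 pages, so mballoc can allocate larger contiguous extents per writeback pass.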
@@ -2822,10 +2906,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2906 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2907 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2908 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2909 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2910 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2911 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2912 goto out_writepages;
2830 } 2913 }
2831 2914
@@ -2897,9 +2980,10 @@ retry:
2897 goto retry; 2980 goto retry;
2898 } 2981 }
2899 if (pages_skipped != wbc->pages_skipped) 2982 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2983 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2984 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2985 "with nr_to_write = %ld ret = %d\n",
2986 __func__, wbc->nr_to_write, ret);
2903 2987
2904 /* Update index */ 2988 /* Update index */
2905 index += pages_written; 2989 index += pages_written;
@@ -2914,7 +2998,8 @@ retry:
2914out_writepages: 2998out_writepages:
2915 if (!no_nrwrite_index_update) 2999 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3000 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3001 if (wbc->nr_to_write > nr_to_writebump)
3002 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3003 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3004 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3005 return ret;
@@ -3272,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3357}
3273 3358
3274/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3280,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3369 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3372 unsigned long nr_segs)
3286{ 3373{
@@ -3291,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3378 ssize_t ret;
3292 int orphan = 0; 3379 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3380 size_t count = iov_length(iov, nr_segs);
3381 int retries = 0;
3294 3382
3295 if (rw == WRITE) { 3383 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3384 loff_t final_size = offset + count;
@@ -3313,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3401 }
3314 } 3402 }
3315 3403
3404retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3405 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3406 offset, nr_segs,
3318 ext4_get_block, NULL); 3407 ext4_get_block, NULL);
3408 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3409 goto retry;
3319 3410
3320 if (orphan) { 3411 if (orphan) {
3321 int err; 3412 int err;
@@ -3354,6 +3445,359 @@ out:
3354 return ret; 3445 return ret;
3355} 3446}
3356 3447
3448/* Maximum number of blocks we map for direct IO at once. */
3449
3450static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 struct buffer_head *bh_result, int create)
3452{
3453 handle_t *handle = NULL;
3454 int ret = 0;
3455 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3456 int dio_credits;
3457
3458 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3459 inode->i_ino, create);
3460 /*
3461 * The DIO VFS code passes create = 0 for writes to
3462 * the middle of the file. It does this to avoid block
3463 * allocation for holes, so that stale data is not exposed
3464 * to a parallel buffered read (which does not hold the
3465 * i_mutex lock) while the direct IO write has not yet
3466 * completed. A DIO request on a hole therefore finally falls
3467 * back to buffered IO.
3468 *
3469 * For an ext4 extent-based file, since we support fallocate,
3470 * newly allocated extents are marked uninitialized; for holes,
3471 * we can likewise allocate blocks, so a parallel
3472 * buffered read of the hole sees zeroed pages while the
3473 * parallel DIO write to the hole has not yet completed.
3474 *
3475 * When we get here, we know it's a direct IO write to
3476 * the middle of the file (< i_size),
3477 * so it's safe to override the create flag from the VFS.
3478 */
3479 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3480
3481 if (max_blocks > DIO_MAX_BLOCKS)
3482 max_blocks = DIO_MAX_BLOCKS;
3483 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 handle = ext4_journal_start(inode, dio_credits);
3485 if (IS_ERR(handle)) {
3486 ret = PTR_ERR(handle);
3487 goto out;
3488 }
3489 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3490 create);
3491 if (ret > 0) {
3492 bh_result->b_size = (ret << inode->i_blkbits);
3493 ret = 0;
3494 }
3495 ext4_journal_stop(handle);
3496out:
3497 return ret;
3498}
3499
3500static void ext4_free_io_end(ext4_io_end_t *io)
3501{
3502 BUG_ON(!io);
3503 iput(io->inode);
3504 kfree(io);
3505}
3506static void dump_aio_dio_list(struct inode * inode)
3507{
3508#ifdef EXT4_DEBUG
3509 struct list_head *cur, *before, *after;
3510 ext4_io_end_t *io, *io0, *io1;
3511
3512 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3513 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3514 return;
3515 }
3516
3517 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3518 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3519 cur = &io->list;
3520 before = cur->prev;
3521 io0 = container_of(before, ext4_io_end_t, list);
3522 after = cur->next;
3523 io1 = container_of(after, ext4_io_end_t, list);
3524
3525 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3526 io, inode->i_ino, io0, io1);
3527 }
3528#endif
3529}
3530
3531/*
3532 * check a range of space and convert unwritten extents to written.
3533 */
3534static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3535{
3536 struct inode *inode = io->inode;
3537 loff_t offset = io->offset;
3538 size_t size = io->size;
3539 int ret = 0;
3540
3541 ext4_debug("end_aio_dio_nolock: io 0x%p from inode %lu, list->next 0x%p,"
3542 " list->prev 0x%p\n",
3543 io, inode->i_ino, io->list.next, io->list.prev);
3544
3545 if (list_empty(&io->list))
3546 return ret;
3547
3548 if (io->flag != DIO_AIO_UNWRITTEN)
3549 return ret;
3550
3551 if (offset + size <= i_size_read(inode))
3552 ret = ext4_convert_unwritten_extents(inode, offset, size);
3553
3554 if (ret < 0) {
3555 printk(KERN_EMERG "%s: failed to convert unwritten"
3556 "extents to written extents, error is %d"
3557 " io is still on inode %lu aio dio list\n",
3558 __func__, ret, inode->i_ino);
3559 return ret;
3560 }
3561
3562 /* clear the DIO AIO unwritten flag */
3563 io->flag = 0;
3564 return ret;
3565}
3566/*
3567 * Work on completed AIO DIO, to convert unwritten extents to written extents.
3568 */
3569static void ext4_end_aio_dio_work(struct work_struct *work)
3570{
3571 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3572 struct inode *inode = io->inode;
3573 int ret = 0;
3574
3575 mutex_lock(&inode->i_mutex);
3576 ret = ext4_end_aio_dio_nolock(io);
3577 if (ret >= 0) {
3578 if (!list_empty(&io->list))
3579 list_del_init(&io->list);
3580 ext4_free_io_end(io);
3581 }
3582 mutex_unlock(&inode->i_mutex);
3583}
3584/*
3585 * This function is called from ext4_sync_file().
3586 *
3587 * When AIO DIO IO is completed, the work to convert unwritten
3588 * extents to written is queued on workqueue but may not get immediately
3589 * scheduled. When fsync is called, we need to ensure the
3590 * conversion is complete before fsync returns.
3591 * The inode keeps track of a list of completed AIOs from the DIO path
3592 * that might need the conversion. This function walks through
3593 * the list and converts the related unwritten extents to written.
3594 */
3595int flush_aio_dio_completed_IO(struct inode *inode)
3596{
3597 ext4_io_end_t *io;
3598 int ret = 0;
3599 int ret2 = 0;
3600
3601 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3602 return ret;
3603
3604 dump_aio_dio_list(inode);
3605 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3606 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3607 ext4_io_end_t, list);
3608 /*
3609 * Calling ext4_end_aio_dio_nolock() to convert completed
3610 * IO to written.
3611 *
3612 * When ext4_sync_file() is called, run_queue() may already
3613 * be about to flush the work corresponding to this io structure.
3614 * It will be upset if it finds that the io structure related
3615 * to the work to be scheduled has been freed.
3616 *
3617 * Thus we need to keep the io structure valid here after the
3618 * conversion has finished. The io structure has a flag to
3619 * avoid a double conversion from both fsync and the background
3620 * workqueue.
3621 */
3622 ret = ext4_end_aio_dio_nolock(io);
3623 if (ret < 0)
3624 ret2 = ret;
3625 else
3626 list_del_init(&io->list);
3627 }
3628 return (ret2 < 0) ? ret2 : 0;
3629}
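A minimal single-threaded analog of the drain loop above: fsync walks the completed-IO list, and io->flag doubles as a conversion marker, so an entry already converted by the workqueue (or a previous fsync) is skipped rather than converted twice. Purely illustrative; locking and list removal are elided.

#include <stdio.h>

#define DIO_AIO_UNWRITTEN 0x1

struct io_end {
	struct io_end *next;
	int flag;			/* DIO_AIO_UNWRITTEN, or 0 once converted */
};

static int convert_nolock(struct io_end *io)
{
	if (io->flag != DIO_AIO_UNWRITTEN)
		return 0;		/* already converted: nothing to redo */
	/* ... ext4_convert_unwritten_extents() would run here ... */
	io->flag = 0;
	return 0;
}

int main(void)
{
	struct io_end a = { NULL, DIO_AIO_UNWRITTEN };
	struct io_end b = { &a, 0 };	/* already handled by the workqueue */

	for (struct io_end *io = &b; io; io = io->next) {
		convert_nolock(io);
		printf("io %p drained, flag=%d\n", (void *)io, io->flag);
	}
	return 0;
}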
3630
3631static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3632{
3633 ext4_io_end_t *io = NULL;
3634
3635 io = kmalloc(sizeof(*io), GFP_NOFS);
3636
3637 if (io) {
3638 igrab(inode);
3639 io->inode = inode;
3640 io->flag = 0;
3641 io->offset = 0;
3642 io->size = 0;
3643 io->error = 0;
3644 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3645 INIT_LIST_HEAD(&io->list);
3646 }
3647
3648 return io;
3649}
3650
3651static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652 ssize_t size, void *private)
3653{
3654 ext4_io_end_t *io_end = iocb->private;
3655 struct workqueue_struct *wq;
3656
3657 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3658 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3659 iocb->private, io_end->inode->i_ino, iocb, offset,
3660 size);
3661 /* if not async direct IO or dio with 0 bytes write, just return */
3662 if (!io_end || !size)
3663 return;
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
3679 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
3685 * For ext4 extent files, ext4 will do direct-io writes to holes and
3686 * preallocated extents; there is no need to
3687 * fall back to buffered IO.
3688 *
3689 * For holes, we fallocate those blocks and mark them as uninitialized.
3690 * If those blocks were preallocated, we make sure they are split, but
3691 * still keep the range to write as uninitialized.
3692 *
3693 * The unwritten extents will be converted to written when DIO is completed.
3694 * For async direct IO, since the IO may still be pending when we return, we
3695 * set up an end_io callback function, which will do the conversion
3696 * when the async direct IO completes.
3697 *
3698 * If the O_DIRECT write will extend the file then add this inode to the
3699 * orphan list. So recovery will truncate it back to the original size
3700 * if the machine crashes during the write.
3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714		/*
3715		 * We can write directly to holes and to fallocated extents.
3716		 *
3717		 * Blocks allocated to fill a hole are marked as uninitialized
3718		 * to prevent a parallel buffered read from exposing stale
3719		 * data before the DIO has finished writing it.
3720		 *
3721		 * For previously fallocated extents, ext4 get_block will
3722		 * simply mark the buffer mapped but still keep the extents
3723		 * uninitialized.
3724		 *
3725		 * For the non-AIO case, we convert those unwritten extents
3726		 * to written after returning from blockdev_direct_IO.
3727		 *
3728		 * For async DIO, the conversion must be deferred until the
3729		 * IO completes. The ext4 end_io callback function will be
3730		 * called to take care of the conversion work. So for the
3731		 * async case, we allocate an io_end structure here and hook
3732		 * it to the iocb.
3733		 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740			/*
3741			 * We save the io structure for the current async
3742			 * direct IO, so that later ext4_get_blocks() can
3743			 * flag in the io structure whether there are
3744			 * unwritten extents that need to be converted
3745			 * when the IO completes.
3746			 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757		/*
3758		 * The io_end structure takes a reference to the inode;
3759		 * that structure needs to be destroyed and the reference
3760		 * to the inode dropped when the IO is complete, even for
3761		 * a 0 byte write or a failed write.
3762		 *
3763		 * In the successful AIO DIO case, the io_end structure
3764		 * is destroyed and the reference to the inode is dropped
3765		 * after the end_io callback function has been called.
3766		 *
3767		 * For a 0 byte write or an error, the VFS direct IO code
3768		 * won't invoke the end_io callback function, so we need
3769		 * to free the io_end structure here.
3770		 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0)
3775 /*
3776			 * for the non-AIO case, the IO has already
3777			 * completed, so we can do the conversion right here
3778 */
3779 ret = ext4_convert_unwritten_extents(inode,
3780 offset, ret);
3781 return ret;
3782 }
3783
3784	/* for the write-beyond-end-of-file case, we fall back to the old way */
3785 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3786}
3787
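For reference, the async path above is only exercised by a genuinely asynchronous O_DIRECT writer. A minimal userspace illustration (hypothetical file name, error handling abbreviated; compile with -laio) that writes into a preallocated extent and therefore drives the unwritten-extent conversion machinery:

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        void *buf;
        int fd = open("testfile", O_CREAT | O_WRONLY | O_DIRECT, 0644);

        if (fd < 0)
                return 1;
        posix_fallocate(fd, 0, 4096);           /* preallocated (unwritten) extent */
        if (posix_memalign(&buf, 4096, 4096))   /* O_DIRECT alignment */
                return 1;
        io_setup(1, &ctx);
        io_prep_pwrite(&cb, fd, buf, 4096, 0);  /* DIO write into that extent */
        io_submit(ctx, 1, cbs);                 /* async: may return before IO is done */
        io_getevents(ctx, 1, 1, &ev, NULL);     /* completion drives the end_io path */
        io_destroy(ctx);
        close(fd);
        free(buf);
        return 0;
}

io_submit() typically returns before the write has completed, so the conversion happens via ext4_end_io_dio() and the workqueue rather than inline.
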
3788static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3789 const struct iovec *iov, loff_t offset,
3790 unsigned long nr_segs)
3791{
3792 struct file *file = iocb->ki_filp;
3793 struct inode *inode = file->f_mapping->host;
3794
3795 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3796 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3797
3798 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3799}
3800
3357/* 3801/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3802 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3803 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -4551,8 +4995,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4551 */ 4995 */
4552static int ext4_do_update_inode(handle_t *handle, 4996static int ext4_do_update_inode(handle_t *handle,
4553 struct inode *inode, 4997 struct inode *inode,
4554 struct ext4_iloc *iloc, 4998 struct ext4_iloc *iloc)
4555 int do_sync)
4556{ 4999{
4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5000 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4558 struct ext4_inode_info *ei = EXT4_I(inode); 5001 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,22 +5096,10 @@ static int ext4_do_update_inode(handle_t *handle,
4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5096 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4654 } 5097 }
4655 5098
4656 /* 5099 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4657 * If we're not using a journal and we were called from 5100 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4658 * ext4_write_inode() to sync the inode (making do_sync true), 5101 if (!err)
4659 * we can just use sync_dirty_buffer() directly to do our dirty 5102 err = rc;
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4672 ei->i_state &= ~EXT4_STATE_NEW; 5103 ei->i_state &= ~EXT4_STATE_NEW;
4673 5104
4674out_brelse: 5105out_brelse:
@@ -4736,8 +5167,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4736 err = ext4_get_inode_loc(inode, &iloc); 5167 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err) 5168 if (err)
4738 return err; 5169 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5170 if (wait)
4740 inode, &iloc, wait); 5171 sync_dirty_buffer(iloc.bh);
5172 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5173 ext4_error(inode->i_sb, __func__,
5174 "IO error syncing inode, "
5175 "inode=%lu, block=%llu",
5176 inode->i_ino,
5177 (unsigned long long)iloc.bh->b_blocknr);
5178 err = -EIO;
5179 }
4741 } 5180 }
4742 return err; 5181 return err;
4743} 5182}
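In no-journal mode the inode table buffer is now written back directly, and the error check relies on how buffer state bits behave after a failed write: the buffer was submitted (BH_Req set) but did not come back uptodate. A hedged standalone sketch of that pattern (sync_metadata_bh() is a hypothetical helper, not ext4 code):

#include <linux/buffer_head.h>

/* write back one dirty metadata buffer and report an IO failure */
static int sync_metadata_bh(struct buffer_head *bh)
{
        sync_dirty_buffer(bh);          /* submit and wait */
        if (buffer_req(bh) && !buffer_uptodate(bh))
                return -EIO;            /* submitted but never came back good */
        return 0;
}
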
@@ -5033,7 +5472,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5033 get_bh(iloc->bh); 5472 get_bh(iloc->bh);
5034 5473
5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5474 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5036 err = ext4_do_update_inode(handle, inode, iloc, 0); 5475 err = ext4_do_update_inode(handle, inode, iloc);
5037 put_bh(iloc->bh); 5476 put_bh(iloc->bh);
5038 return err; 5477 return err;
5039} 5478}
@@ -5177,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5177 */ 5616 */
5178void ext4_dirty_inode(struct inode *inode) 5617void ext4_dirty_inode(struct inode *inode)
5179{ 5618{
5180 handle_t *current_handle = ext4_journal_current_handle();
5181 handle_t *handle; 5619 handle_t *handle;
5182 5620
5183 if (!ext4_handle_valid(current_handle)) {
5184 ext4_mark_inode_dirty(current_handle, inode);
5185 return;
5186 }
5187
5188 handle = ext4_journal_start(inode, 2); 5621 handle = ext4_journal_start(inode, 2);
5189 if (IS_ERR(handle)) 5622 if (IS_ERR(handle))
5190 goto out; 5623 goto out;
5191 if (current_handle && 5624
5192 current_handle->h_transaction != handle->h_transaction) { 5625 ext4_mark_inode_dirty(handle, inode);
5193 /* This task has a transaction open against a different fs */ 5626
5194 printk(KERN_EMERG "%s: transactions do not match!\n",
5195 __func__);
5196 } else {
5197 jbd_debug(5, "marking dirty. outer handle=%p\n",
5198 current_handle);
5199 ext4_mark_inode_dirty(handle, inode);
5200 }
5201 ext4_journal_stop(handle); 5627 ext4_journal_stop(handle);
5202out: 5628out:
5203 return; 5629 return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
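The /proc-based mb_history ring buffer is replaced here by tracepoints (trace_ext4_mballoc_alloc() and friends), which are consumed through the kernel's generic tracing interface instead of a private file. As a hedged illustration of the definition side only (the real events live in include/trace/events/ext4.h and need the usual TRACE_SYSTEM header boilerplate; the fields below are illustrative, not the actual ones):

#include <linux/tracepoint.h>

TRACE_EVENT(ext4_mballoc_alloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t,  dev)
                __field(ino_t,  ino)
                __field(__u32,  group)
                __field(int,    start)
                __field(__u32,  len)
        ),

        TP_fast_assign(
                __entry->dev    = ac->ac_inode->i_sb->s_dev;
                __entry->ino    = ac->ac_inode->i_ino;
                __entry->group  = ac->ac_b_ex.fe_group;
                __entry->start  = ac->ac_b_ex.fe_start;
                __entry->len    = ac->ac_b_ex.fe_len;
        ),

        TP_printk("dev %d,%d ino %lu extent %u/%d/%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino,
                  __entry->group, __entry->start, __entry->len)
);
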
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..7c8fe80bacdd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2076 struct ext4_iloc iloc;
2077 int err = 0; 2077 int err = 0;
2078 2078
2079 if (!ext4_handle_valid(handle)) 2079 /* ext4_handle_valid() assumes a valid handle_t pointer */
2080 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2081 return 0;
2081 2082
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2083 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
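The guard above exists because ext4_handle_valid() now distinguishes real jbd2 handles from the small nojournal ref-count values introduced in super.c below. Its actual definition lives in ext4_jbd2.h, outside this excerpt; a hedged sketch of what the test plausibly looks like under that scheme:

static inline int ext4_handle_valid(handle_t *handle)
{
        if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
                return 0;       /* a nojournal pseudo-handle */
        return 1;               /* a real jbd2 handle */
}

Callers that can legitimately be passed a NULL handle, such as ext4_orphan_del() here, must therefore check for NULL explicitly rather than rely on the validity test.
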
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..312211ee05af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
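The two helpers above store a nesting count, not a pointer: the value kept in current->journal_info is the depth itself, cast to handle_t *. A runnable userspace illustration of the same trick (all names below are stand-ins, not kernel symbols):

#include <assert.h>
#include <stdio.h>

#define NOJOURNAL_MAX_REF 0x100UL   /* stands in for EXT4_NOJOURNAL_MAX_REF_COUNT */

static void *journal_info;          /* stands in for current->journal_info */

static void *get_nojournal(void)
{
        unsigned long ref_cnt = (unsigned long)journal_info;

        assert(ref_cnt < NOJOURNAL_MAX_REF);
        journal_info = (void *)(ref_cnt + 1);
        return journal_info;
}

static void put_nojournal(void *handle)
{
        unsigned long ref_cnt = (unsigned long)handle;

        assert(ref_cnt != 0);
        journal_info = (void *)(ref_cnt - 1);
}

int main(void)
{
        void *h1 = get_nojournal();     /* depth 1 */
        void *h2 = get_nojournal();     /* depth 2: "handles" nest */

        put_nojournal(h2);
        put_nojournal(h1);
        printf("final depth: %lu\n", (unsigned long)journal_info);  /* 0 */
        return 0;
}
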
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1359 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1360 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1361 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1362#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1363 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1364 qtype = USRQUOTA;
@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1658 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1659 sbi->s_mount_opt);
1648 1660
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1661 return res;
1657} 1662}
1658 1663
@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2202EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2203EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2204EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2205EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2206
2201static struct attribute *ext4_attrs[] = { 2207static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2208 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2216 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2217 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2218 ATTR_LIST(mb_group_prealloc),
2219 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2220 NULL,
2214}; 2221};
2215 2222
@@ -2413,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2420 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2421 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2422 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2423
2418 set_opt(sbi->s_mount_opt, BARRIER); 2424 set_opt(sbi->s_mount_opt, BARRIER);
2419 2425
@@ -2679,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2685 }
2680 2686
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2687 sbi->s_stripe = ext4_get_stripe_size(sbi);
2688 sbi->s_max_writeback_mb_bump = 128;
2682 2689
2683 /* 2690 /*
2684 * set up enough so that it can read an inode 2691 * set up enough so that it can read an inode
@@ -2798,6 +2805,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2805 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2806 }
2800 } 2807 }
2808 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2809 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2810 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2811 goto failed_mount_wq;
2812 }
2813
2801 /* 2814 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2815 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2816 * so we can safely mount the rest of the filesystem now.
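The workqueue created in this hunk is the one the DIO completion path queues extent conversions onto; it is flushed in ext4_sync_fs() (below) and in ext4_put_super() (above) before being destroyed. A hedged standalone sketch of that lifecycle using the 2.6.31-era workqueue calls (the example_* names are illustrative, not the ext4 ones):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
        /* the extent-conversion work would run here */
}

static int example_mount(void)
{
        example_wq = create_workqueue("example-wq");
        if (!example_wq)
                return -ENOMEM;
        INIT_WORK(&example_work, example_work_fn);
        queue_work(example_wq, &example_work);  /* done at IO completion */
        return 0;
}

static void example_unmount(void)
{
        flush_workqueue(example_wq);    /* wait for queued conversions */
        destroy_workqueue(example_wq);
}
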
@@ -2849,12 +2862,12 @@ no_journal:
2849 "available"); 2862 "available");
2850 } 2863 }
2851 2864
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2865 if (test_opt(sb, DELALLOC) &&
2866 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2867 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2868 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2869 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2870 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2871
2859 err = ext4_setup_system_zone(sb); 2872 err = ext4_setup_system_zone(sb);
2860 if (err) { 2873 if (err) {
@@ -2910,6 +2923,8 @@ cantfind_ext4:
2910 2923
2911failed_mount4: 2924failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2925 ext4_msg(sb, KERN_ERR, "mount failed");
2926 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2927failed_mount_wq:
2913 ext4_release_system_zone(sb); 2928 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2929 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2930 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3179 return -EINVAL;
3165 } 3180 }
3166 3181
3167 if (journal->j_flags & JBD2_BARRIER) 3182 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3183 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3184
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3185 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3374{
3362 int ret = 0; 3375 int ret = 0;
3363 tid_t target; 3376 tid_t target;
3377 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3378
3365 trace_ext4_sync_fs(sb, wait); 3379 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3380 flush_workqueue(sbi->dio_unwritten_wq);
3381 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3382 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3383 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3384 }
3370 return ret; 3385 return ret;
3371} 3386}
@@ -3951,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3966 .fs_flags = FS_REQUIRES_DEV,
3952}; 3967};
3953 3968
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3969static int __init init_ext4_fs(void)
3976{ 3970{
3977 int err; 3971 int err;
@@ -3996,13 +3990,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3990 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3991 if (err)
3998 goto out; 3992 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 3993 return 0;
4007out: 3994out:
4008 destroy_inodecache(); 3995 destroy_inodecache();
@@ -4021,9 +4008,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4008static void __exit exit_ext4_fs(void)
4022{ 4009{
4023 unregister_filesystem(&ext4_fs_type); 4010 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4011 destroy_inodecache();
4028 exit_ext4_xattr(); 4012 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4013 exit_ext4_mballoc();