diff options
-rw-r--r-- | Documentation/filesystems/ext4.txt | 8 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 63 | ||||
-rw-r--r-- | fs/ext4/dir.c | 13 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 34 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 4 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 128 | ||||
-rw-r--r-- | fs/ext4/extents.c | 330 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 2 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 260 | ||||
-rw-r--r-- | fs/ext4/inode.c | 95 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 342 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 20 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 2 | ||||
-rw-r--r-- | fs/ext4/mmp.c | 4 | ||||
-rw-r--r-- | fs/ext4/namei.c | 2 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 18 | ||||
-rw-r--r-- | fs/ext4/resize.c | 37 | ||||
-rw-r--r-- | fs/ext4/super.c | 1075 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 25 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 140 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 47 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 361 | ||||
-rw-r--r-- | fs/jbd2/recovery.c | 5 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 12 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 48 | ||||
-rw-r--r-- | include/linux/fs.h | 13 | ||||
-rw-r--r-- | include/linux/jbd2.h | 12 | ||||
-rw-r--r-- | include/linux/journal-head.h | 2 | ||||
-rw-r--r-- | include/trace/events/jbd2.h | 29 | ||||
-rw-r--r-- | mm/page-writeback.c | 2 |
30 files changed, 1522 insertions, 1611 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 8c10bf375c73..1b7f9acbcbbe 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -144,9 +144,6 @@ journal_async_commit Commit block can be written to disk without waiting | |||
144 | mount the device. This will enable 'journal_checksum' | 144 | mount the device. This will enable 'journal_checksum' |
145 | internally. | 145 | internally. |
146 | 146 | ||
147 | journal=update Update the ext4 file system's journal to the current | ||
148 | format. | ||
149 | |||
150 | journal_dev=devnum When the external journal device's major/minor numbers | 147 | journal_dev=devnum When the external journal device's major/minor numbers |
151 | have changed, this option allows the user to specify | 148 | have changed, this option allows the user to specify |
152 | the new journal location. The journal device is | 149 | the new journal location. The journal device is |
@@ -356,11 +353,6 @@ nouid32 Disables 32-bit UIDs and GIDs. This is for | |||
356 | interoperability with older kernels which only | 353 | interoperability with older kernels which only |
357 | store and expect 16-bit values. | 354 | store and expect 16-bit values. |
358 | 355 | ||
359 | resize Allows to resize filesystem to the end of the last | ||
360 | existing block group, further resize has to be done | ||
361 | with resize2fs either online, or offline. It can be | ||
362 | used only with conjunction with remount. | ||
363 | |||
364 | block_validity This options allows to enables/disables the in-kernel | 356 | block_validity This options allows to enables/disables the in-kernel |
365 | noblock_validity facility for tracking filesystem metadata blocks | 357 | noblock_validity facility for tracking filesystem metadata blocks |
366 | within internal data structures. This allows multi- | 358 | within internal data structures. This allows multi- |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9e2cd8cf711..4bbd07a6fa18 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -336,10 +336,10 @@ err_out: | |||
336 | * Return buffer_head on success or NULL in case of failure. | 336 | * Return buffer_head on success or NULL in case of failure. |
337 | */ | 337 | */ |
338 | struct buffer_head * | 338 | struct buffer_head * |
339 | ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | 339 | ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) |
340 | { | 340 | { |
341 | struct ext4_group_desc *desc; | 341 | struct ext4_group_desc *desc; |
342 | struct buffer_head *bh = NULL; | 342 | struct buffer_head *bh; |
343 | ext4_fsblk_t bitmap_blk; | 343 | ext4_fsblk_t bitmap_blk; |
344 | 344 | ||
345 | desc = ext4_get_group_desc(sb, block_group, NULL); | 345 | desc = ext4_get_group_desc(sb, block_group, NULL); |
@@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
348 | bitmap_blk = ext4_block_bitmap(sb, desc); | 348 | bitmap_blk = ext4_block_bitmap(sb, desc); |
349 | bh = sb_getblk(sb, bitmap_blk); | 349 | bh = sb_getblk(sb, bitmap_blk); |
350 | if (unlikely(!bh)) { | 350 | if (unlikely(!bh)) { |
351 | ext4_error(sb, "Cannot read block bitmap - " | 351 | ext4_error(sb, "Cannot get buffer for block bitmap - " |
352 | "block_group = %u, block_bitmap = %llu", | 352 | "block_group = %u, block_bitmap = %llu", |
353 | block_group, bitmap_blk); | 353 | block_group, bitmap_blk); |
354 | return NULL; | 354 | return NULL; |
355 | } | 355 | } |
356 | 356 | ||
@@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
382 | return bh; | 382 | return bh; |
383 | } | 383 | } |
384 | /* | 384 | /* |
385 | * submit the buffer_head for read. We can | 385 | * submit the buffer_head for reading |
386 | * safely mark the bitmap as uptodate now. | ||
387 | * We do it here so the bitmap uptodate bit | ||
388 | * get set with buffer lock held. | ||
389 | */ | 386 | */ |
387 | set_buffer_new(bh); | ||
390 | trace_ext4_read_block_bitmap_load(sb, block_group); | 388 | trace_ext4_read_block_bitmap_load(sb, block_group); |
391 | set_bitmap_uptodate(bh); | 389 | bh->b_end_io = ext4_end_bitmap_read; |
392 | if (bh_submit_read(bh) < 0) { | 390 | get_bh(bh); |
393 | put_bh(bh); | 391 | submit_bh(READ, bh); |
392 | return bh; | ||
393 | } | ||
394 | |||
395 | /* Returns 0 on success, 1 on error */ | ||
396 | int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, | ||
397 | struct buffer_head *bh) | ||
398 | { | ||
399 | struct ext4_group_desc *desc; | ||
400 | |||
401 | if (!buffer_new(bh)) | ||
402 | return 0; | ||
403 | desc = ext4_get_group_desc(sb, block_group, NULL); | ||
404 | if (!desc) | ||
405 | return 1; | ||
406 | wait_on_buffer(bh); | ||
407 | if (!buffer_uptodate(bh)) { | ||
394 | ext4_error(sb, "Cannot read block bitmap - " | 408 | ext4_error(sb, "Cannot read block bitmap - " |
395 | "block_group = %u, block_bitmap = %llu", | 409 | "block_group = %u, block_bitmap = %llu", |
396 | block_group, bitmap_blk); | 410 | block_group, (unsigned long long) bh->b_blocknr); |
397 | return NULL; | 411 | return 1; |
398 | } | 412 | } |
413 | clear_buffer_new(bh); | ||
414 | /* Panic or remount fs read-only if block bitmap is invalid */ | ||
399 | ext4_valid_block_bitmap(sb, desc, block_group, bh); | 415 | ext4_valid_block_bitmap(sb, desc, block_group, bh); |
400 | /* | 416 | return 0; |
401 | * file system mounted not to panic on error, | 417 | } |
402 | * continue with corrupt bitmap | 418 | |
403 | */ | 419 | struct buffer_head * |
420 | ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | ||
421 | { | ||
422 | struct buffer_head *bh; | ||
423 | |||
424 | bh = ext4_read_block_bitmap_nowait(sb, block_group); | ||
425 | if (ext4_wait_block_bitmap(sb, block_group, bh)) { | ||
426 | put_bh(bh); | ||
427 | return NULL; | ||
428 | } | ||
404 | return bh; | 429 | return bh; |
405 | } | 430 | } |
406 | 431 | ||
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c56092e58..ad56866d729a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
@@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, | |||
91 | return 0; | 91 | return 0; |
92 | 92 | ||
93 | if (filp) | 93 | if (filp) |
94 | ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, | 94 | ext4_error_file(filp, function, line, bh->b_blocknr, |
95 | "bad entry in directory: %s - offset=%u(%u), " | 95 | "bad entry in directory: %s - offset=%u(%u), " |
96 | "inode=%u, rec_len=%d, name_len=%d", | 96 | "inode=%u, rec_len=%d, name_len=%d", |
97 | error_msg, (unsigned) (offset%bh->b_size), | 97 | error_msg, (unsigned) (offset % bh->b_size), |
98 | offset, le32_to_cpu(de->inode), | 98 | offset, le32_to_cpu(de->inode), |
99 | rlen, de->name_len); | 99 | rlen, de->name_len); |
100 | else | 100 | else |
101 | ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, | 101 | ext4_error_inode(dir, function, line, bh->b_blocknr, |
102 | "bad entry in directory: %s - offset=%u(%u), " | 102 | "bad entry in directory: %s - offset=%u(%u), " |
103 | "inode=%u, rec_len=%d, name_len=%d", | 103 | "inode=%u, rec_len=%d, name_len=%d", |
104 | error_msg, (unsigned) (offset%bh->b_size), | 104 | error_msg, (unsigned) (offset % bh->b_size), |
105 | offset, le32_to_cpu(de->inode), | 105 | offset, le32_to_cpu(de->inode), |
106 | rlen, de->name_len); | 106 | rlen, de->name_len); |
107 | 107 | ||
@@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent, | |||
425 | sb = inode->i_sb; | 425 | sb = inode->i_sb; |
426 | 426 | ||
427 | if (!fname) { | 427 | if (!fname) { |
428 | printk(KERN_ERR "EXT4-fs: call_filldir: called with " | 428 | ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " |
429 | "null fname?!?\n"); | 429 | "called with null fname?!?", __func__, __LINE__, |
430 | inode->i_ino, current->comm); | ||
430 | return 0; | 431 | return 0; |
431 | } | 432 | } |
432 | curr_pos = hash2pos(fname->hash, fname->minor_hash); | 433 | curr_pos = hash2pos(fname->hash, fname->minor_hash); |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 513004fc3d84..ded731ac8a32 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -53,7 +53,7 @@ | |||
53 | printk(KERN_DEBUG f, ## a); \ | 53 | printk(KERN_DEBUG f, ## a); \ |
54 | } while (0) | 54 | } while (0) |
55 | #else | 55 | #else |
56 | #define ext4_debug(f, a...) do {} while (0) | 56 | #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) |
57 | #endif | 57 | #endif |
58 | 58 | ||
59 | #define EXT4_ERROR_INODE(inode, fmt, a...) \ | 59 | #define EXT4_ERROR_INODE(inode, fmt, a...) \ |
@@ -184,6 +184,8 @@ struct mpage_da_data { | |||
184 | #define EXT4_IO_END_UNWRITTEN 0x0001 | 184 | #define EXT4_IO_END_UNWRITTEN 0x0001 |
185 | #define EXT4_IO_END_ERROR 0x0002 | 185 | #define EXT4_IO_END_ERROR 0x0002 |
186 | #define EXT4_IO_END_QUEUED 0x0004 | 186 | #define EXT4_IO_END_QUEUED 0x0004 |
187 | #define EXT4_IO_END_DIRECT 0x0008 | ||
188 | #define EXT4_IO_END_IN_FSYNC 0x0010 | ||
187 | 189 | ||
188 | struct ext4_io_page { | 190 | struct ext4_io_page { |
189 | struct page *p_page; | 191 | struct page *p_page; |
@@ -192,18 +194,25 @@ struct ext4_io_page { | |||
192 | 194 | ||
193 | #define MAX_IO_PAGES 128 | 195 | #define MAX_IO_PAGES 128 |
194 | 196 | ||
197 | /* | ||
198 | * For converting uninitialized extents on a work queue. | ||
199 | * | ||
200 | * 'page' is only used from the writepage() path; 'pages' is only used for | ||
201 | * buffered writes; they are used to keep page references until conversion | ||
202 | * takes place. For AIO/DIO, neither field is filled in. | ||
203 | */ | ||
195 | typedef struct ext4_io_end { | 204 | typedef struct ext4_io_end { |
196 | struct list_head list; /* per-file finished IO list */ | 205 | struct list_head list; /* per-file finished IO list */ |
197 | struct inode *inode; /* file being written to */ | 206 | struct inode *inode; /* file being written to */ |
198 | unsigned int flag; /* unwritten or not */ | 207 | unsigned int flag; /* unwritten or not */ |
199 | struct page *page; /* page struct for buffer write */ | 208 | struct page *page; /* for writepage() path */ |
200 | loff_t offset; /* offset in the file */ | 209 | loff_t offset; /* offset in the file */ |
201 | ssize_t size; /* size of the extent */ | 210 | ssize_t size; /* size of the extent */ |
202 | struct work_struct work; /* data work queue */ | 211 | struct work_struct work; /* data work queue */ |
203 | struct kiocb *iocb; /* iocb struct for AIO */ | 212 | struct kiocb *iocb; /* iocb struct for AIO */ |
204 | int result; /* error value for AIO */ | 213 | int result; /* error value for AIO */ |
205 | int num_io_pages; | 214 | int num_io_pages; /* for writepages() */ |
206 | struct ext4_io_page *pages[MAX_IO_PAGES]; | 215 | struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ |
207 | } ext4_io_end_t; | 216 | } ext4_io_end_t; |
208 | 217 | ||
209 | struct ext4_io_submit { | 218 | struct ext4_io_submit { |
@@ -923,6 +932,7 @@ struct ext4_inode_info { | |||
923 | #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ | 932 | #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ |
924 | #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ | 933 | #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ |
925 | #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ | 934 | #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ |
935 | #define EXT4_MOUNT_ERRORS_MASK 0x00070 | ||
926 | #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ | 936 | #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ |
927 | #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ | 937 | #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ |
928 | #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ | 938 | #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ |
@@ -941,7 +951,6 @@ struct ext4_inode_info { | |||
941 | #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ | 951 | #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ |
942 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | 952 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ |
943 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | 953 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ |
944 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | ||
945 | #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ | 954 | #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ |
946 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | 955 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ |
947 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ | 956 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ |
@@ -1142,6 +1151,7 @@ struct ext4_sb_info { | |||
1142 | unsigned int s_mount_opt; | 1151 | unsigned int s_mount_opt; |
1143 | unsigned int s_mount_opt2; | 1152 | unsigned int s_mount_opt2; |
1144 | unsigned int s_mount_flags; | 1153 | unsigned int s_mount_flags; |
1154 | unsigned int s_def_mount_opt; | ||
1145 | ext4_fsblk_t s_sb_block; | 1155 | ext4_fsblk_t s_sb_block; |
1146 | uid_t s_resuid; | 1156 | uid_t s_resuid; |
1147 | gid_t s_resgid; | 1157 | gid_t s_resgid; |
@@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) | |||
1420 | #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 | 1430 | #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 |
1421 | #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ | 1431 | #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ |
1422 | #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ | 1432 | #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ |
1423 | #define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ | 1433 | #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ |
1424 | #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ | 1434 | #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ |
1435 | #define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ | ||
1425 | 1436 | ||
1426 | #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR | 1437 | #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR |
1427 | #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ | 1438 | #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ |
@@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | |||
1794 | ext4_group_t block_group, | 1805 | ext4_group_t block_group, |
1795 | struct buffer_head ** bh); | 1806 | struct buffer_head ** bh); |
1796 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); | 1807 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); |
1797 | struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, | 1808 | |
1798 | ext4_group_t block_group); | 1809 | extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, |
1810 | ext4_group_t block_group); | ||
1811 | extern int ext4_wait_block_bitmap(struct super_block *sb, | ||
1812 | ext4_group_t block_group, | ||
1813 | struct buffer_head *bh); | ||
1814 | extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, | ||
1815 | ext4_group_t block_group); | ||
1799 | extern void ext4_init_block_bitmap(struct super_block *sb, | 1816 | extern void ext4_init_block_bitmap(struct super_block *sb, |
1800 | struct buffer_head *bh, | 1817 | struct buffer_head *bh, |
1801 | ext4_group_t group, | 1818 | ext4_group_t group, |
@@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *); | |||
1841 | extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); | 1858 | extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); |
1842 | extern int ext4_init_inode_table(struct super_block *sb, | 1859 | extern int ext4_init_inode_table(struct super_block *sb, |
1843 | ext4_group_t group, int barrier); | 1860 | ext4_group_t group, int barrier); |
1861 | extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); | ||
1844 | 1862 | ||
1845 | /* mballoc.c */ | 1863 | /* mballoc.c */ |
1846 | extern long ext4_mb_stats; | 1864 | extern long ext4_mb_stats; |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a52db3a69a30..0f58b86e3a02 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
@@ -47,9 +47,9 @@ | |||
47 | */ | 47 | */ |
48 | #define EXT_DEBUG__ | 48 | #define EXT_DEBUG__ |
49 | #ifdef EXT_DEBUG | 49 | #ifdef EXT_DEBUG |
50 | #define ext_debug(a...) printk(a) | 50 | #define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) |
51 | #else | 51 | #else |
52 | #define ext_debug(a...) | 52 | #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | /* | 55 | /* |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1dab18..83b20fcf9400 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
@@ -104,6 +104,78 @@ | |||
104 | #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) | 104 | #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) |
105 | #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) | 105 | #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) |
106 | 106 | ||
107 | /** | ||
108 | * struct ext4_journal_cb_entry - Base structure for callback information. | ||
109 | * | ||
110 | * This struct is a 'seed' structure for a using with your own callback | ||
111 | * structs. If you are using callbacks you must allocate one of these | ||
112 | * or another struct of your own definition which has this struct | ||
113 | * as it's first element and pass it to ext4_journal_callback_add(). | ||
114 | */ | ||
115 | struct ext4_journal_cb_entry { | ||
116 | /* list information for other callbacks attached to the same handle */ | ||
117 | struct list_head jce_list; | ||
118 | |||
119 | /* Function to call with this callback structure */ | ||
120 | void (*jce_func)(struct super_block *sb, | ||
121 | struct ext4_journal_cb_entry *jce, int error); | ||
122 | |||
123 | /* user data goes here */ | ||
124 | }; | ||
125 | |||
126 | /** | ||
127 | * ext4_journal_callback_add: add a function to call after transaction commit | ||
128 | * @handle: active journal transaction handle to register callback on | ||
129 | * @func: callback function to call after the transaction has committed: | ||
130 | * @sb: superblock of current filesystem for transaction | ||
131 | * @jce: returned journal callback data | ||
132 | * @rc: journal state at commit (0 = transaction committed properly) | ||
133 | * @jce: journal callback data (internal and function private data struct) | ||
134 | * | ||
135 | * The registered function will be called in the context of the journal thread | ||
136 | * after the transaction for which the handle was created has completed. | ||
137 | * | ||
138 | * No locks are held when the callback function is called, so it is safe to | ||
139 | * call blocking functions from within the callback, but the callback should | ||
140 | * not block or run for too long, or the filesystem will be blocked waiting for | ||
141 | * the next transaction to commit. No journaling functions can be used, or | ||
142 | * there is a risk of deadlock. | ||
143 | * | ||
144 | * There is no guaranteed calling order of multiple registered callbacks on | ||
145 | * the same transaction. | ||
146 | */ | ||
147 | static inline void ext4_journal_callback_add(handle_t *handle, | ||
148 | void (*func)(struct super_block *sb, | ||
149 | struct ext4_journal_cb_entry *jce, | ||
150 | int rc), | ||
151 | struct ext4_journal_cb_entry *jce) | ||
152 | { | ||
153 | struct ext4_sb_info *sbi = | ||
154 | EXT4_SB(handle->h_transaction->t_journal->j_private); | ||
155 | |||
156 | /* Add the jce to transaction's private list */ | ||
157 | jce->jce_func = func; | ||
158 | spin_lock(&sbi->s_md_lock); | ||
159 | list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); | ||
160 | spin_unlock(&sbi->s_md_lock); | ||
161 | } | ||
162 | |||
163 | /** | ||
164 | * ext4_journal_callback_del: delete a registered callback | ||
165 | * @handle: active journal transaction handle on which callback was registered | ||
166 | * @jce: registered journal callback entry to unregister | ||
167 | */ | ||
168 | static inline void ext4_journal_callback_del(handle_t *handle, | ||
169 | struct ext4_journal_cb_entry *jce) | ||
170 | { | ||
171 | struct ext4_sb_info *sbi = | ||
172 | EXT4_SB(handle->h_transaction->t_journal->j_private); | ||
173 | |||
174 | spin_lock(&sbi->s_md_lock); | ||
175 | list_del_init(&jce->jce_list); | ||
176 | spin_unlock(&sbi->s_md_lock); | ||
177 | } | ||
178 | |||
107 | int | 179 | int |
108 | ext4_mark_iloc_dirty(handle_t *handle, | 180 | ext4_mark_iloc_dirty(handle_t *handle, |
109 | struct inode *inode, | 181 | struct inode *inode, |
@@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, | |||
261 | /* super.c */ | 333 | /* super.c */ |
262 | int ext4_force_commit(struct super_block *sb); | 334 | int ext4_force_commit(struct super_block *sb); |
263 | 335 | ||
264 | static inline int ext4_should_journal_data(struct inode *inode) | 336 | /* |
337 | * Ext4 inode journal modes | ||
338 | */ | ||
339 | #define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ | ||
340 | #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ | ||
341 | #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ | ||
342 | |||
343 | static inline int ext4_inode_journal_mode(struct inode *inode) | ||
265 | { | 344 | { |
266 | if (EXT4_JOURNAL(inode) == NULL) | 345 | if (EXT4_JOURNAL(inode) == NULL) |
267 | return 0; | 346 | return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ |
268 | if (!S_ISREG(inode->i_mode)) | 347 | /* We do not support data journalling with delayed allocation */ |
269 | return 1; | 348 | if (!S_ISREG(inode->i_mode) || |
270 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | 349 | test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) |
271 | return 1; | 350 | return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ |
272 | if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) | 351 | if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && |
273 | return 1; | 352 | !test_opt(inode->i_sb, DELALLOC)) |
274 | return 0; | 353 | return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ |
354 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | ||
355 | return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ | ||
356 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | ||
357 | return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ | ||
358 | else | ||
359 | BUG(); | ||
360 | } | ||
361 | |||
362 | static inline int ext4_should_journal_data(struct inode *inode) | ||
363 | { | ||
364 | return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; | ||
275 | } | 365 | } |
276 | 366 | ||
277 | static inline int ext4_should_order_data(struct inode *inode) | 367 | static inline int ext4_should_order_data(struct inode *inode) |
278 | { | 368 | { |
279 | if (EXT4_JOURNAL(inode) == NULL) | 369 | return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; |
280 | return 0; | ||
281 | if (!S_ISREG(inode->i_mode)) | ||
282 | return 0; | ||
283 | if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) | ||
284 | return 0; | ||
285 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | ||
286 | return 1; | ||
287 | return 0; | ||
288 | } | 370 | } |
289 | 371 | ||
290 | static inline int ext4_should_writeback_data(struct inode *inode) | 372 | static inline int ext4_should_writeback_data(struct inode *inode) |
291 | { | 373 | { |
292 | if (EXT4_JOURNAL(inode) == NULL) | 374 | return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; |
293 | return 1; | ||
294 | if (!S_ISREG(inode->i_mode)) | ||
295 | return 0; | ||
296 | if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) | ||
297 | return 0; | ||
298 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | ||
299 | return 1; | ||
300 | return 0; | ||
301 | } | 375 | } |
302 | 376 | ||
303 | /* | 377 | /* |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74f23c292e1b..1421938e6792 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -44,6 +44,14 @@ | |||
44 | 44 | ||
45 | #include <trace/events/ext4.h> | 45 | #include <trace/events/ext4.h> |
46 | 46 | ||
47 | /* | ||
48 | * used by extent splitting. | ||
49 | */ | ||
50 | #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ | ||
51 | due to ENOSPC */ | ||
52 | #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ | ||
53 | #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ | ||
54 | |||
47 | static int ext4_split_extent(handle_t *handle, | 55 | static int ext4_split_extent(handle_t *handle, |
48 | struct inode *inode, | 56 | struct inode *inode, |
49 | struct ext4_ext_path *path, | 57 | struct ext4_ext_path *path, |
@@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle, | |||
51 | int split_flag, | 59 | int split_flag, |
52 | int flags); | 60 | int flags); |
53 | 61 | ||
62 | static int ext4_split_extent_at(handle_t *handle, | ||
63 | struct inode *inode, | ||
64 | struct ext4_ext_path *path, | ||
65 | ext4_lblk_t split, | ||
66 | int split_flag, | ||
67 | int flags); | ||
68 | |||
54 | static int ext4_ext_truncate_extend_restart(handle_t *handle, | 69 | static int ext4_ext_truncate_extend_restart(handle_t *handle, |
55 | struct inode *inode, | 70 | struct inode *inode, |
56 | int needed) | 71 | int needed) |
@@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) | |||
300 | ext4_fsblk_t block = ext4_ext_pblock(ext); | 315 | ext4_fsblk_t block = ext4_ext_pblock(ext); |
301 | int len = ext4_ext_get_actual_len(ext); | 316 | int len = ext4_ext_get_actual_len(ext); |
302 | 317 | ||
318 | if (len == 0) | ||
319 | return 0; | ||
303 | return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); | 320 | return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); |
304 | } | 321 | } |
305 | 322 | ||
@@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2308 | struct ext4_extent *ex; | 2325 | struct ext4_extent *ex; |
2309 | 2326 | ||
2310 | /* the header must be checked already in ext4_ext_remove_space() */ | 2327 | /* the header must be checked already in ext4_ext_remove_space() */ |
2311 | ext_debug("truncate since %u in leaf\n", start); | 2328 | ext_debug("truncate since %u in leaf to %u\n", start, end); |
2312 | if (!path[depth].p_hdr) | 2329 | if (!path[depth].p_hdr) |
2313 | path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); | 2330 | path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); |
2314 | eh = path[depth].p_hdr; | 2331 | eh = path[depth].p_hdr; |
@@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2343 | ext_debug(" border %u:%u\n", a, b); | 2360 | ext_debug(" border %u:%u\n", a, b); |
2344 | 2361 | ||
2345 | /* If this extent is beyond the end of the hole, skip it */ | 2362 | /* If this extent is beyond the end of the hole, skip it */ |
2346 | if (end <= ex_ee_block) { | 2363 | if (end < ex_ee_block) { |
2347 | ex--; | 2364 | ex--; |
2348 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2365 | ex_ee_block = le32_to_cpu(ex->ee_block); |
2349 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2366 | ex_ee_len = ext4_ext_get_actual_len(ex); |
2350 | continue; | 2367 | continue; |
2351 | } else if (b != ex_ee_block + ex_ee_len - 1) { | 2368 | } else if (b != ex_ee_block + ex_ee_len - 1) { |
2352 | EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", | 2369 | EXT4_ERROR_INODE(inode, |
2353 | start, end); | 2370 | "can not handle truncate %u:%u " |
2371 | "on extent %u:%u", | ||
2372 | start, end, ex_ee_block, | ||
2373 | ex_ee_block + ex_ee_len - 1); | ||
2354 | err = -EIO; | 2374 | err = -EIO; |
2355 | goto out; | 2375 | goto out; |
2356 | } else if (a != ex_ee_block) { | 2376 | } else if (a != ex_ee_block) { |
@@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) | |||
2482 | return 1; | 2502 | return 1; |
2483 | } | 2503 | } |
2484 | 2504 | ||
2485 | static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) | 2505 | static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, |
2506 | ext4_lblk_t end) | ||
2486 | { | 2507 | { |
2487 | struct super_block *sb = inode->i_sb; | 2508 | struct super_block *sb = inode->i_sb; |
2488 | int depth = ext_depth(inode); | 2509 | int depth = ext_depth(inode); |
@@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) | |||
2491 | handle_t *handle; | 2512 | handle_t *handle; |
2492 | int i, err; | 2513 | int i, err; |
2493 | 2514 | ||
2494 | ext_debug("truncate since %u\n", start); | 2515 | ext_debug("truncate since %u to %u\n", start, end); |
2495 | 2516 | ||
2496 | /* probably first extent we're gonna free will be last in block */ | 2517 | /* probably first extent we're gonna free will be last in block */ |
2497 | handle = ext4_journal_start(inode, depth + 1); | 2518 | handle = ext4_journal_start(inode, depth + 1); |
@@ -2504,6 +2525,61 @@ again: | |||
2504 | trace_ext4_ext_remove_space(inode, start, depth); | 2525 | trace_ext4_ext_remove_space(inode, start, depth); |
2505 | 2526 | ||
2506 | /* | 2527 | /* |
2528 | * Check if we are removing extents inside the extent tree. If that | ||
2529 | * is the case, we are going to punch a hole inside the extent tree | ||
2530 | * so we have to check whether we need to split the extent covering | ||
2531 | * the last block to remove so we can easily remove the part of it | ||
2532 | * in ext4_ext_rm_leaf(). | ||
2533 | */ | ||
2534 | if (end < EXT_MAX_BLOCKS - 1) { | ||
2535 | struct ext4_extent *ex; | ||
2536 | ext4_lblk_t ee_block; | ||
2537 | |||
2538 | /* find extent for this block */ | ||
2539 | path = ext4_ext_find_extent(inode, end, NULL); | ||
2540 | if (IS_ERR(path)) { | ||
2541 | ext4_journal_stop(handle); | ||
2542 | return PTR_ERR(path); | ||
2543 | } | ||
2544 | depth = ext_depth(inode); | ||
2545 | ex = path[depth].p_ext; | ||
2546 | if (!ex) | ||
2547 | goto cont; | ||
2548 | |||
2549 | ee_block = le32_to_cpu(ex->ee_block); | ||
2550 | |||
2551 | /* | ||
2552 | * See if the last block is inside the extent, if so split | ||
2553 | * the extent at 'end' block so we can easily remove the | ||
2554 | * tail of the first part of the split extent in | ||
2555 | * ext4_ext_rm_leaf(). | ||
2556 | */ | ||
2557 | if (end >= ee_block && | ||
2558 | end < ee_block + ext4_ext_get_actual_len(ex) - 1) { | ||
2559 | int split_flag = 0; | ||
2560 | |||
2561 | if (ext4_ext_is_uninitialized(ex)) | ||
2562 | split_flag = EXT4_EXT_MARK_UNINIT1 | | ||
2563 | EXT4_EXT_MARK_UNINIT2; | ||
2564 | |||
2565 | /* | ||
2566 | * Split the extent in two so that 'end' is the last | ||
2567 | * block in the first new extent | ||
2568 | */ | ||
2569 | err = ext4_split_extent_at(handle, inode, path, | ||
2570 | end + 1, split_flag, | ||
2571 | EXT4_GET_BLOCKS_PRE_IO | | ||
2572 | EXT4_GET_BLOCKS_PUNCH_OUT_EXT); | ||
2573 | |||
2574 | if (err < 0) | ||
2575 | goto out; | ||
2576 | } | ||
2577 | ext4_ext_drop_refs(path); | ||
2578 | kfree(path); | ||
2579 | } | ||
2580 | cont: | ||
2581 | |||
2582 | /* | ||
2507 | * We start scanning from right side, freeing all the blocks | 2583 | * We start scanning from right side, freeing all the blocks |
2508 | * after i_size and walking into the tree depth-wise. | 2584 | * after i_size and walking into the tree depth-wise. |
2509 | */ | 2585 | */ |
@@ -2515,6 +2591,7 @@ again: | |||
2515 | } | 2591 | } |
2516 | path[0].p_depth = depth; | 2592 | path[0].p_depth = depth; |
2517 | path[0].p_hdr = ext_inode_hdr(inode); | 2593 | path[0].p_hdr = ext_inode_hdr(inode); |
2594 | |||
2518 | if (ext4_ext_check(inode, path[0].p_hdr, depth)) { | 2595 | if (ext4_ext_check(inode, path[0].p_hdr, depth)) { |
2519 | err = -EIO; | 2596 | err = -EIO; |
2520 | goto out; | 2597 | goto out; |
@@ -2526,7 +2603,7 @@ again: | |||
2526 | /* this is leaf block */ | 2603 | /* this is leaf block */ |
2527 | err = ext4_ext_rm_leaf(handle, inode, path, | 2604 | err = ext4_ext_rm_leaf(handle, inode, path, |
2528 | &partial_cluster, start, | 2605 | &partial_cluster, start, |
2529 | EXT_MAX_BLOCKS - 1); | 2606 | end); |
2530 | /* root level has p_bh == NULL, brelse() eats this */ | 2607 | /* root level has p_bh == NULL, brelse() eats this */ |
2531 | brelse(path[i].p_bh); | 2608 | brelse(path[i].p_bh); |
2532 | path[i].p_bh = NULL; | 2609 | path[i].p_bh = NULL; |
@@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb) | |||
2651 | 2728 | ||
2652 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { | 2729 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { |
2653 | #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) | 2730 | #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) |
2654 | printk(KERN_INFO "EXT4-fs: file extents enabled"); | 2731 | printk(KERN_INFO "EXT4-fs: file extents enabled" |
2655 | #ifdef AGGRESSIVE_TEST | 2732 | #ifdef AGGRESSIVE_TEST |
2656 | printk(", aggressive tests"); | 2733 | ", aggressive tests" |
2657 | #endif | 2734 | #endif |
2658 | #ifdef CHECK_BINSEARCH | 2735 | #ifdef CHECK_BINSEARCH |
2659 | printk(", check binsearch"); | 2736 | ", check binsearch" |
2660 | #endif | 2737 | #endif |
2661 | #ifdef EXTENTS_STATS | 2738 | #ifdef EXTENTS_STATS |
2662 | printk(", stats"); | 2739 | ", stats" |
2663 | #endif | 2740 | #endif |
2664 | printk("\n"); | 2741 | "\n"); |
2665 | #endif | 2742 | #endif |
2666 | #ifdef EXTENTS_STATS | 2743 | #ifdef EXTENTS_STATS |
2667 | spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); | 2744 | spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); |
@@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) | |||
2709 | } | 2786 | } |
2710 | 2787 | ||
2711 | /* | 2788 | /* |
2712 | * used by extent splitting. | ||
2713 | */ | ||
2714 | #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ | ||
2715 | due to ENOSPC */ | ||
2716 | #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ | ||
2717 | #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ | ||
2718 | |||
2719 | /* | ||
2720 | * ext4_split_extent_at() splits an extent at given block. | 2789 | * ext4_split_extent_at() splits an extent at given block. |
2721 | * | 2790 | * |
2722 | * @handle: the journal handle | 2791 | * @handle: the journal handle |
@@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, | |||
3224 | depth = ext_depth(inode); | 3293 | depth = ext_depth(inode); |
3225 | eh = path[depth].p_hdr; | 3294 | eh = path[depth].p_hdr; |
3226 | 3295 | ||
3227 | if (unlikely(!eh->eh_entries)) { | 3296 | /* |
3228 | EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " | 3297 | * We're going to remove EOFBLOCKS_FL entirely in future so we |
3229 | "EOFBLOCKS_FL set"); | 3298 | * do not care for this case anymore. Simply remove the flag |
3230 | return -EIO; | 3299 | * if there are no extents. |
3231 | } | 3300 | */ |
3301 | if (unlikely(!eh->eh_entries)) | ||
3302 | goto out; | ||
3232 | last_ex = EXT_LAST_EXTENT(eh); | 3303 | last_ex = EXT_LAST_EXTENT(eh); |
3233 | /* | 3304 | /* |
3234 | * We should clear the EOFBLOCKS_FL flag if we are writing the | 3305 | * We should clear the EOFBLOCKS_FL flag if we are writing the |
@@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, | |||
3252 | for (i = depth-1; i >= 0; i--) | 3323 | for (i = depth-1; i >= 0; i--) |
3253 | if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) | 3324 | if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) |
3254 | return 0; | 3325 | return 0; |
3326 | out: | ||
3255 | ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); | 3327 | ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); |
3256 | return ext4_mark_inode_dirty(handle, inode); | 3328 | return ext4_mark_inode_dirty(handle, inode); |
3257 | } | 3329 | } |
@@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
3710 | int free_on_err = 0, err = 0, depth, ret; | 3782 | int free_on_err = 0, err = 0, depth, ret; |
3711 | unsigned int allocated = 0, offset = 0; | 3783 | unsigned int allocated = 0, offset = 0; |
3712 | unsigned int allocated_clusters = 0; | 3784 | unsigned int allocated_clusters = 0; |
3713 | unsigned int punched_out = 0; | ||
3714 | unsigned int result = 0; | ||
3715 | struct ext4_allocation_request ar; | 3785 | struct ext4_allocation_request ar; |
3716 | ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; | 3786 | ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; |
3717 | ext4_lblk_t cluster_offset; | 3787 | ext4_lblk_t cluster_offset; |
@@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
3721 | trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | 3791 | trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); |
3722 | 3792 | ||
3723 | /* check in cache */ | 3793 | /* check in cache */ |
3724 | if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && | 3794 | if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { |
3725 | ext4_ext_in_cache(inode, map->m_lblk, &newex)) { | ||
3726 | if (!newex.ee_start_lo && !newex.ee_start_hi) { | 3795 | if (!newex.ee_start_lo && !newex.ee_start_hi) { |
3727 | if ((sbi->s_cluster_ratio > 1) && | 3796 | if ((sbi->s_cluster_ratio > 1) && |
3728 | ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) | 3797 | ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) |
@@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
3790 | 3859 | ||
3791 | /* if found extent covers block, simply return it */ | 3860 | /* if found extent covers block, simply return it */ |
3792 | if (in_range(map->m_lblk, ee_block, ee_len)) { | 3861 | if (in_range(map->m_lblk, ee_block, ee_len)) { |
3793 | struct ext4_map_blocks punch_map; | ||
3794 | ext4_fsblk_t partial_cluster = 0; | ||
3795 | |||
3796 | newblock = map->m_lblk - ee_block + ee_start; | 3862 | newblock = map->m_lblk - ee_block + ee_start; |
3797 | /* number of remaining blocks in the extent */ | 3863 | /* number of remaining blocks in the extent */ |
3798 | allocated = ee_len - (map->m_lblk - ee_block); | 3864 | allocated = ee_len - (map->m_lblk - ee_block); |
3799 | ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, | 3865 | ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, |
3800 | ee_block, ee_len, newblock); | 3866 | ee_block, ee_len, newblock); |
3801 | 3867 | ||
3802 | if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { | ||
3803 | /* | ||
3804 | * Do not put uninitialized extent | ||
3805 | * in the cache | ||
3806 | */ | ||
3807 | if (!ext4_ext_is_uninitialized(ex)) { | ||
3808 | ext4_ext_put_in_cache(inode, ee_block, | ||
3809 | ee_len, ee_start); | ||
3810 | goto out; | ||
3811 | } | ||
3812 | ret = ext4_ext_handle_uninitialized_extents( | ||
3813 | handle, inode, map, path, flags, | ||
3814 | allocated, newblock); | ||
3815 | return ret; | ||
3816 | } | ||
3817 | |||
3818 | /* | ||
3819 | * Punch out the map length, but only to the | ||
3820 | * end of the extent | ||
3821 | */ | ||
3822 | punched_out = allocated < map->m_len ? | ||
3823 | allocated : map->m_len; | ||
3824 | |||
3825 | /* | 3868 | /* |
3826 | * Sense extents need to be converted to | 3869 | * Do not put uninitialized extent |
3827 | * uninitialized, they must fit in an | 3870 | * in the cache |
3828 | * uninitialized extent | ||
3829 | */ | 3871 | */ |
3830 | if (punched_out > EXT_UNINIT_MAX_LEN) | 3872 | if (!ext4_ext_is_uninitialized(ex)) { |
3831 | punched_out = EXT_UNINIT_MAX_LEN; | 3873 | ext4_ext_put_in_cache(inode, ee_block, |
3832 | 3874 | ee_len, ee_start); | |
3833 | punch_map.m_lblk = map->m_lblk; | 3875 | goto out; |
3834 | punch_map.m_pblk = newblock; | ||
3835 | punch_map.m_len = punched_out; | ||
3836 | punch_map.m_flags = 0; | ||
3837 | |||
3838 | /* Check to see if the extent needs to be split */ | ||
3839 | if (punch_map.m_len != ee_len || | ||
3840 | punch_map.m_lblk != ee_block) { | ||
3841 | |||
3842 | ret = ext4_split_extent(handle, inode, | ||
3843 | path, &punch_map, 0, | ||
3844 | EXT4_GET_BLOCKS_PUNCH_OUT_EXT | | ||
3845 | EXT4_GET_BLOCKS_PRE_IO); | ||
3846 | |||
3847 | if (ret < 0) { | ||
3848 | err = ret; | ||
3849 | goto out2; | ||
3850 | } | ||
3851 | /* | ||
3852 | * find extent for the block at | ||
3853 | * the start of the hole | ||
3854 | */ | ||
3855 | ext4_ext_drop_refs(path); | ||
3856 | kfree(path); | ||
3857 | |||
3858 | path = ext4_ext_find_extent(inode, | ||
3859 | map->m_lblk, NULL); | ||
3860 | if (IS_ERR(path)) { | ||
3861 | err = PTR_ERR(path); | ||
3862 | path = NULL; | ||
3863 | goto out2; | ||
3864 | } | ||
3865 | |||
3866 | depth = ext_depth(inode); | ||
3867 | ex = path[depth].p_ext; | ||
3868 | ee_len = ext4_ext_get_actual_len(ex); | ||
3869 | ee_block = le32_to_cpu(ex->ee_block); | ||
3870 | ee_start = ext4_ext_pblock(ex); | ||
3871 | |||
3872 | } | ||
3873 | |||
3874 | ext4_ext_mark_uninitialized(ex); | ||
3875 | |||
3876 | ext4_ext_invalidate_cache(inode); | ||
3877 | |||
3878 | err = ext4_ext_rm_leaf(handle, inode, path, | ||
3879 | &partial_cluster, map->m_lblk, | ||
3880 | map->m_lblk + punched_out); | ||
3881 | |||
3882 | if (!err && path->p_hdr->eh_entries == 0) { | ||
3883 | /* | ||
3884 | * Punch hole freed all of this sub tree, | ||
3885 | * so we need to correct eh_depth | ||
3886 | */ | ||
3887 | err = ext4_ext_get_access(handle, inode, path); | ||
3888 | if (err == 0) { | ||
3889 | ext_inode_hdr(inode)->eh_depth = 0; | ||
3890 | ext_inode_hdr(inode)->eh_max = | ||
3891 | cpu_to_le16(ext4_ext_space_root( | ||
3892 | inode, 0)); | ||
3893 | |||
3894 | err = ext4_ext_dirty( | ||
3895 | handle, inode, path); | ||
3896 | } | ||
3897 | } | 3876 | } |
3898 | 3877 | ret = ext4_ext_handle_uninitialized_extents( | |
3899 | goto out2; | 3878 | handle, inode, map, path, flags, |
3879 | allocated, newblock); | ||
3880 | return ret; | ||
3900 | } | 3881 | } |
3901 | } | 3882 | } |
3902 | 3883 | ||
@@ -4165,13 +4146,11 @@ out2: | |||
4165 | ext4_ext_drop_refs(path); | 4146 | ext4_ext_drop_refs(path); |
4166 | kfree(path); | 4147 | kfree(path); |
4167 | } | 4148 | } |
4168 | result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? | ||
4169 | punched_out : allocated; | ||
4170 | 4149 | ||
4171 | trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, | 4150 | trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, |
4172 | newblock, map->m_len, err ? err : result); | 4151 | newblock, map->m_len, err ? err : allocated); |
4173 | 4152 | ||
4174 | return err ? err : result; | 4153 | return err ? err : allocated; |
4175 | } | 4154 | } |
4176 | 4155 | ||
4177 | void ext4_ext_truncate(struct inode *inode) | 4156 | void ext4_ext_truncate(struct inode *inode) |
@@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
4228 | 4207 | ||
4229 | last_block = (inode->i_size + sb->s_blocksize - 1) | 4208 | last_block = (inode->i_size + sb->s_blocksize - 1) |
4230 | >> EXT4_BLOCK_SIZE_BITS(sb); | 4209 | >> EXT4_BLOCK_SIZE_BITS(sb); |
4231 | err = ext4_ext_remove_space(inode, last_block); | 4210 | err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); |
4232 | 4211 | ||
4233 | /* In a multi-transaction truncate, we only make the final | 4212 | /* In a multi-transaction truncate, we only make the final |
4234 | * transaction synchronous. | 4213 | * transaction synchronous. |
@@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, | |||
4436 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); | 4415 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); |
4437 | if (ret <= 0) { | 4416 | if (ret <= 0) { |
4438 | WARN_ON(ret <= 0); | 4417 | WARN_ON(ret <= 0); |
4439 | printk(KERN_ERR "%s: ext4_ext_map_blocks " | 4418 | ext4_msg(inode->i_sb, KERN_ERR, |
4440 | "returned error inode#%lu, block=%u, " | 4419 | "%s:%d: inode #%lu: block %u: len %u: " |
4441 | "max_blocks=%u", __func__, | 4420 | "ext4_ext_map_blocks returned %d", |
4442 | inode->i_ino, map.m_lblk, map.m_len); | 4421 | __func__, __LINE__, inode->i_ino, map.m_lblk, |
4422 | map.m_len, ret); | ||
4443 | } | 4423 | } |
4444 | ext4_mark_inode_dirty(handle, inode); | 4424 | ext4_mark_inode_dirty(handle, inode); |
4445 | ret2 = ext4_journal_stop(handle); | 4425 | ret2 = ext4_journal_stop(handle); |
@@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4705 | { | 4685 | { |
4706 | struct inode *inode = file->f_path.dentry->d_inode; | 4686 | struct inode *inode = file->f_path.dentry->d_inode; |
4707 | struct super_block *sb = inode->i_sb; | 4687 | struct super_block *sb = inode->i_sb; |
4708 | struct ext4_ext_cache cache_ex; | 4688 | ext4_lblk_t first_block, stop_block; |
4709 | ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; | ||
4710 | struct address_space *mapping = inode->i_mapping; | 4689 | struct address_space *mapping = inode->i_mapping; |
4711 | struct ext4_map_blocks map; | ||
4712 | handle_t *handle; | 4690 | handle_t *handle; |
4713 | loff_t first_page, last_page, page_len; | 4691 | loff_t first_page, last_page, page_len; |
4714 | loff_t first_page_offset, last_page_offset; | 4692 | loff_t first_page_offset, last_page_offset; |
4715 | int ret, credits, blocks_released, err = 0; | 4693 | int credits, err = 0; |
4716 | 4694 | ||
4717 | /* No need to punch hole beyond i_size */ | 4695 | /* No need to punch hole beyond i_size */ |
4718 | if (offset >= inode->i_size) | 4696 | if (offset >= inode->i_size) |
@@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4728 | offset; | 4706 | offset; |
4729 | } | 4707 | } |
4730 | 4708 | ||
4731 | first_block = (offset + sb->s_blocksize - 1) >> | ||
4732 | EXT4_BLOCK_SIZE_BITS(sb); | ||
4733 | last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
4734 | |||
4735 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 4709 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
4736 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | 4710 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; |
4737 | 4711 | ||
@@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4810 | } | 4784 | } |
4811 | } | 4785 | } |
4812 | 4786 | ||
4813 | |||
4814 | /* | 4787 | /* |
4815 | * If i_size is contained in the last page, we need to | 4788 | * If i_size is contained in the last page, we need to |
4816 | * unmap and zero the partial page after i_size | 4789 | * unmap and zero the partial page after i_size |
@@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4830 | } | 4803 | } |
4831 | } | 4804 | } |
4832 | 4805 | ||
4806 | first_block = (offset + sb->s_blocksize - 1) >> | ||
4807 | EXT4_BLOCK_SIZE_BITS(sb); | ||
4808 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
4809 | |||
4833 | /* If there are no blocks to remove, return now */ | 4810 | /* If there are no blocks to remove, return now */ |
4834 | if (first_block >= last_block) | 4811 | if (first_block >= stop_block) |
4835 | goto out; | 4812 | goto out; |
4836 | 4813 | ||
4837 | down_write(&EXT4_I(inode)->i_data_sem); | 4814 | down_write(&EXT4_I(inode)->i_data_sem); |
4838 | ext4_ext_invalidate_cache(inode); | 4815 | ext4_ext_invalidate_cache(inode); |
4839 | ext4_discard_preallocations(inode); | 4816 | ext4_discard_preallocations(inode); |
4840 | 4817 | ||
4841 | /* | 4818 | err = ext4_ext_remove_space(inode, first_block, stop_block - 1); |
4842 | * Loop over all the blocks and identify blocks | ||
4843 | * that need to be punched out | ||
4844 | */ | ||
4845 | iblock = first_block; | ||
4846 | blocks_released = 0; | ||
4847 | while (iblock < last_block) { | ||
4848 | max_blocks = last_block - iblock; | ||
4849 | num_blocks = 1; | ||
4850 | memset(&map, 0, sizeof(map)); | ||
4851 | map.m_lblk = iblock; | ||
4852 | map.m_len = max_blocks; | ||
4853 | ret = ext4_ext_map_blocks(handle, inode, &map, | ||
4854 | EXT4_GET_BLOCKS_PUNCH_OUT_EXT); | ||
4855 | |||
4856 | if (ret > 0) { | ||
4857 | blocks_released += ret; | ||
4858 | num_blocks = ret; | ||
4859 | } else if (ret == 0) { | ||
4860 | /* | ||
4861 | * If map blocks could not find the block, | ||
4862 | * then it is in a hole. If the hole was | ||
4863 | * not already cached, then map blocks should | ||
4864 | * put it in the cache. So we can get the hole | ||
4865 | * out of the cache | ||
4866 | */ | ||
4867 | memset(&cache_ex, 0, sizeof(cache_ex)); | ||
4868 | if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && | ||
4869 | !cache_ex.ec_start) { | ||
4870 | |||
4871 | /* The hole is cached */ | ||
4872 | num_blocks = cache_ex.ec_block + | ||
4873 | cache_ex.ec_len - iblock; | ||
4874 | |||
4875 | } else { | ||
4876 | /* The block could not be identified */ | ||
4877 | err = -EIO; | ||
4878 | break; | ||
4879 | } | ||
4880 | } else { | ||
4881 | /* Map blocks error */ | ||
4882 | err = ret; | ||
4883 | break; | ||
4884 | } | ||
4885 | |||
4886 | if (num_blocks == 0) { | ||
4887 | /* This condition should never happen */ | ||
4888 | ext_debug("Block lookup failed"); | ||
4889 | err = -EIO; | ||
4890 | break; | ||
4891 | } | ||
4892 | |||
4893 | iblock += num_blocks; | ||
4894 | } | ||
4895 | 4819 | ||
4896 | if (blocks_released > 0) { | 4820 | ext4_ext_invalidate_cache(inode); |
4897 | ext4_ext_invalidate_cache(inode); | 4821 | ext4_discard_preallocations(inode); |
4898 | ext4_discard_preallocations(inode); | ||
4899 | } | ||
4900 | 4822 | ||
4901 | if (IS_SYNC(inode)) | 4823 | if (IS_SYNC(inode)) |
4902 | ext4_handle_sync(handle); | 4824 | ext4_handle_sync(handle); |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 00a2cb753efd..bb6c7d811313 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode) | |||
89 | io = list_entry(ei->i_completed_io_list.next, | 89 | io = list_entry(ei->i_completed_io_list.next, |
90 | ext4_io_end_t, list); | 90 | ext4_io_end_t, list); |
91 | list_del_init(&io->list); | 91 | list_del_init(&io->list); |
92 | io->flag |= EXT4_IO_END_IN_FSYNC; | ||
92 | /* | 93 | /* |
93 | * Calling ext4_end_io_nolock() to convert completed | 94 | * Calling ext4_end_io_nolock() to convert completed |
94 | * IO to written. | 95 | * IO to written. |
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode) | |||
108 | if (ret < 0) | 109 | if (ret < 0) |
109 | ret2 = ret; | 110 | ret2 = ret; |
110 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 111 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
112 | io->flag &= ~EXT4_IO_END_IN_FSYNC; | ||
111 | } | 113 | } |
112 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 114 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
113 | return (ret2 < 0) ? ret2 : 0; | 115 | return (ret2 < 0) ? ret2 : 0; |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25d8c9781ad9..409c2ee7750a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, | |||
92 | return EXT4_INODES_PER_GROUP(sb); | 92 | return EXT4_INODES_PER_GROUP(sb); |
93 | } | 93 | } |
94 | 94 | ||
95 | void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) | ||
96 | { | ||
97 | if (uptodate) { | ||
98 | set_buffer_uptodate(bh); | ||
99 | set_bitmap_uptodate(bh); | ||
100 | } | ||
101 | unlock_buffer(bh); | ||
102 | put_bh(bh); | ||
103 | } | ||
104 | |||
95 | /* | 105 | /* |
96 | * Read the inode allocation bitmap for a given block_group, reading | 106 | * Read the inode allocation bitmap for a given block_group, reading |
97 | * into the specified slot in the superblock's bitmap cache. | 107 | * into the specified slot in the superblock's bitmap cache. |
@@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
147 | return bh; | 157 | return bh; |
148 | } | 158 | } |
149 | /* | 159 | /* |
150 | * submit the buffer_head for read. We can | 160 | * submit the buffer_head for reading |
151 | * safely mark the bitmap as uptodate now. | ||
152 | * We do it here so the bitmap uptodate bit | ||
153 | * get set with buffer lock held. | ||
154 | */ | 161 | */ |
155 | trace_ext4_load_inode_bitmap(sb, block_group); | 162 | trace_ext4_load_inode_bitmap(sb, block_group); |
156 | set_bitmap_uptodate(bh); | 163 | bh->b_end_io = ext4_end_bitmap_read; |
157 | if (bh_submit_read(bh) < 0) { | 164 | get_bh(bh); |
165 | submit_bh(READ, bh); | ||
166 | wait_on_buffer(bh); | ||
167 | if (!buffer_uptodate(bh)) { | ||
158 | put_bh(bh); | 168 | put_bh(bh); |
159 | ext4_error(sb, "Cannot read inode bitmap - " | 169 | ext4_error(sb, "Cannot read inode bitmap - " |
160 | "block_group = %u, inode_bitmap = %llu", | 170 | "block_group = %u, inode_bitmap = %llu", |
161 | block_group, bitmap_blk); | 171 | block_group, bitmap_blk); |
162 | return NULL; | 172 | return NULL; |
163 | } | 173 | } |
164 | return bh; | 174 | return bh; |
@@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
194 | struct ext4_sb_info *sbi; | 204 | struct ext4_sb_info *sbi; |
195 | int fatal = 0, err, count, cleared; | 205 | int fatal = 0, err, count, cleared; |
196 | 206 | ||
197 | if (atomic_read(&inode->i_count) > 1) { | 207 | if (!sb) { |
198 | printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", | 208 | printk(KERN_ERR "EXT4-fs: %s:%d: inode on " |
199 | atomic_read(&inode->i_count)); | 209 | "nonexistent device\n", __func__, __LINE__); |
200 | return; | 210 | return; |
201 | } | 211 | } |
202 | if (inode->i_nlink) { | 212 | if (atomic_read(&inode->i_count) > 1) { |
203 | printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", | 213 | ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", |
204 | inode->i_nlink); | 214 | __func__, __LINE__, inode->i_ino, |
215 | atomic_read(&inode->i_count)); | ||
205 | return; | 216 | return; |
206 | } | 217 | } |
207 | if (!sb) { | 218 | if (inode->i_nlink) { |
208 | printk(KERN_ERR "ext4_free_inode: inode on " | 219 | ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", |
209 | "nonexistent device\n"); | 220 | __func__, __LINE__, inode->i_ino, inode->i_nlink); |
210 | return; | 221 | return; |
211 | } | 222 | } |
212 | sbi = EXT4_SB(sb); | 223 | sbi = EXT4_SB(sb); |
@@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
593 | } | 604 | } |
594 | 605 | ||
595 | /* | 606 | /* |
596 | * claim the inode from the inode bitmap. If the group | ||
597 | * is uninit we need to take the groups's ext4_group_lock | ||
598 | * and clear the uninit flag. The inode bitmap update | ||
599 | * and group desc uninit flag clear should be done | ||
600 | * after holding ext4_group_lock so that ext4_read_inode_bitmap | ||
601 | * doesn't race with the ext4_claim_inode | ||
602 | */ | ||
603 | static int ext4_claim_inode(struct super_block *sb, | ||
604 | struct buffer_head *inode_bitmap_bh, | ||
605 | unsigned long ino, ext4_group_t group, umode_t mode) | ||
606 | { | ||
607 | int free = 0, retval = 0, count; | ||
608 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
609 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
610 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); | ||
611 | |||
612 | /* | ||
613 | * We have to be sure that new inode allocation does not race with | ||
614 | * inode table initialization, because otherwise we may end up | ||
615 | * allocating and writing new inode right before sb_issue_zeroout | ||
616 | * takes place and overwriting our new inode with zeroes. So we | ||
617 | * take alloc_sem to prevent it. | ||
618 | */ | ||
619 | down_read(&grp->alloc_sem); | ||
620 | ext4_lock_group(sb, group); | ||
621 | if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { | ||
622 | /* not a free inode */ | ||
623 | retval = 1; | ||
624 | goto err_ret; | ||
625 | } | ||
626 | ino++; | ||
627 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
628 | ino > EXT4_INODES_PER_GROUP(sb)) { | ||
629 | ext4_unlock_group(sb, group); | ||
630 | up_read(&grp->alloc_sem); | ||
631 | ext4_error(sb, "reserved inode or inode > inodes count - " | ||
632 | "block_group = %u, inode=%lu", group, | ||
633 | ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
634 | return 1; | ||
635 | } | ||
636 | /* If we didn't allocate from within the initialized part of the inode | ||
637 | * table then we need to initialize up to this inode. */ | ||
638 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
639 | |||
640 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
641 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
642 | /* When marking the block group with | ||
643 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
644 | * on the value of bg_itable_unused even though | ||
645 | * mke2fs could have initialized the same for us. | ||
646 | * Instead we calculated the value below | ||
647 | */ | ||
648 | |||
649 | free = 0; | ||
650 | } else { | ||
651 | free = EXT4_INODES_PER_GROUP(sb) - | ||
652 | ext4_itable_unused_count(sb, gdp); | ||
653 | } | ||
654 | |||
655 | /* | ||
656 | * Check the relative inode number against the last used | ||
657 | * relative inode number in this group. if it is greater | ||
658 | * we need to update the bg_itable_unused count | ||
659 | * | ||
660 | */ | ||
661 | if (ino > free) | ||
662 | ext4_itable_unused_set(sb, gdp, | ||
663 | (EXT4_INODES_PER_GROUP(sb) - ino)); | ||
664 | } | ||
665 | count = ext4_free_inodes_count(sb, gdp) - 1; | ||
666 | ext4_free_inodes_set(sb, gdp, count); | ||
667 | if (S_ISDIR(mode)) { | ||
668 | count = ext4_used_dirs_count(sb, gdp) + 1; | ||
669 | ext4_used_dirs_set(sb, gdp, count); | ||
670 | if (sbi->s_log_groups_per_flex) { | ||
671 | ext4_group_t f = ext4_flex_group(sbi, group); | ||
672 | |||
673 | atomic_inc(&sbi->s_flex_groups[f].used_dirs); | ||
674 | } | ||
675 | } | ||
676 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
677 | err_ret: | ||
678 | ext4_unlock_group(sb, group); | ||
679 | up_read(&grp->alloc_sem); | ||
680 | return retval; | ||
681 | } | ||
682 | |||
683 | /* | ||
684 | * There are two policies for allocating an inode. If the new inode is | 607 | * There are two policies for allocating an inode. If the new inode is |
685 | * a directory, then a forward search is made for a block group with both | 608 | * a directory, then a forward search is made for a block group with both |
686 | * free space and a low directory-to-inode ratio; if that fails, then of | 609 | * free space and a low directory-to-inode ratio; if that fails, then of |
@@ -741,6 +664,11 @@ got_group: | |||
741 | if (ret2 == -1) | 664 | if (ret2 == -1) |
742 | goto out; | 665 | goto out; |
743 | 666 | ||
667 | /* | ||
668 | * Normally we will only go through one pass of this loop, | ||
669 | * unless we get unlucky and it turns out the group we selected | ||
670 | * had its last inode grabbed by someone else. | ||
671 | */ | ||
744 | for (i = 0; i < ngroups; i++, ino = 0) { | 672 | for (i = 0; i < ngroups; i++, ino = 0) { |
745 | err = -EIO; | 673 | err = -EIO; |
746 | 674 | ||
@@ -757,51 +685,24 @@ repeat_in_this_group: | |||
757 | ino = ext4_find_next_zero_bit((unsigned long *) | 685 | ino = ext4_find_next_zero_bit((unsigned long *) |
758 | inode_bitmap_bh->b_data, | 686 | inode_bitmap_bh->b_data, |
759 | EXT4_INODES_PER_GROUP(sb), ino); | 687 | EXT4_INODES_PER_GROUP(sb), ino); |
760 | 688 | if (ino >= EXT4_INODES_PER_GROUP(sb)) { | |
761 | if (ino < EXT4_INODES_PER_GROUP(sb)) { | 689 | if (++group == ngroups) |
762 | 690 | group = 0; | |
763 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); | 691 | continue; |
764 | err = ext4_journal_get_write_access(handle, | ||
765 | inode_bitmap_bh); | ||
766 | if (err) | ||
767 | goto fail; | ||
768 | |||
769 | BUFFER_TRACE(group_desc_bh, "get_write_access"); | ||
770 | err = ext4_journal_get_write_access(handle, | ||
771 | group_desc_bh); | ||
772 | if (err) | ||
773 | goto fail; | ||
774 | if (!ext4_claim_inode(sb, inode_bitmap_bh, | ||
775 | ino, group, mode)) { | ||
776 | /* we won it */ | ||
777 | BUFFER_TRACE(inode_bitmap_bh, | ||
778 | "call ext4_handle_dirty_metadata"); | ||
779 | err = ext4_handle_dirty_metadata(handle, | ||
780 | NULL, | ||
781 | inode_bitmap_bh); | ||
782 | if (err) | ||
783 | goto fail; | ||
784 | /* zero bit is inode number 1*/ | ||
785 | ino++; | ||
786 | goto got; | ||
787 | } | ||
788 | /* we lost it */ | ||
789 | ext4_handle_release_buffer(handle, inode_bitmap_bh); | ||
790 | ext4_handle_release_buffer(handle, group_desc_bh); | ||
791 | |||
792 | if (++ino < EXT4_INODES_PER_GROUP(sb)) | ||
793 | goto repeat_in_this_group; | ||
794 | } | 692 | } |
795 | 693 | if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { | |
796 | /* | 694 | ext4_error(sb, "reserved inode found cleared - " |
797 | * This case is possible in concurrent environment. It is very | 695 | "inode=%lu", ino + 1); |
798 | * rare. We cannot repeat the find_group_xxx() call because | 696 | continue; |
799 | * that will simply return the same blockgroup, because the | 697 | } |
800 | * group descriptor metadata has not yet been updated. | 698 | ext4_lock_group(sb, group); |
801 | * So we just go onto the next blockgroup. | 699 | ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); |
802 | */ | 700 | ext4_unlock_group(sb, group); |
803 | if (++group == ngroups) | 701 | ino++; /* the inode bitmap is zero-based */ |
804 | group = 0; | 702 | if (!ret2) |
703 | goto got; /* we grabbed the inode! */ | ||
704 | if (ino < EXT4_INODES_PER_GROUP(sb)) | ||
705 | goto repeat_in_this_group; | ||
805 | } | 706 | } |
806 | err = -ENOSPC; | 707 | err = -ENOSPC; |
807 | goto out; | 708 | goto out; |
@@ -838,6 +739,59 @@ got: | |||
838 | if (err) | 739 | if (err) |
839 | goto fail; | 740 | goto fail; |
840 | } | 741 | } |
742 | |||
743 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); | ||
744 | err = ext4_journal_get_write_access(handle, inode_bitmap_bh); | ||
745 | if (err) | ||
746 | goto fail; | ||
747 | |||
748 | BUFFER_TRACE(group_desc_bh, "get_write_access"); | ||
749 | err = ext4_journal_get_write_access(handle, group_desc_bh); | ||
750 | if (err) | ||
751 | goto fail; | ||
752 | |||
753 | /* Update the relevant bg descriptor fields */ | ||
754 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
755 | int free; | ||
756 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
757 | |||
758 | down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ | ||
759 | ext4_lock_group(sb, group); /* while we modify the bg desc */ | ||
760 | free = EXT4_INODES_PER_GROUP(sb) - | ||
761 | ext4_itable_unused_count(sb, gdp); | ||
762 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
763 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
764 | free = 0; | ||
765 | } | ||
766 | /* | ||
767 | * Check the relative inode number against the last used | ||
768 | * relative inode number in this group. if it is greater | ||
769 | * we need to update the bg_itable_unused count | ||
770 | */ | ||
771 | if (ino > free) | ||
772 | ext4_itable_unused_set(sb, gdp, | ||
773 | (EXT4_INODES_PER_GROUP(sb) - ino)); | ||
774 | up_read(&grp->alloc_sem); | ||
775 | } | ||
776 | ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); | ||
777 | if (S_ISDIR(mode)) { | ||
778 | ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); | ||
779 | if (sbi->s_log_groups_per_flex) { | ||
780 | ext4_group_t f = ext4_flex_group(sbi, group); | ||
781 | |||
782 | atomic_inc(&sbi->s_flex_groups[f].used_dirs); | ||
783 | } | ||
784 | } | ||
785 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
786 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
787 | ext4_unlock_group(sb, group); | ||
788 | } | ||
789 | |||
790 | BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); | ||
791 | err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); | ||
792 | if (err) | ||
793 | goto fail; | ||
794 | |||
841 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); | 795 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); |
842 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); | 796 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); |
843 | if (err) | 797 | if (err) |
@@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) | |||
1101 | * where it is called from on active part of filesystem is ext4lazyinit | 1055 | * where it is called from on active part of filesystem is ext4lazyinit |
1102 | * thread, so we do not need any special locks, however we have to prevent | 1056 | * thread, so we do not need any special locks, however we have to prevent |
1103 | * inode allocation from the current group, so we take alloc_sem lock, to | 1057 | * inode allocation from the current group, so we take alloc_sem lock, to |
1104 | * block ext4_claim_inode until we are finished. | 1058 | * block ext4_new_inode() until we are finished. |
1105 | */ | 1059 | */ |
1106 | int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, | 1060 | int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, |
1107 | int barrier) | 1061 | int barrier) |
@@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, | |||
1149 | sbi->s_inodes_per_block); | 1103 | sbi->s_inodes_per_block); |
1150 | 1104 | ||
1151 | if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { | 1105 | if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { |
1152 | ext4_error(sb, "Something is wrong with group %u\n" | 1106 | ext4_error(sb, "Something is wrong with group %u: " |
1153 | "Used itable blocks: %d" | 1107 | "used itable blocks: %d; " |
1154 | "itable unused count: %u\n", | 1108 | "itable unused count: %u", |
1155 | group, used_blks, | 1109 | group, used_blks, |
1156 | ext4_itable_unused_count(sb, gdp)); | 1110 | ext4_itable_unused_count(sb, gdp)); |
1157 | ret = 1; | 1111 | ret = 1; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index feaa82fe629d..c77b0bd2c711 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
272 | trace_ext4_da_update_reserve_space(inode, used, quota_claim); | 272 | trace_ext4_da_update_reserve_space(inode, used, quota_claim); |
273 | if (unlikely(used > ei->i_reserved_data_blocks)) { | 273 | if (unlikely(used > ei->i_reserved_data_blocks)) { |
274 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " | 274 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " |
275 | "with only %d reserved data blocks\n", | 275 | "with only %d reserved data blocks", |
276 | __func__, inode->i_ino, used, | 276 | __func__, inode->i_ino, used, |
277 | ei->i_reserved_data_blocks); | 277 | ei->i_reserved_data_blocks); |
278 | WARN_ON(1); | 278 | WARN_ON(1); |
@@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
1165 | */ | 1165 | */ |
1166 | ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " | 1166 | ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " |
1167 | "ino %lu, to_free %d with only %d reserved " | 1167 | "ino %lu, to_free %d with only %d reserved " |
1168 | "data blocks\n", inode->i_ino, to_free, | 1168 | "data blocks", inode->i_ino, to_free, |
1169 | ei->i_reserved_data_blocks); | 1169 | ei->i_reserved_data_blocks); |
1170 | WARN_ON(1); | 1170 | WARN_ON(1); |
1171 | to_free = ei->i_reserved_data_blocks; | 1171 | to_free = ei->i_reserved_data_blocks; |
@@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) | |||
1428 | static void ext4_print_free_blocks(struct inode *inode) | 1428 | static void ext4_print_free_blocks(struct inode *inode) |
1429 | { | 1429 | { |
1430 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1430 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1431 | printk(KERN_CRIT "Total free blocks count %lld\n", | 1431 | struct super_block *sb = inode->i_sb; |
1432 | |||
1433 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", | ||
1432 | EXT4_C2B(EXT4_SB(inode->i_sb), | 1434 | EXT4_C2B(EXT4_SB(inode->i_sb), |
1433 | ext4_count_free_clusters(inode->i_sb))); | 1435 | ext4_count_free_clusters(inode->i_sb))); |
1434 | printk(KERN_CRIT "Free/Dirty block details\n"); | 1436 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); |
1435 | printk(KERN_CRIT "free_blocks=%lld\n", | 1437 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", |
1436 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), | 1438 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), |
1437 | percpu_counter_sum(&sbi->s_freeclusters_counter))); | 1439 | percpu_counter_sum(&sbi->s_freeclusters_counter))); |
1438 | printk(KERN_CRIT "dirty_blocks=%lld\n", | 1440 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", |
1439 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), | 1441 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), |
1440 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); | 1442 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); |
1441 | printk(KERN_CRIT "Block reservation details\n"); | 1443 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); |
1442 | printk(KERN_CRIT "i_reserved_data_blocks=%u\n", | 1444 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", |
1443 | EXT4_I(inode)->i_reserved_data_blocks); | 1445 | EXT4_I(inode)->i_reserved_data_blocks); |
1444 | printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", | 1446 | ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", |
1445 | EXT4_I(inode)->i_reserved_meta_blocks); | 1447 | EXT4_I(inode)->i_reserved_meta_blocks); |
1446 | return; | 1448 | return; |
1447 | } | 1449 | } |
@@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file, | |||
2482 | int write_mode = (int)(unsigned long)fsdata; | 2484 | int write_mode = (int)(unsigned long)fsdata; |
2483 | 2485 | ||
2484 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 2486 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { |
2485 | if (ext4_should_order_data(inode)) { | 2487 | switch (ext4_inode_journal_mode(inode)) { |
2488 | case EXT4_INODE_ORDERED_DATA_MODE: | ||
2486 | return ext4_ordered_write_end(file, mapping, pos, | 2489 | return ext4_ordered_write_end(file, mapping, pos, |
2487 | len, copied, page, fsdata); | 2490 | len, copied, page, fsdata); |
2488 | } else if (ext4_should_writeback_data(inode)) { | 2491 | case EXT4_INODE_WRITEBACK_DATA_MODE: |
2489 | return ext4_writeback_write_end(file, mapping, pos, | 2492 | return ext4_writeback_write_end(file, mapping, pos, |
2490 | len, copied, page, fsdata); | 2493 | len, copied, page, fsdata); |
2491 | } else { | 2494 | default: |
2492 | BUG(); | 2495 | BUG(); |
2493 | } | 2496 | } |
2494 | } | 2497 | } |
@@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
2763 | goto out; | 2766 | goto out; |
2764 | 2767 | ||
2765 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 2768 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
2766 | "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", | 2769 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
2767 | iocb->private, io_end->inode->i_ino, iocb, offset, | 2770 | iocb->private, io_end->inode->i_ino, iocb, offset, |
2768 | size); | 2771 | size); |
2769 | 2772 | ||
@@ -2795,9 +2798,6 @@ out: | |||
2795 | 2798 | ||
2796 | /* queue the work to convert unwritten extents to written */ | 2799 | /* queue the work to convert unwritten extents to written */ |
2797 | queue_work(wq, &io_end->work); | 2800 | queue_work(wq, &io_end->work); |
2798 | |||
2799 | /* XXX: probably should move into the real I/O completion handler */ | ||
2800 | inode_dio_done(inode); | ||
2801 | } | 2801 | } |
2802 | 2802 | ||
2803 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | 2803 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) |
@@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | |||
2811 | goto out; | 2811 | goto out; |
2812 | 2812 | ||
2813 | if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { | 2813 | if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { |
2814 | printk("sb umounted, discard end_io request for inode %lu\n", | 2814 | ext4_msg(io_end->inode->i_sb, KERN_INFO, |
2815 | io_end->inode->i_ino); | 2815 | "sb umounted, discard end_io request for inode %lu", |
2816 | io_end->inode->i_ino); | ||
2816 | ext4_free_io_end(io_end); | 2817 | ext4_free_io_end(io_end); |
2817 | goto out; | 2818 | goto out; |
2818 | } | 2819 | } |
@@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
2921 | iocb->private = NULL; | 2922 | iocb->private = NULL; |
2922 | EXT4_I(inode)->cur_aio_dio = NULL; | 2923 | EXT4_I(inode)->cur_aio_dio = NULL; |
2923 | if (!is_sync_kiocb(iocb)) { | 2924 | if (!is_sync_kiocb(iocb)) { |
2924 | iocb->private = ext4_init_io_end(inode, GFP_NOFS); | 2925 | ext4_io_end_t *io_end = |
2925 | if (!iocb->private) | 2926 | ext4_init_io_end(inode, GFP_NOFS); |
2927 | if (!io_end) | ||
2926 | return -ENOMEM; | 2928 | return -ENOMEM; |
2929 | io_end->flag |= EXT4_IO_END_DIRECT; | ||
2930 | iocb->private = io_end; | ||
2927 | /* | 2931 | /* |
2928 | * we save the io structure for current async | 2932 | * we save the io structure for current async |
2929 | * direct IO, so that later ext4_map_blocks() | 2933 | * direct IO, so that later ext4_map_blocks() |
@@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
2940 | ext4_get_block_write, | 2944 | ext4_get_block_write, |
2941 | ext4_end_io_dio, | 2945 | ext4_end_io_dio, |
2942 | NULL, | 2946 | NULL, |
2943 | DIO_LOCKING | DIO_SKIP_HOLES); | 2947 | DIO_LOCKING); |
2944 | if (iocb->private) | 2948 | if (iocb->private) |
2945 | EXT4_I(inode)->cur_aio_dio = NULL; | 2949 | EXT4_I(inode)->cur_aio_dio = NULL; |
2946 | /* | 2950 | /* |
@@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = { | |||
3086 | 3090 | ||
3087 | void ext4_set_aops(struct inode *inode) | 3091 | void ext4_set_aops(struct inode *inode) |
3088 | { | 3092 | { |
3089 | if (ext4_should_order_data(inode) && | 3093 | switch (ext4_inode_journal_mode(inode)) { |
3090 | test_opt(inode->i_sb, DELALLOC)) | 3094 | case EXT4_INODE_ORDERED_DATA_MODE: |
3091 | inode->i_mapping->a_ops = &ext4_da_aops; | 3095 | if (test_opt(inode->i_sb, DELALLOC)) |
3092 | else if (ext4_should_order_data(inode)) | 3096 | inode->i_mapping->a_ops = &ext4_da_aops; |
3093 | inode->i_mapping->a_ops = &ext4_ordered_aops; | 3097 | else |
3094 | else if (ext4_should_writeback_data(inode) && | 3098 | inode->i_mapping->a_ops = &ext4_ordered_aops; |
3095 | test_opt(inode->i_sb, DELALLOC)) | 3099 | break; |
3096 | inode->i_mapping->a_ops = &ext4_da_aops; | 3100 | case EXT4_INODE_WRITEBACK_DATA_MODE: |
3097 | else if (ext4_should_writeback_data(inode)) | 3101 | if (test_opt(inode->i_sb, DELALLOC)) |
3098 | inode->i_mapping->a_ops = &ext4_writeback_aops; | 3102 | inode->i_mapping->a_ops = &ext4_da_aops; |
3099 | else | 3103 | else |
3104 | inode->i_mapping->a_ops = &ext4_writeback_aops; | ||
3105 | break; | ||
3106 | case EXT4_INODE_JOURNAL_DATA_MODE: | ||
3100 | inode->i_mapping->a_ops = &ext4_journalled_aops; | 3107 | inode->i_mapping->a_ops = &ext4_journalled_aops; |
3108 | break; | ||
3109 | default: | ||
3110 | BUG(); | ||
3111 | } | ||
3101 | } | 3112 | } |
3102 | 3113 | ||
3103 | 3114 | ||
@@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3329 | { | 3340 | { |
3330 | struct inode *inode = file->f_path.dentry->d_inode; | 3341 | struct inode *inode = file->f_path.dentry->d_inode; |
3331 | if (!S_ISREG(inode->i_mode)) | 3342 | if (!S_ISREG(inode->i_mode)) |
3332 | return -ENOTSUPP; | 3343 | return -EOPNOTSUPP; |
3333 | 3344 | ||
3334 | if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 3345 | if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
3335 | /* TODO: Add support for non extent hole punching */ | 3346 | /* TODO: Add support for non extent hole punching */ |
3336 | return -ENOTSUPP; | 3347 | return -EOPNOTSUPP; |
3337 | } | 3348 | } |
3338 | 3349 | ||
3339 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { | 3350 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { |
3340 | /* TODO: Add support for bigalloc file systems */ | 3351 | /* TODO: Add support for bigalloc file systems */ |
3341 | return -ENOTSUPP; | 3352 | return -EOPNOTSUPP; |
3342 | } | 3353 | } |
3343 | 3354 | ||
3344 | return ext4_ext_punch_hole(file, offset, length); | 3355 | return ext4_ext_punch_hole(file, offset, length); |
@@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle, | |||
3924 | ext4_update_dynamic_rev(sb); | 3935 | ext4_update_dynamic_rev(sb); |
3925 | EXT4_SET_RO_COMPAT_FEATURE(sb, | 3936 | EXT4_SET_RO_COMPAT_FEATURE(sb, |
3926 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); | 3937 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); |
3927 | sb->s_dirt = 1; | ||
3928 | ext4_handle_sync(handle); | 3938 | ext4_handle_sync(handle); |
3929 | err = ext4_handle_dirty_metadata(handle, NULL, | 3939 | err = ext4_handle_dirty_super(handle, sb); |
3930 | EXT4_SB(sb)->s_sbh); | ||
3931 | } | 3940 | } |
3932 | } | 3941 | } |
3933 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); | 3942 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); |
@@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4152 | } | 4161 | } |
4153 | 4162 | ||
4154 | if (attr->ia_valid & ATTR_SIZE) { | 4163 | if (attr->ia_valid & ATTR_SIZE) { |
4155 | if (attr->ia_size != i_size_read(inode)) { | 4164 | if (attr->ia_size != i_size_read(inode)) |
4156 | truncate_setsize(inode, attr->ia_size); | 4165 | truncate_setsize(inode, attr->ia_size); |
4157 | ext4_truncate(inode); | 4166 | ext4_truncate(inode); |
4158 | } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) | ||
4159 | ext4_truncate(inode); | ||
4160 | } | 4167 | } |
4161 | 4168 | ||
4162 | if (!rc) { | 4169 | if (!rc) { |
@@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, | |||
4314 | { | 4321 | { |
4315 | int err = 0; | 4322 | int err = 0; |
4316 | 4323 | ||
4317 | if (test_opt(inode->i_sb, I_VERSION)) | 4324 | if (IS_I_VERSION(inode)) |
4318 | inode_inc_iversion(inode); | 4325 | inode_inc_iversion(inode); |
4319 | 4326 | ||
4320 | /* the do_update_inode consumes one bh->b_count */ | 4327 | /* the do_update_inode consumes one bh->b_count */ |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb990b21c698..99ab428bcfa0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -21,6 +21,7 @@ | |||
21 | * mballoc.c contains the multiblocks allocation routines | 21 | * mballoc.c contains the multiblocks allocation routines |
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include "ext4_jbd2.h" | ||
24 | #include "mballoc.h" | 25 | #include "mballoc.h" |
25 | #include <linux/debugfs.h> | 26 | #include <linux/debugfs.h> |
26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
@@ -339,7 +340,7 @@ | |||
339 | */ | 340 | */ |
340 | static struct kmem_cache *ext4_pspace_cachep; | 341 | static struct kmem_cache *ext4_pspace_cachep; |
341 | static struct kmem_cache *ext4_ac_cachep; | 342 | static struct kmem_cache *ext4_ac_cachep; |
342 | static struct kmem_cache *ext4_free_ext_cachep; | 343 | static struct kmem_cache *ext4_free_data_cachep; |
343 | 344 | ||
344 | /* We create slab caches for groupinfo data structures based on the | 345 | /* We create slab caches for groupinfo data structures based on the |
345 | * superblock block size. There will be one per mounted filesystem for | 346 | * superblock block size. There will be one per mounted filesystem for |
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
357 | ext4_group_t group); | 358 | ext4_group_t group); |
358 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | 359 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, |
359 | ext4_group_t group); | 360 | ext4_group_t group); |
360 | static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); | 361 | static void ext4_free_data_callback(struct super_block *sb, |
362 | struct ext4_journal_cb_entry *jce, int rc); | ||
361 | 363 | ||
362 | static inline void *mb_correct_addr_and_bit(int *bit, void *addr) | 364 | static inline void *mb_correct_addr_and_bit(int *bit, void *addr) |
363 | { | 365 | { |
@@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | |||
425 | { | 427 | { |
426 | char *bb; | 428 | char *bb; |
427 | 429 | ||
428 | BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); | 430 | BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); |
429 | BUG_ON(max == NULL); | 431 | BUG_ON(max == NULL); |
430 | 432 | ||
431 | if (order > e4b->bd_blkbits + 1) { | 433 | if (order > e4b->bd_blkbits + 1) { |
@@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | |||
436 | /* at order 0 we see each particular block */ | 438 | /* at order 0 we see each particular block */ |
437 | if (order == 0) { | 439 | if (order == 0) { |
438 | *max = 1 << (e4b->bd_blkbits + 3); | 440 | *max = 1 << (e4b->bd_blkbits + 3); |
439 | return EXT4_MB_BITMAP(e4b); | 441 | return e4b->bd_bitmap; |
440 | } | 442 | } |
441 | 443 | ||
442 | bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; | 444 | bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; |
443 | *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; | 445 | *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; |
444 | 446 | ||
445 | return bb; | 447 | return bb; |
@@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, | |||
588 | for (j = 0; j < (1 << order); j++) { | 590 | for (j = 0; j < (1 << order); j++) { |
589 | k = (i * (1 << order)) + j; | 591 | k = (i * (1 << order)) + j; |
590 | MB_CHECK_ASSERT( | 592 | MB_CHECK_ASSERT( |
591 | !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); | 593 | !mb_test_bit(k, e4b->bd_bitmap)); |
592 | } | 594 | } |
593 | count++; | 595 | count++; |
594 | } | 596 | } |
@@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
782 | int groups_per_page; | 784 | int groups_per_page; |
783 | int err = 0; | 785 | int err = 0; |
784 | int i; | 786 | int i; |
785 | ext4_group_t first_group; | 787 | ext4_group_t first_group, group; |
786 | int first_block; | 788 | int first_block; |
787 | struct super_block *sb; | 789 | struct super_block *sb; |
788 | struct buffer_head *bhs; | 790 | struct buffer_head *bhs; |
@@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
806 | 808 | ||
807 | /* allocate buffer_heads to read bitmaps */ | 809 | /* allocate buffer_heads to read bitmaps */ |
808 | if (groups_per_page > 1) { | 810 | if (groups_per_page > 1) { |
809 | err = -ENOMEM; | ||
810 | i = sizeof(struct buffer_head *) * groups_per_page; | 811 | i = sizeof(struct buffer_head *) * groups_per_page; |
811 | bh = kzalloc(i, GFP_NOFS); | 812 | bh = kzalloc(i, GFP_NOFS); |
812 | if (bh == NULL) | 813 | if (bh == NULL) { |
814 | err = -ENOMEM; | ||
813 | goto out; | 815 | goto out; |
816 | } | ||
814 | } else | 817 | } else |
815 | bh = &bhs; | 818 | bh = &bhs; |
816 | 819 | ||
817 | first_group = page->index * blocks_per_page / 2; | 820 | first_group = page->index * blocks_per_page / 2; |
818 | 821 | ||
819 | /* read all groups the page covers into the cache */ | 822 | /* read all groups the page covers into the cache */ |
820 | for (i = 0; i < groups_per_page; i++) { | 823 | for (i = 0, group = first_group; i < groups_per_page; i++, group++) { |
821 | struct ext4_group_desc *desc; | 824 | if (group >= ngroups) |
822 | |||
823 | if (first_group + i >= ngroups) | ||
824 | break; | 825 | break; |
825 | 826 | ||
826 | grinfo = ext4_get_group_info(sb, first_group + i); | 827 | grinfo = ext4_get_group_info(sb, group); |
827 | /* | 828 | /* |
828 | * If page is uptodate then we came here after online resize | 829 | * If page is uptodate then we came here after online resize |
829 | * which added some new uninitialized group info structs, so | 830 | * which added some new uninitialized group info structs, so |
@@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
834 | bh[i] = NULL; | 835 | bh[i] = NULL; |
835 | continue; | 836 | continue; |
836 | } | 837 | } |
837 | 838 | if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { | |
838 | err = -EIO; | 839 | err = -ENOMEM; |
839 | desc = ext4_get_group_desc(sb, first_group + i, NULL); | ||
840 | if (desc == NULL) | ||
841 | goto out; | ||
842 | |||
843 | err = -ENOMEM; | ||
844 | bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); | ||
845 | if (bh[i] == NULL) | ||
846 | goto out; | 840 | goto out; |
847 | |||
848 | if (bitmap_uptodate(bh[i])) | ||
849 | continue; | ||
850 | |||
851 | lock_buffer(bh[i]); | ||
852 | if (bitmap_uptodate(bh[i])) { | ||
853 | unlock_buffer(bh[i]); | ||
854 | continue; | ||
855 | } | ||
856 | ext4_lock_group(sb, first_group + i); | ||
857 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
858 | ext4_init_block_bitmap(sb, bh[i], | ||
859 | first_group + i, desc); | ||
860 | set_bitmap_uptodate(bh[i]); | ||
861 | set_buffer_uptodate(bh[i]); | ||
862 | ext4_unlock_group(sb, first_group + i); | ||
863 | unlock_buffer(bh[i]); | ||
864 | continue; | ||
865 | } | 841 | } |
866 | ext4_unlock_group(sb, first_group + i); | 842 | mb_debug(1, "read bitmap for group %u\n", group); |
867 | if (buffer_uptodate(bh[i])) { | ||
868 | /* | ||
869 | * if not uninit if bh is uptodate, | ||
870 | * bitmap is also uptodate | ||
871 | */ | ||
872 | set_bitmap_uptodate(bh[i]); | ||
873 | unlock_buffer(bh[i]); | ||
874 | continue; | ||
875 | } | ||
876 | get_bh(bh[i]); | ||
877 | /* | ||
878 | * submit the buffer_head for read. We can | ||
879 | * safely mark the bitmap as uptodate now. | ||
880 | * We do it here so the bitmap uptodate bit | ||
881 | * get set with buffer lock held. | ||
882 | */ | ||
883 | set_bitmap_uptodate(bh[i]); | ||
884 | bh[i]->b_end_io = end_buffer_read_sync; | ||
885 | submit_bh(READ, bh[i]); | ||
886 | mb_debug(1, "read bitmap for group %u\n", first_group + i); | ||
887 | } | 843 | } |
888 | 844 | ||
889 | /* wait for I/O completion */ | 845 | /* wait for I/O completion */ |
890 | for (i = 0; i < groups_per_page; i++) | 846 | for (i = 0, group = first_group; i < groups_per_page; i++, group++) { |
891 | if (bh[i]) | 847 | if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { |
892 | wait_on_buffer(bh[i]); | 848 | err = -EIO; |
893 | |||
894 | err = -EIO; | ||
895 | for (i = 0; i < groups_per_page; i++) | ||
896 | if (bh[i] && !buffer_uptodate(bh[i])) | ||
897 | goto out; | 849 | goto out; |
850 | } | ||
851 | } | ||
898 | 852 | ||
899 | err = 0; | ||
900 | first_block = page->index * blocks_per_page; | 853 | first_block = page->index * blocks_per_page; |
901 | for (i = 0; i < blocks_per_page; i++) { | 854 | for (i = 0; i < blocks_per_page; i++) { |
902 | int group; | 855 | int group; |
@@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) | |||
1250 | int order = 1; | 1203 | int order = 1; |
1251 | void *bb; | 1204 | void *bb; |
1252 | 1205 | ||
1253 | BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); | 1206 | BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); |
1254 | BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); | 1207 | BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); |
1255 | 1208 | ||
1256 | bb = EXT4_MB_BUDDY(e4b); | 1209 | bb = e4b->bd_buddy; |
1257 | while (order <= e4b->bd_blkbits + 1) { | 1210 | while (order <= e4b->bd_blkbits + 1) { |
1258 | block = block >> 1; | 1211 | block = block >> 1; |
1259 | if (!mb_test_bit(block, bb)) { | 1212 | if (!mb_test_bit(block, bb)) { |
@@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1323 | 1276 | ||
1324 | /* let's maintain fragments counter */ | 1277 | /* let's maintain fragments counter */ |
1325 | if (first != 0) | 1278 | if (first != 0) |
1326 | block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); | 1279 | block = !mb_test_bit(first - 1, e4b->bd_bitmap); |
1327 | if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) | 1280 | if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) |
1328 | max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); | 1281 | max = !mb_test_bit(first + count, e4b->bd_bitmap); |
1329 | if (block && max) | 1282 | if (block && max) |
1330 | e4b->bd_info->bb_fragments--; | 1283 | e4b->bd_info->bb_fragments--; |
1331 | else if (!block && !max) | 1284 | else if (!block && !max) |
@@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1336 | block = first++; | 1289 | block = first++; |
1337 | order = 0; | 1290 | order = 0; |
1338 | 1291 | ||
1339 | if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { | 1292 | if (!mb_test_bit(block, e4b->bd_bitmap)) { |
1340 | ext4_fsblk_t blocknr; | 1293 | ext4_fsblk_t blocknr; |
1341 | 1294 | ||
1342 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); | 1295 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); |
@@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1347 | "freeing already freed block " | 1300 | "freeing already freed block " |
1348 | "(bit %u)", block); | 1301 | "(bit %u)", block); |
1349 | } | 1302 | } |
1350 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | 1303 | mb_clear_bit(block, e4b->bd_bitmap); |
1351 | e4b->bd_info->bb_counters[order]++; | 1304 | e4b->bd_info->bb_counters[order]++; |
1352 | 1305 | ||
1353 | /* start of the buddy */ | 1306 | /* start of the buddy */ |
@@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | |||
1429 | break; | 1382 | break; |
1430 | 1383 | ||
1431 | next = (block + 1) * (1 << order); | 1384 | next = (block + 1) * (1 << order); |
1432 | if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) | 1385 | if (mb_test_bit(next, e4b->bd_bitmap)) |
1433 | break; | 1386 | break; |
1434 | 1387 | ||
1435 | order = mb_find_order_for_block(e4b, next); | 1388 | order = mb_find_order_for_block(e4b, next); |
@@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
1466 | 1419 | ||
1467 | /* let's maintain fragments counter */ | 1420 | /* let's maintain fragments counter */ |
1468 | if (start != 0) | 1421 | if (start != 0) |
1469 | mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); | 1422 | mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); |
1470 | if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) | 1423 | if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) |
1471 | max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); | 1424 | max = !mb_test_bit(start + len, e4b->bd_bitmap); |
1472 | if (mlen && max) | 1425 | if (mlen && max) |
1473 | e4b->bd_info->bb_fragments++; | 1426 | e4b->bd_info->bb_fragments++; |
1474 | else if (!mlen && !max) | 1427 | else if (!mlen && !max) |
@@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
1511 | } | 1464 | } |
1512 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); | 1465 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); |
1513 | 1466 | ||
1514 | ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | 1467 | ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); |
1515 | mb_check_buddy(e4b); | 1468 | mb_check_buddy(e4b); |
1516 | 1469 | ||
1517 | return ret; | 1470 | return ret; |
@@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | |||
1810 | struct ext4_buddy *e4b) | 1763 | struct ext4_buddy *e4b) |
1811 | { | 1764 | { |
1812 | struct super_block *sb = ac->ac_sb; | 1765 | struct super_block *sb = ac->ac_sb; |
1813 | void *bitmap = EXT4_MB_BITMAP(e4b); | 1766 | void *bitmap = e4b->bd_bitmap; |
1814 | struct ext4_free_extent ex; | 1767 | struct ext4_free_extent ex; |
1815 | int i; | 1768 | int i; |
1816 | int free; | 1769 | int free; |
@@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, | |||
1870 | { | 1823 | { |
1871 | struct super_block *sb = ac->ac_sb; | 1824 | struct super_block *sb = ac->ac_sb; |
1872 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1825 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1873 | void *bitmap = EXT4_MB_BITMAP(e4b); | 1826 | void *bitmap = e4b->bd_bitmap; |
1874 | struct ext4_free_extent ex; | 1827 | struct ext4_free_extent ex; |
1875 | ext4_fsblk_t first_group_block; | 1828 | ext4_fsblk_t first_group_block; |
1876 | ext4_fsblk_t a; | 1829 | ext4_fsblk_t a; |
@@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2224 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2177 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2225 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | 2178 | meta_group_info = kmalloc(metalen, GFP_KERNEL); |
2226 | if (meta_group_info == NULL) { | 2179 | if (meta_group_info == NULL) { |
2227 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " | 2180 | ext4_msg(sb, KERN_ERR, "can't allocate mem " |
2228 | "for a buddy group"); | 2181 | "for a buddy group"); |
2229 | goto exit_meta_group_info; | 2182 | goto exit_meta_group_info; |
2230 | } | 2183 | } |
@@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2238 | 2191 | ||
2239 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); | 2192 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); |
2240 | if (meta_group_info[i] == NULL) { | 2193 | if (meta_group_info[i] == NULL) { |
2241 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); | 2194 | ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); |
2242 | goto exit_group_info; | 2195 | goto exit_group_info; |
2243 | } | 2196 | } |
2244 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); | 2197 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); |
@@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2522 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, | 2475 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, |
2523 | &ext4_mb_seq_groups_fops, sb); | 2476 | &ext4_mb_seq_groups_fops, sb); |
2524 | 2477 | ||
2525 | if (sbi->s_journal) | ||
2526 | sbi->s_journal->j_commit_callback = release_blocks_on_commit; | ||
2527 | |||
2528 | return 0; | 2478 | return 0; |
2529 | 2479 | ||
2530 | out_free_locality_groups: | 2480 | out_free_locality_groups: |
@@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb, | |||
2637 | * This function is called by the jbd2 layer once the commit has finished, | 2587 | * This function is called by the jbd2 layer once the commit has finished, |
2638 | * so we know we can free the blocks that were released with that commit. | 2588 | * so we know we can free the blocks that were released with that commit. |
2639 | */ | 2589 | */ |
2640 | static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | 2590 | static void ext4_free_data_callback(struct super_block *sb, |
2591 | struct ext4_journal_cb_entry *jce, | ||
2592 | int rc) | ||
2641 | { | 2593 | { |
2642 | struct super_block *sb = journal->j_private; | 2594 | struct ext4_free_data *entry = (struct ext4_free_data *)jce; |
2643 | struct ext4_buddy e4b; | 2595 | struct ext4_buddy e4b; |
2644 | struct ext4_group_info *db; | 2596 | struct ext4_group_info *db; |
2645 | int err, count = 0, count2 = 0; | 2597 | int err, count = 0, count2 = 0; |
2646 | struct ext4_free_data *entry; | ||
2647 | struct list_head *l, *ltmp; | ||
2648 | 2598 | ||
2649 | list_for_each_safe(l, ltmp, &txn->t_private_list) { | 2599 | mb_debug(1, "gonna free %u blocks in group %u (0x%p):", |
2650 | entry = list_entry(l, struct ext4_free_data, list); | 2600 | entry->efd_count, entry->efd_group, entry); |
2651 | 2601 | ||
2652 | mb_debug(1, "gonna free %u blocks in group %u (0x%p):", | 2602 | if (test_opt(sb, DISCARD)) |
2653 | entry->count, entry->group, entry); | 2603 | ext4_issue_discard(sb, entry->efd_group, |
2604 | entry->efd_start_cluster, entry->efd_count); | ||
2654 | 2605 | ||
2655 | if (test_opt(sb, DISCARD)) | 2606 | err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); |
2656 | ext4_issue_discard(sb, entry->group, | 2607 | /* we expect to find existing buddy because it's pinned */ |
2657 | entry->start_cluster, entry->count); | 2608 | BUG_ON(err != 0); |
2658 | 2609 | ||
2659 | err = ext4_mb_load_buddy(sb, entry->group, &e4b); | ||
2660 | /* we expect to find existing buddy because it's pinned */ | ||
2661 | BUG_ON(err != 0); | ||
2662 | 2610 | ||
2663 | db = e4b.bd_info; | 2611 | db = e4b.bd_info; |
2664 | /* there are blocks to put in buddy to make them really free */ | 2612 | /* there are blocks to put in buddy to make them really free */ |
2665 | count += entry->count; | 2613 | count += entry->efd_count; |
2666 | count2++; | 2614 | count2++; |
2667 | ext4_lock_group(sb, entry->group); | 2615 | ext4_lock_group(sb, entry->efd_group); |
2668 | /* Take it out of per group rb tree */ | 2616 | /* Take it out of per group rb tree */ |
2669 | rb_erase(&entry->node, &(db->bb_free_root)); | 2617 | rb_erase(&entry->efd_node, &(db->bb_free_root)); |
2670 | mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); | 2618 | mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); |
2671 | 2619 | ||
2672 | /* | 2620 | /* |
2673 | * Clear the trimmed flag for the group so that the next | 2621 | * Clear the trimmed flag for the group so that the next |
2674 | * ext4_trim_fs can trim it. | 2622 | * ext4_trim_fs can trim it. |
2675 | * If the volume is mounted with -o discard, online discard | 2623 | * If the volume is mounted with -o discard, online discard |
2676 | * is supported and the free blocks will be trimmed online. | 2624 | * is supported and the free blocks will be trimmed online. |
2677 | */ | 2625 | */ |
2678 | if (!test_opt(sb, DISCARD)) | 2626 | if (!test_opt(sb, DISCARD)) |
2679 | EXT4_MB_GRP_CLEAR_TRIMMED(db); | 2627 | EXT4_MB_GRP_CLEAR_TRIMMED(db); |
2680 | 2628 | ||
2681 | if (!db->bb_free_root.rb_node) { | 2629 | if (!db->bb_free_root.rb_node) { |
2682 | /* No more items in the per group rb tree | 2630 | /* No more items in the per group rb tree |
2683 | * balance refcounts from ext4_mb_free_metadata() | 2631 | * balance refcounts from ext4_mb_free_metadata() |
2684 | */ | 2632 | */ |
2685 | page_cache_release(e4b.bd_buddy_page); | 2633 | page_cache_release(e4b.bd_buddy_page); |
2686 | page_cache_release(e4b.bd_bitmap_page); | 2634 | page_cache_release(e4b.bd_bitmap_page); |
2687 | } | ||
2688 | ext4_unlock_group(sb, entry->group); | ||
2689 | kmem_cache_free(ext4_free_ext_cachep, entry); | ||
2690 | ext4_mb_unload_buddy(&e4b); | ||
2691 | } | 2635 | } |
2636 | ext4_unlock_group(sb, entry->efd_group); | ||
2637 | kmem_cache_free(ext4_free_data_cachep, entry); | ||
2638 | ext4_mb_unload_buddy(&e4b); | ||
2692 | 2639 | ||
2693 | mb_debug(1, "freed %u blocks in %u structures\n", count, count2); | 2640 | mb_debug(1, "freed %u blocks in %u structures\n", count, count2); |
2694 | } | 2641 | } |
@@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void) | |||
2741 | return -ENOMEM; | 2688 | return -ENOMEM; |
2742 | } | 2689 | } |
2743 | 2690 | ||
2744 | ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, | 2691 | ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, |
2745 | SLAB_RECLAIM_ACCOUNT); | 2692 | SLAB_RECLAIM_ACCOUNT); |
2746 | if (ext4_free_ext_cachep == NULL) { | 2693 | if (ext4_free_data_cachep == NULL) { |
2747 | kmem_cache_destroy(ext4_pspace_cachep); | 2694 | kmem_cache_destroy(ext4_pspace_cachep); |
2748 | kmem_cache_destroy(ext4_ac_cachep); | 2695 | kmem_cache_destroy(ext4_ac_cachep); |
2749 | return -ENOMEM; | 2696 | return -ENOMEM; |
@@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void) | |||
2761 | rcu_barrier(); | 2708 | rcu_barrier(); |
2762 | kmem_cache_destroy(ext4_pspace_cachep); | 2709 | kmem_cache_destroy(ext4_pspace_cachep); |
2763 | kmem_cache_destroy(ext4_ac_cachep); | 2710 | kmem_cache_destroy(ext4_ac_cachep); |
2764 | kmem_cache_destroy(ext4_free_ext_cachep); | 2711 | kmem_cache_destroy(ext4_free_data_cachep); |
2765 | ext4_groupinfo_destroy_slabs(); | 2712 | ext4_groupinfo_destroy_slabs(); |
2766 | ext4_remove_debugfs_entry(); | 2713 | ext4_remove_debugfs_entry(); |
2767 | } | 2714 | } |
@@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2815 | len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); | 2762 | len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); |
2816 | if (!ext4_data_block_valid(sbi, block, len)) { | 2763 | if (!ext4_data_block_valid(sbi, block, len)) { |
2817 | ext4_error(sb, "Allocating blocks %llu-%llu which overlap " | 2764 | ext4_error(sb, "Allocating blocks %llu-%llu which overlap " |
2818 | "fs metadata\n", block, block+len); | 2765 | "fs metadata", block, block+len); |
2819 | /* File system mounted not to panic on error | 2766 | /* File system mounted not to panic on error |
2820 | * Fix the bitmap and repeat the block allocation | 2767 | * Fix the bitmap and repeat the block allocation |
2821 | * We leak some of the blocks here. | 2768 | * We leak some of the blocks here. |
@@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
2911 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | 2858 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); |
2912 | int bsbits, max; | 2859 | int bsbits, max; |
2913 | ext4_lblk_t end; | 2860 | ext4_lblk_t end; |
2914 | loff_t size, orig_size, start_off; | 2861 | loff_t size, start_off; |
2862 | loff_t orig_size __maybe_unused; | ||
2915 | ext4_lblk_t start; | 2863 | ext4_lblk_t start; |
2916 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | 2864 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); |
2917 | struct ext4_prealloc_space *pa; | 2865 | struct ext4_prealloc_space *pa; |
@@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | |||
3321 | n = rb_first(&(grp->bb_free_root)); | 3269 | n = rb_first(&(grp->bb_free_root)); |
3322 | 3270 | ||
3323 | while (n) { | 3271 | while (n) { |
3324 | entry = rb_entry(n, struct ext4_free_data, node); | 3272 | entry = rb_entry(n, struct ext4_free_data, efd_node); |
3325 | ext4_set_bits(bitmap, entry->start_cluster, entry->count); | 3273 | ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); |
3326 | n = rb_next(n); | 3274 | n = rb_next(n); |
3327 | } | 3275 | } |
3328 | return; | 3276 | return; |
@@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3916 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) | 3864 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) |
3917 | return; | 3865 | return; |
3918 | 3866 | ||
3919 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" | 3867 | ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" |
3920 | " Allocation context details:"); | 3868 | " Allocation context details:"); |
3921 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", | 3869 | ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", |
3922 | ac->ac_status, ac->ac_flags); | 3870 | ac->ac_status, ac->ac_flags); |
3923 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " | 3871 | ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " |
3924 | "goal %lu/%lu/%lu@%lu, " | 3872 | "goal %lu/%lu/%lu@%lu, " |
3925 | "best %lu/%lu/%lu@%lu cr %d", | 3873 | "best %lu/%lu/%lu@%lu cr %d", |
3926 | (unsigned long)ac->ac_o_ex.fe_group, | 3874 | (unsigned long)ac->ac_o_ex.fe_group, |
@@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3936 | (unsigned long)ac->ac_b_ex.fe_len, | 3884 | (unsigned long)ac->ac_b_ex.fe_len, |
3937 | (unsigned long)ac->ac_b_ex.fe_logical, | 3885 | (unsigned long)ac->ac_b_ex.fe_logical, |
3938 | (int)ac->ac_criteria); | 3886 | (int)ac->ac_criteria); |
3939 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", | 3887 | ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", |
3940 | ac->ac_ex_scanned, ac->ac_found); | 3888 | ac->ac_ex_scanned, ac->ac_found); |
3941 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); | 3889 | ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); |
3942 | ngroups = ext4_get_groups_count(sb); | 3890 | ngroups = ext4_get_groups_count(sb); |
3943 | for (i = 0; i < ngroups; i++) { | 3891 | for (i = 0; i < ngroups; i++) { |
3944 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | 3892 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
@@ -4428,9 +4376,9 @@ out: | |||
4428 | static int can_merge(struct ext4_free_data *entry1, | 4376 | static int can_merge(struct ext4_free_data *entry1, |
4429 | struct ext4_free_data *entry2) | 4377 | struct ext4_free_data *entry2) |
4430 | { | 4378 | { |
4431 | if ((entry1->t_tid == entry2->t_tid) && | 4379 | if ((entry1->efd_tid == entry2->efd_tid) && |
4432 | (entry1->group == entry2->group) && | 4380 | (entry1->efd_group == entry2->efd_group) && |
4433 | ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) | 4381 | ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) |
4434 | return 1; | 4382 | return 1; |
4435 | return 0; | 4383 | return 0; |
4436 | } | 4384 | } |
@@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4452 | BUG_ON(e4b->bd_bitmap_page == NULL); | 4400 | BUG_ON(e4b->bd_bitmap_page == NULL); |
4453 | BUG_ON(e4b->bd_buddy_page == NULL); | 4401 | BUG_ON(e4b->bd_buddy_page == NULL); |
4454 | 4402 | ||
4455 | new_node = &new_entry->node; | 4403 | new_node = &new_entry->efd_node; |
4456 | cluster = new_entry->start_cluster; | 4404 | cluster = new_entry->efd_start_cluster; |
4457 | 4405 | ||
4458 | if (!*n) { | 4406 | if (!*n) { |
4459 | /* first free block exent. We need to | 4407 | /* first free block exent. We need to |
@@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4466 | } | 4414 | } |
4467 | while (*n) { | 4415 | while (*n) { |
4468 | parent = *n; | 4416 | parent = *n; |
4469 | entry = rb_entry(parent, struct ext4_free_data, node); | 4417 | entry = rb_entry(parent, struct ext4_free_data, efd_node); |
4470 | if (cluster < entry->start_cluster) | 4418 | if (cluster < entry->efd_start_cluster) |
4471 | n = &(*n)->rb_left; | 4419 | n = &(*n)->rb_left; |
4472 | else if (cluster >= (entry->start_cluster + entry->count)) | 4420 | else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) |
4473 | n = &(*n)->rb_right; | 4421 | n = &(*n)->rb_right; |
4474 | else { | 4422 | else { |
4475 | ext4_grp_locked_error(sb, group, 0, | 4423 | ext4_grp_locked_error(sb, group, 0, |
@@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4486 | /* Now try to see the extent can be merged to left and right */ | 4434 | /* Now try to see the extent can be merged to left and right */ |
4487 | node = rb_prev(new_node); | 4435 | node = rb_prev(new_node); |
4488 | if (node) { | 4436 | if (node) { |
4489 | entry = rb_entry(node, struct ext4_free_data, node); | 4437 | entry = rb_entry(node, struct ext4_free_data, efd_node); |
4490 | if (can_merge(entry, new_entry)) { | 4438 | if (can_merge(entry, new_entry)) { |
4491 | new_entry->start_cluster = entry->start_cluster; | 4439 | new_entry->efd_start_cluster = entry->efd_start_cluster; |
4492 | new_entry->count += entry->count; | 4440 | new_entry->efd_count += entry->efd_count; |
4493 | rb_erase(node, &(db->bb_free_root)); | 4441 | rb_erase(node, &(db->bb_free_root)); |
4494 | spin_lock(&sbi->s_md_lock); | 4442 | ext4_journal_callback_del(handle, &entry->efd_jce); |
4495 | list_del(&entry->list); | 4443 | kmem_cache_free(ext4_free_data_cachep, entry); |
4496 | spin_unlock(&sbi->s_md_lock); | ||
4497 | kmem_cache_free(ext4_free_ext_cachep, entry); | ||
4498 | } | 4444 | } |
4499 | } | 4445 | } |
4500 | 4446 | ||
4501 | node = rb_next(new_node); | 4447 | node = rb_next(new_node); |
4502 | if (node) { | 4448 | if (node) { |
4503 | entry = rb_entry(node, struct ext4_free_data, node); | 4449 | entry = rb_entry(node, struct ext4_free_data, efd_node); |
4504 | if (can_merge(new_entry, entry)) { | 4450 | if (can_merge(new_entry, entry)) { |
4505 | new_entry->count += entry->count; | 4451 | new_entry->efd_count += entry->efd_count; |
4506 | rb_erase(node, &(db->bb_free_root)); | 4452 | rb_erase(node, &(db->bb_free_root)); |
4507 | spin_lock(&sbi->s_md_lock); | 4453 | ext4_journal_callback_del(handle, &entry->efd_jce); |
4508 | list_del(&entry->list); | 4454 | kmem_cache_free(ext4_free_data_cachep, entry); |
4509 | spin_unlock(&sbi->s_md_lock); | ||
4510 | kmem_cache_free(ext4_free_ext_cachep, entry); | ||
4511 | } | 4455 | } |
4512 | } | 4456 | } |
4513 | /* Add the extent to transaction's private list */ | 4457 | /* Add the extent to transaction's private list */ |
4514 | spin_lock(&sbi->s_md_lock); | 4458 | ext4_journal_callback_add(handle, ext4_free_data_callback, |
4515 | list_add(&new_entry->list, &handle->h_transaction->t_private_list); | 4459 | &new_entry->efd_jce); |
4516 | spin_unlock(&sbi->s_md_lock); | ||
4517 | return 0; | 4460 | return 0; |
4518 | } | 4461 | } |
4519 | 4462 | ||
@@ -4691,15 +4634,15 @@ do_more: | |||
4691 | * blocks being freed are metadata. these blocks shouldn't | 4634 | * blocks being freed are metadata. these blocks shouldn't |
4692 | * be used until this transaction is committed | 4635 | * be used until this transaction is committed |
4693 | */ | 4636 | */ |
4694 | new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); | 4637 | new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); |
4695 | if (!new_entry) { | 4638 | if (!new_entry) { |
4696 | err = -ENOMEM; | 4639 | err = -ENOMEM; |
4697 | goto error_return; | 4640 | goto error_return; |
4698 | } | 4641 | } |
4699 | new_entry->start_cluster = bit; | 4642 | new_entry->efd_start_cluster = bit; |
4700 | new_entry->group = block_group; | 4643 | new_entry->efd_group = block_group; |
4701 | new_entry->count = count_clusters; | 4644 | new_entry->efd_count = count_clusters; |
4702 | new_entry->t_tid = handle->h_transaction->t_tid; | 4645 | new_entry->efd_tid = handle->h_transaction->t_tid; |
4703 | 4646 | ||
4704 | ext4_lock_group(sb, block_group); | 4647 | ext4_lock_group(sb, block_group); |
4705 | mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); | 4648 | mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); |
@@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4971 | start = (e4b.bd_info->bb_first_free > start) ? | 4914 | start = (e4b.bd_info->bb_first_free > start) ? |
4972 | e4b.bd_info->bb_first_free : start; | 4915 | e4b.bd_info->bb_first_free : start; |
4973 | 4916 | ||
4974 | while (start < max) { | 4917 | while (start <= max) { |
4975 | start = mb_find_next_zero_bit(bitmap, max, start); | 4918 | start = mb_find_next_zero_bit(bitmap, max + 1, start); |
4976 | if (start >= max) | 4919 | if (start > max) |
4977 | break; | 4920 | break; |
4978 | next = mb_find_next_bit(bitmap, max, start); | 4921 | next = mb_find_next_bit(bitmap, max + 1, start); |
4979 | 4922 | ||
4980 | if ((next - start) >= minblocks) { | 4923 | if ((next - start) >= minblocks) { |
4981 | ext4_trim_extent(sb, start, | 4924 | ext4_trim_extent(sb, start, |
@@ -5027,37 +4970,36 @@ out: | |||
5027 | int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | 4970 | int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) |
5028 | { | 4971 | { |
5029 | struct ext4_group_info *grp; | 4972 | struct ext4_group_info *grp; |
5030 | ext4_group_t first_group, last_group; | 4973 | ext4_group_t group, first_group, last_group; |
5031 | ext4_group_t group, ngroups = ext4_get_groups_count(sb); | ||
5032 | ext4_grpblk_t cnt = 0, first_cluster, last_cluster; | 4974 | ext4_grpblk_t cnt = 0, first_cluster, last_cluster; |
5033 | uint64_t start, len, minlen, trimmed = 0; | 4975 | uint64_t start, end, minlen, trimmed = 0; |
5034 | ext4_fsblk_t first_data_blk = | 4976 | ext4_fsblk_t first_data_blk = |
5035 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 4977 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
4978 | ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); | ||
5036 | int ret = 0; | 4979 | int ret = 0; |
5037 | 4980 | ||
5038 | start = range->start >> sb->s_blocksize_bits; | 4981 | start = range->start >> sb->s_blocksize_bits; |
5039 | len = range->len >> sb->s_blocksize_bits; | 4982 | end = start + (range->len >> sb->s_blocksize_bits) - 1; |
5040 | minlen = range->minlen >> sb->s_blocksize_bits; | 4983 | minlen = range->minlen >> sb->s_blocksize_bits; |
5041 | 4984 | ||
5042 | if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) | 4985 | if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || |
4986 | unlikely(start >= max_blks)) | ||
5043 | return -EINVAL; | 4987 | return -EINVAL; |
5044 | if (start + len <= first_data_blk) | 4988 | if (end >= max_blks) |
4989 | end = max_blks - 1; | ||
4990 | if (end <= first_data_blk) | ||
5045 | goto out; | 4991 | goto out; |
5046 | if (start < first_data_blk) { | 4992 | if (start < first_data_blk) |
5047 | len -= first_data_blk - start; | ||
5048 | start = first_data_blk; | 4993 | start = first_data_blk; |
5049 | } | ||
5050 | 4994 | ||
5051 | /* Determine first and last group to examine based on start and len */ | 4995 | /* Determine first and last group to examine based on start and end */ |
5052 | ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, | 4996 | ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, |
5053 | &first_group, &first_cluster); | 4997 | &first_group, &first_cluster); |
5054 | ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), | 4998 | ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, |
5055 | &last_group, &last_cluster); | 4999 | &last_group, &last_cluster); |
5056 | last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; | ||
5057 | last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); | ||
5058 | 5000 | ||
5059 | if (first_group > last_group) | 5001 | /* end now represents the last cluster to discard in this group */ |
5060 | return -EINVAL; | 5002 | end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; |
5061 | 5003 | ||
5062 | for (group = first_group; group <= last_group; group++) { | 5004 | for (group = first_group; group <= last_group; group++) { |
5063 | grp = ext4_get_group_info(sb, group); | 5005 | grp = ext4_get_group_info(sb, group); |
@@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
5069 | } | 5011 | } |
5070 | 5012 | ||
5071 | /* | 5013 | /* |
5072 | * For all the groups except the last one, last block will | 5014 | * For all the groups except the last one, last cluster will |
5073 | * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to | 5015 | * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to |
5074 | * change it for the last group in which case start + | 5016 | * change it for the last group, note that last_cluster is |
5075 | * len < EXT4_BLOCKS_PER_GROUP(sb). | 5017 | * already computed earlier by ext4_get_group_no_and_offset() |
5076 | */ | 5018 | */ |
5077 | if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) | 5019 | if (group == last_group) |
5078 | last_cluster = first_cluster + len; | 5020 | end = last_cluster; |
5079 | len -= last_cluster - first_cluster; | ||
5080 | 5021 | ||
5081 | if (grp->bb_free >= minlen) { | 5022 | if (grp->bb_free >= minlen) { |
5082 | cnt = ext4_trim_all_free(sb, group, first_cluster, | 5023 | cnt = ext4_trim_all_free(sb, group, first_cluster, |
5083 | last_cluster, minlen); | 5024 | end, minlen); |
5084 | if (cnt < 0) { | 5025 | if (cnt < 0) { |
5085 | ret = cnt; | 5026 | ret = cnt; |
5086 | break; | 5027 | break; |
5087 | } | 5028 | } |
5029 | trimmed += cnt; | ||
5088 | } | 5030 | } |
5089 | trimmed += cnt; | 5031 | |
5032 | /* | ||
5033 | * For every group except the first one, we are sure | ||
5034 | * that the first cluster to discard will be cluster #0. | ||
5035 | */ | ||
5090 | first_cluster = 0; | 5036 | first_cluster = 0; |
5091 | } | 5037 | } |
5092 | range->len = trimmed * sb->s_blocksize; | ||
5093 | 5038 | ||
5094 | if (!ret) | 5039 | if (!ret) |
5095 | atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); | 5040 | atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); |
5096 | 5041 | ||
5097 | out: | 5042 | out: |
5043 | range->len = trimmed * sb->s_blocksize; | ||
5098 | return ret; | 5044 | return ret; |
5099 | } | 5045 | } |
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3285e3..c070618c21ce 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug; | |||
96 | 96 | ||
97 | 97 | ||
98 | struct ext4_free_data { | 98 | struct ext4_free_data { |
99 | /* this links the free block information from group_info */ | 99 | /* MUST be the first member */ |
100 | struct rb_node node; | 100 | struct ext4_journal_cb_entry efd_jce; |
101 | |||
102 | /* ext4_free_data private data starts from here */ | ||
101 | 103 | ||
102 | /* this links the free block information from ext4_sb_info */ | 104 | /* this links the free block information from group_info */ |
103 | struct list_head list; | 105 | struct rb_node efd_node; |
104 | 106 | ||
105 | /* group which free block extent belongs */ | 107 | /* group which free block extent belongs */ |
106 | ext4_group_t group; | 108 | ext4_group_t efd_group; |
107 | 109 | ||
108 | /* free block extent */ | 110 | /* free block extent */ |
109 | ext4_grpblk_t start_cluster; | 111 | ext4_grpblk_t efd_start_cluster; |
110 | ext4_grpblk_t count; | 112 | ext4_grpblk_t efd_count; |
111 | 113 | ||
112 | /* transaction which freed this extent */ | 114 | /* transaction which freed this extent */ |
113 | tid_t t_tid; | 115 | tid_t efd_tid; |
114 | }; | 116 | }; |
115 | 117 | ||
116 | struct ext4_prealloc_space { | 118 | struct ext4_prealloc_space { |
@@ -210,8 +212,6 @@ struct ext4_buddy { | |||
210 | __u16 bd_blkbits; | 212 | __u16 bd_blkbits; |
211 | ext4_group_t bd_group; | 213 | ext4_group_t bd_group; |
212 | }; | 214 | }; |
213 | #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) | ||
214 | #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) | ||
215 | 215 | ||
216 | static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | 216 | static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, |
217 | struct ext4_free_extent *fex) | 217 | struct ext4_free_extent *fex) |
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7d6bb0acfa6..f39f80f8f2c5 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode) | |||
471 | tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, | 471 | tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, |
472 | S_IFREG, NULL, goal, owner); | 472 | S_IFREG, NULL, goal, owner); |
473 | if (IS_ERR(tmp_inode)) { | 473 | if (IS_ERR(tmp_inode)) { |
474 | retval = PTR_ERR(inode); | 474 | retval = PTR_ERR(tmp_inode); |
475 | ext4_journal_stop(handle); | 475 | ext4_journal_stop(handle); |
476 | return retval; | 476 | return retval; |
477 | } | 477 | } |
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 7ea4ba4eff2a..ed6548d89165 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c | |||
@@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb, | |||
257 | * If check_interval in MMP block is larger, use that instead of | 257 | * If check_interval in MMP block is larger, use that instead of |
258 | * update_interval from the superblock. | 258 | * update_interval from the superblock. |
259 | */ | 259 | */ |
260 | if (mmp->mmp_check_interval > mmp_check_interval) | 260 | if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) |
261 | mmp_check_interval = mmp->mmp_check_interval; | 261 | mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); |
262 | 262 | ||
263 | seq = le32_to_cpu(mmp->mmp_seq); | 263 | seq = le32_to_cpu(mmp->mmp_seq); |
264 | if (seq == EXT4_MMP_SEQ_CLEAN) | 264 | if (seq == EXT4_MMP_SEQ_CLEAN) |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2043f482375d..349d7b3671c8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -468,7 +468,7 @@ fail2: | |||
468 | fail: | 468 | fail: |
469 | if (*err == ERR_BAD_DX_DIR) | 469 | if (*err == ERR_BAD_DX_DIR) |
470 | ext4_warning(dir->i_sb, | 470 | ext4_warning(dir->i_sb, |
471 | "Corrupt dir inode %ld, running e2fsck is " | 471 | "Corrupt dir inode %lu, running e2fsck is " |
472 | "recommended.", dir->i_ino); | 472 | "recommended.", dir->i_ino); |
473 | return NULL; | 473 | return NULL; |
474 | } | 474 | } |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 475851896518..74cd1f7f1f88 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode) | |||
60 | static void put_io_page(struct ext4_io_page *io_page) | 60 | static void put_io_page(struct ext4_io_page *io_page) |
61 | { | 61 | { |
62 | if (atomic_dec_and_test(&io_page->p_count)) { | 62 | if (atomic_dec_and_test(&io_page->p_count)) { |
63 | end_page_writeback(io_page->p_page); | ||
64 | put_page(io_page->p_page); | 63 | put_page(io_page->p_page); |
65 | kmem_cache_free(io_page_cachep, io_page); | 64 | kmem_cache_free(io_page_cachep, io_page); |
66 | } | 65 | } |
@@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) | |||
110 | if (io->iocb) | 109 | if (io->iocb) |
111 | aio_complete(io->iocb, io->result, 0); | 110 | aio_complete(io->iocb, io->result, 0); |
112 | 111 | ||
112 | if (io->flag & EXT4_IO_END_DIRECT) | ||
113 | inode_dio_done(inode); | ||
113 | /* Wake up anyone waiting on unwritten extent conversion */ | 114 | /* Wake up anyone waiting on unwritten extent conversion */ |
114 | if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) | 115 | if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) |
115 | wake_up_all(ext4_ioend_wq(io->inode)); | 116 | wake_up_all(ext4_ioend_wq(io->inode)); |
@@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work) | |||
127 | unsigned long flags; | 128 | unsigned long flags; |
128 | 129 | ||
129 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 130 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
131 | if (io->flag & EXT4_IO_END_IN_FSYNC) | ||
132 | goto requeue; | ||
130 | if (list_empty(&io->list)) { | 133 | if (list_empty(&io->list)) { |
131 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 134 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
132 | goto free; | 135 | goto free; |
133 | } | 136 | } |
134 | 137 | ||
135 | if (!mutex_trylock(&inode->i_mutex)) { | 138 | if (!mutex_trylock(&inode->i_mutex)) { |
139 | bool was_queued; | ||
140 | requeue: | ||
141 | was_queued = !!(io->flag & EXT4_IO_END_QUEUED); | ||
142 | io->flag |= EXT4_IO_END_QUEUED; | ||
136 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 143 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
137 | /* | 144 | /* |
138 | * Requeue the work instead of waiting so that the work | 145 | * Requeue the work instead of waiting so that the work |
@@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work) | |||
145 | * yield the cpu if it sees an end_io request that has already | 152 | * yield the cpu if it sees an end_io request that has already |
146 | * been requeued. | 153 | * been requeued. |
147 | */ | 154 | */ |
148 | if (io->flag & EXT4_IO_END_QUEUED) | 155 | if (was_queued) |
149 | yield(); | 156 | yield(); |
150 | io->flag |= EXT4_IO_END_QUEUED; | ||
151 | return; | 157 | return; |
152 | } | 158 | } |
153 | list_del_init(&io->list); | 159 | list_del_init(&io->list); |
@@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
227 | } while (bh != head); | 233 | } while (bh != head); |
228 | } | 234 | } |
229 | 235 | ||
230 | put_io_page(io_end->pages[i]); | 236 | if (atomic_read(&io_end->pages[i]->p_count) == 1) |
237 | end_page_writeback(io_end->pages[i]->p_page); | ||
231 | } | 238 | } |
232 | io_end->num_io_pages = 0; | ||
233 | inode = io_end->inode; | 239 | inode = io_end->inode; |
234 | 240 | ||
235 | if (error) { | 241 | if (error) { |
@@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
421 | * PageWriteback bit from the page to prevent the system from | 427 | * PageWriteback bit from the page to prevent the system from |
422 | * wedging later on. | 428 | * wedging later on. |
423 | */ | 429 | */ |
430 | if (atomic_read(&io_page->p_count) == 1) | ||
431 | end_page_writeback(page); | ||
424 | put_io_page(io_page); | 432 | put_io_page(io_page); |
425 | return ret; | 433 | return ret; |
426 | } | 434 | } |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f9d948f0eb86..59fa0be27251 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb, | |||
1163 | do_div(reserved_blocks, 100); | 1163 | do_div(reserved_blocks, 100); |
1164 | 1164 | ||
1165 | ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); | 1165 | ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); |
1166 | ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); | ||
1166 | le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * | 1167 | le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * |
1167 | flex_gd->count); | 1168 | flex_gd->count); |
1169 | le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * | ||
1170 | flex_gd->count); | ||
1168 | 1171 | ||
1169 | /* | 1172 | /* |
1170 | * We need to protect s_groups_count against other CPUs seeing | 1173 | * We need to protect s_groups_count against other CPUs seeing |
@@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, | |||
1465 | } | 1468 | } |
1466 | 1469 | ||
1467 | ext4_blocks_count_set(es, o_blocks_count + add); | 1470 | ext4_blocks_count_set(es, o_blocks_count + add); |
1471 | ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); | ||
1468 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | 1472 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
1469 | o_blocks_count + add); | 1473 | o_blocks_count + add); |
1470 | /* We add the blocks to the bitmap and set the group need init bit */ | 1474 | /* We add the blocks to the bitmap and set the group need init bit */ |
@@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1512 | o_blocks_count = ext4_blocks_count(es); | 1516 | o_blocks_count = ext4_blocks_count(es); |
1513 | 1517 | ||
1514 | if (test_opt(sb, DEBUG)) | 1518 | if (test_opt(sb, DEBUG)) |
1515 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", | 1519 | ext4_msg(sb, KERN_DEBUG, |
1516 | o_blocks_count, n_blocks_count); | 1520 | "extending last group from %llu to %llu blocks", |
1521 | o_blocks_count, n_blocks_count); | ||
1517 | 1522 | ||
1518 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) | 1523 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) |
1519 | return 0; | 1524 | return 0; |
1520 | 1525 | ||
1521 | if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { | 1526 | if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { |
1522 | printk(KERN_ERR "EXT4-fs: filesystem on %s:" | 1527 | ext4_msg(sb, KERN_ERR, |
1523 | " too large to resize to %llu blocks safely\n", | 1528 | "filesystem too large to resize to %llu blocks safely", |
1524 | sb->s_id, n_blocks_count); | 1529 | n_blocks_count); |
1525 | if (sizeof(sector_t) < 8) | 1530 | if (sizeof(sector_t) < 8) |
1526 | ext4_warning(sb, "CONFIG_LBDAF not enabled"); | 1531 | ext4_warning(sb, "CONFIG_LBDAF not enabled"); |
1527 | return -EINVAL; | 1532 | return -EINVAL; |
@@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) | |||
1582 | ext4_fsblk_t o_blocks_count; | 1587 | ext4_fsblk_t o_blocks_count; |
1583 | ext4_group_t o_group; | 1588 | ext4_group_t o_group; |
1584 | ext4_group_t n_group; | 1589 | ext4_group_t n_group; |
1585 | ext4_grpblk_t offset; | 1590 | ext4_grpblk_t offset, add; |
1586 | unsigned long n_desc_blocks; | 1591 | unsigned long n_desc_blocks; |
1587 | unsigned long o_desc_blocks; | 1592 | unsigned long o_desc_blocks; |
1588 | unsigned long desc_blocks; | 1593 | unsigned long desc_blocks; |
@@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) | |||
1591 | o_blocks_count = ext4_blocks_count(es); | 1596 | o_blocks_count = ext4_blocks_count(es); |
1592 | 1597 | ||
1593 | if (test_opt(sb, DEBUG)) | 1598 | if (test_opt(sb, DEBUG)) |
1594 | printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " | 1599 | ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " |
1595 | "upto %llu blocks\n", o_blocks_count, n_blocks_count); | 1600 | "to %llu blocks", o_blocks_count, n_blocks_count); |
1596 | 1601 | ||
1597 | if (n_blocks_count < o_blocks_count) { | 1602 | if (n_blocks_count < o_blocks_count) { |
1598 | /* On-line shrinking not supported */ | 1603 | /* On-line shrinking not supported */ |
@@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) | |||
1605 | return 0; | 1610 | return 0; |
1606 | 1611 | ||
1607 | ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); | 1612 | ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); |
1608 | ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); | 1613 | ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); |
1609 | 1614 | ||
1610 | n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / | 1615 | n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / |
1611 | EXT4_DESC_PER_BLOCK(sb); | 1616 | EXT4_DESC_PER_BLOCK(sb); |
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) | |||
1634 | } | 1639 | } |
1635 | brelse(bh); | 1640 | brelse(bh); |
1636 | 1641 | ||
1637 | if (offset != 0) { | 1642 | /* extend the last group */ |
1638 | /* extend the last group */ | 1643 | if (n_group == o_group) |
1639 | ext4_grpblk_t add; | 1644 | add = n_blocks_count - o_blocks_count; |
1640 | add = EXT4_BLOCKS_PER_GROUP(sb) - offset; | 1645 | else |
1646 | add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); | ||
1647 | if (add > 0) { | ||
1641 | err = ext4_group_extend_no_check(sb, o_blocks_count, add); | 1648 | err = ext4_group_extend_no_check(sb, o_blocks_count, add); |
1642 | if (err) | 1649 | if (err) |
1643 | goto out; | 1650 | goto out; |
@@ -1674,7 +1681,7 @@ out: | |||
1674 | 1681 | ||
1675 | iput(resize_inode); | 1682 | iput(resize_inode); |
1676 | if (test_opt(sb, DEBUG)) | 1683 | if (test_opt(sb, DEBUG)) |
1677 | printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " | 1684 | ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " |
1678 | "upto %llu blocks\n", o_blocks_count, n_blocks_count); | 1685 | "upto %llu blocks", o_blocks_count, n_blocks_count); |
1679 | return err; | 1686 | return err; |
1680 | } | 1687 | } |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 933900909ed0..ceebaf853beb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat; | |||
62 | 62 | ||
63 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | 63 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, |
64 | unsigned long journal_devnum); | 64 | unsigned long journal_devnum); |
65 | static int ext4_show_options(struct seq_file *seq, struct dentry *root); | ||
65 | static int ext4_commit_super(struct super_block *sb, int sync); | 66 | static int ext4_commit_super(struct super_block *sb, int sync); |
66 | static void ext4_mark_recovery_complete(struct super_block *sb, | 67 | static void ext4_mark_recovery_complete(struct super_block *sb, |
67 | struct ext4_super_block *es); | 68 | struct ext4_super_block *es); |
@@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, | |||
375 | if (is_handle_aborted(handle)) | 376 | if (is_handle_aborted(handle)) |
376 | return; | 377 | return; |
377 | 378 | ||
378 | printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", | 379 | printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", |
379 | caller, line, errstr, err_fn); | 380 | caller, line, errstr, err_fn); |
380 | 381 | ||
381 | jbd2_journal_abort_handle(handle); | 382 | jbd2_journal_abort_handle(handle); |
@@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb) | |||
431 | return bdi->dev == NULL; | 432 | return bdi->dev == NULL; |
432 | } | 433 | } |
433 | 434 | ||
435 | static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) | ||
436 | { | ||
437 | struct super_block *sb = journal->j_private; | ||
438 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
439 | int error = is_journal_aborted(journal); | ||
440 | struct ext4_journal_cb_entry *jce, *tmp; | ||
441 | |||
442 | spin_lock(&sbi->s_md_lock); | ||
443 | list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { | ||
444 | list_del_init(&jce->jce_list); | ||
445 | spin_unlock(&sbi->s_md_lock); | ||
446 | jce->jce_func(sb, jce, error); | ||
447 | spin_lock(&sbi->s_md_lock); | ||
448 | } | ||
449 | spin_unlock(&sbi->s_md_lock); | ||
450 | } | ||
434 | 451 | ||
435 | /* Deal with the reporting of failure conditions on a filesystem such as | 452 | /* Deal with the reporting of failure conditions on a filesystem such as |
436 | * inconsistencies detected or read IO failures. | 453 | * inconsistencies detected or read IO failures. |
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function, | |||
498 | va_start(args, fmt); | 515 | va_start(args, fmt); |
499 | vaf.fmt = fmt; | 516 | vaf.fmt = fmt; |
500 | vaf.va = &args; | 517 | vaf.va = &args; |
501 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", | ||
502 | inode->i_sb->s_id, function, line, inode->i_ino); | ||
503 | if (block) | 518 | if (block) |
504 | printk(KERN_CONT "block %llu: ", block); | 519 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " |
505 | printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); | 520 | "inode #%lu: block %llu: comm %s: %pV\n", |
521 | inode->i_sb->s_id, function, line, inode->i_ino, | ||
522 | block, current->comm, &vaf); | ||
523 | else | ||
524 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " | ||
525 | "inode #%lu: comm %s: %pV\n", | ||
526 | inode->i_sb->s_id, function, line, inode->i_ino, | ||
527 | current->comm, &vaf); | ||
506 | va_end(args); | 528 | va_end(args); |
507 | 529 | ||
508 | ext4_handle_error(inode->i_sb); | 530 | ext4_handle_error(inode->i_sb); |
@@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function, | |||
524 | path = d_path(&(file->f_path), pathname, sizeof(pathname)); | 546 | path = d_path(&(file->f_path), pathname, sizeof(pathname)); |
525 | if (IS_ERR(path)) | 547 | if (IS_ERR(path)) |
526 | path = "(unknown)"; | 548 | path = "(unknown)"; |
527 | printk(KERN_CRIT | ||
528 | "EXT4-fs error (device %s): %s:%d: inode #%lu: ", | ||
529 | inode->i_sb->s_id, function, line, inode->i_ino); | ||
530 | if (block) | ||
531 | printk(KERN_CONT "block %llu: ", block); | ||
532 | va_start(args, fmt); | 549 | va_start(args, fmt); |
533 | vaf.fmt = fmt; | 550 | vaf.fmt = fmt; |
534 | vaf.va = &args; | 551 | vaf.va = &args; |
535 | printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); | 552 | if (block) |
553 | printk(KERN_CRIT | ||
554 | "EXT4-fs error (device %s): %s:%d: inode #%lu: " | ||
555 | "block %llu: comm %s: path %s: %pV\n", | ||
556 | inode->i_sb->s_id, function, line, inode->i_ino, | ||
557 | block, current->comm, path, &vaf); | ||
558 | else | ||
559 | printk(KERN_CRIT | ||
560 | "EXT4-fs error (device %s): %s:%d: inode #%lu: " | ||
561 | "comm %s: path %s: %pV\n", | ||
562 | inode->i_sb->s_id, function, line, inode->i_ino, | ||
563 | current->comm, path, &vaf); | ||
536 | va_end(args); | 564 | va_end(args); |
537 | 565 | ||
538 | ext4_handle_error(inode->i_sb); | 566 | ext4_handle_error(inode->i_sb); |
@@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb) | |||
808 | destroy_workqueue(sbi->dio_unwritten_wq); | 836 | destroy_workqueue(sbi->dio_unwritten_wq); |
809 | 837 | ||
810 | lock_super(sb); | 838 | lock_super(sb); |
811 | if (sb->s_dirt) | ||
812 | ext4_commit_super(sb, 1); | ||
813 | |||
814 | if (sbi->s_journal) { | 839 | if (sbi->s_journal) { |
815 | err = jbd2_journal_destroy(sbi->s_journal); | 840 | err = jbd2_journal_destroy(sbi->s_journal); |
816 | sbi->s_journal = NULL; | 841 | sbi->s_journal = NULL; |
@@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb) | |||
827 | if (!(sb->s_flags & MS_RDONLY)) { | 852 | if (!(sb->s_flags & MS_RDONLY)) { |
828 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 853 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
829 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 854 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
830 | ext4_commit_super(sb, 1); | ||
831 | } | 855 | } |
856 | if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) | ||
857 | ext4_commit_super(sb, 1); | ||
858 | |||
832 | if (sbi->s_proc) { | 859 | if (sbi->s_proc) { |
860 | remove_proc_entry("options", sbi->s_proc); | ||
833 | remove_proc_entry(sb->s_id, ext4_proc_root); | 861 | remove_proc_entry(sb->s_id, ext4_proc_root); |
834 | } | 862 | } |
835 | kobject_del(&sbi->s_kobj); | 863 | kobject_del(&sbi->s_kobj); |
@@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode) | |||
990 | } | 1018 | } |
991 | } | 1019 | } |
992 | 1020 | ||
993 | static inline void ext4_show_quota_options(struct seq_file *seq, | ||
994 | struct super_block *sb) | ||
995 | { | ||
996 | #if defined(CONFIG_QUOTA) | ||
997 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
998 | |||
999 | if (sbi->s_jquota_fmt) { | ||
1000 | char *fmtname = ""; | ||
1001 | |||
1002 | switch (sbi->s_jquota_fmt) { | ||
1003 | case QFMT_VFS_OLD: | ||
1004 | fmtname = "vfsold"; | ||
1005 | break; | ||
1006 | case QFMT_VFS_V0: | ||
1007 | fmtname = "vfsv0"; | ||
1008 | break; | ||
1009 | case QFMT_VFS_V1: | ||
1010 | fmtname = "vfsv1"; | ||
1011 | break; | ||
1012 | } | ||
1013 | seq_printf(seq, ",jqfmt=%s", fmtname); | ||
1014 | } | ||
1015 | |||
1016 | if (sbi->s_qf_names[USRQUOTA]) | ||
1017 | seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); | ||
1018 | |||
1019 | if (sbi->s_qf_names[GRPQUOTA]) | ||
1020 | seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); | ||
1021 | |||
1022 | if (test_opt(sb, USRQUOTA)) | ||
1023 | seq_puts(seq, ",usrquota"); | ||
1024 | |||
1025 | if (test_opt(sb, GRPQUOTA)) | ||
1026 | seq_puts(seq, ",grpquota"); | ||
1027 | #endif | ||
1028 | } | ||
1029 | |||
1030 | /* | ||
1031 | * Show an option if | ||
1032 | * - it's set to a non-default value OR | ||
1033 | * - if the per-sb default is different from the global default | ||
1034 | */ | ||
1035 | static int ext4_show_options(struct seq_file *seq, struct dentry *root) | ||
1036 | { | ||
1037 | int def_errors; | ||
1038 | unsigned long def_mount_opts; | ||
1039 | struct super_block *sb = root->d_sb; | ||
1040 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1041 | struct ext4_super_block *es = sbi->s_es; | ||
1042 | |||
1043 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); | ||
1044 | def_errors = le16_to_cpu(es->s_errors); | ||
1045 | |||
1046 | if (sbi->s_sb_block != 1) | ||
1047 | seq_printf(seq, ",sb=%llu", sbi->s_sb_block); | ||
1048 | if (test_opt(sb, MINIX_DF)) | ||
1049 | seq_puts(seq, ",minixdf"); | ||
1050 | if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) | ||
1051 | seq_puts(seq, ",grpid"); | ||
1052 | if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) | ||
1053 | seq_puts(seq, ",nogrpid"); | ||
1054 | if (sbi->s_resuid != EXT4_DEF_RESUID || | ||
1055 | le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { | ||
1056 | seq_printf(seq, ",resuid=%u", sbi->s_resuid); | ||
1057 | } | ||
1058 | if (sbi->s_resgid != EXT4_DEF_RESGID || | ||
1059 | le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { | ||
1060 | seq_printf(seq, ",resgid=%u", sbi->s_resgid); | ||
1061 | } | ||
1062 | if (test_opt(sb, ERRORS_RO)) { | ||
1063 | if (def_errors == EXT4_ERRORS_PANIC || | ||
1064 | def_errors == EXT4_ERRORS_CONTINUE) { | ||
1065 | seq_puts(seq, ",errors=remount-ro"); | ||
1066 | } | ||
1067 | } | ||
1068 | if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) | ||
1069 | seq_puts(seq, ",errors=continue"); | ||
1070 | if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) | ||
1071 | seq_puts(seq, ",errors=panic"); | ||
1072 | if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) | ||
1073 | seq_puts(seq, ",nouid32"); | ||
1074 | if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) | ||
1075 | seq_puts(seq, ",debug"); | ||
1076 | #ifdef CONFIG_EXT4_FS_XATTR | ||
1077 | if (test_opt(sb, XATTR_USER)) | ||
1078 | seq_puts(seq, ",user_xattr"); | ||
1079 | if (!test_opt(sb, XATTR_USER)) | ||
1080 | seq_puts(seq, ",nouser_xattr"); | ||
1081 | #endif | ||
1082 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | ||
1083 | if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) | ||
1084 | seq_puts(seq, ",acl"); | ||
1085 | if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) | ||
1086 | seq_puts(seq, ",noacl"); | ||
1087 | #endif | ||
1088 | if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { | ||
1089 | seq_printf(seq, ",commit=%u", | ||
1090 | (unsigned) (sbi->s_commit_interval / HZ)); | ||
1091 | } | ||
1092 | if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { | ||
1093 | seq_printf(seq, ",min_batch_time=%u", | ||
1094 | (unsigned) sbi->s_min_batch_time); | ||
1095 | } | ||
1096 | if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { | ||
1097 | seq_printf(seq, ",max_batch_time=%u", | ||
1098 | (unsigned) sbi->s_max_batch_time); | ||
1099 | } | ||
1100 | |||
1101 | /* | ||
1102 | * We're changing the default of barrier mount option, so | ||
1103 | * let's always display its mount state so it's clear what its | ||
1104 | * status is. | ||
1105 | */ | ||
1106 | seq_puts(seq, ",barrier="); | ||
1107 | seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); | ||
1108 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) | ||
1109 | seq_puts(seq, ",journal_async_commit"); | ||
1110 | else if (test_opt(sb, JOURNAL_CHECKSUM)) | ||
1111 | seq_puts(seq, ",journal_checksum"); | ||
1112 | if (test_opt(sb, I_VERSION)) | ||
1113 | seq_puts(seq, ",i_version"); | ||
1114 | if (!test_opt(sb, DELALLOC) && | ||
1115 | !(def_mount_opts & EXT4_DEFM_NODELALLOC)) | ||
1116 | seq_puts(seq, ",nodelalloc"); | ||
1117 | |||
1118 | if (!test_opt(sb, MBLK_IO_SUBMIT)) | ||
1119 | seq_puts(seq, ",nomblk_io_submit"); | ||
1120 | if (sbi->s_stripe) | ||
1121 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | ||
1122 | /* | ||
1123 | * journal mode get enabled in different ways | ||
1124 | * So just print the value even if we didn't specify it | ||
1125 | */ | ||
1126 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | ||
1127 | seq_puts(seq, ",data=journal"); | ||
1128 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | ||
1129 | seq_puts(seq, ",data=ordered"); | ||
1130 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | ||
1131 | seq_puts(seq, ",data=writeback"); | ||
1132 | |||
1133 | if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) | ||
1134 | seq_printf(seq, ",inode_readahead_blks=%u", | ||
1135 | sbi->s_inode_readahead_blks); | ||
1136 | |||
1137 | if (test_opt(sb, DATA_ERR_ABORT)) | ||
1138 | seq_puts(seq, ",data_err=abort"); | ||
1139 | |||
1140 | if (test_opt(sb, NO_AUTO_DA_ALLOC)) | ||
1141 | seq_puts(seq, ",noauto_da_alloc"); | ||
1142 | |||
1143 | if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) | ||
1144 | seq_puts(seq, ",discard"); | ||
1145 | |||
1146 | if (test_opt(sb, NOLOAD)) | ||
1147 | seq_puts(seq, ",norecovery"); | ||
1148 | |||
1149 | if (test_opt(sb, DIOREAD_NOLOCK)) | ||
1150 | seq_puts(seq, ",dioread_nolock"); | ||
1151 | |||
1152 | if (test_opt(sb, BLOCK_VALIDITY) && | ||
1153 | !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) | ||
1154 | seq_puts(seq, ",block_validity"); | ||
1155 | |||
1156 | if (!test_opt(sb, INIT_INODE_TABLE)) | ||
1157 | seq_puts(seq, ",noinit_itable"); | ||
1158 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) | ||
1159 | seq_printf(seq, ",init_itable=%u", | ||
1160 | (unsigned) sbi->s_li_wait_mult); | ||
1161 | |||
1162 | ext4_show_quota_options(seq, sb); | ||
1163 | |||
1164 | return 0; | ||
1165 | } | ||
1166 | |||
1167 | static struct inode *ext4_nfs_get_inode(struct super_block *sb, | 1021 | static struct inode *ext4_nfs_get_inode(struct super_block *sb, |
1168 | u64 ino, u32 generation) | 1022 | u64 ino, u32 generation) |
1169 | { | 1023 | { |
@@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = { | |||
1316 | enum { | 1170 | enum { |
1317 | Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, | 1171 | Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, |
1318 | Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, | 1172 | Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, |
1319 | Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, | 1173 | Opt_nouid32, Opt_debug, Opt_removed, |
1320 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, | 1174 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, |
1321 | Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, | 1175 | Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, |
1322 | Opt_commit, Opt_min_batch_time, Opt_max_batch_time, | 1176 | Opt_commit, Opt_min_batch_time, Opt_max_batch_time, |
1323 | Opt_journal_update, Opt_journal_dev, | 1177 | Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, |
1324 | Opt_journal_checksum, Opt_journal_async_commit, | ||
1325 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, | 1178 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, |
1326 | Opt_data_err_abort, Opt_data_err_ignore, | 1179 | Opt_data_err_abort, Opt_data_err_ignore, |
1327 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | 1180 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, |
1328 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, | 1181 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, |
1329 | Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, | 1182 | Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, |
1330 | Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, | 1183 | Opt_usrquota, Opt_grpquota, Opt_i_version, |
1331 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, | 1184 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, |
1332 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, | 1185 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, |
1333 | Opt_inode_readahead_blks, Opt_journal_ioprio, | 1186 | Opt_inode_readahead_blks, Opt_journal_ioprio, |
@@ -1350,20 +1203,19 @@ static const match_table_t tokens = { | |||
1350 | {Opt_err_ro, "errors=remount-ro"}, | 1203 | {Opt_err_ro, "errors=remount-ro"}, |
1351 | {Opt_nouid32, "nouid32"}, | 1204 | {Opt_nouid32, "nouid32"}, |
1352 | {Opt_debug, "debug"}, | 1205 | {Opt_debug, "debug"}, |
1353 | {Opt_oldalloc, "oldalloc"}, | 1206 | {Opt_removed, "oldalloc"}, |
1354 | {Opt_orlov, "orlov"}, | 1207 | {Opt_removed, "orlov"}, |
1355 | {Opt_user_xattr, "user_xattr"}, | 1208 | {Opt_user_xattr, "user_xattr"}, |
1356 | {Opt_nouser_xattr, "nouser_xattr"}, | 1209 | {Opt_nouser_xattr, "nouser_xattr"}, |
1357 | {Opt_acl, "acl"}, | 1210 | {Opt_acl, "acl"}, |
1358 | {Opt_noacl, "noacl"}, | 1211 | {Opt_noacl, "noacl"}, |
1359 | {Opt_noload, "noload"}, | ||
1360 | {Opt_noload, "norecovery"}, | 1212 | {Opt_noload, "norecovery"}, |
1361 | {Opt_nobh, "nobh"}, | 1213 | {Opt_noload, "noload"}, |
1362 | {Opt_bh, "bh"}, | 1214 | {Opt_removed, "nobh"}, |
1215 | {Opt_removed, "bh"}, | ||
1363 | {Opt_commit, "commit=%u"}, | 1216 | {Opt_commit, "commit=%u"}, |
1364 | {Opt_min_batch_time, "min_batch_time=%u"}, | 1217 | {Opt_min_batch_time, "min_batch_time=%u"}, |
1365 | {Opt_max_batch_time, "max_batch_time=%u"}, | 1218 | {Opt_max_batch_time, "max_batch_time=%u"}, |
1366 | {Opt_journal_update, "journal=update"}, | ||
1367 | {Opt_journal_dev, "journal_dev=%u"}, | 1219 | {Opt_journal_dev, "journal_dev=%u"}, |
1368 | {Opt_journal_checksum, "journal_checksum"}, | 1220 | {Opt_journal_checksum, "journal_checksum"}, |
1369 | {Opt_journal_async_commit, "journal_async_commit"}, | 1221 | {Opt_journal_async_commit, "journal_async_commit"}, |
@@ -1389,7 +1241,6 @@ static const match_table_t tokens = { | |||
1389 | {Opt_nobarrier, "nobarrier"}, | 1241 | {Opt_nobarrier, "nobarrier"}, |
1390 | {Opt_i_version, "i_version"}, | 1242 | {Opt_i_version, "i_version"}, |
1391 | {Opt_stripe, "stripe=%u"}, | 1243 | {Opt_stripe, "stripe=%u"}, |
1392 | {Opt_resize, "resize"}, | ||
1393 | {Opt_delalloc, "delalloc"}, | 1244 | {Opt_delalloc, "delalloc"}, |
1394 | {Opt_nodelalloc, "nodelalloc"}, | 1245 | {Opt_nodelalloc, "nodelalloc"}, |
1395 | {Opt_mblk_io_submit, "mblk_io_submit"}, | 1246 | {Opt_mblk_io_submit, "mblk_io_submit"}, |
@@ -1408,6 +1259,11 @@ static const match_table_t tokens = { | |||
1408 | {Opt_init_itable, "init_itable=%u"}, | 1259 | {Opt_init_itable, "init_itable=%u"}, |
1409 | {Opt_init_itable, "init_itable"}, | 1260 | {Opt_init_itable, "init_itable"}, |
1410 | {Opt_noinit_itable, "noinit_itable"}, | 1261 | {Opt_noinit_itable, "noinit_itable"}, |
1262 | {Opt_removed, "check=none"}, /* mount option from ext2/3 */ | ||
1263 | {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ | ||
1264 | {Opt_removed, "reservation"}, /* mount option from ext2/3 */ | ||
1265 | {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ | ||
1266 | {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ | ||
1411 | {Opt_err, NULL}, | 1267 | {Opt_err, NULL}, |
1412 | }; | 1268 | }; |
1413 | 1269 | ||
@@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype) | |||
1496 | } | 1352 | } |
1497 | #endif | 1353 | #endif |
1498 | 1354 | ||
1499 | static int parse_options(char *options, struct super_block *sb, | 1355 | #define MOPT_SET 0x0001 |
1500 | unsigned long *journal_devnum, | 1356 | #define MOPT_CLEAR 0x0002 |
1501 | unsigned int *journal_ioprio, | 1357 | #define MOPT_NOSUPPORT 0x0004 |
1502 | ext4_fsblk_t *n_blocks_count, int is_remount) | 1358 | #define MOPT_EXPLICIT 0x0008 |
1503 | { | 1359 | #define MOPT_CLEAR_ERR 0x0010 |
1504 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1360 | #define MOPT_GTE0 0x0020 |
1505 | char *p; | ||
1506 | substring_t args[MAX_OPT_ARGS]; | ||
1507 | int data_opt = 0; | ||
1508 | int option; | ||
1509 | #ifdef CONFIG_QUOTA | 1361 | #ifdef CONFIG_QUOTA |
1510 | int qfmt; | 1362 | #define MOPT_Q 0 |
1363 | #define MOPT_QFMT 0x0040 | ||
1364 | #else | ||
1365 | #define MOPT_Q MOPT_NOSUPPORT | ||
1366 | #define MOPT_QFMT MOPT_NOSUPPORT | ||
1511 | #endif | 1367 | #endif |
1512 | 1368 | #define MOPT_DATAJ 0x0080 | |
1513 | if (!options) | 1369 | |
1514 | return 1; | 1370 | static const struct mount_opts { |
1515 | 1371 | int token; | |
1516 | while ((p = strsep(&options, ",")) != NULL) { | 1372 | int mount_opt; |
1517 | int token; | 1373 | int flags; |
1518 | if (!*p) | 1374 | } ext4_mount_opts[] = { |
1519 | continue; | 1375 | {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, |
1520 | 1376 | {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, | |
1521 | /* | 1377 | {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, |
1522 | * Initialize args struct so we know whether arg was | 1378 | {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, |
1523 | * found; some options take optional arguments. | 1379 | {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, |
1524 | */ | 1380 | {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, |
1525 | args[0].to = args[0].from = NULL; | 1381 | {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, |
1526 | token = match_token(p, tokens, args); | 1382 | {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, |
1527 | switch (token) { | 1383 | {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, |
1528 | case Opt_bsd_df: | 1384 | {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, |
1529 | ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); | 1385 | {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, |
1530 | clear_opt(sb, MINIX_DF); | 1386 | {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, |
1531 | break; | 1387 | {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, |
1532 | case Opt_minix_df: | 1388 | {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, |
1533 | ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); | 1389 | {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, |
1534 | set_opt(sb, MINIX_DF); | 1390 | {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | |
1535 | 1391 | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, | |
1536 | break; | 1392 | {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, |
1537 | case Opt_grpid: | 1393 | {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, |
1538 | ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); | 1394 | {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, |
1539 | set_opt(sb, GRPID); | 1395 | {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, |
1540 | 1396 | {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, | |
1541 | break; | 1397 | {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, |
1542 | case Opt_nogrpid: | 1398 | {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, |
1543 | ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); | 1399 | {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, |
1544 | clear_opt(sb, GRPID); | 1400 | {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, |
1545 | 1401 | {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, | |
1546 | break; | 1402 | {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, |
1547 | case Opt_resuid: | 1403 | {Opt_commit, 0, MOPT_GTE0}, |
1548 | if (match_int(&args[0], &option)) | 1404 | {Opt_max_batch_time, 0, MOPT_GTE0}, |
1549 | return 0; | 1405 | {Opt_min_batch_time, 0, MOPT_GTE0}, |
1550 | sbi->s_resuid = option; | 1406 | {Opt_inode_readahead_blks, 0, MOPT_GTE0}, |
1551 | break; | 1407 | {Opt_init_itable, 0, MOPT_GTE0}, |
1552 | case Opt_resgid: | 1408 | {Opt_stripe, 0, MOPT_GTE0}, |
1553 | if (match_int(&args[0], &option)) | 1409 | {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, |
1554 | return 0; | 1410 | {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, |
1555 | sbi->s_resgid = option; | 1411 | {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, |
1556 | break; | ||
1557 | case Opt_sb: | ||
1558 | /* handled by get_sb_block() instead of here */ | ||
1559 | /* *sb_block = match_int(&args[0]); */ | ||
1560 | break; | ||
1561 | case Opt_err_panic: | ||
1562 | clear_opt(sb, ERRORS_CONT); | ||
1563 | clear_opt(sb, ERRORS_RO); | ||
1564 | set_opt(sb, ERRORS_PANIC); | ||
1565 | break; | ||
1566 | case Opt_err_ro: | ||
1567 | clear_opt(sb, ERRORS_CONT); | ||
1568 | clear_opt(sb, ERRORS_PANIC); | ||
1569 | set_opt(sb, ERRORS_RO); | ||
1570 | break; | ||
1571 | case Opt_err_cont: | ||
1572 | clear_opt(sb, ERRORS_RO); | ||
1573 | clear_opt(sb, ERRORS_PANIC); | ||
1574 | set_opt(sb, ERRORS_CONT); | ||
1575 | break; | ||
1576 | case Opt_nouid32: | ||
1577 | set_opt(sb, NO_UID32); | ||
1578 | break; | ||
1579 | case Opt_debug: | ||
1580 | set_opt(sb, DEBUG); | ||
1581 | break; | ||
1582 | case Opt_oldalloc: | ||
1583 | ext4_msg(sb, KERN_WARNING, | ||
1584 | "Ignoring deprecated oldalloc option"); | ||
1585 | break; | ||
1586 | case Opt_orlov: | ||
1587 | ext4_msg(sb, KERN_WARNING, | ||
1588 | "Ignoring deprecated orlov option"); | ||
1589 | break; | ||
1590 | #ifdef CONFIG_EXT4_FS_XATTR | 1412 | #ifdef CONFIG_EXT4_FS_XATTR |
1591 | case Opt_user_xattr: | 1413 | {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, |
1592 | set_opt(sb, XATTR_USER); | 1414 | {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, |
1593 | break; | ||
1594 | case Opt_nouser_xattr: | ||
1595 | clear_opt(sb, XATTR_USER); | ||
1596 | break; | ||
1597 | #else | 1415 | #else |
1598 | case Opt_user_xattr: | 1416 | {Opt_user_xattr, 0, MOPT_NOSUPPORT}, |
1599 | case Opt_nouser_xattr: | 1417 | {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, |
1600 | ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); | ||
1601 | break; | ||
1602 | #endif | 1418 | #endif |
1603 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 1419 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
1604 | case Opt_acl: | 1420 | {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, |
1605 | set_opt(sb, POSIX_ACL); | 1421 | {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, |
1606 | break; | ||
1607 | case Opt_noacl: | ||
1608 | clear_opt(sb, POSIX_ACL); | ||
1609 | break; | ||
1610 | #else | 1422 | #else |
1611 | case Opt_acl: | 1423 | {Opt_acl, 0, MOPT_NOSUPPORT}, |
1612 | case Opt_noacl: | 1424 | {Opt_noacl, 0, MOPT_NOSUPPORT}, |
1613 | ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); | ||
1614 | break; | ||
1615 | #endif | 1425 | #endif |
1616 | case Opt_journal_update: | 1426 | {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, |
1617 | /* @@@ FIXME */ | 1427 | {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, |
1618 | /* Eventually we will want to be able to create | 1428 | {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, |
1619 | a journal file here. For now, only allow the | 1429 | {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, |
1620 | user to specify an existing inode to be the | 1430 | MOPT_SET | MOPT_Q}, |
1621 | journal file. */ | 1431 | {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, |
1622 | if (is_remount) { | 1432 | MOPT_SET | MOPT_Q}, |
1623 | ext4_msg(sb, KERN_ERR, | 1433 | {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | |
1624 | "Cannot specify journal on remount"); | 1434 | EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, |
1625 | return 0; | 1435 | {Opt_usrjquota, 0, MOPT_Q}, |
1626 | } | 1436 | {Opt_grpjquota, 0, MOPT_Q}, |
1627 | set_opt(sb, UPDATE_JOURNAL); | 1437 | {Opt_offusrjquota, 0, MOPT_Q}, |
1628 | break; | 1438 | {Opt_offgrpjquota, 0, MOPT_Q}, |
1629 | case Opt_journal_dev: | 1439 | {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, |
1630 | if (is_remount) { | 1440 | {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, |
1441 | {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, | ||
1442 | {Opt_err, 0, 0} | ||
1443 | }; | ||
1444 | |||
1445 | static int handle_mount_opt(struct super_block *sb, char *opt, int token, | ||
1446 | substring_t *args, unsigned long *journal_devnum, | ||
1447 | unsigned int *journal_ioprio, int is_remount) | ||
1448 | { | ||
1449 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1450 | const struct mount_opts *m; | ||
1451 | int arg = 0; | ||
1452 | |||
1453 | if (args->from && match_int(args, &arg)) | ||
1454 | return -1; | ||
1455 | switch (token) { | ||
1456 | case Opt_noacl: | ||
1457 | case Opt_nouser_xattr: | ||
1458 | ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); | ||
1459 | break; | ||
1460 | case Opt_sb: | ||
1461 | return 1; /* handled by get_sb_block() */ | ||
1462 | case Opt_removed: | ||
1463 | ext4_msg(sb, KERN_WARNING, | ||
1464 | "Ignoring removed %s option", opt); | ||
1465 | return 1; | ||
1466 | case Opt_resuid: | ||
1467 | sbi->s_resuid = arg; | ||
1468 | return 1; | ||
1469 | case Opt_resgid: | ||
1470 | sbi->s_resgid = arg; | ||
1471 | return 1; | ||
1472 | case Opt_abort: | ||
1473 | sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; | ||
1474 | return 1; | ||
1475 | case Opt_i_version: | ||
1476 | sb->s_flags |= MS_I_VERSION; | ||
1477 | return 1; | ||
1478 | case Opt_journal_dev: | ||
1479 | if (is_remount) { | ||
1480 | ext4_msg(sb, KERN_ERR, | ||
1481 | "Cannot specify journal on remount"); | ||
1482 | return -1; | ||
1483 | } | ||
1484 | *journal_devnum = arg; | ||
1485 | return 1; | ||
1486 | case Opt_journal_ioprio: | ||
1487 | if (arg < 0 || arg > 7) | ||
1488 | return -1; | ||
1489 | *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); | ||
1490 | return 1; | ||
1491 | } | ||
1492 | |||
1493 | for (m = ext4_mount_opts; m->token != Opt_err; m++) { | ||
1494 | if (token != m->token) | ||
1495 | continue; | ||
1496 | if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) | ||
1497 | return -1; | ||
1498 | if (m->flags & MOPT_EXPLICIT) | ||
1499 | set_opt2(sb, EXPLICIT_DELALLOC); | ||
1500 | if (m->flags & MOPT_CLEAR_ERR) | ||
1501 | clear_opt(sb, ERRORS_MASK); | ||
1502 | if (token == Opt_noquota && sb_any_quota_loaded(sb)) { | ||
1503 | ext4_msg(sb, KERN_ERR, "Cannot change quota " | ||
1504 | "options when quota turned on"); | ||
1505 | return -1; | ||
1506 | } | ||
1507 | |||
1508 | if (m->flags & MOPT_NOSUPPORT) { | ||
1509 | ext4_msg(sb, KERN_ERR, "%s option not supported", opt); | ||
1510 | } else if (token == Opt_commit) { | ||
1511 | if (arg == 0) | ||
1512 | arg = JBD2_DEFAULT_MAX_COMMIT_AGE; | ||
1513 | sbi->s_commit_interval = HZ * arg; | ||
1514 | } else if (token == Opt_max_batch_time) { | ||
1515 | if (arg == 0) | ||
1516 | arg = EXT4_DEF_MAX_BATCH_TIME; | ||
1517 | sbi->s_max_batch_time = arg; | ||
1518 | } else if (token == Opt_min_batch_time) { | ||
1519 | sbi->s_min_batch_time = arg; | ||
1520 | } else if (token == Opt_inode_readahead_blks) { | ||
1521 | if (arg > (1 << 30)) | ||
1522 | return -1; | ||
1523 | if (arg && !is_power_of_2(arg)) { | ||
1631 | ext4_msg(sb, KERN_ERR, | 1524 | ext4_msg(sb, KERN_ERR, |
1632 | "Cannot specify journal on remount"); | 1525 | "EXT4-fs: inode_readahead_blks" |
1633 | return 0; | 1526 | " must be a power of 2"); |
1527 | return -1; | ||
1634 | } | 1528 | } |
1635 | if (match_int(&args[0], &option)) | 1529 | sbi->s_inode_readahead_blks = arg; |
1636 | return 0; | 1530 | } else if (token == Opt_init_itable) { |
1637 | *journal_devnum = option; | 1531 | set_opt(sb, INIT_INODE_TABLE); |
1638 | break; | 1532 | if (!args->from) |
1639 | case Opt_journal_checksum: | 1533 | arg = EXT4_DEF_LI_WAIT_MULT; |
1640 | set_opt(sb, JOURNAL_CHECKSUM); | 1534 | sbi->s_li_wait_mult = arg; |
1641 | break; | 1535 | } else if (token == Opt_stripe) { |
1642 | case Opt_journal_async_commit: | 1536 | sbi->s_stripe = arg; |
1643 | set_opt(sb, JOURNAL_ASYNC_COMMIT); | 1537 | } else if (m->flags & MOPT_DATAJ) { |
1644 | set_opt(sb, JOURNAL_CHECKSUM); | ||
1645 | break; | ||
1646 | case Opt_noload: | ||
1647 | set_opt(sb, NOLOAD); | ||
1648 | break; | ||
1649 | case Opt_commit: | ||
1650 | if (match_int(&args[0], &option)) | ||
1651 | return 0; | ||
1652 | if (option < 0) | ||
1653 | return 0; | ||
1654 | if (option == 0) | ||
1655 | option = JBD2_DEFAULT_MAX_COMMIT_AGE; | ||
1656 | sbi->s_commit_interval = HZ * option; | ||
1657 | break; | ||
1658 | case Opt_max_batch_time: | ||
1659 | if (match_int(&args[0], &option)) | ||
1660 | return 0; | ||
1661 | if (option < 0) | ||
1662 | return 0; | ||
1663 | if (option == 0) | ||
1664 | option = EXT4_DEF_MAX_BATCH_TIME; | ||
1665 | sbi->s_max_batch_time = option; | ||
1666 | break; | ||
1667 | case Opt_min_batch_time: | ||
1668 | if (match_int(&args[0], &option)) | ||
1669 | return 0; | ||
1670 | if (option < 0) | ||
1671 | return 0; | ||
1672 | sbi->s_min_batch_time = option; | ||
1673 | break; | ||
1674 | case Opt_data_journal: | ||
1675 | data_opt = EXT4_MOUNT_JOURNAL_DATA; | ||
1676 | goto datacheck; | ||
1677 | case Opt_data_ordered: | ||
1678 | data_opt = EXT4_MOUNT_ORDERED_DATA; | ||
1679 | goto datacheck; | ||
1680 | case Opt_data_writeback: | ||
1681 | data_opt = EXT4_MOUNT_WRITEBACK_DATA; | ||
1682 | datacheck: | ||
1683 | if (is_remount) { | 1538 | if (is_remount) { |
1684 | if (!sbi->s_journal) | 1539 | if (!sbi->s_journal) |
1685 | ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); | 1540 | ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); |
1686 | else if (test_opt(sb, DATA_FLAGS) != data_opt) { | 1541 | else if (test_opt(sb, DATA_FLAGS) != |
1542 | m->mount_opt) { | ||
1687 | ext4_msg(sb, KERN_ERR, | 1543 | ext4_msg(sb, KERN_ERR, |
1688 | "Cannot change data mode on remount"); | 1544 | "Cannot change data mode on remount"); |
1689 | return 0; | 1545 | return -1; |
1690 | } | 1546 | } |
1691 | } else { | 1547 | } else { |
1692 | clear_opt(sb, DATA_FLAGS); | 1548 | clear_opt(sb, DATA_FLAGS); |
1693 | sbi->s_mount_opt |= data_opt; | 1549 | sbi->s_mount_opt |= m->mount_opt; |
1694 | } | 1550 | } |
1695 | break; | ||
1696 | case Opt_data_err_abort: | ||
1697 | set_opt(sb, DATA_ERR_ABORT); | ||
1698 | break; | ||
1699 | case Opt_data_err_ignore: | ||
1700 | clear_opt(sb, DATA_ERR_ABORT); | ||
1701 | break; | ||
1702 | #ifdef CONFIG_QUOTA | 1551 | #ifdef CONFIG_QUOTA |
1703 | case Opt_usrjquota: | 1552 | } else if (token == Opt_usrjquota) { |
1704 | if (!set_qf_name(sb, USRQUOTA, &args[0])) | 1553 | if (!set_qf_name(sb, USRQUOTA, &args[0])) |
1705 | return 0; | 1554 | return -1; |
1706 | break; | 1555 | } else if (token == Opt_grpjquota) { |
1707 | case Opt_grpjquota: | ||
1708 | if (!set_qf_name(sb, GRPQUOTA, &args[0])) | 1556 | if (!set_qf_name(sb, GRPQUOTA, &args[0])) |
1709 | return 0; | 1557 | return -1; |
1710 | break; | 1558 | } else if (token == Opt_offusrjquota) { |
1711 | case Opt_offusrjquota: | ||
1712 | if (!clear_qf_name(sb, USRQUOTA)) | 1559 | if (!clear_qf_name(sb, USRQUOTA)) |
1713 | return 0; | 1560 | return -1; |
1714 | break; | 1561 | } else if (token == Opt_offgrpjquota) { |
1715 | case Opt_offgrpjquota: | ||
1716 | if (!clear_qf_name(sb, GRPQUOTA)) | 1562 | if (!clear_qf_name(sb, GRPQUOTA)) |
1717 | return 0; | 1563 | return -1; |
1718 | break; | 1564 | } else if (m->flags & MOPT_QFMT) { |
1719 | |||
1720 | case Opt_jqfmt_vfsold: | ||
1721 | qfmt = QFMT_VFS_OLD; | ||
1722 | goto set_qf_format; | ||
1723 | case Opt_jqfmt_vfsv0: | ||
1724 | qfmt = QFMT_VFS_V0; | ||
1725 | goto set_qf_format; | ||
1726 | case Opt_jqfmt_vfsv1: | ||
1727 | qfmt = QFMT_VFS_V1; | ||
1728 | set_qf_format: | ||
1729 | if (sb_any_quota_loaded(sb) && | 1565 | if (sb_any_quota_loaded(sb) && |
1730 | sbi->s_jquota_fmt != qfmt) { | 1566 | sbi->s_jquota_fmt != m->mount_opt) { |
1731 | ext4_msg(sb, KERN_ERR, "Cannot change " | 1567 | ext4_msg(sb, KERN_ERR, "Cannot " |
1732 | "journaled quota options when " | 1568 | "change journaled quota options " |
1733 | "quota turned on"); | 1569 | "when quota turned on"); |
1734 | return 0; | 1570 | return -1; |
1735 | } | ||
1736 | sbi->s_jquota_fmt = qfmt; | ||
1737 | break; | ||
1738 | case Opt_quota: | ||
1739 | case Opt_usrquota: | ||
1740 | set_opt(sb, QUOTA); | ||
1741 | set_opt(sb, USRQUOTA); | ||
1742 | break; | ||
1743 | case Opt_grpquota: | ||
1744 | set_opt(sb, QUOTA); | ||
1745 | set_opt(sb, GRPQUOTA); | ||
1746 | break; | ||
1747 | case Opt_noquota: | ||
1748 | if (sb_any_quota_loaded(sb)) { | ||
1749 | ext4_msg(sb, KERN_ERR, "Cannot change quota " | ||
1750 | "options when quota turned on"); | ||
1751 | return 0; | ||
1752 | } | 1571 | } |
1753 | clear_opt(sb, QUOTA); | 1572 | sbi->s_jquota_fmt = m->mount_opt; |
1754 | clear_opt(sb, USRQUOTA); | ||
1755 | clear_opt(sb, GRPQUOTA); | ||
1756 | break; | ||
1757 | #else | ||
1758 | case Opt_quota: | ||
1759 | case Opt_usrquota: | ||
1760 | case Opt_grpquota: | ||
1761 | ext4_msg(sb, KERN_ERR, | ||
1762 | "quota options not supported"); | ||
1763 | break; | ||
1764 | case Opt_usrjquota: | ||
1765 | case Opt_grpjquota: | ||
1766 | case Opt_offusrjquota: | ||
1767 | case Opt_offgrpjquota: | ||
1768 | case Opt_jqfmt_vfsold: | ||
1769 | case Opt_jqfmt_vfsv0: | ||
1770 | case Opt_jqfmt_vfsv1: | ||
1771 | ext4_msg(sb, KERN_ERR, | ||
1772 | "journaled quota options not supported"); | ||
1773 | break; | ||
1774 | case Opt_noquota: | ||
1775 | break; | ||
1776 | #endif | 1573 | #endif |
1777 | case Opt_abort: | 1574 | } else { |
1778 | sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; | 1575 | if (!args->from) |
1779 | break; | 1576 | arg = 1; |
1780 | case Opt_nobarrier: | 1577 | if (m->flags & MOPT_CLEAR) |
1781 | clear_opt(sb, BARRIER); | 1578 | arg = !arg; |
1782 | break; | 1579 | else if (unlikely(!(m->flags & MOPT_SET))) { |
1783 | case Opt_barrier: | 1580 | ext4_msg(sb, KERN_WARNING, |
1784 | if (args[0].from) { | 1581 | "buggy handling of option %s", opt); |
1785 | if (match_int(&args[0], &option)) | 1582 | WARN_ON(1); |
1786 | return 0; | 1583 | return -1; |
1787 | } else | ||
1788 | option = 1; /* No argument, default to 1 */ | ||
1789 | if (option) | ||
1790 | set_opt(sb, BARRIER); | ||
1791 | else | ||
1792 | clear_opt(sb, BARRIER); | ||
1793 | break; | ||
1794 | case Opt_ignore: | ||
1795 | break; | ||
1796 | case Opt_resize: | ||
1797 | if (!is_remount) { | ||
1798 | ext4_msg(sb, KERN_ERR, | ||
1799 | "resize option only available " | ||
1800 | "for remount"); | ||
1801 | return 0; | ||
1802 | } | ||
1803 | if (match_int(&args[0], &option) != 0) | ||
1804 | return 0; | ||
1805 | *n_blocks_count = option; | ||
1806 | break; | ||
1807 | case Opt_nobh: | ||
1808 | ext4_msg(sb, KERN_WARNING, | ||
1809 | "Ignoring deprecated nobh option"); | ||
1810 | break; | ||
1811 | case Opt_bh: | ||
1812 | ext4_msg(sb, KERN_WARNING, | ||
1813 | "Ignoring deprecated bh option"); | ||
1814 | break; | ||
1815 | case Opt_i_version: | ||
1816 | set_opt(sb, I_VERSION); | ||
1817 | sb->s_flags |= MS_I_VERSION; | ||
1818 | break; | ||
1819 | case Opt_nodelalloc: | ||
1820 | clear_opt(sb, DELALLOC); | ||
1821 | clear_opt2(sb, EXPLICIT_DELALLOC); | ||
1822 | break; | ||
1823 | case Opt_mblk_io_submit: | ||
1824 | set_opt(sb, MBLK_IO_SUBMIT); | ||
1825 | break; | ||
1826 | case Opt_nomblk_io_submit: | ||
1827 | clear_opt(sb, MBLK_IO_SUBMIT); | ||
1828 | break; | ||
1829 | case Opt_stripe: | ||
1830 | if (match_int(&args[0], &option)) | ||
1831 | return 0; | ||
1832 | if (option < 0) | ||
1833 | return 0; | ||
1834 | sbi->s_stripe = option; | ||
1835 | break; | ||
1836 | case Opt_delalloc: | ||
1837 | set_opt(sb, DELALLOC); | ||
1838 | set_opt2(sb, EXPLICIT_DELALLOC); | ||
1839 | break; | ||
1840 | case Opt_block_validity: | ||
1841 | set_opt(sb, BLOCK_VALIDITY); | ||
1842 | break; | ||
1843 | case Opt_noblock_validity: | ||
1844 | clear_opt(sb, BLOCK_VALIDITY); | ||
1845 | break; | ||
1846 | case Opt_inode_readahead_blks: | ||
1847 | if (match_int(&args[0], &option)) | ||
1848 | return 0; | ||
1849 | if (option < 0 || option > (1 << 30)) | ||
1850 | return 0; | ||
1851 | if (option && !is_power_of_2(option)) { | ||
1852 | ext4_msg(sb, KERN_ERR, | ||
1853 | "EXT4-fs: inode_readahead_blks" | ||
1854 | " must be a power of 2"); | ||
1855 | return 0; | ||
1856 | } | 1584 | } |
1857 | sbi->s_inode_readahead_blks = option; | 1585 | if (arg != 0) |
1858 | break; | 1586 | sbi->s_mount_opt |= m->mount_opt; |
1859 | case Opt_journal_ioprio: | ||
1860 | if (match_int(&args[0], &option)) | ||
1861 | return 0; | ||
1862 | if (option < 0 || option > 7) | ||
1863 | break; | ||
1864 | *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, | ||
1865 | option); | ||
1866 | break; | ||
1867 | case Opt_noauto_da_alloc: | ||
1868 | set_opt(sb, NO_AUTO_DA_ALLOC); | ||
1869 | break; | ||
1870 | case Opt_auto_da_alloc: | ||
1871 | if (args[0].from) { | ||
1872 | if (match_int(&args[0], &option)) | ||
1873 | return 0; | ||
1874 | } else | ||
1875 | option = 1; /* No argument, default to 1 */ | ||
1876 | if (option) | ||
1877 | clear_opt(sb, NO_AUTO_DA_ALLOC); | ||
1878 | else | 1587 | else |
1879 | set_opt(sb,NO_AUTO_DA_ALLOC); | 1588 | sbi->s_mount_opt &= ~m->mount_opt; |
1880 | break; | ||
1881 | case Opt_discard: | ||
1882 | set_opt(sb, DISCARD); | ||
1883 | break; | ||
1884 | case Opt_nodiscard: | ||
1885 | clear_opt(sb, DISCARD); | ||
1886 | break; | ||
1887 | case Opt_dioread_nolock: | ||
1888 | set_opt(sb, DIOREAD_NOLOCK); | ||
1889 | break; | ||
1890 | case Opt_dioread_lock: | ||
1891 | clear_opt(sb, DIOREAD_NOLOCK); | ||
1892 | break; | ||
1893 | case Opt_init_itable: | ||
1894 | set_opt(sb, INIT_INODE_TABLE); | ||
1895 | if (args[0].from) { | ||
1896 | if (match_int(&args[0], &option)) | ||
1897 | return 0; | ||
1898 | } else | ||
1899 | option = EXT4_DEF_LI_WAIT_MULT; | ||
1900 | if (option < 0) | ||
1901 | return 0; | ||
1902 | sbi->s_li_wait_mult = option; | ||
1903 | break; | ||
1904 | case Opt_noinit_itable: | ||
1905 | clear_opt(sb, INIT_INODE_TABLE); | ||
1906 | break; | ||
1907 | default: | ||
1908 | ext4_msg(sb, KERN_ERR, | ||
1909 | "Unrecognized mount option \"%s\" " | ||
1910 | "or missing value", p); | ||
1911 | return 0; | ||
1912 | } | 1589 | } |
1590 | return 1; | ||
1591 | } | ||
1592 | ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " | ||
1593 | "or missing value", opt); | ||
1594 | return -1; | ||
1595 | } | ||
1596 | |||
1597 | static int parse_options(char *options, struct super_block *sb, | ||
1598 | unsigned long *journal_devnum, | ||
1599 | unsigned int *journal_ioprio, | ||
1600 | int is_remount) | ||
1601 | { | ||
1602 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1603 | char *p; | ||
1604 | substring_t args[MAX_OPT_ARGS]; | ||
1605 | int token; | ||
1606 | |||
1607 | if (!options) | ||
1608 | return 1; | ||
1609 | |||
1610 | while ((p = strsep(&options, ",")) != NULL) { | ||
1611 | if (!*p) | ||
1612 | continue; | ||
1613 | /* | ||
1614 | * Initialize args struct so we know whether arg was | ||
1615 | * found; some options take optional arguments. | ||
1616 | */ | ||
1617 | args[0].to = args[0].from = 0; | ||
1618 | token = match_token(p, tokens, args); | ||
1619 | if (handle_mount_opt(sb, p, token, args, journal_devnum, | ||
1620 | journal_ioprio, is_remount) < 0) | ||
1621 | return 0; | ||
1913 | } | 1622 | } |
1914 | #ifdef CONFIG_QUOTA | 1623 | #ifdef CONFIG_QUOTA |
1915 | if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { | 1624 | if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { |
@@ -1942,6 +1651,160 @@ set_qf_format: | |||
1942 | return 1; | 1651 | return 1; |
1943 | } | 1652 | } |
1944 | 1653 | ||
1654 | static inline void ext4_show_quota_options(struct seq_file *seq, | ||
1655 | struct super_block *sb) | ||
1656 | { | ||
1657 | #if defined(CONFIG_QUOTA) | ||
1658 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1659 | |||
1660 | if (sbi->s_jquota_fmt) { | ||
1661 | char *fmtname = ""; | ||
1662 | |||
1663 | switch (sbi->s_jquota_fmt) { | ||
1664 | case QFMT_VFS_OLD: | ||
1665 | fmtname = "vfsold"; | ||
1666 | break; | ||
1667 | case QFMT_VFS_V0: | ||
1668 | fmtname = "vfsv0"; | ||
1669 | break; | ||
1670 | case QFMT_VFS_V1: | ||
1671 | fmtname = "vfsv1"; | ||
1672 | break; | ||
1673 | } | ||
1674 | seq_printf(seq, ",jqfmt=%s", fmtname); | ||
1675 | } | ||
1676 | |||
1677 | if (sbi->s_qf_names[USRQUOTA]) | ||
1678 | seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); | ||
1679 | |||
1680 | if (sbi->s_qf_names[GRPQUOTA]) | ||
1681 | seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); | ||
1682 | |||
1683 | if (test_opt(sb, USRQUOTA)) | ||
1684 | seq_puts(seq, ",usrquota"); | ||
1685 | |||
1686 | if (test_opt(sb, GRPQUOTA)) | ||
1687 | seq_puts(seq, ",grpquota"); | ||
1688 | #endif | ||
1689 | } | ||
1690 | |||
1691 | static const char *token2str(int token) | ||
1692 | { | ||
1693 | static const struct match_token *t; | ||
1694 | |||
1695 | for (t = tokens; t->token != Opt_err; t++) | ||
1696 | if (t->token == token && !strchr(t->pattern, '=')) | ||
1697 | break; | ||
1698 | return t->pattern; | ||
1699 | } | ||
1700 | |||
1701 | /* | ||
1702 | * Show an option if | ||
1703 | * - it's set to a non-default value OR | ||
1704 | * - if the per-sb default is different from the global default | ||
1705 | */ | ||
1706 | static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, | ||
1707 | int nodefs) | ||
1708 | { | ||
1709 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1710 | struct ext4_super_block *es = sbi->s_es; | ||
1711 | int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; | ||
1712 | const struct mount_opts *m; | ||
1713 | char sep = nodefs ? '\n' : ','; | ||
1714 | |||
1715 | #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) | ||
1716 | #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) | ||
1717 | |||
1718 | if (sbi->s_sb_block != 1) | ||
1719 | SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); | ||
1720 | |||
1721 | for (m = ext4_mount_opts; m->token != Opt_err; m++) { | ||
1722 | int want_set = m->flags & MOPT_SET; | ||
1723 | if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || | ||
1724 | (m->flags & MOPT_CLEAR_ERR)) | ||
1725 | continue; | ||
1726 | if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) | ||
1727 | continue; /* skip if same as the default */ | ||
1728 | if ((want_set && | ||
1729 | (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || | ||
1730 | (!want_set && (sbi->s_mount_opt & m->mount_opt))) | ||
1731 | continue; /* select Opt_noFoo vs Opt_Foo */ | ||
1732 | SEQ_OPTS_PRINT("%s", token2str(m->token)); | ||
1733 | } | ||
1734 | |||
1735 | if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || | ||
1736 | le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) | ||
1737 | SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); | ||
1738 | if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || | ||
1739 | le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) | ||
1740 | SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); | ||
1741 | def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); | ||
1742 | if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) | ||
1743 | SEQ_OPTS_PUTS("errors=remount-ro"); | ||
1744 | if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) | ||
1745 | SEQ_OPTS_PUTS("errors=continue"); | ||
1746 | if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) | ||
1747 | SEQ_OPTS_PUTS("errors=panic"); | ||
1748 | if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) | ||
1749 | SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); | ||
1750 | if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) | ||
1751 | SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); | ||
1752 | if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) | ||
1753 | SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); | ||
1754 | if (sb->s_flags & MS_I_VERSION) | ||
1755 | SEQ_OPTS_PUTS("i_version"); | ||
1756 | if (nodefs || sbi->s_stripe) | ||
1757 | SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); | ||
1758 | if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { | ||
1759 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | ||
1760 | SEQ_OPTS_PUTS("data=journal"); | ||
1761 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | ||
1762 | SEQ_OPTS_PUTS("data=ordered"); | ||
1763 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | ||
1764 | SEQ_OPTS_PUTS("data=writeback"); | ||
1765 | } | ||
1766 | if (nodefs || | ||
1767 | sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) | ||
1768 | SEQ_OPTS_PRINT("inode_readahead_blks=%u", | ||
1769 | sbi->s_inode_readahead_blks); | ||
1770 | |||
1771 | if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && | ||
1772 | (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) | ||
1773 | SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); | ||
1774 | |||
1775 | ext4_show_quota_options(seq, sb); | ||
1776 | return 0; | ||
1777 | } | ||
1778 | |||
1779 | static int ext4_show_options(struct seq_file *seq, struct dentry *root) | ||
1780 | { | ||
1781 | return _ext4_show_options(seq, root->d_sb, 0); | ||
1782 | } | ||
1783 | |||
1784 | static int options_seq_show(struct seq_file *seq, void *offset) | ||
1785 | { | ||
1786 | struct super_block *sb = seq->private; | ||
1787 | int rc; | ||
1788 | |||
1789 | seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); | ||
1790 | rc = _ext4_show_options(seq, sb, 1); | ||
1791 | seq_puts(seq, "\n"); | ||
1792 | return rc; | ||
1793 | } | ||
1794 | |||
1795 | static int options_open_fs(struct inode *inode, struct file *file) | ||
1796 | { | ||
1797 | return single_open(file, options_seq_show, PDE(inode)->data); | ||
1798 | } | ||
1799 | |||
1800 | static const struct file_operations ext4_seq_options_fops = { | ||
1801 | .owner = THIS_MODULE, | ||
1802 | .open = options_open_fs, | ||
1803 | .read = seq_read, | ||
1804 | .llseek = seq_lseek, | ||
1805 | .release = single_release, | ||
1806 | }; | ||
1807 | |||
1945 | static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | 1808 | static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, |
1946 | int read_only) | 1809 | int read_only) |
1947 | { | 1810 | { |
@@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void) | |||
2945 | ext4_clear_request_list(); | 2808 | ext4_clear_request_list(); |
2946 | kfree(ext4_li_info); | 2809 | kfree(ext4_li_info); |
2947 | ext4_li_info = NULL; | 2810 | ext4_li_info = NULL; |
2948 | printk(KERN_CRIT "EXT4: error %d creating inode table " | 2811 | printk(KERN_CRIT "EXT4-fs: error %d creating inode table " |
2949 | "initialization thread\n", | 2812 | "initialization thread\n", |
2950 | err); | 2813 | err); |
2951 | return err; | 2814 | return err; |
@@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3183 | set_opt(sb, INIT_INODE_TABLE); | 3046 | set_opt(sb, INIT_INODE_TABLE); |
3184 | if (def_mount_opts & EXT4_DEFM_DEBUG) | 3047 | if (def_mount_opts & EXT4_DEFM_DEBUG) |
3185 | set_opt(sb, DEBUG); | 3048 | set_opt(sb, DEBUG); |
3186 | if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { | 3049 | if (def_mount_opts & EXT4_DEFM_BSDGROUPS) |
3187 | ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", | ||
3188 | "2.6.38"); | ||
3189 | set_opt(sb, GRPID); | 3050 | set_opt(sb, GRPID); |
3190 | } | ||
3191 | if (def_mount_opts & EXT4_DEFM_UID16) | 3051 | if (def_mount_opts & EXT4_DEFM_UID16) |
3192 | set_opt(sb, NO_UID32); | 3052 | set_opt(sb, NO_UID32); |
3193 | /* xattr user namespace & acls are now defaulted on */ | 3053 | /* xattr user namespace & acls are now defaulted on */ |
@@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3240 | sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; | 3100 | sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; |
3241 | 3101 | ||
3242 | if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, | 3102 | if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, |
3243 | &journal_devnum, &journal_ioprio, NULL, 0)) { | 3103 | &journal_devnum, &journal_ioprio, 0)) { |
3244 | ext4_msg(sb, KERN_WARNING, | 3104 | ext4_msg(sb, KERN_WARNING, |
3245 | "failed to parse options in superblock: %s", | 3105 | "failed to parse options in superblock: %s", |
3246 | sbi->s_es->s_mount_opts); | 3106 | sbi->s_es->s_mount_opts); |
3247 | } | 3107 | } |
3108 | sbi->s_def_mount_opt = sbi->s_mount_opt; | ||
3248 | if (!parse_options((char *) data, sb, &journal_devnum, | 3109 | if (!parse_options((char *) data, sb, &journal_devnum, |
3249 | &journal_ioprio, NULL, 0)) | 3110 | &journal_ioprio, 0)) |
3250 | goto failed_mount; | 3111 | goto failed_mount; |
3251 | 3112 | ||
3252 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | 3113 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { |
@@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3416 | #else | 3277 | #else |
3417 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); | 3278 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); |
3418 | #endif | 3279 | #endif |
3419 | sb->s_dirt = 1; | ||
3420 | } | 3280 | } |
3421 | 3281 | ||
3422 | /* Handle clustersize */ | 3282 | /* Handle clustersize */ |
@@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3540 | if (ext4_proc_root) | 3400 | if (ext4_proc_root) |
3541 | sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); | 3401 | sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); |
3542 | 3402 | ||
3403 | if (sbi->s_proc) | ||
3404 | proc_create_data("options", S_IRUGO, sbi->s_proc, | ||
3405 | &ext4_seq_options_fops, sb); | ||
3406 | |||
3543 | bgl_lock_init(sbi->s_blockgroup_lock); | 3407 | bgl_lock_init(sbi->s_blockgroup_lock); |
3544 | 3408 | ||
3545 | for (i = 0; i < db_count; i++) { | 3409 | for (i = 0; i < db_count; i++) { |
@@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3694 | } | 3558 | } |
3695 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); | 3559 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); |
3696 | 3560 | ||
3561 | sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; | ||
3562 | |||
3697 | /* | 3563 | /* |
3698 | * The journal may have updated the bg summary counts, so we | 3564 | * The journal may have updated the bg summary counts, so we |
3699 | * need to update the global counters. | 3565 | * need to update the global counters. |
@@ -3861,6 +3727,7 @@ failed_mount2: | |||
3861 | ext4_kvfree(sbi->s_group_desc); | 3727 | ext4_kvfree(sbi->s_group_desc); |
3862 | failed_mount: | 3728 | failed_mount: |
3863 | if (sbi->s_proc) { | 3729 | if (sbi->s_proc) { |
3730 | remove_proc_entry("options", sbi->s_proc); | ||
3864 | remove_proc_entry(sb->s_id, ext4_proc_root); | 3731 | remove_proc_entry(sb->s_id, ext4_proc_root); |
3865 | } | 3732 | } |
3866 | #ifdef CONFIG_QUOTA | 3733 | #ifdef CONFIG_QUOTA |
@@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb, | |||
4090 | if (!(journal->j_flags & JBD2_BARRIER)) | 3957 | if (!(journal->j_flags & JBD2_BARRIER)) |
4091 | ext4_msg(sb, KERN_INFO, "barriers disabled"); | 3958 | ext4_msg(sb, KERN_INFO, "barriers disabled"); |
4092 | 3959 | ||
4093 | if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { | ||
4094 | err = jbd2_journal_update_format(journal); | ||
4095 | if (err) { | ||
4096 | ext4_msg(sb, KERN_ERR, "error updating journal"); | ||
4097 | jbd2_journal_destroy(journal); | ||
4098 | return err; | ||
4099 | } | ||
4100 | } | ||
4101 | |||
4102 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) | 3960 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) |
4103 | err = jbd2_journal_wipe(journal, !really_read_only); | 3961 | err = jbd2_journal_wipe(journal, !really_read_only); |
4104 | if (!err) { | 3962 | if (!err) { |
@@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
4385 | { | 4243 | { |
4386 | struct ext4_super_block *es; | 4244 | struct ext4_super_block *es; |
4387 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4245 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4388 | ext4_fsblk_t n_blocks_count = 0; | ||
4389 | unsigned long old_sb_flags; | 4246 | unsigned long old_sb_flags; |
4390 | struct ext4_mount_options old_opts; | 4247 | struct ext4_mount_options old_opts; |
4391 | int enable_quota = 0; | 4248 | int enable_quota = 0; |
@@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
4418 | /* | 4275 | /* |
4419 | * Allow the "check" option to be passed as a remount option. | 4276 | * Allow the "check" option to be passed as a remount option. |
4420 | */ | 4277 | */ |
4421 | if (!parse_options(data, sb, NULL, &journal_ioprio, | 4278 | if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { |
4422 | &n_blocks_count, 1)) { | ||
4423 | err = -EINVAL; | 4279 | err = -EINVAL; |
4424 | goto restore_opts; | 4280 | goto restore_opts; |
4425 | } | 4281 | } |
@@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
4437 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); | 4293 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); |
4438 | } | 4294 | } |
4439 | 4295 | ||
4440 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || | 4296 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { |
4441 | n_blocks_count > ext4_blocks_count(es)) { | ||
4442 | if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { | 4297 | if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { |
4443 | err = -EROFS; | 4298 | err = -EROFS; |
4444 | goto restore_opts; | 4299 | goto restore_opts; |
@@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
4513 | if (sbi->s_journal) | 4368 | if (sbi->s_journal) |
4514 | ext4_clear_journal_err(sb, es); | 4369 | ext4_clear_journal_err(sb, es); |
4515 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 4370 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
4516 | if ((err = ext4_group_extend(sb, es, n_blocks_count))) | ||
4517 | goto restore_opts; | ||
4518 | if (!ext4_setup_super(sb, es, 0)) | 4371 | if (!ext4_setup_super(sb, es, 0)) |
4519 | sb->s_flags &= ~MS_RDONLY; | 4372 | sb->s_flags &= ~MS_RDONLY; |
4520 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, | 4373 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93a00d89a220..e88748e55c0f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -82,8 +82,8 @@ | |||
82 | printk("\n"); \ | 82 | printk("\n"); \ |
83 | } while (0) | 83 | } while (0) |
84 | #else | 84 | #else |
85 | # define ea_idebug(f...) | 85 | # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) |
86 | # define ea_bdebug(f...) | 86 | # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) |
87 | #endif | 87 | #endif |
88 | 88 | ||
89 | static void ext4_xattr_cache_insert(struct buffer_head *); | 89 | static void ext4_xattr_cache_insert(struct buffer_head *); |
@@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) | |||
158 | static inline int | 158 | static inline int |
159 | ext4_xattr_check_block(struct buffer_head *bh) | 159 | ext4_xattr_check_block(struct buffer_head *bh) |
160 | { | 160 | { |
161 | int error; | ||
162 | |||
163 | if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || | 161 | if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || |
164 | BHDR(bh)->h_blocks != cpu_to_le32(1)) | 162 | BHDR(bh)->h_blocks != cpu_to_le32(1)) |
165 | return -EIO; | 163 | return -EIO; |
166 | error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); | 164 | return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); |
167 | return error; | ||
168 | } | 165 | } |
169 | 166 | ||
170 | static inline int | 167 | static inline int |
@@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, | |||
220 | error = -ENODATA; | 217 | error = -ENODATA; |
221 | if (!EXT4_I(inode)->i_file_acl) | 218 | if (!EXT4_I(inode)->i_file_acl) |
222 | goto cleanup; | 219 | goto cleanup; |
223 | ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); | 220 | ea_idebug(inode, "reading block %llu", |
221 | (unsigned long long)EXT4_I(inode)->i_file_acl); | ||
224 | bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); | 222 | bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); |
225 | if (!bh) | 223 | if (!bh) |
226 | goto cleanup; | 224 | goto cleanup; |
@@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) | |||
363 | error = 0; | 361 | error = 0; |
364 | if (!EXT4_I(inode)->i_file_acl) | 362 | if (!EXT4_I(inode)->i_file_acl) |
365 | goto cleanup; | 363 | goto cleanup; |
366 | ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); | 364 | ea_idebug(inode, "reading block %llu", |
365 | (unsigned long long)EXT4_I(inode)->i_file_acl); | ||
367 | bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); | 366 | bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); |
368 | error = -EIO; | 367 | error = -EIO; |
369 | if (!bh) | 368 | if (!bh) |
@@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, | |||
487 | ext4_free_blocks(handle, inode, bh, 0, 1, | 486 | ext4_free_blocks(handle, inode, bh, 0, 1, |
488 | EXT4_FREE_BLOCKS_METADATA | | 487 | EXT4_FREE_BLOCKS_METADATA | |
489 | EXT4_FREE_BLOCKS_FORGET); | 488 | EXT4_FREE_BLOCKS_FORGET); |
489 | unlock_buffer(bh); | ||
490 | } else { | 490 | } else { |
491 | le32_add_cpu(&BHDR(bh)->h_refcount, -1); | 491 | le32_add_cpu(&BHDR(bh)->h_refcount, -1); |
492 | if (ce) | ||
493 | mb_cache_entry_release(ce); | ||
494 | unlock_buffer(bh); | ||
492 | error = ext4_handle_dirty_metadata(handle, inode, bh); | 495 | error = ext4_handle_dirty_metadata(handle, inode, bh); |
493 | if (IS_SYNC(inode)) | 496 | if (IS_SYNC(inode)) |
494 | ext4_handle_sync(handle); | 497 | ext4_handle_sync(handle); |
495 | dquot_free_block(inode, 1); | 498 | dquot_free_block(inode, 1); |
496 | ea_bdebug(bh, "refcount now=%d; releasing", | 499 | ea_bdebug(bh, "refcount now=%d; releasing", |
497 | le32_to_cpu(BHDR(bh)->h_refcount)); | 500 | le32_to_cpu(BHDR(bh)->h_refcount)); |
498 | if (ce) | ||
499 | mb_cache_entry_release(ce); | ||
500 | } | 501 | } |
501 | unlock_buffer(bh); | ||
502 | out: | 502 | out: |
503 | ext4_std_error(inode->i_sb, error); | 503 | ext4_std_error(inode->i_sb, error); |
504 | return; | 504 | return; |
@@ -834,7 +834,8 @@ inserted: | |||
834 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 834 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
835 | BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); | 835 | BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); |
836 | 836 | ||
837 | ea_idebug(inode, "creating block %d", block); | 837 | ea_idebug(inode, "creating block %llu", |
838 | (unsigned long long)block); | ||
838 | 839 | ||
839 | new_bh = sb_getblk(sb, block); | 840 | new_bh = sb_getblk(sb, block); |
840 | if (!new_bh) { | 841 | if (!new_bh) { |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d49d202903fb..c78841ee81cf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh) | |||
88 | * whole transaction. | 88 | * whole transaction. |
89 | * | 89 | * |
90 | * Requires j_list_lock | 90 | * Requires j_list_lock |
91 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
92 | */ | 91 | */ |
93 | static int __try_to_free_cp_buf(struct journal_head *jh) | 92 | static int __try_to_free_cp_buf(struct journal_head *jh) |
94 | { | 93 | { |
95 | int ret = 0; | 94 | int ret = 0; |
96 | struct buffer_head *bh = jh2bh(jh); | 95 | struct buffer_head *bh = jh2bh(jh); |
97 | 96 | ||
98 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && | 97 | if (jh->b_transaction == NULL && !buffer_locked(bh) && |
99 | !buffer_dirty(bh) && !buffer_write_io_error(bh)) { | 98 | !buffer_dirty(bh) && !buffer_write_io_error(bh)) { |
100 | /* | 99 | /* |
101 | * Get our reference so that bh cannot be freed before | 100 | * Get our reference so that bh cannot be freed before |
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) | |||
104 | get_bh(bh); | 103 | get_bh(bh); |
105 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 104 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
106 | ret = __jbd2_journal_remove_checkpoint(jh) + 1; | 105 | ret = __jbd2_journal_remove_checkpoint(jh) + 1; |
107 | jbd_unlock_bh_state(bh); | ||
108 | BUFFER_TRACE(bh, "release"); | 106 | BUFFER_TRACE(bh, "release"); |
109 | __brelse(bh); | 107 | __brelse(bh); |
110 | } else { | ||
111 | jbd_unlock_bh_state(bh); | ||
112 | } | 108 | } |
113 | return ret; | 109 | return ret; |
114 | } | 110 | } |
@@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) | |||
180 | } | 176 | } |
181 | 177 | ||
182 | /* | 178 | /* |
183 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. | ||
184 | * The caller must restart a list walk. Wait for someone else to run | ||
185 | * jbd_unlock_bh_state(). | ||
186 | */ | ||
187 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) | ||
188 | __releases(journal->j_list_lock) | ||
189 | { | ||
190 | get_bh(bh); | ||
191 | spin_unlock(&journal->j_list_lock); | ||
192 | jbd_lock_bh_state(bh); | ||
193 | jbd_unlock_bh_state(bh); | ||
194 | put_bh(bh); | ||
195 | } | ||
196 | |||
197 | /* | ||
198 | * Clean up transaction's list of buffers submitted for io. | 179 | * Clean up transaction's list of buffers submitted for io. |
199 | * We wait for any pending IO to complete and remove any clean | 180 | * We wait for any pending IO to complete and remove any clean |
200 | * buffers. Note that we take the buffers in the opposite ordering | 181 | * buffers. Note that we take the buffers in the opposite ordering |
@@ -222,15 +203,9 @@ restart: | |||
222 | while (!released && transaction->t_checkpoint_io_list) { | 203 | while (!released && transaction->t_checkpoint_io_list) { |
223 | jh = transaction->t_checkpoint_io_list; | 204 | jh = transaction->t_checkpoint_io_list; |
224 | bh = jh2bh(jh); | 205 | bh = jh2bh(jh); |
225 | if (!jbd_trylock_bh_state(bh)) { | ||
226 | jbd_sync_bh(journal, bh); | ||
227 | spin_lock(&journal->j_list_lock); | ||
228 | goto restart; | ||
229 | } | ||
230 | get_bh(bh); | 206 | get_bh(bh); |
231 | if (buffer_locked(bh)) { | 207 | if (buffer_locked(bh)) { |
232 | spin_unlock(&journal->j_list_lock); | 208 | spin_unlock(&journal->j_list_lock); |
233 | jbd_unlock_bh_state(bh); | ||
234 | wait_on_buffer(bh); | 209 | wait_on_buffer(bh); |
235 | /* the journal_head may have gone by now */ | 210 | /* the journal_head may have gone by now */ |
236 | BUFFER_TRACE(bh, "brelse"); | 211 | BUFFER_TRACE(bh, "brelse"); |
@@ -246,7 +221,6 @@ restart: | |||
246 | * it has been written out and so we can drop it from the list | 221 | * it has been written out and so we can drop it from the list |
247 | */ | 222 | */ |
248 | released = __jbd2_journal_remove_checkpoint(jh); | 223 | released = __jbd2_journal_remove_checkpoint(jh); |
249 | jbd_unlock_bh_state(bh); | ||
250 | __brelse(bh); | 224 | __brelse(bh); |
251 | } | 225 | } |
252 | 226 | ||
@@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count) | |||
266 | 240 | ||
267 | for (i = 0; i < *batch_count; i++) { | 241 | for (i = 0; i < *batch_count; i++) { |
268 | struct buffer_head *bh = journal->j_chkpt_bhs[i]; | 242 | struct buffer_head *bh = journal->j_chkpt_bhs[i]; |
269 | clear_buffer_jwrite(bh); | ||
270 | BUFFER_TRACE(bh, "brelse"); | 243 | BUFFER_TRACE(bh, "brelse"); |
271 | __brelse(bh); | 244 | __brelse(bh); |
272 | } | 245 | } |
@@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count) | |||
281 | * be written out. | 254 | * be written out. |
282 | * | 255 | * |
283 | * Called with j_list_lock held and drops it if 1 is returned | 256 | * Called with j_list_lock held and drops it if 1 is returned |
284 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | ||
285 | */ | 257 | */ |
286 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | 258 | static int __process_buffer(journal_t *journal, struct journal_head *jh, |
287 | int *batch_count, transaction_t *transaction) | 259 | int *batch_count, transaction_t *transaction) |
@@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
292 | if (buffer_locked(bh)) { | 264 | if (buffer_locked(bh)) { |
293 | get_bh(bh); | 265 | get_bh(bh); |
294 | spin_unlock(&journal->j_list_lock); | 266 | spin_unlock(&journal->j_list_lock); |
295 | jbd_unlock_bh_state(bh); | ||
296 | wait_on_buffer(bh); | 267 | wait_on_buffer(bh); |
297 | /* the journal_head may have gone by now */ | 268 | /* the journal_head may have gone by now */ |
298 | BUFFER_TRACE(bh, "brelse"); | 269 | BUFFER_TRACE(bh, "brelse"); |
@@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
304 | 275 | ||
305 | transaction->t_chp_stats.cs_forced_to_close++; | 276 | transaction->t_chp_stats.cs_forced_to_close++; |
306 | spin_unlock(&journal->j_list_lock); | 277 | spin_unlock(&journal->j_list_lock); |
307 | jbd_unlock_bh_state(bh); | ||
308 | if (unlikely(journal->j_flags & JBD2_UNMOUNT)) | 278 | if (unlikely(journal->j_flags & JBD2_UNMOUNT)) |
309 | /* | 279 | /* |
310 | * The journal thread is dead; so starting and | 280 | * The journal thread is dead; so starting and |
@@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
323 | if (unlikely(buffer_write_io_error(bh))) | 293 | if (unlikely(buffer_write_io_error(bh))) |
324 | ret = -EIO; | 294 | ret = -EIO; |
325 | get_bh(bh); | 295 | get_bh(bh); |
326 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); | ||
327 | BUFFER_TRACE(bh, "remove from checkpoint"); | 296 | BUFFER_TRACE(bh, "remove from checkpoint"); |
328 | __jbd2_journal_remove_checkpoint(jh); | 297 | __jbd2_journal_remove_checkpoint(jh); |
329 | spin_unlock(&journal->j_list_lock); | 298 | spin_unlock(&journal->j_list_lock); |
330 | jbd_unlock_bh_state(bh); | ||
331 | __brelse(bh); | 299 | __brelse(bh); |
332 | } else { | 300 | } else { |
333 | /* | 301 | /* |
@@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
340 | BUFFER_TRACE(bh, "queue"); | 308 | BUFFER_TRACE(bh, "queue"); |
341 | get_bh(bh); | 309 | get_bh(bh); |
342 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | 310 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); |
343 | set_buffer_jwrite(bh); | ||
344 | journal->j_chkpt_bhs[*batch_count] = bh; | 311 | journal->j_chkpt_bhs[*batch_count] = bh; |
345 | __buffer_relink_io(jh); | 312 | __buffer_relink_io(jh); |
346 | jbd_unlock_bh_state(bh); | ||
347 | transaction->t_chp_stats.cs_written++; | 313 | transaction->t_chp_stats.cs_written++; |
348 | (*batch_count)++; | 314 | (*batch_count)++; |
349 | if (*batch_count == JBD2_NR_BATCH) { | 315 | if (*batch_count == JBD2_NR_BATCH) { |
@@ -407,15 +373,7 @@ restart: | |||
407 | int retry = 0, err; | 373 | int retry = 0, err; |
408 | 374 | ||
409 | while (!retry && transaction->t_checkpoint_list) { | 375 | while (!retry && transaction->t_checkpoint_list) { |
410 | struct buffer_head *bh; | ||
411 | |||
412 | jh = transaction->t_checkpoint_list; | 376 | jh = transaction->t_checkpoint_list; |
413 | bh = jh2bh(jh); | ||
414 | if (!jbd_trylock_bh_state(bh)) { | ||
415 | jbd_sync_bh(journal, bh); | ||
416 | retry = 1; | ||
417 | break; | ||
418 | } | ||
419 | retry = __process_buffer(journal, jh, &batch_count, | 377 | retry = __process_buffer(journal, jh, &batch_count, |
420 | transaction); | 378 | transaction); |
421 | if (retry < 0 && !result) | 379 | if (retry < 0 && !result) |
@@ -478,79 +436,28 @@ out: | |||
478 | 436 | ||
479 | int jbd2_cleanup_journal_tail(journal_t *journal) | 437 | int jbd2_cleanup_journal_tail(journal_t *journal) |
480 | { | 438 | { |
481 | transaction_t * transaction; | ||
482 | tid_t first_tid; | 439 | tid_t first_tid; |
483 | unsigned long blocknr, freed; | 440 | unsigned long blocknr; |
484 | 441 | ||
485 | if (is_journal_aborted(journal)) | 442 | if (is_journal_aborted(journal)) |
486 | return 1; | 443 | return 1; |
487 | 444 | ||
488 | /* OK, work out the oldest transaction remaining in the log, and | 445 | if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) |
489 | * the log block it starts at. | ||
490 | * | ||
491 | * If the log is now empty, we need to work out which is the | ||
492 | * next transaction ID we will write, and where it will | ||
493 | * start. */ | ||
494 | |||
495 | write_lock(&journal->j_state_lock); | ||
496 | spin_lock(&journal->j_list_lock); | ||
497 | transaction = journal->j_checkpoint_transactions; | ||
498 | if (transaction) { | ||
499 | first_tid = transaction->t_tid; | ||
500 | blocknr = transaction->t_log_start; | ||
501 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | ||
502 | first_tid = transaction->t_tid; | ||
503 | blocknr = transaction->t_log_start; | ||
504 | } else if ((transaction = journal->j_running_transaction) != NULL) { | ||
505 | first_tid = transaction->t_tid; | ||
506 | blocknr = journal->j_head; | ||
507 | } else { | ||
508 | first_tid = journal->j_transaction_sequence; | ||
509 | blocknr = journal->j_head; | ||
510 | } | ||
511 | spin_unlock(&journal->j_list_lock); | ||
512 | J_ASSERT(blocknr != 0); | ||
513 | |||
514 | /* If the oldest pinned transaction is at the tail of the log | ||
515 | already then there's not much we can do right now. */ | ||
516 | if (journal->j_tail_sequence == first_tid) { | ||
517 | write_unlock(&journal->j_state_lock); | ||
518 | return 1; | 446 | return 1; |
519 | } | 447 | J_ASSERT(blocknr != 0); |
520 | |||
521 | /* OK, update the superblock to recover the freed space. | ||
522 | * Physical blocks come first: have we wrapped beyond the end of | ||
523 | * the log? */ | ||
524 | freed = blocknr - journal->j_tail; | ||
525 | if (blocknr < journal->j_tail) | ||
526 | freed = freed + journal->j_last - journal->j_first; | ||
527 | |||
528 | trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); | ||
529 | jbd_debug(1, | ||
530 | "Cleaning journal tail from %d to %d (offset %lu), " | ||
531 | "freeing %lu\n", | ||
532 | journal->j_tail_sequence, first_tid, blocknr, freed); | ||
533 | |||
534 | journal->j_free += freed; | ||
535 | journal->j_tail_sequence = first_tid; | ||
536 | journal->j_tail = blocknr; | ||
537 | write_unlock(&journal->j_state_lock); | ||
538 | 448 | ||
539 | /* | 449 | /* |
540 | * If there is an external journal, we need to make sure that | 450 | * We need to make sure that any blocks that were recently written out |
541 | * any data blocks that were recently written out --- perhaps | 451 | * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before |
542 | * by jbd2_log_do_checkpoint() --- are flushed out before we | 452 | * we drop the transactions from the journal. It's unlikely this will |
543 | * drop the transactions from the external journal. It's | 453 | * be necessary, especially with an appropriately sized journal, but we |
544 | * unlikely this will be necessary, especially with a | 454 | * need this to guarantee correctness. Fortunately |
545 | * appropriately sized journal, but we need this to guarantee | 455 | * jbd2_cleanup_journal_tail() doesn't get called all that often. |
546 | * correctness. Fortunately jbd2_cleanup_journal_tail() | ||
547 | * doesn't get called all that often. | ||
548 | */ | 456 | */ |
549 | if ((journal->j_fs_dev != journal->j_dev) && | 457 | if (journal->j_flags & JBD2_BARRIER) |
550 | (journal->j_flags & JBD2_BARRIER)) | ||
551 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); | 458 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); |
552 | if (!(journal->j_flags & JBD2_ABORT)) | 459 | |
553 | jbd2_journal_update_superblock(journal, 1); | 460 | __jbd2_update_log_tail(journal, first_tid, blocknr); |
554 | return 0; | 461 | return 0; |
555 | } | 462 | } |
556 | 463 | ||
@@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | |||
582 | do { | 489 | do { |
583 | jh = next_jh; | 490 | jh = next_jh; |
584 | next_jh = jh->b_cpnext; | 491 | next_jh = jh->b_cpnext; |
585 | /* Use trylock because of the ranking */ | 492 | ret = __try_to_free_cp_buf(jh); |
586 | if (jbd_trylock_bh_state(jh2bh(jh))) { | 493 | if (ret) { |
587 | ret = __try_to_free_cp_buf(jh); | 494 | freed++; |
588 | if (ret) { | 495 | if (ret == 2) { |
589 | freed++; | 496 | *released = 1; |
590 | if (ret == 2) { | 497 | return freed; |
591 | *released = 1; | ||
592 | return freed; | ||
593 | } | ||
594 | } | 498 | } |
595 | } | 499 | } |
596 | /* | 500 | /* |
@@ -673,9 +577,7 @@ out: | |||
673 | * The function can free jh and bh. | 577 | * The function can free jh and bh. |
674 | * | 578 | * |
675 | * This function is called with j_list_lock held. | 579 | * This function is called with j_list_lock held. |
676 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) | ||
677 | */ | 580 | */ |
678 | |||
679 | int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | 581 | int __jbd2_journal_remove_checkpoint(struct journal_head *jh) |
680 | { | 582 | { |
681 | struct transaction_chp_stats_s *stats; | 583 | struct transaction_chp_stats_s *stats; |
@@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | |||
722 | transaction->t_tid, stats); | 624 | transaction->t_tid, stats); |
723 | 625 | ||
724 | __jbd2_journal_drop_transaction(journal, transaction); | 626 | __jbd2_journal_drop_transaction(journal, transaction); |
725 | kfree(transaction); | 627 | jbd2_journal_free_transaction(transaction); |
726 | 628 | ||
727 | /* Just in case anybody was waiting for more transactions to be | 629 | /* Just in case anybody was waiting for more transactions to be |
728 | checkpointed... */ | 630 | checkpointed... */ |
@@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
797 | J_ASSERT(journal->j_committing_transaction != transaction); | 699 | J_ASSERT(journal->j_committing_transaction != transaction); |
798 | J_ASSERT(journal->j_running_transaction != transaction); | 700 | J_ASSERT(journal->j_running_transaction != transaction); |
799 | 701 | ||
702 | trace_jbd2_drop_transaction(journal, transaction); | ||
703 | |||
800 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | 704 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); |
801 | } | 705 | } |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index c067a8cae63b..17f557f01cf0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -331,6 +331,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
331 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | 331 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
332 | __u32 crc32_sum = ~0; | 332 | __u32 crc32_sum = ~0; |
333 | struct blk_plug plug; | 333 | struct blk_plug plug; |
334 | /* Tail of the journal */ | ||
335 | unsigned long first_block; | ||
336 | tid_t first_tid; | ||
337 | int update_tail; | ||
334 | 338 | ||
335 | /* | 339 | /* |
336 | * First job: lock down the current transaction and wait for | 340 | * First job: lock down the current transaction and wait for |
@@ -340,7 +344,18 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
340 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ | 344 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ |
341 | if (journal->j_flags & JBD2_FLUSHED) { | 345 | if (journal->j_flags & JBD2_FLUSHED) { |
342 | jbd_debug(3, "super block updated\n"); | 346 | jbd_debug(3, "super block updated\n"); |
343 | jbd2_journal_update_superblock(journal, 1); | 347 | mutex_lock(&journal->j_checkpoint_mutex); |
348 | /* | ||
349 | * We hold j_checkpoint_mutex so tail cannot change under us. | ||
350 | * We don't need any special data guarantees for writing sb | ||
351 | * since journal is empty and it is ok for write to be | ||
352 | * flushed only with transaction commit. | ||
353 | */ | ||
354 | jbd2_journal_update_sb_log_tail(journal, | ||
355 | journal->j_tail_sequence, | ||
356 | journal->j_tail, | ||
357 | WRITE_SYNC); | ||
358 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
344 | } else { | 359 | } else { |
345 | jbd_debug(3, "superblock not updated\n"); | 360 | jbd_debug(3, "superblock not updated\n"); |
346 | } | 361 | } |
@@ -677,10 +692,30 @@ start_journal_io: | |||
677 | err = 0; | 692 | err = 0; |
678 | } | 693 | } |
679 | 694 | ||
695 | /* | ||
696 | * Get current oldest transaction in the log before we issue flush | ||
697 | * to the filesystem device. After the flush we can be sure that | ||
698 | * blocks of all older transactions are checkpointed to persistent | ||
699 | * storage and we will be safe to update journal start in the | ||
700 | * superblock with the numbers we get here. | ||
701 | */ | ||
702 | update_tail = | ||
703 | jbd2_journal_get_log_tail(journal, &first_tid, &first_block); | ||
704 | |||
680 | write_lock(&journal->j_state_lock); | 705 | write_lock(&journal->j_state_lock); |
706 | if (update_tail) { | ||
707 | long freed = first_block - journal->j_tail; | ||
708 | |||
709 | if (first_block < journal->j_tail) | ||
710 | freed += journal->j_last - journal->j_first; | ||
711 | /* Update tail only if we free significant amount of space */ | ||
712 | if (freed < journal->j_maxlen / 4) | ||
713 | update_tail = 0; | ||
714 | } | ||
681 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 715 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
682 | commit_transaction->t_state = T_COMMIT_DFLUSH; | 716 | commit_transaction->t_state = T_COMMIT_DFLUSH; |
683 | write_unlock(&journal->j_state_lock); | 717 | write_unlock(&journal->j_state_lock); |
718 | |||
684 | /* | 719 | /* |
685 | * If the journal is not located on the file system device, | 720 | * If the journal is not located on the file system device, |
686 | * then we must flush the file system device before we issue | 721 | * then we must flush the file system device before we issue |
@@ -831,6 +866,14 @@ wait_for_iobuf: | |||
831 | if (err) | 866 | if (err) |
832 | jbd2_journal_abort(journal, err); | 867 | jbd2_journal_abort(journal, err); |
833 | 868 | ||
869 | /* | ||
870 | * Now disk caches for filesystem device are flushed so we are safe to | ||
871 | * erase checkpointed transactions from the log by updating journal | ||
872 | * superblock. | ||
873 | */ | ||
874 | if (update_tail) | ||
875 | jbd2_update_log_tail(journal, first_tid, first_block); | ||
876 | |||
834 | /* End of a transaction! Finally, we can do checkpoint | 877 | /* End of a transaction! Finally, we can do checkpoint |
835 | processing: any buffers committed as a result of this | 878 | processing: any buffers committed as a result of this |
836 | transaction can be removed from any checkpoint list it was on | 879 | transaction can be removed from any checkpoint list it was on |
@@ -1048,7 +1091,7 @@ restart_loop: | |||
1048 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", | 1091 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", |
1049 | journal->j_commit_sequence, journal->j_tail_sequence); | 1092 | journal->j_commit_sequence, journal->j_tail_sequence); |
1050 | if (to_free) | 1093 | if (to_free) |
1051 | kfree(commit_transaction); | 1094 | jbd2_journal_free_transaction(commit_transaction); |
1052 | 1095 | ||
1053 | wake_up(&journal->j_wait_done_commit); | 1096 | wake_up(&journal->j_wait_done_commit); |
1054 | } | 1097 | } |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 839377e3d624..98ed6dbfe381 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -71,7 +71,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke); | |||
71 | 71 | ||
72 | EXPORT_SYMBOL(jbd2_journal_init_dev); | 72 | EXPORT_SYMBOL(jbd2_journal_init_dev); |
73 | EXPORT_SYMBOL(jbd2_journal_init_inode); | 73 | EXPORT_SYMBOL(jbd2_journal_init_inode); |
74 | EXPORT_SYMBOL(jbd2_journal_update_format); | ||
75 | EXPORT_SYMBOL(jbd2_journal_check_used_features); | 74 | EXPORT_SYMBOL(jbd2_journal_check_used_features); |
76 | EXPORT_SYMBOL(jbd2_journal_check_available_features); | 75 | EXPORT_SYMBOL(jbd2_journal_check_available_features); |
77 | EXPORT_SYMBOL(jbd2_journal_set_features); | 76 | EXPORT_SYMBOL(jbd2_journal_set_features); |
@@ -96,7 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); | |||
96 | EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); | 95 | EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); |
97 | EXPORT_SYMBOL(jbd2_inode_cache); | 96 | EXPORT_SYMBOL(jbd2_inode_cache); |
98 | 97 | ||
99 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | ||
100 | static void __journal_abort_soft (journal_t *journal, int errno); | 98 | static void __journal_abort_soft (journal_t *journal, int errno); |
101 | static int jbd2_journal_create_slab(size_t slab_size); | 99 | static int jbd2_journal_create_slab(size_t slab_size); |
102 | 100 | ||
@@ -746,6 +744,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | |||
746 | return jbd2_journal_add_journal_head(bh); | 744 | return jbd2_journal_add_journal_head(bh); |
747 | } | 745 | } |
748 | 746 | ||
747 | /* | ||
748 | * Return tid of the oldest transaction in the journal and block in the journal | ||
749 | * where the transaction starts. | ||
750 | * | ||
751 | * If the journal is now empty, return which will be the next transaction ID | ||
752 | * we will write and where will that transaction start. | ||
753 | * | ||
754 | * The return value is 0 if journal tail cannot be pushed any further, 1 if | ||
755 | * it can. | ||
756 | */ | ||
757 | int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, | ||
758 | unsigned long *block) | ||
759 | { | ||
760 | transaction_t *transaction; | ||
761 | int ret; | ||
762 | |||
763 | read_lock(&journal->j_state_lock); | ||
764 | spin_lock(&journal->j_list_lock); | ||
765 | transaction = journal->j_checkpoint_transactions; | ||
766 | if (transaction) { | ||
767 | *tid = transaction->t_tid; | ||
768 | *block = transaction->t_log_start; | ||
769 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | ||
770 | *tid = transaction->t_tid; | ||
771 | *block = transaction->t_log_start; | ||
772 | } else if ((transaction = journal->j_running_transaction) != NULL) { | ||
773 | *tid = transaction->t_tid; | ||
774 | *block = journal->j_head; | ||
775 | } else { | ||
776 | *tid = journal->j_transaction_sequence; | ||
777 | *block = journal->j_head; | ||
778 | } | ||
779 | ret = tid_gt(*tid, journal->j_tail_sequence); | ||
780 | spin_unlock(&journal->j_list_lock); | ||
781 | read_unlock(&journal->j_state_lock); | ||
782 | |||
783 | return ret; | ||
784 | } | ||
785 | |||
786 | /* | ||
787 | * Update information in journal structure and in on disk journal superblock | ||
788 | * about log tail. This function does not check whether information passed in | ||
789 | * really pushes log tail further. It's responsibility of the caller to make | ||
790 | * sure provided log tail information is valid (e.g. by holding | ||
791 | * j_checkpoint_mutex all the time between computing log tail and calling this | ||
792 | * function as is the case with jbd2_cleanup_journal_tail()). | ||
793 | * | ||
794 | * Requires j_checkpoint_mutex | ||
795 | */ | ||
796 | void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) | ||
797 | { | ||
798 | unsigned long freed; | ||
799 | |||
800 | BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); | ||
801 | |||
802 | /* | ||
803 | * We cannot afford for write to remain in drive's caches since as | ||
804 | * soon as we update j_tail, next transaction can start reusing journal | ||
805 | * space and if we lose sb update during power failure we'd replay | ||
806 | * old transaction with possibly newly overwritten data. | ||
807 | */ | ||
808 | jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); | ||
809 | write_lock(&journal->j_state_lock); | ||
810 | freed = block - journal->j_tail; | ||
811 | if (block < journal->j_tail) | ||
812 | freed += journal->j_last - journal->j_first; | ||
813 | |||
814 | trace_jbd2_update_log_tail(journal, tid, block, freed); | ||
815 | jbd_debug(1, | ||
816 | "Cleaning journal tail from %d to %d (offset %lu), " | ||
817 | "freeing %lu\n", | ||
818 | journal->j_tail_sequence, tid, block, freed); | ||
819 | |||
820 | journal->j_free += freed; | ||
821 | journal->j_tail_sequence = tid; | ||
822 | journal->j_tail = block; | ||
823 | write_unlock(&journal->j_state_lock); | ||
824 | } | ||
825 | |||
826 | /* | ||
 826 | * This is a variation of __jbd2_update_log_tail which checks for validity of | ||
828 | * provided log tail and locks j_checkpoint_mutex. So it is safe against races | ||
829 | * with other threads updating log tail. | ||
830 | */ | ||
831 | void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) | ||
832 | { | ||
833 | mutex_lock(&journal->j_checkpoint_mutex); | ||
834 | if (tid_gt(tid, journal->j_tail_sequence)) | ||
835 | __jbd2_update_log_tail(journal, tid, block); | ||
836 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
837 | } | ||
838 | |||
749 | struct jbd2_stats_proc_session { | 839 | struct jbd2_stats_proc_session { |
750 | journal_t *journal; | 840 | journal_t *journal; |
751 | struct transaction_stats_s *stats; | 841 | struct transaction_stats_s *stats; |
@@ -1114,40 +1204,45 @@ static int journal_reset(journal_t *journal) | |||
1114 | 1204 | ||
1115 | journal->j_max_transaction_buffers = journal->j_maxlen / 4; | 1205 | journal->j_max_transaction_buffers = journal->j_maxlen / 4; |
1116 | 1206 | ||
1117 | /* Add the dynamic fields and write it to disk. */ | ||
1118 | jbd2_journal_update_superblock(journal, 1); | ||
1119 | return jbd2_journal_start_thread(journal); | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * void jbd2_journal_update_superblock() - Update journal sb on disk. | ||
1124 | * @journal: The journal to update. | ||
1125 | * @wait: Set to '0' if you don't want to wait for IO completion. | ||
1126 | * | ||
1127 | * Update a journal's dynamic superblock fields and write it to disk, | ||
1128 | * optionally waiting for the IO to complete. | ||
1129 | */ | ||
1130 | void jbd2_journal_update_superblock(journal_t *journal, int wait) | ||
1131 | { | ||
1132 | journal_superblock_t *sb = journal->j_superblock; | ||
1133 | struct buffer_head *bh = journal->j_sb_buffer; | ||
1134 | |||
1135 | /* | 1207 | /* |
1136 | * As a special case, if the on-disk copy is already marked as needing | 1208 | * As a special case, if the on-disk copy is already marked as needing |
1137 | * no recovery (s_start == 0) and there are no outstanding transactions | 1209 | * no recovery (s_start == 0), then we can safely defer the superblock |
1138 | * in the filesystem, then we can safely defer the superblock update | 1210 | * update until the next commit by setting JBD2_FLUSHED. This avoids |
1139 | * until the next commit by setting JBD2_FLUSHED. This avoids | ||
1140 | * attempting a write to a potential-readonly device. | 1211 | * attempting a write to a potential-readonly device. |
1141 | */ | 1212 | */ |
1142 | if (sb->s_start == 0 && journal->j_tail_sequence == | 1213 | if (sb->s_start == 0) { |
1143 | journal->j_transaction_sequence) { | ||
1144 | jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " | 1214 | jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " |
1145 | "(start %ld, seq %d, errno %d)\n", | 1215 | "(start %ld, seq %d, errno %d)\n", |
1146 | journal->j_tail, journal->j_tail_sequence, | 1216 | journal->j_tail, journal->j_tail_sequence, |
1147 | journal->j_errno); | 1217 | journal->j_errno); |
1148 | goto out; | 1218 | journal->j_flags |= JBD2_FLUSHED; |
1219 | } else { | ||
1220 | /* Lock here to make assertions happy... */ | ||
1221 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1222 | /* | ||
1223 | * Update log tail information. We use WRITE_FUA since new | ||
1224 | * transaction will start reusing journal space and so we | ||
1225 | * must make sure information about current log tail is on | ||
1226 | * disk before that. | ||
1227 | */ | ||
1228 | jbd2_journal_update_sb_log_tail(journal, | ||
1229 | journal->j_tail_sequence, | ||
1230 | journal->j_tail, | ||
1231 | WRITE_FUA); | ||
1232 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1149 | } | 1233 | } |
1234 | return jbd2_journal_start_thread(journal); | ||
1235 | } | ||
1150 | 1236 | ||
1237 | static void jbd2_write_superblock(journal_t *journal, int write_op) | ||
1238 | { | ||
1239 | struct buffer_head *bh = journal->j_sb_buffer; | ||
1240 | int ret; | ||
1241 | |||
1242 | trace_jbd2_write_superblock(journal, write_op); | ||
1243 | if (!(journal->j_flags & JBD2_BARRIER)) | ||
1244 | write_op &= ~(REQ_FUA | REQ_FLUSH); | ||
1245 | lock_buffer(bh); | ||
1151 | if (buffer_write_io_error(bh)) { | 1246 | if (buffer_write_io_error(bh)) { |
1152 | /* | 1247 | /* |
1153 | * Oh, dear. A previous attempt to write the journal | 1248 | * Oh, dear. A previous attempt to write the journal |
@@ -1163,48 +1258,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) | |||
1163 | clear_buffer_write_io_error(bh); | 1258 | clear_buffer_write_io_error(bh); |
1164 | set_buffer_uptodate(bh); | 1259 | set_buffer_uptodate(bh); |
1165 | } | 1260 | } |
1261 | get_bh(bh); | ||
1262 | bh->b_end_io = end_buffer_write_sync; | ||
1263 | ret = submit_bh(write_op, bh); | ||
1264 | wait_on_buffer(bh); | ||
1265 | if (buffer_write_io_error(bh)) { | ||
1266 | clear_buffer_write_io_error(bh); | ||
1267 | set_buffer_uptodate(bh); | ||
1268 | ret = -EIO; | ||
1269 | } | ||
1270 | if (ret) { | ||
1271 | printk(KERN_ERR "JBD2: Error %d detected when updating " | ||
1272 | "journal superblock for %s.\n", ret, | ||
1273 | journal->j_devname); | ||
1274 | } | ||
1275 | } | ||
1276 | |||
1277 | /** | ||
1278 | * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. | ||
1279 | * @journal: The journal to update. | ||
1280 | * @tail_tid: TID of the new transaction at the tail of the log | ||
1281 | * @tail_block: The first block of the transaction at the tail of the log | ||
1282 | * @write_op: With which operation should we write the journal sb | ||
1283 | * | ||
1284 | * Update a journal's superblock information about log tail and write it to | ||
1285 | * disk, waiting for the IO to complete. | ||
1286 | */ | ||
1287 | void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, | ||
1288 | unsigned long tail_block, int write_op) | ||
1289 | { | ||
1290 | journal_superblock_t *sb = journal->j_superblock; | ||
1291 | |||
1292 | BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); | ||
1293 | jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", | ||
1294 | tail_block, tail_tid); | ||
1295 | |||
1296 | sb->s_sequence = cpu_to_be32(tail_tid); | ||
1297 | sb->s_start = cpu_to_be32(tail_block); | ||
1298 | |||
1299 | jbd2_write_superblock(journal, write_op); | ||
1300 | |||
1301 | /* Log is no longer empty */ | ||
1302 | write_lock(&journal->j_state_lock); | ||
1303 | WARN_ON(!sb->s_sequence); | ||
1304 | journal->j_flags &= ~JBD2_FLUSHED; | ||
1305 | write_unlock(&journal->j_state_lock); | ||
1306 | } | ||
1307 | |||
1308 | /** | ||
1309 | * jbd2_mark_journal_empty() - Mark on disk journal as empty. | ||
1310 | * @journal: The journal to update. | ||
1311 | * | ||
1312 | * Update a journal's dynamic superblock fields to show that journal is empty. | ||
1313 | * Write updated superblock to disk waiting for IO to complete. | ||
1314 | */ | ||
1315 | static void jbd2_mark_journal_empty(journal_t *journal) | ||
1316 | { | ||
1317 | journal_superblock_t *sb = journal->j_superblock; | ||
1166 | 1318 | ||
1319 | BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); | ||
1167 | read_lock(&journal->j_state_lock); | 1320 | read_lock(&journal->j_state_lock); |
1168 | jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", | 1321 | jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", |
1169 | journal->j_tail, journal->j_tail_sequence, journal->j_errno); | 1322 | journal->j_tail_sequence); |
1170 | 1323 | ||
1171 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); | 1324 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); |
1172 | sb->s_start = cpu_to_be32(journal->j_tail); | 1325 | sb->s_start = cpu_to_be32(0); |
1173 | sb->s_errno = cpu_to_be32(journal->j_errno); | ||
1174 | read_unlock(&journal->j_state_lock); | 1326 | read_unlock(&journal->j_state_lock); |
1175 | 1327 | ||
1176 | BUFFER_TRACE(bh, "marking dirty"); | 1328 | jbd2_write_superblock(journal, WRITE_FUA); |
1177 | mark_buffer_dirty(bh); | ||
1178 | if (wait) { | ||
1179 | sync_dirty_buffer(bh); | ||
1180 | if (buffer_write_io_error(bh)) { | ||
1181 | printk(KERN_ERR "JBD2: I/O error detected " | ||
1182 | "when updating journal superblock for %s.\n", | ||
1183 | journal->j_devname); | ||
1184 | clear_buffer_write_io_error(bh); | ||
1185 | set_buffer_uptodate(bh); | ||
1186 | } | ||
1187 | } else | ||
1188 | write_dirty_buffer(bh, WRITE); | ||
1189 | |||
1190 | out: | ||
1191 | /* If we have just flushed the log (by marking s_start==0), then | ||
1192 | * any future commit will have to be careful to update the | ||
1193 | * superblock again to re-record the true start of the log. */ | ||
1194 | 1329 | ||
 1330 | /* Log is now empty */ | ||
1195 | write_lock(&journal->j_state_lock); | 1331 | write_lock(&journal->j_state_lock); |
1196 | if (sb->s_start) | 1332 | journal->j_flags |= JBD2_FLUSHED; |
1197 | journal->j_flags &= ~JBD2_FLUSHED; | ||
1198 | else | ||
1199 | journal->j_flags |= JBD2_FLUSHED; | ||
1200 | write_unlock(&journal->j_state_lock); | 1333 | write_unlock(&journal->j_state_lock); |
1201 | } | 1334 | } |
1202 | 1335 | ||
1336 | |||
1337 | /** | ||
1338 | * jbd2_journal_update_sb_errno() - Update error in the journal. | ||
1339 | * @journal: The journal to update. | ||
1340 | * | ||
1341 | * Update a journal's errno. Write updated superblock to disk waiting for IO | ||
1342 | * to complete. | ||
1343 | */ | ||
1344 | static void jbd2_journal_update_sb_errno(journal_t *journal) | ||
1345 | { | ||
1346 | journal_superblock_t *sb = journal->j_superblock; | ||
1347 | |||
1348 | read_lock(&journal->j_state_lock); | ||
1349 | jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", | ||
1350 | journal->j_errno); | ||
1351 | sb->s_errno = cpu_to_be32(journal->j_errno); | ||
1352 | read_unlock(&journal->j_state_lock); | ||
1353 | |||
1354 | jbd2_write_superblock(journal, WRITE_SYNC); | ||
1355 | } | ||
1356 | |||
1203 | /* | 1357 | /* |
1204 | * Read the superblock for a given journal, performing initial | 1358 | * Read the superblock for a given journal, performing initial |
1205 | * validation of the format. | 1359 | * validation of the format. |
1206 | */ | 1360 | */ |
1207 | |||
1208 | static int journal_get_superblock(journal_t *journal) | 1361 | static int journal_get_superblock(journal_t *journal) |
1209 | { | 1362 | { |
1210 | struct buffer_head *bh; | 1363 | struct buffer_head *bh; |
@@ -1398,14 +1551,11 @@ int jbd2_journal_destroy(journal_t *journal) | |||
1398 | 1551 | ||
1399 | if (journal->j_sb_buffer) { | 1552 | if (journal->j_sb_buffer) { |
1400 | if (!is_journal_aborted(journal)) { | 1553 | if (!is_journal_aborted(journal)) { |
1401 | /* We can now mark the journal as empty. */ | 1554 | mutex_lock(&journal->j_checkpoint_mutex); |
1402 | journal->j_tail = 0; | 1555 | jbd2_mark_journal_empty(journal); |
1403 | journal->j_tail_sequence = | 1556 | mutex_unlock(&journal->j_checkpoint_mutex); |
1404 | ++journal->j_transaction_sequence; | 1557 | } else |
1405 | jbd2_journal_update_superblock(journal, 1); | ||
1406 | } else { | ||
1407 | err = -EIO; | 1558 | err = -EIO; |
1408 | } | ||
1409 | brelse(journal->j_sb_buffer); | 1559 | brelse(journal->j_sb_buffer); |
1410 | } | 1560 | } |
1411 | 1561 | ||
@@ -1552,61 +1702,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, | |||
1552 | EXPORT_SYMBOL(jbd2_journal_clear_features); | 1702 | EXPORT_SYMBOL(jbd2_journal_clear_features); |
1553 | 1703 | ||
1554 | /** | 1704 | /** |
1555 | * int jbd2_journal_update_format () - Update on-disk journal structure. | ||
1556 | * @journal: Journal to act on. | ||
1557 | * | ||
1558 | * Given an initialised but unloaded journal struct, poke about in the | ||
1559 | * on-disk structure to update it to the most recent supported version. | ||
1560 | */ | ||
1561 | int jbd2_journal_update_format (journal_t *journal) | ||
1562 | { | ||
1563 | journal_superblock_t *sb; | ||
1564 | int err; | ||
1565 | |||
1566 | err = journal_get_superblock(journal); | ||
1567 | if (err) | ||
1568 | return err; | ||
1569 | |||
1570 | sb = journal->j_superblock; | ||
1571 | |||
1572 | switch (be32_to_cpu(sb->s_header.h_blocktype)) { | ||
1573 | case JBD2_SUPERBLOCK_V2: | ||
1574 | return 0; | ||
1575 | case JBD2_SUPERBLOCK_V1: | ||
1576 | return journal_convert_superblock_v1(journal, sb); | ||
1577 | default: | ||
1578 | break; | ||
1579 | } | ||
1580 | return -EINVAL; | ||
1581 | } | ||
1582 | |||
1583 | static int journal_convert_superblock_v1(journal_t *journal, | ||
1584 | journal_superblock_t *sb) | ||
1585 | { | ||
1586 | int offset, blocksize; | ||
1587 | struct buffer_head *bh; | ||
1588 | |||
1589 | printk(KERN_WARNING | ||
1590 | "JBD2: Converting superblock from version 1 to 2.\n"); | ||
1591 | |||
1592 | /* Pre-initialise new fields to zero */ | ||
1593 | offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); | ||
1594 | blocksize = be32_to_cpu(sb->s_blocksize); | ||
1595 | memset(&sb->s_feature_compat, 0, blocksize-offset); | ||
1596 | |||
1597 | sb->s_nr_users = cpu_to_be32(1); | ||
1598 | sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); | ||
1599 | journal->j_format_version = 2; | ||
1600 | |||
1601 | bh = journal->j_sb_buffer; | ||
1602 | BUFFER_TRACE(bh, "marking dirty"); | ||
1603 | mark_buffer_dirty(bh); | ||
1604 | sync_dirty_buffer(bh); | ||
1605 | return 0; | ||
1606 | } | ||
1607 | |||
1608 | |||
1609 | /** | ||
1610 | * int jbd2_journal_flush () - Flush journal | 1705 | * int jbd2_journal_flush () - Flush journal |
1611 | * @journal: Journal to act on. | 1706 | * @journal: Journal to act on. |
1612 | * | 1707 | * |
@@ -1619,7 +1714,6 @@ int jbd2_journal_flush(journal_t *journal) | |||
1619 | { | 1714 | { |
1620 | int err = 0; | 1715 | int err = 0; |
1621 | transaction_t *transaction = NULL; | 1716 | transaction_t *transaction = NULL; |
1622 | unsigned long old_tail; | ||
1623 | 1717 | ||
1624 | write_lock(&journal->j_state_lock); | 1718 | write_lock(&journal->j_state_lock); |
1625 | 1719 | ||
@@ -1654,6 +1748,7 @@ int jbd2_journal_flush(journal_t *journal) | |||
1654 | if (is_journal_aborted(journal)) | 1748 | if (is_journal_aborted(journal)) |
1655 | return -EIO; | 1749 | return -EIO; |
1656 | 1750 | ||
1751 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1657 | jbd2_cleanup_journal_tail(journal); | 1752 | jbd2_cleanup_journal_tail(journal); |
1658 | 1753 | ||
1659 | /* Finally, mark the journal as really needing no recovery. | 1754 | /* Finally, mark the journal as really needing no recovery. |
@@ -1661,14 +1756,9 @@ int jbd2_journal_flush(journal_t *journal) | |||
1661 | * the magic code for a fully-recovered superblock. Any future | 1756 | * the magic code for a fully-recovered superblock. Any future |
1662 | * commits of data to the journal will restore the current | 1757 | * commits of data to the journal will restore the current |
1663 | * s_start value. */ | 1758 | * s_start value. */ |
1759 | jbd2_mark_journal_empty(journal); | ||
1760 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1664 | write_lock(&journal->j_state_lock); | 1761 | write_lock(&journal->j_state_lock); |
1665 | old_tail = journal->j_tail; | ||
1666 | journal->j_tail = 0; | ||
1667 | write_unlock(&journal->j_state_lock); | ||
1668 | jbd2_journal_update_superblock(journal, 1); | ||
1669 | write_lock(&journal->j_state_lock); | ||
1670 | journal->j_tail = old_tail; | ||
1671 | |||
1672 | J_ASSERT(!journal->j_running_transaction); | 1762 | J_ASSERT(!journal->j_running_transaction); |
1673 | J_ASSERT(!journal->j_committing_transaction); | 1763 | J_ASSERT(!journal->j_committing_transaction); |
1674 | J_ASSERT(!journal->j_checkpoint_transactions); | 1764 | J_ASSERT(!journal->j_checkpoint_transactions); |
@@ -1708,8 +1798,12 @@ int jbd2_journal_wipe(journal_t *journal, int write) | |||
1708 | write ? "Clearing" : "Ignoring"); | 1798 | write ? "Clearing" : "Ignoring"); |
1709 | 1799 | ||
1710 | err = jbd2_journal_skip_recovery(journal); | 1800 | err = jbd2_journal_skip_recovery(journal); |
1711 | if (write) | 1801 | if (write) { |
1712 | jbd2_journal_update_superblock(journal, 1); | 1802 | /* Lock to make assertions happy... */ |
1803 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1804 | jbd2_mark_journal_empty(journal); | ||
1805 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1806 | } | ||
1713 | 1807 | ||
1714 | no_recovery: | 1808 | no_recovery: |
1715 | return err; | 1809 | return err; |
@@ -1759,7 +1853,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) | |||
1759 | __jbd2_journal_abort_hard(journal); | 1853 | __jbd2_journal_abort_hard(journal); |
1760 | 1854 | ||
1761 | if (errno) | 1855 | if (errno) |
1762 | jbd2_journal_update_superblock(journal, 1); | 1856 | jbd2_journal_update_sb_errno(journal); |
1763 | } | 1857 | } |
1764 | 1858 | ||
1765 | /** | 1859 | /** |
@@ -2017,7 +2111,7 @@ static struct kmem_cache *jbd2_journal_head_cache; | |||
2017 | static atomic_t nr_journal_heads = ATOMIC_INIT(0); | 2111 | static atomic_t nr_journal_heads = ATOMIC_INIT(0); |
2018 | #endif | 2112 | #endif |
2019 | 2113 | ||
2020 | static int journal_init_jbd2_journal_head_cache(void) | 2114 | static int jbd2_journal_init_journal_head_cache(void) |
2021 | { | 2115 | { |
2022 | int retval; | 2116 | int retval; |
2023 | 2117 | ||
@@ -2035,7 +2129,7 @@ static int journal_init_jbd2_journal_head_cache(void) | |||
2035 | return retval; | 2129 | return retval; |
2036 | } | 2130 | } |
2037 | 2131 | ||
2038 | static void jbd2_journal_destroy_jbd2_journal_head_cache(void) | 2132 | static void jbd2_journal_destroy_journal_head_cache(void) |
2039 | { | 2133 | { |
2040 | if (jbd2_journal_head_cache) { | 2134 | if (jbd2_journal_head_cache) { |
2041 | kmem_cache_destroy(jbd2_journal_head_cache); | 2135 | kmem_cache_destroy(jbd2_journal_head_cache); |
@@ -2323,7 +2417,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) | |||
2323 | 2417 | ||
2324 | struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; | 2418 | struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; |
2325 | 2419 | ||
2326 | static int __init journal_init_handle_cache(void) | 2420 | static int __init jbd2_journal_init_handle_cache(void) |
2327 | { | 2421 | { |
2328 | jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); | 2422 | jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); |
2329 | if (jbd2_handle_cache == NULL) { | 2423 | if (jbd2_handle_cache == NULL) { |
@@ -2358,17 +2452,20 @@ static int __init journal_init_caches(void) | |||
2358 | 2452 | ||
2359 | ret = jbd2_journal_init_revoke_caches(); | 2453 | ret = jbd2_journal_init_revoke_caches(); |
2360 | if (ret == 0) | 2454 | if (ret == 0) |
2361 | ret = journal_init_jbd2_journal_head_cache(); | 2455 | ret = jbd2_journal_init_journal_head_cache(); |
2456 | if (ret == 0) | ||
2457 | ret = jbd2_journal_init_handle_cache(); | ||
2362 | if (ret == 0) | 2458 | if (ret == 0) |
2363 | ret = journal_init_handle_cache(); | 2459 | ret = jbd2_journal_init_transaction_cache(); |
2364 | return ret; | 2460 | return ret; |
2365 | } | 2461 | } |
2366 | 2462 | ||
2367 | static void jbd2_journal_destroy_caches(void) | 2463 | static void jbd2_journal_destroy_caches(void) |
2368 | { | 2464 | { |
2369 | jbd2_journal_destroy_revoke_caches(); | 2465 | jbd2_journal_destroy_revoke_caches(); |
2370 | jbd2_journal_destroy_jbd2_journal_head_cache(); | 2466 | jbd2_journal_destroy_journal_head_cache(); |
2371 | jbd2_journal_destroy_handle_cache(); | 2467 | jbd2_journal_destroy_handle_cache(); |
2468 | jbd2_journal_destroy_transaction_cache(); | ||
2372 | jbd2_journal_destroy_slabs(); | 2469 | jbd2_journal_destroy_slabs(); |
2373 | } | 2470 | } |
2374 | 2471 | ||
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index da6d7baf1390..c1a03354a22f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/jbd2.h> | 21 | #include <linux/jbd2.h> |
22 | #include <linux/errno.h> | 22 | #include <linux/errno.h> |
23 | #include <linux/crc32.h> | 23 | #include <linux/crc32.h> |
24 | #include <linux/blkdev.h> | ||
24 | #endif | 25 | #endif |
25 | 26 | ||
26 | /* | 27 | /* |
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal) | |||
265 | err2 = sync_blockdev(journal->j_fs_dev); | 266 | err2 = sync_blockdev(journal->j_fs_dev); |
266 | if (!err) | 267 | if (!err) |
267 | err = err2; | 268 | err = err2; |
268 | 269 | /* Make sure all replayed data is on permanent storage */ | |
270 | if (journal->j_flags & JBD2_BARRIER) | ||
271 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); | ||
269 | return err; | 272 | return err; |
270 | } | 273 | } |
271 | 274 | ||
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 30b2867d6cc9..6973705d6a3d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
@@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void) | |||
208 | J_ASSERT(!jbd2_revoke_record_cache); | 208 | J_ASSERT(!jbd2_revoke_record_cache); |
209 | J_ASSERT(!jbd2_revoke_table_cache); | 209 | J_ASSERT(!jbd2_revoke_table_cache); |
210 | 210 | ||
211 | jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", | 211 | jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, |
212 | sizeof(struct jbd2_revoke_record_s), | 212 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); |
213 | 0, | ||
214 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, | ||
215 | NULL); | ||
216 | if (!jbd2_revoke_record_cache) | 213 | if (!jbd2_revoke_record_cache) |
217 | goto record_cache_failure; | 214 | goto record_cache_failure; |
218 | 215 | ||
219 | jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", | 216 | jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, |
220 | sizeof(struct jbd2_revoke_table_s), | 217 | SLAB_TEMPORARY); |
221 | 0, SLAB_TEMPORARY, NULL); | ||
222 | if (!jbd2_revoke_table_cache) | 218 | if (!jbd2_revoke_table_cache) |
223 | goto table_cache_failure; | 219 | goto table_cache_failure; |
224 | return 0; | 220 | return 0; |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5aba56e1fd5..ddcd3549c6c2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -33,6 +33,35 @@ | |||
33 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | 33 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); |
34 | static void __jbd2_journal_unfile_buffer(struct journal_head *jh); | 34 | static void __jbd2_journal_unfile_buffer(struct journal_head *jh); |
35 | 35 | ||
36 | static struct kmem_cache *transaction_cache; | ||
37 | int __init jbd2_journal_init_transaction_cache(void) | ||
38 | { | ||
39 | J_ASSERT(!transaction_cache); | ||
40 | transaction_cache = kmem_cache_create("jbd2_transaction_s", | ||
41 | sizeof(transaction_t), | ||
42 | 0, | ||
43 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, | ||
44 | NULL); | ||
45 | if (transaction_cache) | ||
46 | return 0; | ||
47 | return -ENOMEM; | ||
48 | } | ||
49 | |||
50 | void jbd2_journal_destroy_transaction_cache(void) | ||
51 | { | ||
52 | if (transaction_cache) { | ||
53 | kmem_cache_destroy(transaction_cache); | ||
54 | transaction_cache = NULL; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | void jbd2_journal_free_transaction(transaction_t *transaction) | ||
59 | { | ||
60 | if (unlikely(ZERO_OR_NULL_PTR(transaction))) | ||
61 | return; | ||
62 | kmem_cache_free(transaction_cache, transaction); | ||
63 | } | ||
64 | |||
36 | /* | 65 | /* |
37 | * jbd2_get_transaction: obtain a new transaction_t object. | 66 | * jbd2_get_transaction: obtain a new transaction_t object. |
38 | * | 67 | * |
@@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle, | |||
133 | 162 | ||
134 | alloc_transaction: | 163 | alloc_transaction: |
135 | if (!journal->j_running_transaction) { | 164 | if (!journal->j_running_transaction) { |
136 | new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); | 165 | new_transaction = kmem_cache_alloc(transaction_cache, |
166 | gfp_mask | __GFP_ZERO); | ||
137 | if (!new_transaction) { | 167 | if (!new_transaction) { |
138 | /* | 168 | /* |
139 | * If __GFP_FS is not present, then we may be | 169 | * If __GFP_FS is not present, then we may be |
@@ -162,7 +192,7 @@ repeat: | |||
162 | if (is_journal_aborted(journal) || | 192 | if (is_journal_aborted(journal) || |
163 | (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { | 193 | (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { |
164 | read_unlock(&journal->j_state_lock); | 194 | read_unlock(&journal->j_state_lock); |
165 | kfree(new_transaction); | 195 | jbd2_journal_free_transaction(new_transaction); |
166 | return -EROFS; | 196 | return -EROFS; |
167 | } | 197 | } |
168 | 198 | ||
@@ -284,7 +314,7 @@ repeat: | |||
284 | read_unlock(&journal->j_state_lock); | 314 | read_unlock(&journal->j_state_lock); |
285 | 315 | ||
286 | lock_map_acquire(&handle->h_lockdep_map); | 316 | lock_map_acquire(&handle->h_lockdep_map); |
287 | kfree(new_transaction); | 317 | jbd2_journal_free_transaction(new_transaction); |
288 | return 0; | 318 | return 0; |
289 | } | 319 | } |
290 | 320 | ||
@@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |||
1549 | * of these pointers, it could go bad. Generally the caller needs to re-read | 1579 | * of these pointers, it could go bad. Generally the caller needs to re-read |
1550 | * the pointer from the transaction_t. | 1580 | * the pointer from the transaction_t. |
1551 | * | 1581 | * |
1552 | * Called under j_list_lock. The journal may not be locked. | 1582 | * Called under j_list_lock. |
1553 | */ | 1583 | */ |
1554 | void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | 1584 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) |
1555 | { | 1585 | { |
1556 | struct journal_head **list = NULL; | 1586 | struct journal_head **list = NULL; |
1557 | transaction_t *transaction; | 1587 | transaction_t *transaction; |
@@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |||
1646 | spin_lock(&journal->j_list_lock); | 1676 | spin_lock(&journal->j_list_lock); |
1647 | if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | 1677 | if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { |
1648 | /* written-back checkpointed metadata buffer */ | 1678 | /* written-back checkpointed metadata buffer */ |
1649 | if (jh->b_jlist == BJ_None) { | 1679 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
1650 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 1680 | __jbd2_journal_remove_checkpoint(jh); |
1651 | __jbd2_journal_remove_checkpoint(jh); | ||
1652 | } | ||
1653 | } | 1681 | } |
1654 | spin_unlock(&journal->j_list_lock); | 1682 | spin_unlock(&journal->j_list_lock); |
1655 | out: | 1683 | out: |
@@ -1949,6 +1977,8 @@ zap_buffer_unlocked: | |||
1949 | clear_buffer_mapped(bh); | 1977 | clear_buffer_mapped(bh); |
1950 | clear_buffer_req(bh); | 1978 | clear_buffer_req(bh); |
1951 | clear_buffer_new(bh); | 1979 | clear_buffer_new(bh); |
1980 | clear_buffer_delay(bh); | ||
1981 | clear_buffer_unwritten(bh); | ||
1952 | bh->b_bdev = NULL; | 1982 | bh->b_bdev = NULL; |
1953 | return may_free; | 1983 | return may_free; |
1954 | } | 1984 | } |
diff --git a/include/linux/fs.h b/include/linux/fs.h index fa63f1b46103..c437f914d537 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -1872,19 +1872,6 @@ extern struct dentry *mount_pseudo(struct file_system_type *, char *, | |||
1872 | const struct dentry_operations *dops, | 1872 | const struct dentry_operations *dops, |
1873 | unsigned long); | 1873 | unsigned long); |
1874 | 1874 | ||
1875 | static inline void sb_mark_dirty(struct super_block *sb) | ||
1876 | { | ||
1877 | sb->s_dirt = 1; | ||
1878 | } | ||
1879 | static inline void sb_mark_clean(struct super_block *sb) | ||
1880 | { | ||
1881 | sb->s_dirt = 0; | ||
1882 | } | ||
1883 | static inline int sb_is_dirty(struct super_block *sb) | ||
1884 | { | ||
1885 | return sb->s_dirt; | ||
1886 | } | ||
1887 | |||
1888 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ | 1875 | /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ |
1889 | #define fops_get(fops) \ | 1876 | #define fops_get(fops) \ |
1890 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) | 1877 | (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5557baefed60..912c30a8ddb1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -971,6 +971,10 @@ extern void __journal_clean_data_list(transaction_t *transaction); | |||
971 | /* Log buffer allocation */ | 971 | /* Log buffer allocation */ |
972 | extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *); | 972 | extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *); |
973 | int jbd2_journal_next_log_block(journal_t *, unsigned long long *); | 973 | int jbd2_journal_next_log_block(journal_t *, unsigned long long *); |
974 | int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, | ||
975 | unsigned long *block); | ||
976 | void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); | ||
977 | void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); | ||
974 | 978 | ||
975 | /* Commit management */ | 979 | /* Commit management */ |
976 | extern void jbd2_journal_commit_transaction(journal_t *); | 980 | extern void jbd2_journal_commit_transaction(journal_t *); |
@@ -1020,6 +1024,11 @@ jbd2_journal_write_metadata_buffer(transaction_t *transaction, | |||
1020 | /* Transaction locking */ | 1024 | /* Transaction locking */ |
1021 | extern void __wait_on_journal (journal_t *); | 1025 | extern void __wait_on_journal (journal_t *); |
1022 | 1026 | ||
1027 | /* Transaction cache support */ | ||
1028 | extern void jbd2_journal_destroy_transaction_cache(void); | ||
1029 | extern int jbd2_journal_init_transaction_cache(void); | ||
1030 | extern void jbd2_journal_free_transaction(transaction_t *); | ||
1031 | |||
1023 | /* | 1032 | /* |
1024 | * Journal locking. | 1033 | * Journal locking. |
1025 | * | 1034 | * |
@@ -1082,7 +1091,8 @@ extern int jbd2_journal_destroy (journal_t *); | |||
1082 | extern int jbd2_journal_recover (journal_t *journal); | 1091 | extern int jbd2_journal_recover (journal_t *journal); |
1083 | extern int jbd2_journal_wipe (journal_t *, int); | 1092 | extern int jbd2_journal_wipe (journal_t *, int); |
1084 | extern int jbd2_journal_skip_recovery (journal_t *); | 1093 | extern int jbd2_journal_skip_recovery (journal_t *); |
1085 | extern void jbd2_journal_update_superblock (journal_t *, int); | 1094 | extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, |
1095 | unsigned long, int); | ||
1086 | extern void __jbd2_journal_abort_hard (journal_t *); | 1096 | extern void __jbd2_journal_abort_hard (journal_t *); |
1087 | extern void jbd2_journal_abort (journal_t *, int); | 1097 | extern void jbd2_journal_abort (journal_t *, int); |
1088 | extern int jbd2_journal_errno (journal_t *); | 1098 | extern int jbd2_journal_errno (journal_t *); |
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index 423cb6d78ee0..c18b46f8aeeb 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h | |||
@@ -66,6 +66,8 @@ struct journal_head { | |||
66 | * transaction (if there is one). Only applies to buffers on a | 66 | * transaction (if there is one). Only applies to buffers on a |
67 | * transaction's data or metadata journaling list. | 67 | * transaction's data or metadata journaling list. |
68 | * [j_list_lock] [jbd_lock_bh_state()] | 68 | * [j_list_lock] [jbd_lock_bh_state()] |
69 | * Either of these locks is enough for reading, both are needed for | ||
70 | * changes. | ||
69 | */ | 71 | */ |
70 | transaction_t *b_transaction; | 72 | transaction_t *b_transaction; |
71 | 73 | ||
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 75964412ddbb..127993dbf322 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h | |||
@@ -81,6 +81,13 @@ DEFINE_EVENT(jbd2_commit, jbd2_commit_logging, | |||
81 | TP_ARGS(journal, commit_transaction) | 81 | TP_ARGS(journal, commit_transaction) |
82 | ); | 82 | ); |
83 | 83 | ||
84 | DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction, | ||
85 | |||
86 | TP_PROTO(journal_t *journal, transaction_t *commit_transaction), | ||
87 | |||
88 | TP_ARGS(journal, commit_transaction) | ||
89 | ); | ||
90 | |||
84 | TRACE_EVENT(jbd2_end_commit, | 91 | TRACE_EVENT(jbd2_end_commit, |
85 | TP_PROTO(journal_t *journal, transaction_t *commit_transaction), | 92 | TP_PROTO(journal_t *journal, transaction_t *commit_transaction), |
86 | 93 | ||
@@ -200,7 +207,7 @@ TRACE_EVENT(jbd2_checkpoint_stats, | |||
200 | __entry->forced_to_close, __entry->written, __entry->dropped) | 207 | __entry->forced_to_close, __entry->written, __entry->dropped) |
201 | ); | 208 | ); |
202 | 209 | ||
203 | TRACE_EVENT(jbd2_cleanup_journal_tail, | 210 | TRACE_EVENT(jbd2_update_log_tail, |
204 | 211 | ||
205 | TP_PROTO(journal_t *journal, tid_t first_tid, | 212 | TP_PROTO(journal_t *journal, tid_t first_tid, |
206 | unsigned long block_nr, unsigned long freed), | 213 | unsigned long block_nr, unsigned long freed), |
@@ -229,6 +236,26 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, | |||
229 | __entry->block_nr, __entry->freed) | 236 | __entry->block_nr, __entry->freed) |
230 | ); | 237 | ); |
231 | 238 | ||
239 | TRACE_EVENT(jbd2_write_superblock, | ||
240 | |||
241 | TP_PROTO(journal_t *journal, int write_op), | ||
242 | |||
243 | TP_ARGS(journal, write_op), | ||
244 | |||
245 | TP_STRUCT__entry( | ||
246 | __field( dev_t, dev ) | ||
247 | __field( int, write_op ) | ||
248 | ), | ||
249 | |||
250 | TP_fast_assign( | ||
251 | __entry->dev = journal->j_fs_dev->bd_dev; | ||
252 | __entry->write_op = write_op; | ||
253 | ), | ||
254 | |||
255 | TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev), | ||
256 | MINOR(__entry->dev), __entry->write_op) | ||
257 | ); | ||
258 | |||
232 | #endif /* _TRACE_JBD2_H */ | 259 | #endif /* _TRACE_JBD2_H */ |
233 | 260 | ||
234 | /* This part must be outside protection */ | 261 | /* This part must be outside protection */ |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3fc261705b1e..26adea8ca2e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -95,6 +95,8 @@ unsigned long vm_dirty_bytes; | |||
95 | */ | 95 | */ |
96 | unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ | 96 | unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ |
97 | 97 | ||
98 | EXPORT_SYMBOL_GPL(dirty_writeback_interval); | ||
99 | |||
98 | /* | 100 | /* |
99 | * The longest time for which data is allowed to remain dirty | 101 | * The longest time for which data is allowed to remain dirty |
100 | */ | 102 | */ |