8 files changed, 118 insertions, 54 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 97b970e7dd13..1c67139ad4b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -547,7 +547,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 *
 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
 * return TRUE.
 *
 * if the total number of retries exceed three times, return FALSE.
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index e25e99bf7ee1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
+ * allocated so we need to update only data block */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index dd2cb5076ff9..4890d6f3ad15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1729,7 +1729,7 @@ repeat:
                BUG_ON(npath->p_depth != path->p_depth);
                eh = npath[depth].p_hdr;
                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
-                        ext_debug("next leaf isnt full(%d)\n",
+                        ext_debug("next leaf isn't full(%d)\n",
                                  le16_to_cpu(eh->eh_entries));
                        path = npath;
                        goto repeat;
@@ -2533,7 +2533,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 /*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
- * extent into multiple extents (upto three - one initialized and two
+ * extent into multiple extents (up to three - one initialized and two
 * uninitialized).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
@@ -3174,7 +3174,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                                                   path, flags);
                /*
                 * Flag the inode(non aio case) or end_io struct (aio case)
-                 * that this IO needs to convertion to written when IO is
+                 * that this IO needs to conversion to written when IO is
                 * completed
                 */
                if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
@@ -3460,10 +3460,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ext4_ext_mark_uninitialized(&newex);
                /*
                 * io_end structure was created for every IO write to an
-                 * uninitialized extent. To avoid unecessary conversion,
+                 * uninitialized extent. To avoid unnecessary conversion,
                 * here we flag the IO that really needs the conversion.
                 * For non asycn direct IO case, flag the inode state
-                 * that we need to perform convertion when IO is done.
+                 * that we need to perform conversion when IO is done.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                        if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7f74019d6d77..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ extern int ext4_flush_completed_IO(struct inode *inode)
                 * to the work-to-be schedule is freed.
                 *
                 * Thus we need to keep the io structure still valid here after
-                 * convertion finished. The io structure has a flag to
+                 * conversion finished. The io structure has a flag to
                 * avoid double converting from both fsync and background work
                 * queue work.
                 */
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
 * the parent directory's parent as well, and so on recursively, if
 * they are also freshly created.
 */
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
 {
+        struct writeback_control wbc;
        struct dentry *dentry = NULL;
+        int ret = 0;
        while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
                ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
                if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
                        break;
                inode = dentry->d_parent->d_inode;
-                sync_mapping_buffers(inode->i_mapping);
+                ret = sync_mapping_buffers(inode->i_mapping);
+                if (ret)
+                        break;
+                memset(&wbc, 0, sizeof(wbc));
+                wbc.sync_mode = WB_SYNC_ALL;
+                wbc.nr_to_write = 0;         /* only write out the inode */
+                ret = sync_inode(inode, &wbc);
+                if (ret)
+                        break;
        }
+        return ret;
 }
 /*
@@ -176,7 +187,7 @@ int ext4_sync_file(struct file *file, int datasync)
        if (!journal) {
                ret = generic_file_fsync(file, datasync);
                if (!ret && !list_empty(&inode->i_dentry))
-                        ext4_sync_parent(inode);
+                        ret = ext4_sync_parent(inode);
                goto out;
        }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1a86282b9024..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2502,6 +2502,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                 * for partial write.
                 */
                set_buffer_new(bh);
+                set_buffer_mapped(bh);
        }
        return 0;
 }
@@ -2588,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 * because we should have holes filled from ext4_page_mkwrite(). We even don't
 * need to file the inode to the transaction's list in ordered mode because if
 * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
 * transaction the data will hit the disk. In case we are journaling data, we
 * cannot start transaction directly because transaction start ranks above page
 * lock so we have to do some magic.
@@ -2690,7 +2691,7 @@ static int ext4_writepage(struct page *page,
 /*
 * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
+ * calculate the total number of credits to reserve to fit
 * a single extent allocation into a single transaction,
 * ext4_da_writpeages() will loop calling this before
 * the block allocation.
@@ -3304,7 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
         * the pages by calling redirty_page_for_writepage() but that
         * would be ugly in the extreme.  So instead we would need to
         * replicate parts of the code in the above functions,
-         * simplifying them becuase we wouldn't actually intend to
+         * simplifying them because we wouldn't actually intend to
         * write out the pages, but rather only collect contiguous
         * logical block extents, call the multi-block allocator, and
         * then update the buffer heads with the block allocations.
@@ -3694,7 +3695,7 @@ retry:
 *
 * The unwrritten extents will be converted to written when DIO is completed.
 * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the convertion
+ * set up an end_io call back function, which will do the conversion
 * when async direct IO completed.
 *
 * If the O_DIRECT write will extend the file then add this inode to the
@@ -3717,7 +3718,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                 * We could direct write to holes and fallocate.
                 *
                 * Allocated blocks to fill the hole are marked as uninitialized
-                 * to prevent paralel buffered read to expose the stale data
+                 * to prevent parallel buffered read to expose the stale data
                 * before DIO complete the data IO.
                 *
                 * As to previously fallocated extents, ext4 get_block
@@ -3778,7 +3779,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                        int err;
                        /*
                         * for non AIO case, since the IO is already
-                         * completed, we could do the convertion right here
+                         * completed, we could do the conversion right here
                         */
                        err = ext4_convert_unwritten_extents(inode,
                                                             offset, ret);
@@ -4025,7 +4026,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
 *
 *      When we do truncate() we may have to clean the ends of several
 *      indirect blocks but leave the blocks themselves alive. Block is
- *      partially truncated if some data below the new i_size is refered
+ *      partially truncated if some data below the new i_size is referred
 *      from it (and it is on the path to the first completely truncated
 *      data block, indeed).  We have to free the top of that path along
 *      with everything to the right of the path. Since no allocation
@@ -4169,7 +4170,7 @@ out_err:
 * @first:      array of block numbers
 * @last:       points immediately past the end of array
 *
- * We are freeing all blocks refered from that array (numbers are stored as
+ * We are freeing all blocks referred from that array (numbers are stored as
 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
 *
 * We accumulate contiguous runs of blocks to free.  Conveniently, if these
@@ -4261,7 +4262,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 *      @last:  pointer immediately past the end of array
 *      @depth: depth of the branches to free
 *
- *      We are freeing all blocks refered from these branches (numbers are
+ *      We are freeing all blocks referred from these branches (numbers are
 *      stored as little-endian 32-bit) and updating @inode->i_blocks
 *      appropriately.
 */
@@ -4429,8 +4430,8 @@ void ext4_truncate(struct inode *inode)
        Indirect chain[4];
        Indirect *partial;
        __le32 nr = 0;
-        int n;
+        int n = 0;
-        ext4_lblk_t last_block;
+        ext4_lblk_t last_block, max_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
        trace_ext4_truncate_enter(inode);
@@ -4455,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+        max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
        if (inode->i_size & (blocksize - 1))
                if (ext4_block_truncate_page(handle, mapping, inode->i_size))
                        goto out_stop;
-        n = ext4_block_to_path(inode, last_block, offsets, NULL);
+        if (last_block != max_block) {
-        if (n == 0)
+                n = ext4_block_to_path(inode, last_block, offsets, NULL);
-                goto out_stop;  /* error */
+                if (n == 0)
+                        goto out_stop;  /* error */
+        }
        /*
         * OK.  This truncate is going to happen.  We add the inode to the
@@ -4493,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
         */
        ei->i_disksize = inode->i_size;
-        if (n == 1) {           /* direct blocks */
+        if (last_block == max_block) {
+                /*
+                 * It is unnecessary to free any data blocks if last_block is
+                 * equal to the indirect block limit.
+                 */
+                goto out_unlock;
+        } else if (n == 1) {            /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT4_NDIR_BLOCKS);
                goto do_indirects;
@@ -4553,6 +4564,7 @@ do_indirects:
                ;
        }
+out_unlock:
        up_write(&ei->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
@@ -5398,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
        /* if nrblocks are contiguous */
        if (chunk) {
                /*
-                 * With N contiguous data blocks, it need at most
+                 * With N contiguous data blocks, we need at most
-                 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
+                 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-                 * 2 dindirect blocks
+                 * 2 dindirect blocks, and 1 tindirect block
-                 * 1 tindirect block
                 */
-                indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
+                return DIV_ROUND_UP(nrblocks,
-                return indirects + 3;
+                                    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
        }
        /*
         * if nrblocks are not contiguous, worse case, each block touch
@@ -5478,7 +5489,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 }
 /*
- * Calulate the total number of credits to reserve to fit
+ * Calculate the total number of credits to reserve to fit
 * the modification of a single pages into a single transaction,
 * which may include multiple chunks of block allocations.
 *
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a5837a837a8b..d8a16eecf1d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc or/and locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index d1bafa57f483..92816b4e0f16 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
         * start with one credit accounted for
         * superblock modification.
         *
-         * For the tmp_inode we already have commited the
+         * For the tmp_inode we already have committed the
         * trascation that created the inode. Later as and
         * when we add extents we extent the journal
         */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 22546ad7f0ae..8553dfb310af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -242,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
 * journal_end calls result in the superblock being marked dirty, so
 * that sync() will call the filesystem's write_super callback if
 * appropriate.
+ *
+ * To avoid j_barrier hold in userspace when a user calls freeze(),
+ * ext4 prevents a new handle from being started by s_frozen, which
+ * is in an upper layer.
 */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
        journal_t *journal;
+        handle_t  *handle;
        if (sb->s_flags & MS_RDONLY)
                return ERR_PTR(-EROFS);
-        vfs_check_frozen(sb, SB_FREEZE_TRANS);
-        /* Special case here: if the journal has aborted behind our
-         * backs (eg. EIO in the commit thread), then we still need to
-         * take the FS itself readonly cleanly. */
        journal = EXT4_SB(sb)->s_journal;
-        if (journal) {
+        handle = ext4_journal_current_handle();
-                if (is_journal_aborted(journal)) {
-                        ext4_abort(sb, "Detected aborted journal");
+        /*
-                        return ERR_PTR(-EROFS);
+         * If a handle has been started, it should be allowed to
-                }
+         * finish, otherwise deadlock could happen between freeze
-                return jbd2_journal_start(journal, nblocks);
+         * and others(e.g. truncate) due to the restart of the
+         * journal handle if the filesystem is forzen and active
+         * handles are not stopped.
+         */
+        if (!handle)
+                vfs_check_frozen(sb, SB_FREEZE_TRANS);
+        if (!journal)
+                return ext4_get_nojournal();
+        /*
+         * Special case here: if the journal has aborted behind our
+         * backs (eg. EIO in the commit thread), then we still need to
+         * take the FS itself readonly cleanly.
+         */
+        if (is_journal_aborted(journal)) {
+                ext4_abort(sb, "Detected aborted journal");
+                return ERR_PTR(-EROFS);
        }
-        return ext4_get_nojournal();
+        return jbd2_journal_start(journal, nblocks);
 }
 /*
@@ -617,7 +634,7 @@ __acquires(bitlock)
         * filesystem will have already been marked read/only and the
         * journal has been aborted.  We return 1 as a hint to callers
         * who might what to use the return value from
-         * ext4_grp_locked_error() to distinguish beween the
+         * ext4_grp_locked_error() to distinguish between the
         * ERRORS_CONT and ERRORS_RO case, and perhaps return more
         * aggressively from the ext4 function in question, with a
         * more appropriate error code.
@@ -2975,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
        mutex_unlock(&ext4_li_info->li_list_mtx);
        sbi->s_li_request = elr;
+        /*
+         * set elr to NULL here since it has been inserted to
+         * the request_list and the removal and free of it is
+         * handled by ext4_clear_request_list from now on.
+         */
+        elr = NULL;
        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
                ret = ext4_run_lazyinit_thread();
@@ -3385,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        init_timer(&sbi->s_err_report);
+        sbi->s_err_report.function = print_daily_error_info;
+        sbi->s_err_report.data = (unsigned long) sb;
        err = percpu_counter_init(&sbi->s_freeblocks_counter,
                        ext4_count_free_blocks(sb));
        if (!err) {
@@ -3646,9 +3673,6 @@ no_journal:
                 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
                 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
-        init_timer(&sbi->s_err_report);
-        sbi->s_err_report.function = print_daily_error_info;
-        sbi->s_err_report.data = (unsigned long) sb;
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -3672,6 +3696,7 @@ failed_mount_wq:
                sbi->s_journal = NULL;
        }
 failed_mount3:
+        del_timer(&sbi->s_err_report);
        if (sbi->s_flex_groups) {
                if (is_vmalloc_addr(sbi->s_flex_groups))
                        vfree(sbi->s_flex_groups);
@@ -4138,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 /*
 * LVM calls this function before a (read-only) snapshot is created.  This
 * gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that only this function cannot bring a filesystem to be in a clean
+ * state independently, because ext4 prevents a new handle from being started
+ * by @sb->s_frozen, which stays in an upper layer.  It thus needs help from
+ * the upper layer.
 */
 static int ext4_freeze(struct super_block *sb)
 {
@@ -4614,17 +4644,30 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 static int ext4_quota_off(struct super_block *sb, int type)
 {
+        struct inode *inode = sb_dqopt(sb)->files[type];
+        handle_t *handle;
        /* Force all delayed allocation blocks to be allocated.
         * Caller already holds s_umount sem */
        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);
+        /* Update modification times of quota files when userspace can
+         * start looking at them */
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle))
+                goto out;
+        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        ext4_mark_inode_dirty(handle, inode);
+        ext4_journal_stop(handle);
+out:
        return dquot_quota_off(sb, type);
 }
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
 * we don't have to be afraid of races */
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off)
@@ -4714,9 +4757,8 @@ out:
        if (inode->i_size < off + len) {
                i_size_write(inode, off + len);
                EXT4_I(inode)->i_disksize = inode->i_size;
+                ext4_mark_inode_dirty(handle, inode);
        }
-        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-        ext4_mark_inode_dirty(handle, inode);
        mutex_unlock(&inode->i_mutex);
        return len;
 }