7 files changed, 274 insertions, 311 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd2ece228827..b9821be709bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        /* this isn't the right place to decide whether block is metadata
         * inode.c/extents.c knows better, but for safety ... */
-        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                        ext4_should_journal_data(inode))
+                metadata = 1;
+        /* We need to make sure we don't reuse a released
+         * block until the transaction commits.
+         * Writeback mode has weak data consistency, so
+         * don't force data to be treated as metadata when
+         * freeing a block in writeback mode.
+         */
+        if (metadata == 0 && !ext4_should_writeback_data(inode))
                metadata = 1;
        sb = inode->i_sb;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6690a41cdd9f..4880cc3e6727 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,7 +511,6 @@ do {									       \
 /*
 * Mount flags
 */
-#define EXT4_MOUNT_CHECK                0x00001 /* Do mount-time checks */
 #define EXT4_MOUNT_OLDALLOC             0x00002  /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_GRPID                0x00004 /* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG                0x00008 /* Some debugging messages */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6a0b40d43264..445fde603df8 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -99,9 +99,6 @@ struct ext4_sb_info {
        struct inode *s_buddy_cache;
        long s_blocks_reserved;
        spinlock_t s_reserve_lock;
-        struct list_head s_active_transaction;
-        struct list_head s_closed_transaction;
-        struct list_head s_committed_transaction;
        spinlock_t s_md_lock;
        tid_t s_last_transaction;
        unsigned short *s_mb_offsets, *s_mb_maxs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9b4ec9decfd1..8dbf6953845b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
        int ret = 0, err, nr_pages, i;
        unsigned long index, end;
        struct pagevec pvec;
+        long pages_skipped;
        BUG_ON(mpd->next_page <= mpd->first_page);
        pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
        end = mpd->next_page - 1;
        while (index <= end) {
-                /* XXX: optimize tail */
+                /*
-                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                 * We can use PAGECACHE_TAG_DIRTY lookup here because
+                 * even though we have cleared the dirty flag on the page
+                 * we still keep the page in the radix tree with tag
+                 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+                 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+                 * which is called via the below writepage callback.
+                 */
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                        PAGECACHE_TAG_DIRTY,
+                                        min(end - index,
+                                        (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
-                        index = page->index;
+                        pages_skipped = mpd->wbc->pages_skipped;
-                        if (index > end)
-                                break;
-                        index++;
                        err = mapping->a_ops->writepage(page, mpd->wbc);
-                        if (!err)
+                        if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+                                /*
+                                 * The page was written successfully
+                                 * and was not skipped.
+                                 */
                                mpd->pages_written++;
                        /*
                         * In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
                               struct writeback_control *wbc,
                               struct mpage_da_data *mpd)
 {
-        long to_write;
        int ret;
        if (!mpd->get_block)
@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
        mpd->pages_written = 0;
        mpd->retval = 0;
-        to_write = wbc->nr_to_write;
        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
        /*
         * Handle last extent of pages
         */
        if (!mpd->io_done && mpd->next_page != mpd->first_page) {
                if (mpage_da_map_blocks(mpd) == 0)
                        mpage_da_submit_io(mpd);
-        }
-        wbc->nr_to_write = to_write - mpd->pages_written;
+                mpd->io_done = 1;
+                ret = MPAGE_DA_EXTENT_TAIL;
+        }
+        wbc->nr_to_write -= mpd->pages_written;
        return ret;
 }
@@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
+        pgoff_t index;
+        int range_whole = 0;
        handle_t *handle = NULL;
-        loff_t range_start = 0;
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
+        int no_nrwrite_index_update;
+        long pages_written = 0, pages_skipped;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
-        long to_write, pages_skipped = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        /*
@@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
                nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
                wbc->nr_to_write = sbi->s_mb_stream_request;
        }
+        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                range_whole = 1;
-        if (!wbc->range_cyclic)
+        if (wbc->range_cyclic)
-                /*
+                index = mapping->writeback_index;
-                 * If range_cyclic is not set force range_cont
+        else
-                 * and save the old writeback_index
+                index = wbc->range_start >> PAGE_CACHE_SHIFT;
-                 */
-                wbc->range_cont = 1;
-        range_start =  wbc->range_start;
-        pages_skipped = wbc->pages_skipped;
        mpd.wbc = wbc;
        mpd.inode = mapping->host;
-restart_loop:
+        /*
-        to_write = wbc->nr_to_write;
+         * we don't want write_cache_pages to update
-        while (!ret && to_write > 0) {
+         * nr_to_write and writeback_index
+         */
+        no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+        wbc->no_nrwrite_index_update = 1;
+        pages_skipped = wbc->pages_skipped;
+        while (!ret && wbc->nr_to_write > 0) {
                /*
                 * we  insert one extent at a time. So we need
@@ -2422,48 +2436,53 @@ restart_loop:
                        dump_stack();
                        goto out_writepages;
                }
-                to_write -= wbc->nr_to_write;
                mpd.get_block = ext4_da_get_block_write;
                ret = mpage_da_writepages(mapping, wbc, &mpd);
                ext4_journal_stop(handle);
-                if (mpd.retval == -ENOSPC)
+                if (mpd.retval == -ENOSPC) {
+                        /* commit the transaction which would
+                         * free blocks released in the transaction
+                         * and try again
+                         */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
+                        wbc->pages_skipped = pages_skipped;
-                /* reset the retry count */
+                        ret = 0;
-                if (ret == MPAGE_DA_EXTENT_TAIL) {
+                } else if (ret == MPAGE_DA_EXTENT_TAIL) {
                        /*
                         * got one extent now try with
                         * rest of the pages
                         */
-                        to_write += wbc->nr_to_write;
+                        pages_written += mpd.pages_written;
+                        wbc->pages_skipped = pages_skipped;
                        ret = 0;
-                } else if (wbc->nr_to_write) {
+                } else if (wbc->nr_to_write)
                        /*
                         * There is no more writeout needed
                         * or we requested for a noblocking writeout
                         * and we found the device congested
                         */
-                        to_write += wbc->nr_to_write;
                        break;
-                }
-                wbc->nr_to_write = to_write;
-        }
-        if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
-                /* We skipped pages in this loop */
-                wbc->range_start = range_start;
-                wbc->nr_to_write = to_write +
-                                wbc->pages_skipped - pages_skipped;
-                wbc->pages_skipped = pages_skipped;
-                goto restart_loop;
        }
+        if (pages_skipped != wbc->pages_skipped)
+                printk(KERN_EMERG "This should not happen leaving %s "
+                                "with nr_to_write = %ld ret = %d\n",
+                                __func__, wbc->nr_to_write, ret);
+        /* Update index */
+        index += pages_written;
+        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+                /*
+                 * set the writeback_index so that range_cyclic
+                 * mode will write it back later
+                 */
+                mapping->writeback_index = index;
 out_writepages:
-        wbc->nr_to_write = to_write - nr_to_writebump;
+        if (!no_nrwrite_index_update)
-        wbc->range_start = range_start;
+                wbc->no_nrwrite_index_update = 0;
+        wbc->nr_to_write -= nr_to_writebump;
        return ret;
 }
@@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
        struct inode *inode = &(ei->vfs_inode);
        u64 i_blocks = inode->i_blocks;
        struct super_block *sb = inode->i_sb;
-        int err = 0;
        if (i_blocks <= ~0U) {
                /*
@@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = 0;
                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
-        } else if (i_blocks <= 0xffffffffffffULL) {
+                return 0;
+        }
+        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+                return -EFBIG;
+        if (i_blocks <= 0xffffffffffffULL) {
                /*
                 * i_blocks can be represented in a 48 bit variable
                 * as multiple of 512 bytes
                 */
-                err = ext4_update_rocompat_feature(handle, sb,
-                                            EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-                if (err)
-                        goto  err_out;
-                /* i_block is stored in the split  48 bit fields */
                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
        } else {
-                /*
-                 * i_blocks should be represented in a 48 bit variable
-                 * as multiple of  file system block size
-                 */
-                err = ext4_update_rocompat_feature(handle, sb,
-                                            EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-                if (err)
-                        goto  err_out;
                ei->i_flags |= EXT4_HUGE_FILE_FL;
                /* i_block is stored in file system block size */
                i_blocks = i_blocks >> (inode->i_blkbits - 9);
                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
        }
-err_out:
+        return 0;
-        return err;
 }
 /*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b580714f0d85..dfe17a134052 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
        }
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+        meta_group_info[i]->bb_free_root.rb_node = NULL;;
 #ifdef DOUBLE_CHECK
        {
@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        }
        spin_lock_init(&sbi->s_md_lock);
-        INIT_LIST_HEAD(&sbi->s_active_transaction);
-        INIT_LIST_HEAD(&sbi->s_closed_transaction);
-        INIT_LIST_HEAD(&sbi->s_committed_transaction);
        spin_lock_init(&sbi->s_bal_lock);
        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        ext4_mb_init_per_dev_proc(sb);
        ext4_mb_history_init(sb);
+        sbi->s_journal->j_commit_callback = release_blocks_on_commit;
        printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
        return 0;
 }
@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                list_del(&pa->pa_group_list);
                count++;
-                kfree(pa);
+                kmem_cache_free(ext4_pspace_cachep, pa);
        }
        if (count)
                mb_debug("mballoc: %u PAs left\n", count);
@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        /* release freed, non-committed blocks */
-        spin_lock(&sbi->s_md_lock);
-        list_splice_init(&sbi->s_closed_transaction,
-                        &sbi->s_committed_transaction);
-        list_splice_init(&sbi->s_active_transaction,
-                        &sbi->s_committed_transaction);
-        spin_unlock(&sbi->s_md_lock);
-        ext4_mb_free_committed_blocks(sb);
        if (sbi->s_group_info) {
                for (i = 0; i < sbi->s_groups_count; i++) {
                        grinfo = ext4_get_group_info(sb, i);
@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
        return 0;
 }
-static noinline_for_stack void
+/*
-ext4_mb_free_committed_blocks(struct super_block *sb)
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 {
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct super_block *sb = journal->j_private;
-        int err;
-        int i;
-        int count = 0;
-        int count2 = 0;
-        struct ext4_free_metadata *md;
        struct ext4_buddy e4b;
+        struct ext4_group_info *db;
+        int err, count = 0, count2 = 0;
+        struct ext4_free_data *entry;
+        ext4_fsblk_t discard_block;
+        struct list_head *l, *ltmp;
-        if (list_empty(&sbi->s_committed_transaction))
+        list_for_each_safe(l, ltmp, &txn->t_private_list) {
-                return;
+                entry = list_entry(l, struct ext4_free_data, list);
-        /* there is committed blocks to be freed yet */
-        do {
-                /* get next array of blocks */
-                md = NULL;
-                spin_lock(&sbi->s_md_lock);
-                if (!list_empty(&sbi->s_committed_transaction)) {
-                        md = list_entry(sbi->s_committed_transaction.next,
-                                        struct ext4_free_metadata, list);
-                        list_del(&md->list);
-                }
-                spin_unlock(&sbi->s_md_lock);
-                if (md == NULL)
-                        break;
                mb_debug("gonna free %u blocks in group %lu (0x%p):",
-                                md->num, md->group, md);
+                         entry->count, entry->group, entry);
-                err = ext4_mb_load_buddy(sb, md->group, &e4b);
+                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
                BUG_ON(err != 0);
+                db = e4b.bd_info;
                /* there are blocks to put in buddy to make them really free */
-                count += md->num;
+                count += entry->count;
                count2++;
-                ext4_lock_group(sb, md->group);
+                ext4_lock_group(sb, entry->group);
-                for (i = 0; i < md->num; i++) {
+                /* Take it out of per group rb tree */
-                        mb_debug(" %u", md->blocks[i]);
+                rb_erase(&entry->node, &(db->bb_free_root));
-                        mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+                mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+                if (!db->bb_free_root.rb_node) {
+                        /* No more items in the per group rb tree
+                         * balance refcounts from ext4_mb_free_metadata()
+                         */
+                        page_cache_release(e4b.bd_buddy_page);
+                        page_cache_release(e4b.bd_bitmap_page);
                }
-                mb_debug("\n");
+                ext4_unlock_group(sb, entry->group);
-                ext4_unlock_group(sb, md->group);
+                discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+                        + entry->start_blk
-                /* balance refcounts from ext4_mb_free_metadata() */
+                        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-                page_cache_release(e4b.bd_buddy_page);
+                trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
-                page_cache_release(e4b.bd_bitmap_page);
+                           (unsigned long long) discard_block, entry->count);
+                sb_issue_discard(sb, discard_block, entry->count);
-                kfree(md);
+                kmem_cache_free(ext4_free_ext_cachep, entry);
                ext4_mb_release_desc(&e4b);
+        }
-        } while (md);
        mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 {
+#ifdef CONFIG_PROC_FS
        mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct proc_dir_entry *proc;
@@ -2735,10 +2723,14 @@ err_out:
        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
        return -ENOMEM;
+#else
+        return 0;
+#endif
 }
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
+#ifdef CONFIG_PROC_FS
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        if (sbi->s_proc == NULL)
@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
        remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
+#endif
        return 0;
 }
@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
                kmem_cache_destroy(ext4_pspace_cachep);
                return -ENOMEM;
        }
+        ext4_free_ext_cachep =
+                kmem_cache_create("ext4_free_block_extents",
+                                     sizeof(struct ext4_free_data),
+                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
+        if (ext4_free_ext_cachep == NULL) {
+                kmem_cache_destroy(ext4_pspace_cachep);
+                kmem_cache_destroy(ext4_ac_cachep);
+                return -ENOMEM;
+        }
        return 0;
 }
@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
        /* XXX: synchronize_rcu(); */
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
+        kmem_cache_destroy(ext4_free_ext_cachep);
 }
@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                goto out1;
        }
-        ext4_mb_poll_new_transaction(sb, handle);
        *errp = ext4_mb_initialize_context(ac, ar);
        if (*errp) {
                ar->len = 0;
@@ -4384,35 +4385,20 @@ out1:
        return block;
 }
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
-                                                handle_t *handle)
-{
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        if (sbi->s_last_transaction == handle->h_transaction->t_tid)
-                return;
-        /* new transaction! time to close last one and free blocks for
-         * committed transaction. we know that only transaction can be
-         * active, so previos transaction can be being logged and we
-         * know that transaction before previous is known to be already
-         * logged. this means that now we may free blocks freed in all
-         * transactions before previous one. hope I'm clear enough ... */
-        spin_lock(&sbi->s_md_lock);
+/*
-        if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
+ * We can merge two free data extents only if the physical blocks
-                mb_debug("new transaction %lu, old %lu\n",
+ * are contiguous, AND the extents were freed by the same transaction,
-                                (unsigned long) handle->h_transaction->t_tid,
+ * AND the blocks are associated with the same group.
-                                (unsigned long) sbi->s_last_transaction);
+ */
-                list_splice_init(&sbi->s_closed_transaction,
+static int can_merge(struct ext4_free_data *entry1,
-                                &sbi->s_committed_transaction);
+                        struct ext4_free_data *entry2)
-                list_splice_init(&sbi->s_active_transaction,
+{
-                                &sbi->s_closed_transaction);
+        if ((entry1->t_tid == entry2->t_tid) &&
-                sbi->s_last_transaction = handle->h_transaction->t_tid;
+            (entry1->group == entry2->group) &&
-        }
+            ((entry1->start_blk + entry1->count) == entry2->start_blk))
-        spin_unlock(&sbi->s_md_lock);
+                return 1;
+        return 0;
-        ext4_mb_free_committed_blocks(sb);
 }
 static noinline_for_stack int
@@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct ext4_free_metadata *md;
+        struct ext4_free_data *entry, *new_entry;
-        int i;
+        struct rb_node **n = &db->bb_free_root.rb_node, *node;
+        struct rb_node *parent = NULL, *new_node;
        BUG_ON(e4b->bd_bitmap_page == NULL);
        BUG_ON(e4b->bd_buddy_page == NULL);
+        new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+        new_entry->start_blk = block;
+        new_entry->group  = group;
+        new_entry->count = count;
+        new_entry->t_tid = handle->h_transaction->t_tid;
+        new_node = &new_entry->node;
        ext4_lock_group(sb, group);
-        for (i = 0; i < count; i++) {
+        if (!*n) {
-                md = db->bb_md_cur;
+                /* first free block extent. We need to
-                if (md && db->bb_tid != handle->h_transaction->t_tid) {
+                   protect buddy cache from being freed,
-                        db->bb_md_cur = NULL;
+                 * otherwise we'll refresh it from
-                        md = NULL;
+                 * on-disk bitmap and lose not-yet-available
+                 * blocks */
+                page_cache_get(e4b->bd_buddy_page);
+                page_cache_get(e4b->bd_bitmap_page);
+        }
+        while (*n) {
+                parent = *n;
+                entry = rb_entry(parent, struct ext4_free_data, node);
+                if (block < entry->start_blk)
+                        n = &(*n)->rb_left;
+                else if (block >= (entry->start_blk + entry->count))
+                        n = &(*n)->rb_right;
+                else {
+                        ext4_error(sb, __func__,
+                            "Double free of blocks %d (%d %d)\n",
+                            block, entry->start_blk, entry->count);
+                        return 0;
                }
+        }
-                if (md == NULL) {
+        rb_link_node(new_node, parent, n);
-                        ext4_unlock_group(sb, group);
+        rb_insert_color(new_node, &db->bb_free_root);
-                        md = kmalloc(sizeof(*md), GFP_NOFS);
-                        if (md == NULL)
+        /* Now see whether the extent can be merged to the left and right */
-                                return -ENOMEM;
+        node = rb_prev(new_node);
-                        md->num = 0;
+        if (node) {
-                        md->group = group;
+                entry = rb_entry(node, struct ext4_free_data, node);
+                if (can_merge(entry, new_entry)) {
-                        ext4_lock_group(sb, group);
+                        new_entry->start_blk = entry->start_blk;
-                        if (db->bb_md_cur == NULL) {
+                        new_entry->count += entry->count;
-                                spin_lock(&sbi->s_md_lock);
+                        rb_erase(node, &(db->bb_free_root));
-                                list_add(&md->list, &sbi->s_active_transaction);
+                        spin_lock(&sbi->s_md_lock);
-                                spin_unlock(&sbi->s_md_lock);
+                        list_del(&entry->list);
-                                /* protect buddy cache from being freed,
+                        spin_unlock(&sbi->s_md_lock);
-                                 * otherwise we'll refresh it from
+                        kmem_cache_free(ext4_free_ext_cachep, entry);
-                                 * on-disk bitmap and lose not-yet-available
-                                 * blocks */
-                                page_cache_get(e4b->bd_buddy_page);
-                                page_cache_get(e4b->bd_bitmap_page);
-                                db->bb_md_cur = md;
-                                db->bb_tid = handle->h_transaction->t_tid;
-                                mb_debug("new md 0x%p for group %lu\n",
-                                                md, md->group);
-                        } else {
-                                kfree(md);
-                                md = db->bb_md_cur;
-                        }
                }
+        }
-                BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
+        node = rb_next(new_node);
-                md->blocks[md->num] = block + i;
+        if (node) {
-                md->num++;
+                entry = rb_entry(node, struct ext4_free_data, node);
-                if (md->num == EXT4_BB_MAX_BLOCKS) {
+                if (can_merge(new_entry, entry)) {
-                        /* no more space, put full container on a sb's list */
+                        new_entry->count += entry->count;
-                        db->bb_md_cur = NULL;
+                        rb_erase(node, &(db->bb_free_root));
+                        spin_lock(&sbi->s_md_lock);
+                        list_del(&entry->list);
+                        spin_unlock(&sbi->s_md_lock);
+                        kmem_cache_free(ext4_free_ext_cachep, entry);
                }
        }
+        /* Add the extent to transaction's private list */
+        spin_lock(&sbi->s_md_lock);
+        list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+        spin_unlock(&sbi->s_md_lock);
        ext4_unlock_group(sb, group);
        return 0;
 }
@@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        *freed = 0;
-        ext4_mb_poll_new_transaction(sb, handle);
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
        if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828f8b89..b5dff1fff1e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -98,23 +100,29 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
-#ifdef EXT4_BB_MAX_BLOCKS
+struct ext4_free_data {
-#undef EXT4_BB_MAX_BLOCKS
+        /* this links the free block information from group_info */
-#endif
+        struct rb_node node;
-#define EXT4_BB_MAX_BLOCKS      30
-struct ext4_free_metadata {
+        /* this links the extent into the transaction's t_private_list */
-        ext4_group_t group;
-        unsigned short num;
-        ext4_grpblk_t  blocks[EXT4_BB_MAX_BLOCKS];
        struct list_head list;
+        /* group to which the free block extent belongs */
+        ext4_group_t group;
+        /* free block extent */
+        ext4_grpblk_t start_blk;
+        ext4_grpblk_t count;
+        /* transaction which freed this extent */
+        tid_t   t_tid;
 };
 struct ext4_group_info {
        unsigned long   bb_state;
-        unsigned long   bb_tid;
+        struct rb_root  bb_free_root;
-        struct ext4_free_metadata *bb_md_cur;
        unsigned short  bb_first_free;
        unsigned short  bb_free;
        unsigned short  bb_fragments;
@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
 static void ext4_mb_return_to_preallocation(struct inode *inode,
                                        struct ext4_buddy *e4b, sector_t block,
                                        int count);
@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
                        struct super_block *, struct ext4_prealloc_space *pa);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dea8f13c2fd9..9b2b2bc4ec17 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
         */
 }
-int ext4_update_compat_feature(handle_t *handle,
-                                        struct super_block *sb, __u32 compat)
-{
-        int err = 0;
-        if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
-                err = ext4_journal_get_write_access(handle,
-                                EXT4_SB(sb)->s_sbh);
-                if (err)
-                        return err;
-                EXT4_SET_COMPAT_FEATURE(sb, compat);
-                sb->s_dirt = 1;
-                handle->h_sync = 1;
-                BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-                                        "call ext4_journal_dirty_met adata");
-                err = ext4_journal_dirty_metadata(handle,
-                                EXT4_SB(sb)->s_sbh);
-        }
-        return err;
-}
-int ext4_update_rocompat_feature(handle_t *handle,
-                                        struct super_block *sb, __u32 rocompat)
-{
-        int err = 0;
-        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
-                err = ext4_journal_get_write_access(handle,
-                                EXT4_SB(sb)->s_sbh);
-                if (err)
-                        return err;
-                EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
-                sb->s_dirt = 1;
-                handle->h_sync = 1;
-                BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-                                        "call ext4_journal_dirty_met adata");
-                err = ext4_journal_dirty_metadata(handle,
-                                EXT4_SB(sb)->s_sbh);
-        }
-        return err;
-}
-int ext4_update_incompat_feature(handle_t *handle,
-                                        struct super_block *sb, __u32 incompat)
-{
-        int err = 0;
-        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
-                err = ext4_journal_get_write_access(handle,
-                                EXT4_SB(sb)->s_sbh);
-                if (err)
-                        return err;
-                EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
-                sb->s_dirt = 1;
-                handle->h_sync = 1;
-                BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-                                        "call ext4_journal_dirty_met adata");
-                err = ext4_journal_dirty_metadata(handle,
-                                EXT4_SB(sb)->s_sbh);
-        }
-        return err;
-}
 /*
 * Open the external journal device
 */
@@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = {
 enum {
        Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-        Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+        Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
@@ -915,7 +855,7 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-        Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_inode_readahead_blks
 };
@@ -933,8 +873,6 @@ static const match_table_t tokens = {
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
        {Opt_nouid32, "nouid32"},
-        {Opt_nocheck, "nocheck"},
-        {Opt_nocheck, "check=none"},
        {Opt_debug, "debug"},
        {Opt_oldalloc, "oldalloc"},
        {Opt_orlov, "orlov"},
@@ -973,8 +911,6 @@ static const match_table_t tokens = {
        {Opt_extents, "extents"},
        {Opt_noextents, "noextents"},
        {Opt_i_version, "i_version"},
-        {Opt_mballoc, "mballoc"},
-        {Opt_nomballoc, "nomballoc"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
@@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_nouid32:
                        set_opt(sbi->s_mount_opt, NO_UID32);
                        break;
-                case Opt_nocheck:
-                        clear_opt(sbi->s_mount_opt, CHECK);
-                        break;
                case Opt_debug:
                        set_opt(sbi->s_mount_opt, DEBUG);
                        break;
@@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
                               "Block bitmap for group %lu not in group "
-                               "(block %llu)!", i, block_bitmap);
+                               "(block %llu)!\n", i, block_bitmap);
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
                               "Inode bitmap for group %lu not in group "
-                               "(block %llu)!", i, inode_bitmap);
+                               "(block %llu)!\n", i, inode_bitmap);
                        return 0;
                }
                inode_table = ext4_inode_table(sb, gdp);
@@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
                               "Inode table for group %lu not in group "
-                               "(block %llu)!", i, inode_table);
+                               "(block %llu)!\n", i, inode_table);
                        return 0;
                }
                spin_lock(sb_bgl_lock(sbi, i));
@@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 *
 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
 */
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
 {
        loff_t res;
        loff_t upper_limit = MAX_LFS_FILESIZE;
        /* small i_blocks in vfs inode? */
-        if (sizeof(blkcnt_t) < sizeof(u64)) {
+        if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
                 * CONFIG_LSF is not enabled implies the inode
                 * i_block represent total blocks in 512 bytes
@@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 */
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 {
        loff_t res = EXT4_NDIR_BLOCKS;
        int meta_blocks;
@@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
         * total number of  512 bytes blocks of the file
         */
-        if (sizeof(blkcnt_t) < sizeof(u64)) {
+        if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
-                 * CONFIG_LSF is not enabled implies the inode
+                 * !has_huge_files or CONFIG_LSF is not enabled
-                 * i_block represent total blocks in 512 bytes
+                 * implies the inode i_blocks represents the total number of
-                 * 32 == size of vfs inode i_blocks * 8
+                 * 512-byte blocks; 32 == size of vfs inode i_blocks * 8
                 */
                upper_limit = (1LL << 32) - 1;
@@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        int blocksize;
        int db_count;
        int i;
-        int needs_recovery;
+        int needs_recovery, has_huge_files;
        __le32 features;
        __u64 blocks_count;
        int err;
@@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                       sb->s_id, le32_to_cpu(features));
                goto failed_mount;
        }
-        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+        has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+        if (has_huge_files) {
                /*
                 * Large file size enabled file system can only be
                 * mount if kernel is build with CONFIG_LSF
@@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                }
        }
-        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
+        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
-        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+                                                      has_huge_files);
+        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        "available.\n");
        }
+        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+                printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+                                "requested data journaling mode\n");
+                clear_opt(sbi->s_mount_opt, DELALLOC);
+        } else if (test_opt(sb, DELALLOC))
+                printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+        ext4_ext_init(sb);
+        err = ext4_mb_init(sb, needs_recovery);
+        if (err) {
+                printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+                       err);
+                goto failed_mount4;
+        }
        /*
         * akpm: core read_super() calls in here with the superblock locked.
         * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
               test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
               "writeback");
-        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-                printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-                                "requested data journaling mode\n");
-                clear_opt(sbi->s_mount_opt, DELALLOC);
-        } else if (test_opt(sb, DELALLOC))
-                printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-        ext4_ext_init(sb);
-        err = ext4_mb_init(sb, needs_recovery);
-        if (err) {
-                printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
-                       err);
-                goto failed_mount4;
-        }
        lock_kernel();
        return 0;