author    Linus Torvalds <torvalds@linux-foundation.org>  2008-10-17 18:08:11 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-10-17 18:08:11 -0400
commit    58617d5e59663d2edea03bd03cb74279827611bb
tree      1b472f0ab43ae08fef5dea30b95592a005385686  /fs
parent    26e9a397774a0e94efbb8a0bf4a952c28d808cab
parent    f287a1a56130be5fdb96a4a62d1290bd064f308e
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Remove automatic enabling of the HUGE_FILE feature flag
  ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback
  ext4: Update Documentation/filesystems/ext4.txt
  ext4: Remove unused mount options: nomballoc, mballoc, nocheck
  ext4: Remove compile warnings when building w/o CONFIG_PROC_FS
  ext4: Add missing newlines to printk messages
  ext4: Fix file fragmentation during large file write.
  vfs: Add no_nrwrite_index_update writeback control flag
  vfs: Remove the range_cont writeback mode.
  ext4: Use tag dirty lookup during mpage_da_submit_io
  ext4: let the block device know when unused blocks can be discarded
  ext4: Don't reuse released data blocks until transaction commits
  ext4: Use an rbtree for tracking blocks freed during transaction.
  ext4: Do mballoc init before doing filesystem recovery
  ext4: Free ext4_prealloc_space using kmem_cache_free
  ext4: Fix Kconfig typo for ext4dev
  ext4: Remove an old reference to ext4dev in Makefile comment
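A note on the largest change in this pull: mballoc now records each range of blocks freed during a running transaction as a struct ext4_free_data node in a per-group rbtree, and merges a new range with a neighbour only when the two are physically contiguous, were freed by the same transaction, and belong to the same block group (see can_merge() in the fs/ext4/mballoc.c hunks below). The following user-space toy only mirrors that three-part test; the names toy_extent and try_merge are invented for illustration and this is not kernel code:

#include <stdio.h>

/* Toy model of struct ext4_free_data: just the fields the merge test uses. */
struct toy_extent {
        unsigned int group;     /* block group the extent belongs to */
        unsigned int start_blk; /* first block of the freed extent   */
        unsigned int count;     /* number of freed blocks            */
        unsigned int t_tid;     /* transaction that freed the extent */
};

/* Same three conditions as can_merge() in the diff below:
 * same transaction, same group, physically contiguous. */
static int try_merge(struct toy_extent *left, struct toy_extent *right)
{
        if (left->t_tid == right->t_tid &&
            left->group == right->group &&
            left->start_blk + left->count == right->start_blk) {
                left->count += right->count;
                return 1;
        }
        return 0;
}

int main(void)
{
        struct toy_extent a = { .group = 5, .start_blk = 100, .count = 8, .t_tid = 42 };
        struct toy_extent b = { .group = 5, .start_blk = 108, .count = 4, .t_tid = 42 };

        if (try_merge(&a, &b))
                printf("merged: group %u blocks %u-%u\n",
                       a.group, a.start_blk, a.start_blk + a.count - 1);
        return 0;
}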
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig            |   2
-rw-r--r--  fs/Makefile           |   2
-rw-r--r--  fs/ext4/balloc.c      |  12
-rw-r--r--  fs/ext4/ext4.h        |   1
-rw-r--r--  fs/ext4/ext4_sb.h     |   3
-rw-r--r--  fs/ext4/inode.c       | 143
-rw-r--r--  fs/ext4/mballoc.c     | 263
-rw-r--r--  fs/ext4/mballoc.h     |  31
-rw-r--r--  fs/ext4/super.c       | 132
-rw-r--r--  fs/jbd2/commit.c      |   3
-rw-r--r--  fs/jbd2/transaction.c |   1
11 files changed, 280 insertions, 313 deletions
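The second structural change in the series is when those freed extents are handed back to the allocator: they are queued on the committing transaction's t_private_list and released from release_blocks_on_commit(), which ext4_mb_init() installs as the journal's j_commit_callback and which jbd2 invokes at the end of commit (see the fs/jbd2 hunks at the bottom). Below is a self-contained user-space approximation of that flow; the toy_journal/toy_txn types and on_commit() are invented for illustration and do not match the jbd2 API:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for transaction_t/t_private_list and j_commit_callback. */
struct toy_free_data {
        unsigned int start_blk, count;
        struct toy_free_data *next;
};

struct toy_txn {
        unsigned int tid;
        struct toy_free_data *private_list;   /* models t_private_list */
};

struct toy_journal {
        /* models journal->j_commit_callback */
        void (*commit_callback)(struct toy_journal *, struct toy_txn *);
};

/* models release_blocks_on_commit(): drain the list once commit is done */
static void on_commit(struct toy_journal *j, struct toy_txn *txn)
{
        (void)j;
        while (txn->private_list) {
                struct toy_free_data *e = txn->private_list;
                txn->private_list = e->next;
                printf("tid %u: blocks %u..%u are now reusable\n",
                       txn->tid, e->start_blk, e->start_blk + e->count - 1);
                free(e);
        }
}

static void record_free(struct toy_txn *txn, unsigned int start, unsigned int count)
{
        struct toy_free_data *e = malloc(sizeof(*e));
        if (!e) {
                perror("malloc");
                exit(1);
        }
        e->start_blk = start;
        e->count = count;
        e->next = txn->private_list;
        txn->private_list = e;
}

int main(void)
{
        struct toy_journal j = { .commit_callback = on_commit };
        struct toy_txn txn = { .tid = 42, .private_list = NULL };

        record_free(&txn, 100, 8);   /* blocks freed while the txn runs */
        record_free(&txn, 300, 2);

        /* at commit time jbd2 invokes the registered callback */
        if (j.commit_callback)
                j.commit_callback(&j, &txn);
        return 0;
}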
diff --git a/fs/Kconfig b/fs/Kconfig
index 9e9d70c02a07..d0a1174fb516 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -160,7 +160,7 @@ config EXT4_FS
          filesystem initially.
 
          To compile this file system support as a module, choose M here. The
-         module will be called ext4dev.
+         module will be called ext4.
 
          If unsure, say N.
 
diff --git a/fs/Makefile b/fs/Makefile
index d0c69f57e5bf..2168c902d5ca 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)      += reiserfs/
 obj-$(CONFIG_EXT3_FS)          += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS)          += ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS)          += ext4/ # Before ext2 so root fs can be ext4
 obj-$(CONFIG_JBD)              += jbd/
 obj-$(CONFIG_JBD2)             += jbd2/
 obj-$(CONFIG_EXT2_FS)          += ext2/
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd2ece228827..b9821be709bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
        /* this isn't the right place to decide whether block is metadata
         * inode.c/extents.c knows better, but for safety ... */
-       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
-                       ext4_should_journal_data(inode))
+       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+               metadata = 1;
+
+       /* We need to make sure we don't reuse
+        * block released untill the transaction commit.
+        * writeback mode have weak data consistency so
+        * don't force data as metadata when freeing block
+        * for writeback mode.
+        */
+       if (metadata == 0 && !ext4_should_writeback_data(inode))
                metadata = 1;
 
        sb = inode->i_sb;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6690a41cdd9f..4880cc3e6727 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,7 +511,6 @@ do { \
 /*
  * Mount flags
  */
-#define EXT4_MOUNT_CHECK               0x00001 /* Do mount-time checks */
 #define EXT4_MOUNT_OLDALLOC            0x00002 /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_GRPID               0x00004 /* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG               0x00008 /* Some debugging messages */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6a0b40d43264..445fde603df8 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -99,9 +99,6 @@ struct ext4_sb_info {
        struct inode *s_buddy_cache;
        long s_blocks_reserved;
        spinlock_t s_reserve_lock;
-       struct list_head s_active_transaction;
-       struct list_head s_closed_transaction;
-       struct list_head s_committed_transaction;
        spinlock_t s_md_lock;
        tid_t s_last_transaction;
        unsigned short *s_mb_offsets, *s_mb_maxs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9b4ec9decfd1..8dbf6953845b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
        int ret = 0, err, nr_pages, i;
        unsigned long index, end;
        struct pagevec pvec;
+       long pages_skipped;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
        pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
        end = mpd->next_page - 1;
 
        while (index <= end) {
-               /* XXX: optimize tail */
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+               /*
+                * We can use PAGECACHE_TAG_DIRTY lookup here because
+                * even though we have cleared the dirty flag on the page
+                * We still keep the page in the radix tree with tag
+                * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+                * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+                * which is called via the below writepage callback.
+                */
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                       PAGECACHE_TAG_DIRTY,
+                                       min(end - index,
+                                       (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
-                       index = page->index;
-                       if (index > end)
-                               break;
-                       index++;
-
+                       pages_skipped = mpd->wbc->pages_skipped;
                        err = mapping->a_ops->writepage(page, mpd->wbc);
-                       if (!err)
+                       if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+                               /*
+                                * have successfully written the page
+                                * without skipping the same
+                                */
                                mpd->pages_written++;
                        /*
                         * In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
                               struct writeback_control *wbc,
                               struct mpage_da_data *mpd)
 {
-       long to_write;
        int ret;
 
        if (!mpd->get_block)
@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
        mpd->pages_written = 0;
        mpd->retval = 0;
 
-       to_write = wbc->nr_to_write;
-
        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
        /*
         * Handle last extent of pages
         */
        if (!mpd->io_done && mpd->next_page != mpd->first_page) {
                if (mpage_da_map_blocks(mpd) == 0)
                        mpage_da_submit_io(mpd);
-       }
 
-       wbc->nr_to_write = to_write - mpd->pages_written;
+               mpd->io_done = 1;
+               ret = MPAGE_DA_EXTENT_TAIL;
+       }
+       wbc->nr_to_write -= mpd->pages_written;
        return ret;
 }
 
@@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
+       pgoff_t index;
+       int range_whole = 0;
        handle_t *handle = NULL;
-       loff_t range_start = 0;
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
+       int no_nrwrite_index_update;
+       long pages_written = 0, pages_skipped;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
-       long to_write, pages_skipped = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
        /*
@@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
                nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
                wbc->nr_to_write = sbi->s_mb_stream_request;
        }
+       if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+               range_whole = 1;
 
-       if (!wbc->range_cyclic)
-               /*
-                * If range_cyclic is not set force range_cont
-                * and save the old writeback_index
-                */
-               wbc->range_cont = 1;
-
-       range_start = wbc->range_start;
-       pages_skipped = wbc->pages_skipped;
+       if (wbc->range_cyclic)
+               index = mapping->writeback_index;
+       else
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
        mpd.wbc = wbc;
        mpd.inode = mapping->host;
 
-restart_loop:
-       to_write = wbc->nr_to_write;
-       while (!ret && to_write > 0) {
+       /*
+        * we don't want write_cache_pages to update
+        * nr_to_write and writeback_index
+        */
+       no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+       wbc->no_nrwrite_index_update = 1;
+       pages_skipped = wbc->pages_skipped;
+
+       while (!ret && wbc->nr_to_write > 0) {
 
                /*
                 * we insert one extent at a time. So we need
@@ -2422,48 +2436,53 @@ restart_loop:
                        dump_stack();
                        goto out_writepages;
                }
-               to_write -= wbc->nr_to_write;
-
                mpd.get_block = ext4_da_get_block_write;
                ret = mpage_da_writepages(mapping, wbc, &mpd);
 
                ext4_journal_stop(handle);
 
-               if (mpd.retval == -ENOSPC)
+               if (mpd.retval == -ENOSPC) {
+                       /* commit the transaction which would
+                        * free blocks released in the transaction
+                        * and try again
+                        */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
-
-               /* reset the retry count */
-               if (ret == MPAGE_DA_EXTENT_TAIL) {
+                       wbc->pages_skipped = pages_skipped;
+                       ret = 0;
+               } else if (ret == MPAGE_DA_EXTENT_TAIL) {
                        /*
                         * got one extent now try with
                         * rest of the pages
                         */
-                       to_write += wbc->nr_to_write;
+                       pages_written += mpd.pages_written;
+                       wbc->pages_skipped = pages_skipped;
                        ret = 0;
-               } else if (wbc->nr_to_write) {
+               } else if (wbc->nr_to_write)
                        /*
                         * There is no more writeout needed
                         * or we requested for a noblocking writeout
                         * and we found the device congested
                         */
-                       to_write += wbc->nr_to_write;
                        break;
-               }
-               wbc->nr_to_write = to_write;
-       }
-
-       if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
-               /* We skipped pages in this loop */
-               wbc->range_start = range_start;
-               wbc->nr_to_write = to_write +
-                               wbc->pages_skipped - pages_skipped;
-               wbc->pages_skipped = pages_skipped;
-               goto restart_loop;
        }
+       if (pages_skipped != wbc->pages_skipped)
+               printk(KERN_EMERG "This should not happen leaving %s "
+                               "with nr_to_write = %ld ret = %d\n",
+                               __func__, wbc->nr_to_write, ret);
+
+       /* Update index */
+       index += pages_written;
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               /*
+                * set the writeback_index so that range_cyclic
+                * mode will write it back later
+                */
+               mapping->writeback_index = index;
 
 out_writepages:
-       wbc->nr_to_write = to_write - nr_to_writebump;
-       wbc->range_start = range_start;
+       if (!no_nrwrite_index_update)
+               wbc->no_nrwrite_index_update = 0;
+       wbc->nr_to_write -= nr_to_writebump;
        return ret;
 }
 
@@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
        struct inode *inode = &(ei->vfs_inode);
        u64 i_blocks = inode->i_blocks;
        struct super_block *sb = inode->i_sb;
-       int err = 0;
 
        if (i_blocks <= ~0U) {
                /*
@@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
                raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = 0;
                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
-       } else if (i_blocks <= 0xffffffffffffULL) {
+               return 0;
+       }
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+               return -EFBIG;
+
+       if (i_blocks <= 0xffffffffffffULL) {
                /*
                 * i_blocks can be represented in a 48 bit variable
                 * as multiple of 512 bytes
                 */
-               err = ext4_update_rocompat_feature(handle, sb,
-                               EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-               if (err)
-                       goto err_out;
-               /* i_block is stored in the split 48 bit fields */
                raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
        } else {
-               /*
-                * i_blocks should be represented in a 48 bit variable
-                * as multiple of file system block size
-                */
-               err = ext4_update_rocompat_feature(handle, sb,
-                               EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-               if (err)
-                       goto err_out;
                ei->i_flags |= EXT4_HUGE_FILE_FL;
                /* i_block is stored in file system block size */
                i_blocks = i_blocks >> (inode->i_blkbits - 9);
                raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
        }
-err_out:
-       return err;
+       return 0;
 }
 
 /*
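One detail of the mpage_da_submit_io() change above that is easy to misread is the lookup bound passed to pagevec_lookup_tag(): min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1 asks for at most one pagevec's worth of dirty pages and never more than the pages remaining up to 'end'. Below is a small stand-alone model of just that arithmetic; the PAGEVEC_SIZE value of 14 is an assumption about the kernel of that era, and chunk() is a made-up helper, not a kernel function:

#include <stdio.h>

#define PAGEVEC_SIZE 14  /* assumed batch size, for illustration only */

/* Models the bound used in mpage_da_submit_io():
 * at most PAGEVEC_SIZE pages, and never past 'end'. */
static unsigned int chunk(unsigned long index, unsigned long end)
{
        unsigned long want = end - index;
        if (want > PAGEVEC_SIZE - 1)
                want = PAGEVEC_SIZE - 1;
        return (unsigned int)(want + 1);   /* min(end - index, PAGEVEC_SIZE-1) + 1 */
}

int main(void)
{
        unsigned long index = 0, end = 30;
        while (index <= end) {
                unsigned int n = chunk(index, end);
                printf("lookup up to %u pages starting at %lu\n", n, index);
                index += n;   /* the real lookup advances 'index' itself */
        }
        return 0;
}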
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b580714f0d85..dfe17a134052 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
        }
 
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+       meta_group_info[i]->bb_free_root.rb_node = NULL;;
 
 #ifdef DOUBLE_CHECK
        {
@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        }
 
        spin_lock_init(&sbi->s_md_lock);
-       INIT_LIST_HEAD(&sbi->s_active_transaction);
-       INIT_LIST_HEAD(&sbi->s_closed_transaction);
-       INIT_LIST_HEAD(&sbi->s_committed_transaction);
        spin_lock_init(&sbi->s_bal_lock);
 
        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        ext4_mb_init_per_dev_proc(sb);
        ext4_mb_history_init(sb);
 
+       sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
        printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
        return 0;
 }
@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                list_del(&pa->pa_group_list);
                count++;
-               kfree(pa);
+               kmem_cache_free(ext4_pspace_cachep, pa);
        }
        if (count)
                mb_debug("mballoc: %u PAs left\n", count);
@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       /* release freed, non-committed blocks */
-       spin_lock(&sbi->s_md_lock);
-       list_splice_init(&sbi->s_closed_transaction,
-                       &sbi->s_committed_transaction);
-       list_splice_init(&sbi->s_active_transaction,
-                       &sbi->s_committed_transaction);
-       spin_unlock(&sbi->s_md_lock);
-       ext4_mb_free_committed_blocks(sb);
-
        if (sbi->s_group_info) {
                for (i = 0; i < sbi->s_groups_count; i++) {
                        grinfo = ext4_get_group_info(sb, i);
@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
        return 0;
 }
 
-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       int err;
-       int i;
-       int count = 0;
-       int count2 = 0;
-       struct ext4_free_metadata *md;
+       struct super_block *sb = journal->j_private;
        struct ext4_buddy e4b;
+       struct ext4_group_info *db;
+       int err, count = 0, count2 = 0;
+       struct ext4_free_data *entry;
+       ext4_fsblk_t discard_block;
+       struct list_head *l, *ltmp;
 
-       if (list_empty(&sbi->s_committed_transaction))
-               return;
-
-       /* there is committed blocks to be freed yet */
-       do {
-               /* get next array of blocks */
-               md = NULL;
-               spin_lock(&sbi->s_md_lock);
-               if (!list_empty(&sbi->s_committed_transaction)) {
-                       md = list_entry(sbi->s_committed_transaction.next,
-                                       struct ext4_free_metadata, list);
-                       list_del(&md->list);
-               }
-               spin_unlock(&sbi->s_md_lock);
-
-               if (md == NULL)
-                       break;
+       list_for_each_safe(l, ltmp, &txn->t_private_list) {
+               entry = list_entry(l, struct ext4_free_data, list);
 
                mb_debug("gonna free %u blocks in group %lu (0x%p):",
-                               md->num, md->group, md);
+                        entry->count, entry->group, entry);
 
-               err = ext4_mb_load_buddy(sb, md->group, &e4b);
+               err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
                BUG_ON(err != 0);
 
+               db = e4b.bd_info;
                /* there are blocks to put in buddy to make them really free */
-               count += md->num;
+               count += entry->count;
                count2++;
-               ext4_lock_group(sb, md->group);
-               for (i = 0; i < md->num; i++) {
-                       mb_debug(" %u", md->blocks[i]);
-                       mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+               ext4_lock_group(sb, entry->group);
+               /* Take it out of per group rb tree */
+               rb_erase(&entry->node, &(db->bb_free_root));
+               mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+               if (!db->bb_free_root.rb_node) {
+                       /* No more items in the per group rb tree
+                        * balance refcounts from ext4_mb_free_metadata()
+                        */
+                       page_cache_release(e4b.bd_buddy_page);
+                       page_cache_release(e4b.bd_bitmap_page);
                }
-               mb_debug("\n");
-               ext4_unlock_group(sb, md->group);
-
-               /* balance refcounts from ext4_mb_free_metadata() */
-               page_cache_release(e4b.bd_buddy_page);
-               page_cache_release(e4b.bd_bitmap_page);
-
-               kfree(md);
+               ext4_unlock_group(sb, entry->group);
+               discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+                       + entry->start_blk
+                       + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+               trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+                          (unsigned long long) discard_block, entry->count);
+               sb_issue_discard(sb, discard_block, entry->count);
+
+               kmem_cache_free(ext4_free_ext_cachep, entry);
                ext4_mb_release_desc(&e4b);
-
-       } while (md);
+       }
 
        mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 {
+#ifdef CONFIG_PROC_FS
        mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct proc_dir_entry *proc;
@@ -2735,10 +2723,14 @@ err_out:
        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
        return -ENOMEM;
+#else
+       return 0;
+#endif
 }
 
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
+#ifdef CONFIG_PROC_FS
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
        if (sbi->s_proc == NULL)
@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
        remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-
+#endif
        return 0;
 }
 
@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
                kmem_cache_destroy(ext4_pspace_cachep);
                return -ENOMEM;
        }
+
+       ext4_free_ext_cachep =
+               kmem_cache_create("ext4_free_block_extents",
+                                 sizeof(struct ext4_free_data),
+                                 0, SLAB_RECLAIM_ACCOUNT, NULL);
+       if (ext4_free_ext_cachep == NULL) {
+               kmem_cache_destroy(ext4_pspace_cachep);
+               kmem_cache_destroy(ext4_ac_cachep);
+               return -ENOMEM;
+       }
        return 0;
 }
 
@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
        /* XXX: synchronize_rcu(); */
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
+       kmem_cache_destroy(ext4_free_ext_cachep);
 }
 
 
@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                goto out1;
        }
 
-       ext4_mb_poll_new_transaction(sb, handle);
-
        *errp = ext4_mb_initialize_context(ac, ar);
        if (*errp) {
                ar->len = 0;
@@ -4384,35 +4385,20 @@ out1:
 
        return block;
 }
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
-                                               handle_t *handle)
-{
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-       if (sbi->s_last_transaction == handle->h_transaction->t_tid)
-               return;
-
-       /* new transaction! time to close last one and free blocks for
-        * committed transaction. we know that only transaction can be
-        * active, so previos transaction can be being logged and we
-        * know that transaction before previous is known to be already
-        * logged. this means that now we may free blocks freed in all
-        * transactions before previous one. hope I'm clear enough ... */
 
-       spin_lock(&sbi->s_md_lock);
-       if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
-               mb_debug("new transaction %lu, old %lu\n",
-                               (unsigned long) handle->h_transaction->t_tid,
-                               (unsigned long) sbi->s_last_transaction);
-               list_splice_init(&sbi->s_closed_transaction,
-                               &sbi->s_committed_transaction);
-               list_splice_init(&sbi->s_active_transaction,
-                               &sbi->s_closed_transaction);
-               sbi->s_last_transaction = handle->h_transaction->t_tid;
-       }
-       spin_unlock(&sbi->s_md_lock);
-
-       ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+               struct ext4_free_data *entry2)
+{
+       if ((entry1->t_tid == entry2->t_tid) &&
+           (entry1->group == entry2->group) &&
+           ((entry1->start_blk + entry1->count) == entry2->start_blk))
+               return 1;
+       return 0;
 }
 
 static noinline_for_stack int
@@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_free_metadata *md;
-       int i;
+       struct ext4_free_data *entry, *new_entry;
+       struct rb_node **n = &db->bb_free_root.rb_node, *node;
+       struct rb_node *parent = NULL, *new_node;
+
 
        BUG_ON(e4b->bd_bitmap_page == NULL);
        BUG_ON(e4b->bd_buddy_page == NULL);
 
+       new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+       new_entry->start_blk = block;
+       new_entry->group = group;
+       new_entry->count = count;
+       new_entry->t_tid = handle->h_transaction->t_tid;
+       new_node = &new_entry->node;
+
        ext4_lock_group(sb, group);
-       for (i = 0; i < count; i++) {
-               md = db->bb_md_cur;
-               if (md && db->bb_tid != handle->h_transaction->t_tid) {
-                       db->bb_md_cur = NULL;
-                       md = NULL;
+       if (!*n) {
+               /* first free block exent. We need to
+                  protect buddy cache from being freed,
+                * otherwise we'll refresh it from
+                * on-disk bitmap and lose not-yet-available
+                * blocks */
+               page_cache_get(e4b->bd_buddy_page);
+               page_cache_get(e4b->bd_bitmap_page);
+       }
+       while (*n) {
+               parent = *n;
+               entry = rb_entry(parent, struct ext4_free_data, node);
+               if (block < entry->start_blk)
+                       n = &(*n)->rb_left;
+               else if (block >= (entry->start_blk + entry->count))
+                       n = &(*n)->rb_right;
+               else {
+                       ext4_error(sb, __func__,
+                           "Double free of blocks %d (%d %d)\n",
+                           block, entry->start_blk, entry->count);
+                       return 0;
                }
+       }
 
-               if (md == NULL) {
-                       ext4_unlock_group(sb, group);
-                       md = kmalloc(sizeof(*md), GFP_NOFS);
-                       if (md == NULL)
-                               return -ENOMEM;
-                       md->num = 0;
-                       md->group = group;
-
-                       ext4_lock_group(sb, group);
-                       if (db->bb_md_cur == NULL) {
-                               spin_lock(&sbi->s_md_lock);
-                               list_add(&md->list, &sbi->s_active_transaction);
-                               spin_unlock(&sbi->s_md_lock);
-                               /* protect buddy cache from being freed,
-                                * otherwise we'll refresh it from
-                                * on-disk bitmap and lose not-yet-available
-                                * blocks */
-                               page_cache_get(e4b->bd_buddy_page);
-                               page_cache_get(e4b->bd_bitmap_page);
-                               db->bb_md_cur = md;
-                               db->bb_tid = handle->h_transaction->t_tid;
-                               mb_debug("new md 0x%p for group %lu\n",
-                                               md, md->group);
-                       } else {
-                               kfree(md);
-                               md = db->bb_md_cur;
-                       }
+       rb_link_node(new_node, parent, n);
+       rb_insert_color(new_node, &db->bb_free_root);
+
+       /* Now try to see the extent can be merged to left and right */
+       node = rb_prev(new_node);
+       if (node) {
+               entry = rb_entry(node, struct ext4_free_data, node);
+               if (can_merge(entry, new_entry)) {
+                       new_entry->start_blk = entry->start_blk;
+                       new_entry->count += entry->count;
+                       rb_erase(node, &(db->bb_free_root));
+                       spin_lock(&sbi->s_md_lock);
+                       list_del(&entry->list);
+                       spin_unlock(&sbi->s_md_lock);
+                       kmem_cache_free(ext4_free_ext_cachep, entry);
                }
+       }
 
-               BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
-               md->blocks[md->num] = block + i;
-               md->num++;
-               if (md->num == EXT4_BB_MAX_BLOCKS) {
-                       /* no more space, put full container on a sb's list */
-                       db->bb_md_cur = NULL;
+       node = rb_next(new_node);
+       if (node) {
+               entry = rb_entry(node, struct ext4_free_data, node);
+               if (can_merge(new_entry, entry)) {
+                       new_entry->count += entry->count;
+                       rb_erase(node, &(db->bb_free_root));
+                       spin_lock(&sbi->s_md_lock);
+                       list_del(&entry->list);
+                       spin_unlock(&sbi->s_md_lock);
+                       kmem_cache_free(ext4_free_ext_cachep, entry);
                }
        }
+       /* Add the extent to transaction's private list */
+       spin_lock(&sbi->s_md_lock);
+       list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+       spin_unlock(&sbi->s_md_lock);
        ext4_unlock_group(sb, group);
        return 0;
 }
@@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 
        *freed = 0;
 
-       ext4_mb_poll_new_transaction(sb, handle);
-
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
        if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828f8b89..b5dff1fff1e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -98,23 +100,29 @@
 
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
 
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS     30
+struct ext4_free_data {
+       /* this links the free block information from group_info */
+       struct rb_node node;
 
-struct ext4_free_metadata {
-       ext4_group_t group;
-       unsigned short num;
-       ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+       /* this links the free block information from ext4_sb_info */
        struct list_head list;
+
+       /* group which free block extent belongs */
+       ext4_group_t group;
+
+       /* free block extent */
+       ext4_grpblk_t start_blk;
+       ext4_grpblk_t count;
+
+       /* transaction which freed this extent */
+       tid_t t_tid;
 };
 
 struct ext4_group_info {
        unsigned long bb_state;
-       unsigned long bb_tid;
-       struct ext4_free_metadata *bb_md_cur;
+       struct rb_root bb_free_root;
        unsigned short bb_first_free;
        unsigned short bb_free;
        unsigned short bb_fragments;
@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
 static void ext4_mb_return_to_preallocation(struct inode *inode,
                                        struct ext4_buddy *e4b, sector_t block,
                                        int count);
@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
                        struct super_block *, struct ext4_prealloc_space *pa);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
 
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dea8f13c2fd9..9b2b2bc4ec17 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
         */
 }
 
-int ext4_update_compat_feature(handle_t *handle,
-                                       struct super_block *sb, __u32 compat)
-{
-       int err = 0;
-       if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
-               err = ext4_journal_get_write_access(handle,
-                               EXT4_SB(sb)->s_sbh);
-               if (err)
-                       return err;
-               EXT4_SET_COMPAT_FEATURE(sb, compat);
-               sb->s_dirt = 1;
-               handle->h_sync = 1;
-               BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-                                       "call ext4_journal_dirty_met adata");
-               err = ext4_journal_dirty_metadata(handle,
-                               EXT4_SB(sb)->s_sbh);
-       }
-       return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
-                                       struct super_block *sb, __u32 rocompat)
-{
-       int err = 0;
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
-               err = ext4_journal_get_write_access(handle,
-                               EXT4_SB(sb)->s_sbh);
-               if (err)
-                       return err;
-               EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
-               sb->s_dirt = 1;
-               handle->h_sync = 1;
-               BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-                                       "call ext4_journal_dirty_met adata");
-               err = ext4_journal_dirty_metadata(handle,
-                               EXT4_SB(sb)->s_sbh);
-       }
-       return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
-                                       struct super_block *sb, __u32 incompat)
-{
-       int err = 0;
-       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
-               err = ext4_journal_get_write_access(handle,
-                               EXT4_SB(sb)->s_sbh);
-               if (err)
-                       return err;
-               EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
-               sb->s_dirt = 1;
-               handle->h_sync = 1;
-               BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-                                       "call ext4_journal_dirty_met adata");
-               err = ext4_journal_dirty_metadata(handle,
-                               EXT4_SB(sb)->s_sbh);
-       }
-       return err;
-}
-
 /*
  * Open the external journal device
  */
@@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = {
 enum {
        Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-       Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+       Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
@@ -915,7 +855,7 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-       Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+       Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_inode_readahead_blks
 };
 
@@ -933,8 +873,6 @@ static const match_table_t tokens = {
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
        {Opt_nouid32, "nouid32"},
-       {Opt_nocheck, "nocheck"},
-       {Opt_nocheck, "check=none"},
        {Opt_debug, "debug"},
        {Opt_oldalloc, "oldalloc"},
        {Opt_orlov, "orlov"},
@@ -973,8 +911,6 @@ static const match_table_t tokens = {
        {Opt_extents, "extents"},
        {Opt_noextents, "noextents"},
        {Opt_i_version, "i_version"},
-       {Opt_mballoc, "mballoc"},
-       {Opt_nomballoc, "nomballoc"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
@@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_nouid32:
                        set_opt(sbi->s_mount_opt, NO_UID32);
                        break;
-               case Opt_nocheck:
-                       clear_opt(sbi->s_mount_opt, CHECK);
-                       break;
                case Opt_debug:
                        set_opt(sbi->s_mount_opt, DEBUG);
                        break;
@@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
                               "Block bitmap for group %lu not in group "
-                              "(block %llu)!", i, block_bitmap);
+                              "(block %llu)!\n", i, block_bitmap);
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
                               "Inode bitmap for group %lu not in group "
-                              "(block %llu)!", i, inode_bitmap);
+                              "(block %llu)!\n", i, inode_bitmap);
                        return 0;
                }
                inode_table = ext4_inode_table(sb, gdp);
@@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
                               "Inode table for group %lu not in group "
-                              "(block %llu)!", i, inode_table);
+                              "(block %llu)!\n", i, inode_table);
                        return 0;
                }
                spin_lock(sb_bgl_lock(sbi, i));
@@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
  *
  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
  */
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
 {
        loff_t res;
        loff_t upper_limit = MAX_LFS_FILESIZE;
 
        /* small i_blocks in vfs inode? */
-       if (sizeof(blkcnt_t) < sizeof(u64)) {
+       if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
                 * CONFIG_LSF is not enabled implies the inode
                 * i_block represent total blocks in 512 bytes
@@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
  * We need to be 1 filesystem block less than the 2^48 sector limit.
  */
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 {
        loff_t res = EXT4_NDIR_BLOCKS;
        int meta_blocks;
@@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
         * total number of 512 bytes blocks of the file
         */
 
-       if (sizeof(blkcnt_t) < sizeof(u64)) {
+       if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
-                * CONFIG_LSF is not enabled implies the inode
-                * i_block represent total blocks in 512 bytes
-                * 32 == size of vfs inode i_blocks * 8
+                * !has_huge_files or CONFIG_LSF is not enabled
+                * implies the inode i_block represent total blocks in
+                * 512 bytes 32 == size of vfs inode i_blocks * 8
                 */
                upper_limit = (1LL << 32) - 1;
 
@@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        int blocksize;
        int db_count;
        int i;
-       int needs_recovery;
+       int needs_recovery, has_huge_files;
        __le32 features;
        __u64 blocks_count;
        int err;
@@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                       sb->s_id, le32_to_cpu(features));
                goto failed_mount;
        }
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+       has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                   EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+       if (has_huge_files) {
                /*
                 * Large file size enabled file system can only be
                 * mount if kernel is build with CONFIG_LSF
@@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                }
        }
 
-       sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
-       sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+       sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+                                                     has_huge_files);
+       sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
 
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                         "available.\n");
        }
 
+       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+               printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+                               "requested data journaling mode\n");
+               clear_opt(sbi->s_mount_opt, DELALLOC);
+       } else if (test_opt(sb, DELALLOC))
+               printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+       ext4_ext_init(sb);
+       err = ext4_mb_init(sb, needs_recovery);
+       if (err) {
+               printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+                      err);
+               goto failed_mount4;
+       }
+
        /*
        * akpm: core read_super() calls in here with the superblock locked.
        * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
                "writeback");
 
-       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-               printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-                               "requested data journaling mode\n");
-               clear_opt(sbi->s_mount_opt, DELALLOC);
-       } else if (test_opt(sb, DELALLOC))
-               printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
-       ext4_ext_init(sb);
-       err = ext4_mb_init(sb, needs_recovery);
-       if (err) {
-               printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
-                      err);
-               goto failed_mount4;
-       }
-
        lock_kernel();
        return 0;
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0abe02c4242a..8b119e16aa36 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -995,6 +995,9 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
 
+       if (journal->j_commit_callback)
+               journal->j_commit_callback(journal, commit_transaction);
+
        trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
                   journal->j_devname, commit_transaction->t_tid,
                   journal->j_tail_sequence);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5d540588fa9..39b7805a599a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
        INIT_LIST_HEAD(&transaction->t_inode_list);
+       INIT_LIST_HEAD(&transaction->t_private_list);
 
        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);