author	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-26 12:53:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-26 12:53:20 -0400
commit	35806b4f7c5620b547f183e9d53f7cfaeabb582b
tree	dc966f5edd9e482fdc85b8fb886ae39ce5c5ec80
parent	32e51f141fd8d880f57b6a2eb53ce72856254d4a
parent	d183e11a4a66d80e10d60b0918a47cf073135379
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits)
  jbd2: Add MAINTAINERS entry
  jbd2: fix a potential leak of a journal_head on an error path
  ext4: teach ext4_ext_split to calculate extents efficiently
  ext4: Convert ext4 to new truncate calling convention
  ext4: do not normalize block requests from fallocate()
  ext4: enable "punch hole" functionality
  ext4: add "punch hole" flag to ext4_map_blocks()
  ext4: punch out extents
  ext4: add new function ext4_block_zero_page_range()
  ext4: add flag to ext4_has_free_blocks
  ext4: reserve inodes and feature code for 'quota' feature
  ext4: add support for multiple mount protection
  ext4: ensure f_bfree returned by ext4_statfs() is non-negative
  ext4: protect bb_first_free in ext4_trim_all_free() with group lock
  ext4: only load buddy bitmap in ext4_trim_fs() when it is needed
  jbd2: Fix comment to match the code in jbd2__journal_start()
  ext4: fix waiting and sending of a barrier in ext4_sync_file()
  jbd2: Add function jbd2_trans_will_send_data_barrier()
  jbd2: fix sending of data flush on journal commit
  ext4: fix ext4_ext_fiemap_cb() to handle blocks before request range correctly
  ...
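The headline item in this pull is hole punching. As a rough illustration of what the new code path enables, here is a minimal userspace sketch (file name and offsets are invented for the example) that drives it through fallocate(2); ext4 requires FALLOC_FL_KEEP_SIZE to be set together with FALLOC_FL_PUNCH_HOLE:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Punch a 1 MiB hole at offset 4 MiB; the file size stays the
	 * same and the underlying extents are deallocated. */
	int fd = open("/mnt/ext4/testfile", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}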
-rw-r--r--  Documentation/filesystems/ext4.txt |    4
-rw-r--r--  MAINTAINERS                        |   13
-rw-r--r--  fs/ext4/Makefile                   |    3
-rw-r--r--  fs/ext4/balloc.c                   |  146
-rw-r--r--  fs/ext4/ext4.h                     |  127
-rw-r--r--  fs/ext4/ext4_jbd2.c                |   14
-rw-r--r--  fs/ext4/ext4_jbd2.h                |    5
-rw-r--r--  fs/ext4/extents.c                  | 1410
-rw-r--r--  fs/ext4/file.c                     |    1
-rw-r--r--  fs/ext4/fsync.c                    |   25
-rw-r--r--  fs/ext4/inode.c                    |  114
-rw-r--r--  fs/ext4/mballoc.c                  |  459
-rw-r--r--  fs/ext4/mballoc.h                  |    6
-rw-r--r--  fs/ext4/migrate.c                  |    2
-rw-r--r--  fs/ext4/mmp.c                      |  351
-rw-r--r--  fs/ext4/move_extent.c              |    3
-rw-r--r--  fs/ext4/namei.c                    |   82
-rw-r--r--  fs/ext4/page-io.c                  |   39
-rw-r--r--  fs/ext4/super.c                    |  204
-rw-r--r--  fs/ext4/xattr.c                    |    4
-rw-r--r--  fs/jbd2/commit.c                   |   22
-rw-r--r--  fs/jbd2/journal.c                  |   58
-rw-r--r--  fs/jbd2/transaction.c              |   22
-rw-r--r--  include/linux/jbd2.h               |    8
24 files changed, 2019 insertions(+), 1103 deletions(-)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index c79ec58fd7f6..3ae9bc94352a 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support.
 noacl			This option disables POSIX Access Control List
 			support.
 
-reservation
-
-noreservation
-
 bsddf		(*)	Make 'df' act like BSD.
 minixdf			Make 'df' act like Minix.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ab17de642e5..d54d551004f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3572,9 +3572,16 @@ M: Andrew Morton <akpm@linux-foundation.org>
 M:	Jan Kara <jack@suse.cz>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd*/
-F:	include/linux/ext*jbd*.h
-F:	include/linux/jbd*.h
+F:	fs/jbd/
+F:	include/linux/ext3_jbd.h
+F:	include/linux/jbd.h
+
+JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
+M:	"Theodore Ts'o" <tytso@mit.edu>
+L:	linux-ext4@vger.kernel.org
+S:	Maintained
+F:	fs/jbd2/
+F:	include/linux/jbd2.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao <leitao@linux.vnet.ibm.com>
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:		handle to this transaction
- * @sb:			super block
- * @block:		start physical block to add to the block group
- * @count:		number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata. Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope. Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+		ext4_fsblk_t goal, unsigned int flags,
+		unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
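For orientation, a hedged sketch of how a caller might use the widened allocation interface; handle, inode, and goal are assumed to be in scope, and this fragment is illustrative rather than taken from the patch:

	unsigned long count = 1;
	int err;
	ext4_fsblk_t newblock;

	/* Passing EXT4_MB_USE_ROOT_BLOCKS through the new flags argument
	 * lets ext4_has_free_blocks() fall back to the root reserve. */
	newblock = ext4_new_meta_blocks(handle, inode, goal,
					EXT4_MB_USE_ROOT_BLOCKS,
					&count, &err);
	if (newblock == 0)
		return err;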
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS	0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16  s_want_extra_isize;	/* New inodes should reserve # bytes */
 	__le32  s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;		/* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;	/* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;		/* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;	/* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
-
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1615,6 +1639,67 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 				unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 			     __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			int len,
 			struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
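As a reading aid for the MMP constants above, a sketch of the initial check-interval arithmetic the comments describe (scale the update interval by EXT4_MMP_CHECK_MULT, then clamp to the documented bounds); the helper name is hypothetical, and the real logic, which also adapts to observed write latency, lives in fs/ext4/mmp.c:

static unsigned long mmp_initial_check_interval(unsigned long update_interval)
{
	unsigned long interval = EXT4_MMP_CHECK_MULT * update_interval;

	/* Clamp to the EXT4_MMP_{MIN,MAX}_CHECK_INTERVAL bounds. */
	if (interval < EXT4_MMP_MIN_CHECK_INTERVAL)
		interval = EXT4_MMP_MIN_CHECK_INTERVAL;
	if (interval > EXT4_MMP_MAX_CHECK_INTERVAL)
		interval = EXT4_MMP_MAX_CHECK_INTERVAL;
	return interval;
}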
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
 
 #include <trace/events/ext4.h>
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh)
-{
-	int err = 0;
-
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_get_undo_access(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__, bh,
-						  handle, err);
-	}
-	return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
 				const char *err_fn,
 		struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh);
 
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
 			      handle_t *handle, struct super_block *sb);
 
-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
 
 #include <trace/events/ext4.h>
 
+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
 	return newblock;
 }
 
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	}
 	ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_uninitialized(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  * - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path,
-				struct ext4_extent *newext, int at)
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
 	struct ext4_extent_header *neh;
 	struct ext4_extent_idx *fidx;
-	struct ext4_extent *ex;
 	int i = at, k, m, a;
 	ext4_fsblk_t newblock, oldblock;
 	__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
 		newblock = ext4_ext_new_meta_block(handle, inode, path,
-						   newext, &err);
+						   newext, &err, flags);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
-	ex = EXT_FIRST_EXTENT(neh);
 
 	/* move remainder of path[depth] to the new leaf */
 	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 	/* start copy from next extent */
-	/* TODO: we could do it by single memmove */
-	m = 0;
-	path[depth].p_ext++;
-	while (path[depth].p_ext <=
-			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext4_ext_pblock(path[depth].p_ext),
-				ext4_ext_is_uninitialized(path[depth].p_ext),
-				ext4_ext_get_actual_len(path[depth].p_ext),
-				newblock);
-		/*memmove(ex++, path[depth].p_ext++,
-			sizeof(struct ext4_extent));
-		neh->eh_entries++;*/
-		path[depth].p_ext++;
-		m++;
-	}
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
 	if (m) {
-		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
 		le16_add_cpu(&neh->eh_entries, m);
 	}
 
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
 				i, newblock, le32_to_cpu(border), oldblock);
-		/* copy indexes */
-		m = 0;
-		path[i].p_idx++;
 
-		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-				EXT_MAX_INDEX(path[i].p_hdr));
+		/* move remainder of path[i] to the new index block */
 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
 				EXT_LAST_INDEX(path[i].p_hdr))) {
 			EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			err = -EIO;
 			goto cleanup;
 		}
-		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-			ext_debug("%d: move %d:%llu in new index %llu\n", i,
-					le32_to_cpu(path[i].p_idx->ei_block),
-					ext4_idx_pblock(path[i].p_idx),
-					newblock);
-			/*memmove(++fidx, path[i].p_idx++,
-				sizeof(struct ext4_extent_idx));
-			neh->eh_entries++;
-			BUG_ON(neh->eh_entries > neh->eh_max);*/
-			path[i].p_idx++;
-			m++;
-		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
 		if (m) {
-			memmove(++fidx, path[i].p_idx - m,
+			memmove(++fidx, path[i].p_idx,
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
@@ -1056,8 +1073,9 @@ cleanup:
  * just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				 unsigned int flags,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+		newext, &err, flags);
 	if (newblock == 0)
 		return err;
 
@@ -1140,8 +1159,9 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				    unsigned int flags,
+				    struct ext4_ext_path *path,
+				    struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
 	if (EXT_HAS_FREE_INDEX(curp)) {
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
-		err = ext4_ext_split(handle, inode, path, newext, i);
+		err = ext4_ext_split(handle, inode, flags, path, newext, i);
 		if (err)
 			goto out;
 
@@ -1174,7 +1194,8 @@ repeat:
 		err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags,
+					    path, newext);
 		if (err)
 			goto out;
 
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *ex)
 {
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 }
 
 /*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	struct ext4_extent_header *eh;
+	unsigned int depth;
+	int merge_done = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	if (ex > EXT_FIRST_EXTENT(eh))
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+	if (!merge_done)
+		ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+	return ret;
+}
+
+/*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
  *
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
+	int flags = 0;
 
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
 	depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
+ * ext4_ext_check_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The file's inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-		  struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+	struct ext4_ext_cache *ex){
 	struct ext4_ext_cache *cex;
+	struct ext4_sb_info *sbi;
 	int ret = 0;
 
 	/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
+	sbi = EXT4_SB(inode->i_sb);
 
 	/* has cache valid data? */
 	if (cex->ec_len == 0)
 		goto errout;
 
 	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
 		ret = 1;
 	}
 errout:
+	if (!ret)
+		sbi->extent_cache_misses++;
+	else
+		sbi->extent_cache_hits++;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
 }
 
 /*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * extent pointer.
+ *
+ * @inode: The file's inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+			struct ext4_extent *ex)
+{
+	struct ext4_ext_cache cex;
+	int ret = 0;
+
+	if (ext4_ext_check_cache(inode, block, &cex)) {
+		ex->ee_block = cpu_to_le32(cex.ec_block);
+		ext4_ext_store_pblock(ex, cex.ec_start);
+		ex->ee_len = cpu_to_le16(cex.ec_len);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+
+/*
  * ext4_ext_rm_idx:
  * removes index from the index block.
  * It's used in truncate case only, thus all requests are for
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		/* head removal */
+		ext4_lblk_t num;
+		ext4_fsblk_t start;
+
+		num = to - from;
+		start = ext4_ext_pblock(ex);
+
+		ext_debug("free first %u blocks starting %llu\n", num, start);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
+
 	} else {
 		printk(KERN_INFO "strange request: removal(2) "
 			"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode:  The file's inode
+ * @path:   The path to the leaf
+ * @start:  The first block to remove
+ * @end:    The last block to remove
+ */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start)
+		struct ext4_ext_path *path, ext4_lblk_t start,
+		ext4_lblk_t end)
 {
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	struct ext4_map_blocks map;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
-		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;
 
 		ext_debug("  border %u:%u\n", a, b);
 
-		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-			block = 0;
-			num = 0;
-			BUG();
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end <= ex_ee_block) {
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (a != ex_ee_block &&
+			   b != ex_ee_block + ex_ee_len - 1) {
+			/*
+			 * If this is a truncate, then this condition should
+			 * never happen because at least one of the end points
+			 * needs to be on the edge of the extent.
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				block = 0;
+				num = 0;
+				err = -EIO;
+				goto out;
+			}
+			/*
+			 * else this is a hole punch, so the extent needs to
+			 * be split since neither edge of the hole is on the
+			 * extent edge
+			 */
+			else{
+				map.m_pblk = ext4_ext_pblock(ex);
+				map.m_lblk = ex_ee_block;
+				map.m_len = b - ex_ee_block;
+
+				err = ext4_split_extent(handle,
+					inode, path, &map, 0,
+					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+					EXT4_GET_BLOCKS_PRE_IO);
+
+				if (err < 0)
+					goto out;
+
+				ex_ee_len = ext4_ext_get_actual_len(ex);
+
+				b = ex_ee_block+ex_ee_len - 1 < end ?
+					ex_ee_block+ex_ee_len - 1 : end;
+
+				/* Then remove tail of this extent */
+				block = ex_ee_block;
+				num = a - block;
+			}
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
 			block = ex_ee_block;
 			num = a - block;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
 			/* remove head of the extent */
-			block = a;
-			num = b - a;
-			/* there is no "make a hole" API yet */
-			BUG();
+			block = b;
+			num = ex_ee_block + ex_ee_len - b;
+
+			/*
+			 * If this is a truncate, this condition
+			 * should never happen
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				err = -EIO;
+				goto out;
+			}
 		} else {
 			/* remove whole extent: excellent! */
 			block = ex_ee_block;
 			num = 0;
-			BUG_ON(a != ex_ee_block);
-			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+			if (a != ex_ee_block) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				err = -EIO;
+				goto out;
+			}
+
+			if (b != ex_ee_block + ex_ee_len - 1) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				err = -EIO;
+				goto out;
+			}
 		}
 
 		/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (num == 0) {
 			/* this extent is removed; mark slot entirely unused */
 			ext4_ext_store_pblock(ex, 0);
-			le16_add_cpu(&eh->eh_entries, -1);
+		} else if (block != ex_ee_block) {
+			/*
+			 * If this was a head removal, then we need to update
+			 * the physical block since it is now at a different
+			 * location
+			 */
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
 		}
 
 		ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
+		/*
+		 * If the extent was completely released,
+		 * we need to remove it from the leaf
+		 */
+		if (num == 0) {
+			if (end != EXT_MAX_BLOCK) {
+				/*
+				 * For hole punching, we need to scoot all the
+				 * extents up when an extent is removed so that
+				 * we don't have blank extents in the middle
+				 */
+				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+					sizeof(struct ext4_extent));
+
+				/* Now get rid of the one at the end */
+				memset(EXT_LAST_EXTENT(eh), 0,
+					sizeof(struct ext4_extent));
+			}
+			le16_add_cpu(&eh->eh_entries, -1);
+		}
+
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
 		ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 		return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
2365 while (i >= 0 && err == 0) { 2574 while (i >= 0 && err == 0) {
2366 if (i == depth) { 2575 if (i == depth) {
2367 /* this is leaf block */ 2576 /* this is leaf block */
2368 err = ext4_ext_rm_leaf(handle, inode, path, start); 2577 err = ext4_ext_rm_leaf(handle, inode, path,
2578 start, end);
2369 /* root level has p_bh == NULL, brelse() eats this */ 2579 /* root level has p_bh == NULL, brelse() eats this */
2370 brelse(path[i].p_bh); 2580 brelse(path[i].p_bh);
2371 path[i].p_bh = NULL; 2581 path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2529 return ret; 2739 return ret;
2530} 2740}
2531 2741
2742/*
2743 * used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
2756 * @split: the logical block where the extent is split.
2757 * @split_flag: indicates whether the extent could be zeroed out if the split
2758 *	fails, and the states (init or uninit) of the new extents.
2759 * @flags: flags used to insert new extent to extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2763 * of which are determined by split_flag.
2764 *
2765 * There are two cases:
2766 *   a> the extent is split into two extents.
2767 *   b> no split is needed; the extent is just marked.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785	ext_debug("ext4_split_extent_at: inode %lu, logical"
2786		" block %llu\n", inode->i_ino, (unsigned long long)split);
2787
2788 ext4_ext_show_leaf(inode, path);
2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2795
2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2797
2798 err = ext4_ext_get_access(handle, inode, path + depth);
2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2810 else
2811 ext4_ext_mark_initialized(ex);
2812
2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2814 ext4_ext_try_to_merge(inode, path, ex);
2815
2816 err = ext4_ext_dirty(handle, inode, path + depth);
2817 goto out;
2818 }
2819
2820 /* case a */
2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2822 ex->ee_len = cpu_to_le16(split - ee_block);
2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2824 ext4_ext_mark_uninitialized(ex);
2825
2826 /*
2827 * path may lead to new leaf, not to original leaf any more
2828 * after ext4_ext_insert_extent() returns,
2829 */
2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2833
2834 ex2 = &newex;
2835 ex2->ee_block = cpu_to_le32(split);
2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2837 ext4_ext_store_pblock(ex2, newblock);
2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2839 ext4_ext_mark_uninitialized(ex2);
2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
2847		ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
2863
2864/*
2865 * ext4_split_extent() splits an extent and marks the extent covered
2866 * by @map as split_flag indicates.
2867 *
2868 * It may result in splitting the extent into multiple extents (up to three).
2869 * There are three possibilities:
2870 *   a> There is no split required
2871 *   b> Splits into two extents: the split happens at either end of the extent
2872 *   c> Splits into three extents: someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2906 }
2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2929}
2930
2532#define EXT4_EXT_ZERO_LEN 7 2931#define EXT4_EXT_ZERO_LEN 7
2533/* 2932/*
2534 * This function is called by ext4_ext_map_blocks() if someone tries to write 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2545 struct ext4_map_blocks *map, 2944 struct ext4_map_blocks *map,
2546 struct ext4_ext_path *path) 2945 struct ext4_ext_path *path)
2547{ 2946{
2548 struct ext4_extent *ex, newex, orig_ex; 2947 struct ext4_map_blocks split_map;
2549 struct ext4_extent *ex1 = NULL; 2948 struct ext4_extent zero_ex;
2550 struct ext4_extent *ex2 = NULL; 2949 struct ext4_extent *ex;
2551 struct ext4_extent *ex3 = NULL;
2552 struct ext4_extent_header *eh;
2553 ext4_lblk_t ee_block, eof_block; 2950 ext4_lblk_t ee_block, eof_block;
2554 unsigned int allocated, ee_len, depth; 2951 unsigned int allocated, ee_len, depth;
2555 ext4_fsblk_t newblock;
2556 int err = 0; 2952 int err = 0;
2557 int ret = 0; 2953 int split_flag = 0;
2558 int may_zeroout;
2559 2954
2560 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2561 "block %llu, max_blocks %u\n", inode->i_ino, 2956 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2567 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2568 2963
2569 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2570 eh = path[depth].p_hdr;
2571 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2572 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2573 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2574 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2576
2577 ex2 = ex;
2578 orig_ex.ee_block = ex->ee_block;
2579 orig_ex.ee_len = cpu_to_le16(ee_len);
2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2581 2969
2970 WARN_ON(map->m_lblk < ee_block);
2582 /* 2971 /*
2583 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
2584	 * zeroout only if extent is fully inside i_size or new_size. 2973	 * zeroout only if extent is fully inside i_size or new_size.
2585 */ 2974 */
2586 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2587 2976
2588 err = ext4_ext_get_access(handle, inode, path + depth);
2589 if (err)
2590 goto out;
2591	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zeroout directly */ 2977	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zeroout directly */
2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2593 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2594 if (err) 2981 if (err)
2595 goto fix_extent_len;
2596 /* update the extent length and mark as initialized */
2597 ex->ee_block = orig_ex.ee_block;
2598 ex->ee_len = orig_ex.ee_len;
2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2600 ext4_ext_dirty(handle, inode, path + depth);
2601 /* zeroed the full extent */
2602 return allocated;
2603 }
2604
2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2606 if (map->m_lblk > ee_block) {
2607 ex1 = ex;
2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2609 ext4_ext_mark_uninitialized(ex1);
2610 ex2 = &newex;
2611 }
2612 /*
2613 * for sanity, update the length of the ex2 extent before
2614 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2615 * overlap of blocks.
2616 */
2617 if (!ex1 && allocated > map->m_len)
2618 ex2->ee_len = cpu_to_le16(map->m_len);
2619 /* ex3: to ee_block + ee_len : uninitialised */
2620 if (allocated > map->m_len) {
2621 unsigned int newdepth;
2622 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2624 /*
2625 * map->m_lblk == ee_block is handled by the zerouout
2626 * at the beginning.
2627 * Mark first half uninitialized.
2628 * Mark second half initialized and zero out the
2629 * initialized extent
2630 */
2631 ex->ee_block = orig_ex.ee_block;
2632 ex->ee_len = cpu_to_le16(ee_len - allocated);
2633 ext4_ext_mark_uninitialized(ex);
2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2635 ext4_ext_dirty(handle, inode, path + depth);
2636
2637 ex3 = &newex;
2638 ex3->ee_block = cpu_to_le32(map->m_lblk);
2639 ext4_ext_store_pblock(ex3, newblock);
2640 ex3->ee_len = cpu_to_le16(allocated);
2641 err = ext4_ext_insert_extent(handle, inode, path,
2642 ex3, 0);
2643 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err)
2646 goto fix_extent_len;
2647 ex->ee_block = orig_ex.ee_block;
2648 ex->ee_len = orig_ex.ee_len;
2649 ext4_ext_store_pblock(ex,
2650 ext4_ext_pblock(&orig_ex));
2651 ext4_ext_dirty(handle, inode, path + depth);
2652 /* blocks available from map->m_lblk */
2653 return allocated;
2654
2655 } else if (err)
2656 goto fix_extent_len;
2657
2658 /*
2659 * We need to zero out the second half because
2660 * an fallocate request can update file size and
2661 * converting the second half to initialized extent
2662 * implies that we can leak some junk data to user
2663 * space.
2664 */
2665 err = ext4_ext_zeroout(inode, ex3);
2666 if (err) {
2667 /*
2668 * We should actually mark the
2669 * second half as uninit and return error
2670 * Insert would have changed the extent
2671 */
2672 depth = ext_depth(inode);
2673 ext4_ext_drop_refs(path);
2674 path = ext4_ext_find_extent(inode, map->m_lblk,
2675 path);
2676 if (IS_ERR(path)) {
2677 err = PTR_ERR(path);
2678 return err;
2679 }
2680 /* get the second half extent details */
2681 ex = path[depth].p_ext;
2682 err = ext4_ext_get_access(handle, inode,
2683 path + depth);
2684 if (err)
2685 return err;
2686 ext4_ext_mark_uninitialized(ex);
2687 ext4_ext_dirty(handle, inode, path + depth);
2688 return err;
2689 }
2690
2691 /* zeroed the second half */
2692 return allocated;
2693 }
2694 ex3 = &newex;
2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2696 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2698 ext4_ext_mark_uninitialized(ex3);
2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2700 if (err == -ENOSPC && may_zeroout) {
2701 err = ext4_ext_zeroout(inode, &orig_ex);
2702 if (err)
2703 goto fix_extent_len;
2704 /* update the extent length and mark as initialized */
2705 ex->ee_block = orig_ex.ee_block;
2706 ex->ee_len = orig_ex.ee_len;
2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2708 ext4_ext_dirty(handle, inode, path + depth);
2709 /* zeroed the full extent */
2710 /* blocks available from map->m_lblk */
2711 return allocated;
2712
2713 } else if (err)
2714 goto fix_extent_len;
2715 /*
2716 * The depth, and hence eh & ex might change
2717 * as part of the insert above.
2718 */
2719 newdepth = ext_depth(inode);
2720 /*
2721 * update the extent length after successful insert of the
2722 * split extent
2723 */
2724 ee_len -= ext4_ext_get_actual_len(ex3);
2725 orig_ex.ee_len = cpu_to_le16(ee_len);
2726 may_zeroout = ee_block + ee_len <= eof_block;
2727
2728 depth = newdepth;
2729 ext4_ext_drop_refs(path);
2730 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2731 if (IS_ERR(path)) {
2732 err = PTR_ERR(path);
2733 goto out; 2982 goto out;
2734 }
2735 eh = path[depth].p_hdr;
2736 ex = path[depth].p_ext;
2737 if (ex2 != &newex)
2738 ex2 = ex;
2739 2983
2740 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2741 if (err) 2985 if (err)
2742 goto out; 2986 goto out;
2743 2987 ext4_ext_mark_initialized(ex);
2744 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2745 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2746 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2747 * to insert a extent in the middle zerout directly
2748 * otherwise give the extent a chance to merge to left
2749 */
2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2751 map->m_lblk != ee_block && may_zeroout) {
2752 err = ext4_ext_zeroout(inode, &orig_ex);
2753 if (err)
2754 goto fix_extent_len;
2755 /* update the extent length and mark as initialized */
2756 ex->ee_block = orig_ex.ee_block;
2757 ex->ee_len = orig_ex.ee_len;
2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2759 ext4_ext_dirty(handle, inode, path + depth);
2760 /* zero out the first half */
2761 /* blocks available from map->m_lblk */
2762 return allocated;
2763 }
2764 }
2765 /*
2766 * If there was a change of depth as part of the
2767 * insertion of ex3 above, we need to update the length
2768 * of the ex1 extent again here
2769 */
2770 if (ex1 && ex1 != ex) {
2771 ex1 = ex;
2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2773 ext4_ext_mark_uninitialized(ex1);
2774 ex2 = &newex;
2775 }
2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2777 ex2->ee_block = cpu_to_le32(map->m_lblk);
2778 ext4_ext_store_pblock(ex2, newblock);
2779 ex2->ee_len = cpu_to_le16(allocated);
2780 if (ex2 != ex)
2781 goto insert;
2782 /*
2783 * New (initialized) extent starts from the first block
2784 * in the current extent. i.e., ex2 == ex
2785 * We have to see if it can be merged with the extent
2786 * on the left.
2787 */
2788 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2789 /*
2790 * To merge left, pass "ex2 - 1" to try_to_merge(),
2791 * since it merges towards right _only_.
2792 */
2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2794 if (ret) {
2795 err = ext4_ext_correct_indexes(handle, inode, path);
2796 if (err)
2797 goto out;
2798 depth = ext_depth(inode);
2799 ex2--;
2800 }
2801 } 2991 }
2992
2802 /* 2993 /*
2803 * Try to Merge towards right. This might be required 2994 * four cases:
2804 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2805 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
2998	 * 4. split the extent into two extents without zeroout.
2806 */ 2999 */
2807 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2809 if (ret) { 3002
2810 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2811 if (err) 3013 if (err)
2812 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2813 } 3035 }
2814 } 3036 }
2815 /* Mark modified extent as dirty */ 3037
2816 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2817 goto out; 3039 &split_map, split_flag, 0);
2818insert: 3040 if (allocated < 0)
2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2820 if (err == -ENOSPC && may_zeroout) { 3042
2821 err = ext4_ext_zeroout(inode, &orig_ex);
2822 if (err)
2823 goto fix_extent_len;
2824 /* update the extent length and mark as initialized */
2825 ex->ee_block = orig_ex.ee_block;
2826 ex->ee_len = orig_ex.ee_len;
2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2828 ext4_ext_dirty(handle, inode, path + depth);
2829 /* zero out the first half */
2830 return allocated;
2831 } else if (err)
2832 goto fix_extent_len;
2833out: 3043out:
2834 ext4_ext_show_leaf(inode, path);
2835 return err ? err : allocated; 3044 return err ? err : allocated;
2836
2837fix_extent_len:
2838 ex->ee_block = orig_ex.ee_block;
2839 ex->ee_len = orig_ex.ee_len;
2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2841 ext4_ext_mark_uninitialized(ex);
2842 ext4_ext_dirty(handle, inode, path + depth);
2843 return err;
2844} 3045}
2845 3046
2846/* 3047/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2871 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2872 int flags) 3073 int flags)
2873{ 3074{
2874 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2875 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2876 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2877 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2878 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2879 unsigned int allocated, ee_len, depth;
2880 ext4_fsblk_t newblock;
2881 int err = 0;
2882 int may_zeroout;
2883 3080
2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2885 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2889 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2890 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2891 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2892
2893 depth = ext_depth(inode);
2894 ex = path[depth].p_ext;
2895 ee_block = le32_to_cpu(ex->ee_block);
2896 ee_len = ext4_ext_get_actual_len(ex);
2897 allocated = ee_len - (map->m_lblk - ee_block);
2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2899
2900 ex2 = ex;
2901 orig_ex.ee_block = ex->ee_block;
2902 orig_ex.ee_len = cpu_to_le16(ee_len);
2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2904
2905 /* 3089 /*
2906 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
2907	 * zeroout only if extent is fully inside i_size or new_size. 3091	 * zeroout only if extent is fully inside i_size or new_size.
2908 */ 3092 */
2909 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2910 3094 ex = path[depth].p_ext;
2911 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2912 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2913 * block where the write begins, and the write completely
2914 * covers the extent, then we don't need to split it.
2915 */
2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2917 return allocated;
2918
2919 err = ext4_ext_get_access(handle, inode, path + depth);
2920 if (err)
2921 goto out;
2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2923 if (map->m_lblk > ee_block) {
2924 ex1 = ex;
2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2926 ext4_ext_mark_uninitialized(ex1);
2927 ex2 = &newex;
2928 }
2929 /*
2930 * for sanity, update the length of the ex2 extent before
2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2932 * overlap of blocks.
2933 */
2934 if (!ex1 && allocated > map->m_len)
2935 ex2->ee_len = cpu_to_le16(map->m_len);
2936 /* ex3: to ee_block + ee_len : uninitialised */
2937 if (allocated > map->m_len) {
2938 unsigned int newdepth;
2939 ex3 = &newex;
2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2941 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2943 ext4_ext_mark_uninitialized(ex3);
2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2945 if (err == -ENOSPC && may_zeroout) {
2946 err = ext4_ext_zeroout(inode, &orig_ex);
2947 if (err)
2948 goto fix_extent_len;
2949 /* update the extent length and mark as initialized */
2950 ex->ee_block = orig_ex.ee_block;
2951 ex->ee_len = orig_ex.ee_len;
2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2953 ext4_ext_dirty(handle, inode, path + depth);
2954 /* zeroed the full extent */
2955 /* blocks available from map->m_lblk */
2956 return allocated;
2957
2958 } else if (err)
2959 goto fix_extent_len;
2960 /*
2961 * The depth, and hence eh & ex might change
2962 * as part of the insert above.
2963 */
2964 newdepth = ext_depth(inode);
2965 /*
2966 * update the extent length after successful insert of the
2967 * split extent
2968 */
2969 ee_len -= ext4_ext_get_actual_len(ex3);
2970 orig_ex.ee_len = cpu_to_le16(ee_len);
2971 may_zeroout = ee_block + ee_len <= eof_block;
2972
2973 depth = newdepth;
2974 ext4_ext_drop_refs(path);
2975 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2976 if (IS_ERR(path)) {
2977 err = PTR_ERR(path);
2978 goto out;
2979 }
2980 ex = path[depth].p_ext;
2981 if (ex2 != &newex)
2982 ex2 = ex;
2983 3097
2984 err = ext4_ext_get_access(handle, inode, path + depth); 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2985 if (err) 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
2986 goto out;
2987 3100
2988 allocated = map->m_len; 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
2989 } 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
2990 /*
2991 * If there was a change of depth as part of the
2992 * insertion of ex3 above, we need to update the length
2993 * of the ex1 extent again here
2994 */
2995 if (ex1 && ex1 != ex) {
2996 ex1 = ex;
2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2998 ext4_ext_mark_uninitialized(ex1);
2999 ex2 = &newex;
3000 }
3001 /*
3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3003 * using direct I/O, uninitialised still.
3004 */
3005 ex2->ee_block = cpu_to_le32(map->m_lblk);
3006 ext4_ext_store_pblock(ex2, newblock);
3007 ex2->ee_len = cpu_to_le16(allocated);
3008 ext4_ext_mark_uninitialized(ex2);
3009 if (ex2 != ex)
3010 goto insert;
3011 /* Mark modified extent as dirty */
3012 err = ext4_ext_dirty(handle, inode, path + depth);
3013 ext_debug("out here\n");
3014 goto out;
3015insert:
3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3017 if (err == -ENOSPC && may_zeroout) {
3018 err = ext4_ext_zeroout(inode, &orig_ex);
3019 if (err)
3020 goto fix_extent_len;
3021 /* update the extent length and mark as initialized */
3022 ex->ee_block = orig_ex.ee_block;
3023 ex->ee_len = orig_ex.ee_len;
3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3025 ext4_ext_dirty(handle, inode, path + depth);
3026 /* zero out the first half */
3027 return allocated;
3028 } else if (err)
3029 goto fix_extent_len;
3030out:
3031 ext4_ext_show_leaf(inode, path);
3032 return err ? err : allocated;
3033
3034fix_extent_len:
3035 ex->ee_block = orig_ex.ee_block;
3036 ex->ee_len = orig_ex.ee_len;
3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3038 ext4_ext_mark_uninitialized(ex);
3039 ext4_ext_dirty(handle, inode, path + depth);
3040 return err;
3041} 3103}
3104
3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3043 struct inode *inode, 3106 struct inode *inode,
3044 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3047 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3048 int depth; 3111 int depth;
3049 int err = 0; 3112 int err = 0;
3050 int ret = 0;
3051 3113
3052 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3053 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3054 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3055 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119		" block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3056 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3057 if (err) 3124 if (err)
3058 goto out; 3125 goto out;
3059 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3060 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3061 3128
3062 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3063 * We have to see if it can be merged with the extent 3130 * borders are not changed
3064 * on the left.
3065 */
3066 if (ex > EXT_FIRST_EXTENT(eh)) {
3067 /*
3068 * To merge left, pass "ex - 1" to try_to_merge(),
3069 * since it merges towards right _only_.
3070 */
3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3072 if (ret) {
3073 err = ext4_ext_correct_indexes(handle, inode, path);
3074 if (err)
3075 goto out;
3076 depth = ext_depth(inode);
3077 ex--;
3078 }
3079 }
3080 /*
3081 * Try to Merge towards right.
3082 */ 3131 */
3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3084 if (ret) { 3133
3085 err = ext4_ext_correct_indexes(handle, inode, path);
3086 if (err)
3087 goto out;
3088 depth = ext_depth(inode);
3089 }
3090 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3091 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3092out: 3136out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3302 ext4_fsblk_t newblock = 0; 3346 ext4_fsblk_t newblock = 0;
3303 int err = 0, depth, ret; 3347 int err = 0, depth, ret;
3304 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3305 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3307 3354
3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3309 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3311 3358
3312 /* check in cache */ 3359 /* check in cache */
3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3316 /* 3364 /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3376 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3377 3425
3378 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3379 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3380 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3381 ee_len, ee_start); 3429 * in the cache
3382 goto out; 3430 */
3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3383 } 3440 }
3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3385 inode, map, path, flags, allocated, 3442 /*
3386 newblock); 3443 * Punch out the map length, but only to the
3387 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
3450			 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3388 } 3504 }
3389 } 3505 }
3390 3506
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3446 else 3562 else
3447 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3448 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3450 if (!newblock) 3568 if (!newblock)
3451 goto out2; 3569 goto out2;
@@ -3529,7 +3647,11 @@ out2:
3529 } 3647 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated); 3649 newblock, map->m_len, err ? err : allocated);
3532 return err ? err : allocated; 3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3533} 3655}
3534 3656
3535void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3577 3699
3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3580 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
3581 3703
3582 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3583 * transaction synchronous. 3705 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3585 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3586 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3587 3709
3588out_stop:
3589 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3590 /* 3713 /*
3591 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3592 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3651 struct ext4_map_blocks map; 3774 struct ext4_map_blocks map;
3652 unsigned int credits, blkbits = inode->i_blkbits; 3775 unsigned int credits, blkbits = inode->i_blkbits;
3653 3776
3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3655 if (mode & ~FALLOC_FL_KEEP_SIZE)
3656 return -EOPNOTSUPP;
3657
3658 /* 3777 /*
3659 * currently supporting (pre)allocate mode for extent-based 3778 * currently supporting (pre)allocate mode for extent-based
3660 * files _only_ 3779 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3663 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3664 3783
3784 /* Return error if mode is not supported */
3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3790
3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3666 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3667 /* 3793 /*
@@ -3691,7 +3817,8 @@ retry:
3691 break; 3817 break;
3692 } 3818 }
3693 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3695 if (ret <= 0) { 3822 if (ret <= 0) {
3696#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3697 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3822 pgoff_t last_offset; 3949 pgoff_t last_offset;
3823 pgoff_t offset; 3950 pgoff_t offset;
3824 pgoff_t index; 3951 pgoff_t index;
3952 pgoff_t start_index = 0;
3825 struct page **pages = NULL; 3953 struct page **pages = NULL;
3826 struct buffer_head *bh = NULL; 3954 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL; 3955 struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
3848 kfree(pages); 3976 kfree(pages);
3849 return EXT_CONTINUE; 3977 return EXT_CONTINUE;
3850 } 3978 }
3979 index = 0;
3851 3980
3981next_page:
3852 /* Try to find the 1st mapped buffer. */ 3982 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >> 3983 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3854 blksize_bits; 3984 blksize_bits;
3855 if (!page_has_buffers(pages[0])) 3985 if (!page_has_buffers(pages[index]))
3856 goto out; 3986 goto out;
3857 head = page_buffers(pages[0]); 3987 head = page_buffers(pages[index]);
3858 if (!head) 3988 if (!head)
3859 goto out; 3989 goto out;
3860 3990
3991 index++;
3861 bh = head; 3992 bh = head;
3862 do { 3993 do {
3863 if (buffer_mapped(bh)) { 3994 if (end >= newex->ec_block +
3995 newex->ec_len)
3996 /* The buffer is out of
3997 * the request range.
3998 */
3999 goto out;
4000
4001 if (buffer_mapped(bh) &&
4002 end >= newex->ec_block) {
4003 start_index = index - 1;
3864 /* get the 1st mapped buffer. */ 4004 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer; 4005 goto found_mapped_buffer;
3872 } 4006 }
4007
3873 bh = bh->b_this_page; 4008 bh = bh->b_this_page;
3874 end++; 4009 end++;
3875 } while (bh != head); 4010 } while (bh != head);
3876 4011
3877 /* No mapped buffer found. */ 4012 /* No mapped buffer found in the range on this page;
3878 goto out; 4013 * we need to look at the next page.
4014 */
4015 if (index >= ret) {
4016 /* There is no page left, but we need to limit
4017 * newex->ec_len.
4018 */
4019 newex->ec_len = end - newex->ec_block;
4020 goto out;
4021 }
4022 goto next_page;
3879 } else { 4023 } else {
3880 /* Find contiguous delayed buffers. */ 4024 /* Find contiguous delayed buffers. */
3881 if (ret > 0 && pages[0]->index == last_offset) 4025 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]); 4026 head = page_buffers(pages[0]);
3883 bh = head; 4027 bh = head;
4028 index = 1;
4029 start_index = 0;
3884 } 4030 }
3885 4031
3886found_mapped_buffer: 4032found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
3903 end++; 4049 end++;
3904 } while (bh != head); 4050 } while (bh != head);
3905 4051
3906 for (index = 1; index < ret; index++) { 4052 for (; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) { 4053 if (!page_has_buffers(pages[index])) {
3908 bh = NULL; 4054 bh = NULL;
3909 break; 4055 break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
3913 bh = NULL; 4059 bh = NULL;
3914 break; 4060 break;
3915 } 4061 }
4062
3916 if (pages[index]->index != 4063 if (pages[index]->index !=
3917 pages[0]->index + index) { 4064 pages[start_index]->index + index
4065 - start_index) {
3918 /* Blocks are not contiguous. */ 4066 /* Blocks are not contiguous. */
3919 bh = NULL; 4067 bh = NULL;
3920 break; 4068 break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
4006 return (error < 0 ? error : 0); 4154 return (error < 0 ? error : 0);
4007} 4155}
4008 4156
4157/*
4158 * ext4_ext_punch_hole
4159 *
4160 * Punches a hole of "length" bytes in a file starting
4161 * at byte "offset"
4162 *
4163 * @inode: The inode of the file to punch a hole in
4164 * @offset: The starting byte offset of the hole
4165 * @length: The length of the hole
4166 *
4167 * Returns the number of blocks removed or a negative value on error
4168 */
4169int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170{
4171 struct inode *inode = file->f_path.dentry->d_inode;
4172 struct super_block *sb = inode->i_sb;
4173 struct ext4_ext_cache cache_ex;
4174 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 struct address_space *mapping = inode->i_mapping;
4176 struct ext4_map_blocks map;
4177 handle_t *handle;
4178 loff_t first_block_offset, last_block_offset, block_len;
4179 loff_t first_page, last_page, first_page_offset, last_page_offset;
4180 int ret, credits, blocks_released, err = 0;
4181
4182 first_block = (offset + sb->s_blocksize - 1) >>
4183 EXT4_BLOCK_SIZE_BITS(sb);
4184 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4185
4186 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4187 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4188
4189 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4190 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4191
4192 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4193 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4194
4195 /*
4196 * Write out all dirty pages to avoid race conditions
4197 * Then release them.
4198 */
4199 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4200 err = filemap_write_and_wait_range(mapping,
4201 first_page_offset == 0 ? 0 : first_page_offset-1,
4202 last_page_offset);
4203
4204 if (err)
4205 return err;
4206 }
4207
4208 /* Now release the pages */
4209 if (last_page_offset > first_page_offset) {
4210 truncate_inode_pages_range(mapping, first_page_offset,
4211 last_page_offset-1);
4212 }
4213
4214 /* finish any pending end_io work */
4215 ext4_flush_completed_IO(inode);
4216
4217 credits = ext4_writepage_trans_blocks(inode);
4218 handle = ext4_journal_start(inode, credits);
4219 if (IS_ERR(handle))
4220 return PTR_ERR(handle);
4221
4222 err = ext4_orphan_add(handle, inode);
4223 if (err)
4224 goto out;
4225
4226 /*
4227	 * Now we need to zero out the non-block-aligned data.
4228	 * If the hole lies entirely within a single block, just
4229	 * zero out the middle of that block
4230 */
4231 if (first_block > last_block)
4232 ext4_block_zero_page_range(handle, mapping, offset, length);
4233 else {
4234 /* zero out the head of the hole before the first block */
4235 block_len = first_block_offset - offset;
4236 if (block_len > 0)
4237 ext4_block_zero_page_range(handle, mapping,
4238 offset, block_len);
4239
4240 /* zero out the tail of the hole after the last block */
4241 block_len = offset + length - last_block_offset;
4242 if (block_len > 0) {
4243 ext4_block_zero_page_range(handle, mapping,
4244 last_block_offset, block_len);
4245 }
4246 }
4247
4248 /* If there are no blocks to remove, return now */
4249 if (first_block >= last_block)
4250 goto out;
4251
4252 down_write(&EXT4_I(inode)->i_data_sem);
4253 ext4_ext_invalidate_cache(inode);
4254 ext4_discard_preallocations(inode);
4255
4256 /*
4257 * Loop over all the blocks and identify blocks
4258 * that need to be punched out
4259 */
4260 iblock = first_block;
4261 blocks_released = 0;
4262 while (iblock < last_block) {
4263 max_blocks = last_block - iblock;
4264 num_blocks = 1;
4265 memset(&map, 0, sizeof(map));
4266 map.m_lblk = iblock;
4267 map.m_len = max_blocks;
4268 ret = ext4_ext_map_blocks(handle, inode, &map,
4269 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4270
4271 if (ret > 0) {
4272 blocks_released += ret;
4273 num_blocks = ret;
4274 } else if (ret == 0) {
4275 /*
4276 * If map blocks could not find the block,
4277 * then it is in a hole. If the hole was
4278 * not already cached, then map blocks should
4279 * put it in the cache. So we can get the hole
4280 * out of the cache
4281 */
4282 memset(&cache_ex, 0, sizeof(cache_ex));
4283 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4284 !cache_ex.ec_start) {
4285
4286 /* The hole is cached */
4287 num_blocks = cache_ex.ec_block +
4288 cache_ex.ec_len - iblock;
4289
4290 } else {
4291 /* The block could not be identified */
4292 err = -EIO;
4293 break;
4294 }
4295 } else {
4296 /* Map blocks error */
4297 err = ret;
4298 break;
4299 }
4300
4301 if (num_blocks == 0) {
4302 /* This condition should never happen */
4303 ext_debug("Block lookup failed");
4304 err = -EIO;
4305 break;
4306 }
4307
4308 iblock += num_blocks;
4309 }
4310
4311 if (blocks_released > 0) {
4312 ext4_ext_invalidate_cache(inode);
4313 ext4_discard_preallocations(inode);
4314 }
4315
4316 if (IS_SYNC(inode))
4317 ext4_handle_sync(handle);
4318
4319 up_write(&EXT4_I(inode)->i_data_sem);
4320
4321out:
4322 ext4_orphan_del(handle, inode);
4323 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4324 ext4_mark_inode_dirty(handle, inode);
4325 ext4_journal_stop(handle);
4326 return err;
4327}
4009int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4328int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4010 __u64 start, __u64 len) 4329 __u64 start, __u64 len)
4011{ 4330{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4042 4361
4043 return error; 4362 return error;
4044} 4363}
4045
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
272}; 272};
273 273
274const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
275 .truncate = ext4_truncate,
276 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
277 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
278#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
36 36
37static void dump_completed_IO(struct inode * inode) 37static void dump_completed_IO(struct inode * inode)
38{ 38{
39#ifdef EXT4_DEBUG 39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after; 40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1; 41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags; 42 unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 int ret; 173 int ret;
174 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
175 176
176 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
177 178
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
211 } 212 }
212 213
213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
214 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
215 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
216 * When the journal is on a different device than the 217 needs_barrier = true;
217 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
218 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
219 * will take care of issuing the barrier. In 220 if (needs_barrier)
220 * data=journal, all of the data blocks are written to
221 * the journal device.)
222 */
223 if (ext4_should_writeback_data(inode) &&
224 (journal->j_fs_dev != journal->j_dev) &&
225 (journal->j_flags & JBD2_BARRIER))
226 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
227 NULL);
228 ret = jbd2_log_wait_commit(journal, commit_tid);
229 } else if (journal->j_flags & JBD2_BARRIER)
230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out: 222 out:
232 trace_ext4_sync_file_exit(inode, ret); 223 trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 goal, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
@@ -1930,7 +1930,7 @@ repeat:
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 if (PageWriteback(page)) 2799 wait_on_page_writeback(page);
2800 wait_on_page_writeback(page);
2801
2802 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2803 2801
2804 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
3513 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3514 3512
3515 if (end > isize) 3513 if (end > isize)
3516 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3517 } 3515 }
3518 } 3516 }
3519 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
3916int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
3917 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3918{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block it will be shortened to the end of the block
3933 * that corresponds to 'from'
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
3919 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3920 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3921 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
3922 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3923 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3924 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
3931 return -EINVAL; 3950 return -EINVAL;
3932 3951
3933 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3934 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
3935 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3936 3963
3937 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4380 4407
4381int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4382{ 4409{
4383 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384 return 0;
4385 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4386 return 1; 4411 return 1;
4387 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4392} 4417}
4393 4418
4394/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437		/* TODO: Add support for non-extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
4444/*
4395 * ext4_truncate() 4445 * ext4_truncate()
4396 * 4446 *
4397 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4617 /* 4667 /*
4618 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4619 */ 4669 */
4620 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4621 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4622 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4623 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5311 5361
5312 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5313 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5314 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5315 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5316 handle_t *handle; 5365 handle_t *handle;
5317 5366
5318 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5346 goto err_out; 5395 goto err_out;
5347 } 5396 }
5348 } 5397 }
5349 /* ext4_truncate will clear the flag */
5350 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5351 ext4_truncate(inode);
5352 } 5398 }
5353 5399
5354 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5355 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5356 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5357 5407
5358 if (!rc) { 5408 if (!rc) {
5359 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5811 goto out_unlock; 5861 goto out_unlock;
5812 } 5862 }
5813 ret = 0; 5863 ret = 0;
5814 if (PageMappedToDisk(page)) 5864
5815 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
5816 5871
5817 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
5818 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
5819 else 5874 else
5820 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
5821 5876
5822 lock_page(page);
5823 /* 5877 /*
 5824 * return if we have all the buffers mapped. This avoids 5878
5825 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5829 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
5830 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5831 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
5832 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
5833 goto out_unlock; 5887 return VM_FAULT_LOCKED;
5834 } 5888 }
5835 } 5889 }
5836 unlock_page(page); 5890 unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5850 if (ret < 0) 5904 if (ret < 0)
5851 goto out_unlock; 5905 goto out_unlock;
5852 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
5853out_unlock: 5917out_unlock:
5854 if (ret) 5918 if (ret)
5855 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
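
The rewritten ext4_page_mkwrite() above now returns with the page locked and writeback drained, using the VM_FAULT_LOCKED convention, so writeback can no longer slip in between the handler finishing and the VM marking the page dirty. A stripped-down sketch of that convention (illustrative; not ext4's actual handler):

    /* Minimal .page_mkwrite handler following the VM_FAULT_LOCKED
     * convention; a sketch, not ext4's real implementation. */
    static int example_page_mkwrite(struct vm_area_struct *vma,
                                    struct vm_fault *vmf)
    {
            struct page *page = vmf->page;

            lock_page(page);
            /* Drain any in-flight writeback so the page contents stay
             * stable while new data is written into it. */
            wait_on_page_writeback(page);

            /* ... allocate and map blocks backing the page here ... */

            /* VM_FAULT_LOCKED tells the VM the page is returned locked;
             * the caller unlocks it after setting it dirty. */
            return VM_FAULT_LOCKED;
    }
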
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
790 791
791 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
792 793
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
819 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
820 break; 821 break;
821 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
822 err = -EIO; 835 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 837 if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
871 } 884 }
872 885
873 /* wait for I/O completion */ 886 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
875 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
876 890
877 err = -EIO; 891 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
879 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
880 goto out; 894 goto out;
881 895
882 err = 0; 896 err = 0;
883 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
887 int group; 899 int group;
888 struct ext4_group_info *grinfo;
889 900
890 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
891 if (group >= ngroups) 902 if (group >= ngroups)
892 break; 903 break;
893 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
894 /* 909 /*
895 * data carry information regarding this 910 * data carry information regarding this
896 * particular group in the format specified 911 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
919 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
920 */ 935 */
921 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
922 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
924 incore = NULL; 941 incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
948 965
949out: 966out:
950 if (bh) { 967 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
952 brelse(bh[i]); 969 brelse(bh[i]);
953 if (bh != &bhs) 970 if (bh != &bhs)
954 kfree(bh); 971 kfree(bh);
@@ -957,22 +974,21 @@ out:
957} 974}
958 975
959/* 976/*
 960 * lock the group_info alloc_sem of all the groups 977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
 961 * belonging to the same buddy cache page. This 978 * on the same buddy page doesn't happen while holding the buddy page lock.
 962 * make sure other parallel operation on the buddy 979 * Return the locked buddy and bitmap pages in the e4b struct. If buddy and bitmap
 963 * cache doesn't happen whild holding the buddy cache 980 * are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
964 * lock
965 */ 981 */
966static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
967 ext4_group_t group) 983 ext4_group_t group, struct ext4_buddy *e4b)
968{ 984{
969 int i; 985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
970 int block, pnum; 986 int block, pnum, poff;
971 int blocks_per_page; 987 int blocks_per_page;
972 int groups_per_page; 988 struct page *page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 989
974 ext4_group_t first_group; 990 e4b->bd_buddy_page = NULL;
975 struct ext4_group_info *grp; 991 e4b->bd_bitmap_page = NULL;
976 992
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 994 /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
982 */ 998 */
983 block = group * 2; 999 block = group * 2;
984 pnum = block / blocks_per_page; 1000 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 1001 poff = block % blocks_per_page;
986 1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
987 groups_per_page = blocks_per_page >> 1; 1003 if (!page)
988 if (groups_per_page == 0) 1004 return -EIO;
989 groups_per_page = 1; 1005 BUG_ON(page->mapping != inode->i_mapping);
990 /* read all groups the page covers into the cache */ 1006 e4b->bd_bitmap_page = page;
991 for (i = 0; i < groups_per_page; i++) { 1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
992 1008
993 if ((first_group + i) >= ngroups) 1009 if (blocks_per_page >= 2) {
994 break; 1010 /* buddy and bitmap are on the same page */
995 grp = ext4_get_group_info(sb, first_group + i); 1011 return 0;
996 /* take all groups write allocation
997 * semaphore. This make sure there is
998 * no block allocation going on in any
999 * of that groups
1000 */
1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1012 }
1003 return i; 1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1004} 1023}
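
The index arithmetic above follows from the buddy-cache layout: each group stores its block bitmap and buddy bitmap in two consecutive blocks of the buddy-cache inode, so the page number and in-page offset drop out of plain division. A standalone illustration of where group 5 lands (assumes a 4 KiB page size; not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned page_size = 4096, group = 5;

            for (unsigned blocksize = 1024; blocksize <= 4096; blocksize <<= 1) {
                    unsigned blocks_per_page = page_size / blocksize;
                    unsigned block = group * 2;  /* bitmap block; buddy is block + 1 */
                    unsigned pnum = block / blocks_per_page;
                    unsigned poff = block % blocks_per_page;

                    printf("blocksize %u: bitmap in page %u at offset %u, buddy %s\n",
                           blocksize, pnum, poff,
                           blocks_per_page >= 2 ? "on the same page"
                                                : "on the next page");
            }
            return 0;
    }

Because the bitmap block number is always even, any page holding at least two blocks also holds the buddy (block + 1), which is why the function above leaves e4b->bd_buddy_page NULL in that case.
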
1005 1024
1006static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1007 ext4_group_t group, int locked_group)
1008{ 1026{
1009 int i; 1027 if (e4b->bd_bitmap_page) {
1010 int block, pnum; 1028 unlock_page(e4b->bd_bitmap_page);
1011 int blocks_per_page; 1029 page_cache_release(e4b->bd_bitmap_page);
1012 ext4_group_t first_group; 1030 }
1013 struct ext4_group_info *grp; 1031 if (e4b->bd_buddy_page) {
1014 1032 unlock_page(e4b->bd_buddy_page);
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1033 page_cache_release(e4b->bd_buddy_page);
1016 /*
1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks.
1020 */
1021 block = group * 2;
1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) {
1026
1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* take all groups write allocation
1029 * semaphore. This make sure there is
1030 * no block allocation going on in any
1031 * of that groups
1032 */
1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035
1036} 1035}
1037 1036
1038/* 1037/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
1044int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045{ 1044{
1046 1045
1047 int ret = 0;
1048 void *bitmap;
1049 int blocks_per_page;
1050 int block, pnum, poff;
1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
1054 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
1055 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
1056 1050
1057 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
1060 /* 1053 /*
1061 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
 1062 * page which maps to the group from which we are already 1055 * page which maps to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
1065 * would have taken the alloc_sem lock. 1058 * would have pinned buddy page to page cache.
1066 */ 1059 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1062 /*
1070 * somebody initialized the group 1063 * somebody initialized the group
1071 * return without doing anything 1064 * return without doing anything
1072 */ 1065 */
1073 ret = 0;
1074 goto err; 1066 goto err;
1075 } 1067 }
1076 /* 1068
1077 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
1078 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
1079 * So for each group we need two blocks. 1071 if (ret)
1080 */ 1072 goto err;
1081 block = group * 2; 1073 if (!PageUptodate(page)) {
1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) {
1089 unlock_page(page);
1090 goto err;
1091 }
1092 unlock_page(page);
1093 }
1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1074 ret = -EIO;
1096 goto err; 1075 goto err;
1097 } 1076 }
1098 mark_page_accessed(page); 1077 mark_page_accessed(page);
1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1078
1102 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1103 block++;
1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) {
1108 /* 1080 /*
1109 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1082 * the same page we don't need to force
1111 * init the buddy 1083 * init the buddy
1112 */ 1084 */
1113 unlock_page(page); 1085 ret = 0;
1114 } else if (page) { 1086 goto err;
1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) {
1118 unlock_page(page);
1119 goto err;
1120 }
1121 unlock_page(page);
1122 } 1087 }
1123 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1124 ret = -EIO; 1094 ret = -EIO;
1125 goto err; 1095 goto err;
1126 } 1096 }
1127 mark_page_accessed(page); 1097 mark_page_accessed(page);
1128err: 1098err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1130 if (bitmap_page)
1131 page_cache_release(bitmap_page);
1132 if (page)
1133 page_cache_release(page);
1134 return ret; 1100 return ret;
1135} 1101}
1136 1102
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1164 e4b->bd_group = group; 1130 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem;
1168
1169 /* Take the read lock on the group alloc
1170 * sem. This would make sure a parallel
1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked
1173 * till we are done with allocation
1174 */
1175repeat_load_buddy:
1176 down_read(e4b->alloc_semp);
1177 1133
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache
1183 */
1184 up_read(e4b->alloc_semp);
1185 /* 1135 /*
1186 * we need full data about the group 1136 * we need full data about the group
1187 * to make a good selection 1137 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
1189 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1140 if (ret)
1191 return ret; 1141 return ret;
1192 goto repeat_load_buddy;
1193 } 1142 }
1194 1143
1195 /* 1144 /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
1273 return 0; 1222 return 0;
1274 1223
1275err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1276 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1277 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1278 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1279 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1280 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1281 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1282
1283 /* Done with the buddy cache */
1284 up_read(e4b->alloc_semp);
1285 return ret; 1233 return ret;
1286} 1234}
1287 1235
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1292 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1293 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1294 /* Done with the buddy cache */
1295 if (e4b->alloc_semp)
1296 up_read(e4b->alloc_semp);
1297} 1242}
1298 1243
1299 1244
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1606 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1607 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1608 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1609 /* on allocation we use ac to track the held semaphore */
1610 ac->alloc_semp = e4b->alloc_semp;
1611 e4b->alloc_semp = NULL;
1612 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1613 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1614 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2659 struct super_block *sb = journal->j_private; 2601 struct super_block *sb = journal->j_private;
2660 struct ext4_buddy e4b; 2602 struct ext4_buddy e4b;
2661 struct ext4_group_info *db; 2603 struct ext4_group_info *db;
2662 int err, ret, count = 0, count2 = 0; 2604 int err, count = 0, count2 = 0;
2663 struct ext4_free_data *entry; 2605 struct ext4_free_data *entry;
2664 struct list_head *l, *ltmp; 2606 struct list_head *l, *ltmp;
2665 2607
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2669 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2611 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2670 entry->count, entry->group, entry); 2612 entry->count, entry->group, entry);
2671 2613
2672 if (test_opt(sb, DISCARD)) { 2614 if (test_opt(sb, DISCARD))
2673 ret = ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2674 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2675 if (unlikely(ret == -EOPNOTSUPP)) {
2676 ext4_warning(sb, "discard not supported, "
2677 "disabling");
2678 clear_opt(sb, DISCARD);
2679 }
2680 }
2681 2617
2682 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2683 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4226 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4227 } 4163 }
4228 } 4164 }
4229 if (ac->alloc_semp)
4230 up_read(ac->alloc_semp);
4231 if (pa) { 4165 if (pa) {
4232 /* 4166 /*
4233 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4234 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4235 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4236 * doesn't grow big. We need to release 4170 * doesn't grow big.
4237 * alloc_semp before calling ext4_mb_add_n_trim()
4238 */ 4171 */
4239 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4240 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 4303 * there are enough free blocks to do block allocation 4236
4304 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4305 */ 4238 */
4306 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4307 /* let others to free the space */ 4242 /* let others to free the space */
4308 yield(); 4243 yield();
4309 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 return 0; 4248 return 0;
4314 } 4249 }
4315 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4316 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4317 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4318 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4319 } 4260 }
4320 inquota = ar->len; 4261 inquota = ar->len;
4321 if (ar->len == 0) { 4262 if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
4704} 4645}
4705 4646
4706/** 4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
 4651 * @block: start physical block to add to the block group
 4652 * @count: number of blocks to add
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
4731 * need to update group_info->bb_free and bitmap
 4732 * with group lock held. generate_buddy looks at
 4733 * them with the group lock held
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
4767
4768/**
4707 * ext4_trim_extent -- function to TRIM one single free extent in the group 4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4708 * @sb: super block for the file system 4770 * @sb: super block for the file system
4709 * @start: starting block of the free extent in the alloc. group 4771 * @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
 4715 * one will allocate those blocks, mark them as used in the buddy bitmap. This must 4777
 4716 * be called under the group lock. 4778
4717 */ 4779 */
4718static int ext4_trim_extent(struct super_block *sb, int start, int count, 4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4719 ext4_group_t group, struct ext4_buddy *e4b) 4781 ext4_group_t group, struct ext4_buddy *e4b)
4720{ 4782{
4721 struct ext4_free_extent ex; 4783 struct ext4_free_extent ex;
4722 int ret = 0;
4723 4784
4724 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4725 4786
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 */ 4794 */
4734 mb_mark_used(e4b, &ex); 4795 mb_mark_used(e4b, &ex);
4735 ext4_unlock_group(sb, group); 4796 ext4_unlock_group(sb, group);
4736 4797 ext4_issue_discard(sb, group, start, count);
4737 ret = ext4_issue_discard(sb, group, start, count);
4738
4739 ext4_lock_group(sb, group); 4798 ext4_lock_group(sb, group);
4740 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4741 return ret;
4742} 4800}
4743 4801
4744/** 4802/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4760 * the group buddy bitmap. This is done until whole group is scanned. 4818 * the group buddy bitmap. This is done until whole group is scanned.
4761 */ 4819 */
4762static ext4_grpblk_t 4820static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4765{ 4824{
4766 void *bitmap; 4825 void *bitmap;
4767 ext4_grpblk_t next, count = 0; 4826 ext4_grpblk_t next, count = 0;
4768 ext4_group_t group; 4827 struct ext4_buddy e4b;
4769 int ret = 0; 4828 int ret;
4770 4829
4771 BUG_ON(e4b == NULL); 4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4772 4837
4773 bitmap = e4b->bd_bitmap;
4774 group = e4b->bd_group;
4775 start = (e4b->bd_info->bb_first_free > start) ?
4776 e4b->bd_info->bb_first_free : start;
4777 ext4_lock_group(sb, group); 4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4778 4841
4779 while (start < max) { 4842 while (start < max) {
4780 start = mb_find_next_zero_bit(bitmap, max, start); 4843 start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4783 next = mb_find_next_bit(bitmap, max, start); 4846 next = mb_find_next_bit(bitmap, max, start);
4784 4847
4785 if ((next - start) >= minblocks) { 4848 if ((next - start) >= minblocks) {
4786 ret = ext4_trim_extent(sb, start, 4849 ext4_trim_extent(sb, start,
4787 next - start, group, e4b); 4850 next - start, group, &e4b);
4788 if (ret < 0)
4789 break;
4790 count += next - start; 4851 count += next - start;
4791 } 4852 }
4792 start = next + 1; 4853 start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4802 ext4_lock_group(sb, group); 4863 ext4_lock_group(sb, group);
4803 } 4864 }
4804 4865
4805 if ((e4b->bd_info->bb_free - count) < minblocks) 4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4806 break; 4867 break;
4807 } 4868 }
4808 ext4_unlock_group(sb, group); 4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4809 4871
4810 ext4_debug("trimmed %d blocks in the group %d\n", 4872 ext4_debug("trimmed %d blocks in the group %d\n",
4811 count, group); 4873 count, group);
4812 4874
4813 if (ret < 0)
4814 count = ret;
4815
4816 return count; 4875 return count;
4817} 4876}
4818 4877
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4830 */ 4889 */
4831int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4832{ 4891{
4833 struct ext4_buddy e4b; 4892 struct ext4_group_info *grp;
4834 ext4_group_t first_group, last_group; 4893 ext4_group_t first_group, last_group;
4835 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4836 ext4_grpblk_t cnt = 0, first_block, last_block; 4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4837 uint64_t start, len, minlen, trimmed; 4896 uint64_t start, len, minlen, trimmed = 0;
4838 ext4_fsblk_t first_data_blk = 4897 ext4_fsblk_t first_data_blk =
4839 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4840 int ret = 0; 4899 int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4842 start = range->start >> sb->s_blocksize_bits; 4901 start = range->start >> sb->s_blocksize_bits;
4843 len = range->len >> sb->s_blocksize_bits; 4902 len = range->len >> sb->s_blocksize_bits;
4844 minlen = range->minlen >> sb->s_blocksize_bits; 4903 minlen = range->minlen >> sb->s_blocksize_bits;
4845 trimmed = 0;
4846 4904
4847 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4848 return -EINVAL; 4906 return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 return -EINVAL; 4921 return -EINVAL;
4864 4922
4865 for (group = first_group; group <= last_group; group++) { 4923 for (group = first_group; group <= last_group; group++) {
4866 ret = ext4_mb_load_buddy(sb, group, &e4b); 4924 grp = ext4_get_group_info(sb, group);
4867 if (ret) { 4925 /* We only do this if the grp has never been initialized */
4868 ext4_error(sb, "Error in loading buddy " 4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4869 "information for %u", group); 4927 ret = ext4_mb_init_group(sb, group);
4870 break; 4928 if (ret)
4929 break;
4871 } 4930 }
4872 4931
4873 /* 4932 /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4880 last_block = first_block + len; 4939 last_block = first_block + len;
4881 len -= last_block - first_block; 4940 len -= last_block - first_block;
4882 4941
4883 if (e4b.bd_info->bb_free >= minlen) { 4942 if (grp->bb_free >= minlen) {
4884 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4943 cnt = ext4_trim_all_free(sb, group, first_block,
4885 last_block, minlen); 4944 last_block, minlen);
4886 if (cnt < 0) { 4945 if (cnt < 0) {
4887 ret = cnt; 4946 ret = cnt;
4888 ext4_mb_unload_buddy(&e4b);
4889 break; 4947 break;
4890 } 4948 }
4891 } 4949 }
4892 ext4_mb_unload_buddy(&e4b);
4893 trimmed += cnt; 4950 trimmed += cnt;
4894 first_block = 0; 4951 first_block = 0;
4895 } 4952 }
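
For context, ext4_trim_fs() is the backend of the FITRIM ioctl. A minimal user-space caller (the mount point is an assumption; on success the kernel rewrites range.len with the number of bytes it trimmed):

    #include <stdint.h>
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>      /* FITRIM, struct fstrim_range */

    int main(void)
    {
            struct fstrim_range range = {
                    .start  = 0,
                    .len    = UINT64_MAX,   /* whole filesystem */
                    .minlen = 4096,         /* ignore runs shorter than 4 KiB */
            };
            int fd = open("/mnt/ext4", O_RDONLY);   /* hypothetical mount point */

            if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    return 1;
            }
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            close(fd);
            return 0;
    }
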
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
74 __ext4_warning(sb, function, line, msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
128 if (retval && (failed_writes % 60) == 0) {
129 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++;
131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
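
The do/while above is rejection sampling: instead of reducing the random word modulo EXT4_MMP_SEQ_MAX + 1, which would bias low values whenever the bound does not divide 2^32, it redraws until the value falls in range. A user-space analogue (a sketch with error handling trimmed; assumes /dev/urandom is available):

    #include <stdint.h>
    #include <stdio.h>

    /* Uniform draw in [0, max] by resampling out-of-range values,
     * mirroring mmp_new_seq() above. */
    static uint32_t new_seq_sketch(uint32_t max)
    {
            FILE *f = fopen("/dev/urandom", "rb");
            uint32_t v = 0;

            if (!f)
                    return 0;
            do {
                    if (fread(&v, sizeof(v), 1, f) != 1)
                            break;
            } while (v > max);
            fclose(f);
            return v;
    }
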
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
259 if (mmp->mmp_check_interval > mmp_check_interval)
260 mmp_check_interval = mmp->mmp_check_interval;
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * write a new random sequence number.
297 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
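
Taken as a whole, mmp.c implements a lease: the active mounter's kmmpd bumps mmp_seq every update interval, and a prospective mounter must watch the block long enough to be certain nobody is bumping it before it may proceed (the feature itself is switched on with tune2fs -O mmp in sufficiently new e2fsprogs). The startup-delay arithmetic can be tabulated standalone; the two constants below mirror what ext4.h defines elsewhere in this series, but treat them as assumptions:

    #include <stdio.h>

    #define MMP_CHECK_MULT          2UL   /* assumed EXT4_MMP_CHECK_MULT */
    #define MMP_MIN_CHECK_INTERVAL  5UL   /* assumed EXT4_MMP_MIN_CHECK_INTERVAL */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    static unsigned long max_ul(unsigned long a, unsigned long b)
    {
            return a > b ? a : b;
    }

    int main(void)
    {
            unsigned long updates[] = { 1, 5, 60, 300 };    /* seconds */

            for (unsigned i = 0; i < sizeof(updates) / sizeof(updates[0]); i++) {
                    /* check interval as kmmpd would publish it ... */
                    unsigned long check = max_ul(MMP_CHECK_MULT * updates[i],
                                                 MMP_MIN_CHECK_INTERVAL);
                    /* ... and the wait ext4_multi_mount_protect() derives */
                    unsigned long wait = min_ul(check * 2 + 1, check + 60);

                    printf("update %3lus -> check %3lus -> startup wait %3lus\n",
                           updates[i], check, wait);
            }
            return 0;
    }

Note how the min() caps the wait at check + 60 seconds for large intervals, so a slow update cadence does not translate into an unbounded mount delay.
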
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1413 frame->at = entries; 1413 frame->at = entries;
1414 frame->bh = bh; 1414 frame->bh = bh;
1415 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1416 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1417 dx_release (frames); 1421 if (!de) {
1418 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
 1425 * with a corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1419 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1420 1432
1421 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1422 brelse(bh); 1434 brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2240 handle_t *handle; 2252 handle_t *handle;
2241 struct inode *inode; 2253 struct inode *inode;
2242 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2243 2256
2244 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2245 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2247 2260
2248 dquot_initialize(dir); 2261 dquot_initialize(dir);
2249 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
 2265 * For non-fast symlinks, we just allocate the inode and put it on
 2266 * the orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
2250retry: 2281retry:
2251 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2252 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2253 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2254 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2255 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2256 2285
@@ -2263,21 +2292,44 @@ retry:
2263 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2264 goto out_stop; 2293 goto out_stop;
2265 2294
2266 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2267 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2268 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2269 /* 2298 /*
 2270 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with the transaction started
2271 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2272 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
 2302 * and thus we deadlock. So we have to stop the transaction now
 2303 * and restart it when the symlink contents are written.
2304 *
 2305 * To keep the fs consistent in case of a crash, we have to put the
 2306 * inode on the orphan list in the meantime.
2273 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2274 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
2317 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2275 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2276 clear_nlink(inode); 2331 clear_nlink(inode);
2277 unlock_new_inode(inode); 2332 goto err_drop_inode;
2278 ext4_mark_inode_dirty(handle, inode);
2279 iput(inode);
2280 goto out_stop;
2281 } 2333 }
2282 } else { 2334 } else {
2283 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
2293 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2294 goto retry; 2346 goto retry;
2295 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2296} 2352}
2297 2353
2298static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
203 for (i = 0; i < io_end->num_io_pages; i++) { 203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 205 struct buffer_head *bh, *head;
206 int partial_write = 0; 206 loff_t offset;
207 loff_t io_end_offset;
207 208
208 head = page_buffers(page); 209 if (error) {
209 if (error)
210 SetPageError(page); 210 SetPageError(page);
211 BUG_ON(!head); 211 set_bit(AS_EIO, &page->mapping->flags);
212 if (head->b_size != PAGE_CACHE_SIZE) { 212 head = page_buffers(page);
213 loff_t offset; 213 BUG_ON(!head);
214 loff_t io_end_offset = io_end->offset + io_end->size; 214
215 io_end_offset = io_end->offset + io_end->size;
215 216
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head; 218 bh = head;
218 do { 219 do {
219 if ((offset >= io_end->offset) && 220 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) { 221 (offset+bh->b_size <= io_end_offset))
221 if (error) 222 buffer_io_error(bh);
222 buffer_io_error(bh); 223
223
224 }
225 if (buffer_delay(bh))
226 partial_write = 1;
227 else if (!buffer_mapped(bh))
228 clear_buffer_dirty(bh);
229 else if (buffer_dirty(bh))
230 partial_write = 1;
231 offset += bh->b_size; 224 offset += bh->b_size;
232 bh = bh->b_this_page; 225 bh = bh->b_this_page;
233 } while (bh != head); 226 } while (bh != head);
234 } 227 }
235 228
236 /*
237 * If this is a partial write which happened to make
238 * all buffers uptodate then we can optimize away a
239 * bogus readpage() for the next read(). Here we
240 * 'discover' whether the page went uptodate as a
241 * result of this (potentially partial) write.
242 */
243 if (!partial_write)
244 SetPageUptodate(page);
245
246 put_io_page(io_end->pages[i]); 229 put_io_page(io_end->pages[i]);
247 } 230 }
248 io_end->num_io_pages = 0; 231 io_end->num_io_pages = 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..d9937df7f5cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static inline int ext2_feature_set_ok(struct super_block *sb);
79static inline int ext3_feature_set_ok(struct super_block *sb);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly); 80static int ext4_feature_set_ok(struct super_block *sb, int readonly);
79static void ext4_destroy_lazyinit_thread(void); 81static void ext4_destroy_lazyinit_thread(void);
80static void ext4_unregister_li_request(struct super_block *sb); 82static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void); 83static void ext4_clear_request_list(void);
82 84
85#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
86static struct file_system_type ext2_fs_type = {
87 .owner = THIS_MODULE,
88 .name = "ext2",
89 .mount = ext4_mount,
90 .kill_sb = kill_block_super,
91 .fs_flags = FS_REQUIRES_DEV,
92};
93#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
94#else
95#define IS_EXT2_SB(sb) (0)
96#endif
97
98
83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 99#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
84static struct file_system_type ext3_fs_type = { 100static struct file_system_type ext3_fs_type = {
85 .owner = THIS_MODULE, 101 .owner = THIS_MODULE,
@@ -806,6 +822,8 @@ static void ext4_put_super(struct super_block *sb)
806 invalidate_bdev(sbi->journal_bdev); 822 invalidate_bdev(sbi->journal_bdev);
807 ext4_blkdev_remove(sbi); 823 ext4_blkdev_remove(sbi);
808 } 824 }
825 if (sbi->s_mmp_tsk)
826 kthread_stop(sbi->s_mmp_tsk);
809 sb->s_fs_info = NULL; 827 sb->s_fs_info = NULL;
810 /* 828 /*
811 * Now that we are completely done shutting down the 829 * Now that we are completely done shutting down the
@@ -1096,7 +1114,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1096 1114
1097 if (!test_opt(sb, INIT_INODE_TABLE)) 1115 if (!test_opt(sb, INIT_INODE_TABLE))
1098 seq_puts(seq, ",noinit_inode_table"); 1116 seq_puts(seq, ",noinit_inode_table");
1099 else if (sbi->s_li_wait_mult) 1117 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1100 seq_printf(seq, ",init_inode_table=%u", 1118 seq_printf(seq, ",init_inode_table=%u",
1101 (unsigned) sbi->s_li_wait_mult); 1119 (unsigned) sbi->s_li_wait_mult);
1102 1120
@@ -1187,9 +1205,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1187 const char *data, size_t len, loff_t off); 1205 const char *data, size_t len, loff_t off);
1188 1206
1189static const struct dquot_operations ext4_quota_operations = { 1207static const struct dquot_operations ext4_quota_operations = {
1190#ifdef CONFIG_QUOTA
1191 .get_reserved_space = ext4_get_reserved_space, 1208 .get_reserved_space = ext4_get_reserved_space,
1192#endif
1193 .write_dquot = ext4_write_dquot, 1209 .write_dquot = ext4_write_dquot,
1194 .acquire_dquot = ext4_acquire_dquot, 1210 .acquire_dquot = ext4_acquire_dquot,
1195 .release_dquot = ext4_release_dquot, 1211 .release_dquot = ext4_release_dquot,
@@ -1900,7 +1916,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1900 ext4_msg(sb, KERN_WARNING, 1916 ext4_msg(sb, KERN_WARNING,
1901 "warning: mounting fs with errors, " 1917 "warning: mounting fs with errors, "
1902 "running e2fsck is recommended"); 1918 "running e2fsck is recommended");
1903 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1919 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1904 le16_to_cpu(es->s_mnt_count) >= 1920 le16_to_cpu(es->s_mnt_count) >=
1905 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1921 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1906 ext4_msg(sb, KERN_WARNING, 1922 ext4_msg(sb, KERN_WARNING,
@@ -2425,6 +2441,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
                           EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
+static ssize_t extent_cache_hits_show(struct ext4_attr *a,
+                                      struct ext4_sb_info *sbi, char *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
+}
+
+static ssize_t extent_cache_misses_show(struct ext4_attr *a,
+                                        struct ext4_sb_info *sbi, char *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
+}
+
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
                                           struct ext4_sb_info *sbi,
                                           const char *buf, size_t count)
@@ -2482,6 +2510,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(extent_cache_hits);
+EXT4_RO_ATTR(extent_cache_misses);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                  inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2527,8 @@ static struct attribute *ext4_attrs[] = {
         ATTR_LIST(delayed_allocation_blocks),
         ATTR_LIST(session_write_kbytes),
         ATTR_LIST(lifetime_write_kbytes),
+        ATTR_LIST(extent_cache_hits),
+        ATTR_LIST(extent_cache_misses),
         ATTR_LIST(inode_readahead_blks),
         ATTR_LIST(inode_goal),
         ATTR_LIST(mb_stats),
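
These counters are exported read-only through ext4's existing sysfs directory, so they can be sampled from userspace. A minimal reader, assuming the filesystem is backed by /dev/sda1 (the directory under /sys/fs/ext4/ is named after the backing device; adjust to taste):

/* Sketch: read the new ext4 extent-cache counters from sysfs.
 * The device name "sda1" is an assumption for this example. */
#include <stdio.h>

static unsigned long read_counter(const char *path)
{
        unsigned long val = 0;
        FILE *f = fopen(path, "r");

        if (!f)
                return 0;
        if (fscanf(f, "%lu", &val) != 1)
                val = 0;
        fclose(f);
        return val;
}

int main(void)
{
        unsigned long hits = read_counter("/sys/fs/ext4/sda1/extent_cache_hits");
        unsigned long misses = read_counter("/sys/fs/ext4/sda1/extent_cache_misses");

        printf("extent cache: %lu hits, %lu misses\n", hits, misses);
        return 0;
}
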
@@ -2659,12 +2691,6 @@ static void print_daily_error_info(unsigned long arg)
         mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
 
-static void ext4_lazyinode_timeout(unsigned long data)
-{
-        struct task_struct *p = (struct task_struct *)data;
-        wake_up_process(p);
-}
-
 /* Find next suitable group and run ext4_init_inode_table */
 static int ext4_run_li_request(struct ext4_li_request *elr)
 {
@@ -2696,11 +2722,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
                 ret = ext4_init_inode_table(sb, group,
                                             elr->lr_timeout ? 0 : 1);
                 if (elr->lr_timeout == 0) {
-                        timeout = jiffies - timeout;
-                        if (elr->lr_sbi->s_li_wait_mult)
-                                timeout *= elr->lr_sbi->s_li_wait_mult;
-                        else
-                                timeout *= 20;
+                        timeout = (jiffies - timeout) *
+                                  elr->lr_sbi->s_li_wait_mult;
                         elr->lr_timeout = timeout;
                 }
                 elr->lr_next_sched = jiffies + elr->lr_timeout;
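
The fallback multiplier of 20 can go away because ext4_fill_super() now seeds s_li_wait_mult with EXT4_DEF_LI_WAIT_MULT before parsing mount options (see the ext4_fill_super() hunk later in this patch), so the field is never zero. The pacing itself: a zeroing pass that takes N jiffies is followed by a sleep of N * s_li_wait_mult, bounding the thread to roughly 1/(mult+1) of the device's time. A toy calculation, assuming the default multiplier of 10:

/* Toy model of the lazy-init pacing: work N jiffies, then sleep
 * N * wait_mult, giving a ~1/(wait_mult+1) duty cycle. Assumes the
 * default EXT4_DEF_LI_WAIT_MULT of 10. */
#include <stdio.h>

int main(void)
{
        unsigned long wait_mult = 10;   /* EXT4_DEF_LI_WAIT_MULT */
        unsigned long elapsed = 30;     /* jiffies spent zeroing one group */
        unsigned long timeout = elapsed * wait_mult;

        printf("worked %lu jiffies, sleeping %lu (duty cycle %.1f%%)\n",
               elapsed, timeout, 100.0 * elapsed / (elapsed + timeout));
        return 0;
}
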
@@ -2712,7 +2735,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 
 /*
  * Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
  */
 static void ext4_remove_li_request(struct ext4_li_request *elr)
 {
@@ -2730,14 +2753,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
 
 static void ext4_unregister_li_request(struct super_block *sb)
 {
-        struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
-        if (!ext4_li_info)
+        mutex_lock(&ext4_li_mtx);
+        if (!ext4_li_info) {
+                mutex_unlock(&ext4_li_mtx);
                 return;
+        }
 
         mutex_lock(&ext4_li_info->li_list_mtx);
-        ext4_remove_li_request(elr);
+        ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
         mutex_unlock(&ext4_li_info->li_list_mtx);
+        mutex_unlock(&ext4_li_mtx);
 }
 
 static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2781,10 @@ static int ext4_lazyinit_thread(void *arg)
         struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
         struct list_head *pos, *n;
         struct ext4_li_request *elr;
-        unsigned long next_wakeup;
-        DEFINE_WAIT(wait);
+        unsigned long next_wakeup, cur;
 
         BUG_ON(NULL == eli);
 
-        eli->li_timer.data = (unsigned long)current;
-        eli->li_timer.function = ext4_lazyinode_timeout;
-
-        eli->li_task = current;
-        wake_up(&eli->li_wait_task);
-
 cont_thread:
         while (true) {
                 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2815,15 @@ cont_thread:
                 if (freezing(current))
                         refrigerator();
 
-                if ((time_after_eq(jiffies, next_wakeup)) ||
+                cur = jiffies;
+                if ((time_after_eq(cur, next_wakeup)) ||
                     (MAX_JIFFY_OFFSET == next_wakeup)) {
                         cond_resched();
                         continue;
                 }
 
-                eli->li_timer.expires = next_wakeup;
-                add_timer(&eli->li_timer);
-                prepare_to_wait(&eli->li_wait_daemon, &wait,
-                                TASK_INTERRUPTIBLE);
-                if (time_before(jiffies, next_wakeup))
-                        schedule();
-                finish_wait(&eli->li_wait_daemon, &wait);
+                schedule_timeout_interruptible(next_wakeup - cur);
+
                 if (kthread_should_stop()) {
                         ext4_clear_request_list();
                         goto exit_thread;
@@ -2833,12 +2847,7 @@ exit_thread:
                 goto cont_thread;
         }
         mutex_unlock(&eli->li_list_mtx);
-        del_timer_sync(&ext4_li_info->li_timer);
-        eli->li_task = NULL;
-        wake_up(&eli->li_wait_task);
-
         kfree(ext4_li_info);
-        ext4_lazyinit_task = NULL;
         ext4_li_info = NULL;
         mutex_unlock(&ext4_li_mtx);
 
@@ -2866,7 +2875,6 @@ static int ext4_run_lazyinit_thread(void)
         if (IS_ERR(ext4_lazyinit_task)) {
                 int err = PTR_ERR(ext4_lazyinit_task);
                 ext4_clear_request_list();
-                del_timer_sync(&ext4_li_info->li_timer);
                 kfree(ext4_li_info);
                 ext4_li_info = NULL;
                 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2883,6 @@ static int ext4_run_lazyinit_thread(void)
                 return err;
         }
         ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
-        wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
         return 0;
 }
 
@@ -2911,13 +2917,9 @@ static int ext4_li_info_new(void)
         if (!eli)
                 return -ENOMEM;
 
-        eli->li_task = NULL;
         INIT_LIST_HEAD(&eli->li_request_list);
         mutex_init(&eli->li_list_mtx);
 
-        init_waitqueue_head(&eli->li_wait_daemon);
-        init_waitqueue_head(&eli->li_wait_task);
-        init_timer(&eli->li_timer);
         eli->li_state |= EXT4_LAZYINIT_QUIT;
 
         ext4_li_info = eli;
@@ -2960,20 +2962,19 @@ static int ext4_register_li_request(struct super_block *sb,
         ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
         int ret = 0;
 
-        if (sbi->s_li_request != NULL)
+        if (sbi->s_li_request != NULL) {
+                /*
+                 * Reset timeout so it can be computed again, because
+                 * s_li_wait_mult might have changed.
+                 */
+                sbi->s_li_request->lr_timeout = 0;
                 return 0;
+        }
 
         if (first_not_zeroed == ngroups ||
             (sb->s_flags & MS_RDONLY) ||
-            !test_opt(sb, INIT_INODE_TABLE)) {
-                sbi->s_li_request = NULL;
+            !test_opt(sb, INIT_INODE_TABLE))
                 return 0;
-        }
-
-        if (first_not_zeroed == ngroups) {
-                sbi->s_li_request = NULL;
-                return 0;
-        }
 
         elr = ext4_li_request_new(sb, first_not_zeroed);
         if (!elr)
@@ -3166,6 +3167,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
             ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                 set_opt(sb, DELALLOC);
 
+        /*
+         * set default s_li_wait_mult for lazyinit, for the case there is
+         * no mount option specified.
+         */
+        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
         if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
                            &journal_devnum, &journal_ioprio, NULL, 0)) {
                 ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3194,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3187 "feature flags set on rev 0 fs, " 3194 "feature flags set on rev 0 fs, "
3188 "running e2fsck is recommended"); 3195 "running e2fsck is recommended");
3189 3196
3197 if (IS_EXT2_SB(sb)) {
3198 if (ext2_feature_set_ok(sb))
3199 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3200 "using the ext4 subsystem");
3201 else {
3202 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3203 "to feature incompatibilities");
3204 goto failed_mount;
3205 }
3206 }
3207
3208 if (IS_EXT3_SB(sb)) {
3209 if (ext3_feature_set_ok(sb))
3210 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3211 "using the ext4 subsystem");
3212 else {
3213 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3214 "to feature incompatibilities");
3215 goto failed_mount;
3216 }
3217 }
3218
3190 /* 3219 /*
3191 * Check feature flags regardless of the revision level, since we 3220 * Check feature flags regardless of the revision level, since we
3192 * previously didn't change the revision level when setting the flags, 3221 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3488,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                           EXT4_HAS_INCOMPAT_FEATURE(sb,
                                     EXT4_FEATURE_INCOMPAT_RECOVER));
 
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+            !(sb->s_flags & MS_RDONLY))
+                if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+                        goto failed_mount3;
+
         /*
          * The first inode we look at is the journal inode.  Don't try
          * root first: it may be modified in the journal!
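
ext4_multi_mount_protect() is implemented elsewhere in this series (the new fs/ext4/mmp.c). In outline: the mount reads the MMP block recorded in the superblock, stamps it with a fresh sequence number, waits an update interval, and re-reads it; if the number changed, another node is actively using the device and the mount is refused, otherwise a kmmpd updater thread is started (the s_mmp_tsk that ext4_put_super() and failed_mount3 now stop). A toy, single-process illustration of that handshake; the names here are made up for the sketch and are not the real mmp.c API:

/* Hypothetical sketch of the MMP claim-and-verify idea. */
#include <stdio.h>
#include <stdlib.h>

static unsigned int mmp_seq_on_disk;          /* stands in for the MMP block */

static unsigned int stamp(void)
{
        unsigned int s = (unsigned int)rand() + 1;
        mmp_seq_on_disk = s;                  /* write our sequence number */
        return s;
}

int main(void)
{
        unsigned int mine = stamp();

        /* The real code sleeps for the MMP check interval here... */
        mmp_seq_on_disk = 777;                /* ...another node stamps meanwhile */

        /* ...then re-reads the block; a changed sequence means conflict. */
        if (mmp_seq_on_disk != mine) {
                printf("MMP conflict: device appears in use, refusing mount\n");
                return 1;
        }
        printf("claimed; a kmmpd-style thread would now keep re-stamping\n");
        return 0;
}
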
@@ -3474,7 +3508,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 goto failed_mount_wq;
         } else {
                 clear_opt(sb, DATA_FLAGS);
-                set_opt(sb, WRITEBACK_DATA);
                 sbi->s_journal = NULL;
                 needs_recovery = 0;
                 goto no_journal;
@@ -3707,6 +3740,8 @@ failed_mount3:
         percpu_counter_destroy(&sbi->s_freeinodes_counter);
         percpu_counter_destroy(&sbi->s_dirs_counter);
         percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+        if (sbi->s_mmp_tsk)
+                kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
         for (i = 0; i < db_count; i++)
                 brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4277,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         int enable_quota = 0;
         ext4_group_t g;
         unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-        int err;
+        int err = 0;
 #ifdef CONFIG_QUOTA
         int i;
 #endif
@@ -4368,6 +4403,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                 goto restore_opts;
                         if (!ext4_setup_super(sb, es, 0))
                                 sb->s_flags &= ~MS_RDONLY;
+                        if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                                      EXT4_FEATURE_INCOMPAT_MMP))
+                                if (ext4_multi_mount_protect(sb,
+                                                le64_to_cpu(es->s_mmp_block))) {
+                                        err = -EROFS;
+                                        goto restore_opts;
+                                }
                         enable_quota = 1;
                 }
         }
@@ -4432,6 +4474,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         struct ext4_super_block *es = sbi->s_es;
         u64 fsid;
+        s64 bfree;
 
         if (test_opt(sb, MINIX_DF)) {
                 sbi->s_overhead_last = 0;
@@ -4475,8 +4518,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
         buf->f_type = EXT4_SUPER_MAGIC;
         buf->f_bsize = sb->s_blocksize;
         buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-        buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
-                       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+        bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+                percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+        /* prevent underflow in case that few free space is available */
+        buf->f_bfree = max_t(s64, bfree, 0);
         buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
         if (buf->f_bfree < ext4_r_blocks_count(es))
                 buf->f_bavail = 0;
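
The dirty-blocks counter can transiently exceed the free-blocks counter, and f_bfree is unsigned, so the old direct subtraction could wrap to an enormous free-block count. A standalone demonstration of the wrap and of the max_t()-style clamp:

/* Why ext4_statfs() now clamps in a signed type before storing into
 * the unsigned f_bfree field. Self-contained userspace demo. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t free_blocks = 100;      /* s_freeblocks_counter */
        int64_t dirty_blocks = 130;     /* s_dirtyblocks_counter, briefly larger */

        uint64_t wrapped = (uint64_t)(free_blocks - dirty_blocks);
        int64_t bfree = free_blocks - dirty_blocks;
        uint64_t clamped = bfree < 0 ? 0 : (uint64_t)bfree;  /* max_t(s64, bfree, 0) */

        printf("unclamped f_bfree: %llu\n", (unsigned long long)wrapped);
        printf("clamped   f_bfree: %llu\n", (unsigned long long)clamped);
        return 0;
}
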
@@ -4652,6 +4697,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
         if (test_opt(sb, DELALLOC))
                 sync_filesystem(sb);
 
+        if (!inode)
+                goto out;
+
         /* Update modification times of quota files when userspace can
          * start looking at them */
         handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4820,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 }
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
-        .owner = THIS_MODULE,
-        .name = "ext2",
-        .mount = ext4_mount,
-        .kill_sb = kill_block_super,
-        .fs_flags = FS_REQUIRES_DEV,
-};
-
 static inline void register_as_ext2(void)
 {
         int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4832,22 @@ static inline void unregister_as_ext2(void)
 {
         unregister_filesystem(&ext2_fs_type);
 }
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+                return 0;
+        if (sb->s_flags & MS_RDONLY)
+                return 1;
+        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+                return 0;
+        return 1;
+}
 MODULE_ALIAS("ext2");
 #else
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4863,24 @@ static inline void unregister_as_ext3(void)
 {
         unregister_filesystem(&ext3_fs_type);
 }
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+                return 0;
+        if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+                return 0;
+        if (sb->s_flags & MS_RDONLY)
+                return 1;
+        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+                return 0;
+        return 1;
+}
 MODULE_ALIAS("ext3");
 #else
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
 static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4964,8 @@ static int __init ext4_init_fs(void)
         err = init_inodecache();
         if (err)
                 goto out1;
-        register_as_ext2();
         register_as_ext3();
+        register_as_ext2();
         err = register_filesystem(&ext4_fs_type);
         if (err)
                 goto out;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
                 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                         goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
-                block = ext4_new_meta_blocks(handle, inode,
-                                             goal, NULL, &error);
+                block = ext4_new_meta_blocks(handle, inode, goal, 0,
+                                             NULL, &error);
                 if (error)
                         goto cleanup;
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29148a81c783..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
                         ret = err;
                 spin_lock(&journal->j_list_lock);
                 J_ASSERT(jinode->i_transaction == commit_transaction);
-                commit_transaction->t_flushed_data_blocks = 1;
                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                 smp_mb__after_clear_bit();
                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
                 err = 0;
         }
 
+        write_lock(&journal->j_state_lock);
+        J_ASSERT(commit_transaction->t_state == T_COMMIT);
+        commit_transaction->t_state = T_COMMIT_DFLUSH;
+        write_unlock(&journal->j_state_lock);
         /*
          * If the journal is not located on the file system device,
          * then we must flush the file system device before we issue
          * the commit record
          */
-        if (commit_transaction->t_flushed_data_blocks &&
+        if (commit_transaction->t_need_data_flush &&
             (journal->j_fs_dev != journal->j_dev) &&
             (journal->j_flags & JBD2_BARRIER))
                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
                    required. */
                 JBUFFER_TRACE(jh, "file as BJ_Forget");
                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-                /* Wake up any transactions which were waiting for this
-                   IO to complete */
+                /*
+                 * Wake up any transactions which were waiting for this IO to
+                 * complete. The barrier must be here so that changes by
+                 * jbd2_journal_file_buffer() take effect before wake_up_bit()
+                 * does the waitqueue check.
+                 */
+                smp_mb();
                 wake_up_bit(&bh->b_state, BH_Unshadow);
                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
                 __brelse(bh);
@@ -794,6 +802,10 @@ wait_for_iobuf:
                 jbd2_journal_abort(journal, err);
 
         jbd_debug(3, "JBD: commit phase 5\n");
+        write_lock(&journal->j_state_lock);
+        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+        commit_transaction->t_state = T_COMMIT_JFLUSH;
+        write_unlock(&journal->j_state_lock);
 
         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
 
         jbd_debug(3, "JBD: commit phase 7\n");
 
-        J_ASSERT(commit_transaction->t_state == T_COMMIT);
+        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
         commit_transaction->t_start = jiffies;
         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
         /*
-         * Are we already doing a recent enough commit?
+         * The only transaction we can possibly wait upon is the
+         * currently running transaction (if it exists). Otherwise,
+         * the target tid must be an old one.
          */
-        if (!tid_geq(journal->j_commit_request, target)) {
+        if (journal->j_running_transaction &&
+            journal->j_running_transaction->t_tid == target) {
                 /*
                  * We want a new commit: OK, mark the request and wakeup the
                  * commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
                           journal->j_commit_sequence);
                 wake_up(&journal->j_wait_commit);
                 return 1;
-        }
+        } else if (!tid_geq(journal->j_commit_request, target))
+                /* This should never happen, but if it does, preserve
+                   the evidence before kjournald goes into a loop and
+                   increments j_commit_sequence beyond all recognition. */
+                WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+                          journal->j_commit_request,
+                          journal->j_commit_sequence,
+                          target, journal->j_running_transaction ?
+                          journal->j_running_transaction->t_tid : 0);
         return 0;
 }
 
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 }
 
 /*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+        int ret = 0;
+        transaction_t *commit_trans;
+
+        if (!(journal->j_flags & JBD2_BARRIER))
+                return 0;
+        read_lock(&journal->j_state_lock);
+        /* Transaction already committed? */
+        if (tid_geq(journal->j_commit_sequence, tid))
+                goto out;
+        commit_trans = journal->j_committing_transaction;
+        if (!commit_trans || commit_trans->t_tid != tid) {
+                ret = 1;
+                goto out;
+        }
+        /*
+         * Transaction is being committed and we already proceeded to
+         * submitting a flush to fs partition?
+         */
+        if (journal->j_fs_dev != journal->j_dev) {
+                if (!commit_trans->t_need_data_flush ||
+                    commit_trans->t_state >= T_COMMIT_DFLUSH)
+                        goto out;
+        } else {
+                if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+                        goto out;
+        }
+        ret = 1;
+out:
+        read_unlock(&journal->j_state_lock);
+        return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
+/*
  * Wait for a specified commit to complete.
  * The caller may not hold the journal lock.
  */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  */
 
 /*
- * Update transiaction's maximum wait time, if debugging is enabled.
+ * Update transaction's maximum wait time, if debugging is enabled.
  *
  * In order for t_max_wait to be reliable, it must be protected by a
  * lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  * means that maximum wait time reported by the jbd2_run_stats
  * tracepoint will always be zero.
  */
-static inline void update_t_max_wait(transaction_t *transaction)
+static inline void update_t_max_wait(transaction_t *transaction,
+                                     unsigned long ts)
 {
 #ifdef CONFIG_JBD2_DEBUG
-        unsigned long ts = jiffies;
-
         if (jbd2_journal_enable_debug &&
             time_after(transaction->t_start, ts)) {
                 ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
         tid_t tid;
         int needed, need_to_start;
         int nblocks = handle->h_buffer_credits;
+        unsigned long ts = jiffies;
 
         if (nblocks > journal->j_max_transaction_buffers) {
                 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
         /* OK, account for the buffers that this operation expects to
          * use and add the handle to the running transaction.
          */
-        update_t_max_wait(transaction);
+        update_t_max_wait(transaction, ts);
         handle->h_transaction = transaction;
         atomic_inc(&transaction->t_updates);
         atomic_inc(&transaction->t_handle_count);
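
With the jiffies sample hoisted to the top of start_this_handle(), t_max_wait now covers every trip through the repeat: label instead of only the final attempt. The timing pattern in miniature, as a standalone demo:

/* Sample the clock once on entry so the recorded wait spans all
 * retries, not only the last one. Userspace stand-in for jiffies. */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        time_t ts = time(NULL);          /* like: unsigned long ts = jiffies; */
        int attempts = 0;

repeat:
        sleep(1);                        /* stand-in for waiting on the journal */
        if (++attempts < 3)
                goto repeat;

        /* like: update_t_max_wait(transaction, ts); */
        printf("total wait: %ld s over %d attempts\n",
               (long)(time(NULL) - ts), attempts);
        return 0;
}
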
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
  *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
  */
 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
          */
         JBUFFER_TRACE(jh, "cancelling revoke");
         jbd2_journal_cancel_revoke(handle, jh);
-        jbd2_journal_put_journal_head(jh);
 out:
+        jbd2_journal_put_journal_head(jh);
         return err;
 }
 
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
             jinode->i_next_transaction == transaction)
                 goto done;
 
+        /*
+         * We only ever set this variable to 1 so the test is safe. Since
+         * t_need_data_flush is likely to be set, we do the test to save some
+         * cacheline bouncing
+         */
+        if (!transaction->t_need_data_flush)
+                transaction->t_need_data_flush = 1;
         /* On some different transaction's list - should be
          * the committing one */
         if (jinode->i_transaction) {
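
The guarded store is a deliberate idiom: t_need_data_flush only ever transitions 0 -> 1, so re-checking before writing lets the common already-set case stay a shared read instead of dirtying the cacheline on every call. In isolation:

/* Read-mostly flag idiom: only write when the value actually changes,
 * so repeated callers touch the cacheline read-only. */
#include <stdio.h>

static int need_flush;   /* shared, frequently re-set flag */

static void mark_needs_flush(void)
{
        if (!need_flush)          /* cheap shared read ... */
                need_flush = 1;   /* ... rare exclusive write */
}

int main(void)
{
        for (int i = 0; i < 1000; i++)
                mark_needs_flush();   /* 999 of these calls never store */
        printf("need_flush = %d\n", need_flush);
        return 0;
}
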
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a32dcaec04e1..4ecb7b16b278 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -529,9 +529,10 @@ struct transaction_s
         enum {
                 T_RUNNING,
                 T_LOCKED,
-                T_RUNDOWN,
                 T_FLUSH,
                 T_COMMIT,
+                T_COMMIT_DFLUSH,
+                T_COMMIT_JFLUSH,
                 T_FINISHED
         } t_state;
 
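
The two new states slot into the commit sequence asserted in fs/jbd2/commit.c above, and jbd2_trans_will_send_data_barrier() compares against them to tell how far a commit has progressed. A comment-style summary (T_RUNDOWN, dropped here, appears to have been unused):

/*
 * Commit-state progression implied by the assertions above:
 *
 *   T_RUNNING -> T_LOCKED -> T_FLUSH -> T_COMMIT
 *     -> T_COMMIT_DFLUSH   data writeback submitted; any flush of the
 *                          fs device happens around this point
 *     -> T_COMMIT_JFLUSH   journal descriptor/commit blocks written
 *     -> T_FINISHED
 *
 * So a "t_state >= T_COMMIT_DFLUSH" (or T_COMMIT_JFLUSH, when the journal
 * shares the fs device) check tells a caller whether the relevant flush
 * has already been sent.
 */
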
@@ -658,7 +659,9 @@ struct transaction_s
          * waiting for it to finish.
          */
         unsigned int t_synchronous_commit:1;
-        unsigned int t_flushed_data_blocks:1;
+
+        /* Disk flush needs to be sent to fs partition [no locking] */
+        int t_need_data_flush;
 
         /*
          * For use by the filesystem to store fs-specific data
@@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_journal_force_commit_nested(journal_t *journal);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
 
 void __jbd2_log_wait_for_space(journal_t *journal);
 extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);