Diffstat (limited to 'fs/ext4')
 -rw-r--r--  fs/ext4/Makefile      |    3
 -rw-r--r--  fs/ext4/balloc.c      |  146
 -rw-r--r--  fs/ext4/ext4.h        |  127
 -rw-r--r--  fs/ext4/ext4_jbd2.c   |   14
 -rw-r--r--  fs/ext4/ext4_jbd2.h   |    5
 -rw-r--r--  fs/ext4/extents.c     | 1410
 -rw-r--r--  fs/ext4/file.c        |    1
 -rw-r--r--  fs/ext4/fsync.c       |   25
 -rw-r--r--  fs/ext4/inode.c       |  114
 -rw-r--r--  fs/ext4/mballoc.c     |  459
 -rw-r--r--  fs/ext4/mballoc.h     |    6
 -rw-r--r--  fs/ext4/migrate.c     |    2
 -rw-r--r--  fs/ext4/mmp.c         |  351
 -rw-r--r--  fs/ext4/move_extent.c |    3
 -rw-r--r--  fs/ext4/namei.c       |   82
 -rw-r--r--  fs/ext4/page-io.c     |   39
 -rw-r--r--  fs/ext4/super.c       |  204
 -rw-r--r--  fs/ext4/xattr.c       |    4
18 files changed, 1916 insertions, 1079 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:			handle to this transaction
- * @sb:				super block
- * @block:			start physcial block to add to the block group
- * @count:			number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata.  Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope.  Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
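
The balloc.c changes above thread an allocation-flags argument down to the free-space check so that a caller can opt in to the root-reserved pool via EXT4_MB_USE_ROOT_BLOCKS (defined in the ext4.h hunk below). The extents code passes this flag for punch-hole metadata allocations, which should not fail with ENOSPC while they are in the middle of freeing space. A condensed sketch of the resulting check, with plain integers standing in for struct ext4_sb_info and its percpu counters:

/*
 * Illustration only, not the kernel function: the post-patch logic of
 * ext4_has_free_blocks().  'privileged' stands for the resuid/resgid/
 * CAP_SYS_RESOURCE tests performed by the real code.
 */
static int has_free_blocks_sketch(long long free_blocks,
				  long long dirty_blocks,
				  long long root_blocks,
				  long long nblocks,
				  unsigned int flags,
				  int privileged)
{
	/* ordinary allocations must leave the root reserve untouched */
	if (free_blocks - (nblocks + root_blocks + dirty_blocks) >= 0)
		return 1;

	/*
	 * The reserve may be consumed by privileged callers and by anyone
	 * passing EXT4_MB_USE_ROOT_BLOCKS (0x1000).
	 */
	if (privileged || (flags & 0x1000))
		if (free_blocks >= nblocks + dirty_blocks)
			return 1;

	return 0;
}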
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS	0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT	0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE	0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16  s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
-
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1615,6 +1639,67 @@ struct ext4_features {
 };
 
 /*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the  s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
+
+/*
  * Function prototypes
  */
 
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 			     __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+					       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       int len,
 			       struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
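
The mmp_struct layout and the interval constants above are the on-disk half of multi-mount protection; the in-kernel half (ext4_multi_mount_protect() and the kmmpd thread behind s_mmp_tsk) lives in the new mmp.c, which appears in the diffstat but not in this section. A simplified, userspace-style sketch of the handshake the constants imply; read_mmp(), write_mmp(), sleep_seconds() and new_seq() are hypothetical helpers, and the real mount path additionally re-reads the block after writing its own sequence number to confirm nothing raced with it:

/*
 * Sketch only, not the mmp.c code: a mounting node must prove that no
 * other node is live-updating the MMP block before claiming the fs.
 */
static int mmp_handshake_sketch(unsigned long long mmp_block)
{
	struct mmp_struct mmp;
	unsigned int seq, wait_secs;

	read_mmp(mmp_block, &mmp);		/* hypothetical I/O helper */
	seq = le32_to_cpu(mmp.mmp_seq);

	if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
		return -EBUSY;	/* fsck owns it, or unknown code: refuse */

	if (seq != EXT4_MMP_SEQ_CLEAN) {
		/* unclean: wait long enough for a live writer to bump mmp_seq */
		wait_secs = EXT4_MMP_CHECK_MULT *
				le16_to_cpu(mmp.mmp_check_interval);
		if (wait_secs < EXT4_MMP_MIN_CHECK_INTERVAL)
			wait_secs = EXT4_MMP_MIN_CHECK_INTERVAL;
		sleep_seconds(wait_secs);	/* hypothetical */

		read_mmp(mmp_block, &mmp);
		if (le32_to_cpu(mmp.mmp_seq) != seq)
			return -EBUSY;	/* it moved: another node is alive */
	}

	/* claim the block; kmmpd then keeps bumping mmp_seq periodically */
	mmp.mmp_seq = cpu_to_le32(new_seq());	/* random, 1..EXT4_MMP_SEQ_MAX */
	write_mmp(mmp_block, &mmp);
	return 0;
}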
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
 
 #include <trace/events/ext4.h>
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh)
-{
-	int err = 0;
-
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_get_undo_access(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__, bh,
-						  handle, err);
-	}
-	return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn,
 			       struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh);
 
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
 			      handle_t *handle, struct super_block *sb);
 
-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
 
 #include <trace/events/ext4.h>
 
+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
 	return newblock;
 }
 
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	}
 	ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_uninitialized(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  * - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path,
-				struct ext4_extent *newext, int at)
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
 	struct ext4_extent_header *neh;
 	struct ext4_extent_idx *fidx;
-	struct ext4_extent *ex;
 	int i = at, k, m, a;
 	ext4_fsblk_t newblock, oldblock;
 	__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
 		newblock = ext4_ext_new_meta_block(handle, inode, path,
-						   newext, &err);
+						   newext, &err, flags);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
-	ex = EXT_FIRST_EXTENT(neh);
 
 	/* move remainder of path[depth] to the new leaf */
 	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 	/* start copy from next extent */
-	/* TODO: we could do it by single memmove */
-	m = 0;
-	path[depth].p_ext++;
-	while (path[depth].p_ext <=
-			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext4_ext_pblock(path[depth].p_ext),
-				ext4_ext_is_uninitialized(path[depth].p_ext),
-				ext4_ext_get_actual_len(path[depth].p_ext),
-				newblock);
-		/*memmove(ex++, path[depth].p_ext++,
-			sizeof(struct ext4_extent));
-		neh->eh_entries++;*/
-		path[depth].p_ext++;
-		m++;
-	}
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
 	if (m) {
-		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
 		le16_add_cpu(&neh->eh_entries, m);
 	}
 
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
 				i, newblock, le32_to_cpu(border), oldblock);
-		/* copy indexes */
-		m = 0;
-		path[i].p_idx++;
 
-		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-				EXT_MAX_INDEX(path[i].p_hdr));
+		/* move remainder of path[i] to the new index block */
 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
 				EXT_LAST_INDEX(path[i].p_hdr))) {
 			EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			err = -EIO;
 			goto cleanup;
 		}
-		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-			ext_debug("%d: move %d:%llu in new index %llu\n", i,
-					le32_to_cpu(path[i].p_idx->ei_block),
-					ext4_idx_pblock(path[i].p_idx),
-					newblock);
-			/*memmove(++fidx, path[i].p_idx++,
-				sizeof(struct ext4_extent_idx));
-			neh->eh_entries++;
-			BUG_ON(neh->eh_entries > neh->eh_max);*/
-			path[i].p_idx++;
-			m++;
-		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
 		if (m) {
-			memmove(++fidx, path[i].p_idx - m,
+			memmove(++fidx, path[i].p_idx,
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
@@ -1056,8 +1073,9 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				 unsigned int flags,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+		newext, &err, flags);
 	if (newblock == 0)
 		return err;
 
@@ -1140,8 +1159,9 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					struct ext4_extent *newext)
+				    unsigned int flags,
+				    struct ext4_ext_path *path,
+				    struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
 	if (EXT_HAS_FREE_INDEX(curp)) {
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
-		err = ext4_ext_split(handle, inode, path, newext, i);
+		err = ext4_ext_split(handle, inode, flags, path, newext, i);
 		if (err)
 			goto out;
 
@@ -1174,7 +1194,8 @@ repeat:
 			err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags,
+					    path, newext);
 		if (err)
 			goto out;
 
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *ex)
 {
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 }
 
 /*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	struct ext4_extent_header *eh;
+	unsigned int depth;
+	int merge_done = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	if (ex > EXT_FIRST_EXTENT(eh))
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+	if (!merge_done)
+		ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+	return ret;
+}
+
+/*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
  *
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
+	int flags = 0;
 
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
 	depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-			struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+	struct ext4_ext_cache *ex){
 	struct ext4_ext_cache *cex;
+	struct ext4_sb_info *sbi;
 	int ret = 0;
 
 	/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
+	sbi = EXT4_SB(inode->i_sb);
 
 	/* has cache valid data? */
 	if (cex->ec_len == 0)
 		goto errout;
 
 	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
 		ret = 1;
 	}
 errout:
+	if (!ret)
+		sbi->extent_cache_misses++;
+	else
+		sbi->extent_cache_hits++;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
 }
 
 /*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * extent pointer.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+			struct ext4_extent *ex)
+{
+	struct ext4_ext_cache cex;
+	int ret = 0;
+
+	if (ext4_ext_check_cache(inode, block, &cex)) {
+		ex->ee_block = cpu_to_le32(cex.ec_block);
+		ext4_ext_store_pblock(ex, cex.ec_start);
+		ex->ee_len = cpu_to_le16(cex.ec_len);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+
+/*
  * ext4_ext_rm_idx:
  * removes index from the index block.
  * It's used in truncate case only, thus all requests are for
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		/* head removal */
+		ext4_lblk_t num;
+		ext4_fsblk_t start;
+
+		num = to - from;
+		start = ext4_ext_pblock(ex);
+
+		ext_debug("free first %u blocks starting %llu\n", num, start);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
+
 	} else {
 		printk(KERN_INFO "strange request: removal(2) "
 		       "%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode:  The files inode
+ * @path:   The path to the leaf
+ * @start:  The first block to remove
+ * @end:    The last block to remove
+ */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start)
+		struct ext4_ext_path *path, ext4_lblk_t start,
+		ext4_lblk_t end)
 {
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	struct ext4_map_blocks map;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
-		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;
 
 		ext_debug("  border %u:%u\n", a, b);
 
-		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-			block = 0;
-			num = 0;
-			BUG();
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end <= ex_ee_block) {
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (a != ex_ee_block &&
+			   b != ex_ee_block + ex_ee_len - 1) {
+			/*
+			 * If this is a truncate, then this condition should
+			 * never happen because at least one of the end points
+			 * needs to be on the edge of the extent.
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				block = 0;
+				num = 0;
+				err = -EIO;
+				goto out;
+			}
+			/*
+			 * else this is a hole punch, so the extent needs to
+			 * be split since neither edge of the hole is on the
+			 * extent edge
+			 */
+			else{
+				map.m_pblk = ext4_ext_pblock(ex);
+				map.m_lblk = ex_ee_block;
+				map.m_len = b - ex_ee_block;
+
+				err = ext4_split_extent(handle,
+					inode, path, &map, 0,
+					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+					EXT4_GET_BLOCKS_PRE_IO);
+
+				if (err < 0)
+					goto out;
+
+				ex_ee_len = ext4_ext_get_actual_len(ex);
+
+				b = ex_ee_block+ex_ee_len - 1 < end ?
+					ex_ee_block+ex_ee_len - 1 : end;
+
+				/* Then remove tail of this extent */
+				block = ex_ee_block;
+				num = a - block;
+			}
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
 			block = ex_ee_block;
 			num = a - block;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
 			/* remove head of the extent */
-			block = a;
-			num = b - a;
-			/* there is no "make a hole" API yet */
-			BUG();
+			block = b;
+			num = ex_ee_block + ex_ee_len - b;
+
+			/*
+			 * If this is a truncate, this condition
+			 * should never happen
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		} else {
 			/* remove whole extent: excellent! */
 			block = ex_ee_block;
 			num = 0;
-			BUG_ON(a != ex_ee_block);
-			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+			if (a != ex_ee_block) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
+
+			if (b != ex_ee_block + ex_ee_len - 1) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		}
 
 		/*
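
The case analysis in the reworked ext4_ext_rm_leaf() above is easier to follow with the extent and the removal window reduced to plain intervals. A self-contained sketch of the classification only (an illustration, not the kernel code: after splitting in the RM_SPLIT case, the real code re-reads the extent length and falls through to a tail removal):

/* How an extent [ex_start, ex_start+ex_len-1] is classified against the
 * removal window [start, end], mirroring the branch order above. */
enum rm_case { RM_SKIP, RM_SPLIT, RM_TAIL, RM_HEAD, RM_WHOLE };

static enum rm_case classify_extent(unsigned int ex_start, unsigned int ex_len,
				    unsigned int start, unsigned int end)
{
	unsigned int ex_end = ex_start + ex_len - 1;
	unsigned int a = ex_start > start ? ex_start : start; /* first removed */
	unsigned int b = ex_end < end ? ex_end : end;	      /* last removed */

	if (end <= ex_start)
		return RM_SKIP;		/* extent lies beyond the hole */
	if (a != ex_start && b != ex_end)
		return RM_SPLIT;	/* hole strictly inside: split first */
	if (a != ex_start)
		return RM_TAIL;		/* keep the head, free the tail */
	if (b != ex_end)
		return RM_HEAD;		/* free the head, keep the tail */
	return RM_WHOLE;		/* extent entirely inside the hole */
}

For a truncate (end == EXT_MAX_BLOCK) only RM_TAIL and RM_WHOLE can occur, which is why the other branches now return -EIO instead of the old BUG()s when reached with that end value.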
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (num == 0) {
 			/* this extent is removed; mark slot entirely unused */
 			ext4_ext_store_pblock(ex, 0);
-			le16_add_cpu(&eh->eh_entries, -1);
+		} else if (block != ex_ee_block) {
+			/*
+			 * If this was a head removal, then we need to update
+			 * the physical block since it is now at a different
+			 * location
+			 */
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
 		}
 
 		ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;
 
+		/*
+		 * If the extent was completely released,
+		 * we need to remove it from the leaf
+		 */
+		if (num == 0) {
+			if (end != EXT_MAX_BLOCK) {
+				/*
+				 * For hole punching, we need to scoot all the
+				 * extents up when an extent is removed so that
+				 * we dont have blank extents in the middle
+				 */
+				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+					sizeof(struct ext4_extent));
+
+				/* Now get rid of the one at the end */
+				memset(EXT_LAST_EXTENT(eh), 0,
+					sizeof(struct ext4_extent));
+			}
+			le16_add_cpu(&eh->eh_entries, -1);
+		}
+
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
 		ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
 			/* this is leaf block */
-			err = ext4_ext_rm_leaf(handle, inode, path, start);
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					start, end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
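
The final hunk below (cut off partway through the second function) adds ext4_split_extent_at(), which splits one extent in place, and the ext4_split_extent() wrapper, which applies it to both ends of a mapped range and so may produce up to three extents. The two-extent arithmetic is compact enough to state on its own; a plain-integer sketch matching the newblock and ee_len computations in the added code:

/* Splitting [ee_block, ee_block+ee_len) at 'split': the first half keeps
 * the original logical/physical start, the second half is offset into the
 * extent by the same amount logically and physically.  Illustration only. */
struct half { unsigned int lblk; unsigned long long pblk; unsigned int len; };

static void split_at(unsigned int ee_block, unsigned long long ee_pblk,
		     unsigned int ee_len, unsigned int split,
		     struct half *first, struct half *second)
{
	first->lblk  = ee_block;
	first->pblk  = ee_pblk;
	first->len   = split - ee_block;

	second->lblk = split;
	second->pblk = split - ee_block + ee_pblk;   /* the hunk's 'newblock' */
	second->len  = ee_len - (split - ee_block);
}

If inserting the second half fails with ENOSPC and the caller set EXT4_EXT_MAY_ZEROOUT, the added code instead restores the original extent length and zeroes the whole extent on disk, so a failed split never leaves the tree half-updated.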
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2529 return ret; 2739 return ret;
2530} 2740}
2531 2741
2742/*
2743 * used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
2756 * @split: the logical block where the extent is splitted.
2757 * @split_flags: indicates if the extent could be zeroout if split fails, and
2758 * the states(init or uninit) of new extents.
2759 * @flags: flags used to insert new extent to extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2763 * of which are deterimined by split_flag.
2764 *
2765 * There are two cases:
2766 * a> the extent are splitted into two extent.
2767 * b> split is not needed, and just mark the extent.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extents_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2787
2788 ext4_ext_show_leaf(inode, path);
2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2795
2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2797
2798 err = ext4_ext_get_access(handle, inode, path + depth);
2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2810 else
2811 ext4_ext_mark_initialized(ex);
2812
2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2814 ext4_ext_try_to_merge(inode, path, ex);
2815
2816 err = ext4_ext_dirty(handle, inode, path + depth);
2817 goto out;
2818 }
2819
2820 /* case a */
2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2822 ex->ee_len = cpu_to_le16(split - ee_block);
2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2824 ext4_ext_mark_uninitialized(ex);
2825
2826 /*
2827	 * the path may lead to a new leaf, not to the original leaf
2828	 * any more, after ext4_ext_insert_extent() returns
2829 */
2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2833
2834 ex2 = &newex;
2835 ex2->ee_block = cpu_to_le32(split);
2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2837 ext4_ext_store_pblock(ex2, newblock);
2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2839 ext4_ext_mark_uninitialized(ex2);
2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
2847		ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
2863
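
The arithmetic in ext4_split_extent_at() is compact enough to restate: the first half keeps the extent's logical and physical start and is truncated at @split, while the second half begins at @split with its physical block shifted by the same distance. A minimal user-space model of just that arithmetic follows; the struct and helper are invented for illustration and are not kernel APIs.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_extent {
	unsigned int lblk;		/* first logical block */
	unsigned int len;		/* number of blocks */
	unsigned long long pblk;	/* first physical block */
	bool uninit;
};

/* Split [lblk, lblk + len) at 'split' into ex (first half) and ex2. */
static void toy_split_at(struct toy_extent *ex, struct toy_extent *ex2,
			 unsigned int split, bool uninit1, bool uninit2)
{
	assert(split > ex->lblk && split < ex->lblk + ex->len);
	ex2->lblk = split;
	ex2->len = ex->len - (split - ex->lblk);
	ex2->pblk = ex->pblk + (split - ex->lblk);	/* 'newblock' in the patch */
	ex2->uninit = uninit2;				/* mirrors MARK_UNINIT2 */
	ex->len = split - ex->lblk;
	ex->uninit = uninit1;				/* mirrors MARK_UNINIT1 */
}

int main(void)
{
	struct toy_extent ex = { 100, 10, 5000, true }, ex2;

	toy_split_at(&ex, &ex2, 104, true, true);
	printf("first %u+%u @%llu, second %u+%u @%llu\n",
	       ex.lblk, ex.len, ex.pblk, ex2.lblk, ex2.len, ex2.pblk);
	return 0;
}
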
2864/*
2865 * ext4_split_extent() splits an extent and marks the extent which is
2866 * covered by @map as split_flag indicates
2867 *
2868 * It may result in splitting the extent into multiple extents (up to three)
2869 * There are three possibilities:
2870 *   a> There is no split required
2871 *   b> Splits into two extents: the split happens at either end of the extent
2872 *   c> Splits into three extents: someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2906 }
2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2929}
2930
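
The two-pass shape of ext4_split_extent() is easiest to see as a cut-point computation: the first ext4_split_extent_at() call cuts at the right edge of the mapped range when that edge is interior to the extent, the second cuts at the left edge, so at most two cuts and three pieces result. A hedged stand-alone sketch of only the boundary logic (all names invented):

#include <stdio.h>

static int toy_cut_points(unsigned ee_block, unsigned ee_len,
			  unsigned m_lblk, unsigned m_len,
			  unsigned cuts[2])
{
	int n = 0;

	/* first pass: cut at the end of the mapped range if interior */
	if (m_lblk + m_len < ee_block + ee_len)
		cuts[n++] = m_lblk + m_len;
	/* second pass: cut at the start of the mapped range if interior */
	if (m_lblk > ee_block)
		cuts[n++] = m_lblk;
	return n;	/* 0 cuts => case a, 1 => case b, 2 => case c */
}

int main(void)
{
	unsigned cuts[2];
	/* map [103, 107) inside extent [100, 110): two interior cuts */
	int n = toy_cut_points(100, 10, 103, 4, cuts);

	printf("%d cut(s) -> up to %d extent(s)\n", n, n + 1);
	return 0;
}
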
2532#define EXT4_EXT_ZERO_LEN 7 2931#define EXT4_EXT_ZERO_LEN 7
2533/* 2932/*
2534 * This function is called by ext4_ext_map_blocks() if someone tries to write 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2545 struct ext4_map_blocks *map, 2944 struct ext4_map_blocks *map,
2546 struct ext4_ext_path *path) 2945 struct ext4_ext_path *path)
2547{ 2946{
2548 struct ext4_extent *ex, newex, orig_ex; 2947 struct ext4_map_blocks split_map;
2549 struct ext4_extent *ex1 = NULL; 2948 struct ext4_extent zero_ex;
2550 struct ext4_extent *ex2 = NULL; 2949 struct ext4_extent *ex;
2551 struct ext4_extent *ex3 = NULL;
2552 struct ext4_extent_header *eh;
2553 ext4_lblk_t ee_block, eof_block; 2950 ext4_lblk_t ee_block, eof_block;
2554 unsigned int allocated, ee_len, depth; 2951 unsigned int allocated, ee_len, depth;
2555 ext4_fsblk_t newblock;
2556 int err = 0; 2952 int err = 0;
2557 int ret = 0; 2953 int split_flag = 0;
2558 int may_zeroout;
2559 2954
2560 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2561 "block %llu, max_blocks %u\n", inode->i_ino, 2956 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2567 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2568 2963
2569 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2570 eh = path[depth].p_hdr;
2571 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2572 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2573 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2574 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2576
2577 ex2 = ex;
2578 orig_ex.ee_block = ex->ee_block;
2579 orig_ex.ee_len = cpu_to_le16(ee_len);
2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2581 2969
2970 WARN_ON(map->m_lblk < ee_block);
2582 /* 2971 /*
2583 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
2584	 * zeroout only if extent is fully inside i_size or new_size. 2973	 * zeroout only if extent is fully inside i_size or new_size.
2585 */ 2974 */
2586 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2587 2976
2588 err = ext4_ext_get_access(handle, inode, path + depth);
2589 if (err)
2590 goto out;
2591	/* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */ 2977	/* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */
2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2593 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2594 if (err) 2981 if (err)
2595 goto fix_extent_len;
2596 /* update the extent length and mark as initialized */
2597 ex->ee_block = orig_ex.ee_block;
2598 ex->ee_len = orig_ex.ee_len;
2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2600 ext4_ext_dirty(handle, inode, path + depth);
2601 /* zeroed the full extent */
2602 return allocated;
2603 }
2604
2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2606 if (map->m_lblk > ee_block) {
2607 ex1 = ex;
2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2609 ext4_ext_mark_uninitialized(ex1);
2610 ex2 = &newex;
2611 }
2612 /*
2613 * for sanity, update the length of the ex2 extent before
2614 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2615 * overlap of blocks.
2616 */
2617 if (!ex1 && allocated > map->m_len)
2618 ex2->ee_len = cpu_to_le16(map->m_len);
2619 /* ex3: to ee_block + ee_len : uninitialised */
2620 if (allocated > map->m_len) {
2621 unsigned int newdepth;
2622 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2624 /*
2625 * map->m_lblk == ee_block is handled by the zerouout
2626 * at the beginning.
2627 * Mark first half uninitialized.
2628 * Mark second half initialized and zero out the
2629 * initialized extent
2630 */
2631 ex->ee_block = orig_ex.ee_block;
2632 ex->ee_len = cpu_to_le16(ee_len - allocated);
2633 ext4_ext_mark_uninitialized(ex);
2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2635 ext4_ext_dirty(handle, inode, path + depth);
2636
2637 ex3 = &newex;
2638 ex3->ee_block = cpu_to_le32(map->m_lblk);
2639 ext4_ext_store_pblock(ex3, newblock);
2640 ex3->ee_len = cpu_to_le16(allocated);
2641 err = ext4_ext_insert_extent(handle, inode, path,
2642 ex3, 0);
2643 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err)
2646 goto fix_extent_len;
2647 ex->ee_block = orig_ex.ee_block;
2648 ex->ee_len = orig_ex.ee_len;
2649 ext4_ext_store_pblock(ex,
2650 ext4_ext_pblock(&orig_ex));
2651 ext4_ext_dirty(handle, inode, path + depth);
2652 /* blocks available from map->m_lblk */
2653 return allocated;
2654
2655 } else if (err)
2656 goto fix_extent_len;
2657
2658 /*
2659 * We need to zero out the second half because
2660 * an fallocate request can update file size and
2661 * converting the second half to initialized extent
2662 * implies that we can leak some junk data to user
2663 * space.
2664 */
2665 err = ext4_ext_zeroout(inode, ex3);
2666 if (err) {
2667 /*
2668 * We should actually mark the
2669 * second half as uninit and return error
2670 * Insert would have changed the extent
2671 */
2672 depth = ext_depth(inode);
2673 ext4_ext_drop_refs(path);
2674 path = ext4_ext_find_extent(inode, map->m_lblk,
2675 path);
2676 if (IS_ERR(path)) {
2677 err = PTR_ERR(path);
2678 return err;
2679 }
2680 /* get the second half extent details */
2681 ex = path[depth].p_ext;
2682 err = ext4_ext_get_access(handle, inode,
2683 path + depth);
2684 if (err)
2685 return err;
2686 ext4_ext_mark_uninitialized(ex);
2687 ext4_ext_dirty(handle, inode, path + depth);
2688 return err;
2689 }
2690
2691 /* zeroed the second half */
2692 return allocated;
2693 }
2694 ex3 = &newex;
2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2696 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2698 ext4_ext_mark_uninitialized(ex3);
2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2700 if (err == -ENOSPC && may_zeroout) {
2701 err = ext4_ext_zeroout(inode, &orig_ex);
2702 if (err)
2703 goto fix_extent_len;
2704 /* update the extent length and mark as initialized */
2705 ex->ee_block = orig_ex.ee_block;
2706 ex->ee_len = orig_ex.ee_len;
2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2708 ext4_ext_dirty(handle, inode, path + depth);
2709 /* zeroed the full extent */
2710 /* blocks available from map->m_lblk */
2711 return allocated;
2712
2713 } else if (err)
2714 goto fix_extent_len;
2715 /*
2716 * The depth, and hence eh & ex might change
2717 * as part of the insert above.
2718 */
2719 newdepth = ext_depth(inode);
2720 /*
2721 * update the extent length after successful insert of the
2722 * split extent
2723 */
2724 ee_len -= ext4_ext_get_actual_len(ex3);
2725 orig_ex.ee_len = cpu_to_le16(ee_len);
2726 may_zeroout = ee_block + ee_len <= eof_block;
2727
2728 depth = newdepth;
2729 ext4_ext_drop_refs(path);
2730 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2731 if (IS_ERR(path)) {
2732 err = PTR_ERR(path);
2733 goto out; 2982 goto out;
2734 }
2735 eh = path[depth].p_hdr;
2736 ex = path[depth].p_ext;
2737 if (ex2 != &newex)
2738 ex2 = ex;
2739 2983
2740 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2741 if (err) 2985 if (err)
2742 goto out; 2986 goto out;
2743 2987 ext4_ext_mark_initialized(ex);
2744 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2745 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2746 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2747 * to insert a extent in the middle zerout directly
2748 * otherwise give the extent a chance to merge to left
2749 */
2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2751 map->m_lblk != ee_block && may_zeroout) {
2752 err = ext4_ext_zeroout(inode, &orig_ex);
2753 if (err)
2754 goto fix_extent_len;
2755 /* update the extent length and mark as initialized */
2756 ex->ee_block = orig_ex.ee_block;
2757 ex->ee_len = orig_ex.ee_len;
2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2759 ext4_ext_dirty(handle, inode, path + depth);
2760 /* zero out the first half */
2761 /* blocks available from map->m_lblk */
2762 return allocated;
2763 }
2764 }
2765 /*
2766 * If there was a change of depth as part of the
2767 * insertion of ex3 above, we need to update the length
2768 * of the ex1 extent again here
2769 */
2770 if (ex1 && ex1 != ex) {
2771 ex1 = ex;
2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2773 ext4_ext_mark_uninitialized(ex1);
2774 ex2 = &newex;
2775 }
2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2777 ex2->ee_block = cpu_to_le32(map->m_lblk);
2778 ext4_ext_store_pblock(ex2, newblock);
2779 ex2->ee_len = cpu_to_le16(allocated);
2780 if (ex2 != ex)
2781 goto insert;
2782 /*
2783 * New (initialized) extent starts from the first block
2784 * in the current extent. i.e., ex2 == ex
2785 * We have to see if it can be merged with the extent
2786 * on the left.
2787 */
2788 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2789 /*
2790 * To merge left, pass "ex2 - 1" to try_to_merge(),
2791 * since it merges towards right _only_.
2792 */
2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2794 if (ret) {
2795 err = ext4_ext_correct_indexes(handle, inode, path);
2796 if (err)
2797 goto out;
2798 depth = ext_depth(inode);
2799 ex2--;
2800 }
2801 } 2991 }
2992
2802 /* 2993 /*
2803 * Try to Merge towards right. This might be required 2994 * four cases:
2804 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2805 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
2998	 * 4. split the extent into two extents without zeroout.
2806 */ 2999 */
2807 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2809 if (ret) { 3002
2810 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2811 if (err) 3013 if (err)
2812 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2813 } 3035 }
2814 } 3036 }
2815 /* Mark modified extent as dirty */ 3037
2816 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2817 goto out; 3039 &split_map, split_flag, 0);
2818insert: 3040 if (allocated < 0)
2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2820 if (err == -ENOSPC && may_zeroout) { 3042
2821 err = ext4_ext_zeroout(inode, &orig_ex);
2822 if (err)
2823 goto fix_extent_len;
2824 /* update the extent length and mark as initialized */
2825 ex->ee_block = orig_ex.ee_block;
2826 ex->ee_len = orig_ex.ee_len;
2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2828 ext4_ext_dirty(handle, inode, path + depth);
2829 /* zero out the first half */
2830 return allocated;
2831 } else if (err)
2832 goto fix_extent_len;
2833out: 3043out:
2834 ext4_ext_show_leaf(inode, path);
2835 return err ? err : allocated; 3044 return err ? err : allocated;
2836
2837fix_extent_len:
2838 ex->ee_block = orig_ex.ee_block;
2839 ex->ee_len = orig_ex.ee_len;
2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2841 ext4_ext_mark_uninitialized(ex);
2842 ext4_ext_dirty(handle, inode, path + depth);
2843 return err;
2844} 3045}
2845 3046
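
After this rewrite, ext4_ext_convert_to_initialized() mostly just picks one of the four cases from its comment and hands the work to ext4_split_extent(). A rough user-space rendering of the case selection; TOY_ZERO_LEN stands in for EXT4_EXT_ZERO_LEN, the names are invented, and only the comparisons mirror the patch:

#include <stdio.h>

#define TOY_ZERO_LEN 7

static const char *toy_pick_case(unsigned ee_block, unsigned allocated,
				 unsigned m_lblk, unsigned m_len,
				 int may_zeroout)
{
	if (allocated > m_len) {
		if (allocated <= TOY_ZERO_LEN && may_zeroout)
			return "case 3: zero out the tail, split once";
		if (m_lblk - ee_block + m_len < TOY_ZERO_LEN && may_zeroout)
			return "case 2: zero out the head, split once";
		return "case 1: split into three extents";
	}
	return "case 4: split without any zeroout";
}

int main(void)
{
	/* extent [100, 120), write at [105, 109): no zeroout fits */
	printf("%s\n", toy_pick_case(100, 20, 105, 4, 1));
	return 0;
}
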
2846/* 3047/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2871 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2872 int flags) 3073 int flags)
2873{ 3074{
2874 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2875 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2876 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2877 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2878 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2879 unsigned int allocated, ee_len, depth;
2880 ext4_fsblk_t newblock;
2881 int err = 0;
2882 int may_zeroout;
2883 3080
2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2885 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2889 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2890 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2891 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2892
2893 depth = ext_depth(inode);
2894 ex = path[depth].p_ext;
2895 ee_block = le32_to_cpu(ex->ee_block);
2896 ee_len = ext4_ext_get_actual_len(ex);
2897 allocated = ee_len - (map->m_lblk - ee_block);
2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2899
2900 ex2 = ex;
2901 orig_ex.ee_block = ex->ee_block;
2902 orig_ex.ee_len = cpu_to_le16(ee_len);
2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2904
2905 /* 3089 /*
2906 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
2907	 * zeroout only if extent is fully inside i_size or new_size. 3091	 * zeroout only if extent is fully inside i_size or new_size.
2908 */ 3092 */
2909 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2910 3094 ex = path[depth].p_ext;
2911 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2912 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2913 * block where the write begins, and the write completely
2914 * covers the extent, then we don't need to split it.
2915 */
2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2917 return allocated;
2918
2919 err = ext4_ext_get_access(handle, inode, path + depth);
2920 if (err)
2921 goto out;
2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2923 if (map->m_lblk > ee_block) {
2924 ex1 = ex;
2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2926 ext4_ext_mark_uninitialized(ex1);
2927 ex2 = &newex;
2928 }
2929 /*
2930 * for sanity, update the length of the ex2 extent before
2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2932 * overlap of blocks.
2933 */
2934 if (!ex1 && allocated > map->m_len)
2935 ex2->ee_len = cpu_to_le16(map->m_len);
2936 /* ex3: to ee_block + ee_len : uninitialised */
2937 if (allocated > map->m_len) {
2938 unsigned int newdepth;
2939 ex3 = &newex;
2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2941 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2943 ext4_ext_mark_uninitialized(ex3);
2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2945 if (err == -ENOSPC && may_zeroout) {
2946 err = ext4_ext_zeroout(inode, &orig_ex);
2947 if (err)
2948 goto fix_extent_len;
2949 /* update the extent length and mark as initialized */
2950 ex->ee_block = orig_ex.ee_block;
2951 ex->ee_len = orig_ex.ee_len;
2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2953 ext4_ext_dirty(handle, inode, path + depth);
2954 /* zeroed the full extent */
2955 /* blocks available from map->m_lblk */
2956 return allocated;
2957
2958 } else if (err)
2959 goto fix_extent_len;
2960 /*
2961 * The depth, and hence eh & ex might change
2962 * as part of the insert above.
2963 */
2964 newdepth = ext_depth(inode);
2965 /*
2966 * update the extent length after successful insert of the
2967 * split extent
2968 */
2969 ee_len -= ext4_ext_get_actual_len(ex3);
2970 orig_ex.ee_len = cpu_to_le16(ee_len);
2971 may_zeroout = ee_block + ee_len <= eof_block;
2972
2973 depth = newdepth;
2974 ext4_ext_drop_refs(path);
2975 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2976 if (IS_ERR(path)) {
2977 err = PTR_ERR(path);
2978 goto out;
2979 }
2980 ex = path[depth].p_ext;
2981 if (ex2 != &newex)
2982 ex2 = ex;
2983 3097
2984 err = ext4_ext_get_access(handle, inode, path + depth); 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2985 if (err) 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
2986 goto out;
2987 3100
2988 allocated = map->m_len; 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
2989 } 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
2990 /*
2991 * If there was a change of depth as part of the
2992 * insertion of ex3 above, we need to update the length
2993 * of the ex1 extent again here
2994 */
2995 if (ex1 && ex1 != ex) {
2996 ex1 = ex;
2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2998 ext4_ext_mark_uninitialized(ex1);
2999 ex2 = &newex;
3000 }
3001 /*
3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3003 * using direct I/O, uninitialised still.
3004 */
3005 ex2->ee_block = cpu_to_le32(map->m_lblk);
3006 ext4_ext_store_pblock(ex2, newblock);
3007 ex2->ee_len = cpu_to_le16(allocated);
3008 ext4_ext_mark_uninitialized(ex2);
3009 if (ex2 != ex)
3010 goto insert;
3011 /* Mark modified extent as dirty */
3012 err = ext4_ext_dirty(handle, inode, path + depth);
3013 ext_debug("out here\n");
3014 goto out;
3015insert:
3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3017 if (err == -ENOSPC && may_zeroout) {
3018 err = ext4_ext_zeroout(inode, &orig_ex);
3019 if (err)
3020 goto fix_extent_len;
3021 /* update the extent length and mark as initialized */
3022 ex->ee_block = orig_ex.ee_block;
3023 ex->ee_len = orig_ex.ee_len;
3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3025 ext4_ext_dirty(handle, inode, path + depth);
3026 /* zero out the first half */
3027 return allocated;
3028 } else if (err)
3029 goto fix_extent_len;
3030out:
3031 ext4_ext_show_leaf(inode, path);
3032 return err ? err : allocated;
3033
3034fix_extent_len:
3035 ex->ee_block = orig_ex.ee_block;
3036 ex->ee_len = orig_ex.ee_len;
3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3038 ext4_ext_mark_uninitialized(ex);
3039 ext4_ext_dirty(handle, inode, path + depth);
3040 return err;
3041} 3103}
3104
3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3043 struct inode *inode, 3106 struct inode *inode,
3044 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3047 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3048 int depth; 3111 int depth;
3049 int err = 0; 3112 int err = 0;
3050 int ret = 0;
3051 3113
3052 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3053 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3054 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3055 3117
3118	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3056 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3057 if (err) 3124 if (err)
3058 goto out; 3125 goto out;
3059 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3060 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3061 3128
3062 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3063 * We have to see if it can be merged with the extent 3130 * borders are not changed
3064 * on the left.
3065 */
3066 if (ex > EXT_FIRST_EXTENT(eh)) {
3067 /*
3068 * To merge left, pass "ex - 1" to try_to_merge(),
3069 * since it merges towards right _only_.
3070 */
3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3072 if (ret) {
3073 err = ext4_ext_correct_indexes(handle, inode, path);
3074 if (err)
3075 goto out;
3076 depth = ext_depth(inode);
3077 ex--;
3078 }
3079 }
3080 /*
3081 * Try to Merge towards right.
3082 */ 3131 */
3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3084 if (ret) { 3133
3085 err = ext4_ext_correct_indexes(handle, inode, path);
3086 if (err)
3087 goto out;
3088 depth = ext_depth(inode);
3089 }
3090 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3091 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3092out: 3136out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3302 ext4_fsblk_t newblock = 0; 3346 ext4_fsblk_t newblock = 0;
3303 int err = 0, depth, ret; 3347 int err = 0, depth, ret;
3304 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3305 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3307 3354
3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3309 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3311 3358
3312 /* check in cache */ 3359 /* check in cache */
3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3316 /* 3364 /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3376 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3377 3425
3378 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3379 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3380 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3381 ee_len, ee_start); 3429 * in the cache
3382 goto out; 3430 */
3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3383 } 3440 }
3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3385 inode, map, path, flags, allocated, 3442 /*
3386 newblock); 3443 * Punch out the map length, but only to the
3387 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
3450			 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3388 } 3504 }
3389 } 3505 }
3390 3506
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3446 else 3562 else
3447 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3448 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3450 if (!newblock) 3568 if (!newblock)
3451 goto out2; 3569 goto out2;
@@ -3529,7 +3647,11 @@ out2:
3529 } 3647 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated); 3649 newblock, map->m_len, err ? err : allocated);
3532 return err ? err : allocated; 3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3533} 3655}
3534 3656
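
The punch-out branch added to ext4_ext_map_blocks() caps the per-call work twice: first to the overlap of the request with the located extent, then to the largest length a single uninitialized extent can describe. The clamping in stand-alone form; the constant mirrors EXT_UNINIT_MAX_LEN, which is (1 << 15) - 1, and the helper name is invented:

#include <stdio.h>

#define TOY_UNINIT_MAX_LEN 32767	/* stands in for EXT_UNINIT_MAX_LEN */

static unsigned toy_punched_out(unsigned allocated, unsigned m_len)
{
	unsigned n = allocated < m_len ? allocated : m_len;

	/* extents converted to uninitialized must fit in one */
	if (n > TOY_UNINIT_MAX_LEN)
		n = TOY_UNINIT_MAX_LEN;
	return n;
}

int main(void)
{
	printf("%u\n", toy_punched_out(40000, 50000));	/* prints 32767 */
	return 0;
}
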
3535void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3577 3699
3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3580 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
3581 3703
3582 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3583 * transaction synchronous. 3705 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3585 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3586 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3587 3709
3588out_stop:
3589 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3590 /* 3713 /*
3591 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3592 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3651 struct ext4_map_blocks map; 3774 struct ext4_map_blocks map;
3652 unsigned int credits, blkbits = inode->i_blkbits; 3775 unsigned int credits, blkbits = inode->i_blkbits;
3653 3776
3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3655 if (mode & ~FALLOC_FL_KEEP_SIZE)
3656 return -EOPNOTSUPP;
3657
3658 /* 3777 /*
3659 * currently supporting (pre)allocate mode for extent-based 3778 * currently supporting (pre)allocate mode for extent-based
3660 * files _only_ 3779 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3663 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3664 3783
3784 /* Return error if mode is not supported */
3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3790
3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3666 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3667 /* 3793 /*
@@ -3691,7 +3817,8 @@ retry:
3691 break; 3817 break;
3692 } 3818 }
3693 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3695 if (ret <= 0) { 3822 if (ret <= 0) {
3696#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3697 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3822 pgoff_t last_offset; 3949 pgoff_t last_offset;
3823 pgoff_t offset; 3950 pgoff_t offset;
3824 pgoff_t index; 3951 pgoff_t index;
3952 pgoff_t start_index = 0;
3825 struct page **pages = NULL; 3953 struct page **pages = NULL;
3826 struct buffer_head *bh = NULL; 3954 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL; 3955 struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
3848 kfree(pages); 3976 kfree(pages);
3849 return EXT_CONTINUE; 3977 return EXT_CONTINUE;
3850 } 3978 }
3979 index = 0;
3851 3980
3981next_page:
3852 /* Try to find the 1st mapped buffer. */ 3982 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >> 3983 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3854 blksize_bits; 3984 blksize_bits;
3855 if (!page_has_buffers(pages[0])) 3985 if (!page_has_buffers(pages[index]))
3856 goto out; 3986 goto out;
3857 head = page_buffers(pages[0]); 3987 head = page_buffers(pages[index]);
3858 if (!head) 3988 if (!head)
3859 goto out; 3989 goto out;
3860 3990
3991 index++;
3861 bh = head; 3992 bh = head;
3862 do { 3993 do {
3863 if (buffer_mapped(bh)) { 3994 if (end >= newex->ec_block +
3995 newex->ec_len)
3996 /* The buffer is out of
3997 * the request range.
3998 */
3999 goto out;
4000
4001 if (buffer_mapped(bh) &&
4002 end >= newex->ec_block) {
4003 start_index = index - 1;
3864 /* get the 1st mapped buffer. */ 4004 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer; 4005 goto found_mapped_buffer;
3872 } 4006 }
4007
3873 bh = bh->b_this_page; 4008 bh = bh->b_this_page;
3874 end++; 4009 end++;
3875 } while (bh != head); 4010 } while (bh != head);
3876 4011
3877 /* No mapped buffer found. */ 4012 /* No mapped buffer in the range found in this page,
3878		goto out; 4013				 * We need to look up the next page.
4014 */
4015 if (index >= ret) {
4016 /* There is no page left, but we need to limit
4017 * newex->ec_len.
4018 */
4019 newex->ec_len = end - newex->ec_block;
4020 goto out;
4021 }
4022 goto next_page;
3879 } else { 4023 } else {
3880		/* Find contiguous delayed buffers. */ 4024		/* Find contiguous delayed buffers. */
3881 if (ret > 0 && pages[0]->index == last_offset) 4025 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]); 4026 head = page_buffers(pages[0]);
3883 bh = head; 4027 bh = head;
4028 index = 1;
4029 start_index = 0;
3884 } 4030 }
3885 4031
3886found_mapped_buffer: 4032found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
3903 end++; 4049 end++;
3904 } while (bh != head); 4050 } while (bh != head);
3905 4051
3906 for (index = 1; index < ret; index++) { 4052 for (; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) { 4053 if (!page_has_buffers(pages[index])) {
3908 bh = NULL; 4054 bh = NULL;
3909 break; 4055 break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
3913 bh = NULL; 4059 bh = NULL;
3914 break; 4060 break;
3915 } 4061 }
4062
3916 if (pages[index]->index != 4063 if (pages[index]->index !=
3917 pages[0]->index + index) { 4064 pages[start_index]->index + index
4065 - start_index) {
3918 /* Blocks are not contiguous. */ 4066 /* Blocks are not contiguous. */
3919 bh = NULL; 4067 bh = NULL;
3920 break; 4068 break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
4006 return (error < 0 ? error : 0); 4154 return (error < 0 ? error : 0);
4007} 4155}
4008 4156
4157/*
4158 * ext4_ext_punch_hole
4159 *
4160 * Punches a hole of "length" bytes in a file starting
4161 * at byte "offset"
4162 *
4163 * @file: The file to punch a hole in
4164 * @offset: The starting byte offset of the hole
4165 * @length: The length of the hole
4166 *
4167 * Returns the number of blocks removed or negative on error
4168 */
4169int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170{
4171 struct inode *inode = file->f_path.dentry->d_inode;
4172 struct super_block *sb = inode->i_sb;
4173 struct ext4_ext_cache cache_ex;
4174 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 struct address_space *mapping = inode->i_mapping;
4176 struct ext4_map_blocks map;
4177 handle_t *handle;
4178 loff_t first_block_offset, last_block_offset, block_len;
4179 loff_t first_page, last_page, first_page_offset, last_page_offset;
4180 int ret, credits, blocks_released, err = 0;
4181
4182 first_block = (offset + sb->s_blocksize - 1) >>
4183 EXT4_BLOCK_SIZE_BITS(sb);
4184 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4185
4186 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4187 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4188
4189 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4190 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4191
4192 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4193 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4194
4195 /*
4196 * Write out all dirty pages to avoid race conditions
4197 * Then release them.
4198 */
4199 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4200 err = filemap_write_and_wait_range(mapping,
4201 first_page_offset == 0 ? 0 : first_page_offset-1,
4202 last_page_offset);
4203
4204 if (err)
4205 return err;
4206 }
4207
4208 /* Now release the pages */
4209 if (last_page_offset > first_page_offset) {
4210 truncate_inode_pages_range(mapping, first_page_offset,
4211 last_page_offset-1);
4212 }
4213
4214 /* finish any pending end_io work */
4215 ext4_flush_completed_IO(inode);
4216
4217 credits = ext4_writepage_trans_blocks(inode);
4218 handle = ext4_journal_start(inode, credits);
4219 if (IS_ERR(handle))
4220 return PTR_ERR(handle);
4221
4222 err = ext4_orphan_add(handle, inode);
4223 if (err)
4224 goto out;
4225
4226 /*
4227	 * Now we need to zero out the non-block-aligned data.
4228	 * If the hole lies within a single block, just
4229	 * zero out the middle of that block
4230 */
4231 if (first_block > last_block)
4232 ext4_block_zero_page_range(handle, mapping, offset, length);
4233 else {
4234 /* zero out the head of the hole before the first block */
4235 block_len = first_block_offset - offset;
4236 if (block_len > 0)
4237 ext4_block_zero_page_range(handle, mapping,
4238 offset, block_len);
4239
4240 /* zero out the tail of the hole after the last block */
4241 block_len = offset + length - last_block_offset;
4242 if (block_len > 0) {
4243 ext4_block_zero_page_range(handle, mapping,
4244 last_block_offset, block_len);
4245 }
4246 }
4247
4248 /* If there are no blocks to remove, return now */
4249 if (first_block >= last_block)
4250 goto out;
4251
4252 down_write(&EXT4_I(inode)->i_data_sem);
4253 ext4_ext_invalidate_cache(inode);
4254 ext4_discard_preallocations(inode);
4255
4256 /*
4257 * Loop over all the blocks and identify blocks
4258 * that need to be punched out
4259 */
4260 iblock = first_block;
4261 blocks_released = 0;
4262 while (iblock < last_block) {
4263 max_blocks = last_block - iblock;
4264 num_blocks = 1;
4265 memset(&map, 0, sizeof(map));
4266 map.m_lblk = iblock;
4267 map.m_len = max_blocks;
4268 ret = ext4_ext_map_blocks(handle, inode, &map,
4269 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4270
4271 if (ret > 0) {
4272 blocks_released += ret;
4273 num_blocks = ret;
4274 } else if (ret == 0) {
4275 /*
4276 * If map blocks could not find the block,
4277 * then it is in a hole. If the hole was
4278 * not already cached, then map blocks should
4279 * put it in the cache. So we can get the hole
4280 * out of the cache
4281 */
4282 memset(&cache_ex, 0, sizeof(cache_ex));
4283 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4284 !cache_ex.ec_start) {
4285
4286 /* The hole is cached */
4287 num_blocks = cache_ex.ec_block +
4288 cache_ex.ec_len - iblock;
4289
4290 } else {
4291 /* The block could not be identified */
4292 err = -EIO;
4293 break;
4294 }
4295 } else {
4296 /* Map blocks error */
4297 err = ret;
4298 break;
4299 }
4300
4301 if (num_blocks == 0) {
4302 /* This condition should never happen */
4303 ext_debug("Block lookup failed");
4304 err = -EIO;
4305 break;
4306 }
4307
4308 iblock += num_blocks;
4309 }
4310
4311 if (blocks_released > 0) {
4312 ext4_ext_invalidate_cache(inode);
4313 ext4_discard_preallocations(inode);
4314 }
4315
4316 if (IS_SYNC(inode))
4317 ext4_handle_sync(handle);
4318
4319 up_write(&EXT4_I(inode)->i_data_sem);
4320
4321out:
4322 ext4_orphan_del(handle, inode);
4323 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4324 ext4_mark_inode_dirty(handle, inode);
4325 ext4_journal_stop(handle);
4326 return err;
4327}
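
The boundary arithmetic at the top of ext4_ext_punch_hole() rounds the hole's start up and its end down to block boundaries, so only whole blocks are freed while the partial edges are zeroed in place by ext4_block_zero_page_range(). A user-space model of that arithmetic (names invented):

#include <stdio.h>

static void toy_hole_blocks(unsigned long long offset,
			    unsigned long long length,
			    unsigned bits,
			    unsigned long long *first,
			    unsigned long long *last)
{
	unsigned long long blocksize = 1ULL << bits;

	*first = (offset + blocksize - 1) >> bits;	/* round up   */
	*last  = (offset + length) >> bits;		/* round down */
}

int main(void)
{
	unsigned long long first, last;

	/* 4 KiB blocks: only block 1 lies fully inside [1000, 11000) */
	toy_hole_blocks(1000, 10000, 12, &first, &last);
	printf("punch blocks [%llu, %llu)\n", first, last);	/* [1, 2) */
	return 0;
}
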
4009int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4328int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4010 __u64 start, __u64 len) 4329 __u64 start, __u64 len)
4011{ 4330{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4042 4361
4043 return error; 4362 return error;
4044} 4363}
4045
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
272}; 272};
273 273
274const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
275 .truncate = ext4_truncate,
276 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
277 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
278#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
36 36
37static void dump_completed_IO(struct inode * inode) 37static void dump_completed_IO(struct inode * inode)
38{ 38{
39#ifdef EXT4_DEBUG 39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after; 40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1; 41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags; 42 unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 int ret; 173 int ret;
174 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
175 176
176 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
177 178
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
211 } 212 }
212 213
213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
214 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
215 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
216 * When the journal is on a different device than the 217 needs_barrier = true;
217 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
218 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
219 * will take care of issuing the barrier. In 220 if (needs_barrier)
220 * data=journal, all of the data blocks are written to
221 * the journal device.)
222 */
223 if (ext4_should_writeback_data(inode) &&
224 (journal->j_fs_dev != journal->j_dev) &&
225 (journal->j_flags & JBD2_BARRIER))
226 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
227 NULL);
228 ret = jbd2_log_wait_commit(journal, commit_tid);
229 } else if (journal->j_flags & JBD2_BARRIER)
230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out: 222 out:
232 trace_ext4_sync_file_exit(inode, ret); 223 trace_ext4_sync_file_exit(inode, ret);
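
The reworked ext4_sync_file() asks jbd2 up front whether the transaction commit will already send a cache flush, and issues blkdev_issue_flush() itself only when it will not. Reduced to its decision, as illustrative pseudo-logic rather than the kernel functions themselves:

#include <stdio.h>

static int toy_needs_flush(int journal_has_barrier, int commit_sends_barrier)
{
	/* mirrors: JBD2_BARRIER set && !jbd2_trans_will_send_data_barrier() */
	return journal_has_barrier && !commit_sends_barrier;
}

int main(void)
{
	printf("%d\n", toy_needs_flush(1, 0));	/* 1: issue the flush ourselves */
	return 0;
}
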
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 goal, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
@@ -1930,7 +1930,7 @@ repeat:
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 if (PageWriteback(page)) 2799 wait_on_page_writeback(page);
2800 wait_on_page_writeback(page);
2801
2802 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2803 2801
2804 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
3513 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3514 3512
3515 if (end > isize) 3513 if (end > isize)
3516 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3517 } 3515 }
3518 } 3516 }
3519 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
3916int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
3917 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3918{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block, it will be shortened to the end of the block
3933 * that corresponds to 'from'
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
3919 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3920 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3921 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
3922 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3923 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3924 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
3931 return -EINVAL; 3950 return -EINVAL;
3932 3951
3933 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3934 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
3935 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3936 3963
3937 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
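
The clamp added for ext4_block_zero_page_range() keeps the zeroed span inside the single block that contains 'from'. The same arithmetic as a stand-alone sketch (names invented):

#include <stdio.h>

static unsigned long long toy_clamp_len(unsigned long long from,
					unsigned long long length,
					unsigned blocksize)
{
	unsigned long long offset = from & (blocksize - 1);
	unsigned long long max = blocksize - offset;

	return length > max ? max : length;
}

int main(void)
{
	/* zeroing 9000 bytes at offset 100 in 4 KiB blocks is
	 * shortened to the end of block 0: 3996 bytes */
	printf("%llu\n", toy_clamp_len(100, 9000, 4096));
	return 0;
}
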
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4380 4407
4381int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4382{ 4409{
4383 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384 return 0;
4385 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4386 return 1; 4411 return 1;
4387 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4392} 4417}
4393 4418
4394/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @file: The file to punch the hole in
4424 * @offset: The offset where the hole will begin
4425 * @length: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437		/* TODO: Add support for non-extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
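
From user space this path is reached through fallocate(2). A hedged usage example; the file name is arbitrary, and FALLOC_FL_KEEP_SIZE must accompany FALLOC_FL_PUNCH_HOLE under the VFS checks of this era:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* punch a 1 MiB hole at offset 4 MiB; KEEP_SIZE leaves
	 * i_size untouched, as required with PUNCH_HOLE */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
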
4444/*
4395 * ext4_truncate() 4445 * ext4_truncate()
4396 * 4446 *
4397 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4617 /* 4667 /*
4618 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4619 */ 4669 */
4620 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4621 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4622 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4623 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5311 5361
5312 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5313 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5314 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5315 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5316 handle_t *handle; 5365 handle_t *handle;
5317 5366
5318 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5346 goto err_out; 5395 goto err_out;
5347 } 5396 }
5348 } 5397 }
5349 /* ext4_truncate will clear the flag */
5350 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5351 ext4_truncate(inode);
5352 } 5398 }
5353 5399
5354 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5355 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5356 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5357 5407
5358 if (!rc) { 5408 if (!rc) {
5359 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5811 goto out_unlock; 5861 goto out_unlock;
5812 } 5862 }
5813 ret = 0; 5863 ret = 0;
5814 if (PageMappedToDisk(page)) 5864
5815 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
5816 5871
5817 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
5818 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
5819 else 5874 else
5820 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
5821 5876
5822 lock_page(page);
5823 /* 5877 /*
5824 * return if we have all the buffers mapped. This avoid 5878 * return if we have all the buffers mapped. This avoid
5825 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5829 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
5830 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5831 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
5832 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
5833 goto out_unlock; 5887 return VM_FAULT_LOCKED;
5834 } 5888 }
5835 } 5889 }
5836 unlock_page(page); 5890 unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5850 if (ret < 0) 5904 if (ret < 0)
5851 goto out_unlock; 5905 goto out_unlock;
5852 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
5853out_unlock: 5917out_unlock:
5854 if (ret) 5918 if (ret)
5855 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
790 791
791 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
792 793
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
819 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
820 break; 821 break;
821 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
822 err = -EIO; 835 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 837 if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
871 } 884 }
872 885
873 /* wait for I/O completion */ 886 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
875 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
876 890
877 err = -EIO; 891 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
879 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
880 goto out; 894 goto out;
881 895
882 err = 0; 896 err = 0;
883 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
887 int group; 899 int group;
888 struct ext4_group_info *grinfo;
889 900
890 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
891 if (group >= ngroups) 902 if (group >= ngroups)
892 break; 903 break;
893 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
894 /* 909 /*
895 * data carry information regarding this 910 * data carry information regarding this
896 * particular group in the format specified 911 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
919 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
920 */ 935 */
921 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
922 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
924 incore = NULL; 941 incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
948 965
949out: 966out:
950 if (bh) { 967 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
952 brelse(bh[i]); 969 brelse(bh[i]);
953 if (bh != &bhs) 970 if (bh != &bhs)
954 kfree(bh); 971 kfree(bh);
@@ -957,22 +974,21 @@ out:
957} 974}
958 975
959/* 976/*
960 * lock the group_info alloc_sem of all the groups 977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
961 * belonging to the same buddy cache page. This 978 * on the same buddy page doesn't happen while holding the buddy page lock.
962 * make sure other parallel operation on the buddy 979 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
963 * cache doesn't happen whild holding the buddy cache 980 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
964 * lock
965 */ 981 */
966static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
967 ext4_group_t group) 983 ext4_group_t group, struct ext4_buddy *e4b)
968{ 984{
969 int i; 985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
970 int block, pnum; 986 int block, pnum, poff;
971 int blocks_per_page; 987 int blocks_per_page;
972 int groups_per_page; 988 struct page *page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 989
974 ext4_group_t first_group; 990 e4b->bd_buddy_page = NULL;
975 struct ext4_group_info *grp; 991 e4b->bd_bitmap_page = NULL;
976 992
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 994 /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
982 */ 998 */
983 block = group * 2; 999 block = group * 2;
984 pnum = block / blocks_per_page; 1000 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 1001 poff = block % blocks_per_page;
986 1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
987 groups_per_page = blocks_per_page >> 1; 1003 if (!page)
988 if (groups_per_page == 0) 1004 return -EIO;
989 groups_per_page = 1; 1005 BUG_ON(page->mapping != inode->i_mapping);
990 /* read all groups the page covers into the cache */ 1006 e4b->bd_bitmap_page = page;
991 for (i = 0; i < groups_per_page; i++) { 1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
992 1008
993 if ((first_group + i) >= ngroups) 1009 if (blocks_per_page >= 2) {
994 break; 1010 /* buddy and bitmap are on the same page */
995 grp = ext4_get_group_info(sb, first_group + i); 1011 return 0;
996 /* take all groups write allocation
997 * semaphore. This make sure there is
998 * no block allocation going on in any
999 * of that groups
1000 */
1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1012 }
1003 return i; 1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1004} 1023}
1005 1024
1006static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1007 ext4_group_t group, int locked_group)
1008{ 1026{
1009 int i; 1027 if (e4b->bd_bitmap_page) {
1010 int block, pnum; 1028 unlock_page(e4b->bd_bitmap_page);
1011 int blocks_per_page; 1029 page_cache_release(e4b->bd_bitmap_page);
1012 ext4_group_t first_group; 1030 }
1013 struct ext4_group_info *grp; 1031 if (e4b->bd_buddy_page) {
1014 1032 unlock_page(e4b->bd_buddy_page);
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1033 page_cache_release(e4b->bd_buddy_page);
1016 /*
1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks.
1020 */
1021 block = group * 2;
1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) {
1026
1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* take all groups write allocation
1029 * semaphore. This make sure there is
1030 * no block allocation going on in any
1031 * of that groups
1032 */
1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035
1036} 1035}
1037 1036
1038/* 1037/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
1044int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045{ 1044{
1046 1045
1047 int ret = 0;
1048 void *bitmap;
1049 int blocks_per_page;
1050 int block, pnum, poff;
1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
1054 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
1055 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
1056 1050
1057 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
1060 /* 1053 /*
1061 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
1062 * page which map to the group from which we are already 1055 * page which map to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
1065 * would have taken the alloc_sem lock. 1058 * would have pinned buddy page to page cache.
1066 */ 1059 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1062 /*
1070 * somebody initialized the group 1063 * somebody initialized the group
1071 * return without doing anything 1064 * return without doing anything
1072 */ 1065 */
1073 ret = 0;
1074 goto err; 1066 goto err;
1075 } 1067 }
1076 /* 1068
1077 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
1078 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
1079 * So for each group we need two blocks. 1071 if (ret)
1080 */ 1072 goto err;
1081 block = group * 2; 1073 if (!PageUptodate(page)) {
1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) {
1089 unlock_page(page);
1090 goto err;
1091 }
1092 unlock_page(page);
1093 }
1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1074 ret = -EIO;
1096 goto err; 1075 goto err;
1097 } 1076 }
1098 mark_page_accessed(page); 1077 mark_page_accessed(page);
1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1078
1102 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1103 block++;
1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) {
1108 /* 1080 /*
1109 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1082 * the same page we don't need to force
1111 * init the buddy 1083 * init the buddy
1112 */ 1084 */
1113 unlock_page(page); 1085 ret = 0;
1114 } else if (page) { 1086 goto err;
1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) {
1118 unlock_page(page);
1119 goto err;
1120 }
1121 unlock_page(page);
1122 } 1087 }
1123 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1124 ret = -EIO; 1094 ret = -EIO;
1125 goto err; 1095 goto err;
1126 } 1096 }
1127 mark_page_accessed(page); 1097 mark_page_accessed(page);
1128err: 1098err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1130 if (bitmap_page)
1131 page_cache_release(bitmap_page);
1132 if (page)
1133 page_cache_release(page);
1134 return ret; 1100 return ret;
1135} 1101}
1136 1102
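ext4_mb_get_buddy_page_lock() and the slimmed-down ext4_mb_init_group() both lean on one layout rule: the buddy cache inode stores two consecutive blocks per group, the block bitmap and then the buddy, so locating either is pure division. A stand-alone sketch of that mapping (block and page sizes here are illustrative, not from the patch):

/* Sketch: buddy cache layout arithmetic. Each group owns two
 * consecutive blocks: its bitmap and its buddy. */
#include <stdio.h>

struct loc { unsigned long pnum; int poff; };

static struct loc buddy_cache_loc(unsigned long group, int which,
				  int blocks_per_page)
{
	unsigned long block = group * 2 + which; /* 0 = bitmap, 1 = buddy */
	struct loc l = { block / blocks_per_page,
			 (int)(block % blocks_per_page) };
	return l;
}

int main(void)
{
	int blocks_per_page = 4;  /* e.g. 4 KiB page, 1 KiB blocks */
	struct loc bmp = buddy_cache_loc(5, 0, blocks_per_page);
	struct loc bud = buddy_cache_loc(5, 1, blocks_per_page);

	printf("group 5: bitmap page %lu off %d, buddy page %lu off %d\n",
	       bmp.pnum, bmp.poff, bud.pnum, bud.poff);
	return 0;
}

Only when blocks_per_page is 1, i.e. the block size equals the page size, do the two blocks land on different pages, which is the one case where ext4_mb_get_buddy_page_lock() has to find and lock a second page.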
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1164 e4b->bd_group = group; 1130 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem;
1168
1169 /* Take the read lock on the group alloc
1170 * sem. This would make sure a parallel
1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked
1173 * till we are done with allocation
1174 */
1175repeat_load_buddy:
1176 down_read(e4b->alloc_semp);
1177 1133
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache
1183 */
1184 up_read(e4b->alloc_semp);
1185 /* 1135 /*
1186 * we need full data about the group 1136 * we need full data about the group
1187 * to make a good selection 1137 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
1189 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1140 if (ret)
1191 return ret; 1141 return ret;
1192 goto repeat_load_buddy;
1193 } 1142 }
1194 1143
1195 /* 1144 /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
1273 return 0; 1222 return 0;
1274 1223
1275err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1276 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1277 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1278 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1279 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1280 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1281 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1282
1283 /* Done with the buddy cache */
1284 up_read(e4b->alloc_semp);
1285 return ret; 1233 return ret;
1286} 1234}
1287 1235
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1292 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1293 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1294 /* Done with the buddy cache */
1295 if (e4b->alloc_semp)
1296 up_read(e4b->alloc_semp);
1297} 1242}
1298 1243
1299 1244
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1606 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1607 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1608 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1609 /* on allocation we use ac to track the held semaphore */
1610 ac->alloc_semp = e4b->alloc_semp;
1611 e4b->alloc_semp = NULL;
1612 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1613 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1614 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2659 struct super_block *sb = journal->j_private; 2601 struct super_block *sb = journal->j_private;
2660 struct ext4_buddy e4b; 2602 struct ext4_buddy e4b;
2661 struct ext4_group_info *db; 2603 struct ext4_group_info *db;
2662 int err, ret, count = 0, count2 = 0; 2604 int err, count = 0, count2 = 0;
2663 struct ext4_free_data *entry; 2605 struct ext4_free_data *entry;
2664 struct list_head *l, *ltmp; 2606 struct list_head *l, *ltmp;
2665 2607
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2669 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2611 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2670 entry->count, entry->group, entry); 2612 entry->count, entry->group, entry);
2671 2613
2672 if (test_opt(sb, DISCARD)) { 2614 if (test_opt(sb, DISCARD))
2673 ret = ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2674 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2675 if (unlikely(ret == -EOPNOTSUPP)) {
2676 ext4_warning(sb, "discard not supported, "
2677 "disabling");
2678 clear_opt(sb, DISCARD);
2679 }
2680 }
2681 2617
2682 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2683 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4226 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4227 } 4163 }
4228 } 4164 }
4229 if (ac->alloc_semp)
4230 up_read(ac->alloc_semp);
4231 if (pa) { 4165 if (pa) {
4232 /* 4166 /*
4233 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4234 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4235 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4236 * doesn't grow big. We need to release 4170 * doesn't grow big.
4237 * alloc_semp before calling ext4_mb_add_n_trim()
4238 */ 4171 */
4239 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4240 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4303 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4304 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4305 */ 4238 */
4306 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4307 /* let others to free the space */ 4242 /* let others to free the space */
4308 yield(); 4243 yield();
4309 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 return 0; 4248 return 0;
4314 } 4249 }
4315 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4316 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4317 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4318 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4319 } 4260 }
4320 inquota = ar->len; 4261 inquota = ar->len;
4321 if (ar->len == 0) { 4262 if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
4704} 4645}
4705 4646
4706/** 4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
4651 * @block: start physical block to add to the block group
4652 * @count: number of blocks to add
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
4731 * need to update group_info->bb_free and bitmap
4732 * with the group lock held. generate_buddy looks at
4733 * them with the group lock held
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
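The checks at the top of ext4_add_groupblocks() enforce two invariants: the range must not cross a group boundary, and it must not touch the group's own metadata (bitmaps and inode table). A compact user-space sketch of the overlap test, with in_range() spelled out the way the kernel macro behaves (the sample geometry is invented):

/* Sketch: range sanity checks as in ext4_add_groupblocks().
 * in_range(b, first, len) means: first <= b < first + len. */
#include <stdio.h>

#define in_range(b, first, len) \
	((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
	unsigned long long block = 1000, count = 16;
	unsigned long long block_bitmap = 1008;  /* made-up layout */
	unsigned long long inode_bitmap = 2000;
	unsigned long long inode_table = 2001, itb_per_group = 512;

	if (in_range(block_bitmap, block, count) ||
	    in_range(inode_bitmap, block, count) ||
	    in_range(block, inode_table, itb_per_group) ||
	    in_range(block + count - 1, inode_table, itb_per_group))
		printf("range %llu+%llu overlaps a system zone\n",
		       block, count);
	else
		printf("range %llu+%llu is safe to free\n", block, count);
	return 0;
}

Here the block bitmap at 1008 sits inside the 16-block range starting at 1000, so the function would refuse it with "Adding blocks in system zones".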
4767
4768/**
4707 * ext4_trim_extent -- function to TRIM one single free extent in the group 4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4708 * @sb: super block for the file system 4770 * @sb: super block for the file system
4709 * @start: starting block of the free extent in the alloc. group 4771 * @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
4715 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4777 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4716 * be called with under the group lock. 4778 * be called with under the group lock.
4717 */ 4779 */
4718static int ext4_trim_extent(struct super_block *sb, int start, int count, 4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4719 ext4_group_t group, struct ext4_buddy *e4b) 4781 ext4_group_t group, struct ext4_buddy *e4b)
4720{ 4782{
4721 struct ext4_free_extent ex; 4783 struct ext4_free_extent ex;
4722 int ret = 0;
4723 4784
4724 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4725 4786
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 */ 4794 */
4734 mb_mark_used(e4b, &ex); 4795 mb_mark_used(e4b, &ex);
4735 ext4_unlock_group(sb, group); 4796 ext4_unlock_group(sb, group);
4736 4797 ext4_issue_discard(sb, group, start, count);
4737 ret = ext4_issue_discard(sb, group, start, count);
4738
4739 ext4_lock_group(sb, group); 4798 ext4_lock_group(sb, group);
4740 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4741 return ret;
4742} 4800}
4743 4801
4744/** 4802/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4760 * the group buddy bitmap. This is done until whole group is scanned. 4818 * the group buddy bitmap. This is done until whole group is scanned.
4761 */ 4819 */
4762static ext4_grpblk_t 4820static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4765{ 4824{
4766 void *bitmap; 4825 void *bitmap;
4767 ext4_grpblk_t next, count = 0; 4826 ext4_grpblk_t next, count = 0;
4768 ext4_group_t group; 4827 struct ext4_buddy e4b;
4769 int ret = 0; 4828 int ret;
4770 4829
4771 BUG_ON(e4b == NULL); 4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4772 4837
4773 bitmap = e4b->bd_bitmap;
4774 group = e4b->bd_group;
4775 start = (e4b->bd_info->bb_first_free > start) ?
4776 e4b->bd_info->bb_first_free : start;
4777 ext4_lock_group(sb, group); 4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4778 4841
4779 while (start < max) { 4842 while (start < max) {
4780 start = mb_find_next_zero_bit(bitmap, max, start); 4843 start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4783 next = mb_find_next_bit(bitmap, max, start); 4846 next = mb_find_next_bit(bitmap, max, start);
4784 4847
4785 if ((next - start) >= minblocks) { 4848 if ((next - start) >= minblocks) {
4786 ret = ext4_trim_extent(sb, start, 4849 ext4_trim_extent(sb, start,
4787 next - start, group, e4b); 4850 next - start, group, &e4b);
4788 if (ret < 0)
4789 break;
4790 count += next - start; 4851 count += next - start;
4791 } 4852 }
4792 start = next + 1; 4853 start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4802 ext4_lock_group(sb, group); 4863 ext4_lock_group(sb, group);
4803 } 4864 }
4804 4865
4805 if ((e4b->bd_info->bb_free - count) < minblocks) 4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4806 break; 4867 break;
4807 } 4868 }
4808 ext4_unlock_group(sb, group); 4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4809 4871
4810 ext4_debug("trimmed %d blocks in the group %d\n", 4872 ext4_debug("trimmed %d blocks in the group %d\n",
4811 count, group); 4873 count, group);
4812 4874
4813 if (ret < 0)
4814 count = ret;
4815
4816 return count; 4875 return count;
4817} 4876}
4818 4877
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4830 */ 4889 */
4831int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4832{ 4891{
4833 struct ext4_buddy e4b; 4892 struct ext4_group_info *grp;
4834 ext4_group_t first_group, last_group; 4893 ext4_group_t first_group, last_group;
4835 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4836 ext4_grpblk_t cnt = 0, first_block, last_block; 4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4837 uint64_t start, len, minlen, trimmed; 4896 uint64_t start, len, minlen, trimmed = 0;
4838 ext4_fsblk_t first_data_blk = 4897 ext4_fsblk_t first_data_blk =
4839 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4840 int ret = 0; 4899 int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4842 start = range->start >> sb->s_blocksize_bits; 4901 start = range->start >> sb->s_blocksize_bits;
4843 len = range->len >> sb->s_blocksize_bits; 4902 len = range->len >> sb->s_blocksize_bits;
4844 minlen = range->minlen >> sb->s_blocksize_bits; 4903 minlen = range->minlen >> sb->s_blocksize_bits;
4845 trimmed = 0;
4846 4904
4847 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4848 return -EINVAL; 4906 return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 return -EINVAL; 4921 return -EINVAL;
4864 4922
4865 for (group = first_group; group <= last_group; group++) { 4923 for (group = first_group; group <= last_group; group++) {
4866 ret = ext4_mb_load_buddy(sb, group, &e4b); 4924 grp = ext4_get_group_info(sb, group);
4867 if (ret) { 4925 /* We only do this if the grp has never been initialized */
4868 ext4_error(sb, "Error in loading buddy " 4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4869 "information for %u", group); 4927 ret = ext4_mb_init_group(sb, group);
4870 break; 4928 if (ret)
4929 break;
4871 } 4930 }
4872 4931
4873 /* 4932 /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4880 last_block = first_block + len; 4939 last_block = first_block + len;
4881 len -= last_block - first_block; 4940 len -= last_block - first_block;
4882 4941
4883 if (e4b.bd_info->bb_free >= minlen) { 4942 if (grp->bb_free >= minlen) {
4884 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4943 cnt = ext4_trim_all_free(sb, group, first_block,
4885 last_block, minlen); 4944 last_block, minlen);
4886 if (cnt < 0) { 4945 if (cnt < 0) {
4887 ret = cnt; 4946 ret = cnt;
4888 ext4_mb_unload_buddy(&e4b);
4889 break; 4947 break;
4890 } 4948 }
4891 } 4949 }
4892 ext4_mb_unload_buddy(&e4b);
4893 trimmed += cnt; 4950 trimmed += cnt;
4894 first_block = 0; 4951 first_block = 0;
4895 } 4952 }
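ext4_trim_fs() converts the byte-based fstrim_range into block units, then walks whole groups, clamping each chunk at the group boundary before handing it to ext4_trim_all_free(). A simplified stand-alone version of that bookkeeping, with the group-boundary clamp made explicit (geometry constants are made up; the kernel derives them from the superblock):

/* Sketch: byte range from FITRIM -> per-group block ranges. */
#include <stdio.h>

#define BLKBITS           12          /* assumed 4 KiB blocks */
#define BLOCKS_PER_GROUP  32768UL

int main(void)
{
	unsigned long long start_byte = 1ULL << 30;  /* trim 1 GiB in.. */
	unsigned long long len_byte   = 1ULL << 30;  /* ..a 1 GiB span */

	unsigned long start = start_byte >> BLKBITS;
	unsigned long len   = len_byte >> BLKBITS;
	unsigned long first_group = start / BLOCKS_PER_GROUP;
	unsigned long first_block = start % BLOCKS_PER_GROUP;
	unsigned long last_group  = (start + len - 1) / BLOCKS_PER_GROUP;

	for (unsigned long g = first_group; g <= last_group; g++) {
		unsigned long last_block = first_block + len;

		if (last_block > BLOCKS_PER_GROUP)
			last_block = BLOCKS_PER_GROUP;
		len -= last_block - first_block;
		printf("group %lu: trim blocks %lu..%lu\n",
		       g, first_block, last_block - 1);
		first_block = 0;  /* later groups start at block 0 */
	}
	return 0;
}

With 4 KiB blocks and 32768 blocks per group this walks groups 8 through 15, trimming each in full, which matches the per-group loop the hunk above rewrites around ext4_mb_init_group().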
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
74 __ext4_warning(sb, function, line, msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
 128 if (retval) {
 129 if ((failed_writes++ % 60) == 0)
 130 ext4_error(sb, "Error writing to MMP block");
 131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
 259 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
 260 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * write a new random sequence number.
297 */
 298 seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq);
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
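The whole MMP scheme rests on two intervals: kmmpd refreshes mmp_seq every update_interval seconds, and a prospective mounter must watch the block for longer than check_interval before concluding the device is idle. The wait-time clamp in ext4_multi_mount_protect() is worth sanity-checking in isolation; a minimal sketch, assuming the multiplier is 2 and the minimum interval 5 s (treat both constants as illustrative):

/* Sketch: interval arithmetic used by MMP mount protection. */
#include <stdio.h>

#define MMP_CHECK_MULT          2U
#define MMP_MIN_CHECK_INTERVAL  5U

static unsigned int mount_wait_time(unsigned int check_interval)
{
	unsigned int wait = check_interval * MMP_CHECK_MULT + 1;

	/* never wait more than a minute past the check interval */
	if (wait > check_interval + 60)
		wait = check_interval + 60;
	return wait;
}

int main(void)
{
	unsigned int intervals[] = { MMP_MIN_CHECK_INTERVAL, 30, 300 };

	for (int i = 0; i < 3; i++)
		printf("check_interval %3us -> wait %3us\n",
		       intervals[i], mount_wait_time(intervals[i]));
	return 0;
}

The mounter performs this wait twice: once against the sequence it first read, and once more after writing its own random sequence, so two nodes racing to mount the same device will each observe the other's write and fail safely.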
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1413 frame->at = entries; 1413 frame->at = entries;
1414 frame->bh = bh; 1414 frame->bh = bh;
1415 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1416 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1417 dx_release (frames); 1421 if (!de) {
1418 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
 1425 * with a corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1419 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1420 1432
1421 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1422 brelse(bh); 1434 brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2240 handle_t *handle; 2252 handle_t *handle;
2241 struct inode *inode; 2253 struct inode *inode;
2242 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2243 2256
2244 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2245 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2247 2260
2248 dquot_initialize(dir); 2261 dquot_initialize(dir);
2249 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
2265 * For non-fast symlinks, we just allocate inode and put it on
2266 * orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
2250retry: 2281retry:
2251 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2252 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2253 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2254 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2255 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2256 2285
@@ -2263,21 +2292,44 @@ retry:
2263 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2264 goto out_stop; 2293 goto out_stop;
2265 2294
2266 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2267 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2268 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2269 /* 2298 /*
2270 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2271 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2272 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
 2302 * and thus we deadlock. So we have to stop the transaction now
 2303 * and restart it once the symlink contents are written.
2304 *
 2305 * To keep the fs consistent in case of a crash, we have to put the
 2306 * inode on the orphan list in the meantime.
2273 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2274 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
2317 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2275 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2276 clear_nlink(inode); 2331 clear_nlink(inode);
2277 unlock_new_inode(inode); 2332 goto err_drop_inode;
2278 ext4_mark_inode_dirty(handle, inode);
2279 iput(inode);
2280 goto out_stop;
2281 } 2333 }
2282 } else { 2334 } else {
2283 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
2293 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2294 goto retry; 2346 goto retry;
2295 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2296} 2352}
2297 2353
2298static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
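The symlink rework above sizes the transaction up front: long targets only need the inode-allocation and orphan-list credits in the first transaction, because the data block is written after the handle is stopped, while fast symlinks stored in i_data need the directory-entry credits as well. A stand-alone sketch of that decision (the *_BLOCKS values are placeholders; in the kernel they come from per-superblock macros):

/* Sketch: journal credit sizing for ext4_symlink(). */
#include <stdio.h>
#include <string.h>

#define N_BLOCKS                 15   /* EXT4_N_BLOCKS */
#define DATA_TRANS_BLOCKS        32   /* placeholder */
#define INDEX_EXTRA_TRANS_BLOCKS  8   /* placeholder */
#define MAXQUOTAS_INIT_BLOCKS     6   /* placeholder */

static int symlink_credits(size_t target_len)
{
	if (target_len + 1 > N_BLOCKS * 4)
		/* slow symlink: inode + orphan list now,
		 * contents written in a second transaction */
		return 4 + MAXQUOTAS_INIT_BLOCKS;
	/* fast symlink: directory entry + inode allocation */
	return DATA_TRANS_BLOCKS + INDEX_EXTRA_TRANS_BLOCKS + 3 +
	       MAXQUOTAS_INIT_BLOCKS;
}

int main(void)
{
	const char *short_tgt = "/etc/hostname";
	char long_tgt[128];

	memset(long_tgt, 'a', sizeof(long_tgt) - 1);
	long_tgt[sizeof(long_tgt) - 1] = '\0';
	printf("fast link -> %d credits\n",
	       symlink_credits(strlen(short_tgt)));
	printf("slow link -> %d credits\n",
	       symlink_credits(strlen(long_tgt)));
	return 0;
}

The 60-byte threshold (EXT4_N_BLOCKS * 4) is the size of i_data, which is why the same test also selects the fast-symlink storage path later in the function.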
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
203 for (i = 0; i < io_end->num_io_pages; i++) { 203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 205 struct buffer_head *bh, *head;
206 int partial_write = 0; 206 loff_t offset;
207 loff_t io_end_offset;
207 208
208 head = page_buffers(page); 209 if (error) {
209 if (error)
210 SetPageError(page); 210 SetPageError(page);
211 BUG_ON(!head); 211 set_bit(AS_EIO, &page->mapping->flags);
212 if (head->b_size != PAGE_CACHE_SIZE) { 212 head = page_buffers(page);
213 loff_t offset; 213 BUG_ON(!head);
214 loff_t io_end_offset = io_end->offset + io_end->size; 214
215 io_end_offset = io_end->offset + io_end->size;
215 216
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head; 218 bh = head;
218 do { 219 do {
219 if ((offset >= io_end->offset) && 220 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) { 221 (offset+bh->b_size <= io_end_offset))
221 if (error) 222 buffer_io_error(bh);
222 buffer_io_error(bh); 223
223
224 }
225 if (buffer_delay(bh))
226 partial_write = 1;
227 else if (!buffer_mapped(bh))
228 clear_buffer_dirty(bh);
229 else if (buffer_dirty(bh))
230 partial_write = 1;
231 offset += bh->b_size; 224 offset += bh->b_size;
232 bh = bh->b_this_page; 225 bh = bh->b_this_page;
233 } while (bh != head); 226 } while (bh != head);
234 } 227 }
235 228
236 /*
237 * If this is a partial write which happened to make
238 * all buffers uptodate then we can optimize away a
239 * bogus readpage() for the next read(). Here we
240 * 'discover' whether the page went uptodate as a
241 * result of this (potentially partial) write.
242 */
243 if (!partial_write)
244 SetPageUptodate(page);
245
246 put_io_page(io_end->pages[i]); 229 put_io_page(io_end->pages[i]);
247 } 230 }
248 io_end->num_io_pages = 0; 231 io_end->num_io_pages = 0;
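After this cleanup, ext4_end_bio() only walks a page's buffers on error, flagging just those whose byte range lies entirely inside the completed io_end. The containment test is the subtle part; a user-space sketch with invented sizes (4 KiB page, 1 KiB buffers):

/* Sketch: which buffers of a page fall inside a completed
 * io_end [offset, offset + size)? */
#include <stdio.h>

#define PAGE_SZ 4096ULL
#define BH_SZ   1024ULL

int main(void)
{
	unsigned long long page_index = 3;
	unsigned long long io_offset = page_index * PAGE_SZ + BH_SZ;
	unsigned long long io_size = 2 * BH_SZ;  /* covers bh 1 and 2 */
	unsigned long long io_end_offset = io_offset + io_size;
	unsigned long long offset = page_index * PAGE_SZ;

	for (int bh = 0; bh < (int)(PAGE_SZ / BH_SZ); bh++) {
		if (offset >= io_offset &&
		    offset + BH_SZ <= io_end_offset)
			printf("bh %d: inside io_end, mark io_error\n", bh);
		else
			printf("bh %d: outside io_end, leave alone\n", bh);
		offset += BH_SZ;
	}
	return 0;
}

Buffers 0 and 3 of the page stay untouched because the failed I/O never covered them, which is exactly why the removed partial_write bookkeeping was no longer needed.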
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..d9937df7f5cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static inline int ext2_feature_set_ok(struct super_block *sb);
79static inline int ext3_feature_set_ok(struct super_block *sb);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly); 80static int ext4_feature_set_ok(struct super_block *sb, int readonly);
79static void ext4_destroy_lazyinit_thread(void); 81static void ext4_destroy_lazyinit_thread(void);
80static void ext4_unregister_li_request(struct super_block *sb); 82static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void); 83static void ext4_clear_request_list(void);
82 84
85#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
86static struct file_system_type ext2_fs_type = {
87 .owner = THIS_MODULE,
88 .name = "ext2",
89 .mount = ext4_mount,
90 .kill_sb = kill_block_super,
91 .fs_flags = FS_REQUIRES_DEV,
92};
93#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
94#else
95#define IS_EXT2_SB(sb) (0)
96#endif
97
98
83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 99#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
84static struct file_system_type ext3_fs_type = { 100static struct file_system_type ext3_fs_type = {
85 .owner = THIS_MODULE, 101 .owner = THIS_MODULE,
@@ -806,6 +822,8 @@ static void ext4_put_super(struct super_block *sb)
806 invalidate_bdev(sbi->journal_bdev); 822 invalidate_bdev(sbi->journal_bdev);
807 ext4_blkdev_remove(sbi); 823 ext4_blkdev_remove(sbi);
808 } 824 }
825 if (sbi->s_mmp_tsk)
826 kthread_stop(sbi->s_mmp_tsk);
809 sb->s_fs_info = NULL; 827 sb->s_fs_info = NULL;
810 /* 828 /*
811 * Now that we are completely done shutting down the 829 * Now that we are completely done shutting down the
@@ -1096,7 +1114,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1096 1114
1097 if (!test_opt(sb, INIT_INODE_TABLE)) 1115 if (!test_opt(sb, INIT_INODE_TABLE))
1098 seq_puts(seq, ",noinit_inode_table"); 1116 seq_puts(seq, ",noinit_inode_table");
1099 else if (sbi->s_li_wait_mult) 1117 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1100 seq_printf(seq, ",init_inode_table=%u", 1118 seq_printf(seq, ",init_inode_table=%u",
1101 (unsigned) sbi->s_li_wait_mult); 1119 (unsigned) sbi->s_li_wait_mult);
1102 1120
@@ -1187,9 +1205,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1187 const char *data, size_t len, loff_t off); 1205 const char *data, size_t len, loff_t off);
1188 1206
1189static const struct dquot_operations ext4_quota_operations = { 1207static const struct dquot_operations ext4_quota_operations = {
1190#ifdef CONFIG_QUOTA
1191 .get_reserved_space = ext4_get_reserved_space, 1208 .get_reserved_space = ext4_get_reserved_space,
1192#endif
1193 .write_dquot = ext4_write_dquot, 1209 .write_dquot = ext4_write_dquot,
1194 .acquire_dquot = ext4_acquire_dquot, 1210 .acquire_dquot = ext4_acquire_dquot,
1195 .release_dquot = ext4_release_dquot, 1211 .release_dquot = ext4_release_dquot,
@@ -1900,7 +1916,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1900 ext4_msg(sb, KERN_WARNING, 1916 ext4_msg(sb, KERN_WARNING,
1901 "warning: mounting fs with errors, " 1917 "warning: mounting fs with errors, "
1902 "running e2fsck is recommended"); 1918 "running e2fsck is recommended");
1903 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1919 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1904 le16_to_cpu(es->s_mnt_count) >= 1920 le16_to_cpu(es->s_mnt_count) >=
1905 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1921 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1906 ext4_msg(sb, KERN_WARNING, 1922 ext4_msg(sb, KERN_WARNING,
@@ -2425,6 +2441,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2425 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2441 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2426} 2442}
2427 2443
2444static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2445 struct ext4_sb_info *sbi, char *buf)
2446{
2447 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2448}
2449
2450static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2451 struct ext4_sb_info *sbi, char *buf)
2452{
2453 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2454}
2455
2428static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2456static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2429 struct ext4_sb_info *sbi, 2457 struct ext4_sb_info *sbi,
2430 const char *buf, size_t count) 2458 const char *buf, size_t count)
@@ -2482,6 +2510,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2482EXT4_RO_ATTR(delayed_allocation_blocks); 2510EXT4_RO_ATTR(delayed_allocation_blocks);
2483EXT4_RO_ATTR(session_write_kbytes); 2511EXT4_RO_ATTR(session_write_kbytes);
2484EXT4_RO_ATTR(lifetime_write_kbytes); 2512EXT4_RO_ATTR(lifetime_write_kbytes);
2513EXT4_RO_ATTR(extent_cache_hits);
2514EXT4_RO_ATTR(extent_cache_misses);
2485EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2515EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2486 inode_readahead_blks_store, s_inode_readahead_blks); 2516 inode_readahead_blks_store, s_inode_readahead_blks);
2487EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2517EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2527,8 @@ static struct attribute *ext4_attrs[] = {
2497 ATTR_LIST(delayed_allocation_blocks), 2527 ATTR_LIST(delayed_allocation_blocks),
2498 ATTR_LIST(session_write_kbytes), 2528 ATTR_LIST(session_write_kbytes),
2499 ATTR_LIST(lifetime_write_kbytes), 2529 ATTR_LIST(lifetime_write_kbytes),
2530 ATTR_LIST(extent_cache_hits),
2531 ATTR_LIST(extent_cache_misses),
2500 ATTR_LIST(inode_readahead_blks), 2532 ATTR_LIST(inode_readahead_blks),
2501 ATTR_LIST(inode_goal), 2533 ATTR_LIST(inode_goal),
2502 ATTR_LIST(mb_stats), 2534 ATTR_LIST(mb_stats),
@@ -2659,12 +2691,6 @@ static void print_daily_error_info(unsigned long arg)
2659 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2691 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2660} 2692}
2661 2693
2662static void ext4_lazyinode_timeout(unsigned long data)
2663{
2664 struct task_struct *p = (struct task_struct *)data;
2665 wake_up_process(p);
2666}
2667
2668/* Find next suitable group and run ext4_init_inode_table */ 2694/* Find next suitable group and run ext4_init_inode_table */
2669static int ext4_run_li_request(struct ext4_li_request *elr) 2695static int ext4_run_li_request(struct ext4_li_request *elr)
2670{ 2696{
@@ -2696,11 +2722,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2696 ret = ext4_init_inode_table(sb, group, 2722 ret = ext4_init_inode_table(sb, group,
2697 elr->lr_timeout ? 0 : 1); 2723 elr->lr_timeout ? 0 : 1);
2698 if (elr->lr_timeout == 0) { 2724 if (elr->lr_timeout == 0) {
2699 timeout = jiffies - timeout; 2725 timeout = (jiffies - timeout) *
2700 if (elr->lr_sbi->s_li_wait_mult) 2726 elr->lr_sbi->s_li_wait_mult;
2701 timeout *= elr->lr_sbi->s_li_wait_mult;
2702 else
2703 timeout *= 20;
2704 elr->lr_timeout = timeout; 2727 elr->lr_timeout = timeout;
2705 } 2728 }
2706 elr->lr_next_sched = jiffies + elr->lr_timeout; 2729 elr->lr_next_sched = jiffies + elr->lr_timeout;
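[Editor's note: with the hunk above, the first untimed pass through ext4_init_inode_table() is measured and the back-off becomes simply elapsed * s_li_wait_mult; the old zero-multiplier fallback (* 20) is dropped because ext4_fill_super() now seeds s_li_wait_mult unconditionally (see the hunk around new line 3174 below). A toy computation of the same arithmetic; the jiffies values and wait_mult = 10 are assumptions for illustration.]

#include <stdio.h>

int main(void)
{
	unsigned long start = 1000, now = 1012;		/* pretend jiffies */
	unsigned long wait_mult = 10;			/* assumed default */
	unsigned long timeout = (now - start) * wait_mult;

	printf("pass took %lu jiffies, next run in %lu jiffies\n",
	       now - start, timeout);
	return 0;
}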
@@ -2712,7 +2735,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2712 2735
2713/* 2736/*
2714 * Remove lr_request from the list_request and free the 2737 * Remove lr_request from the list_request and free the
2715 * request tructure. Should be called with li_list_mtx held 2738 * request structure. Should be called with li_list_mtx held
2716 */ 2739 */
2717static void ext4_remove_li_request(struct ext4_li_request *elr) 2740static void ext4_remove_li_request(struct ext4_li_request *elr)
2718{ 2741{
@@ -2730,14 +2753,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
2730 2753
2731static void ext4_unregister_li_request(struct super_block *sb) 2754static void ext4_unregister_li_request(struct super_block *sb)
2732{ 2755{
2733 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; 2756 mutex_lock(&ext4_li_mtx);
2734 2757 if (!ext4_li_info) {
2735 if (!ext4_li_info) 2758 mutex_unlock(&ext4_li_mtx);
2736 return; 2759 return;
2760 }
2737 2761
2738 mutex_lock(&ext4_li_info->li_list_mtx); 2762 mutex_lock(&ext4_li_info->li_list_mtx);
2739 ext4_remove_li_request(elr); 2763 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2740 mutex_unlock(&ext4_li_info->li_list_mtx); 2764 mutex_unlock(&ext4_li_info->li_list_mtx);
2765 mutex_unlock(&ext4_li_mtx);
2741} 2766}
2742 2767
2743static struct task_struct *ext4_lazyinit_task; 2768static struct task_struct *ext4_lazyinit_task;
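[Editor's note: the rewrite of ext4_unregister_li_request() fixes an ordering problem: the old code read s_li_request and tested ext4_li_info without any lock, so the lazyinit thread could tear the structure down between the check and the list manipulation. Taking the global ext4_li_mtx first makes the check and the removal one atomic step. A pthread analogue of the check-then-use-under-one-lock pattern; the names are illustrative, not kernel API.]

#include <pthread.h>
#include <stdio.h>

struct li_request { struct li_request *next; };

static pthread_mutex_t li_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct li_request *li_info;	/* may be torn down by another thread */

static void unregister_request(struct li_request *req)
{
	pthread_mutex_lock(&li_mtx);
	if (!li_info) {			/* already gone: decided under the lock */
		pthread_mutex_unlock(&li_mtx);
		return;
	}
	(void)req;	/* ... unlink req here; teardown is excluded ... */
	pthread_mutex_unlock(&li_mtx);
}

int main(void)
{
	unregister_request(NULL);
	puts("no use-after-free: the NULL check happened under li_mtx");
	return 0;
}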
@@ -2756,17 +2781,10 @@ static int ext4_lazyinit_thread(void *arg)
2756 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; 2781 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2757 struct list_head *pos, *n; 2782 struct list_head *pos, *n;
2758 struct ext4_li_request *elr; 2783 struct ext4_li_request *elr;
2759 unsigned long next_wakeup; 2784 unsigned long next_wakeup, cur;
2760 DEFINE_WAIT(wait);
2761 2785
2762 BUG_ON(NULL == eli); 2786 BUG_ON(NULL == eli);
2763 2787
2764 eli->li_timer.data = (unsigned long)current;
2765 eli->li_timer.function = ext4_lazyinode_timeout;
2766
2767 eli->li_task = current;
2768 wake_up(&eli->li_wait_task);
2769
2770cont_thread: 2788cont_thread:
2771 while (true) { 2789 while (true) {
2772 next_wakeup = MAX_JIFFY_OFFSET; 2790 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2815,15 @@ cont_thread:
2797 if (freezing(current)) 2815 if (freezing(current))
2798 refrigerator(); 2816 refrigerator();
2799 2817
2800 if ((time_after_eq(jiffies, next_wakeup)) || 2818 cur = jiffies;
2819 if ((time_after_eq(cur, next_wakeup)) ||
2801 (MAX_JIFFY_OFFSET == next_wakeup)) { 2820 (MAX_JIFFY_OFFSET == next_wakeup)) {
2802 cond_resched(); 2821 cond_resched();
2803 continue; 2822 continue;
2804 } 2823 }
2805 2824
2806 eli->li_timer.expires = next_wakeup; 2825 schedule_timeout_interruptible(next_wakeup - cur);
2807 add_timer(&eli->li_timer); 2826
2808 prepare_to_wait(&eli->li_wait_daemon, &wait,
2809 TASK_INTERRUPTIBLE);
2810 if (time_before(jiffies, next_wakeup))
2811 schedule();
2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) { 2827 if (kthread_should_stop()) {
2814 ext4_clear_request_list(); 2828 ext4_clear_request_list();
2815 goto exit_thread; 2829 goto exit_thread;
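[Editor's note: the per-thread timer plus waitqueue dance is gone: sampling jiffies once into cur and calling schedule_timeout_interruptible(next_wakeup - cur) both sleeps and stays wakeable by kthread_stop(). The single sample matters, since re-reading jiffies after the time_after_eq() test could make the subtraction go negative. A userspace analogue of the same shape; time() stands in for jiffies and sleep() for schedule_timeout_interruptible().]

#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	time_t next_wakeup = time(NULL) + 2;	/* fake deadline */
	time_t cur = time(NULL);		/* one sample, like cur = jiffies */

	if (cur >= next_wakeup) {
		puts("deadline already passed, loop again");
		return 0;
	}
	sleep((unsigned int)(next_wakeup - cur));	/* sleep out the delta */
	puts("woke at the deadline");
	return 0;
}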
@@ -2833,12 +2847,7 @@ exit_thread:
2833 goto cont_thread; 2847 goto cont_thread;
2834 } 2848 }
2835 mutex_unlock(&eli->li_list_mtx); 2849 mutex_unlock(&eli->li_list_mtx);
2836 del_timer_sync(&ext4_li_info->li_timer);
2837 eli->li_task = NULL;
2838 wake_up(&eli->li_wait_task);
2839
2840 kfree(ext4_li_info); 2850 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2842 ext4_li_info = NULL; 2851 ext4_li_info = NULL;
2843 mutex_unlock(&ext4_li_mtx); 2852 mutex_unlock(&ext4_li_mtx);
2844 2853
@@ -2866,7 +2875,6 @@ static int ext4_run_lazyinit_thread(void)
2866 if (IS_ERR(ext4_lazyinit_task)) { 2875 if (IS_ERR(ext4_lazyinit_task)) {
2867 int err = PTR_ERR(ext4_lazyinit_task); 2876 int err = PTR_ERR(ext4_lazyinit_task);
2868 ext4_clear_request_list(); 2877 ext4_clear_request_list();
2869 del_timer_sync(&ext4_li_info->li_timer);
2870 kfree(ext4_li_info); 2878 kfree(ext4_li_info);
2871 ext4_li_info = NULL; 2879 ext4_li_info = NULL;
2872 printk(KERN_CRIT "EXT4: error %d creating inode table " 2880 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2883,6 @@ static int ext4_run_lazyinit_thread(void)
2875 return err; 2883 return err;
2876 } 2884 }
2877 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; 2885 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2878
2879 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2880 return 0; 2886 return 0;
2881} 2887}
2882 2888
@@ -2911,13 +2917,9 @@ static int ext4_li_info_new(void)
2911 if (!eli) 2917 if (!eli)
2912 return -ENOMEM; 2918 return -ENOMEM;
2913 2919
2914 eli->li_task = NULL;
2915 INIT_LIST_HEAD(&eli->li_request_list); 2920 INIT_LIST_HEAD(&eli->li_request_list);
2916 mutex_init(&eli->li_list_mtx); 2921 mutex_init(&eli->li_list_mtx);
2917 2922
2918 init_waitqueue_head(&eli->li_wait_daemon);
2919 init_waitqueue_head(&eli->li_wait_task);
2920 init_timer(&eli->li_timer);
2921 eli->li_state |= EXT4_LAZYINIT_QUIT; 2923 eli->li_state |= EXT4_LAZYINIT_QUIT;
2922 2924
2923 ext4_li_info = eli; 2925 ext4_li_info = eli;
@@ -2960,20 +2962,19 @@ static int ext4_register_li_request(struct super_block *sb,
2960 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2962 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2961 int ret = 0; 2963 int ret = 0;
2962 2964
2963 if (sbi->s_li_request != NULL) 2965 if (sbi->s_li_request != NULL) {
2966 /*
2967 * Reset timeout so it can be computed again, because
2968 * s_li_wait_mult might have changed.
2969 */
2970 sbi->s_li_request->lr_timeout = 0;
2964 return 0; 2971 return 0;
2972 }
2965 2973
2966 if (first_not_zeroed == ngroups || 2974 if (first_not_zeroed == ngroups ||
2967 (sb->s_flags & MS_RDONLY) || 2975 (sb->s_flags & MS_RDONLY) ||
2968 !test_opt(sb, INIT_INODE_TABLE)) { 2976 !test_opt(sb, INIT_INODE_TABLE))
2969 sbi->s_li_request = NULL;
2970 return 0; 2977 return 0;
2971 }
2972
2973 if (first_not_zeroed == ngroups) {
2974 sbi->s_li_request = NULL;
2975 return 0;
2976 }
2977 2978
2978 elr = ext4_li_request_new(sb, first_not_zeroed); 2979 elr = ext4_li_request_new(sb, first_not_zeroed);
2979 if (!elr) 2980 if (!elr)
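[Editor's note: re-registering against an existing request now just clears lr_timeout. That works because the run path above treats lr_timeout == 0 as "measure me again", so a remount that changes s_li_wait_mult is picked up on the next pass. A trivial sketch of the flag's effect; the field name is borrowed from the diff, the rest is illustrative.]

#include <stdio.h>

struct li_request { unsigned long lr_timeout; };

static void reregister(struct li_request *elr)
{
	elr->lr_timeout = 0;	/* forces the next run to re-measure */
}

int main(void)
{
	struct li_request r = { .lr_timeout = 4200 };

	reregister(&r);
	printf("lr_timeout=%lu: next ext4_run_li_request() recomputes it\n",
	       r.lr_timeout);
	return 0;
}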
@@ -3166,6 +3167,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3166 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3167 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3167 set_opt(sb, DELALLOC); 3168 set_opt(sb, DELALLOC);
3168 3169
3170 /*
3171 * set default s_li_wait_mult for lazyinit, for the case there is
3172 * no mount option specified.
3173 */
3174 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3175
3169 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3176 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3170 &journal_devnum, &journal_ioprio, NULL, 0)) { 3177 &journal_devnum, &journal_ioprio, NULL, 0)) {
3171 ext4_msg(sb, KERN_WARNING, 3178 ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3194,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3187 "feature flags set on rev 0 fs, " 3194 "feature flags set on rev 0 fs, "
3188 "running e2fsck is recommended"); 3195 "running e2fsck is recommended");
3189 3196
3197 if (IS_EXT2_SB(sb)) {
3198 if (ext2_feature_set_ok(sb))
3199 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3200 "using the ext4 subsystem");
3201 else {
3202 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3203 "to feature incompatibilities");
3204 goto failed_mount;
3205 }
3206 }
3207
3208 if (IS_EXT3_SB(sb)) {
3209 if (ext3_feature_set_ok(sb))
3210 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3211 "using the ext4 subsystem");
3212 else {
3213 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3214 "to feature incompatibilities");
3215 goto failed_mount;
3216 }
3217 }
3218
3190 /* 3219 /*
3191 * Check feature flags regardless of the revision level, since we 3220 * Check feature flags regardless of the revision level, since we
3192 * previously didn't change the revision level when setting the flags, 3221 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3488,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3459 EXT4_HAS_INCOMPAT_FEATURE(sb, 3488 EXT4_HAS_INCOMPAT_FEATURE(sb,
3460 EXT4_FEATURE_INCOMPAT_RECOVER)); 3489 EXT4_FEATURE_INCOMPAT_RECOVER));
3461 3490
3491 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3492 !(sb->s_flags & MS_RDONLY))
3493 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3494 goto failed_mount3;
3495
3462 /* 3496 /*
3463 * The first inode we look at is the journal inode. Don't try 3497 * The first inode we look at is the journal inode. Don't try
3464 * root first: it may be modified in the journal! 3498 * root first: it may be modified in the journal!
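[Editor's note: ext4_multi_mount_protect(), new in mmp.c per the diffstat, implements the multi-mount guard: a writer periodically bumps a sequence number in the block at s_mmp_block, and a prospective mounter reads that sequence twice across one update interval; if it moved, another node holds the filesystem and the mount is refused. A userspace sketch of the double-read idea only; read_mmp_seq() and the 5-second interval are hypothetical stand-ins, not the mmp.c implementation.]

#include <stdio.h>
#include <unistd.h>

/* Hypothetical helper: would read the sequence number stored in the
 * MMP block at s_mmp_block.  Stubbed here for illustration. */
static unsigned int read_mmp_seq(void)
{
	return 7;	/* a live writer on another node would bump this */
}

int main(void)
{
	unsigned int seq = read_mmp_seq();

	sleep(5);	/* wait one (assumed) update interval */
	if (read_mmp_seq() != seq) {
		fprintf(stderr, "MMP: device busy on another node, refusing mount\n");
		return 1;
	}
	puts("MMP: sequence stable, proceeding with the mount");
	return 0;
}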
@@ -3474,7 +3508,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3474 goto failed_mount_wq; 3508 goto failed_mount_wq;
3475 } else { 3509 } else {
3476 clear_opt(sb, DATA_FLAGS); 3510 clear_opt(sb, DATA_FLAGS);
3477 set_opt(sb, WRITEBACK_DATA);
3478 sbi->s_journal = NULL; 3511 sbi->s_journal = NULL;
3479 needs_recovery = 0; 3512 needs_recovery = 0;
3480 goto no_journal; 3513 goto no_journal;
@@ -3707,6 +3740,8 @@ failed_mount3:
3707 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3740 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3708 percpu_counter_destroy(&sbi->s_dirs_counter); 3741 percpu_counter_destroy(&sbi->s_dirs_counter);
3709 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3742 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3743 if (sbi->s_mmp_tsk)
3744 kthread_stop(sbi->s_mmp_tsk);
3710failed_mount2: 3745failed_mount2:
3711 for (i = 0; i < db_count; i++) 3746 for (i = 0; i < db_count; i++)
3712 brelse(sbi->s_group_desc[i]); 3747 brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4277,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4242 int enable_quota = 0; 4277 int enable_quota = 0;
4243 ext4_group_t g; 4278 ext4_group_t g;
4244 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4279 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4245 int err; 4280 int err = 0;
4246#ifdef CONFIG_QUOTA 4281#ifdef CONFIG_QUOTA
4247 int i; 4282 int i;
4248#endif 4283#endif
@@ -4368,6 +4403,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4368 goto restore_opts; 4403 goto restore_opts;
4369 if (!ext4_setup_super(sb, es, 0)) 4404 if (!ext4_setup_super(sb, es, 0))
4370 sb->s_flags &= ~MS_RDONLY; 4405 sb->s_flags &= ~MS_RDONLY;
4406 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4407 EXT4_FEATURE_INCOMPAT_MMP))
4408 if (ext4_multi_mount_protect(sb,
4409 le64_to_cpu(es->s_mmp_block))) {
4410 err = -EROFS;
4411 goto restore_opts;
4412 }
4371 enable_quota = 1; 4413 enable_quota = 1;
4372 } 4414 }
4373 } 4415 }
@@ -4432,6 +4474,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4432 struct ext4_sb_info *sbi = EXT4_SB(sb); 4474 struct ext4_sb_info *sbi = EXT4_SB(sb);
4433 struct ext4_super_block *es = sbi->s_es; 4475 struct ext4_super_block *es = sbi->s_es;
4434 u64 fsid; 4476 u64 fsid;
4477 s64 bfree;
4435 4478
4436 if (test_opt(sb, MINIX_DF)) { 4479 if (test_opt(sb, MINIX_DF)) {
4437 sbi->s_overhead_last = 0; 4480 sbi->s_overhead_last = 0;
@@ -4475,8 +4518,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4475 buf->f_type = EXT4_SUPER_MAGIC; 4518 buf->f_type = EXT4_SUPER_MAGIC;
4476 buf->f_bsize = sb->s_blocksize; 4519 buf->f_bsize = sb->s_blocksize;
4477 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4520 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4478 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4521 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4479 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4522 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 4523 /* prevent underflow when only a little free space is available */
4524 buf->f_bfree = max_t(s64, bfree, 0);
4480 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4525 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4481 if (buf->f_bfree < ext4_r_blocks_count(es)) 4526 if (buf->f_bfree < ext4_r_blocks_count(es))
4482 buf->f_bavail = 0; 4527 buf->f_bavail = 0;
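[Editor's note: the two percpu counters are summed at slightly different instants, so free minus dirty can dip below zero under load; computed directly in the unsigned f_bfree it would wrap to a huge value. Doing the subtraction in a signed 64-bit temporary and clamping with max_t() pins the result at zero. A standalone demonstration of the wrap the clamp prevents.]

#include <stdio.h>
#include <stdint.h>

#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	int64_t free_blocks = 100, dirty_blocks = 130;	/* transient skew */
	int64_t bfree = free_blocks - dirty_blocks;	/* -30 */
	uint64_t f_bfree = (uint64_t)max_t(int64_t, bfree, 0);

	/* Without the clamp: (uint64_t)-30 == 18446744073709551586. */
	printf("clamped f_bfree = %llu\n", (unsigned long long)f_bfree);
	return 0;
}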
@@ -4652,6 +4697,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
4652 if (test_opt(sb, DELALLOC)) 4697 if (test_opt(sb, DELALLOC))
4653 sync_filesystem(sb); 4698 sync_filesystem(sb);
4654 4699
4700 if (!inode)
4701 goto out;
4702
4655 /* Update modification times of quota files when userspace can 4703 /* Update modification times of quota files when userspace can
4656 * start looking at them */ 4704 * start looking at them */
4657 handle = ext4_journal_start(inode, 1); 4705 handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4820,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4772} 4820}
4773 4821
4774#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4822#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4775static struct file_system_type ext2_fs_type = {
4776 .owner = THIS_MODULE,
4777 .name = "ext2",
4778 .mount = ext4_mount,
4779 .kill_sb = kill_block_super,
4780 .fs_flags = FS_REQUIRES_DEV,
4781};
4782
4783static inline void register_as_ext2(void) 4823static inline void register_as_ext2(void)
4784{ 4824{
4785 int err = register_filesystem(&ext2_fs_type); 4825 int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4832,22 @@ static inline void unregister_as_ext2(void)
4792{ 4832{
4793 unregister_filesystem(&ext2_fs_type); 4833 unregister_filesystem(&ext2_fs_type);
4794} 4834}
4835
4836static inline int ext2_feature_set_ok(struct super_block *sb)
4837{
4838 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4839 return 0;
4840 if (sb->s_flags & MS_RDONLY)
4841 return 1;
4842 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4843 return 0;
4844 return 1;
4845}
4795MODULE_ALIAS("ext2"); 4846MODULE_ALIAS("ext2");
4796#else 4847#else
4797static inline void register_as_ext2(void) { } 4848static inline void register_as_ext2(void) { }
4798static inline void unregister_as_ext2(void) { } 4849static inline void unregister_as_ext2(void) { }
4850static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4799#endif 4851#endif
4800 4852
4801#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4853#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4863,24 @@ static inline void unregister_as_ext3(void)
4811{ 4863{
4812 unregister_filesystem(&ext3_fs_type); 4864 unregister_filesystem(&ext3_fs_type);
4813} 4865}
4866
4867static inline int ext3_feature_set_ok(struct super_block *sb)
4868{
4869 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4870 return 0;
4871 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4872 return 0;
4873 if (sb->s_flags & MS_RDONLY)
4874 return 1;
4875 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4876 return 0;
4877 return 1;
4878}
4814MODULE_ALIAS("ext3"); 4879MODULE_ALIAS("ext3");
4815#else 4880#else
4816static inline void register_as_ext3(void) { } 4881static inline void register_as_ext3(void) { }
4817static inline void unregister_as_ext3(void) { } 4882static inline void unregister_as_ext3(void) { }
4883static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4818#endif 4884#endif
4819 4885
4820static struct file_system_type ext4_fs_type = { 4886static struct file_system_type ext4_fs_type = {
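[Editor's note: both new helpers reduce to mask tests: refuse if any incompat bit outside the supported set is on, and for read-write mounts additionally refuse unknown ro-compat bits (the ext3 variant also demands the has_journal compat bit). A compilable distillation of that logic; the mask values are made up for illustration, the real EXT2_FEATURE_*_SUPP constants live in ext4.h.]

#include <stdio.h>

#define INCOMPAT_SUPP	0x0007u		/* illustrative */
#define RO_COMPAT_SUPP	0x0003u		/* illustrative */

static int feature_set_ok(unsigned int incompat, unsigned int ro_compat,
			  int rdonly)
{
	if (incompat & ~INCOMPAT_SUPP)		/* unknown incompat bit: never ok */
		return 0;
	if (rdonly)				/* read-only tolerates ro-compat bits */
		return 1;
	if (ro_compat & ~RO_COMPAT_SUPP)	/* unknown ro-compat bit: ro only */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", feature_set_ok(0x0004, 0x0008, 0));	/* 0: rw blocked */
	printf("%d\n", feature_set_ok(0x0004, 0x0008, 1));	/* 1: ro allowed */
	return 0;
}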
@@ -4898,8 +4964,8 @@ static int __init ext4_init_fs(void)
4898 err = init_inodecache(); 4964 err = init_inodecache();
4899 if (err) 4965 if (err)
4900 goto out1; 4966 goto out1;
4901 register_as_ext2();
4902 register_as_ext3(); 4967 register_as_ext3();
4968 register_as_ext2();
4903 err = register_filesystem(&ext4_fs_type); 4969 err = register_filesystem(&ext4_fs_type);
4904 if (err) 4970 if (err)
4905 goto out; 4971 goto out;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827