path: root/fs
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-01 11:04:12 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-01 11:04:12 -0400
commit	149b306089b88e186942a8d6647028ae6683aaf9 (patch)
tree	1b7436034261947bae3efad41c55a91a8ef0f68d /fs
parent	b0ca4d0123608cfec73fc689c74295da89fc934e (diff)
parent	0d606e2c9fccdd4e67febf1e2da500e1bfe9e045 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
 "Mostly performance and bug fixes, plus some cleanups.  The one new
  feature this merge window is a new ioctl EXT4_IOC_SWAP_BOOT which
  allows installation of a hidden inode designed for boot loaders."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  ext4: fix type-widening bug in inode table readahead code
  ext4: add check for inodes_count overflow in new resize ioctl
  ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG
  ext4: fix online resizing for ext3-compat file systems
  jbd2: trace when lock_buffer in do_get_write_access takes a long time
  ext4: mark metadata blocks using bh flags
  buffer: add BH_Prio and BH_Meta flags
  ext4: mark all metadata I/O with REQ_META
  ext4: fix readdir error in case inline_data+^dir_index.
  ext4: fix readdir error in the case of inline_data+dir_index
  jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset
  ext4: mext_insert_extents should update extent block checksum
  ext4: move quota initialization out of inode allocation transaction
  ext4: reserve xattr index for Rich ACL support
  jbd2: reduce journal_head size
  ext4: clear buffer_uninit flag when submitting IO
  ext4: use io_end for multiple bios
  ext4: make ext4_bio_write_page() use BH_Async_Write flags
  ext4: Use kstrtoul() instead of parse_strtoul()
  ext4: defragmentation code cleanup
  ...
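A minimal userspace sketch of how a boot-loader installer might drive the new
ioctl; the open/ioctl flow is an assumption based only on the interface shown
in the ext4.h hunk below (EXT4_IOC_SWAP_BOOT is an argument-less _IO('f', 17)),
not part of this merge itself:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	#ifndef EXT4_IOC_SWAP_BOOT
	#define EXT4_IOC_SWAP_BOOT _IO('f', 17)	/* from the ext4.h hunk below */
	#endif

	int main(int argc, char **argv)
	{
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* No argument: swaps this file's inode with the hidden
		 * boot loader inode. */
		if (ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
			perror("EXT4_IOC_SWAP_BOOT");
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}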
Diffstat (limited to 'fs')
-rw-r--r--	fs/buffer.c	5
-rw-r--r--	fs/ext4/Kconfig	3
-rw-r--r--	fs/ext4/balloc.c	53
-rw-r--r--	fs/ext4/dir.c	20
-rw-r--r--	fs/ext4/ext4.h	101
-rw-r--r--	fs/ext4/ext4_extents.h	5
-rw-r--r--	fs/ext4/ext4_jbd2.c	8
-rw-r--r--	fs/ext4/ext4_jbd2.h	12
-rw-r--r--	fs/ext4/extents.c	522
-rw-r--r--	fs/ext4/fsync.c	3
-rw-r--r--	fs/ext4/ialloc.c	88
-rw-r--r--	fs/ext4/indirect.c	473
-rw-r--r--	fs/ext4/inline.c	178
-rw-r--r--	fs/ext4/inode.c	580
-rw-r--r--	fs/ext4/ioctl.c	218
-rw-r--r--	fs/ext4/mballoc.c	253
-rw-r--r--	fs/ext4/migrate.c	62
-rw-r--r--	fs/ext4/mmp.c	6
-rw-r--r--	fs/ext4/move_extent.c	73
-rw-r--r--	fs/ext4/namei.c	48
-rw-r--r--	fs/ext4/page-io.c	280
-rw-r--r--	fs/ext4/resize.c	16
-rw-r--r--	fs/ext4/super.c	131
-rw-r--r--	fs/ext4/xattr.c	13
-rw-r--r--	fs/ext4/xattr.h	1
-rw-r--r--	fs/jbd2/commit.c	50
-rw-r--r--	fs/jbd2/journal.c	31
-rw-r--r--	fs/jbd2/transaction.c	9
28 files changed, 1845 insertions, 1397 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 10ef81e10b20..bc1fe14aaa3e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	/* Take care of bh's that straddle the end of the device */
 	guard_bh_eod(rw, bio, bh);
 
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
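The buffer.c hunk above makes _submit_bh() translate two new buffer_head flag
bits into request flags. A minimal kernel-side sketch of a filesystem opting
in, assuming the BH_Meta/BH_Prio helpers added by this series
(set_buffer_meta()/set_buffer_prio(), as used by the ext4_jbd2.c hunk further
down); fs_write_metadata_bh() is a hypothetical caller, not a function from
this merge:

	#include <linux/buffer_head.h>

	static void fs_write_metadata_bh(struct buffer_head *bh)
	{
		set_buffer_meta(bh);	/* _submit_bh() will OR in REQ_META */
		set_buffer_prio(bh);	/* _submit_bh() will OR in REQ_PRIO */
		mark_buffer_dirty(bh);
		sync_dirty_buffer(bh);	/* ends up in submit_bh(WRITE_SYNC, bh) */
	}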
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb9..efea5d5c44ce 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
 	  Enables run-time debugging support for the ext4 filesystem.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+	  with a command such as:
+		echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92e68b33fffd..d0f13eada0ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  */
 
 /*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+				   ext4_fsblk_t block)
+{
+	ext4_group_t group;
+
+	if (test_opt2(sb, STD_GROUP_SIZE))
+		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+			 block) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+	else
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+	return group;
+}
+
+/*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
  */
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-			ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+				      ext4_fsblk_t block,
+				      ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-	if (actual_group == block_group)
-		return 1;
-	return 0;
+
+	actual_group = ext4_get_group_number(sb, block);
+	return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 		trace_ext4_read_block_bitmap_load(sb, block_group);
 		bh->b_end_io = ext4_end_bitmap_read;
 		get_bh(bh);
-		submit_bh(READ, bh);
+		submit_bh(READ | REQ_META | REQ_PRIO, bh);
 		return bh;
 verify:
 	ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 				  s64 nclusters, unsigned int flags)
 {
-	s64 free_clusters, dirty_clusters, root_clusters;
+	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
 	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
 	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
 	free_clusters  = percpu_counter_read_positive(fcc);
 	dirty_clusters = percpu_counter_read_positive(dcc);
+	resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
 	/*
 	 * r_blocks_count should always be multiple of the cluster ratio so
 	 * we are safe to do a plane bit shift only.
 	 */
-	root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+	      resv_clusters;
 
-	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+	if (free_clusters - (nclusters + rsv + dirty_clusters) <
 				EXT4_FREECLUSTERS_WATERMARK) {
 		free_clusters  = percpu_counter_sum_positive(fcc);
 		dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 	/* Check whether we have space after accounting for current
 	 * dirty clusters & root reserved clusters.
 	 */
-	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+	if (free_clusters >= (rsv + nclusters + dirty_clusters))
 		return 1;
 
 	/* Hm, nope. Are (enough) root reserved clusters available? */
 	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
 	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
 	    capable(CAP_SYS_RESOURCE) ||
 	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+		if (free_clusters >= (nclusters + dirty_clusters +
+				      resv_clusters))
+			return 1;
+	}
+	/* No free blocks. Let's see if we can dip into reserved pool */
+	if (flags & EXT4_MB_USE_RESERVED) {
 		if (free_clusters >= (nclusters + dirty_clusters))
 			return 1;
 	}
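Worked example of the STD_GROUP_SIZE fast path in ext4_get_group_number()
above: for a non-bigalloc filesystem with 4KiB blocks, the shift is
EXT4_BLOCK_SIZE_BITS (12) + EXT4_CLUSTER_BITS (0) + 3 = 15, i.e. a standard
group of blocksize * 8 = 32768 blocks. A standalone userspace sketch with
illustrative constants (not read from a real superblock):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t first_data_block = 0;	/* 0 when block size > 1KiB */
		uint64_t block = 100000;	/* block number to look up */
		unsigned shift = 12 + 0 + 3;	/* block bits + cluster bits + 3 */

		printf("blocks per group: %llu\n",
		       (unsigned long long)(1ULL << shift));
		printf("block %llu lives in group %llu\n",
		       (unsigned long long)block,
		       (unsigned long long)((first_data_block + block) >> shift));
		return 0;
	}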
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d8cd1f0f4661..f8d56e4254e0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
 	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+	     ext4_has_inline_data(inode)))
 		return 1;
 
 	return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
 	int ret = 0;
 	int dir_has_error = 0;
 
-	if (ext4_has_inline_data(inode)) {
-		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
-					   &has_inline_data);
-		if (has_inline_data)
-			return ret;
-	}
-
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
 			ext4_clear_inode_flag(file_inode(filp),
 					      EXT4_INODE_INDEX);
 	}
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		ret = ext4_read_inline_dir(filp, dirent, filldir,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b83cd604796..0aabb344b02e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_STREAM_ALLOC		0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS		0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED		0x2000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -196,19 +198,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_DIRECT	0x0004
 
-struct ext4_io_page {
-	struct page	*p_page;
-	atomic_t	p_count;
-};
-
-#define MAX_IO_PAGES 128
-
 /*
  * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place. For AIO/DIO, neither field is filled in.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;   /* for writepages() */
-	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
 	int			io_op;
 	struct bio		*io_bio;
 	ext4_io_end_t		*io_end;
-	struct ext4_io_page	*io_page;
 	sector_t		io_next_block;
 };
 
@@ -403,7 +392,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +546,8 @@ enum {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the
-	   inode allocation semaphore for */
+	/* Caller is from the delayed allocation writeout path
+	 * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
 	   unitialized extents if not allocated, split the uninitialized
@@ -571,8 +559,9 @@ enum {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT	0x0020
+	/* Eventual metadata allocation (due to growing extent tree)
+	 * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL	0x0020
 	/* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE	0x0040
 	/* Request will not result in inode size update (user for fallocate) */
@@ -616,6 +605,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -949,7 +939,7 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
 
 /*
- * Mount flags
+ * Mount flags set via mount options or defaults
  */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
@@ -981,8 +971,16 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
 #define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
 						      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
+						      size of blocksize * 8
+						      blocks */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_flags;
 	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
+	atomic64_t s_resv_clusters;
 	kuid_t s_resuid;
 	kgid_t s_resgid;
 	unsigned short s_mount_state;
@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 	return ino == EXT4_ROOT_INO ||
 		ino == EXT4_USR_QUOTA_INO ||
 		ino == EXT4_GRP_QUOTA_INO ||
+		ino == EXT4_BOOT_LOADER_INO ||
 		ino == EXT4_JOURNAL_INO ||
 		ino == EXT4_RESIZE_INO ||
 		(ino >= EXT4_FIRST_INO(sb) &&
@@ -1374,6 +1374,7 @@ enum {
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
+	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)			\
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
  */
 #define ERR_BAD_DX_DIR	-75000
 
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 				  struct buffer_head *bh);
 
 /* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+					 ext4_fsblk_t blocknr,
+					 ext4_group_t *blockgrpp,
+					 ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+					  ext4_fsblk_t block);
+
 extern void ext4_validate_block_bitmap(struct super_block *sb,
 				       struct ext4_group_desc *desc,
 				       unsigned int block_group,
@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				  unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+				 ext4_lblk_t first, ext4_lblk_t stop);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
 extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 extern int ext4_read_inline_dir(struct file *filp,
 				void *dirent, filldir_t filldir,
 				int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash,
+				   int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
 					const struct qstr *d_name,
 					struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
 extern int ext4_handle_dirty_dirent_node(handle_t *handle,
 					 struct inode *inode,
 					 struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+				struct ext4_dir_entry_2 *de,
+				umode_t mode) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 					int chunk);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
-				loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+					    struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+					  struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
 extern void ext4_end_io_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 8643ff5bbeb7..51bc821ade90 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 			0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7058975e3a55..451eb4045330 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
 {
 	journal_t *journal;
 
+	might_sleep();
+
 	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
+	set_buffer_meta(bh);
+	set_buffer_prio(bh);
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
 		if (err) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4c216b1bf20c..c8c6885406db 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
  * block to complete the transaction.
  *
  * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
 	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-	 ? 27U : 8U)
+	 ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
+ * Return true if object was sucessfully removed
  */
-static inline void ext4_journal_callback_del(handle_t *handle,
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
 					     struct ext4_journal_cb_entry *jce)
 {
+	bool deleted;
 	struct ext4_sb_info *sbi =
 			EXT4_SB(handle->h_transaction->t_journal->j_private);
 
 	spin_lock(&sbi->s_md_lock);
+	deleted = !list_empty(&jce->jce_list);
 	list_del_init(&jce->jce_list);
 	spin_unlock(&sbi->s_md_lock);
+	return deleted;
 }
 
 int
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c6d06dcef8b..107936db244e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-#define ext4_ext_dirty(handle, inode, path) \
-		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
-			    handle_t *handle, struct inode *inode,
-			    struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path)
 {
 	int err;
 	if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
+	eh = path[depth].p_hdr;
 	if (unlikely(path[depth].p_hdr == NULL)) {
 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 		return -EIO;
 	}
 
 	/* try to insert block into found extent and return */
-	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
-		&& ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
-			  ext4_ext_is_uninitialized(newext),
-			  ext4_ext_get_actual_len(newext),
-			  le32_to_cpu(ex->ee_block),
-			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex),
-			  ext4_ext_pblock(ex));
-		err = ext4_ext_get_access(handle, inode, path + depth);
-		if (err)
-			return err;
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
 
 		/*
-		 * ext4_can_extents_be_merged should have checked that either
-		 * both extents are uninitialized, or both aren't. Thus we
-		 * need to check only one of them here.
+		 * Try to see whether we should rather test the extent on
+		 * right from ex, or from the left of ex. This is because
+		 * ext4_ext_find_extent() can return either extent on the
+		 * left, or on the right from the searched position. This
+		 * will make merging more effective.
 		 */
-		if (ext4_ext_is_uninitialized(ex))
-			uninitialized = 1;
-		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+		if (ex < EXT_LAST_EXTENT(eh) &&
+		    (le32_to_cpu(ex->ee_block) +
+		    ext4_ext_get_actual_len(ex) <
+		    le32_to_cpu(newext->ee_block))) {
+			ex += 1;
+			goto prepend;
+		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+			   (le32_to_cpu(newext->ee_block) +
+			   ext4_ext_get_actual_len(newext) <
+			   le32_to_cpu(ex->ee_block)))
+			ex -= 1;
+
+		/* Try to append newex to the ex */
+		if (ext4_can_extents_be_merged(inode, ex, newext)) {
+			ext_debug("append [%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 				+ ext4_ext_get_actual_len(newext));
-		if (uninitialized)
-			ext4_ext_mark_uninitialized(ex);
-		eh = path[depth].p_hdr;
-		nearex = ex;
-		goto merge;
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
+
+prepend:
+		/* Try to prepend newex to the ex */
+		if (ext4_can_extents_be_merged(inode, newext, ex)) {
+			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  le32_to_cpu(newext->ee_block),
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_block = newext->ee_block;
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+				+ ext4_ext_get_actual_len(newext));
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
 	}
 
 	depth = ext_depth(inode);
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		flags = EXT4_MB_USE_RESERVED;
 	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
@@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-				 ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+			  ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2667,12 +2726,14 @@ again:
 
 		/*
 		 * Split the extent in two so that 'end' is the last
-		 * block in the first new extent
+		 * block in the first new extent. Also we should not
+		 * fail removing space due to ENOSPC so try to use
+		 * reserved block if that happens.
 		 */
 		err = ext4_split_extent_at(handle, inode, path,
 				end + 1, split_flag,
 				EXT4_GET_BLOCKS_PRE_IO |
-				EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+				EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
 		if (err < 0)
 			goto out;
@@ -3147,35 +3208,35 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct inode *inode,
 					   struct ext4_map_blocks *map,
-					   struct ext4_ext_path *path)
+					   struct ext4_ext_path *path,
+					   int flags)
 {
 	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
-	struct ext4_extent *ex;
+	struct ext4_extent *ex, *abut_ex;
 	ext4_lblk_t ee_block, eof_block;
-	unsigned int ee_len, depth;
-	int allocated, max_zeroout = 0;
+	unsigned int ee_len, depth, map_len = map->m_len;
+	int allocated = 0, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+		(unsigned long long)map->m_lblk, map_len);
 
 	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
-	if (eof_block < map->m_lblk + map->m_len)
-		eof_block = map->m_lblk + map->m_len;
+	if (eof_block < map->m_lblk + map_len)
+		eof_block = map->m_lblk + map_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (map->m_lblk - ee_block);
 	zero_ex.ee_len = 0;
 
 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	/*
 	 * Attempt to transfer newly initialized blocks from the currently
-	 * uninitialized extent to its left neighbor. This is much cheaper
+	 * uninitialized extent to its neighbor. This is much cheaper
 	 * than an insertion followed by a merge as those involve costly
-	 * memmove() calls. This is the common case in steady state for
-	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-	 * writes.
+	 * memmove() calls. Transferring to the left is the common case in
+	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+	 * followed by append writes.
 	 *
 	 * Limitations of the current logic:
-	 * - L1: we only deal with writes at the start of the extent.
-	 *   The approach could be extended to writes at the end
-	 *   of the extent but this scenario was deemed less common.
-	 * - L2: we do not deal with writes covering the whole extent.
+	 * - L1: we do not deal with writes covering the whole extent.
 	 *   This would require removing the extent if the transfer
 	 *   is possible.
-	 * - L3: we only attempt to merge with an extent stored in the
+	 * - L2: we only attempt to merge with an extent stored in the
 	 *   same extent tree node.
 	 */
-	if ((map->m_lblk == ee_block) &&	/*L1*/
-		(map->m_len < ee_len) &&	/*L2*/
-		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
-		struct ext4_extent *prev_ex;
+	if ((map->m_lblk == ee_block) &&
+		/* See if we can merge left */
+		(map_len < ee_len) &&		/*L1*/
+		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
 		ext4_lblk_t prev_lblk;
 		ext4_fsblk_t prev_pblk, ee_pblk;
-		unsigned int prev_len, write_len;
+		unsigned int prev_len;
 
-		prev_ex = ex - 1;
-		prev_lblk = le32_to_cpu(prev_ex->ee_block);
-		prev_len = ext4_ext_get_actual_len(prev_ex);
-		prev_pblk = ext4_ext_pblock(prev_ex);
+		abut_ex = ex - 1;
+		prev_lblk = le32_to_cpu(abut_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(abut_ex);
+		prev_pblk = ext4_ext_pblock(abut_ex);
 		ee_pblk = ext4_ext_pblock(ex);
-		write_len = map->m_len;
 
 		/*
-		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
 		 * upon those conditions:
-		 * - C1: prev_ex is initialized,
-		 * - C2: prev_ex is logically abutting ex,
-		 * - C3: prev_ex is physically abutting ex,
-		 * - C4: prev_ex can receive the additional blocks without
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
 		 *   overflowing the (initialized) length limit.
 		 */
-		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
 			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
 			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
-			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
 			err = ext4_ext_get_access(handle, inode, path + depth);
 			if (err)
 				goto out;
 
 			trace_ext4_ext_convert_to_initialized_fastpath(inode,
-				map, ex, prev_ex);
+				map, ex, abut_ex);
 
-			/* Shift the start of ex by 'write_len' blocks */
-			ex->ee_block = cpu_to_le32(ee_block + write_len);
-			ext4_ext_store_pblock(ex, ee_pblk + write_len);
-			ex->ee_len = cpu_to_le16(ee_len - write_len);
+			/* Shift the start of ex by 'map_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + map_len);
+			ext4_ext_store_pblock(ex, ee_pblk + map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
 			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-			/* Extend prev_ex by 'write_len' blocks */
-			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-			/* Mark the block containing both extents as dirty */
-			ext4_ext_dirty(handle, inode, path + depth);
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = map_len;
+		}
+	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+		   (map_len < ee_len) &&	/*L1*/
+		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
+		/* See if we can merge right */
+		ext4_lblk_t next_lblk;
+		ext4_fsblk_t next_pblk, ee_pblk;
+		unsigned int next_len;
+
+		abut_ex = ex + 1;
+		next_lblk = le32_to_cpu(abut_ex->ee_block);
+		next_len = ext4_ext_get_actual_len(abut_ex);
+		next_pblk = ext4_ext_pblock(abut_ex);
+		ee_pblk = ext4_ext_pblock(ex);
 
-		/* Update path to point to the right extent */
-		path[depth].p_ext = prev_ex;
+		/*
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+		 * upon those conditions:
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
+		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
+		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, abut_ex);
+
+			/* Shift the start of abut_ex by 'map_len' blocks */
+			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
+			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
 			/* Result: number of initialized blocks past m_lblk */
-			allocated = write_len;
-			goto out;
+			allocated = map_len;
 		}
 	}
+	if (allocated) {
+		/* Mark the block containing both extents as dirty */
+		ext4_ext_dirty(handle, inode, path + depth);
+
+		/* Update path to point to the right extent */
+		path[depth].p_ext = abut_ex;
+		goto out;
+	} else
+		allocated = ee_len - (map->m_lblk - ee_block);
 
 	WARN_ON(map->m_lblk < ee_block);
 	/*
@@ -3330,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 
 	allocated = ext4_split_extent(handle, inode, path,
-				      &split_map, split_flag, 0);
+				      &split_map, split_flag, flags);
 	if (allocated < 0)
 		err = allocated;
 
@@ -3650,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
+	/*
+	 * When writing into uninitialized space, we should not fail to
+	 * allocate metadata blocks for the new extent block if needed.
+	 */
+	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
 	trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
 						    allocated, newblock);
 
@@ -3713,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -4257,48 +4368,13 @@ out3:
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t last_block;
-	handle_t *handle;
-	loff_t page_len;
 	int err = 0;
 
 	/*
-	 * finish any pending end_io work so we won't run the risk of
-	 * converting any truncated blocks to initialized later
-	 */
-	ext4_flush_unwritten_io(inode);
-
-	/*
-	 * probably first extent we're gonna free will be last in block
-	 */
-	err = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
-	if (IS_ERR(handle))
-		return;
-
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, inode->i_size, page_len, 0);
-
-		if (err)
-			goto out_stop;
-	}
-
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
-
-	down_write(&EXT4_I(inode)->i_data_sem);
-
-	ext4_discard_preallocations(inode);
-
-	/*
 	 * TODO: optimization is possible here.
 	 * Probably we need not scan at all,
 	 * because page truncation is enough.
@@ -4313,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode)
 	err = ext4_es_remove_extent(inode, last_block,
 				    EXT_MAX_BLOCKS - last_block);
 	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-
-	/* In a multi-transaction truncate, we only make the final
-	 * transaction synchronous.
-	 */
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
-	up_write(&EXT4_I(inode)->i_data_sem);
-
-out_stop:
-	/*
-	 * If this was a simple ftruncate() and the file will remain alive,
-	 * then we need to clear up the orphan record which we created above.
-	 * However, if this was a real unlink then we were called by
-	 * ext4_delete_inode(), and we allow that function to clean up the
-	 * orphan info for us.
-	 */
-	if (inode->i_nlink)
-		ext4_orphan_del(handle, inode);
-
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_stop(handle);
 }
 
 static void ext4_falloc_update_inode(struct inode *inode,
@@ -4623,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	return (error < 0 ? error : 0);
 }
 
-/*
- * ext4_ext_punch_hole
- *
- * Punches a hole of "length" bytes in a file starting
- * at byte "offset"
- *
- * @inode:  The inode of the file to punch a hole in
- * @offset: The starting byte offset of the hole
- * @length: The length of the hole
- *
- * Returns the number of blocks removed or negative on err
- */
-int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
-	struct inode *inode = file_inode(file);
-	struct super_block *sb = inode->i_sb;
-	ext4_lblk_t first_block, stop_block;
-	struct address_space *mapping = inode->i_mapping;
-	handle_t *handle;
-	loff_t first_page, last_page, page_len;
-	loff_t first_page_offset, last_page_offset;
-	int credits, err = 0;
-
-	/*
-	 * Write out all dirty pages to avoid race conditions
-	 * Then release them.
-	 */
-	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		err = filemap_write_and_wait_range(mapping,
-			offset, offset + length - 1);
-
-		if (err)
-			return err;
-	}
-
-	mutex_lock(&inode->i_mutex);
-	/* It's not possible punch hole on append only file */
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-		err = -EPERM;
-		goto out_mutex;
-	}
-	if (IS_SWAPFILE(inode)) {
-		err = -ETXTBSY;
-		goto out_mutex;
-	}
-
-	/* No need to punch hole beyond i_size */
-	if (offset >= inode->i_size)
-		goto out_mutex;
-
-	/*
-	 * If the hole extends beyond i_size, set the hole
-	 * to end after the page that contains i_size
-	 */
-	if (offset + length > inode->i_size) {
-		length = inode->i_size +
-		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
-		   offset;
-	}
-
-	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
-	first_page_offset = first_page << PAGE_CACHE_SHIFT;
-	last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
-	/* Now release the pages */
-	if (last_page_offset > first_page_offset) {
-		truncate_pagecache_range(inode, first_page_offset,
-					 last_page_offset - 1);
-	}
-
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	err = ext4_flush_unwritten_io(inode);
-	if (err)
-		goto out_dio;
-	inode_dio_wait(inode);
-
-	credits = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		goto out_dio;
-	}
-
-
-	/*
-	 * Now we need to zero out the non-page-aligned data in the
-	 * pages at the start and tail of the hole, and unmap the buffer
-	 * heads for the block aligned regions of the page that were
-	 * completely zeroed.
-	 */
-	if (first_page > last_page) {
-		/*
-		 * If the file space being truncated is contained within a page
-		 * just zero out and unmap the middle of that page
-		 */
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, offset, length, 0);
-
-		if (err)
-			goto out;
-	} else {
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the start of the hole
-		 */
-		page_len = first_page_offset - offset;
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle, mapping,
-						offset, page_len, 0);
-			if (err)
-				goto out;
-		}
-
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the end of the hole
-		 */
-		page_len = offset + length - last_page_offset;
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle, mapping,
-					last_page_offset, page_len, 0);
-			if (err)
-				goto out;
-		}
-	}
-
-	/*
-	 * If i_size is contained in the last page, we need to
-	 * unmap and zero the partial page after i_size
-	 */
-	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-	   inode->i_size % PAGE_CACHE_SIZE != 0) {
-
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle,
-					mapping, inode->i_size, page_len, 0);
-
-			if (err)
-				goto out;
-		}
-	}
-
-	first_block = (offset + sb->s_blocksize - 1) >>
-		EXT4_BLOCK_SIZE_BITS(sb);
-	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
-	/* If there are no blocks to remove, return now */
-	if (first_block >= stop_block)
-		goto out;
-
-	down_write(&EXT4_I(inode)->i_data_sem);
-	ext4_discard_preallocations(inode);
-
-	err = ext4_es_remove_extent(inode, first_block,
-				    stop_block - first_block);
-	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
-
-	ext4_discard_preallocations(inode);
-
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
-	up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_stop(handle);
-out_dio:
-	ext4_inode_resume_unlocked_dio(inode);
-out_mutex:
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3278e64e57b6..e0ba8a408def 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
166 if (journal->j_flags & JBD2_BARRIER && 166 if (journal->j_flags & JBD2_BARRIER &&
167 !jbd2_trans_will_send_data_barrier(journal, commit_tid)) 167 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
168 needs_barrier = true; 168 needs_barrier = true;
169 jbd2_log_start_commit(journal, commit_tid); 169 ret = jbd2_complete_transaction(journal, commit_tid);
170 ret = jbd2_log_wait_commit(journal, commit_tid);
171 if (needs_barrier) { 170 if (needs_barrier) {
172 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 171 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
173 if (!ret) 172 if (!ret)
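
Note on the fsync hunk above: the old start-then-wait pair always requested a commit, even when the target transaction had already committed. jbd2_complete_transaction() folds both steps together and returns immediately for an already-committed tid. A hedged sketch of the behaviour the hunk relies on (the real helper added to fs/jbd2/journal.c in this series also documents its locking rules):

    int sketch_complete_transaction(journal_t *journal, tid_t tid)
    {
            int need_to_wait = 1;

            read_lock(&journal->j_state_lock);
            if (journal->j_running_transaction &&
                journal->j_running_transaction->t_tid == tid) {
                    if (journal->j_commit_request != tid) {
                            /* transaction not requested yet, so start it */
                            read_unlock(&journal->j_state_lock);
                            jbd2_log_start_commit(journal, tid);
                            goto wait_commit;
                    }
            } else if (!(journal->j_committing_transaction &&
                         journal->j_committing_transaction->t_tid == tid))
                    need_to_wait = 0;       /* tid already committed: done */
            read_unlock(&journal->j_state_lock);
            if (!need_to_wait)
                    return 0;
    wait_commit:
            return jbd2_log_wait_commit(journal, tid);
    }
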
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6c5bb8d993fe..00a818d67b54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
166 trace_ext4_load_inode_bitmap(sb, block_group); 166 trace_ext4_load_inode_bitmap(sb, block_group);
167 bh->b_end_io = ext4_end_bitmap_read; 167 bh->b_end_io = ext4_end_bitmap_read;
168 get_bh(bh); 168 get_bh(bh);
169 submit_bh(READ, bh); 169 submit_bh(READ | REQ_META | REQ_PRIO, bh);
170 wait_on_buffer(bh); 170 wait_on_buffer(bh);
171 if (!buffer_uptodate(bh)) { 171 if (!buffer_uptodate(bh)) {
172 put_bh(bh); 172 put_bh(bh);
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
666 ei = EXT4_I(inode); 666 ei = EXT4_I(inode);
667 sbi = EXT4_SB(sb); 667 sbi = EXT4_SB(sb);
668 668
669 /*
670 * Initialize owners and quota early so that we don't have to account
671 * for the quota initialization worst case in the standard inode
672 * creating transaction
673 */
674 if (owner) {
675 inode->i_mode = mode;
676 i_uid_write(inode, owner[0]);
677 i_gid_write(inode, owner[1]);
678 } else if (test_opt(sb, GRPID)) {
679 inode->i_mode = mode;
680 inode->i_uid = current_fsuid();
681 inode->i_gid = dir->i_gid;
682 } else
683 inode_init_owner(inode, dir, mode);
684 dquot_initialize(inode);
685
669 if (!goal) 686 if (!goal)
670 goal = sbi->s_inode_goal; 687 goal = sbi->s_inode_goal;
671 688
@@ -697,7 +714,7 @@ got_group:
697 714
698 gdp = ext4_get_group_desc(sb, group, &group_desc_bh); 715 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
699 if (!gdp) 716 if (!gdp)
700 goto fail; 717 goto out;
701 718
702 /* 719 /*
703 * Check free inodes count before loading bitmap. 720 * Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@ got_group:
711 brelse(inode_bitmap_bh); 728 brelse(inode_bitmap_bh);
712 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); 729 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
713 if (!inode_bitmap_bh) 730 if (!inode_bitmap_bh)
714 goto fail; 731 goto out;
715 732
716repeat_in_this_group: 733repeat_in_this_group:
717 ino = ext4_find_next_zero_bit((unsigned long *) 734 ino = ext4_find_next_zero_bit((unsigned long *)
@@ -733,13 +750,16 @@ repeat_in_this_group:
733 handle_type, nblocks); 750 handle_type, nblocks);
734 if (IS_ERR(handle)) { 751 if (IS_ERR(handle)) {
735 err = PTR_ERR(handle); 752 err = PTR_ERR(handle);
736 goto fail; 753 ext4_std_error(sb, err);
754 goto out;
737 } 755 }
738 } 756 }
739 BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); 757 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
740 err = ext4_journal_get_write_access(handle, inode_bitmap_bh); 758 err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
741 if (err) 759 if (err) {
742 goto fail; 760 ext4_std_error(sb, err);
761 goto out;
762 }
743 ext4_lock_group(sb, group); 763 ext4_lock_group(sb, group);
744 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); 764 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
745 ext4_unlock_group(sb, group); 765 ext4_unlock_group(sb, group);
@@ -755,8 +775,10 @@ repeat_in_this_group:
755got: 775got:
756 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); 776 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
757 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); 777 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
758 if (err) 778 if (err) {
759 goto fail; 779 ext4_std_error(sb, err);
780 goto out;
781 }
760 782
761 /* We may have to initialize the block bitmap if it isn't already */ 783 /* We may have to initialize the block bitmap if it isn't already */
762 if (ext4_has_group_desc_csum(sb) && 784 if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +790,8 @@ got:
768 err = ext4_journal_get_write_access(handle, block_bitmap_bh); 790 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
769 if (err) { 791 if (err) {
770 brelse(block_bitmap_bh); 792 brelse(block_bitmap_bh);
771 goto fail; 793 ext4_std_error(sb, err);
794 goto out;
772 } 795 }
773 796
774 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); 797 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +810,18 @@ got:
787 ext4_unlock_group(sb, group); 810 ext4_unlock_group(sb, group);
788 brelse(block_bitmap_bh); 811 brelse(block_bitmap_bh);
789 812
790 if (err) 813 if (err) {
791 goto fail; 814 ext4_std_error(sb, err);
815 goto out;
816 }
792 } 817 }
793 818
794 BUFFER_TRACE(group_desc_bh, "get_write_access"); 819 BUFFER_TRACE(group_desc_bh, "get_write_access");
795 err = ext4_journal_get_write_access(handle, group_desc_bh); 820 err = ext4_journal_get_write_access(handle, group_desc_bh);
796 if (err) 821 if (err) {
797 goto fail; 822 ext4_std_error(sb, err);
823 goto out;
824 }
798 825
799 /* Update the relevant bg descriptor fields */ 826 /* Update the relevant bg descriptor fields */
800 if (ext4_has_group_desc_csum(sb)) { 827 if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +867,10 @@ got:
840 867
841 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); 868 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
842 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); 869 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
843 if (err) 870 if (err) {
844 goto fail; 871 ext4_std_error(sb, err);
872 goto out;
873 }
845 874
846 percpu_counter_dec(&sbi->s_freeinodes_counter); 875 percpu_counter_dec(&sbi->s_freeinodes_counter);
847 if (S_ISDIR(mode)) 876 if (S_ISDIR(mode))
@@ -851,16 +880,6 @@ got:
851 flex_group = ext4_flex_group(sbi, group); 880 flex_group = ext4_flex_group(sbi, group);
852 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 881 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
853 } 882 }
854 if (owner) {
855 inode->i_mode = mode;
856 i_uid_write(inode, owner[0]);
857 i_gid_write(inode, owner[1]);
858 } else if (test_opt(sb, GRPID)) {
859 inode->i_mode = mode;
860 inode->i_uid = current_fsuid();
861 inode->i_gid = dir->i_gid;
862 } else
863 inode_init_owner(inode, dir, mode);
864 883
865 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 884 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
866 /* This is the optimal IO size (for stat), not the fs block size */ 885 /* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +908,9 @@ got:
889 * twice. 908 * twice.
890 */ 909 */
891 err = -EIO; 910 err = -EIO;
892 goto fail; 911 ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
912 inode->i_ino);
913 goto out;
893 } 914 }
894 spin_lock(&sbi->s_next_gen_lock); 915 spin_lock(&sbi->s_next_gen_lock);
895 inode->i_generation = sbi->s_next_generation++; 916 inode->i_generation = sbi->s_next_generation++;
@@ -899,7 +920,6 @@ got:
899 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 920 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
900 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 921 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
901 __u32 csum; 922 __u32 csum;
902 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
903 __le32 inum = cpu_to_le32(inode->i_ino); 923 __le32 inum = cpu_to_le32(inode->i_ino);
904 __le32 gen = cpu_to_le32(inode->i_generation); 924 __le32 gen = cpu_to_le32(inode->i_generation);
905 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, 925 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -918,7 +938,6 @@ got:
918 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 938 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
919 939
920 ret = inode; 940 ret = inode;
921 dquot_initialize(inode);
922 err = dquot_alloc_inode(inode); 941 err = dquot_alloc_inode(inode);
923 if (err) 942 if (err)
924 goto fail_drop; 943 goto fail_drop;
@@ -952,24 +971,17 @@ got:
952 971
953 ext4_debug("allocating inode %lu\n", inode->i_ino); 972 ext4_debug("allocating inode %lu\n", inode->i_ino);
954 trace_ext4_allocate_inode(inode, dir, mode); 973 trace_ext4_allocate_inode(inode, dir, mode);
955 goto really_out;
956fail:
957 ext4_std_error(sb, err);
958out:
959 iput(inode);
960 ret = ERR_PTR(err);
961really_out:
962 brelse(inode_bitmap_bh); 974 brelse(inode_bitmap_bh);
963 return ret; 975 return ret;
964 976
965fail_free_drop: 977fail_free_drop:
966 dquot_free_inode(inode); 978 dquot_free_inode(inode);
967
968fail_drop: 979fail_drop:
969 dquot_drop(inode);
970 inode->i_flags |= S_NOQUOTA;
971 clear_nlink(inode); 980 clear_nlink(inode);
972 unlock_new_inode(inode); 981 unlock_new_inode(inode);
982out:
983 dquot_drop(inode);
984 inode->i_flags |= S_NOQUOTA;
973 iput(inode); 985 iput(inode);
974 brelse(inode_bitmap_bh); 986 brelse(inode_bitmap_bh);
975 return ERR_PTR(err); 987 return ERR_PTR(err);
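
The recurring theme of the ialloc.c hunks: owner and quota setup move ahead of the transaction start, and every failure now funnels through a single out: label that reports via ext4_std_error(). A minimal sketch of the new ordering, with names simplified (the real function is __ext4_new_inode()):

    static struct inode *sketch_new_inode(struct inode *dir, umode_t mode)
    {
            struct inode *inode = new_inode(dir->i_sb);

            if (!inode)
                    return ERR_PTR(-ENOMEM);
            inode_init_owner(inode, dir, mode);  /* owners first ... */
            dquot_initialize(inode);     /* ... then quota, no handle held */
            /*
             * Only now is the handle started; its credit estimate covers
             * just the bitmap and group-descriptor updates, not the quota
             * initialization worst case.
             */
            return inode;
    }
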
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a04183127ef0..98be6f697463 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -292,131 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
292} 292}
293 293
294/** 294/**
295 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
296 * @handle: handle for this transaction
297 * @inode: inode which needs allocated blocks
298 * @iblock: the logical block to start allocated at
299 * @goal: preferred physical block of allocation
300 * @indirect_blks: the number of blocks to allocate for indirect
301 * blocks
302 * @blks: number of desired blocks
303 * @new_blocks: on return it will store the new block numbers for
304 * the indirect blocks(if needed) and the first direct block,
305 * @err: on return it will store the error code
306 *
307 * This function will return the number of blocks allocated as
308 * requested by the passed-in parameters.
309 */
310static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
311 ext4_lblk_t iblock, ext4_fsblk_t goal,
312 int indirect_blks, int blks,
313 ext4_fsblk_t new_blocks[4], int *err)
314{
315 struct ext4_allocation_request ar;
316 int target, i;
317 unsigned long count = 0, blk_allocated = 0;
318 int index = 0;
319 ext4_fsblk_t current_block = 0;
320 int ret = 0;
321
322 /*
323 * Here we try to allocate the requested multiple blocks at once,
324 * on a best-effort basis.
325 * To build a branch, we should allocate blocks for
326 * the indirect blocks(if not allocated yet), and at least
327 * the first direct block of this branch. That's the
328 * minimum number of blocks we need to allocate (required)
329 */
330 /* first we try to allocate the indirect blocks */
331 target = indirect_blks;
332 while (target > 0) {
333 count = target;
334 /* allocating blocks for indirect blocks and direct blocks */
335 current_block = ext4_new_meta_blocks(handle, inode, goal,
336 0, &count, err);
337 if (*err)
338 goto failed_out;
339
340 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
341 EXT4_ERROR_INODE(inode,
342 "current_block %llu + count %lu > %d!",
343 current_block, count,
344 EXT4_MAX_BLOCK_FILE_PHYS);
345 *err = -EIO;
346 goto failed_out;
347 }
348
349 target -= count;
350 /* allocate blocks for indirect blocks */
351 while (index < indirect_blks && count) {
352 new_blocks[index++] = current_block++;
353 count--;
354 }
355 if (count > 0) {
356 /*
357 * save the new block number
358 * for the first direct block
359 */
360 new_blocks[index] = current_block;
361 WARN(1, KERN_INFO "%s returned more blocks than "
362 "requested\n", __func__);
363 break;
364 }
365 }
366
367 target = blks - count ;
368 blk_allocated = count;
369 if (!target)
370 goto allocated;
371 /* Now allocate data blocks */
372 memset(&ar, 0, sizeof(ar));
373 ar.inode = inode;
374 ar.goal = goal;
375 ar.len = target;
376 ar.logical = iblock;
377 if (S_ISREG(inode->i_mode))
378 /* enable in-core preallocation only for regular files */
379 ar.flags = EXT4_MB_HINT_DATA;
380
381 current_block = ext4_mb_new_blocks(handle, &ar, err);
382 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
383 EXT4_ERROR_INODE(inode,
384 "current_block %llu + ar.len %d > %d!",
385 current_block, ar.len,
386 EXT4_MAX_BLOCK_FILE_PHYS);
387 *err = -EIO;
388 goto failed_out;
389 }
390
391 if (*err && (target == blks)) {
392 /*
393 * if the allocation failed and we didn't allocate
394 * any blocks before
395 */
396 goto failed_out;
397 }
398 if (!*err) {
399 if (target == blks) {
400 /*
401 * save the new block number
402 * for the first direct block
403 */
404 new_blocks[index] = current_block;
405 }
406 blk_allocated += ar.len;
407 }
408allocated:
409 /* total number of blocks allocated for direct blocks */
410 ret = blk_allocated;
411 *err = 0;
412 return ret;
413failed_out:
414 for (i = 0; i < index; i++)
415 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
416 return ret;
417}
418
419/**
420 * ext4_alloc_branch - allocate and set up a chain of blocks. 295 * ext4_alloc_branch - allocate and set up a chain of blocks.
421 * @handle: handle for this transaction 296 * @handle: handle for this transaction
422 * @inode: owner 297 * @inode: owner
@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
448 int *blks, ext4_fsblk_t goal, 323 int *blks, ext4_fsblk_t goal,
449 ext4_lblk_t *offsets, Indirect *branch) 324 ext4_lblk_t *offsets, Indirect *branch)
450{ 325{
451 int blocksize = inode->i_sb->s_blocksize; 326 struct ext4_allocation_request ar;
452 int i, n = 0; 327 struct buffer_head * bh;
453 int err = 0; 328 ext4_fsblk_t b, new_blocks[4];
454 struct buffer_head *bh; 329 __le32 *p;
455 int num; 330 int i, j, err, len = 1;
456 ext4_fsblk_t new_blocks[4];
457 ext4_fsblk_t current_block;
458
459 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
460 *blks, new_blocks, &err);
461 if (err)
462 return err;
463 331
464 branch[0].key = cpu_to_le32(new_blocks[0]);
465 /* 332 /*
466 * metadata blocks and data blocks are allocated. 333 * Set up for the direct block allocation
467 */ 334 */
468 for (n = 1; n <= indirect_blks; n++) { 335 memset(&ar, 0, sizeof(ar));
469 /* 336 ar.inode = inode;
470 * Get buffer_head for parent block, zero it out 337 ar.len = *blks;
471 * and set the pointer to new one, then send 338 ar.logical = iblock;
472 * parent to disk. 339 if (S_ISREG(inode->i_mode))
473 */ 340 ar.flags = EXT4_MB_HINT_DATA;
474 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 341
342 for (i = 0; i <= indirect_blks; i++) {
343 if (i == indirect_blks) {
344 ar.goal = goal;
345 new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
346 } else
347 goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
348 goal, 0, NULL, &err);
349 if (err) {
350 i--;
351 goto failed;
352 }
353 branch[i].key = cpu_to_le32(new_blocks[i]);
354 if (i == 0)
355 continue;
356
357 bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
475 if (unlikely(!bh)) { 358 if (unlikely(!bh)) {
476 err = -ENOMEM; 359 err = -ENOMEM;
477 goto failed; 360 goto failed;
478 } 361 }
479
480 branch[n].bh = bh;
481 lock_buffer(bh); 362 lock_buffer(bh);
482 BUFFER_TRACE(bh, "call get_create_access"); 363 BUFFER_TRACE(bh, "call get_create_access");
483 err = ext4_journal_get_create_access(handle, bh); 364 err = ext4_journal_get_create_access(handle, bh);
484 if (err) { 365 if (err) {
485 /* Don't brelse(bh) here; it's done in
486 * ext4_journal_forget() below */
487 unlock_buffer(bh); 366 unlock_buffer(bh);
488 goto failed; 367 goto failed;
489 } 368 }
490 369
491 memset(bh->b_data, 0, blocksize); 370 memset(bh->b_data, 0, bh->b_size);
492 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 371 p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
493 branch[n].key = cpu_to_le32(new_blocks[n]); 372 b = new_blocks[i];
494 *branch[n].p = branch[n].key; 373
495 if (n == indirect_blks) { 374 if (i == indirect_blks)
496 current_block = new_blocks[n]; 375 len = ar.len;
497 /* 376 for (j = 0; j < len; j++)
498 * End of chain, update the last new metablock of 377 *p++ = cpu_to_le32(b++);
499 * the chain to point to the new allocated 378
500 * data blocks numbers
501 */
502 for (i = 1; i < num; i++)
503 *(branch[n].p + i) = cpu_to_le32(++current_block);
504 }
505 BUFFER_TRACE(bh, "marking uptodate"); 379 BUFFER_TRACE(bh, "marking uptodate");
506 set_buffer_uptodate(bh); 380 set_buffer_uptodate(bh);
507 unlock_buffer(bh); 381 unlock_buffer(bh);
@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
511 if (err) 385 if (err)
512 goto failed; 386 goto failed;
513 } 387 }
514 *blks = num; 388 *blks = ar.len;
515 return err; 389 return 0;
516failed: 390failed:
517 /* Allocation failed, free what we already allocated */ 391 for (; i >= 0; i--) {
518 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 392 if (i != indirect_blks && branch[i].bh)
519 for (i = 1; i <= n ; i++) { 393 ext4_forget(handle, 1, inode, branch[i].bh,
520 /* 394 branch[i].bh->b_blocknr);
521 * branch[i].bh is newly allocated, so there is no 395 ext4_free_blocks(handle, inode, NULL, new_blocks[i],
522 * need to revoke the block, which is why we don't 396 (i == indirect_blks) ? ar.len : 1, 0);
523 * need to set EXT4_FREE_BLOCKS_METADATA.
524 */
525 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
526 EXT4_FREE_BLOCKS_FORGET);
527 } 397 }
528 for (i = n+1; i < indirect_blks; i++)
529 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
530
531 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
532
533 return err; 398 return err;
534} 399}
535 400
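
The rewritten ext4_alloc_branch() above makes one ext4_new_meta_blocks() call per missing indirect block, then a single ext4_mb_new_blocks() call for the run of data blocks, wiring each parent to its child as it goes. For indirect_blks == 2 the chain it builds looks like this (illustration only, not code from the patch):

    /*
     * i_data[offsets[0]] ----------> new_blocks[0]      (indirect block)
     *   new_blocks[0][offsets[1]] -> new_blocks[1]      (double indirect)
     *     new_blocks[1][offsets[2]] -> new_blocks[2] .. new_blocks[2]+ar.len-1
     *                                                   (contiguous data run)
     * On failure the loop unwinds in reverse: ext4_forget() on each
     * journaled parent buffer, ext4_free_blocks() on every block
     * allocated so far.
     */
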
@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
941 * be able to restart the transaction at a convenient checkpoint to make 806 * be able to restart the transaction at a convenient checkpoint to make
942 * sure we don't overflow the journal. 807 * sure we don't overflow the journal.
943 * 808 *
944 * start_transaction gets us a new handle for a truncate transaction, 809 * Try to extend this transaction for the purposes of truncation. If
945 * and extend_transaction tries to extend the existing one a bit. If
946 * extend fails, we need to propagate the failure up and restart the 810 * extend fails, we need to propagate the failure up and restart the
947 * transaction in the top-level truncate loop. --sct 811 * transaction in the top-level truncate loop. --sct
948 */
949static handle_t *start_transaction(struct inode *inode)
950{
951 handle_t *result;
952
953 result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
954 ext4_blocks_for_truncate(inode));
955 if (!IS_ERR(result))
956 return result;
957
958 ext4_std_error(inode->i_sb, PTR_ERR(result));
959 return result;
960}
961
962/*
963 * Try to extend this transaction for the purposes of truncation.
964 * 812 *
965 * Returns 0 if we managed to create more room. If we can't create more 813 * Returns 0 if we managed to create more room. If we can't create more
966 * room, and the transaction must be restarted we return 1. 814 * room, and the transaction must be restarted we return 1.
@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
1353 } 1201 }
1354} 1202}
1355 1203
1356void ext4_ind_truncate(struct inode *inode) 1204void ext4_ind_truncate(handle_t *handle, struct inode *inode)
1357{ 1205{
1358 handle_t *handle;
1359 struct ext4_inode_info *ei = EXT4_I(inode); 1206 struct ext4_inode_info *ei = EXT4_I(inode);
1360 __le32 *i_data = ei->i_data; 1207 __le32 *i_data = ei->i_data;
1361 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1208 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1362 struct address_space *mapping = inode->i_mapping;
1363 ext4_lblk_t offsets[4]; 1209 ext4_lblk_t offsets[4];
1364 Indirect chain[4]; 1210 Indirect chain[4];
1365 Indirect *partial; 1211 Indirect *partial;
1366 __le32 nr = 0; 1212 __le32 nr = 0;
1367 int n = 0; 1213 int n = 0;
1368 ext4_lblk_t last_block, max_block; 1214 ext4_lblk_t last_block, max_block;
1369 loff_t page_len;
1370 unsigned blocksize = inode->i_sb->s_blocksize; 1215 unsigned blocksize = inode->i_sb->s_blocksize;
1371 int err;
1372
1373 handle = start_transaction(inode);
1374 if (IS_ERR(handle))
1375 return; /* AKPM: return what? */
1376 1216
1377 last_block = (inode->i_size + blocksize-1) 1217 last_block = (inode->i_size + blocksize-1)
1378 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1218 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1379 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1219 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1380 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1220 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1381 1221
1382 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
1383 page_len = PAGE_CACHE_SIZE -
1384 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1385
1386 err = ext4_discard_partial_page_buffers(handle,
1387 mapping, inode->i_size, page_len, 0);
1388
1389 if (err)
1390 goto out_stop;
1391 }
1392
1393 if (last_block != max_block) { 1222 if (last_block != max_block) {
1394 n = ext4_block_to_path(inode, last_block, offsets, NULL); 1223 n = ext4_block_to_path(inode, last_block, offsets, NULL);
1395 if (n == 0) 1224 if (n == 0)
1396 goto out_stop; /* error */ 1225 return;
1397 } 1226 }
1398 1227
1399 /*
1400 * OK. This truncate is going to happen. We add the inode to the
1401 * orphan list, so that if this truncate spans multiple transactions,
1402 * and we crash, we will resume the truncate when the filesystem
1403 * recovers. It also marks the inode dirty, to catch the new size.
1404 *
1405 * Implication: the file must always be in a sane, consistent
1406 * truncatable state while each transaction commits.
1407 */
1408 if (ext4_orphan_add(handle, inode))
1409 goto out_stop;
1410
1411 /*
1412 * From here we block out all ext4_get_block() callers who want to
1413 * modify the block allocation tree.
1414 */
1415 down_write(&ei->i_data_sem);
1416
1417 ext4_discard_preallocations(inode);
1418 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); 1228 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
1419 1229
1420 /* 1230 /*
@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode)
1431 * It is unnecessary to free any data blocks if last_block is 1241 * It is unnecessary to free any data blocks if last_block is
1432 * equal to the indirect block limit. 1242 * equal to the indirect block limit.
1433 */ 1243 */
1434 goto out_unlock; 1244 return;
1435 } else if (n == 1) { /* direct blocks */ 1245 } else if (n == 1) { /* direct blocks */
1436 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 1246 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
1437 i_data + EXT4_NDIR_BLOCKS); 1247 i_data + EXT4_NDIR_BLOCKS);
@@ -1491,31 +1301,6 @@ do_indirects:
1491 case EXT4_TIND_BLOCK: 1301 case EXT4_TIND_BLOCK:
1492 ; 1302 ;
1493 } 1303 }
1494
1495out_unlock:
1496 up_write(&ei->i_data_sem);
1497 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1498 ext4_mark_inode_dirty(handle, inode);
1499
1500 /*
1501 * In a multi-transaction truncate, we only make the final transaction
1502 * synchronous
1503 */
1504 if (IS_SYNC(inode))
1505 ext4_handle_sync(handle);
1506out_stop:
1507 /*
1508 * If this was a simple ftruncate(), and the file will remain alive
1509 * then we need to clear up the orphan record which we created above.
1510 * However, if this was a real unlink then we were called by
1511 * ext4_delete_inode(), and we allow that function to clean up the
1512 * orphan info for us.
1513 */
1514 if (inode->i_nlink)
1515 ext4_orphan_del(handle, inode);
1516
1517 ext4_journal_stop(handle);
1518 trace_ext4_truncate_exit(inode);
1519} 1304}
1520 1305
1521static int free_hole_blocks(handle_t *handle, struct inode *inode, 1306static int free_hole_blocks(handle_t *handle, struct inode *inode,
@@ -1569,8 +1354,8 @@ err:
1569 return ret; 1354 return ret;
1570} 1355}
1571 1356
1572static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 1357int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
1573 ext4_lblk_t first, ext4_lblk_t stop) 1358 ext4_lblk_t first, ext4_lblk_t stop)
1574{ 1359{
1575 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1360 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1576 int level, ret = 0; 1361 int level, ret = 0;
@@ -1604,157 +1389,3 @@ err:
1604 return ret; 1389 return ret;
1605} 1390}
1606 1391
1607int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
1608{
1609 struct inode *inode = file_inode(file);
1610 struct super_block *sb = inode->i_sb;
1611 ext4_lblk_t first_block, stop_block;
1612 struct address_space *mapping = inode->i_mapping;
1613 handle_t *handle = NULL;
1614 loff_t first_page, last_page, page_len;
1615 loff_t first_page_offset, last_page_offset;
1616 int err = 0;
1617
1618 /*
1619 * Write out all dirty pages to avoid race conditions,
1620 * then release them.
1621 */
1622 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
1623 err = filemap_write_and_wait_range(mapping,
1624 offset, offset + length - 1);
1625 if (err)
1626 return err;
1627 }
1628
1629 mutex_lock(&inode->i_mutex);
1630 /* It's not possible to punch a hole in an append-only file */
1631 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
1632 err = -EPERM;
1633 goto out_mutex;
1634 }
1635 if (IS_SWAPFILE(inode)) {
1636 err = -ETXTBSY;
1637 goto out_mutex;
1638 }
1639
1640 /* No need to punch hole beyond i_size */
1641 if (offset >= inode->i_size)
1642 goto out_mutex;
1643
1644 /*
1645 * If the hole extends beyond i_size, set the hole
1646 * to end after the page that contains i_size
1647 */
1648 if (offset + length > inode->i_size) {
1649 length = inode->i_size +
1650 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
1651 offset;
1652 }
1653
1654 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1655 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
1656
1657 first_page_offset = first_page << PAGE_CACHE_SHIFT;
1658 last_page_offset = last_page << PAGE_CACHE_SHIFT;
1659
1660 /* Now release the pages */
1661 if (last_page_offset > first_page_offset) {
1662 truncate_pagecache_range(inode, first_page_offset,
1663 last_page_offset - 1);
1664 }
1665
1666 /* Wait for all existing dio workers; newcomers will block on i_mutex */
1667 inode_dio_wait(inode);
1668
1669 handle = start_transaction(inode);
1670 if (IS_ERR(handle))
1671 goto out_mutex;
1672
1673 /*
1674 * Now we need to zero out the non-page-aligned data in the
1675 * pages at the start and tail of the hole, and unmap the buffer
1676 * heads for the block aligned regions of the page that were
1677 * completely zeroed.
1678 */
1679 if (first_page > last_page) {
1680 /*
1681 * If the file space being truncated is contained within a page
1682 * just zero out and unmap the middle of that page
1683 */
1684 err = ext4_discard_partial_page_buffers(handle,
1685 mapping, offset, length, 0);
1686 if (err)
1687 goto out;
1688 } else {
1689 /*
1690 * Zero out and unmap the partial page that contains
1691 * the start of the hole
1692 */
1693 page_len = first_page_offset - offset;
1694 if (page_len > 0) {
1695 err = ext4_discard_partial_page_buffers(handle, mapping,
1696 offset, page_len, 0);
1697 if (err)
1698 goto out;
1699 }
1700
1701 /*
1702 * Zero out and unmap the partial page that contains
1703 * the end of the hole
1704 */
1705 page_len = offset + length - last_page_offset;
1706 if (page_len > 0) {
1707 err = ext4_discard_partial_page_buffers(handle, mapping,
1708 last_page_offset, page_len, 0);
1709 if (err)
1710 goto out;
1711 }
1712 }
1713
1714 /*
1715 * If i_size is contained in the last page, we need to
1716 * unmap and zero the partial page after i_size
1717 */
1718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
1719 inode->i_size % PAGE_CACHE_SIZE != 0) {
1720 page_len = PAGE_CACHE_SIZE -
1721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1722 if (page_len > 0) {
1723 err = ext4_discard_partial_page_buffers(handle,
1724 mapping, inode->i_size, page_len, 0);
1725 if (err)
1726 goto out;
1727 }
1728 }
1729
1730 first_block = (offset + sb->s_blocksize - 1) >>
1731 EXT4_BLOCK_SIZE_BITS(sb);
1732 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
1733
1734 if (first_block >= stop_block)
1735 goto out;
1736
1737 down_write(&EXT4_I(inode)->i_data_sem);
1738 ext4_discard_preallocations(inode);
1739
1740 err = ext4_es_remove_extent(inode, first_block,
1741 stop_block - first_block);
1742 err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
1743
1744 ext4_discard_preallocations(inode);
1745
1746 if (IS_SYNC(inode))
1747 ext4_handle_sync(handle);
1748
1749 up_write(&EXT4_I(inode)->i_data_sem);
1750
1751out:
1752 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1753 ext4_mark_inode_dirty(handle, inode);
1754 ext4_journal_stop(handle);
1755
1756out_mutex:
1757 mutex_unlock(&inode->i_mutex);
1758
1759 return err;
1760}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c0fd1a123f7d..3e2bf873e8a8 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -19,7 +19,8 @@
19 19
20#define EXT4_XATTR_SYSTEM_DATA "data" 20#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) 21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
22#define EXT4_INLINE_DOTDOT_SIZE 4 22#define EXT4_INLINE_DOTDOT_OFFSET 2
23#define EXT4_INLINE_DOTDOT_SIZE 4
23 24
24int ext4_get_inline_size(struct inode *inode) 25int ext4_get_inline_size(struct inode *inode)
25{ 26{
@@ -1289,6 +1290,120 @@ out:
1289 return ret; 1290 return ret;
1290} 1291}
1291 1292
1293/*
1294 * This function fills a red-black tree with information from an
1295 * inlined dir. It returns the number of directory entries loaded
1296 * into the tree. If there is an error it is returned in err.
1297 */
1298int htree_inlinedir_to_tree(struct file *dir_file,
1299 struct inode *dir, ext4_lblk_t block,
1300 struct dx_hash_info *hinfo,
1301 __u32 start_hash, __u32 start_minor_hash,
1302 int *has_inline_data)
1303{
1304 int err = 0, count = 0;
1305 unsigned int parent_ino;
1306 int pos;
1307 struct ext4_dir_entry_2 *de;
1308 struct inode *inode = file_inode(dir_file);
1309 int ret, inline_size = 0;
1310 struct ext4_iloc iloc;
1311 void *dir_buf = NULL;
1312 struct ext4_dir_entry_2 fake;
1313
1314 ret = ext4_get_inode_loc(inode, &iloc);
1315 if (ret)
1316 return ret;
1317
1318 down_read(&EXT4_I(inode)->xattr_sem);
1319 if (!ext4_has_inline_data(inode)) {
1320 up_read(&EXT4_I(inode)->xattr_sem);
1321 *has_inline_data = 0;
1322 goto out;
1323 }
1324
1325 inline_size = ext4_get_inline_size(inode);
1326 dir_buf = kmalloc(inline_size, GFP_NOFS);
1327 if (!dir_buf) {
1328 ret = -ENOMEM;
1329 up_read(&EXT4_I(inode)->xattr_sem);
1330 goto out;
1331 }
1332
1333 ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
1334 up_read(&EXT4_I(inode)->xattr_sem);
1335 if (ret < 0)
1336 goto out;
1337
1338 pos = 0;
1339 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1340 while (pos < inline_size) {
1341 /*
1342 * As an inlined dir doesn't store any information about '.', and
1343 * only the inode number of '..' is stored, we have to handle
1344 * them differently.
1345 */
1346 if (pos == 0) {
1347 fake.inode = cpu_to_le32(inode->i_ino);
1348 fake.name_len = 1;
1349 strcpy(fake.name, ".");
1350 fake.rec_len = ext4_rec_len_to_disk(
1351 EXT4_DIR_REC_LEN(fake.name_len),
1352 inline_size);
1353 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
1354 de = &fake;
1355 pos = EXT4_INLINE_DOTDOT_OFFSET;
1356 } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
1357 fake.inode = cpu_to_le32(parent_ino);
1358 fake.name_len = 2;
1359 strcpy(fake.name, "..");
1360 fake.rec_len = ext4_rec_len_to_disk(
1361 EXT4_DIR_REC_LEN(fake.name_len),
1362 inline_size);
1363 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
1364 de = &fake;
1365 pos = EXT4_INLINE_DOTDOT_SIZE;
1366 } else {
1367 de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
1368 pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
1369 if (ext4_check_dir_entry(inode, dir_file, de,
1370 iloc.bh, dir_buf,
1371 inline_size, pos)) {
1372 ret = count;
1373 goto out;
1374 }
1375 }
1376
1377 ext4fs_dirhash(de->name, de->name_len, hinfo);
1378 if ((hinfo->hash < start_hash) ||
1379 ((hinfo->hash == start_hash) &&
1380 (hinfo->minor_hash < start_minor_hash)))
1381 continue;
1382 if (de->inode == 0)
1383 continue;
1384 err = ext4_htree_store_dirent(dir_file,
1385 hinfo->hash, hinfo->minor_hash, de);
1386 if (err) {
1387 count = err;
1388 goto out;
1389 }
1390 count++;
1391 }
1392 ret = count;
1393out:
1394 kfree(dir_buf);
1395 brelse(iloc.bh);
1396 return ret;
1397}
1398
1399/*
1400 * So this function is called when the volume is mkfsed with
1401 * dir_index disabled. In order to keep f_pos persistent
1402 * after we convert from an inline dir to a block-based one,
1403 * we just pretend that we are a normal dir and return the
1404 * offset as if '.' and '..' entries really existed.
1405 *
1406 */
1292int ext4_read_inline_dir(struct file *filp, 1407int ext4_read_inline_dir(struct file *filp,
1293 void *dirent, filldir_t filldir, 1408 void *dirent, filldir_t filldir,
1294 int *has_inline_data) 1409 int *has_inline_data)
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
1302 int ret, inline_size = 0; 1417 int ret, inline_size = 0;
1303 struct ext4_iloc iloc; 1418 struct ext4_iloc iloc;
1304 void *dir_buf = NULL; 1419 void *dir_buf = NULL;
1420 int dotdot_offset, dotdot_size, extra_offset, extra_size;
1305 1421
1306 ret = ext4_get_inode_loc(inode, &iloc); 1422 ret = ext4_get_inode_loc(inode, &iloc);
1307 if (ret) 1423 if (ret)
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
1330 sb = inode->i_sb; 1446 sb = inode->i_sb;
1331 stored = 0; 1447 stored = 0;
1332 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); 1448 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1449 offset = filp->f_pos;
1333 1450
1334 while (!error && !stored && filp->f_pos < inode->i_size) { 1451 /*
1452 * dotdot_offset and dotdot_size are the real offset and
1453 * size for ".." and "." if the dir were block based, while
1454 * the real size for them is only EXT4_INLINE_DOTDOT_SIZE.
1455 * So we will use extra_offset and extra_size to account for
1456 * them during the inline dir iteration.
1457 */
1458 dotdot_offset = EXT4_DIR_REC_LEN(1);
1459 dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
1460 extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
1461 extra_size = extra_offset + inline_size;
1462
1463 while (!error && !stored && filp->f_pos < extra_size) {
1335revalidate: 1464revalidate:
1336 /* 1465 /*
1337 * If the version has changed since the last call to 1466 * If the version has changed since the last call to
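
The arithmetic behind dotdot_offset and dotdot_size works out as follows; a worked example, assuming the usual ext4.h definition EXT4_DIR_REC_LEN(n) = ((n) + 8 + 3) & ~3:

    /*
     *   EXT4_DIR_REC_LEN(1) = (1 + 8 + 3) & ~3 = 12  ->  dotdot_offset = 12
     *   EXT4_DIR_REC_LEN(2) = (2 + 8 + 3) & ~3 = 12  ->  dotdot_size   = 24
     *   extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE = 24 - 4 = 20
     *
     * So an entry stored at inline-buffer offset p is reported at
     * f_pos = p + 20, exactly as if block-style "." and ".." entries
     * occupied the first 24 bytes of the directory.
     */
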
@@ -1340,15 +1469,23 @@ revalidate:
1340 * dir to make sure. 1469 * dir to make sure.
1341 */ 1470 */
1342 if (filp->f_version != inode->i_version) { 1471 if (filp->f_version != inode->i_version) {
1343 for (i = 0; 1472 for (i = 0; i < extra_size && i < offset;) {
1344 i < inode->i_size && i < offset;) { 1473 /*
1474 * "." is with offset 0 and
1475 * ".." is dotdot_offset.
1476 */
1345 if (!i) { 1477 if (!i) {
1346 /* skip "." and ".." if needed. */ 1478 i = dotdot_offset;
1347 i += EXT4_INLINE_DOTDOT_SIZE; 1479 continue;
1480 } else if (i == dotdot_offset) {
1481 i = dotdot_size;
1348 continue; 1482 continue;
1349 } 1483 }
1484 /* for other entries, the real offset in
1485 * the buf has to be adjusted accordingly.
1486 */
1350 de = (struct ext4_dir_entry_2 *) 1487 de = (struct ext4_dir_entry_2 *)
1351 (dir_buf + i); 1488 (dir_buf + i - extra_offset);
1352 /* It's too expensive to do a full 1489 /* It's too expensive to do a full
1353 * dirent test each time round this 1490 * dirent test each time round this
1354 * loop, but we do have to test at 1491 * loop, but we do have to test at
@@ -1356,43 +1493,47 @@ revalidate:
1356 * failure will be detected in the 1493 * failure will be detected in the
1357 * dirent test below. */ 1494 * dirent test below. */
1358 if (ext4_rec_len_from_disk(de->rec_len, 1495 if (ext4_rec_len_from_disk(de->rec_len,
1359 inline_size) < EXT4_DIR_REC_LEN(1)) 1496 extra_size) < EXT4_DIR_REC_LEN(1))
1360 break; 1497 break;
1361 i += ext4_rec_len_from_disk(de->rec_len, 1498 i += ext4_rec_len_from_disk(de->rec_len,
1362 inline_size); 1499 extra_size);
1363 } 1500 }
1364 offset = i; 1501 offset = i;
1365 filp->f_pos = offset; 1502 filp->f_pos = offset;
1366 filp->f_version = inode->i_version; 1503 filp->f_version = inode->i_version;
1367 } 1504 }
1368 1505
1369 while (!error && filp->f_pos < inode->i_size) { 1506 while (!error && filp->f_pos < extra_size) {
1370 if (filp->f_pos == 0) { 1507 if (filp->f_pos == 0) {
1371 error = filldir(dirent, ".", 1, 0, inode->i_ino, 1508 error = filldir(dirent, ".", 1, 0, inode->i_ino,
1372 DT_DIR); 1509 DT_DIR);
1373 if (error) 1510 if (error)
1374 break; 1511 break;
1375 stored++; 1512 stored++;
1513 filp->f_pos = dotdot_offset;
1514 continue;
1515 }
1376 1516
1377 error = filldir(dirent, "..", 2, 0, parent_ino, 1517 if (filp->f_pos == dotdot_offset) {
1378 DT_DIR); 1518 error = filldir(dirent, "..", 2,
1519 dotdot_offset,
1520 parent_ino, DT_DIR);
1379 if (error) 1521 if (error)
1380 break; 1522 break;
1381 stored++; 1523 stored++;
1382 1524
1383 filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; 1525 filp->f_pos = dotdot_size;
1384 continue; 1526 continue;
1385 } 1527 }
1386 1528
1387 de = (struct ext4_dir_entry_2 *)(dir_buf + offset); 1529 de = (struct ext4_dir_entry_2 *)
1530 (dir_buf + filp->f_pos - extra_offset);
1388 if (ext4_check_dir_entry(inode, filp, de, 1531 if (ext4_check_dir_entry(inode, filp, de,
1389 iloc.bh, dir_buf, 1532 iloc.bh, dir_buf,
1390 inline_size, offset)) { 1533 extra_size, filp->f_pos)) {
1391 ret = stored; 1534 ret = stored;
1392 goto out; 1535 goto out;
1393 } 1536 }
1394 offset += ext4_rec_len_from_disk(de->rec_len,
1395 inline_size);
1396 if (le32_to_cpu(de->inode)) { 1537 if (le32_to_cpu(de->inode)) {
1397 /* We might block in the next section 1538 /* We might block in the next section
1398 * if the data destination is 1539 * if the data destination is
@@ -1415,9 +1556,8 @@ revalidate:
1415 stored++; 1556 stored++;
1416 } 1557 }
1417 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 1558 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
1418 inline_size); 1559 extra_size);
1419 } 1560 }
1420 offset = 0;
1421 } 1561 }
1422out: 1562out:
1423 kfree(dir_buf); 1563 kfree(dir_buf);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3a5213bc73e..793d44b84d7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
55 __u16 csum_hi = 0; 55 __u16 csum_hi = 0;
56 __u32 csum; 56 __u32 csum;
57 57
58 csum_lo = raw->i_checksum_lo; 58 csum_lo = le16_to_cpu(raw->i_checksum_lo);
59 raw->i_checksum_lo = 0; 59 raw->i_checksum_lo = 0;
60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { 61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
62 csum_hi = raw->i_checksum_hi; 62 csum_hi = le16_to_cpu(raw->i_checksum_hi);
63 raw->i_checksum_hi = 0; 63 raw->i_checksum_hi = 0;
64 } 64 }
65 65
66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, 66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
67 EXT4_INODE_SIZE(inode->i_sb)); 67 EXT4_INODE_SIZE(inode->i_sb));
68 68
69 raw->i_checksum_lo = csum_lo; 69 raw->i_checksum_lo = cpu_to_le16(csum_lo);
70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
72 raw->i_checksum_hi = csum_hi; 72 raw->i_checksum_hi = cpu_to_le16(csum_hi);
73 73
74 return csum; 74 return csum;
75} 75}
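
The hunk above keeps the usual save-zero-checksum-restore pattern but converts the saved copies to host endianness, so the __le16 on-disk fields keep their sparse annotations. The generic shape of the pattern, as a hedged sketch rather than the full ext4_inode_csum():

    static __u32 sketch_csum_with_field_masked(struct ext4_sb_info *sbi,
                                               struct ext4_inode *raw,
                                               __u32 seed, int isize)
    {
            __u16 lo = le16_to_cpu(raw->i_checksum_lo); /* save, host-endian */
            __u32 csum;

            raw->i_checksum_lo = 0;                     /* mask the field out */
            csum = ext4_chksum(sbi, seed, (__u8 *)raw, isize);
            raw->i_checksum_lo = cpu_to_le16(lo);       /* restore disk form */
            return csum;
    }
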
@@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode)
210 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 210 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
211 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; 211 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
212 212
213 jbd2_log_start_commit(journal, commit_tid); 213 jbd2_complete_transaction(journal, commit_tid);
214 jbd2_log_wait_commit(journal, commit_tid);
215 filemap_write_and_wait(&inode->i_data); 214 filemap_write_and_wait(&inode->i_data);
216 } 215 }
217 truncate_inode_pages(&inode->i_data, 0); 216 truncate_inode_pages(&inode->i_data, 0);
@@ -1081,20 +1080,42 @@ retry_journal:
1081/* For write_end() in data=journal mode */ 1080/* For write_end() in data=journal mode */
1082static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1081static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1083{ 1082{
1083 int ret;
1084 if (!buffer_mapped(bh) || buffer_freed(bh)) 1084 if (!buffer_mapped(bh) || buffer_freed(bh))
1085 return 0; 1085 return 0;
1086 set_buffer_uptodate(bh); 1086 set_buffer_uptodate(bh);
1087 return ext4_handle_dirty_metadata(handle, NULL, bh); 1087 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1088 clear_buffer_meta(bh);
1089 clear_buffer_prio(bh);
1090 return ret;
1088} 1091}
1089 1092
1090static int ext4_generic_write_end(struct file *file, 1093/*
1091 struct address_space *mapping, 1094 * We need to pick up the new inode size which generic_commit_write gave us
1092 loff_t pos, unsigned len, unsigned copied, 1095 * `file' can be NULL - eg, when called from page_symlink().
1093 struct page *page, void *fsdata) 1096 *
1097 * ext4 never places buffers on inode->i_mapping->private_list. metadata
1098 * buffers are managed internally.
1099 */
1100static int ext4_write_end(struct file *file,
1101 struct address_space *mapping,
1102 loff_t pos, unsigned len, unsigned copied,
1103 struct page *page, void *fsdata)
1094{ 1104{
1095 int i_size_changed = 0;
1096 struct inode *inode = mapping->host;
1097 handle_t *handle = ext4_journal_current_handle(); 1105 handle_t *handle = ext4_journal_current_handle();
1106 struct inode *inode = mapping->host;
1107 int ret = 0, ret2;
1108 int i_size_changed = 0;
1109
1110 trace_ext4_write_end(inode, pos, len, copied);
1111 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
1112 ret = ext4_jbd2_file_inode(handle, inode);
1113 if (ret) {
1114 unlock_page(page);
1115 page_cache_release(page);
1116 goto errout;
1117 }
1118 }
1098 1119
1099 if (ext4_has_inline_data(inode)) 1120 if (ext4_has_inline_data(inode))
1100 copied = ext4_write_inline_data_end(inode, pos, len, 1121 copied = ext4_write_inline_data_end(inode, pos, len,
@@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file,
1105 1126
1106 /* 1127 /*
1107 * No need to use i_size_read() here, the i_size 1128 * No need to use i_size_read() here, the i_size
1108 * cannot change under us because we hold i_mutex. 1129 * cannot change under us because we hold i_mutex.
1109 * 1130 *
1110 * But it's important to update i_size while still holding page lock: 1131 * But it's important to update i_size while still holding page lock:
1111 * page writeout could otherwise come in and zero beyond i_size. 1132 * page writeout could otherwise come in and zero beyond i_size.
@@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file,
1115 i_size_changed = 1; 1136 i_size_changed = 1;
1116 } 1137 }
1117 1138
1118 if (pos + copied > EXT4_I(inode)->i_disksize) { 1139 if (pos + copied > EXT4_I(inode)->i_disksize) {
1119 /* We need to mark inode dirty even if 1140 /* We need to mark inode dirty even if
1120 * new_i_size is less that inode->i_size 1141 * new_i_size is less that inode->i_size
1121 * bu greater than i_disksize.(hint delalloc) 1142 * but greater than i_disksize. (hint delalloc)
1122 */ 1143 */
1123 ext4_update_i_disksize(inode, (pos + copied)); 1144 ext4_update_i_disksize(inode, (pos + copied));
1124 i_size_changed = 1; 1145 i_size_changed = 1;
@@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file,
1135 if (i_size_changed) 1156 if (i_size_changed)
1136 ext4_mark_inode_dirty(handle, inode); 1157 ext4_mark_inode_dirty(handle, inode);
1137 1158
1138 return copied; 1159 if (copied < 0)
1139} 1160 ret = copied;
1140
1141/*
1142 * We need to pick up the new inode size which generic_commit_write gave us
1143 * `file' can be NULL - eg, when called from page_symlink().
1144 *
1145 * ext4 never places buffers on inode->i_mapping->private_list. metadata
1146 * buffers are managed internally.
1147 */
1148static int ext4_ordered_write_end(struct file *file,
1149 struct address_space *mapping,
1150 loff_t pos, unsigned len, unsigned copied,
1151 struct page *page, void *fsdata)
1152{
1153 handle_t *handle = ext4_journal_current_handle();
1154 struct inode *inode = mapping->host;
1155 int ret = 0, ret2;
1156
1157 trace_ext4_ordered_write_end(inode, pos, len, copied);
1158 ret = ext4_jbd2_file_inode(handle, inode);
1159
1160 if (ret == 0) {
1161 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1162 page, fsdata);
1163 copied = ret2;
1164 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1165 /* if we have allocated more blocks and copied
1166 * less. We will have blocks allocated outside
1167 * inode->i_size. So truncate them
1168 */
1169 ext4_orphan_add(handle, inode);
1170 if (ret2 < 0)
1171 ret = ret2;
1172 } else {
1173 unlock_page(page);
1174 page_cache_release(page);
1175 }
1176
1177 ret2 = ext4_journal_stop(handle);
1178 if (!ret)
1179 ret = ret2;
1180
1181 if (pos + len > inode->i_size) {
1182 ext4_truncate_failed_write(inode);
1183 /*
1184 * If truncate failed early the inode might still be
1185 * on the orphan list; we need to make sure the inode
1186 * is removed from the orphan list in that case.
1187 */
1188 if (inode->i_nlink)
1189 ext4_orphan_del(NULL, inode);
1190 }
1191
1192
1193 return ret ? ret : copied;
1194}
1195
1196static int ext4_writeback_write_end(struct file *file,
1197 struct address_space *mapping,
1198 loff_t pos, unsigned len, unsigned copied,
1199 struct page *page, void *fsdata)
1200{
1201 handle_t *handle = ext4_journal_current_handle();
1202 struct inode *inode = mapping->host;
1203 int ret = 0, ret2;
1204
1205 trace_ext4_writeback_write_end(inode, pos, len, copied);
1206 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1207 page, fsdata);
1208 copied = ret2;
1209 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1161 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1210 /* if we have allocated more blocks and copied 1162 /* if we have allocated more blocks and copied
1211 * less. We will have blocks allocated outside 1163 * less. We will have blocks allocated outside
1212 * inode->i_size. So truncate them 1164 * inode->i_size. So truncate them
1213 */ 1165 */
1214 ext4_orphan_add(handle, inode); 1166 ext4_orphan_add(handle, inode);
1215 1167errout:
1216 if (ret2 < 0)
1217 ret = ret2;
1218
1219 ret2 = ext4_journal_stop(handle); 1168 ret2 = ext4_journal_stop(handle);
1220 if (!ret) 1169 if (!ret)
1221 ret = ret2; 1170 ret = ret2;
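
With the ordered and writeback variants gone, the surviving ext4_write_end() can be read as the outline below (a hedged sketch, error handling and the inline-data path trimmed); the only ordered-mode-specific step left is the ext4_jbd2_file_inode() call gated on EXT4_STATE_ORDERED_MODE:

    static int sketch_write_end(handle_t *handle, struct inode *inode,
                                loff_t pos, unsigned len, unsigned copied)
    {
            int ret = 0, ret2;

            if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
                    ret = ext4_jbd2_file_inode(handle, inode); /* ordered hook */
            /* ... update i_size/i_disksize while the page is still locked ... */
            if (pos + len > inode->i_size && ext4_can_truncate(inode))
                    ext4_orphan_add(handle, inode); /* trim past-EOF blocks later */
            ret2 = ext4_journal_stop(handle);
            if (!ret)
                    ret = ret2;
            if (pos + len > inode->i_size)
                    ext4_truncate_failed_write(inode); /* also drops the orphan */
            return ret ? ret : copied;
    }
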
@@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1538 struct ext4_io_submit io_submit; 1487 struct ext4_io_submit io_submit;
1539 1488
1540 BUG_ON(mpd->next_page <= mpd->first_page); 1489 BUG_ON(mpd->next_page <= mpd->first_page);
1541 memset(&io_submit, 0, sizeof(io_submit)); 1490 ext4_io_submit_init(&io_submit, mpd->wbc);
1491 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1492 if (!io_submit.io_end)
1493 return -ENOMEM;
1542 /* 1494 /*
1543 * We need to start from the first_page to the next_page - 1 1495 * We need to start from the first_page to the next_page - 1
1544 * to make sure we also write the mapped dirty buffer_heads. 1496 * to make sure we also write the mapped dirty buffer_heads.
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1626 pagevec_release(&pvec); 1578 pagevec_release(&pvec);
1627 } 1579 }
1628 ext4_io_submit(&io_submit); 1580 ext4_io_submit(&io_submit);
1581 /* Drop io_end reference we got from init */
1582 ext4_put_io_end_defer(io_submit.io_end);
1629 return ret; 1583 return ret;
1630} 1584}
1631 1585
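
Both writeback paths now pin the io_end across submission: ext4_init_io_end() hands back one reference, each in-flight bio effectively holds one, and ext4_put_io_end_defer() drops the submitter's reference once all pages are queued, so completion can only run after the last bio finishes. The discipline, sketched with illustrative names (the real structure and helpers live in fs/ext4/page-io.c):

    struct sketch_io_end {
            atomic_t count;         /* one for the submitter + one per bio */
            /* ... inode, offset, size, unwritten flag ... */
    };

    static inline void sketch_get_io_end(struct sketch_io_end *io_end)
    {
            atomic_inc(&io_end->count);     /* taken when a bio is attached */
    }

    static void sketch_put_io_end_defer(struct sketch_io_end *io_end)
    {
            if (atomic_dec_and_test(&io_end->count))
                    kfree(io_end);  /* last ref: free, or defer completion */
    }
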
@@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode)
1670{ 1624{
1671 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1625 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1672 struct super_block *sb = inode->i_sb; 1626 struct super_block *sb = inode->i_sb;
1627 struct ext4_inode_info *ei = EXT4_I(inode);
1673 1628
1674 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", 1629 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1675 EXT4_C2B(EXT4_SB(inode->i_sb), 1630 EXT4_C2B(EXT4_SB(inode->i_sb),
1676 ext4_count_free_clusters(inode->i_sb))); 1631 ext4_count_free_clusters(sb)));
1677 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); 1632 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1678 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", 1633 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1679 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1634 (long long) EXT4_C2B(EXT4_SB(sb),
1680 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1635 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1681 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", 1636 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1682 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1637 (long long) EXT4_C2B(EXT4_SB(sb),
1683 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1638 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1684 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1639 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1685 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1640 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1686 EXT4_I(inode)->i_reserved_data_blocks); 1641 ei->i_reserved_data_blocks);
1687 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", 1642 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1688 EXT4_I(inode)->i_reserved_meta_blocks); 1643 ei->i_reserved_meta_blocks);
1644 ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
1645 ei->i_allocated_meta_blocks);
1689 return; 1646 return;
1690} 1647}
1691 1648
@@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1740 */ 1697 */
1741 map.m_lblk = next; 1698 map.m_lblk = next;
1742 map.m_len = max_blocks; 1699 map.m_len = max_blocks;
1743 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 1700 /*
1701 * We're in delalloc path and it is possible that we're going to
1702 * need more metadata blocks than previously reserved. However
1703 * we must not fail because we're in writeback and there is
1704 * nothing we can do about it so it might result in data loss.
1705 * So use reserved blocks to allocate metadata if possible.
1706 */
1707 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1708 EXT4_GET_BLOCKS_METADATA_NOFAIL;
1744 if (ext4_should_dioread_nolock(mpd->inode)) 1709 if (ext4_should_dioread_nolock(mpd->inode))
1745 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 1710 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1746 if (mpd->b_state & (1 << BH_Delay)) 1711 if (mpd->b_state & (1 << BH_Delay))
1747 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 1712 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1748 1713
1714
1749 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 1715 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1750 if (blks < 0) { 1716 if (blks < 0) {
1751 struct super_block *sb = mpd->inode->i_sb; 1717 struct super_block *sb = mpd->inode->i_sb;
@@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page,
2272 */ 2238 */
2273 return __ext4_journalled_writepage(page, len); 2239 return __ext4_journalled_writepage(page, len);
2274 2240
2275 memset(&io_submit, 0, sizeof(io_submit)); 2241 ext4_io_submit_init(&io_submit, wbc);
2242 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
2243 if (!io_submit.io_end) {
2244 redirty_page_for_writepage(wbc, page);
2245 return -ENOMEM;
2246 }
2276 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 2247 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2277 ext4_io_submit(&io_submit); 2248 ext4_io_submit(&io_submit);
2249 /* Drop io_end reference we got from init */
2250 ext4_put_io_end_defer(io_submit.io_end);
2278 return ret; 2251 return ret;
2279} 2252}
2280 2253
@@ -2661,7 +2634,7 @@ out_writepages:
2661 2634
2662static int ext4_nonda_switch(struct super_block *sb) 2635static int ext4_nonda_switch(struct super_block *sb)
2663{ 2636{
2664 s64 free_blocks, dirty_blocks; 2637 s64 free_clusters, dirty_clusters;
2665 struct ext4_sb_info *sbi = EXT4_SB(sb); 2638 struct ext4_sb_info *sbi = EXT4_SB(sb);
2666 2639
2667 /* 2640 /*
@@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb)
2672 * Delalloc need an accurate free block accounting. So switch 2645 * Delalloc need an accurate free block accounting. So switch
2673 * to non delalloc when we are near to error range. 2646 * to non delalloc when we are near to error range.
2674 */ 2647 */
2675 free_blocks = EXT4_C2B(sbi, 2648 free_clusters =
2676 percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 2649 percpu_counter_read_positive(&sbi->s_freeclusters_counter);
2677 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2650 dirty_clusters =
2651 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2678 /* 2652 /*
2679 * Start pushing delalloc when 1/2 of free blocks are dirty. 2653 * Start pushing delalloc when 1/2 of free blocks are dirty.
2680 */ 2654 */
2681 if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) 2655 if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
2682 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 2656 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
2683 2657
2684 if (2 * free_blocks < 3 * dirty_blocks || 2658 if (2 * free_clusters < 3 * dirty_clusters ||
2685 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { 2659 free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
2686 /* 2660 /*
2687 * free block count is less than 150% of dirty blocks 2661 * free block count is less than 150% of dirty blocks
2688 * or free blocks is less than watermark 2662 * or free blocks is less than watermark
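Stripped of the per-CPU counter plumbing, the fallback condition above reduces to two integer comparisons. A minimal standalone sketch, assuming plain 64-bit counters (the function name and the watermark parameter are illustrative, not ext4 API):

    /* Fall back to non-delalloc when fewer than 150% of the dirty
     * clusters are free, or when free space drops under the watermark. */
    static int nonda_switch_needed(long long free_clusters,
                                   long long dirty_clusters,
                                   long long watermark)
    {
            return 2 * free_clusters < 3 * dirty_clusters ||
                   free_clusters < dirty_clusters + watermark;
    }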
@@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file,
2818 unsigned long start, end; 2792 unsigned long start, end;
2819 int write_mode = (int)(unsigned long)fsdata; 2793 int write_mode = (int)(unsigned long)fsdata;
2820 2794
2821 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2795 if (write_mode == FALL_BACK_TO_NONDELALLOC)
2822 switch (ext4_inode_journal_mode(inode)) { 2796 return ext4_write_end(file, mapping, pos,
2823 case EXT4_INODE_ORDERED_DATA_MODE: 2797 len, copied, page, fsdata);
2824 return ext4_ordered_write_end(file, mapping, pos,
2825 len, copied, page, fsdata);
2826 case EXT4_INODE_WRITEBACK_DATA_MODE:
2827 return ext4_writeback_write_end(file, mapping, pos,
2828 len, copied, page, fsdata);
2829 default:
2830 BUG();
2831 }
2832 }
2833 2798
2834 trace_ext4_da_write_end(inode, pos, len, copied); 2799 trace_ext4_da_write_end(inode, pos, len, copied);
2835 start = pos & (PAGE_CACHE_SIZE - 1); 2800 start = pos & (PAGE_CACHE_SIZE - 1);
@@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3113 struct inode *inode = file_inode(iocb->ki_filp); 3078 struct inode *inode = file_inode(iocb->ki_filp);
3114 ext4_io_end_t *io_end = iocb->private; 3079 ext4_io_end_t *io_end = iocb->private;
3115 3080
3116 /* if not async direct IO or dio with 0 bytes write, just return */ 3081 /* if not async direct IO just return */
3117 if (!io_end || !size) 3082 if (!io_end) {
3118 goto out; 3083 inode_dio_done(inode);
3084 if (is_async)
3085 aio_complete(iocb, ret, 0);
3086 return;
3087 }
3119 3088
3120 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3089 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3121 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3090 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3123 size); 3092 size);
3124 3093
3125 iocb->private = NULL; 3094 iocb->private = NULL;
3126
3127 /* if not aio dio with unwritten extents, just free io and return */
3128 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3129 ext4_free_io_end(io_end);
3130out:
3131 inode_dio_done(inode);
3132 if (is_async)
3133 aio_complete(iocb, ret, 0);
3134 return;
3135 }
3136
3137 io_end->offset = offset; 3095 io_end->offset = offset;
3138 io_end->size = size; 3096 io_end->size = size;
3139 if (is_async) { 3097 if (is_async) {
3140 io_end->iocb = iocb; 3098 io_end->iocb = iocb;
3141 io_end->result = ret; 3099 io_end->result = ret;
3142 } 3100 }
3143 3101 ext4_put_io_end_defer(io_end);
3144 ext4_add_complete_io(io_end);
3145} 3102}
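The hunk above replaces the old "free the io_end in whichever path runs last" logic with reference counting: the submitter and the I/O completion each hold a reference, and the last put tears the structure down. A toy userspace model of that pattern, assuming only that ext4_get_io_end()/ext4_put_io_end() behave like the get/put pair below:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct io_end {
            atomic_int ref;                 /* one reference per holder */
            /* ... offset, size, flags ... */
    };

    static struct io_end *io_end_get(struct io_end *io)
    {
            atomic_fetch_add(&io->ref, 1);
            return io;
    }

    static void io_end_put(struct io_end *io)
    {
            /* The last put frees the object (in ext4 it may instead queue
             * unwritten-extent conversion), so submission and completion
             * can finish in either order without leaks or use-after-free. */
            if (atomic_fetch_sub(&io->ref, 1) == 1)
                    free(io);
    }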
3146 3103
3147/* 3104/*
@@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3175 get_block_t *get_block_func = NULL; 3132 get_block_t *get_block_func = NULL;
3176 int dio_flags = 0; 3133 int dio_flags = 0;
3177 loff_t final_size = offset + count; 3134 loff_t final_size = offset + count;
3135 ext4_io_end_t *io_end = NULL;
3178 3136
3179 /* Use the old path for reads and writes beyond i_size. */ 3137 /* Use the old path for reads and writes beyond i_size. */
3180 if (rw != WRITE || final_size > inode->i_size) 3138 if (rw != WRITE || final_size > inode->i_size)
@@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3213 iocb->private = NULL; 3171 iocb->private = NULL;
3214 ext4_inode_aio_set(inode, NULL); 3172 ext4_inode_aio_set(inode, NULL);
3215 if (!is_sync_kiocb(iocb)) { 3173 if (!is_sync_kiocb(iocb)) {
3216 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3174 io_end = ext4_init_io_end(inode, GFP_NOFS);
3217 if (!io_end) { 3175 if (!io_end) {
3218 ret = -ENOMEM; 3176 ret = -ENOMEM;
3219 goto retake_lock; 3177 goto retake_lock;
3220 } 3178 }
3221 io_end->flag |= EXT4_IO_END_DIRECT; 3179 io_end->flag |= EXT4_IO_END_DIRECT;
3222 iocb->private = io_end; 3180 /*
3181 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3182 */
3183 iocb->private = ext4_get_io_end(io_end);
3223 /* 3184 /*
3224 * we save the io structure for current async direct 3185 * we save the io structure for current async direct
3225 * IO, so that later ext4_map_blocks() could flag the 3186 * IO, so that later ext4_map_blocks() could flag the
@@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3243 NULL, 3204 NULL,
3244 dio_flags); 3205 dio_flags);
3245 3206
3246 if (iocb->private)
3247 ext4_inode_aio_set(inode, NULL);
3248 /* 3207 /*
3249 * The io_end structure takes a reference to the inode, that 3208 * Put our reference to io_end. This can free the io_end structure e.g.
3250 * structure needs to be destroyed and the reference to the 3209 * in sync IO case or in case of error. It can even perform extent
3251 * inode need to be dropped, when IO is complete, even with 0 3210 * conversion if all bios we submitted finished before we got here.
3252 * byte write, or failed. 3211 * Note that in that case iocb->private can be already set to NULL
3253 * 3212 * here.
3254 * In the successful AIO DIO case, the io_end structure will
3255 * be destroyed and the reference to the inode will be dropped
3256 * after the end_io call back function is called.
3257 *
3258 * In the case there is 0 byte write, or error case, since VFS
3259 * direct IO won't invoke the end_io call back function, we
3260 * need to free the end_io structure here.
3261 */ 3213 */
3262 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3214 if (io_end) {
3263 ext4_free_io_end(iocb->private); 3215 ext4_inode_aio_set(inode, NULL);
3264 iocb->private = NULL; 3216 ext4_put_io_end(io_end);
3265 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3217 /*
3218 * In case of error or no write ext4_end_io_dio() was not
3219 * called so we have to put iocb's reference.
3220 */
3221 if (ret <= 0 && ret != -EIOCBQUEUED) {
3222 WARN_ON(iocb->private != io_end);
3223 ext4_put_io_end(io_end);
3224 iocb->private = NULL;
3225 }
3226 }
3227 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3266 EXT4_STATE_DIO_UNWRITTEN)) { 3228 EXT4_STATE_DIO_UNWRITTEN)) {
3267 int err; 3229 int err;
3268 /* 3230 /*
@@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page)
3334 return __set_page_dirty_nobuffers(page); 3296 return __set_page_dirty_nobuffers(page);
3335} 3297}
3336 3298
3337static const struct address_space_operations ext4_ordered_aops = { 3299static const struct address_space_operations ext4_aops = {
3338 .readpage = ext4_readpage, 3300 .readpage = ext4_readpage,
3339 .readpages = ext4_readpages, 3301 .readpages = ext4_readpages,
3340 .writepage = ext4_writepage, 3302 .writepage = ext4_writepage,
3341 .write_begin = ext4_write_begin, 3303 .write_begin = ext4_write_begin,
3342 .write_end = ext4_ordered_write_end, 3304 .write_end = ext4_write_end,
3343 .bmap = ext4_bmap,
3344 .invalidatepage = ext4_invalidatepage,
3345 .releasepage = ext4_releasepage,
3346 .direct_IO = ext4_direct_IO,
3347 .migratepage = buffer_migrate_page,
3348 .is_partially_uptodate = block_is_partially_uptodate,
3349 .error_remove_page = generic_error_remove_page,
3350};
3351
3352static const struct address_space_operations ext4_writeback_aops = {
3353 .readpage = ext4_readpage,
3354 .readpages = ext4_readpages,
3355 .writepage = ext4_writepage,
3356 .write_begin = ext4_write_begin,
3357 .write_end = ext4_writeback_write_end,
3358 .bmap = ext4_bmap, 3305 .bmap = ext4_bmap,
3359 .invalidatepage = ext4_invalidatepage, 3306 .invalidatepage = ext4_invalidatepage,
3360 .releasepage = ext4_releasepage, 3307 .releasepage = ext4_releasepage,
@@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode)
3399{ 3346{
3400 switch (ext4_inode_journal_mode(inode)) { 3347 switch (ext4_inode_journal_mode(inode)) {
3401 case EXT4_INODE_ORDERED_DATA_MODE: 3348 case EXT4_INODE_ORDERED_DATA_MODE:
3402 if (test_opt(inode->i_sb, DELALLOC)) 3349 ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
3403 inode->i_mapping->a_ops = &ext4_da_aops;
3404 else
3405 inode->i_mapping->a_ops = &ext4_ordered_aops;
3406 break; 3350 break;
3407 case EXT4_INODE_WRITEBACK_DATA_MODE: 3351 case EXT4_INODE_WRITEBACK_DATA_MODE:
3408 if (test_opt(inode->i_sb, DELALLOC)) 3352 ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
3409 inode->i_mapping->a_ops = &ext4_da_aops;
3410 else
3411 inode->i_mapping->a_ops = &ext4_writeback_aops;
3412 break; 3353 break;
3413 case EXT4_INODE_JOURNAL_DATA_MODE: 3354 case EXT4_INODE_JOURNAL_DATA_MODE:
3414 inode->i_mapping->a_ops = &ext4_journalled_aops; 3355 inode->i_mapping->a_ops = &ext4_journalled_aops;
3415 break; 3356 return;
3416 default: 3357 default:
3417 BUG(); 3358 BUG();
3418 } 3359 }
3360 if (test_opt(inode->i_sb, DELALLOC))
3361 inode->i_mapping->a_ops = &ext4_da_aops;
3362 else
3363 inode->i_mapping->a_ops = &ext4_aops;
3419} 3364}
3420 3365
3421 3366
@@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode)
3646int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3591int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3647{ 3592{
3648 struct inode *inode = file_inode(file); 3593 struct inode *inode = file_inode(file);
3594 struct super_block *sb = inode->i_sb;
3595 ext4_lblk_t first_block, stop_block;
3596 struct address_space *mapping = inode->i_mapping;
3597 loff_t first_page, last_page, page_len;
3598 loff_t first_page_offset, last_page_offset;
3599 handle_t *handle;
3600 unsigned int credits;
3601 int ret = 0;
3602
3649 if (!S_ISREG(inode->i_mode)) 3603 if (!S_ISREG(inode->i_mode))
3650 return -EOPNOTSUPP; 3604 return -EOPNOTSUPP;
3651 3605
3652 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3606 if (EXT4_SB(sb)->s_cluster_ratio > 1) {
3653 return ext4_ind_punch_hole(file, offset, length);
3654
3655 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3656 /* TODO: Add support for bigalloc file systems */ 3607 /* TODO: Add support for bigalloc file systems */
3657 return -EOPNOTSUPP; 3608 return -EOPNOTSUPP;
3658 } 3609 }
3659 3610
3660 trace_ext4_punch_hole(inode, offset, length); 3611 trace_ext4_punch_hole(inode, offset, length);
3661 3612
3662 return ext4_ext_punch_hole(file, offset, length); 3613 /*
3614 * Write out all dirty pages to avoid race conditions
3615 * Then release them.
3616 */
3617 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
3618 ret = filemap_write_and_wait_range(mapping, offset,
3619 offset + length - 1);
3620 if (ret)
3621 return ret;
3622 }
3623
3624 mutex_lock(&inode->i_mutex);
 3625 /* It's not possible to punch a hole on an append-only file */
3626 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
3627 ret = -EPERM;
3628 goto out_mutex;
3629 }
3630 if (IS_SWAPFILE(inode)) {
3631 ret = -ETXTBSY;
3632 goto out_mutex;
3633 }
3634
3635 /* No need to punch hole beyond i_size */
3636 if (offset >= inode->i_size)
3637 goto out_mutex;
3638
3639 /*
3640 * If the hole extends beyond i_size, set the hole
3641 * to end after the page that contains i_size
3642 */
3643 if (offset + length > inode->i_size) {
3644 length = inode->i_size +
3645 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
3646 offset;
3647 }
3648
3649 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
3650 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
3651
3652 first_page_offset = first_page << PAGE_CACHE_SHIFT;
3653 last_page_offset = last_page << PAGE_CACHE_SHIFT;
3654
3655 /* Now release the pages */
3656 if (last_page_offset > first_page_offset) {
3657 truncate_pagecache_range(inode, first_page_offset,
3658 last_page_offset - 1);
3659 }
3660
 3661 /* Wait for all existing dio workers; newcomers will block on i_mutex */
3662 ext4_inode_block_unlocked_dio(inode);
3663 ret = ext4_flush_unwritten_io(inode);
3664 if (ret)
3665 goto out_dio;
3666 inode_dio_wait(inode);
3667
3668 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3669 credits = ext4_writepage_trans_blocks(inode);
3670 else
3671 credits = ext4_blocks_for_truncate(inode);
3672 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3673 if (IS_ERR(handle)) {
3674 ret = PTR_ERR(handle);
3675 ext4_std_error(sb, ret);
3676 goto out_dio;
3677 }
3678
3679 /*
3680 * Now we need to zero out the non-page-aligned data in the
3681 * pages at the start and tail of the hole, and unmap the
3682 * buffer heads for the block aligned regions of the page that
3683 * were completely zeroed.
3684 */
3685 if (first_page > last_page) {
3686 /*
3687 * If the file space being truncated is contained
 3688 * within a page, just zero out and unmap the middle of
3689 * that page
3690 */
3691 ret = ext4_discard_partial_page_buffers(handle,
3692 mapping, offset, length, 0);
3693
3694 if (ret)
3695 goto out_stop;
3696 } else {
3697 /*
3698 * zero out and unmap the partial page that contains
3699 * the start of the hole
3700 */
3701 page_len = first_page_offset - offset;
3702 if (page_len > 0) {
3703 ret = ext4_discard_partial_page_buffers(handle, mapping,
3704 offset, page_len, 0);
3705 if (ret)
3706 goto out_stop;
3707 }
3708
3709 /*
3710 * zero out and unmap the partial page that contains
3711 * the end of the hole
3712 */
3713 page_len = offset + length - last_page_offset;
3714 if (page_len > 0) {
3715 ret = ext4_discard_partial_page_buffers(handle, mapping,
3716 last_page_offset, page_len, 0);
3717 if (ret)
3718 goto out_stop;
3719 }
3720 }
3721
3722 /*
3723 * If i_size is contained in the last page, we need to
3724 * unmap and zero the partial page after i_size
3725 */
3726 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3727 inode->i_size % PAGE_CACHE_SIZE != 0) {
3728 page_len = PAGE_CACHE_SIZE -
3729 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3730
3731 if (page_len > 0) {
3732 ret = ext4_discard_partial_page_buffers(handle,
3733 mapping, inode->i_size, page_len, 0);
3734
3735 if (ret)
3736 goto out_stop;
3737 }
3738 }
3739
3740 first_block = (offset + sb->s_blocksize - 1) >>
3741 EXT4_BLOCK_SIZE_BITS(sb);
3742 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
3743
3744 /* If there are no blocks to remove, return now */
3745 if (first_block >= stop_block)
3746 goto out_stop;
3747
3748 down_write(&EXT4_I(inode)->i_data_sem);
3749 ext4_discard_preallocations(inode);
3750
3751 ret = ext4_es_remove_extent(inode, first_block,
3752 stop_block - first_block);
3753 if (ret) {
3754 up_write(&EXT4_I(inode)->i_data_sem);
3755 goto out_stop;
3756 }
3757
3758 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3759 ret = ext4_ext_remove_space(inode, first_block,
3760 stop_block - 1);
3761 else
3762 ret = ext4_free_hole_blocks(handle, inode, first_block,
3763 stop_block);
3764
3765 ext4_discard_preallocations(inode);
3766 up_write(&EXT4_I(inode)->i_data_sem);
3767 if (IS_SYNC(inode))
3768 ext4_handle_sync(handle);
3769 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3770 ext4_mark_inode_dirty(handle, inode);
3771out_stop:
3772 ext4_journal_stop(handle);
3773out_dio:
3774 ext4_inode_resume_unlocked_dio(inode);
3775out_mutex:
3776 mutex_unlock(&inode->i_mutex);
3777 return ret;
3663} 3778}
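The rounding in the new function is worth spelling out: partial pages at either end of the hole are only zeroed in place, while whole blocks strictly inside it are deallocated. A self-contained model of that arithmetic (page_shift and blkbits stand in for PAGE_CACHE_SHIFT and EXT4_BLOCK_SIZE_BITS; illustrative only):

    #include <stdio.h>

    static void hole_bounds(long long offset, long long length,
                            int page_shift, int blkbits)
    {
            long long psize = 1LL << page_shift;
            long long bsize = 1LL << blkbits;
            /* pages: round the start up, the end down */
            long long first_page  = (offset + psize - 1) >> page_shift;
            long long last_page   = (offset + length) >> page_shift;
            /* blocks: same rounding, so only whole blocks are freed */
            long long first_block = (offset + bsize - 1) >> blkbits;
            long long stop_block  = (offset + length) >> blkbits;

            printf("drop pages [%lld,%lld), free blocks [%lld,%lld)\n",
                   first_page, last_page, first_block, stop_block);
    }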
3664 3779
3665/* 3780/*
@@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3692 */ 3807 */
3693void ext4_truncate(struct inode *inode) 3808void ext4_truncate(struct inode *inode)
3694{ 3809{
3810 struct ext4_inode_info *ei = EXT4_I(inode);
3811 unsigned int credits;
3812 handle_t *handle;
3813 struct address_space *mapping = inode->i_mapping;
3814 loff_t page_len;
3815
3816 /*
3817 * There is a possibility that we're either freeing the inode
3818 * or it completely new indode. In those cases we might not
3819 * have i_mutex locked because it's not necessary.
3820 */
3821 if (!(inode->i_state & (I_NEW|I_FREEING)))
3822 WARN_ON(!mutex_is_locked(&inode->i_mutex));
3695 trace_ext4_truncate_enter(inode); 3823 trace_ext4_truncate_enter(inode);
3696 3824
3697 if (!ext4_can_truncate(inode)) 3825 if (!ext4_can_truncate(inode))
@@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode)
3710 return; 3838 return;
3711 } 3839 }
3712 3840
3841 /*
3842 * finish any pending end_io work so we won't run the risk of
3843 * converting any truncated blocks to initialized later
3844 */
3845 ext4_flush_unwritten_io(inode);
3846
3847 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3848 credits = ext4_writepage_trans_blocks(inode);
3849 else
3850 credits = ext4_blocks_for_truncate(inode);
3851
3852 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3853 if (IS_ERR(handle)) {
3854 ext4_std_error(inode->i_sb, PTR_ERR(handle));
3855 return;
3856 }
3857
3858 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
3859 page_len = PAGE_CACHE_SIZE -
3860 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3861
3862 if (ext4_discard_partial_page_buffers(handle,
3863 mapping, inode->i_size, page_len, 0))
3864 goto out_stop;
3865 }
3866
3867 /*
3868 * We add the inode to the orphan list, so that if this
3869 * truncate spans multiple transactions, and we crash, we will
3870 * resume the truncate when the filesystem recovers. It also
3871 * marks the inode dirty, to catch the new size.
3872 *
3873 * Implication: the file must always be in a sane, consistent
3874 * truncatable state while each transaction commits.
3875 */
3876 if (ext4_orphan_add(handle, inode))
3877 goto out_stop;
3878
3879 down_write(&EXT4_I(inode)->i_data_sem);
3880
3881 ext4_discard_preallocations(inode);
3882
3713 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3883 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3714 ext4_ext_truncate(inode); 3884 ext4_ext_truncate(handle, inode);
3715 else 3885 else
3716 ext4_ind_truncate(inode); 3886 ext4_ind_truncate(handle, inode);
3887
3888 up_write(&ei->i_data_sem);
3889
3890 if (IS_SYNC(inode))
3891 ext4_handle_sync(handle);
3892
3893out_stop:
3894 /*
3895 * If this was a simple ftruncate() and the file will remain alive,
3896 * then we need to clear up the orphan record which we created above.
3897 * However, if this was a real unlink then we were called by
3898 * ext4_delete_inode(), and we allow that function to clean up the
3899 * orphan info for us.
3900 */
3901 if (inode->i_nlink)
3902 ext4_orphan_del(handle, inode);
3903
3904 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3905 ext4_mark_inode_dirty(handle, inode);
3906 ext4_journal_stop(handle);
3717 3907
3718 trace_ext4_truncate_exit(inode); 3908 trace_ext4_truncate_exit(inode);
3719} 3909}
@@ -3821,13 +4011,14 @@ make_io:
3821 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4011 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3822 ext4_fsblk_t b, end, table; 4012 ext4_fsblk_t b, end, table;
3823 unsigned num; 4013 unsigned num;
4014 __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
3824 4015
3825 table = ext4_inode_table(sb, gdp); 4016 table = ext4_inode_table(sb, gdp);
3826 /* s_inode_readahead_blks is always a power of 2 */ 4017 /* s_inode_readahead_blks is always a power of 2 */
3827 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4018 b = block & ~((ext4_fsblk_t) ra_blks - 1);
3828 if (table > b) 4019 if (table > b)
3829 b = table; 4020 b = table;
3830 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4021 end = b + ra_blks;
3831 num = EXT4_INODES_PER_GROUP(sb); 4022 num = EXT4_INODES_PER_GROUP(sb);
3832 if (ext4_has_group_desc_csum(sb)) 4023 if (ext4_has_group_desc_csum(sb))
3833 num -= ext4_itable_unused_count(sb, gdp); 4024 num -= ext4_itable_unused_count(sb, gdp);
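The added __u32 temporary plus the cast is the entire type-widening fix from the merge summary: s_inode_readahead_blks is 32-bit, so ~(blks - 1) used to be evaluated in 32 bits and the AND cleared the upper half of a 64-bit block number. A compilable demonstration of the difference:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t block = 0x100000010ULL;  /* block beyond 2^32 */
            uint32_t ra_blks = 32;            /* power-of-2 readahead window */

            /* buggy: mask computed in 32 bits, high half of 'block' lost */
            uint64_t bad  = block & ~(ra_blks - 1);
            /* fixed: widen first, then invert */
            uint64_t good = block & ~((uint64_t)ra_blks - 1);

            printf("bad=%#llx good=%#llx\n",
                   (unsigned long long)bad, (unsigned long long)good);
            return 0;   /* prints bad=0 good=0x100000000 */
    }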
@@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4024 * NeilBrown 1999oct15 4215 * NeilBrown 1999oct15
4025 */ 4216 */
4026 if (inode->i_nlink == 0) { 4217 if (inode->i_nlink == 0) {
4027 if (inode->i_mode == 0 || 4218 if ((inode->i_mode == 0 ||
4028 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4219 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
4220 ino != EXT4_BOOT_LOADER_INO) {
4029 /* this inode is deleted */ 4221 /* this inode is deleted */
4030 ret = -ESTALE; 4222 ret = -ESTALE;
4031 goto bad_inode; 4223 goto bad_inode;
@@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4033 /* The only unlinked inodes we let through here have 4225 /* The only unlinked inodes we let through here have
4034 * valid i_mode and are being read by the orphan 4226 * valid i_mode and are being read by the orphan
4035 * recovery code: that's fine, we're about to complete 4227 * recovery code: that's fine, we're about to complete
4036 * the process of deleting those. */ 4228 * the process of deleting those.
4229 * OR it is the EXT4_BOOT_LOADER_INO which is
4230 * not initialized on a new filesystem. */
4037 } 4231 }
4038 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4232 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4039 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4233 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4153 else 4347 else
4154 init_special_inode(inode, inode->i_mode, 4348 init_special_inode(inode, inode->i_mode,
4155 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4349 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4350 } else if (ino == EXT4_BOOT_LOADER_INO) {
4351 make_bad_inode(inode);
4156 } else { 4352 } else {
4157 ret = -EIO; 4353 ret = -EIO;
4158 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 4354 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 721f4d33e148..9491ac0590f7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,9 +17,201 @@
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
20#include "ext4_extents.h"
20 21
21#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) 22#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
22 23
24/**
25 * Swap memory between @a and @b for @len bytes.
26 *
27 * @a: pointer to first memory area
28 * @b: pointer to second memory area
29 * @len: number of bytes to swap
30 *
31 */
32static void memswap(void *a, void *b, size_t len)
33{
34 unsigned char *ap, *bp;
35 unsigned char tmp;
36
37 ap = (unsigned char *)a;
38 bp = (unsigned char *)b;
39 while (len-- > 0) {
40 tmp = *ap;
41 *ap = *bp;
42 *bp = tmp;
43 ap++;
44 bp++;
45 }
46}
47
48/**
49 * Swap i_data and associated attributes between @inode1 and @inode2.
50 * This function is used for the primary swap between inode1 and inode2
51 * and also to revert this primary swap in case of errors.
52 *
 53 * Therefore you have to make sure that calling this function twice
54 * will revert all changes.
55 *
56 * @inode1: pointer to first inode
57 * @inode2: pointer to second inode
58 */
59static void swap_inode_data(struct inode *inode1, struct inode *inode2)
60{
61 loff_t isize;
62 struct ext4_inode_info *ei1;
63 struct ext4_inode_info *ei2;
64
65 ei1 = EXT4_I(inode1);
66 ei2 = EXT4_I(inode2);
67
68 memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
69 memswap(&inode1->i_version, &inode2->i_version,
70 sizeof(inode1->i_version));
71 memswap(&inode1->i_blocks, &inode2->i_blocks,
72 sizeof(inode1->i_blocks));
73 memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
74 memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
75 memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
76
77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
80 memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
81 memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
82
83 isize = i_size_read(inode1);
84 i_size_write(inode1, i_size_read(inode2));
85 i_size_write(inode2, isize);
86}
87
88/**
 89 * Swap the information between the given @inode and the inode
90 * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
91 * important fields of the inodes.
92 *
93 * @sb: the super block of the filesystem
94 * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
95 *
96 */
97static long swap_inode_boot_loader(struct super_block *sb,
98 struct inode *inode)
99{
100 handle_t *handle;
101 int err;
102 struct inode *inode_bl;
103 struct ext4_inode_info *ei;
104 struct ext4_inode_info *ei_bl;
105 struct ext4_sb_info *sbi;
106
107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
108 err = -EINVAL;
109 goto swap_boot_out;
110 }
111
112 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
113 err = -EPERM;
114 goto swap_boot_out;
115 }
116
117 sbi = EXT4_SB(sb);
118 ei = EXT4_I(inode);
119
120 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
121 if (IS_ERR(inode_bl)) {
122 err = PTR_ERR(inode_bl);
123 goto swap_boot_out;
124 }
125 ei_bl = EXT4_I(inode_bl);
126
127 filemap_flush(inode->i_mapping);
128 filemap_flush(inode_bl->i_mapping);
129
 130 /* Protect orig inodes against a truncate and make sure
 131 * that only one swap_inode_boot_loader is running. */
132 ext4_inode_double_lock(inode, inode_bl);
133
134 truncate_inode_pages(&inode->i_data, 0);
135 truncate_inode_pages(&inode_bl->i_data, 0);
136
137 /* Wait for all existing dio workers */
138 ext4_inode_block_unlocked_dio(inode);
139 ext4_inode_block_unlocked_dio(inode_bl);
140 inode_dio_wait(inode);
141 inode_dio_wait(inode_bl);
142
143 handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
144 if (IS_ERR(handle)) {
145 err = -EINVAL;
146 goto swap_boot_out;
147 }
148
149 /* Protect extent tree against block allocations via delalloc */
150 ext4_double_down_write_data_sem(inode, inode_bl);
151
152 if (inode_bl->i_nlink == 0) {
153 /* this inode has never been used as a BOOT_LOADER */
154 set_nlink(inode_bl, 1);
155 i_uid_write(inode_bl, 0);
156 i_gid_write(inode_bl, 0);
157 inode_bl->i_flags = 0;
158 ei_bl->i_flags = 0;
159 inode_bl->i_version = 1;
160 i_size_write(inode_bl, 0);
161 inode_bl->i_mode = S_IFREG;
162 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
163 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
164 ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
165 ext4_ext_tree_init(handle, inode_bl);
166 } else
167 memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
168 }
169
170 swap_inode_data(inode, inode_bl);
171
172 inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
173
174 spin_lock(&sbi->s_next_gen_lock);
175 inode->i_generation = sbi->s_next_generation++;
176 inode_bl->i_generation = sbi->s_next_generation++;
177 spin_unlock(&sbi->s_next_gen_lock);
178
179 ext4_discard_preallocations(inode);
180
181 err = ext4_mark_inode_dirty(handle, inode);
182 if (err < 0) {
183 ext4_warning(inode->i_sb,
184 "couldn't mark inode #%lu dirty (err %d)",
185 inode->i_ino, err);
186 /* Revert all changes: */
187 swap_inode_data(inode, inode_bl);
188 } else {
189 err = ext4_mark_inode_dirty(handle, inode_bl);
190 if (err < 0) {
191 ext4_warning(inode_bl->i_sb,
192 "couldn't mark inode #%lu dirty (err %d)",
193 inode_bl->i_ino, err);
194 /* Revert all changes: */
195 swap_inode_data(inode, inode_bl);
196 ext4_mark_inode_dirty(handle, inode);
197 }
198 }
199
200 ext4_journal_stop(handle);
201
202 ext4_double_up_write_data_sem(inode, inode_bl);
203
204 ext4_inode_resume_unlocked_dio(inode);
205 ext4_inode_resume_unlocked_dio(inode_bl);
206
207 ext4_inode_double_unlock(inode, inode_bl);
208
209 iput(inode_bl);
210
211swap_boot_out:
212 return err;
213}
214
23long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 215long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24{ 216{
25 struct inode *inode = file_inode(filp); 217 struct inode *inode = file_inode(filp);
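From user space the new ioctl takes no argument: the caller opens the file that should become the hidden boot-loader area and issues EXT4_IOC_SWAP_BOOT on a writable descriptor (the function above additionally demands CAP_SYS_ADMIN and a regular file with a single link). A hedged sketch; the ioctl number below is copied from fs/ext4/ext4.h in this series, so verify it against your headers:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #ifndef EXT4_IOC_SWAP_BOOT
    #define EXT4_IOC_SWAP_BOOT _IO('f', 17)   /* from fs/ext4/ext4.h */
    #endif

    int main(int argc, char **argv)
    {
            int fd;

            if (argc != 2)
                    return 1;
            fd = open(argv[1], O_RDWR);       /* FMODE_WRITE is required */
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0)
                    perror("EXT4_IOC_SWAP_BOOT");
            close(fd);
            return 0;
    }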
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
83 if (!capable(CAP_SYS_RESOURCE)) 275 if (!capable(CAP_SYS_RESOURCE))
84 goto flags_out; 276 goto flags_out;
85 } 277 }
86 if (oldflags & EXT4_EXTENTS_FL) { 278 if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
87 /* We don't support clearing extent flags */
88 if (!(flags & EXT4_EXTENTS_FL)) {
89 err = -EOPNOTSUPP;
90 goto flags_out;
91 }
92 } else if (flags & EXT4_EXTENTS_FL) {
93 /* migrate the file */
94 migrate = 1; 279 migrate = 1;
95 flags &= ~EXT4_EXTENTS_FL;
96 }
97 280
98 if (flags & EXT4_EOFBLOCKS_FL) { 281 if (flags & EXT4_EOFBLOCKS_FL) {
99 /* we don't support adding EOFBLOCKS flag */ 282 /* we don't support adding EOFBLOCKS flag */
@@ -137,8 +320,13 @@ flags_err:
137 err = ext4_change_inode_journal_flag(inode, jflag); 320 err = ext4_change_inode_journal_flag(inode, jflag);
138 if (err) 321 if (err)
139 goto flags_out; 322 goto flags_out;
140 if (migrate) 323 if (migrate) {
141 err = ext4_ext_migrate(inode); 324 if (flags & EXT4_EXTENTS_FL)
325 err = ext4_ext_migrate(inode);
326 else
327 err = ext4_ind_migrate(inode);
328 }
329
142flags_out: 330flags_out:
143 mutex_unlock(&inode->i_mutex); 331 mutex_unlock(&inode->i_mutex);
144 mnt_drop_write_file(filp); 332 mnt_drop_write_file(filp);
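The two hunks above pair up: the XOR leaves a bit set exactly where the new and old flag words differ, so a migration is queued whenever EXT4_EXTENTS_FL is toggled in either direction, and the flag's final value then selects the forward or the reverse migration. The test in isolation (toy code; 0x00080000 is the value of EXT4_EXTENTS_FL):

    static int extents_flag_toggled(unsigned int flags, unsigned int oldflags)
    {
            /* a set bit means the two words disagree there, so this fires
             * on both set->clear and clear->set transitions */
            return !!((flags ^ oldflags) & 0x00080000);
    }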
@@ -357,9 +545,13 @@ group_add_out:
357 return err; 545 return err;
358 } 546 }
359 547
548 case EXT4_IOC_SWAP_BOOT:
549 if (!(filp->f_mode & FMODE_WRITE))
550 return -EBADF;
551 return swap_inode_boot_loader(sb, inode);
552
360 case EXT4_IOC_RESIZE_FS: { 553 case EXT4_IOC_RESIZE_FS: {
361 ext4_fsblk_t n_blocks_count; 554 ext4_fsblk_t n_blocks_count;
362 struct super_block *sb = inode->i_sb;
363 int err = 0, err2 = 0; 555 int err = 0, err2 = 0;
364 ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; 556 ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
365 557
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ee6614bdb639..a11ea4d6164c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
405 ext4_clear_bit(bit, addr); 405 ext4_clear_bit(bit, addr);
406} 406}
407 407
408static inline int mb_test_and_clear_bit(int bit, void *addr)
409{
410 addr = mb_correct_addr_and_bit(&bit, addr);
411 return ext4_test_and_clear_bit(bit, addr);
412}
413
408static inline int mb_find_next_zero_bit(void *addr, int max, int start) 414static inline int mb_find_next_zero_bit(void *addr, int max, int start)
409{ 415{
410 int fix = 0, ret, tmpmax; 416 int fix = 0, ret, tmpmax;
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
764 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 770 spin_unlock(&EXT4_SB(sb)->s_bal_lock);
765} 771}
766 772
773static void mb_regenerate_buddy(struct ext4_buddy *e4b)
774{
775 int count;
776 int order = 1;
777 void *buddy;
778
779 while ((buddy = mb_find_buddy(e4b, order++, &count))) {
780 ext4_set_bits(buddy, 0, count);
781 }
782 e4b->bd_info->bb_fragments = 0;
783 memset(e4b->bd_info->bb_counters, 0,
784 sizeof(*e4b->bd_info->bb_counters) *
785 (e4b->bd_sb->s_blocksize_bits + 2));
786
787 ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
788 e4b->bd_bitmap, e4b->bd_group);
789}
790
767/* The buddy information is attached to the buddy cache inode 791
768 * for convenience. The information regarding each group 792
769 * is loaded via ext4_mb_load_buddy. The information involves 793
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
860 884
861 first_block = page->index * blocks_per_page; 885 first_block = page->index * blocks_per_page;
862 for (i = 0; i < blocks_per_page; i++) { 886 for (i = 0; i < blocks_per_page; i++) {
863 int group;
864
865 group = (first_block + i) >> 1; 887 group = (first_block + i) >> 1;
866 if (group >= ngroups) 888 if (group >= ngroups)
867 break; 889 break;
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1011 struct page *page; 1033 struct page *page;
1012 int ret = 0; 1034 int ret = 0;
1013 1035
1036 might_sleep();
1014 mb_debug(1, "init group %u\n", group); 1037 mb_debug(1, "init group %u\n", group);
1015 this_grp = ext4_get_group_info(sb, group); 1038 this_grp = ext4_get_group_info(sb, group);
1016 /* 1039 /*
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1082 struct ext4_sb_info *sbi = EXT4_SB(sb); 1105 struct ext4_sb_info *sbi = EXT4_SB(sb);
1083 struct inode *inode = sbi->s_buddy_cache; 1106 struct inode *inode = sbi->s_buddy_cache;
1084 1107
1108 might_sleep();
1085 mb_debug(1, "load group %u\n", group); 1109 mb_debug(1, "load group %u\n", group);
1086 1110
1087 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1111 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
1244 } 1268 }
1245} 1269}
1246 1270
1271/* Clear bits in the given range;
1272 * returns the first bit that was already zero, or -1 if none
1273 */
1274static int mb_test_and_clear_bits(void *bm, int cur, int len)
1275{
1276 __u32 *addr;
1277 int zero_bit = -1;
1278
1279 len = cur + len;
1280 while (cur < len) {
1281 if ((cur & 31) == 0 && (len - cur) >= 32) {
1282 /* fast path: clear whole word at once */
1283 addr = bm + (cur >> 3);
1284 if (*addr != (__u32)(-1) && zero_bit == -1)
1285 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1286 *addr = 0;
1287 cur += 32;
1288 continue;
1289 }
1290 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1291 zero_bit = cur;
1292 cur++;
1293 }
1294
1295 return zero_bit;
1296}
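The same routine as self-contained userspace C may make the fast path easier to see: aligned 32-bit words are cleared with a single store, and a word that was not all-ones betrays a block that was already free. This sketch assumes a plain uint32_t bitmap and the GCC __builtin_ctz intrinsic in place of the kernel's find-bit helpers:

    #include <stdint.h>

    static int test_and_clear_bits(uint32_t *bm, int cur, int len)
    {
            int zero_bit = -1;

            for (len += cur; cur < len; ) {
                    if ((cur & 31) == 0 && len - cur >= 32) {
                            /* fast path: clear a whole aligned word */
                            uint32_t *w = bm + (cur >> 5);
                            if (*w != 0xFFFFFFFFu && zero_bit == -1)
                                    zero_bit = cur + __builtin_ctz(~*w);
                            *w = 0;
                            cur += 32;
                            continue;
                    }
                    if (!((bm[cur >> 5] >> (cur & 31)) & 1) && zero_bit == -1)
                            zero_bit = cur;
                    bm[cur >> 5] &= ~(1u << (cur & 31));
                    cur++;
            }
            return zero_bit;
    }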
1297
1247void ext4_set_bits(void *bm, int cur, int len) 1298void ext4_set_bits(void *bm, int cur, int len)
1248{ 1299{
1249 __u32 *addr; 1300 __u32 *addr;
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
1262 } 1313 }
1263} 1314}
1264 1315
1316/*
1317 * _________________________________________________________________ */
1318
1319static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1320{
1321 if (mb_test_bit(*bit + side, bitmap)) {
1322 mb_clear_bit(*bit, bitmap);
1323 (*bit) -= side;
1324 return 1;
1325 }
1326 else {
1327 (*bit) += side;
1328 mb_set_bit(*bit, bitmap);
1329 return -1;
1330 }
1331}
1332
1333static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1334{
1335 int max;
1336 int order = 1;
1337 void *buddy = mb_find_buddy(e4b, order, &max);
1338
1339 while (buddy) {
1340 void *buddy2;
1341
1342 /* Bits in range [first; last] are known to be set since
1343 * corresponding blocks were allocated. Bits in range
1344 * (first; last) will stay set because they form buddies on
1345 * upper layer. We just deal with borders if they don't
1346 * align with upper layer and then go up.
1347 * Releasing entire group is all about clearing
1348 * single bit of highest order buddy.
1349 */
1350
1351 /* Example:
1352 * ---------------------------------
1353 * | 1 | 1 | 1 | 1 |
1354 * ---------------------------------
1355 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1356 * ---------------------------------
1357 * 0 1 2 3 4 5 6 7
1358 * \_____________________/
1359 *
1360 * Neither [1] nor [6] is aligned to above layer.
1361 * Left neighbour [0] is free, so mark it busy,
1362 * decrease bb_counters and extend range to
1363 * [0; 6]
1364 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1365 * mark [6] free, increase bb_counters and shrink range to
1366 * [0; 5].
1367 * Then shift range to [0; 2], go up and do the same.
1368 */
1369
1370
1371 if (first & 1)
1372 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1373 if (!(last & 1))
1374 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1375 if (first > last)
1376 break;
1377 order++;
1378
1379 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1380 mb_clear_bits(buddy, first, last - first + 1);
1381 e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1382 break;
1383 }
1384 first >>= 1;
1385 last >>= 1;
1386 buddy = buddy2;
1387 }
1388}
1389
1265static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1390static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1266 int first, int count) 1391 int first, int count)
1267{ 1392{
1268 int block = 0; 1393 int left_is_free = 0;
1269 int max = 0; 1394 int right_is_free = 0;
1270 int order; 1395 int block;
1271 void *buddy; 1396 int last = first + count - 1;
1272 void *buddy2;
1273 struct super_block *sb = e4b->bd_sb; 1397 struct super_block *sb = e4b->bd_sb;
1274 1398
1275 BUG_ON(first + count > (sb->s_blocksize << 3)); 1399 BUG_ON(last >= (sb->s_blocksize << 3));
1276 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1400 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1277 mb_check_buddy(e4b); 1401 mb_check_buddy(e4b);
1278 mb_free_blocks_double(inode, e4b, first, count); 1402 mb_free_blocks_double(inode, e4b, first, count);
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1281 if (first < e4b->bd_info->bb_first_free) 1405 if (first < e4b->bd_info->bb_first_free)
1282 e4b->bd_info->bb_first_free = first; 1406 e4b->bd_info->bb_first_free = first;
1283 1407
1284 /* let's maintain fragments counter */ 1408 /* access memory sequentially: check left neighbour,
1409 * clear range and then check right neighbour
1410 */
1285 if (first != 0) 1411 if (first != 0)
1286 block = !mb_test_bit(first - 1, e4b->bd_bitmap); 1412 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1287 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1413 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1288 max = !mb_test_bit(first + count, e4b->bd_bitmap); 1414 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1289 if (block && max) 1415 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1290 e4b->bd_info->bb_fragments--;
1291 else if (!block && !max)
1292 e4b->bd_info->bb_fragments++;
1293 1416
1294 /* let's maintain buddy itself */ 1417 if (unlikely(block != -1)) {
1295 while (count-- > 0) { 1418 ext4_fsblk_t blocknr;
1296 block = first++;
1297 order = 0;
1298 1419
1299 if (!mb_test_bit(block, e4b->bd_bitmap)) { 1420 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1300 ext4_fsblk_t blocknr; 1421 blocknr += EXT4_C2B(EXT4_SB(sb), block);
1301 1422 ext4_grp_locked_error(sb, e4b->bd_group,
1302 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1423 inode ? inode->i_ino : 0,
1303 blocknr += EXT4_C2B(EXT4_SB(sb), block); 1424 blocknr,
1304 ext4_grp_locked_error(sb, e4b->bd_group, 1425 "freeing already freed block "
1305 inode ? inode->i_ino : 0, 1426 "(bit %u)", block);
1306 blocknr, 1427 mb_regenerate_buddy(e4b);
1307 "freeing already freed block " 1428 goto done;
1308 "(bit %u)", block); 1429 }
1309 }
1310 mb_clear_bit(block, e4b->bd_bitmap);
1311 e4b->bd_info->bb_counters[order]++;
1312
1313 /* start of the buddy */
1314 buddy = mb_find_buddy(e4b, order, &max);
1315
1316 do {
1317 block &= ~1UL;
1318 if (mb_test_bit(block, buddy) ||
1319 mb_test_bit(block + 1, buddy))
1320 break;
1321
1322 /* both the buddies are free, try to coalesce them */
1323 buddy2 = mb_find_buddy(e4b, order + 1, &max);
1324 1430
1325 if (!buddy2) 1431 /* let's maintain fragments counter */
1326 break; 1432 if (left_is_free && right_is_free)
1433 e4b->bd_info->bb_fragments--;
1434 else if (!left_is_free && !right_is_free)
1435 e4b->bd_info->bb_fragments++;
1327 1436
1328 if (order > 0) { 1437 /* buddy[0] == bd_bitmap is a special case, so handle
1329 /* for special purposes, we don't set 1438 * it right away and let mb_buddy_mark_free stay free of
1330 * free bits in bitmap */ 1439 * zero order checks.
1331 mb_set_bit(block, buddy); 1440 * Check if neighbours are to be coaleasced,
1332 mb_set_bit(block + 1, buddy); 1441 * adjust bitmap bb_counters and borders appropriately.
1333 } 1442 */
1334 e4b->bd_info->bb_counters[order]--; 1443 if (first & 1) {
1335 e4b->bd_info->bb_counters[order]--; 1444 first += !left_is_free;
1445 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1446 }
1447 if (!(last & 1)) {
1448 last -= !right_is_free;
1449 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1450 }
1336 1451
1337 block = block >> 1; 1452 if (first <= last)
1338 order++; 1453 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1339 e4b->bd_info->bb_counters[order]++;
1340 1454
1341 mb_clear_bit(block, buddy2); 1455done:
1342 buddy = buddy2;
1343 } while (1);
1344 }
1345 mb_set_largest_free_order(sb, e4b->bd_info); 1456 mb_set_largest_free_order(sb, e4b->bd_info);
1346 mb_check_buddy(e4b); 1457 mb_check_buddy(e4b);
1347} 1458}
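The fragment accounting in the rewritten function has exactly three outcomes, which the two neighbour flags condense to (illustrative helper, not kernel code):

    static int fragment_delta(int left_is_free, int right_is_free)
    {
            if (left_is_free && right_is_free)
                    return -1;      /* freed run bridges two free fragments */
            if (!left_is_free && !right_is_free)
                    return +1;      /* freed run becomes a new fragment */
            return 0;               /* freed run extends one neighbour */
    }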
@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3342 if (pa->pa_type == MB_GROUP_PA) 3453 if (pa->pa_type == MB_GROUP_PA)
3343 grp_blk--; 3454 grp_blk--;
3344 3455
3345 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3456 grp = ext4_get_group_number(sb, grp_blk);
3346 3457
3347 /* 3458 /*
3348 * possible race: 3459 * possible race:
@@ -3807,7 +3918,7 @@ repeat:
3807 3918
3808 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3919 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3809 BUG_ON(pa->pa_type != MB_INODE_PA); 3920 BUG_ON(pa->pa_type != MB_INODE_PA);
3810 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 3921 group = ext4_get_group_number(sb, pa->pa_pstart);
3811 3922
3812 err = ext4_mb_load_buddy(sb, group, &e4b); 3923 err = ext4_mb_load_buddy(sb, group, &e4b);
3813 if (err) { 3924 if (err) {
@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4069 4180
4070 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4181 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4071 4182
4072 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4183 group = ext4_get_group_number(sb, pa->pa_pstart);
4073 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4184 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4074 ext4_error(sb, "Error loading buddy information for %u", 4185 ext4_error(sb, "Error loading buddy information for %u",
4075 group); 4186 group);
@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4217 unsigned int inquota = 0; 4328 unsigned int inquota = 0;
4218 unsigned int reserv_clstrs = 0; 4329 unsigned int reserv_clstrs = 0;
4219 4330
4331 might_sleep();
4220 sb = ar->inode->i_sb; 4332 sb = ar->inode->i_sb;
4221 sbi = EXT4_SB(sb); 4333 sbi = EXT4_SB(sb);
4222 4334
@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4420 node = rb_prev(new_node); 4532 node = rb_prev(new_node);
4421 if (node) { 4533 if (node) {
4422 entry = rb_entry(node, struct ext4_free_data, efd_node); 4534 entry = rb_entry(node, struct ext4_free_data, efd_node);
4423 if (can_merge(entry, new_entry)) { 4535 if (can_merge(entry, new_entry) &&
4536 ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4424 new_entry->efd_start_cluster = entry->efd_start_cluster; 4537 new_entry->efd_start_cluster = entry->efd_start_cluster;
4425 new_entry->efd_count += entry->efd_count; 4538 new_entry->efd_count += entry->efd_count;
4426 rb_erase(node, &(db->bb_free_root)); 4539 rb_erase(node, &(db->bb_free_root));
4427 ext4_journal_callback_del(handle, &entry->efd_jce);
4428 kmem_cache_free(ext4_free_data_cachep, entry); 4540 kmem_cache_free(ext4_free_data_cachep, entry);
4429 } 4541 }
4430 } 4542 }
@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4432 node = rb_next(new_node); 4544 node = rb_next(new_node);
4433 if (node) { 4545 if (node) {
4434 entry = rb_entry(node, struct ext4_free_data, efd_node); 4546 entry = rb_entry(node, struct ext4_free_data, efd_node);
4435 if (can_merge(new_entry, entry)) { 4547 if (can_merge(new_entry, entry) &&
4548 ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4436 new_entry->efd_count += entry->efd_count; 4549 new_entry->efd_count += entry->efd_count;
4437 rb_erase(node, &(db->bb_free_root)); 4550 rb_erase(node, &(db->bb_free_root));
4438 ext4_journal_callback_del(handle, &entry->efd_jce);
4439 kmem_cache_free(ext4_free_data_cachep, entry); 4551 kmem_cache_free(ext4_free_data_cachep, entry);
4440 } 4552 }
4441 } 4553 }
@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4470 int err = 0; 4582 int err = 0;
4471 int ret; 4583 int ret;
4472 4584
4585 might_sleep();
4473 if (bh) { 4586 if (bh) {
4474 if (block) 4587 if (block)
4475 BUG_ON(block != bh->b_blocknr); 4588 BUG_ON(block != bh->b_blocknr);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 480acf4a085f..49e8bdff9163 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
426 return retval; 426 return retval;
427 } 427 }
428 return retval; 428 return retval;
429
430} 429}
431 430
432int ext4_ext_migrate(struct inode *inode) 431int ext4_ext_migrate(struct inode *inode)
@@ -606,3 +605,64 @@ out:
606 605
607 return retval; 606 return retval;
608} 607}
608
609/*
610 * Migrate a simple extent-based inode to use the i_blocks[] array
611 */
612int ext4_ind_migrate(struct inode *inode)
613{
614 struct ext4_extent_header *eh;
615 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
616 struct ext4_inode_info *ei = EXT4_I(inode);
617 struct ext4_extent *ex;
618 unsigned int i, len;
619 ext4_fsblk_t blk;
620 handle_t *handle;
621 int ret;
622
623 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
624 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
625 (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
626 return -EINVAL;
627
628 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
629 EXT4_FEATURE_RO_COMPAT_BIGALLOC))
630 return -EOPNOTSUPP;
631
632 handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
633 if (IS_ERR(handle))
634 return PTR_ERR(handle);
635
636 down_write(&EXT4_I(inode)->i_data_sem);
637 ret = ext4_ext_check_inode(inode);
638 if (ret)
639 goto errout;
640
641 eh = ext_inode_hdr(inode);
642 ex = EXT_FIRST_EXTENT(eh);
643 if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
644 eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
645 ret = -EOPNOTSUPP;
646 goto errout;
647 }
648 if (eh->eh_entries == 0)
649 blk = len = 0;
650 else {
651 len = le16_to_cpu(ex->ee_len);
652 blk = ext4_ext_pblock(ex);
653 if (len > EXT4_NDIR_BLOCKS) {
654 ret = -EOPNOTSUPP;
655 goto errout;
656 }
657 }
658
659 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
660 memset(ei->i_data, 0, sizeof(ei->i_data));
661 for (i=0; i < len; i++)
662 ei->i_data[i] = cpu_to_le32(blk++);
663 ext4_mark_inode_dirty(handle, inode);
664errout:
665 ext4_journal_stop(handle);
666 up_write(&EXT4_I(inode)->i_data_sem);
667 return ret;
668}
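Together with the ioctl change above, this lets a qualifying file be moved back to indirect blocks by clearing the extents flag through the standard flags ioctl. A hedged userspace sketch (FS_EXTENT_FL comes from <linux/fs.h>; per the checks above the call fails with EOPNOTSUPP unless the file maps to a single extent of at most EXT4_NDIR_BLOCKS blocks):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>           /* FS_IOC_[GS]ETFLAGS, FS_EXTENT_FL */

    int main(int argc, char **argv)
    {
            long fl;
            int fd;

            if (argc != 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &fl) < 0) {
                    perror("FS_IOC_GETFLAGS");
                    return 1;
            }
            fl &= ~FS_EXTENT_FL;    /* ask for migration back to i_blocks[] */
            if (ioctl(fd, FS_IOC_SETFLAGS, &fl) < 0)
                    perror("FS_IOC_SETFLAGS");
            close(fd);
            return 0;
    }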
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f9b551561d2c..214461e42a05 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -7,7 +7,7 @@
7#include "ext4.h" 7#include "ext4.h"
8 8
9/* Checksumming functions */ 9/* Checksumming functions */
10static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) 10static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
11{ 11{
12 struct ext4_sb_info *sbi = EXT4_SB(sb); 12 struct ext4_sb_info *sbi = EXT4_SB(sb);
13 int offset = offsetof(struct mmp_struct, mmp_checksum); 13 int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
54 lock_buffer(bh); 54 lock_buffer(bh);
55 bh->b_end_io = end_buffer_write_sync; 55 bh->b_end_io = end_buffer_write_sync;
56 get_bh(bh); 56 get_bh(bh);
57 submit_bh(WRITE_SYNC, bh); 57 submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
58 wait_on_buffer(bh); 58 wait_on_buffer(bh);
59 sb_end_write(sb); 59 sb_end_write(sb);
60 if (unlikely(!buffer_uptodate(bh))) 60 if (unlikely(!buffer_uptodate(bh)))
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
86 get_bh(*bh); 86 get_bh(*bh);
87 lock_buffer(*bh); 87 lock_buffer(*bh);
88 (*bh)->b_end_io = end_buffer_read_sync; 88 (*bh)->b_end_io = end_buffer_read_sync;
89 submit_bh(READ_SYNC, *bh); 89 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
90 wait_on_buffer(*bh); 90 wait_on_buffer(*bh);
91 if (!buffer_uptodate(*bh)) { 91 if (!buffer_uptodate(*bh)) {
92 brelse(*bh); 92 brelse(*bh);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 33e1c086858b..3dcbf364022f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
144} 144}
145 145
146/** 146/**
147 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 147 * ext4_double_down_write_data_sem - Acquire two inodes' write lock
148 * of i_data_sem
148 * 149 *
149 * Acquire write lock of i_data_sem of the two inodes 150 * Acquire write lock of i_data_sem of the two inodes
150 */ 151 */
151static void 152void
152double_down_write_data_sem(struct inode *first, struct inode *second) 153ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
153{ 154{
154 if (first < second) { 155 if (first < second) {
155 down_write(&EXT4_I(first)->i_data_sem); 156 down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
162} 163}
163 164
164/** 165/**
165 * double_up_write_data_sem - Release two inodes' write lock of i_data_sem 166 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
166 * 167 *
167 * @orig_inode: original inode structure to be released its lock first 168 * @orig_inode: original inode structure to be released its lock first
168 * @donor_inode: donor inode structure to be released its lock second 169 * @donor_inode: donor inode structure to be released its lock second
169 * Release write lock of i_data_sem of two inodes (orig and donor). 170 * Release write lock of i_data_sem of two inodes (orig and donor).
170 */ 171 */
171static void 172void
172double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 173ext4_double_up_write_data_sem(struct inode *orig_inode,
174 struct inode *donor_inode)
173{ 175{
174 up_write(&EXT4_I(orig_inode)->i_data_sem); 176 up_write(&EXT4_I(orig_inode)->i_data_sem);
175 up_write(&EXT4_I(donor_inode)->i_data_sem); 177 up_write(&EXT4_I(donor_inode)->i_data_sem);
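The renamed helpers implement the usual address-ordered locking idiom: because every caller acquires the pair in ascending pointer order, two tasks locking the same two inodes in opposite roles cannot deadlock. The generic shape of the idiom, with pthread mutexes standing in for i_data_sem (illustrative):

    #include <pthread.h>

    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a > b) {                    /* normalise to address order */
                    pthread_mutex_t *t = a;
                    a = b;
                    b = t;
            }
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);          /* second lock, higher address */
    }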
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
407 mext_insert_inside_block(o_start, o_end, start_ext, new_ext, 409 mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
408 end_ext, eh, range_to_move); 410 end_ext, eh, range_to_move);
409 411
410 if (depth) { 412 return ext4_ext_dirty(handle, orig_inode, orig_path);
411 ret = ext4_handle_dirty_metadata(handle, orig_inode,
412 orig_path->p_bh);
413 if (ret)
414 return ret;
415 } else {
416 ret = ext4_mark_inode_dirty(handle, orig_inode);
417 if (ret < 0)
418 return ret;
419 }
420
421 return 0;
422} 413}
423 414
424/** 415/**
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
737 donor_off += dext_alen; 728 donor_off += dext_alen;
738 orig_off += dext_alen; 729 orig_off += dext_alen;
739 730
731 BUG_ON(replaced_count > count);
740 /* Already moved the expected blocks */ 732 /* Already moved the expected blocks */
741 if (replaced_count >= count) 733 if (replaced_count >= count)
742 break; 734 break;
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
814 page_cache_release(page[0]); 806 page_cache_release(page[0]);
815 return -ENOMEM; 807 return -ENOMEM;
816 } 808 }
817 809 /*
810 * grab_cache_page_write_begin() may not wait on page's writeback if
811 * BDI not demand that. But it is reasonable to be very conservative
812 * here and explicitly wait on page's writeback
813 */
814 wait_on_page_writeback(page[0]);
815 wait_on_page_writeback(page[1]);
818 if (inode1 > inode2) { 816 if (inode1 > inode2) {
819 struct page *tmp; 817 struct page *tmp;
820 tmp = page[0]; 818 tmp = page[0];
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
856 if (buffer_uptodate(bh)) 854 if (buffer_uptodate(bh))
857 continue; 855 continue;
858 if (!buffer_mapped(bh)) { 856 if (!buffer_mapped(bh)) {
859 int err = 0;
860 err = ext4_get_block(inode, block, bh, 0); 857 err = ext4_get_block(inode, block, bh, 0);
861 if (err) { 858 if (err) {
862 SetPageError(page); 859 SetPageError(page);
@@ -976,7 +973,7 @@ again:
976 * necessary, just swap data blocks between orig and donor. 973 * necessary, just swap data blocks between orig and donor.
977 */ 974 */
978 if (uninit) { 975 if (uninit) {
979 double_down_write_data_sem(orig_inode, donor_inode); 976 ext4_double_down_write_data_sem(orig_inode, donor_inode);
980 /* If any of the extents in the range became initialized we have to 977 /* If any of the extents in the range became initialized we have to
981 * fall back to data copying */ 978 * fall back to data copying */
982 uninit = mext_check_coverage(orig_inode, orig_blk_offset, 979 uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +987,7 @@ again:
990 goto drop_data_sem; 987 goto drop_data_sem;
991 988
992 if (!uninit) { 989 if (!uninit) {
993 double_up_write_data_sem(orig_inode, donor_inode); 990 ext4_double_up_write_data_sem(orig_inode, donor_inode);
994 goto data_copy; 991 goto data_copy;
995 } 992 }
996 if ((page_has_private(pagep[0]) && 993 if ((page_has_private(pagep[0]) &&
@@ -1004,7 +1001,7 @@ again:
1004 donor_inode, orig_blk_offset, 1001 donor_inode, orig_blk_offset,
1005 block_len_in_page, err); 1002 block_len_in_page, err);
1006 drop_data_sem: 1003 drop_data_sem:
1007 double_up_write_data_sem(orig_inode, donor_inode); 1004 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1008 goto unlock_pages; 1005 goto unlock_pages;
1009 } 1006 }
1010data_copy: 1007data_copy:
@@ -1033,7 +1030,7 @@ data_copy:
1033 } 1030 }
1034 /* Perform all necessary steps similar write_begin()/write_end() 1031 /* Perform all necessary steps similar write_begin()/write_end()
1035 * but keeping in mind that i_size will not change */ 1032 * but keeping in mind that i_size will not change */
1036 *err = __block_write_begin(pagep[0], from, from + replaced_size, 1033 *err = __block_write_begin(pagep[0], from, replaced_size,
1037 ext4_get_block); 1034 ext4_get_block);
1038 if (!*err) 1035 if (!*err)
1039 *err = block_commit_write(pagep[0], from, from + replaced_size); 1036 *err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1065,11 +1062,11 @@ repair_branches:
1065 * Extents are swapped already, but we are not able to copy data. 1062 * Extents are swapped already, but we are not able to copy data.
1066 * Try to swap extents to its original places 1063 * Try to swap extents to its original places
1067 */ 1064 */
1068 double_down_write_data_sem(orig_inode, donor_inode); 1065 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1069 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, 1066 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
1070 orig_blk_offset, 1067 orig_blk_offset,
1071 block_len_in_page, &err2); 1068 block_len_in_page, &err2);
1072 double_up_write_data_sem(orig_inode, donor_inode); 1069 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1073 if (replaced_count != block_len_in_page) { 1070 if (replaced_count != block_len_in_page) {
1074 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), 1071 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
1075 "Unable to copy data block," 1072 "Unable to copy data block,"
@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode,
1209} 1206}
1210 1207
1211/** 1208/**
1212 * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 1209 * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
1213 * 1210 *
1214 * @inode1: the inode structure 1211 * @inode1: the inode structure
1215 * @inode2: the inode structure 1212 * @inode2: the inode structure
1216 * 1213 *
1217 * Lock two inodes' i_mutex 1214 * Lock two inodes' i_mutex
1218 */ 1215 */
1219static void 1216void
1220mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1217ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
1221{ 1218{
1222 BUG_ON(inode1 == inode2); 1219 BUG_ON(inode1 == inode2);
1223 if (inode1 < inode2) { 1220 if (inode1 < inode2) {
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1230} 1227}
1231 1228
1232/** 1229/**
1233 * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 1230 * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
1234 * 1231 *
1235 * @inode1: the inode that is released first 1232 * @inode1: the inode that is released first
1236 * @inode2: the inode that is released second 1233 * @inode2: the inode that is released second
1237 * 1234 *
1238 */ 1235 */
1239 1236
1240static void 1237void
1241mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1238ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1242{ 1239{
1243 mutex_unlock(&inode1->i_mutex); 1240 mutex_unlock(&inode1->i_mutex);
1244 mutex_unlock(&inode2->i_mutex); 1241 mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1333 return -EINVAL; 1330 return -EINVAL;
1334 } 1331 }
1335 /* Protect orig and donor inodes against a truncate */ 1332 /* Protect orig and donor inodes against a truncate */
1336 mext_inode_double_lock(orig_inode, donor_inode); 1333 ext4_inode_double_lock(orig_inode, donor_inode);
1337 1334
1338 /* Wait for all existing dio workers */ 1335 /* Wait for all existing dio workers */
1339 ext4_inode_block_unlocked_dio(orig_inode); 1336 ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1342 inode_dio_wait(donor_inode); 1339 inode_dio_wait(donor_inode);
1343 1340
1344 /* Protect extent tree against block allocations via delalloc */ 1341 /* Protect extent tree against block allocations via delalloc */
1345 double_down_write_data_sem(orig_inode, donor_inode); 1342 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1346 /* Check whether the filesystem environment allows move_extent */ 1343 /* Check whether the filesystem environment allows move_extent */
1347 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1344 ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
1348 donor_start, &len); 1345 donor_start, &len);
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1466 * b. racing with ->readpage, ->write_begin, and ext4_get_block 1463 * b. racing with ->readpage, ->write_begin, and ext4_get_block
1467 * in move_extent_per_page 1464 * in move_extent_per_page
1468 */ 1465 */
1469 double_up_write_data_sem(orig_inode, donor_inode); 1466 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1470 1467
1471 while (orig_page_offset <= seq_end_page) { 1468 while (orig_page_offset <= seq_end_page) {
1472 1469
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1500 block_len_in_page = rest_blocks; 1497 block_len_in_page = rest_blocks;
1501 } 1498 }
1502 1499
1503 double_down_write_data_sem(orig_inode, donor_inode); 1500 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1504 if (ret < 0) 1501 if (ret < 0)
1505 break; 1502 break;
1506 1503
@@ -1538,10 +1535,10 @@ out:
1538 ext4_ext_drop_refs(holecheck_path); 1535 ext4_ext_drop_refs(holecheck_path);
1539 kfree(holecheck_path); 1536 kfree(holecheck_path);
1540 } 1537 }
1541 double_up_write_data_sem(orig_inode, donor_inode); 1538 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1542 ext4_inode_resume_unlocked_dio(orig_inode); 1539 ext4_inode_resume_unlocked_dio(orig_inode);
1543 ext4_inode_resume_unlocked_dio(donor_inode); 1540 ext4_inode_resume_unlocked_dio(donor_inode);
1544 mext_inode_double_unlock(orig_inode, donor_inode); 1541 ext4_inode_double_unlock(orig_inode, donor_inode);
1545 1542
1546 return ret; 1543 return ret;
1547} 1544}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3825d6aa8336..6653fc35ecb7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
416{ 416{
417 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 417 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
418 struct ext4_inode_info *ei = EXT4_I(inode); 418 struct ext4_inode_info *ei = EXT4_I(inode);
419 __u32 csum, old_csum; 419 __u32 csum;
420 __le32 save_csum;
420 int size; 421 int size;
421 422
422 size = count_offset + (count * sizeof(struct dx_entry)); 423 size = count_offset + (count * sizeof(struct dx_entry));
423 old_csum = t->dt_checksum; 424 save_csum = t->dt_checksum;
424 t->dt_checksum = 0; 425 t->dt_checksum = 0;
425 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); 426 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
426 csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); 427 csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
427 t->dt_checksum = old_csum; 428 t->dt_checksum = save_csum;
428 429
429 return cpu_to_le32(csum); 430 return cpu_to_le32(csum);
430} 431}
@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
971 hinfo.hash_version += 972 hinfo.hash_version +=
972 EXT4_SB(dir->i_sb)->s_hash_unsigned; 973 EXT4_SB(dir->i_sb)->s_hash_unsigned;
973 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 974 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
975 if (ext4_has_inline_data(dir)) {
976 int has_inline_data = 1;
977 count = htree_inlinedir_to_tree(dir_file, dir, 0,
978 &hinfo, start_hash,
979 start_minor_hash,
980 &has_inline_data);
981 if (has_inline_data) {
982 *next_hash = ~0;
983 return count;
984 }
985 }
974 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 986 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
975 start_hash, start_minor_hash); 987 start_hash, start_minor_hash);
976 *next_hash = ~0; 988 *next_hash = ~0;
@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
1455 return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); 1467 return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
1456} 1468}
1457 1469
1458#define S_SHIFT 12
1459static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1460 [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
1461 [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1462 [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
1463 [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
1464 [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1465 [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1466 [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
1467};
1468
1469static inline void ext4_set_de_type(struct super_block *sb,
1470 struct ext4_dir_entry_2 *de,
1471 umode_t mode) {
1472 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1473 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1474}
1475
1476/* 1470/*
1477 * Move count entries from end of map between two memory locations. 1471 * Move count entries from end of map between two memory locations.
1478 * Returns pointer to last entry moved. 1472 * Returns pointer to last entry moved.
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2251 dquot_initialize(dir); 2245 dquot_initialize(dir);
2252 2246
2253 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2247 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2254 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2248 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
2255 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2256retry: 2249retry:
2257 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, 2250 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
2258 NULL, EXT4_HT_DIR, credits); 2251 NULL, EXT4_HT_DIR, credits);
@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
2286 dquot_initialize(dir); 2279 dquot_initialize(dir);
2287 2280
2288 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2281 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2289 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2282 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
2290 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2291retry: 2283retry:
2292 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, 2284 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
2293 NULL, EXT4_HT_DIR, credits); 2285 NULL, EXT4_HT_DIR, credits);
@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2396 dquot_initialize(dir); 2388 dquot_initialize(dir);
2397 2389
2398 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2390 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2399 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2391 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
2400 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2401retry: 2392retry:
2402 inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, 2393 inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
2403 &dentry->d_name, 2394 &dentry->d_name,
@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
2826 * quota blocks, sb is already counted in previous macros). 2817 * quota blocks, sb is already counted in previous macros).
2827 */ 2818 */
2828 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2819 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2829 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2820 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
2830 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2831 } 2821 }
2832retry: 2822retry:
2833 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, 2823 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
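
The dx_csum change above follows the usual save/zero/checksum/restore pattern for on-disk structures that embed their own checksum, and switching the saved field to __le32 keeps it in its on-disk byte order rather than silently treating it as host-endian. A self-contained sketch of the pattern, with a toy stand-in for ext4_chksum() (only the calling sequence is the point):

#include <stddef.h>
#include <stdint.h>

struct tail {
	uint32_t dt_checksum;	/* stored little-endian on disk */
};

/* djb2-style stand-in for ext4_chksum(); purely illustrative */
static uint32_t toy_csum(uint32_t seed, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--)
		seed = seed * 33 + *p++;
	return seed;
}

static uint32_t tail_csum(uint32_t seed, struct tail *t)
{
	uint32_t save_csum = t->dt_checksum;	/* keep the raw on-disk value */
	uint32_t csum;

	t->dt_checksum = 0;		/* field must be zero while hashing */
	csum = toy_csum(seed, t, sizeof(*t));
	t->dt_checksum = save_csum;	/* restore without byte-swapping */
	return csum;
}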
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 047a6de04a0a..5929cd0baa20 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -29,25 +29,19 @@
29#include "xattr.h" 29#include "xattr.h"
30#include "acl.h" 30#include "acl.h"
31 31
32static struct kmem_cache *io_page_cachep, *io_end_cachep; 32static struct kmem_cache *io_end_cachep;
33 33
34int __init ext4_init_pageio(void) 34int __init ext4_init_pageio(void)
35{ 35{
36 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
37 if (io_page_cachep == NULL)
38 return -ENOMEM;
39 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 36 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
40 if (io_end_cachep == NULL) { 37 if (io_end_cachep == NULL)
41 kmem_cache_destroy(io_page_cachep);
42 return -ENOMEM; 38 return -ENOMEM;
43 }
44 return 0; 39 return 0;
45} 40}
46 41
47void ext4_exit_pageio(void) 42void ext4_exit_pageio(void)
48{ 43{
49 kmem_cache_destroy(io_end_cachep); 44 kmem_cache_destroy(io_end_cachep);
50 kmem_cache_destroy(io_page_cachep);
51} 45}
52 46
53/* 47/*
@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode)
67 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 61 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
68} 62}
69 63
70static void put_io_page(struct ext4_io_page *io_page) 64static void ext4_release_io_end(ext4_io_end_t *io_end)
71{ 65{
72 if (atomic_dec_and_test(&io_page->p_count)) { 66 BUG_ON(!list_empty(&io_end->list));
73 end_page_writeback(io_page->p_page); 67 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
74 put_page(io_page->p_page); 68
75 kmem_cache_free(io_page_cachep, io_page); 69 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
76 } 70 wake_up_all(ext4_ioend_wq(io_end->inode));
71 if (io_end->flag & EXT4_IO_END_DIRECT)
72 inode_dio_done(io_end->inode);
73 if (io_end->iocb)
74 aio_complete(io_end->iocb, io_end->result, 0);
75 kmem_cache_free(io_end_cachep, io_end);
77} 76}
78 77
79void ext4_free_io_end(ext4_io_end_t *io) 78static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
80{ 79{
81 int i; 80 struct inode *inode = io_end->inode;
82
83 BUG_ON(!io);
84 BUG_ON(!list_empty(&io->list));
85 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
86 81
87 for (i = 0; i < io->num_io_pages; i++) 82 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
88 put_io_page(io->pages[i]); 83 /* Wake up anyone waiting on unwritten extent conversion */
89 io->num_io_pages = 0; 84 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
90 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 85 wake_up_all(ext4_ioend_wq(inode));
91 wake_up_all(ext4_ioend_wq(io->inode));
92 kmem_cache_free(io_end_cachep, io);
93} 86}
94 87
95/* check a range of space and convert unwritten extents to written. */ 88/* check a range of space and convert unwritten extents to written. */
@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io)
112 "(inode %lu, offset %llu, size %zd, error %d)", 105 "(inode %lu, offset %llu, size %zd, error %d)",
113 inode->i_ino, offset, size, ret); 106 inode->i_ino, offset, size, ret);
114 } 107 }
115 /* Wake up anyone waiting on unwritten extent conversion */ 108 ext4_clear_io_unwritten_flag(io);
116 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 109 ext4_release_io_end(io);
117 wake_up_all(ext4_ioend_wq(inode));
118 if (io->flag & EXT4_IO_END_DIRECT)
119 inode_dio_done(inode);
120 if (io->iocb)
121 aio_complete(io->iocb, io->result, 0);
122 return ret; 110 return ret;
123} 111}
124 112
@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode)
149} 137}
150 138
151/* Add the io_end to per-inode completed end_io list. */ 139/* Add the io_end to per-inode completed end_io list. */
152void ext4_add_complete_io(ext4_io_end_t *io_end) 140static void ext4_add_complete_io(ext4_io_end_t *io_end)
153{ 141{
154 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 142 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
155 struct workqueue_struct *wq; 143 struct workqueue_struct *wq;
@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
186 err = ext4_end_io(io); 174 err = ext4_end_io(io);
187 if (unlikely(!ret && err)) 175 if (unlikely(!ret && err))
188 ret = err; 176 ret = err;
189 io->flag &= ~EXT4_IO_END_UNWRITTEN;
190 ext4_free_io_end(io);
191 } 177 }
192 return ret; 178 return ret;
193} 179}
@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
219 atomic_inc(&EXT4_I(inode)->i_ioend_count); 205 atomic_inc(&EXT4_I(inode)->i_ioend_count);
220 io->inode = inode; 206 io->inode = inode;
221 INIT_LIST_HEAD(&io->list); 207 INIT_LIST_HEAD(&io->list);
208 atomic_set(&io->count, 1);
222 } 209 }
223 return io; 210 return io;
224} 211}
225 212
213void ext4_put_io_end_defer(ext4_io_end_t *io_end)
214{
215 if (atomic_dec_and_test(&io_end->count)) {
216 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
217 ext4_release_io_end(io_end);
218 return;
219 }
220 ext4_add_complete_io(io_end);
221 }
222}
223
224int ext4_put_io_end(ext4_io_end_t *io_end)
225{
226 int err = 0;
227
228 if (atomic_dec_and_test(&io_end->count)) {
229 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
230 err = ext4_convert_unwritten_extents(io_end->inode,
231 io_end->offset, io_end->size);
232 ext4_clear_io_unwritten_flag(io_end);
233 }
234 ext4_release_io_end(io_end);
235 }
236 return err;
237}
238
239ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
240{
241 atomic_inc(&io_end->count);
242 return io_end;
243}
244
226/* 245/*
227 * Print a buffer I/O error compatible with fs/buffer.c. This 246
228 * provides compatibility with dmesg scrapers that look for a specific 247 * provides compatibility with dmesg scrapers that look for a specific
@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error)
243 ext4_io_end_t *io_end = bio->bi_private; 262 ext4_io_end_t *io_end = bio->bi_private;
244 struct inode *inode; 263 struct inode *inode;
245 int i; 264 int i;
265 int blocksize;
246 sector_t bi_sector = bio->bi_sector; 266 sector_t bi_sector = bio->bi_sector;
247 267
248 BUG_ON(!io_end); 268 BUG_ON(!io_end);
269 inode = io_end->inode;
270 blocksize = 1 << inode->i_blkbits;
249 bio->bi_private = NULL; 271 bio->bi_private = NULL;
250 bio->bi_end_io = NULL; 272 bio->bi_end_io = NULL;
251 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 273 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
252 error = 0; 274 error = 0;
253 bio_put(bio); 275 for (i = 0; i < bio->bi_vcnt; i++) {
254 276 struct bio_vec *bvec = &bio->bi_io_vec[i];
255 for (i = 0; i < io_end->num_io_pages; i++) { 277 struct page *page = bvec->bv_page;
256 struct page *page = io_end->pages[i]->p_page;
257 struct buffer_head *bh, *head; 278 struct buffer_head *bh, *head;
258 loff_t offset; 279 unsigned bio_start = bvec->bv_offset;
259 loff_t io_end_offset; 280 unsigned bio_end = bio_start + bvec->bv_len;
281 unsigned under_io = 0;
282 unsigned long flags;
283
284 if (!page)
285 continue;
260 286
261 if (error) { 287 if (error) {
262 SetPageError(page); 288 SetPageError(page);
263 set_bit(AS_EIO, &page->mapping->flags); 289 set_bit(AS_EIO, &page->mapping->flags);
264 head = page_buffers(page);
265 BUG_ON(!head);
266
267 io_end_offset = io_end->offset + io_end->size;
268
269 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
270 bh = head;
271 do {
272 if ((offset >= io_end->offset) &&
273 (offset+bh->b_size <= io_end_offset))
274 buffer_io_error(bh);
275
276 offset += bh->b_size;
277 bh = bh->b_this_page;
278 } while (bh != head);
279 } 290 }
280 291 bh = head = page_buffers(page);
281 put_io_page(io_end->pages[i]); 292 /*
293 * We check all buffers in the page under BH_Uptodate_Lock
294 * to avoid races with other end_io calls clearing async_write flags
295 */
296 local_irq_save(flags);
297 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
298 do {
299 if (bh_offset(bh) < bio_start ||
300 bh_offset(bh) + blocksize > bio_end) {
301 if (buffer_async_write(bh))
302 under_io++;
303 continue;
304 }
305 clear_buffer_async_write(bh);
306 if (error)
307 buffer_io_error(bh);
308 } while ((bh = bh->b_this_page) != head);
309 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
310 local_irq_restore(flags);
311 if (!under_io)
312 end_page_writeback(page);
282 } 313 }
283 io_end->num_io_pages = 0; 314 bio_put(bio);
284 inode = io_end->inode;
285 315
286 if (error) { 316 if (error) {
287 io_end->flag |= EXT4_IO_END_ERROR; 317 io_end->flag |= EXT4_IO_END_ERROR;
@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
294 bi_sector >> (inode->i_blkbits - 9)); 324 bi_sector >> (inode->i_blkbits - 9));
295 } 325 }
296 326
297 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 327 ext4_put_io_end_defer(io_end);
298 ext4_free_io_end(io_end);
299 return;
300 }
301
302 ext4_add_complete_io(io_end);
303} 328}
304 329
305void ext4_io_submit(struct ext4_io_submit *io) 330void ext4_io_submit(struct ext4_io_submit *io)
@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io)
313 bio_put(io->io_bio); 338 bio_put(io->io_bio);
314 } 339 }
315 io->io_bio = NULL; 340 io->io_bio = NULL;
316 io->io_op = 0; 341}
342
343void ext4_io_submit_init(struct ext4_io_submit *io,
344 struct writeback_control *wbc)
345{
346 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
347 io->io_bio = NULL;
317 io->io_end = NULL; 348 io->io_end = NULL;
318} 349}
319 350
320static int io_submit_init(struct ext4_io_submit *io, 351static int io_submit_init_bio(struct ext4_io_submit *io,
321 struct inode *inode, 352 struct buffer_head *bh)
322 struct writeback_control *wbc,
323 struct buffer_head *bh)
324{ 353{
325 ext4_io_end_t *io_end;
326 struct page *page = bh->b_page;
327 int nvecs = bio_get_nr_vecs(bh->b_bdev); 354 int nvecs = bio_get_nr_vecs(bh->b_bdev);
328 struct bio *bio; 355 struct bio *bio;
329 356
330 io_end = ext4_init_io_end(inode, GFP_NOFS);
331 if (!io_end)
332 return -ENOMEM;
333 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 357 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
334 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 358 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
335 bio->bi_bdev = bh->b_bdev; 359 bio->bi_bdev = bh->b_bdev;
336 bio->bi_private = io->io_end = io_end;
337 bio->bi_end_io = ext4_end_bio; 360 bio->bi_end_io = ext4_end_bio;
338 361 bio->bi_private = ext4_get_io_end(io->io_end);
339 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 362 if (!io->io_end->size)
340 363 io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
364 + bh_offset(bh);
341 io->io_bio = bio; 365 io->io_bio = bio;
342 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
343 io->io_next_block = bh->b_blocknr; 366 io->io_next_block = bh->b_blocknr;
344 return 0; 367 return 0;
345} 368}
346 369
347static int io_submit_add_bh(struct ext4_io_submit *io, 370static int io_submit_add_bh(struct ext4_io_submit *io,
348 struct ext4_io_page *io_page,
349 struct inode *inode, 371 struct inode *inode,
350 struct writeback_control *wbc,
351 struct buffer_head *bh) 372 struct buffer_head *bh)
352{ 373{
353 ext4_io_end_t *io_end; 374 ext4_io_end_t *io_end;
354 int ret; 375 int ret;
355 376
356 if (buffer_new(bh)) {
357 clear_buffer_new(bh);
358 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
359 }
360
361 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 377 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
362submit_and_retry: 378submit_and_retry:
363 ext4_io_submit(io); 379 ext4_io_submit(io);
364 } 380 }
365 if (io->io_bio == NULL) { 381 if (io->io_bio == NULL) {
366 ret = io_submit_init(io, inode, wbc, bh); 382 ret = io_submit_init_bio(io, bh);
367 if (ret) 383 if (ret)
368 return ret; 384 return ret;
369 } 385 }
370 io_end = io->io_end;
371 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
372 (io_end->pages[io_end->num_io_pages-1] != io_page))
373 goto submit_and_retry;
374 if (buffer_uninit(bh))
375 ext4_set_io_unwritten_flag(inode, io_end);
376 io->io_end->size += bh->b_size;
377 io->io_next_block++;
378 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 386 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
379 if (ret != bh->b_size) 387 if (ret != bh->b_size)
380 goto submit_and_retry; 388 goto submit_and_retry;
381 if ((io_end->num_io_pages == 0) || 389 io_end = io->io_end;
382 (io_end->pages[io_end->num_io_pages-1] != io_page)) { 390 if (test_clear_buffer_uninit(bh))
383 io_end->pages[io_end->num_io_pages++] = io_page; 391 ext4_set_io_unwritten_flag(inode, io_end);
384 atomic_inc(&io_page->p_count); 392 io_end->size += bh->b_size;
385 } 393 io->io_next_block++;
386 return 0; 394 return 0;
387} 395}
388 396
@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
392 struct writeback_control *wbc) 400 struct writeback_control *wbc)
393{ 401{
394 struct inode *inode = page->mapping->host; 402 struct inode *inode = page->mapping->host;
395 unsigned block_start, block_end, blocksize; 403 unsigned block_start, blocksize;
396 struct ext4_io_page *io_page;
397 struct buffer_head *bh, *head; 404 struct buffer_head *bh, *head;
398 int ret = 0; 405 int ret = 0;
406 int nr_submitted = 0;
399 407
400 blocksize = 1 << inode->i_blkbits; 408 blocksize = 1 << inode->i_blkbits;
401 409
402 BUG_ON(!PageLocked(page)); 410 BUG_ON(!PageLocked(page));
403 BUG_ON(PageWriteback(page)); 411 BUG_ON(PageWriteback(page));
404 412
405 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
406 if (!io_page) {
407 redirty_page_for_writepage(wbc, page);
408 unlock_page(page);
409 return -ENOMEM;
410 }
411 io_page->p_page = page;
412 atomic_set(&io_page->p_count, 1);
413 get_page(page);
414 set_page_writeback(page); 413 set_page_writeback(page);
415 ClearPageError(page); 414 ClearPageError(page);
416 415
417 for (bh = head = page_buffers(page), block_start = 0; 416 /*
418 bh != head || !block_start; 417 * In the first loop we prepare and mark buffers to submit. We have to
419 block_start = block_end, bh = bh->b_this_page) { 418 * mark all buffers in the page before submitting so that
420 419 * end_page_writeback() cannot be called from ext4_end_bio() when IO
421 block_end = block_start + blocksize; 420 * on the first buffer finishes and we are still working on submitting
421 * the second buffer.
422 */
423 bh = head = page_buffers(page);
424 do {
425 block_start = bh_offset(bh);
422 if (block_start >= len) { 426 if (block_start >= len) {
423 /* 427 /*
424 * Comments copied from block_write_full_page_endio: 428 * Comments copied from block_write_full_page_endio:
@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
431 * mapped, and writes to that region are not written 435 * mapped, and writes to that region are not written
432 * out to the file." 436 * out to the file."
433 */ 437 */
434 zero_user_segment(page, block_start, block_end); 438 zero_user_segment(page, block_start,
439 block_start + blocksize);
435 clear_buffer_dirty(bh); 440 clear_buffer_dirty(bh);
436 set_buffer_uptodate(bh); 441 set_buffer_uptodate(bh);
437 continue; 442 continue;
@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
445 ext4_io_submit(io); 450 ext4_io_submit(io);
446 continue; 451 continue;
447 } 452 }
448 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 453 if (buffer_new(bh)) {
454 clear_buffer_new(bh);
455 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
456 }
457 set_buffer_async_write(bh);
458 } while ((bh = bh->b_this_page) != head);
459
460 /* Now submit buffers to write */
461 bh = head = page_buffers(page);
462 do {
463 if (!buffer_async_write(bh))
464 continue;
465 ret = io_submit_add_bh(io, inode, bh);
449 if (ret) { 466 if (ret) {
450 /* 467 /*
451 * We only get here on ENOMEM. Not much else 468 * We only get here on ENOMEM. Not much else
@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
455 redirty_page_for_writepage(wbc, page); 472 redirty_page_for_writepage(wbc, page);
456 break; 473 break;
457 } 474 }
475 nr_submitted++;
458 clear_buffer_dirty(bh); 476 clear_buffer_dirty(bh);
477 } while ((bh = bh->b_this_page) != head);
478
479 /* Error stopped previous loop? Clean up buffers... */
480 if (ret) {
481 do {
482 clear_buffer_async_write(bh);
483 bh = bh->b_this_page;
484 } while (bh != head);
459 } 485 }
460 unlock_page(page); 486 unlock_page(page);
461 /* 487 /* Nothing submitted - we have to end page writeback */
462 * If the page was truncated before we could do the writeback, 488 if (!nr_submitted)
463 * or we had a memory allocation error while trying to write 489 end_page_writeback(page);
464 * the first buffer head, we won't have submitted any pages for
465 * I/O. In that case we need to make sure we've cleared the
466 * PageWriteback bit from the page to prevent the system from
467 * wedging later on.
468 */
469 put_io_page(io_page);
470 return ret; 490 return ret;
471} 491}
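
The rewritten submission path above drops the per-page ext4_io_page tracking in favor of a reference count on the io_end: the submitter holds one reference, each bio takes another via ext4_get_io_end(), and the structure is released (or queued for unwritten-extent conversion) only on the final put. A userspace sketch of that lifecycle with simplified types (the names mirror the patch but this is not kernel code):

#include <stdatomic.h>
#include <stdlib.h>

struct io_end {
	atomic_int count;
	int unwritten;		/* stands in for EXT4_IO_END_UNWRITTEN */
};

static struct io_end *io_end_alloc(void)
{
	struct io_end *io = calloc(1, sizeof(*io));

	if (io)
		atomic_init(&io->count, 1);	/* submitter's reference */
	return io;
}

static struct io_end *io_end_get(struct io_end *io)
{
	atomic_fetch_add(&io->count, 1);	/* one reference per bio */
	return io;
}

static void io_end_put(struct io_end *io)
{
	if (atomic_fetch_sub(&io->count, 1) != 1)
		return;			/* other bios still in flight */
	if (io->unwritten) {
		/* a real implementation would queue extent conversion
		 * here, as ext4_put_io_end_defer() does */
	} else {
		free(io);
	}
}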
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c169477a62c9..b27c96d01965 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -272,7 +272,7 @@ next_group:
272 if (start_blk >= last_blk) 272 if (start_blk >= last_blk)
273 goto next_group; 273 goto next_group;
274 group_data[bb_index].block_bitmap = start_blk++; 274 group_data[bb_index].block_bitmap = start_blk++;
275 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); 275 group = ext4_get_group_number(sb, start_blk - 1);
276 group -= group_data[0].group; 276 group -= group_data[0].group;
277 group_data[group].free_blocks_count--; 277 group_data[group].free_blocks_count--;
278 if (flexbg_size > 1) 278 if (flexbg_size > 1)
@@ -284,7 +284,7 @@ next_group:
284 if (start_blk >= last_blk) 284 if (start_blk >= last_blk)
285 goto next_group; 285 goto next_group;
286 group_data[ib_index].inode_bitmap = start_blk++; 286 group_data[ib_index].inode_bitmap = start_blk++;
287 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); 287 group = ext4_get_group_number(sb, start_blk - 1);
288 group -= group_data[0].group; 288 group -= group_data[0].group;
289 group_data[group].free_blocks_count--; 289 group_data[group].free_blocks_count--;
290 if (flexbg_size > 1) 290 if (flexbg_size > 1)
@@ -296,7 +296,7 @@ next_group:
296 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) 296 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
297 goto next_group; 297 goto next_group;
298 group_data[it_index].inode_table = start_blk; 298 group_data[it_index].inode_table = start_blk;
299 ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); 299 group = ext4_get_group_number(sb, start_blk);
300 group -= group_data[0].group; 300 group -= group_data[0].group;
301 group_data[group].free_blocks_count -= 301 group_data[group].free_blocks_count -=
302 EXT4_SB(sb)->s_itb_per_group; 302 EXT4_SB(sb)->s_itb_per_group;
@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
392 ext4_group_t group; 392 ext4_group_t group;
393 int err; 393 int err;
394 394
395 ext4_get_group_no_and_offset(sb, block, &group, NULL); 395 group = ext4_get_group_number(sb, block);
396 start = ext4_group_first_block_no(sb, group); 396 start = ext4_group_first_block_no(sb, group);
397 group -= flex_gd->groups[0].group; 397 group -= flex_gd->groups[0].group;
398 398
@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
1341 1341
1342 /* Update the global fs size fields */ 1342 /* Update the global fs size fields */
1343 sbi->s_groups_count += flex_gd->count; 1343 sbi->s_groups_count += flex_gd->count;
1344 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
1345 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
1344 1346
1345 /* Update the reserved block counts only once the new group is 1347 /* Update the reserved block counts only once the new group is
1346 * active. */ 1348 * active. */
@@ -1879,7 +1881,11 @@ retry:
1879 /* Nothing to do */ 1881 /* Nothing to do */
1880 return 0; 1882 return 0;
1881 1883
1882 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1884 n_group = ext4_get_group_number(sb, n_blocks_count - 1);
1885 if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
1886 ext4_warning(sb, "resize would cause inodes_count overflow");
1887 return -EINVAL;
1888 }
1883 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); 1889 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1884 1890
1885 n_desc_blocks = num_desc_blocks(sb, n_group + 1); 1891 n_desc_blocks = num_desc_blocks(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5d6d53578124..dbc7c090c13a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
81static void ext4_destroy_lazyinit_thread(void); 81static void ext4_destroy_lazyinit_thread(void);
82static void ext4_unregister_li_request(struct super_block *sb); 82static void ext4_unregister_li_request(struct super_block *sb);
83static void ext4_clear_request_list(void); 83static void ext4_clear_request_list(void);
84static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
84 85
85#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
86static struct file_system_type ext2_fs_type = { 87static struct file_system_type ext2_fs_type = {
@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
353 struct super_block *sb = journal->j_private; 354 struct super_block *sb = journal->j_private;
354 struct ext4_sb_info *sbi = EXT4_SB(sb); 355 struct ext4_sb_info *sbi = EXT4_SB(sb);
355 int error = is_journal_aborted(journal); 356 int error = is_journal_aborted(journal);
356 struct ext4_journal_cb_entry *jce, *tmp; 357 struct ext4_journal_cb_entry *jce;
357 358
359 BUG_ON(txn->t_state == T_FINISHED);
358 spin_lock(&sbi->s_md_lock); 360 spin_lock(&sbi->s_md_lock);
359 list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { 361 while (!list_empty(&txn->t_private_list)) {
362 jce = list_entry(txn->t_private_list.next,
363 struct ext4_journal_cb_entry, jce_list);
360 list_del_init(&jce->jce_list); 364 list_del_init(&jce->jce_list);
361 spin_unlock(&sbi->s_md_lock); 365 spin_unlock(&sbi->s_md_lock);
362 jce->jce_func(sb, jce, error); 366 jce->jce_func(sb, jce, error);
@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1948 if ((sbi->s_es->s_feature_ro_compat & 1952 if ((sbi->s_es->s_feature_ro_compat &
1949 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { 1953 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
1950 /* Use new metadata_csum algorithm */ 1954 /* Use new metadata_csum algorithm */
1951 __u16 old_csum; 1955 __le16 save_csum;
1952 __u32 csum32; 1956 __u32 csum32;
1953 1957
1954 old_csum = gdp->bg_checksum; 1958 save_csum = gdp->bg_checksum;
1955 gdp->bg_checksum = 0; 1959 gdp->bg_checksum = 0;
1956 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, 1960 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
1957 sizeof(le_group)); 1961 sizeof(le_group));
1958 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, 1962 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
1959 sbi->s_desc_size); 1963 sbi->s_desc_size);
1960 gdp->bg_checksum = old_csum; 1964 gdp->bg_checksum = save_csum;
1961 1965
1962 crc = csum32 & 0xFFFF; 1966 crc = csum32 & 0xFFFF;
1963 goto out; 1967 goto out;
@@ -2379,17 +2383,15 @@ struct ext4_attr {
2379 int offset; 2383 int offset;
2380}; 2384};
2381 2385
2382static int parse_strtoul(const char *buf, 2386static int parse_strtoull(const char *buf,
2383 unsigned long max, unsigned long *value) 2387 unsigned long long max, unsigned long long *value)
2384{ 2388{
2385 char *endp; 2389 int ret;
2386
2387 *value = simple_strtoul(skip_spaces(buf), &endp, 0);
2388 endp = skip_spaces(endp);
2389 if (*endp || *value > max)
2390 return -EINVAL;
2391 2390
2392 return 0; 2391 ret = kstrtoull(skip_spaces(buf), 0, value);
2392 if (!ret && *value > max)
2393 ret = -EINVAL;
2394 return ret;
2393} 2395}
2394 2396
2395static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, 2397static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2431 const char *buf, size_t count) 2433 const char *buf, size_t count)
2432{ 2434{
2433 unsigned long t; 2435 unsigned long t;
2436 int ret;
2434 2437
2435 if (parse_strtoul(buf, 0x40000000, &t)) 2438 ret = kstrtoul(skip_spaces(buf), 0, &t);
2436 return -EINVAL; 2439 if (ret)
2440 return ret;
2437 2441
2438 if (t && !is_power_of_2(t)) 2442 if (t && (!is_power_of_2(t) || t > 0x40000000))
2439 return -EINVAL; 2443 return -EINVAL;
2440 2444
2441 sbi->s_inode_readahead_blks = t; 2445 sbi->s_inode_readahead_blks = t;
@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2456{ 2460{
2457 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2461 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2458 unsigned long t; 2462 unsigned long t;
2463 int ret;
2459 2464
2460 if (parse_strtoul(buf, 0xffffffff, &t)) 2465 ret = kstrtoul(skip_spaces(buf), 0, &t);
2461 return -EINVAL; 2466 if (ret)
2467 return ret;
2462 *ui = t; 2468 *ui = t;
2463 return count; 2469 return count;
2464} 2470}
2465 2471
2472static ssize_t reserved_clusters_show(struct ext4_attr *a,
2473 struct ext4_sb_info *sbi, char *buf)
2474{
2475 return snprintf(buf, PAGE_SIZE, "%llu\n",
2476 (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
2477}
2478
2479static ssize_t reserved_clusters_store(struct ext4_attr *a,
2480 struct ext4_sb_info *sbi,
2481 const char *buf, size_t count)
2482{
2483 unsigned long long val;
2484 int ret;
2485
2486 if (parse_strtoull(buf, -1ULL, &val))
2487 return -EINVAL;
2488 ret = ext4_reserve_clusters(sbi, val);
2489
2490 return ret ? ret : count;
2491}
2492
2466static ssize_t trigger_test_error(struct ext4_attr *a, 2493static ssize_t trigger_test_error(struct ext4_attr *a,
2467 struct ext4_sb_info *sbi, 2494 struct ext4_sb_info *sbi,
2468 const char *buf, size_t count) 2495 const char *buf, size_t count)
@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2500EXT4_RO_ATTR(delayed_allocation_blocks); 2527EXT4_RO_ATTR(delayed_allocation_blocks);
2501EXT4_RO_ATTR(session_write_kbytes); 2528EXT4_RO_ATTR(session_write_kbytes);
2502EXT4_RO_ATTR(lifetime_write_kbytes); 2529EXT4_RO_ATTR(lifetime_write_kbytes);
2530EXT4_RW_ATTR(reserved_clusters);
2503EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2531EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2504 inode_readahead_blks_store, s_inode_readahead_blks); 2532 inode_readahead_blks_store, s_inode_readahead_blks);
2505EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2533EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
2517 ATTR_LIST(delayed_allocation_blocks), 2545 ATTR_LIST(delayed_allocation_blocks),
2518 ATTR_LIST(session_write_kbytes), 2546 ATTR_LIST(session_write_kbytes),
2519 ATTR_LIST(lifetime_write_kbytes), 2547 ATTR_LIST(lifetime_write_kbytes),
2548 ATTR_LIST(reserved_clusters),
2520 ATTR_LIST(inode_readahead_blks), 2549 ATTR_LIST(inode_readahead_blks),
2521 ATTR_LIST(inode_goal), 2550 ATTR_LIST(inode_goal),
2522 ATTR_LIST(mb_stats), 2551 ATTR_LIST(mb_stats),
@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
3192 return 0; 3221 return 0;
3193} 3222}
3194 3223
3224
3225static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
3226{
3227 ext4_fsblk_t resv_clusters;
3228
3229 /*
3230 * By default we reserve 2% or 4096 clusters, whichever is smaller.
3231 * This should cover situations where we cannot afford to run
3232 * out of space, for example when punching a hole or converting
3233 * uninitialized extents in the delalloc path. In most cases such
3234 * an allocation requires only 1 or 2 blocks; higher numbers are
3235 * very rare.
3236 */
3237 resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
3238
3239 do_div(resv_clusters, 50);
3240 resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3241
3242 return resv_clusters;
3243}
3244
3245
3246static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
3247{
3248 ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
3249 sbi->s_cluster_bits;
3250
3251 if (count >= clusters)
3252 return -EINVAL;
3253
3254 atomic64_set(&sbi->s_resv_clusters, count);
3255 return 0;
3256}
3257
3195static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3258static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3196{ 3259{
3197 char *orig_data = kstrdup(data, GFP_KERNEL); 3260 char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3526 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3589 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3527 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3590 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3528 3591
3592 /* Do we have standard group size of blocksize * 8 blocks ? */
3593 if (sbi->s_blocks_per_group == blocksize << 3)
3594 set_opt2(sb, STD_GROUP_SIZE);
3595
3529 for (i = 0; i < 4; i++) 3596 for (i = 0; i < 4; i++)
3530 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3597 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3531 sbi->s_def_hash_version = es->s_def_hash_version; 3598 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3698 sbi->s_err_report.function = print_daily_error_info; 3765 sbi->s_err_report.function = print_daily_error_info;
3699 sbi->s_err_report.data = (unsigned long) sb; 3766 sbi->s_err_report.data = (unsigned long) sb;
3700 3767
3768 /* Register extent status tree shrinker */
3769 ext4_es_register_shrinker(sb);
3770
3701 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3771 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3702 ext4_count_free_clusters(sb)); 3772 ext4_count_free_clusters(sb));
3703 if (!err) { 3773 if (!err) {
@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3723 sbi->s_max_writeback_mb_bump = 128; 3793 sbi->s_max_writeback_mb_bump = 128;
3724 sbi->s_extent_max_zeroout_kb = 32; 3794 sbi->s_extent_max_zeroout_kb = 32;
3725 3795
3726 /* Register extent status tree shrinker */
3727 ext4_es_register_shrinker(sb);
3728
3729 /* 3796 /*
3730 * set up enough so that it can read an inode 3797 * set up enough so that it can read an inode
3731 */ 3798 */
@@ -3911,6 +3978,13 @@ no_journal:
3911 "available"); 3978 "available");
3912 } 3979 }
3913 3980
3981 err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
3982 if (err) {
3983 ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
3984 "reserved pool", ext4_calculate_resv_clusters(sbi));
3985 goto failed_mount4a;
3986 }
3987
3914 err = ext4_setup_system_zone(sb); 3988 err = ext4_setup_system_zone(sb);
3915 if (err) { 3989 if (err) {
3916 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3990 ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4010,6 +4084,7 @@ failed_mount_wq:
4010 sbi->s_journal = NULL; 4084 sbi->s_journal = NULL;
4011 } 4085 }
4012failed_mount3: 4086failed_mount3:
4087 ext4_es_unregister_shrinker(sb);
4013 del_timer(&sbi->s_err_report); 4088 del_timer(&sbi->s_err_report);
4014 if (sbi->s_flex_groups) 4089 if (sbi->s_flex_groups)
4015 ext4_kvfree(sbi->s_flex_groups); 4090 ext4_kvfree(sbi->s_flex_groups);
@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
4177 goto out_bdev; 4252 goto out_bdev;
4178 } 4253 }
4179 journal->j_private = sb; 4254 journal->j_private = sb;
4180 ll_rw_block(READ, 1, &journal->j_sb_buffer); 4255 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
4181 wait_on_buffer(journal->j_sb_buffer); 4256 wait_on_buffer(journal->j_sb_buffer);
4182 if (!buffer_uptodate(journal->j_sb_buffer)) { 4257 if (!buffer_uptodate(journal->j_sb_buffer)) {
4183 ext4_msg(sb, KERN_ERR, "I/O error on journal device"); 4258 ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4742 struct super_block *sb = dentry->d_sb; 4817 struct super_block *sb = dentry->d_sb;
4743 struct ext4_sb_info *sbi = EXT4_SB(sb); 4818 struct ext4_sb_info *sbi = EXT4_SB(sb);
4744 struct ext4_super_block *es = sbi->s_es; 4819 struct ext4_super_block *es = sbi->s_es;
4745 ext4_fsblk_t overhead = 0; 4820 ext4_fsblk_t overhead = 0, resv_blocks;
4746 u64 fsid; 4821 u64 fsid;
4747 s64 bfree; 4822 s64 bfree;
4823 resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
4748 4824
4749 if (!test_opt(sb, MINIX_DF)) 4825 if (!test_opt(sb, MINIX_DF))
4750 overhead = sbi->s_overhead; 4826 overhead = sbi->s_overhead;
@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4756 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4832 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4757 /* prevent underflow in case little free space is available */ 4833 /* prevent underflow in case little free space is available */
4758 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); 4834 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
4759 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4835 buf->f_bavail = buf->f_bfree -
4760 if (buf->f_bfree < ext4_r_blocks_count(es)) 4836 (ext4_r_blocks_count(es) + resv_blocks);
4837 if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
4761 buf->f_bavail = 0; 4838 buf->f_bavail = 0;
4762 buf->f_files = le32_to_cpu(es->s_inodes_count); 4839 buf->f_files = le32_to_cpu(es->s_inodes_count);
4763 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 4840 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
4945 return PTR_ERR(qf_inode); 5022 return PTR_ERR(qf_inode);
4946 } 5023 }
4947 5024
5025 /* Don't account quota for quota files to avoid recursion */
5026 qf_inode->i_flags |= S_NOQUOTA;
4948 err = dquot_enable(qf_inode, type, format_id, flags); 5027 err = dquot_enable(qf_inode, type, format_id, flags);
4949 iput(qf_inode); 5028 iput(qf_inode);
4950 5029
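
ext4_calculate_resv_clusters() above boils down to "reserve 2% of the cluster count, capped at 4096 clusters", and the new reserved_clusters sysfs attribute lets an administrator override that via ext4_reserve_clusters(). The default calculation as a sketch:

#include <stdint.h>

/* 2% of the cluster count (blocks >> cluster_bits), capped at 4096 */
static uint64_t calc_resv_clusters(uint64_t blocks_count, unsigned cluster_bits)
{
	uint64_t resv = (blocks_count >> cluster_bits) / 50;

	return resv < 4096 ? resv : 4096;
}

On any reasonably large filesystem the cap dominates: a 1 TiB filesystem with 4k blocks and no bigalloc has 268435456 clusters, so 2% would be over five million and the reservation settles at 4096 clusters.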
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a120b277240..c081e34f717f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
122 struct ext4_xattr_header *hdr) 122 struct ext4_xattr_header *hdr)
123{ 123{
124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
125 __u32 csum, old; 125 __u32 csum;
126 __le32 save_csum;
127 __le64 dsk_block_nr = cpu_to_le64(block_nr);
126 128
127 old = hdr->h_checksum; 129 save_csum = hdr->h_checksum;
128 hdr->h_checksum = 0; 130 hdr->h_checksum = 0;
129 block_nr = cpu_to_le64(block_nr); 131 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
130 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, 132 sizeof(dsk_block_nr));
131 sizeof(block_nr));
132 csum = ext4_chksum(sbi, csum, (__u8 *)hdr, 133 csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
133 EXT4_BLOCK_SIZE(inode->i_sb)); 134 EXT4_BLOCK_SIZE(inode->i_sb));
134 135
135 hdr->h_checksum = old; 136 hdr->h_checksum = save_csum;
136 return cpu_to_le32(csum); 137 return cpu_to_le32(csum);
137} 138}
138 139
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index aa25deb5c6cd..c767dbdd7fc4 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -22,6 +22,7 @@
22#define EXT4_XATTR_INDEX_LUSTRE 5 22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7 24#define EXT4_XATTR_INDEX_SYSTEM 7
25#define EXT4_XATTR_INDEX_RICHACL 8
25 26
26struct ext4_xattr_header { 27struct ext4_xattr_header {
27 __le32 h_magic; /* magic number for identification */ 28 __le32 h_magic; /* magic number for identification */
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 750c70148eff..0f53946f13c1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
382 int space_left = 0; 382 int space_left = 0;
383 int first_tag = 0; 383 int first_tag = 0;
384 int tag_flag; 384 int tag_flag;
385 int i, to_free = 0; 385 int i;
386 int tag_bytes = journal_tag_bytes(journal); 386 int tag_bytes = journal_tag_bytes(journal);
387 struct buffer_head *cbh = NULL; /* For transactional checksums */ 387 struct buffer_head *cbh = NULL; /* For transactional checksums */
388 __u32 crc32_sum = ~0; 388 __u32 crc32_sum = ~0;
@@ -1134,7 +1134,7 @@ restart_loop:
1134 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1134 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1135 spin_unlock(&journal->j_history_lock); 1135 spin_unlock(&journal->j_history_lock);
1136 1136
1137 commit_transaction->t_state = T_FINISHED; 1137 commit_transaction->t_state = T_COMMIT_CALLBACK;
1138 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1138 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1139 journal->j_commit_sequence = commit_transaction->t_tid; 1139 journal->j_commit_sequence = commit_transaction->t_tid;
1140 journal->j_committing_transaction = NULL; 1140 journal->j_committing_transaction = NULL;
@@ -1149,38 +1149,44 @@ restart_loop:
1149 journal->j_average_commit_time*3) / 4; 1149 journal->j_average_commit_time*3) / 4;
1150 else 1150 else
1151 journal->j_average_commit_time = commit_time; 1151 journal->j_average_commit_time = commit_time;
1152
1152 write_unlock(&journal->j_state_lock); 1153 write_unlock(&journal->j_state_lock);
1153 1154
1154 if (commit_transaction->t_checkpoint_list == NULL && 1155 if (journal->j_checkpoint_transactions == NULL) {
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1156 journal->j_checkpoint_transactions = commit_transaction;
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1157 commit_transaction->t_cpnext = commit_transaction;
1157 to_free = 1; 1158 commit_transaction->t_cpprev = commit_transaction;
1158 } else { 1159 } else {
1159 if (journal->j_checkpoint_transactions == NULL) { 1160 commit_transaction->t_cpnext =
1160 journal->j_checkpoint_transactions = commit_transaction; 1161 journal->j_checkpoint_transactions;
1161 commit_transaction->t_cpnext = commit_transaction; 1162 commit_transaction->t_cpprev =
1162 commit_transaction->t_cpprev = commit_transaction; 1163 commit_transaction->t_cpnext->t_cpprev;
1163 } else { 1164 commit_transaction->t_cpnext->t_cpprev =
1164 commit_transaction->t_cpnext = 1165 commit_transaction;
1165 journal->j_checkpoint_transactions; 1166 commit_transaction->t_cpprev->t_cpnext =
1166 commit_transaction->t_cpprev =
1167 commit_transaction->t_cpnext->t_cpprev;
1168 commit_transaction->t_cpnext->t_cpprev =
1169 commit_transaction;
1170 commit_transaction->t_cpprev->t_cpnext =
1171 commit_transaction; 1167 commit_transaction;
1172 }
1173 } 1168 }
1174 spin_unlock(&journal->j_list_lock); 1169 spin_unlock(&journal->j_list_lock);
1175 1170 /* Drop all spin_locks because the commit_callback may block.
1171 * __journal_remove_checkpoint() cannot destroy the transaction
1172 * under us because it is not marked as T_FINISHED yet */
1176 if (journal->j_commit_callback) 1173 if (journal->j_commit_callback)
1177 journal->j_commit_callback(journal, commit_transaction); 1174 journal->j_commit_callback(journal, commit_transaction);
1178 1175
1179 trace_jbd2_end_commit(journal, commit_transaction); 1176 trace_jbd2_end_commit(journal, commit_transaction);
1180 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1177 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1181 journal->j_commit_sequence, journal->j_tail_sequence); 1178 journal->j_commit_sequence, journal->j_tail_sequence);
1182 if (to_free)
1183 jbd2_journal_free_transaction(commit_transaction);
1184 1179
1180 write_lock(&journal->j_state_lock);
1181 spin_lock(&journal->j_list_lock);
1182 commit_transaction->t_state = T_FINISHED;
1183 /* Recheck checkpoint lists after j_list_lock was dropped */
1184 if (commit_transaction->t_checkpoint_list == NULL &&
1185 commit_transaction->t_checkpoint_io_list == NULL) {
1186 __jbd2_journal_drop_transaction(journal, commit_transaction);
1187 jbd2_journal_free_transaction(commit_transaction);
1188 }
1189 spin_unlock(&journal->j_list_lock);
1190 write_unlock(&journal->j_state_lock);
1185 wake_up(&journal->j_wait_done_commit); 1191 wake_up(&journal->j_wait_done_commit);
1186} 1192}
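
The reworked commit path above now always links the finished transaction onto the journal's circular checkpoint list before running callbacks, and only afterwards re-takes the locks to mark it T_FINISHED and possibly drop it. The linking itself is a plain circular doubly-linked list insertion; a sketch with simplified types (field names follow the patch, the rest is illustrative):

struct txn {
	struct txn *t_cpnext, *t_cpprev;
};

/* Insert t into the circular checkpoint list headed by *head,
 * mirroring the open-coded linking in the commit path above. */
static void checkpoint_list_add(struct txn **head, struct txn *t)
{
	if (*head == NULL) {
		*head = t;
		t->t_cpnext = t;
		t->t_cpprev = t;
	} else {
		t->t_cpnext = *head;
		t->t_cpprev = (*head)->t_cpprev;
		t->t_cpnext->t_cpprev = t;
		t->t_cpprev->t_cpnext = t;
	}
}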
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8b220f1ab54f..f6c5ba027f4f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -708,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
708} 708}
709 709
710/* 710/*
711 * When this function returns, the transaction corresponding to tid
712 * will be completed. If the transaction is currently running, start
713 * committing that transaction before waiting for it to complete. If
714 * the transaction id is stale, it is by definition already completed,
715 * so just return SUCCESS.
716 */
717int jbd2_complete_transaction(journal_t *journal, tid_t tid)
718{
719 int need_to_wait = 1;
720
721 read_lock(&journal->j_state_lock);
722 if (journal->j_running_transaction &&
723 journal->j_running_transaction->t_tid == tid) {
724 if (journal->j_commit_request != tid) {
725 /* transaction not yet started, so request it */
726 read_unlock(&journal->j_state_lock);
727 jbd2_log_start_commit(journal, tid);
728 goto wait_commit;
729 }
730 } else if (!(journal->j_committing_transaction &&
731 journal->j_committing_transaction->t_tid == tid))
732 need_to_wait = 0;
733 read_unlock(&journal->j_state_lock);
734 if (!need_to_wait)
735 return 0;
736wait_commit:
737 return jbd2_log_wait_commit(journal, tid);
738}
739EXPORT_SYMBOL(jbd2_complete_transaction);
740
741/*
711 * Log buffer allocation routines: 742 * Log buffer allocation routines:
712 */ 743 */
713 744
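
A hedged usage sketch for the new export: a filesystem that remembers which tid covered an inode's last modification can make exactly that transaction durable, rather than forcing a fresh commit. The fsync path is a likely caller given the fs/ext4/fsync.c change in the diffstat; the wrapper name below is illustrative.

#include <linux/jbd2.h>

/* Ensure everything up to commit_tid is on stable storage. The helper
 * starts the commit only if that transaction is still running, waits if
 * it is already committing, and returns immediately for a stale tid. */
static int sync_to_tid(journal_t *journal, tid_t commit_tid)
{
	return jbd2_complete_transaction(journal, commit_tid);
}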
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 325bc019ed88..10f524c59ea8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
332 handle_t *handle = jbd2_alloc_handle(GFP_NOFS); 332 handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
333 if (!handle) 333 if (!handle)
334 return NULL; 334 return NULL;
335 memset(handle, 0, sizeof(*handle));
336 handle->h_buffer_credits = nblocks; 335 handle->h_buffer_credits = nblocks;
337 handle->h_ref = 1; 336 handle->h_ref = 1;
338 337
@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
640 int error; 639 int error;
641 char *frozen_buffer = NULL; 640 char *frozen_buffer = NULL;
642 int need_copy = 0; 641 int need_copy = 0;
642 unsigned long start_lock, time_lock;
643 643
644 if (is_handle_aborted(handle)) 644 if (is_handle_aborted(handle))
645 return -EROFS; 645 return -EROFS;
@@ -655,9 +655,16 @@ repeat:
655 655
656 /* @@@ Need to check for errors here at some point. */ 656 /* @@@ Need to check for errors here at some point. */
657 657
658 start_lock = jiffies;
658 lock_buffer(bh); 659 lock_buffer(bh);
659 jbd_lock_bh_state(bh); 660 jbd_lock_bh_state(bh);
660 661
662 /* If it takes too long to lock the buffer, trace it */
663 time_lock = jbd2_time_diff(start_lock, jiffies);
664 if (time_lock > HZ/10)
665 trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
666 jiffies_to_msecs(time_lock));
667
661 /* We now hold the buffer lock so it is safe to query the buffer 668 /* We now hold the buffer lock so it is safe to query the buffer
662 * state. Is the buffer dirty? 669 * state. Is the buffer dirty?
663 * 670 *
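
The stall check added above costs two jiffies reads per buffer and fires a tracepoint only when acquiring the buffer lock took more than HZ/10, roughly 100ms. The same measure-around-the-lock pattern as a userspace sketch (pthread mutex and stderr stand in for lock_buffer() and the tracepoint):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static long elapsed_ms(const struct timespec *a, const struct timespec *b)
{
	return (b->tv_sec - a->tv_sec) * 1000 +
	       (b->tv_nsec - a->tv_nsec) / 1000000;
}

/* Time the acquisition and report it when the lock stalled for more
 * than 100ms, the same threshold (HZ/10) the patch uses. */
static void timed_lock(pthread_mutex_t *lock)
{
	struct timespec before, after;

	clock_gettime(CLOCK_MONOTONIC, &before);
	pthread_mutex_lock(lock);
	clock_gettime(CLOCK_MONOTONIC, &after);

	if (elapsed_ms(&before, &after) > 100)
		fprintf(stderr, "lock stalled for %ld ms\n",
			elapsed_ms(&before, &after));
}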