 Documentation/filesystems/ext4.txt |  21
 fs/buffer.c                        |   5
 fs/ext4/Kconfig                    |   3
 fs/ext4/balloc.c                   |  53
 fs/ext4/dir.c                      |  20
 fs/ext4/ext4.h                     | 101
 fs/ext4/ext4_extents.h             |   5
 fs/ext4/ext4_jbd2.c                |   8
 fs/ext4/ext4_jbd2.h                |  12
 fs/ext4/extents.c                  | 522
 fs/ext4/fsync.c                    |   3
 fs/ext4/ialloc.c                   |  88
 fs/ext4/indirect.c                 | 473
 fs/ext4/inline.c                   | 178
 fs/ext4/inode.c                    | 580
 fs/ext4/ioctl.c                    | 218
 fs/ext4/mballoc.c                  | 253
 fs/ext4/migrate.c                  |  62
 fs/ext4/mmp.c                      |   6
 fs/ext4/move_extent.c              |  73
 fs/ext4/namei.c                    |  48
 fs/ext4/page-io.c                  | 280
 fs/ext4/resize.c                   |  16
 fs/ext4/super.c                    | 131
 fs/ext4/xattr.c                    |  13
 fs/ext4/xattr.h                    |   1
 fs/jbd2/commit.c                   |  50
 fs/jbd2/journal.c                  |  31
 fs/jbd2/transaction.c              |   9
 include/linux/buffer_head.h        |   4
 include/linux/jbd2.h               |   4
 include/linux/journal-head.h       |  11
 include/trace/events/ext4.h        |  16
 include/trace/events/jbd2.h        |  21
 34 files changed, 1900 insertions(+), 1419 deletions(-)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 34ea4f1fa6ea..f7cbf574a875 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
 session_write_kbytes	This file is read-only and shows the number of
 			kilobytes of data that have been written to this
 			filesystem since it was mounted.
+
+reserved_clusters	This is a RW file and contains the number of
+			reserved clusters in the file system which will
+			be used in specific situations to avoid costly
+			zeroout, unexpected ENOSPC, or possible data
+			loss. The default is 2% or 4096 clusters,
+			whichever is smaller, and this can be changed;
+			however, it can never exceed the number of
+			clusters in the file system. If there is not
+			enough space for the reserved space when mounting
+			the file system, the mount will _not_ fail.
..............................................................................

Ioctls
@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
 				bitmaps and inode table, the userspace tool thus
 				just passes the new number of blocks.
 
+EXT4_IOC_SWAP_BOOT		Swap i_blocks and associated attributes
+				(like i_blocks, i_size, i_flags, ...) from
+				the specified inode with inode
+				EXT4_BOOT_LOADER_INO (#5). This is typically
+				used to store a boot loader in a secure part of
+				the filesystem, where it can't be changed by a
+				normal user by accident.
+				The data blocks of the previous boot loader
+				will be associated with the given inode.
+
..............................................................................

References
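
As a quick illustration of the new ioctl (an editor's sketch, not part of this
patch; the request number matches the EXT4_IOC_SWAP_BOOT definition added to
fs/ext4/ext4.h below, and the error handling is deliberately minimal):

	/* swap_boot.c - hypothetical userspace sketch: swap the data of
	 * the given file with the boot loader inode (#5). */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	#define EXT4_IOC_SWAP_BOOT	_IO('f', 17)	/* from this patch */

	int main(int argc, char **argv)
	{
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <bootloader-image>\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Takes no argument; swaps this inode's blocks with inode #5 */
		if (ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
			perror("EXT4_IOC_SWAP_BOOT");
			return 1;
		}
		close(fd);
		return 0;
	}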
diff --git a/fs/buffer.c b/fs/buffer.c
index 10ef81e10b20..bc1fe14aaa3e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	/* Take care of bh's that straddle the end of the device */
 	guard_bh_eod(rw, bio, bh);
 
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb9..efea5d5c44ce 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
 	  Enables run-time debugging support for the ext4 filesystem.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+	  with a command such as:
+		echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92e68b33fffd..d0f13eada0ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  */
 
 /*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+				   ext4_fsblk_t block)
+{
+	ext4_group_t group;
+
+	if (test_opt2(sb, STD_GROUP_SIZE))
+		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+			 block) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+	else
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+	return group;
+}
+
+/*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
  */
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-			       ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+				      ext4_fsblk_t block,
+				      ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-	if (actual_group == block_group)
-		return 1;
-	return 0;
+
+	actual_group = ext4_get_group_number(sb, block);
+	return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 		trace_ext4_read_block_bitmap_load(sb, block_group);
 		bh->b_end_io = ext4_end_bitmap_read;
 		get_bh(bh);
-		submit_bh(READ, bh);
+		submit_bh(READ | REQ_META | REQ_PRIO, bh);
 		return bh;
 verify:
 	ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 				  s64 nclusters, unsigned int flags)
 {
-	s64 free_clusters, dirty_clusters, root_clusters;
+	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
 	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
 	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
 	free_clusters  = percpu_counter_read_positive(fcc);
 	dirty_clusters = percpu_counter_read_positive(dcc);
+	resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
 	/*
 	 * r_blocks_count should always be multiple of the cluster ratio so
 	 * we are safe to do a plane bit shift only.
 	 */
-	root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+	      resv_clusters;
 
-	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+	if (free_clusters - (nclusters + rsv + dirty_clusters) <
 					EXT4_FREECLUSTERS_WATERMARK) {
 		free_clusters  = percpu_counter_sum_positive(fcc);
 		dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 	/* Check whether we have space after accounting for current
 	 * dirty clusters & root reserved clusters.
 	 */
-	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+	if (free_clusters >= (rsv + nclusters + dirty_clusters))
 		return 1;
 
 	/* Hm, nope. Are (enough) root reserved clusters available? */
 	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
 	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
 	    capable(CAP_SYS_RESOURCE) ||
 	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+		if (free_clusters >= (nclusters + dirty_clusters +
+				      resv_clusters))
+			return 1;
+	}
+	/* No free blocks. Let's see if we can dip into reserved pool */
+	if (flags & EXT4_MB_USE_RESERVED) {
 		if (free_clusters >= (nclusters + dirty_clusters))
 			return 1;
 	}
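
A worked example of the new ext4_get_group_number() fast path (an editor's
standalone sketch with assumed parameters, not code from this patch): with a
4KiB block size (EXT4_BLOCK_SIZE_BITS = 12), no bigalloc
(EXT4_CLUSTER_BITS = 0) and s_first_data_block = 0, each group spans
8 * 4096 = 32768 blocks, so the group number reduces to a single shift:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long first_data_block = 0;	/* assumed */
		unsigned long long block = 100000;		/* example block */
		unsigned int block_bits = 12, cluster_bits = 0;
		unsigned int group;

		/* mirrors the STD_GROUP_SIZE branch: divide by blocks/group */
		group = (first_data_block + block) >>
			(block_bits + cluster_bits + 3);
		printf("block %llu -> group %u\n", block, group); /* group 3 */
		return 0;
	}

Filesystems whose group size differs from the blocksize * 8 default (see the
EXT4_MOUNT2_STD_GROUP_SIZE flag added below) still take the slower
ext4_get_group_no_and_offset() path.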
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d8cd1f0f4661..f8d56e4254e0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
 	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+	     ext4_has_inline_data(inode)))
 		return 1;
 
 	return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
 	int ret = 0;
 	int dir_has_error = 0;
 
-	if (ext4_has_inline_data(inode)) {
-		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
-					   &has_inline_data);
-		if (has_inline_data)
-			return ret;
-	}
-
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
 			ext4_clear_inode_flag(file_inode(filp),
 					EXT4_INODE_INDEX);
 	}
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		ret = ext4_read_inline_dir(filp, dirent, filldir,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b83cd604796..0aabb344b02e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_STREAM_ALLOC		0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS		0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED		0x2000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -196,19 +198,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_DIRECT	0x0004
 
-struct ext4_io_page {
-	struct page	*p_page;
-	atomic_t	p_count;
-};
-
-#define MAX_IO_PAGES 128
-
 /*
  * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place. For AIO/DIO, neither field is filled in.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;	/* for writepages() */
-	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
 	int			io_op;
 	struct bio		*io_bio;
 	ext4_io_end_t		*io_end;
-	struct ext4_io_page	*io_page;
 	sector_t		io_next_block;
 };
 
@@ -403,7 +392,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +546,8 @@ enum {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the
-	   inode allocation semaphore for */
+	/* Caller is from the delayed allocation writeout path
+	 * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
 	   unitialized extents if not allocated, split the uninitialized
@@ -571,8 +559,9 @@ enum {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 						 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Eventual metadata allocation (due to growing extent tree)
+	 * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
 	/* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 	/* Request will not result in inode size update (user for fallocate) */
@@ -616,6 +605,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -949,7 +939,7 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
 
 /*
- * Mount flags
+ * Mount flags set via mount options or defaults
  */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
@@ -981,8 +971,16 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
 #define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
 						      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
+						      size of blocksize * 8
+						      blocks */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_flags;
 	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
+	atomic64_t s_resv_clusters;
 	kuid_t s_resuid;
 	kgid_t s_resgid;
 	unsigned short s_mount_state;
@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 	return ino == EXT4_ROOT_INO ||
 		ino == EXT4_USR_QUOTA_INO ||
 		ino == EXT4_GRP_QUOTA_INO ||
+		ino == EXT4_BOOT_LOADER_INO ||
 		ino == EXT4_JOURNAL_INO ||
 		ino == EXT4_RESIZE_INO ||
 		(ino >= EXT4_FIRST_INO(sb) &&
@@ -1374,6 +1374,7 @@ enum {
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
+	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
  */
 #define ERR_BAD_DX_DIR	-75000
 
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 				      struct buffer_head *bh);
 
 /* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+					 ext4_fsblk_t blocknr,
+					 ext4_group_t *blockgrpp,
+					 ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+					  ext4_fsblk_t block);
+
 extern void ext4_validate_block_bitmap(struct super_block *sb,
 				       struct ext4_group_desc *desc,
 				       unsigned int block_group,
@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 			     unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+				 ext4_lblk_t first, ext4_lblk_t stop);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
 extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 extern int ext4_read_inline_dir(struct file *filp,
 				void *dirent, filldir_t filldir,
 				int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash,
+				   int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
 					const struct qstr *d_name,
 					struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
 extern int ext4_handle_dirty_dirent_node(handle_t *handle,
 					 struct inode *inode,
 					 struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+				    struct ext4_dir_entry_2 *de,
+				    umode_t mode) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 					int chunk);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
-				loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+					    struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+					  struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
 extern void ext4_end_io_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
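
The new ext4_type_by_mode[] table indexes the format bits of the mode
(S_IFMT >> 12) directly into an on-disk dirent type. A standalone check of
that indexing (an editor's sketch; the FT_* values are defined locally here
as assumptions rather than taken from ext4 headers):

	#include <stdio.h>
	#include <sys/stat.h>

	#define S_SHIFT 12
	enum { FT_REG_FILE = 1, FT_DIR = 2, FT_SYMLINK = 7 };	/* assumed */

	static const unsigned char type_by_mode[S_IFMT >> S_SHIFT] = {
		[S_IFREG >> S_SHIFT] = FT_REG_FILE,
		[S_IFDIR >> S_SHIFT] = FT_DIR,
		[S_IFLNK >> S_SHIFT] = FT_SYMLINK,
	};

	int main(void)
	{
		mode_t mode = S_IFDIR | 0755;

		/* same lookup ext4_set_de_type() performs */
		printf("file_type = %u\n",
		       type_by_mode[(mode & S_IFMT) >> S_SHIFT]);	/* 2 */
		return 0;
	}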
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 8643ff5bbeb7..51bc821ade90 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 		     0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7058975e3a55..451eb4045330 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
 {
 	journal_t *journal;
 
+	might_sleep();
+
 	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
+	set_buffer_meta(bh);
+	set_buffer_prio(bh);
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
 		if (err) {
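
Taken together with the fs/buffer.c hunk above, the effect is that every
buffer dirtied through __ext4_handle_dirty_metadata() is tagged with the new
meta/prio buffer bits, which _submit_bh() then translates into
REQ_META | REQ_PRIO on the submitted bio, so block-layer schedulers can
prioritize filesystem metadata I/O.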
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4c216b1bf20c..c8c6885406db 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
  * block to complete the transaction.
  *
  * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
 	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-	 ? 27U : 8U)
+	 ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
  */
-static inline void ext4_journal_callback_del(handle_t *handle,
-					     struct ext4_journal_cb_entry *jce)
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
 {
+	bool deleted;
 	struct ext4_sb_info *sbi =
 			EXT4_SB(handle->h_transaction->t_journal->j_private);
 
 	spin_lock(&sbi->s_md_lock);
+	deleted = !list_empty(&jce->jce_list);
 	list_del_init(&jce->jce_list);
 	spin_unlock(&sbi->s_md_lock);
+	return deleted;
 }
 
 int
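
One plausible reading of the new 20U figure (an editor's arithmetic based on
the updated comment, not spelled out in the patch): 5 extent tree levels plus
1 data block makes 6 block allocations; charging each of them a block bitmap
and a group summary gives 6 * 3 = 18 buffers, and the inode holding the tree
root plus the superblock bring the total to 18 + 1 + 1 = 20.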
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c6d06dcef8b..107936db244e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-#define ext4_ext_dirty(handle, inode, path) \
-		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
-			    handle_t *handle, struct inode *inode,
-			    struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path)
 {
 	int err;
 	if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
+	eh = path[depth].p_hdr;
 	if (unlikely(path[depth].p_hdr == NULL)) {
 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 		return -EIO;
 	}
 
 	/* try to insert block into found extent and return */
-	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
-		&& ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
-			  ext4_ext_is_uninitialized(newext),
-			  ext4_ext_get_actual_len(newext),
-			  le32_to_cpu(ex->ee_block),
-			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex),
-			  ext4_ext_pblock(ex));
-		err = ext4_ext_get_access(handle, inode, path + depth);
-		if (err)
-			return err;
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
 
 		/*
-		 * ext4_can_extents_be_merged should have checked that either
-		 * both extents are uninitialized, or both aren't. Thus we
-		 * need to check only one of them here.
+		 * Try to see whether we should rather test the extent on
+		 * right from ex, or from the left of ex. This is because
+		 * ext4_ext_find_extent() can return either extent on the
+		 * left, or on the right from the searched position. This
+		 * will make merging more effective.
 		 */
-		if (ext4_ext_is_uninitialized(ex))
-			uninitialized = 1;
-		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+		if (ex < EXT_LAST_EXTENT(eh) &&
+		    (le32_to_cpu(ex->ee_block) +
+		    ext4_ext_get_actual_len(ex) <
+		    le32_to_cpu(newext->ee_block))) {
+			ex += 1;
+			goto prepend;
+		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+			   (le32_to_cpu(newext->ee_block) +
+			   ext4_ext_get_actual_len(newext) <
+			   le32_to_cpu(ex->ee_block)))
+			ex -= 1;
+
+		/* Try to append newex to the ex */
+		if (ext4_can_extents_be_merged(inode, ex, newext)) {
+			ext_debug("append [%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 					+ ext4_ext_get_actual_len(newext));
 			if (uninitialized)
 				ext4_ext_mark_uninitialized(ex);
 			eh = path[depth].p_hdr;
 			nearex = ex;
 			goto merge;
+		}
+
+prepend:
+		/* Try to prepend newex to the ex */
+		if (ext4_can_extents_be_merged(inode, newext, ex)) {
+			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  le32_to_cpu(newext->ee_block),
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_block = newext->ee_block;
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+					+ ext4_ext_get_actual_len(newext));
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
 	}
 
 	depth = ext_depth(inode);
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		flags = EXT4_MB_USE_RESERVED;
 	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
@@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-				 ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+			  ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2667,12 +2726,14 @@ again:
 
 			/*
 			 * Split the extent in two so that 'end' is the last
-			 * block in the first new extent
+			 * block in the first new extent. Also we should not
+			 * fail removing space due to ENOSPC so try to use
+			 * reserved block if that happens.
 			 */
 			err = ext4_split_extent_at(handle, inode, path,
 					end + 1, split_flag,
 					EXT4_GET_BLOCKS_PRE_IO |
-					EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+					EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
 			if (err < 0)
 				goto out;
@@ -3147,35 +3208,35 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct inode *inode,
 					   struct ext4_map_blocks *map,
-					   struct ext4_ext_path *path)
+					   struct ext4_ext_path *path,
+					   int flags)
 {
 	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
-	struct ext4_extent *ex;
+	struct ext4_extent *ex, *abut_ex;
 	ext4_lblk_t ee_block, eof_block;
-	unsigned int ee_len, depth;
-	int allocated, max_zeroout = 0;
+	unsigned int ee_len, depth, map_len = map->m_len;
+	int allocated = 0, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+		(unsigned long long)map->m_lblk, map_len);
 
 	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
-	if (eof_block < map->m_lblk + map->m_len)
-		eof_block = map->m_lblk + map->m_len;
+	if (eof_block < map->m_lblk + map_len)
+		eof_block = map->m_lblk + map_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (map->m_lblk - ee_block);
 	zero_ex.ee_len = 0;
 
 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	/*
 	 * Attempt to transfer newly initialized blocks from the currently
-	 * uninitialized extent to its left neighbor. This is much cheaper
+	 * uninitialized extent to its neighbor. This is much cheaper
 	 * than an insertion followed by a merge as those involve costly
-	 * memmove() calls. This is the common case in steady state for
-	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-	 * writes.
+	 * memmove() calls. Transferring to the left is the common case in
+	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+	 * followed by append writes.
 	 *
 	 * Limitations of the current logic:
-	 * - L1: we only deal with writes at the start of the extent.
-	 *   The approach could be extended to writes at the end
-	 *   of the extent but this scenario was deemed less common.
-	 * - L2: we do not deal with writes covering the whole extent.
+	 * - L1: we do not deal with writes covering the whole extent.
 	 *   This would require removing the extent if the transfer
 	 *   is possible.
-	 * - L3: we only attempt to merge with an extent stored in the
+	 * - L2: we only attempt to merge with an extent stored in the
 	 *   same extent tree node.
 	 */
-	if ((map->m_lblk == ee_block) &&	/*L1*/
-		(map->m_len < ee_len) &&	/*L2*/
-		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
-		struct ext4_extent *prev_ex;
+	if ((map->m_lblk == ee_block) &&
+		/* See if we can merge left */
+		(map_len < ee_len) &&		/*L1*/
+		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
 		ext4_lblk_t prev_lblk;
 		ext4_fsblk_t prev_pblk, ee_pblk;
-		unsigned int prev_len, write_len;
+		unsigned int prev_len;
 
-		prev_ex = ex - 1;
-		prev_lblk = le32_to_cpu(prev_ex->ee_block);
-		prev_len = ext4_ext_get_actual_len(prev_ex);
-		prev_pblk = ext4_ext_pblock(prev_ex);
+		abut_ex = ex - 1;
+		prev_lblk = le32_to_cpu(abut_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(abut_ex);
+		prev_pblk = ext4_ext_pblock(abut_ex);
 		ee_pblk = ext4_ext_pblock(ex);
-		write_len = map->m_len;
 
 		/*
-		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
 		 * upon those conditions:
-		 * - C1: prev_ex is initialized,
-		 * - C2: prev_ex is logically abutting ex,
-		 * - C3: prev_ex is physically abutting ex,
-		 * - C4: prev_ex can receive the additional blocks without
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
 		 *   overflowing the (initialized) length limit.
 		 */
-		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
 			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
 			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
-			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
 			err = ext4_ext_get_access(handle, inode, path + depth);
 			if (err)
 				goto out;
 
 			trace_ext4_ext_convert_to_initialized_fastpath(inode,
-				map, ex, prev_ex);
+				map, ex, abut_ex);
 
-			/* Shift the start of ex by 'write_len' blocks */
-			ex->ee_block = cpu_to_le32(ee_block + write_len);
-			ext4_ext_store_pblock(ex, ee_pblk + write_len);
-			ex->ee_len = cpu_to_le16(ee_len - write_len);
+			/* Shift the start of ex by 'map_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + map_len);
+			ext4_ext_store_pblock(ex, ee_pblk + map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
 			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-			/* Extend prev_ex by 'write_len' blocks */
-			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-			/* Mark the block containing both extents as dirty */
-			ext4_ext_dirty(handle, inode, path + depth);
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = map_len;
+		}
+	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+		   (map_len < ee_len) &&	/*L1*/
+		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
+		/* See if we can merge right */
+		ext4_lblk_t next_lblk;
+		ext4_fsblk_t next_pblk, ee_pblk;
+		unsigned int next_len;
+
+		abut_ex = ex + 1;
+		next_lblk = le32_to_cpu(abut_ex->ee_block);
+		next_len = ext4_ext_get_actual_len(abut_ex);
+		next_pblk = ext4_ext_pblock(abut_ex);
+		ee_pblk = ext4_ext_pblock(ex);
 
-		/* Update path to point to the right extent */
-		path[depth].p_ext = prev_ex;
+		/*
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+		 * upon those conditions:
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
+		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
+		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, abut_ex);
+
+			/* Shift the start of abut_ex by 'map_len' blocks */
+			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
+			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
 		/* Result: number of initialized blocks past m_lblk */
-		allocated = write_len;
-		goto out;
+			allocated = map_len;
 		}
 	}
+	if (allocated) {
+		/* Mark the block containing both extents as dirty */
+		ext4_ext_dirty(handle, inode, path + depth);
+
+		/* Update path to point to the right extent */
+		path[depth].p_ext = abut_ex;
+		goto out;
+	} else
+		allocated = ee_len - (map->m_lblk - ee_block);
 
 	WARN_ON(map->m_lblk < ee_block);
 	/*
@@ -3330,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 
 	allocated = ext4_split_extent(handle, inode, path,
-				      &split_map, split_flag, 0);
+				      &split_map, split_flag, flags);
 	if (allocated < 0)
 		err = allocated;
 
@@ -3650,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
+	/*
+	 * When writing into uninitialized space, we should not fail to
+	 * allocate metadata blocks for the new extent block if needed.
+	 */
+	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
 	trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
 						    allocated, newblock);
 
@@ -3713,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -4257,48 +4368,13 @@ out3:
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t last_block;
-	handle_t *handle;
-	loff_t page_len;
 	int err = 0;
 
 	/*
-	 * finish any pending end_io work so we won't run the risk of
-	 * converting any truncated blocks to initialized later
-	 */
-	ext4_flush_unwritten_io(inode);
-
-	/*
-	 * probably first extent we're gonna free will be last in block
-	 */
-	err = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
-	if (IS_ERR(handle))
-		return;
-
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, inode->i_size, page_len, 0);
-
-		if (err)
-			goto out_stop;
-	}
-
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
-
-	down_write(&EXT4_I(inode)->i_data_sem);
-
-	ext4_discard_preallocations(inode);
-
-	/*
 	 * TODO: optimization is possible here.
 	 * Probably we need not scan at all,
 	 * because page truncation is enough.
@@ -4313,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode)
 	err = ext4_es_remove_extent(inode, last_block,
 				    EXT_MAX_BLOCKS - last_block);
 	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-
-	/* In a multi-transaction truncate, we only make the final
-	 * transaction synchronous.
-	 */
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
-	up_write(&EXT4_I(inode)->i_data_sem);
-
-out_stop:
-	/*
-	 * If this was a simple ftruncate() and the file will remain alive,
-	 * then we need to clear up the orphan record which we created above.
-	 * However, if this was a real unlink then we were called by
-	 * ext4_delete_inode(), and we allow that function to clean up the
-	 * orphan info for us.
-	 */
-	if (inode->i_nlink)
-		ext4_orphan_del(handle, inode);
-
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_stop(handle);
 }
 
 static void ext4_falloc_update_inode(struct inode *inode,
@@ -4623,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode,
4623 return (error < 0 ? error : 0); 4676 return (error < 0 ? error : 0);
4624} 4677}
4625 4678
4626/*
4627 * ext4_ext_punch_hole
4628 *
4629 * Punches a hole of "length" bytes in a file starting
4630 * at byte "offset"
4631 *
4632 * @inode: The inode of the file to punch a hole in
4633 * @offset: The starting byte offset of the hole
4634 * @length: The length of the hole
4635 *
4636 * Returns the number of blocks removed or negative on err
4637 */
4638int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4639{
4640 struct inode *inode = file_inode(file);
4641 struct super_block *sb = inode->i_sb;
4642 ext4_lblk_t first_block, stop_block;
4643 struct address_space *mapping = inode->i_mapping;
4644 handle_t *handle;
4645 loff_t first_page, last_page, page_len;
4646 loff_t first_page_offset, last_page_offset;
4647 int credits, err = 0;
4648
4649 /*
4650 * Write out all dirty pages to avoid race conditions
4651 * Then release them.
4652 */
4653 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4654 err = filemap_write_and_wait_range(mapping,
4655 offset, offset + length - 1);
4656
4657 if (err)
4658 return err;
4659 }
4660
4661 mutex_lock(&inode->i_mutex);
4662 /* It's not possible punch hole on append only file */
4663 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
4664 err = -EPERM;
4665 goto out_mutex;
4666 }
4667 if (IS_SWAPFILE(inode)) {
4668 err = -ETXTBSY;
4669 goto out_mutex;
4670 }
4671
4672 /* No need to punch hole beyond i_size */
4673 if (offset >= inode->i_size)
4674 goto out_mutex;
4675
4676 /*
4677 * If the hole extends beyond i_size, set the hole
4678 * to end after the page that contains i_size
4679 */
4680 if (offset + length > inode->i_size) {
4681 length = inode->i_size +
4682 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4683 offset;
4684 }
4685
4686 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4687 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4688
4689 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4690 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4691
4692 /* Now release the pages */
4693 if (last_page_offset > first_page_offset) {
4694 truncate_pagecache_range(inode, first_page_offset,
4695 last_page_offset - 1);
4696 }
4697
4698 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4699 ext4_inode_block_unlocked_dio(inode);
4700 err = ext4_flush_unwritten_io(inode);
4701 if (err)
4702 goto out_dio;
4703 inode_dio_wait(inode);
4704
4705 credits = ext4_writepage_trans_blocks(inode);
4706 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
4707 if (IS_ERR(handle)) {
4708 err = PTR_ERR(handle);
4709 goto out_dio;
4710 }
4711
4712
4713 /*
4714 * Now we need to zero out the non-page-aligned data in the
4715 * pages at the start and tail of the hole, and unmap the buffer
4716 * heads for the block aligned regions of the page that were
4717 * completely zeroed.
4718 */
4719 if (first_page > last_page) {
4720 /*
4721 * If the file space being truncated is contained within a page
4722 * just zero out and unmap the middle of that page
4723 */
4724 err = ext4_discard_partial_page_buffers(handle,
4725 mapping, offset, length, 0);
4726
4727 if (err)
4728 goto out;
4729 } else {
4730 /*
4731 * zero out and unmap the partial page that contains
4732 * the start of the hole
4733 */
4734 page_len = first_page_offset - offset;
4735 if (page_len > 0) {
4736 err = ext4_discard_partial_page_buffers(handle, mapping,
4737 offset, page_len, 0);
4738 if (err)
4739 goto out;
4740 }
4741
4742 /*
4743 * zero out and unmap the partial page that contains
4744 * the end of the hole
4745 */
4746 page_len = offset + length - last_page_offset;
4747 if (page_len > 0) {
4748 err = ext4_discard_partial_page_buffers(handle, mapping,
4749 last_page_offset, page_len, 0);
4750 if (err)
4751 goto out;
4752 }
4753 }
4754
4755 /*
4756 * If i_size is contained in the last page, we need to
4757 * unmap and zero the partial page after i_size
4758 */
4759 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4760 inode->i_size % PAGE_CACHE_SIZE != 0) {
4761
4762 page_len = PAGE_CACHE_SIZE -
4763 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4764
4765 if (page_len > 0) {
4766 err = ext4_discard_partial_page_buffers(handle,
4767 mapping, inode->i_size, page_len, 0);
4768
4769 if (err)
4770 goto out;
4771 }
4772 }
4773
4774 first_block = (offset + sb->s_blocksize - 1) >>
4775 EXT4_BLOCK_SIZE_BITS(sb);
4776 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4777
4778 /* If there are no blocks to remove, return now */
4779 if (first_block >= stop_block)
4780 goto out;
4781
4782 down_write(&EXT4_I(inode)->i_data_sem);
4783 ext4_discard_preallocations(inode);
4784
4785 err = ext4_es_remove_extent(inode, first_block,
4786 stop_block - first_block);
4787 err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4788
4789 ext4_discard_preallocations(inode);
4790
4791 if (IS_SYNC(inode))
4792 ext4_handle_sync(handle);
4793
4794 up_write(&EXT4_I(inode)->i_data_sem);
4795
4796out:
4797 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4798 ext4_mark_inode_dirty(handle, inode);
4799 ext4_journal_stop(handle);
4800out_dio:
4801 ext4_inode_resume_unlocked_dio(inode);
4802out_mutex:
4803 mutex_unlock(&inode->i_mutex);
4804 return err;
4805}
4806
4807int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4679int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4808 __u64 start, __u64 len) 4680 __u64 start, __u64 len)
4809{ 4681{
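
The page and block rounding that the removed ext4_ext_punch_hole() performed is worth checking in isolation, since the partial head and tail of the hole must be zeroed in place rather than deallocated. Below is a minimal userspace sketch of that arithmetic; the 4096-byte page and 1024-byte block sizes are illustrative assumptions, not values taken from the patch.

	#include <stdio.h>

	#define PAGE_SIZE	4096ULL		/* assumed for illustration */
	#define PAGE_SHIFT	12
	#define BLKSIZE		1024ULL		/* assumed for illustration */
	#define BLKSIZE_BITS	10

	int main(void)
	{
		unsigned long long offset = 3000, length = 10000;

		/* first page fully inside the hole, and first page after it */
		unsigned long long first_page = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long long last_page  = (offset + length) >> PAGE_SHIFT;

		/* first block fully inside the hole, and first block after it */
		unsigned long long first_block = (offset + BLKSIZE - 1) >> BLKSIZE_BITS;
		unsigned long long stop_block  = (offset + length) >> BLKSIZE_BITS;

		printf("pages [%llu,%llu) blocks [%llu,%llu)\n",
		       first_page, last_page, first_block, stop_block);
		/* prints: pages [1,3) blocks [3,12) -- bytes 3000..3071 and
		 * 12288..12999 straddle block boundaries, so they are zeroed
		 * via ext4_discard_partial_page_buffers() instead of freed */
		return 0;
	}
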
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3278e64e57b6..e0ba8a408def 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
166 if (journal->j_flags & JBD2_BARRIER && 166 if (journal->j_flags & JBD2_BARRIER &&
167 !jbd2_trans_will_send_data_barrier(journal, commit_tid)) 167 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
168 needs_barrier = true; 168 needs_barrier = true;
169 jbd2_log_start_commit(journal, commit_tid); 169 ret = jbd2_complete_transaction(journal, commit_tid);
170 ret = jbd2_log_wait_commit(journal, commit_tid);
171 if (needs_barrier) { 170 if (needs_barrier) {
172 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 171 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
173 if (!ret) 172 if (!ret)
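
jbd2_complete_transaction() is new in this series; unlike the removed jbd2_log_start_commit() + jbd2_log_wait_commit() pair, it avoids forcing a commit when the target transaction has already committed. Roughly, its semantics look like the sketch below (a simplified rendering, not the verbatim implementation):

	static int complete_transaction_sketch(journal_t *journal, tid_t tid)
	{
		int need_to_wait = 1;

		read_lock(&journal->j_state_lock);
		if (journal->j_running_transaction &&
		    journal->j_running_transaction->t_tid == tid) {
			if (journal->j_commit_request != tid) {
				/* commit not yet requested, so request it */
				read_unlock(&journal->j_state_lock);
				jbd2_log_start_commit(journal, tid);
				goto wait_commit;
			}
		} else if (!(journal->j_committing_transaction &&
			     journal->j_committing_transaction->t_tid == tid))
			need_to_wait = 0;	/* already committed */
		read_unlock(&journal->j_state_lock);
		if (!need_to_wait)
			return 0;
	wait_commit:
		return jbd2_log_wait_commit(journal, tid);
	}

For fsync this means an already-committed transaction costs only a state check instead of an unnecessary commit request.
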
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6c5bb8d993fe..00a818d67b54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
166 trace_ext4_load_inode_bitmap(sb, block_group); 166 trace_ext4_load_inode_bitmap(sb, block_group);
167 bh->b_end_io = ext4_end_bitmap_read; 167 bh->b_end_io = ext4_end_bitmap_read;
168 get_bh(bh); 168 get_bh(bh);
169 submit_bh(READ, bh); 169 submit_bh(READ | REQ_META | REQ_PRIO, bh);
170 wait_on_buffer(bh); 170 wait_on_buffer(bh);
171 if (!buffer_uptodate(bh)) { 171 if (!buffer_uptodate(bh)) {
172 put_bh(bh); 172 put_bh(bh);
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
666 ei = EXT4_I(inode); 666 ei = EXT4_I(inode);
667 sbi = EXT4_SB(sb); 667 sbi = EXT4_SB(sb);
668 668
669 /*
670 * Initalize owners and quota early so that we don't have to account
671 * for quota initialization worst case in standard inode creating
672 * transaction
673 */
674 if (owner) {
675 inode->i_mode = mode;
676 i_uid_write(inode, owner[0]);
677 i_gid_write(inode, owner[1]);
678 } else if (test_opt(sb, GRPID)) {
679 inode->i_mode = mode;
680 inode->i_uid = current_fsuid();
681 inode->i_gid = dir->i_gid;
682 } else
683 inode_init_owner(inode, dir, mode);
684 dquot_initialize(inode);
685
669 if (!goal) 686 if (!goal)
670 goal = sbi->s_inode_goal; 687 goal = sbi->s_inode_goal;
671 688
@@ -697,7 +714,7 @@ got_group:
697 714
698 gdp = ext4_get_group_desc(sb, group, &group_desc_bh); 715 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
699 if (!gdp) 716 if (!gdp)
700 goto fail; 717 goto out;
701 718
702 /* 719 /*
703 * Check free inodes count before loading bitmap. 720 * Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@ got_group:
711 brelse(inode_bitmap_bh); 728 brelse(inode_bitmap_bh);
712 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); 729 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
713 if (!inode_bitmap_bh) 730 if (!inode_bitmap_bh)
714 goto fail; 731 goto out;
715 732
716repeat_in_this_group: 733repeat_in_this_group:
717 ino = ext4_find_next_zero_bit((unsigned long *) 734 ino = ext4_find_next_zero_bit((unsigned long *)
@@ -733,13 +750,16 @@ repeat_in_this_group:
733 handle_type, nblocks); 750 handle_type, nblocks);
734 if (IS_ERR(handle)) { 751 if (IS_ERR(handle)) {
735 err = PTR_ERR(handle); 752 err = PTR_ERR(handle);
736 goto fail; 753 ext4_std_error(sb, err);
754 goto out;
737 } 755 }
738 } 756 }
739 BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); 757 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
740 err = ext4_journal_get_write_access(handle, inode_bitmap_bh); 758 err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
741 if (err) 759 if (err) {
742 goto fail; 760 ext4_std_error(sb, err);
761 goto out;
762 }
743 ext4_lock_group(sb, group); 763 ext4_lock_group(sb, group);
744 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); 764 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
745 ext4_unlock_group(sb, group); 765 ext4_unlock_group(sb, group);
@@ -755,8 +775,10 @@ repeat_in_this_group:
755got: 775got:
756 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); 776 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
757 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); 777 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
758 if (err) 778 if (err) {
759 goto fail; 779 ext4_std_error(sb, err);
780 goto out;
781 }
760 782
761 /* We may have to initialize the block bitmap if it isn't already */ 783 /* We may have to initialize the block bitmap if it isn't already */
762 if (ext4_has_group_desc_csum(sb) && 784 if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +790,8 @@ got:
768 err = ext4_journal_get_write_access(handle, block_bitmap_bh); 790 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
769 if (err) { 791 if (err) {
770 brelse(block_bitmap_bh); 792 brelse(block_bitmap_bh);
771 goto fail; 793 ext4_std_error(sb, err);
794 goto out;
772 } 795 }
773 796
774 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); 797 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +810,18 @@ got:
787 ext4_unlock_group(sb, group); 810 ext4_unlock_group(sb, group);
788 brelse(block_bitmap_bh); 811 brelse(block_bitmap_bh);
789 812
790 if (err) 813 if (err) {
791 goto fail; 814 ext4_std_error(sb, err);
815 goto out;
816 }
792 } 817 }
793 818
794 BUFFER_TRACE(group_desc_bh, "get_write_access"); 819 BUFFER_TRACE(group_desc_bh, "get_write_access");
795 err = ext4_journal_get_write_access(handle, group_desc_bh); 820 err = ext4_journal_get_write_access(handle, group_desc_bh);
796 if (err) 821 if (err) {
797 goto fail; 822 ext4_std_error(sb, err);
823 goto out;
824 }
798 825
799 /* Update the relevant bg descriptor fields */ 826 /* Update the relevant bg descriptor fields */
800 if (ext4_has_group_desc_csum(sb)) { 827 if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +867,10 @@ got:
840 867
841 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); 868 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
842 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); 869 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
843 if (err) 870 if (err) {
844 goto fail; 871 ext4_std_error(sb, err);
872 goto out;
873 }
845 874
846 percpu_counter_dec(&sbi->s_freeinodes_counter); 875 percpu_counter_dec(&sbi->s_freeinodes_counter);
847 if (S_ISDIR(mode)) 876 if (S_ISDIR(mode))
@@ -851,16 +880,6 @@ got:
851 flex_group = ext4_flex_group(sbi, group); 880 flex_group = ext4_flex_group(sbi, group);
852 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 881 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
853 } 882 }
854 if (owner) {
855 inode->i_mode = mode;
856 i_uid_write(inode, owner[0]);
857 i_gid_write(inode, owner[1]);
858 } else if (test_opt(sb, GRPID)) {
859 inode->i_mode = mode;
860 inode->i_uid = current_fsuid();
861 inode->i_gid = dir->i_gid;
862 } else
863 inode_init_owner(inode, dir, mode);
864 883
865 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 884 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
866 /* This is the optimal IO size (for stat), not the fs block size */ 885 /* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +908,9 @@ got:
889 * twice. 908 * twice.
890 */ 909 */
891 err = -EIO; 910 err = -EIO;
892 goto fail; 911 ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
912 inode->i_ino);
913 goto out;
893 } 914 }
894 spin_lock(&sbi->s_next_gen_lock); 915 spin_lock(&sbi->s_next_gen_lock);
895 inode->i_generation = sbi->s_next_generation++; 916 inode->i_generation = sbi->s_next_generation++;
@@ -899,7 +920,6 @@ got:
899 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 920 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
900 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 921 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
901 __u32 csum; 922 __u32 csum;
902 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
903 __le32 inum = cpu_to_le32(inode->i_ino); 923 __le32 inum = cpu_to_le32(inode->i_ino);
904 __le32 gen = cpu_to_le32(inode->i_generation); 924 __le32 gen = cpu_to_le32(inode->i_generation);
905 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, 925 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -918,7 +938,6 @@ got:
918 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 938 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
919 939
920 ret = inode; 940 ret = inode;
921 dquot_initialize(inode);
922 err = dquot_alloc_inode(inode); 941 err = dquot_alloc_inode(inode);
923 if (err) 942 if (err)
924 goto fail_drop; 943 goto fail_drop;
@@ -952,24 +971,17 @@ got:
952 971
953 ext4_debug("allocating inode %lu\n", inode->i_ino); 972 ext4_debug("allocating inode %lu\n", inode->i_ino);
954 trace_ext4_allocate_inode(inode, dir, mode); 973 trace_ext4_allocate_inode(inode, dir, mode);
955 goto really_out;
956fail:
957 ext4_std_error(sb, err);
958out:
959 iput(inode);
960 ret = ERR_PTR(err);
961really_out:
962 brelse(inode_bitmap_bh); 974 brelse(inode_bitmap_bh);
963 return ret; 975 return ret;
964 976
965fail_free_drop: 977fail_free_drop:
966 dquot_free_inode(inode); 978 dquot_free_inode(inode);
967
968fail_drop: 979fail_drop:
969 dquot_drop(inode);
970 inode->i_flags |= S_NOQUOTA;
971 clear_nlink(inode); 980 clear_nlink(inode);
972 unlock_new_inode(inode); 981 unlock_new_inode(inode);
982out:
983 dquot_drop(inode);
984 inode->i_flags |= S_NOQUOTA;
973 iput(inode); 985 iput(inode);
974 brelse(inode_bitmap_bh); 986 brelse(inode_bitmap_bh);
975 return ERR_PTR(err); 987 return ERR_PTR(err);
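
Two structural changes run through these hunks: every failure path now reports via ext4_std_error() at the call site and funnels into a single out: label that handles quota teardown, and the owner/quota setup moves ahead of the first journal start. A simplified sketch of the resulting ordering in __ext4_new_inode() (error handling elided; names taken from the patch):

	/* 1. cheap, journal-free setup */
	inode->i_mode = mode;
	inode_init_owner(inode, dir, mode);	/* or explicit owner[]/GRPID */
	dquot_initialize(inode);		/* may journal on its own */

	/* 2. find a group, then run the allocation under a handle */
	handle = ext4_journal_start(dir, handle_type, nblocks);
	/* ... bitmap, group descriptor and counter updates ... */
	err = dquot_alloc_inode(inode);		/* charged inside the handle */

Because dquot_initialize() now runs before the handle exists, the transaction credit estimate no longer has to cover the quota-initialization worst case.
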
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a04183127ef0..98be6f697463 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -292,131 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
292} 292}
293 293
294/** 294/**
295 * ext4_alloc_blocks: allocate multiple blocks needed for a branch
296 * @handle: handle for this transaction
297 * @inode: inode which needs allocated blocks
298 * @iblock: the logical block to start allocated at
299 * @goal: preferred physical block of allocation
300 * @indirect_blks: the number of blocks need to allocate for indirect
301 * blocks
302 * @blks: number of desired blocks
303 * @new_blocks: on return it will store the new block numbers for
304 * the indirect blocks(if needed) and the first direct block,
305 * @err: on return it will store the error code
306 *
307 * This function will return the number of blocks allocated as
308 * requested by the passed-in parameters.
309 */
310static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
311 ext4_lblk_t iblock, ext4_fsblk_t goal,
312 int indirect_blks, int blks,
313 ext4_fsblk_t new_blocks[4], int *err)
314{
315 struct ext4_allocation_request ar;
316 int target, i;
317 unsigned long count = 0, blk_allocated = 0;
318 int index = 0;
319 ext4_fsblk_t current_block = 0;
320 int ret = 0;
321
322 /*
323 * Here we try to allocate the requested multiple blocks at once,
324 * on a best-effort basis.
325 * To build a branch, we should allocate blocks for
326 * the indirect blocks(if not allocated yet), and at least
327 * the first direct block of this branch. That's the
328 * minimum number of blocks we need to allocate (required)
329 */
330 /* first we try to allocate the indirect blocks */
331 target = indirect_blks;
332 while (target > 0) {
333 count = target;
334 /* allocating blocks for indirect blocks and direct blocks */
335 current_block = ext4_new_meta_blocks(handle, inode, goal,
336 0, &count, err);
337 if (*err)
338 goto failed_out;
339
340 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
341 EXT4_ERROR_INODE(inode,
342 "current_block %llu + count %lu > %d!",
343 current_block, count,
344 EXT4_MAX_BLOCK_FILE_PHYS);
345 *err = -EIO;
346 goto failed_out;
347 }
348
349 target -= count;
350 /* allocate blocks for indirect blocks */
351 while (index < indirect_blks && count) {
352 new_blocks[index++] = current_block++;
353 count--;
354 }
355 if (count > 0) {
356 /*
357 * save the new block number
358 * for the first direct block
359 */
360 new_blocks[index] = current_block;
361 WARN(1, KERN_INFO "%s returned more blocks than "
362 "requested\n", __func__);
363 break;
364 }
365 }
366
367 target = blks - count ;
368 blk_allocated = count;
369 if (!target)
370 goto allocated;
371 /* Now allocate data blocks */
372 memset(&ar, 0, sizeof(ar));
373 ar.inode = inode;
374 ar.goal = goal;
375 ar.len = target;
376 ar.logical = iblock;
377 if (S_ISREG(inode->i_mode))
378 /* enable in-core preallocation only for regular files */
379 ar.flags = EXT4_MB_HINT_DATA;
380
381 current_block = ext4_mb_new_blocks(handle, &ar, err);
382 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
383 EXT4_ERROR_INODE(inode,
384 "current_block %llu + ar.len %d > %d!",
385 current_block, ar.len,
386 EXT4_MAX_BLOCK_FILE_PHYS);
387 *err = -EIO;
388 goto failed_out;
389 }
390
391 if (*err && (target == blks)) {
392 /*
393 * if the allocation failed and we didn't allocate
394 * any blocks before
395 */
396 goto failed_out;
397 }
398 if (!*err) {
399 if (target == blks) {
400 /*
401 * save the new block number
402 * for the first direct block
403 */
404 new_blocks[index] = current_block;
405 }
406 blk_allocated += ar.len;
407 }
408allocated:
409 /* total number of blocks allocated for direct blocks */
410 ret = blk_allocated;
411 *err = 0;
412 return ret;
413failed_out:
414 for (i = 0; i < index; i++)
415 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
416 return ret;
417}
418
419/**
420 * ext4_alloc_branch - allocate and set up a chain of blocks. 295 * ext4_alloc_branch - allocate and set up a chain of blocks.
421 * @handle: handle for this transaction 296 * @handle: handle for this transaction
422 * @inode: owner 297 * @inode: owner
@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
448 int *blks, ext4_fsblk_t goal, 323 int *blks, ext4_fsblk_t goal,
449 ext4_lblk_t *offsets, Indirect *branch) 324 ext4_lblk_t *offsets, Indirect *branch)
450{ 325{
451 int blocksize = inode->i_sb->s_blocksize; 326 struct ext4_allocation_request ar;
452 int i, n = 0; 327 struct buffer_head * bh;
453 int err = 0; 328 ext4_fsblk_t b, new_blocks[4];
454 struct buffer_head *bh; 329 __le32 *p;
455 int num; 330 int i, j, err, len = 1;
456 ext4_fsblk_t new_blocks[4];
457 ext4_fsblk_t current_block;
458
459 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
460 *blks, new_blocks, &err);
461 if (err)
462 return err;
463 331
464 branch[0].key = cpu_to_le32(new_blocks[0]);
465 /* 332 /*
466 * metadata blocks and data blocks are allocated. 333 * Set up for the direct block allocation
467 */ 334 */
468 for (n = 1; n <= indirect_blks; n++) { 335 memset(&ar, 0, sizeof(ar));
469 /* 336 ar.inode = inode;
470 * Get buffer_head for parent block, zero it out 337 ar.len = *blks;
471 * and set the pointer to new one, then send 338 ar.logical = iblock;
472 * parent to disk. 339 if (S_ISREG(inode->i_mode))
473 */ 340 ar.flags = EXT4_MB_HINT_DATA;
474 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 341
342 for (i = 0; i <= indirect_blks; i++) {
343 if (i == indirect_blks) {
344 ar.goal = goal;
345 new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
346 } else
347 goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
348 goal, 0, NULL, &err);
349 if (err) {
350 i--;
351 goto failed;
352 }
353 branch[i].key = cpu_to_le32(new_blocks[i]);
354 if (i == 0)
355 continue;
356
357 bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
475 if (unlikely(!bh)) { 358 if (unlikely(!bh)) {
476 err = -ENOMEM; 359 err = -ENOMEM;
477 goto failed; 360 goto failed;
478 } 361 }
479
480 branch[n].bh = bh;
481 lock_buffer(bh); 362 lock_buffer(bh);
482 BUFFER_TRACE(bh, "call get_create_access"); 363 BUFFER_TRACE(bh, "call get_create_access");
483 err = ext4_journal_get_create_access(handle, bh); 364 err = ext4_journal_get_create_access(handle, bh);
484 if (err) { 365 if (err) {
485 /* Don't brelse(bh) here; it's done in
486 * ext4_journal_forget() below */
487 unlock_buffer(bh); 366 unlock_buffer(bh);
488 goto failed; 367 goto failed;
489 } 368 }
490 369
491 memset(bh->b_data, 0, blocksize); 370 memset(bh->b_data, 0, bh->b_size);
492 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 371 p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
493 branch[n].key = cpu_to_le32(new_blocks[n]); 372 b = new_blocks[i];
494 *branch[n].p = branch[n].key; 373
495 if (n == indirect_blks) { 374 if (i == indirect_blks)
496 current_block = new_blocks[n]; 375 len = ar.len;
497 /* 376 for (j = 0; j < len; j++)
498 * End of chain, update the last new metablock of 377 *p++ = cpu_to_le32(b++);
499 * the chain to point to the new allocated 378
500 * data blocks numbers
501 */
502 for (i = 1; i < num; i++)
503 *(branch[n].p + i) = cpu_to_le32(++current_block);
504 }
505 BUFFER_TRACE(bh, "marking uptodate"); 379 BUFFER_TRACE(bh, "marking uptodate");
506 set_buffer_uptodate(bh); 380 set_buffer_uptodate(bh);
507 unlock_buffer(bh); 381 unlock_buffer(bh);
@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
511 if (err) 385 if (err)
512 goto failed; 386 goto failed;
513 } 387 }
514 *blks = num; 388 *blks = ar.len;
515 return err; 389 return 0;
516failed: 390failed:
517 /* Allocation failed, free what we already allocated */ 391 for (; i >= 0; i--) {
518 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 392 if (i != indirect_blks && branch[i].bh)
519 for (i = 1; i <= n ; i++) { 393 ext4_forget(handle, 1, inode, branch[i].bh,
520 /* 394 branch[i].bh->b_blocknr);
521 * branch[i].bh is newly allocated, so there is no 395 ext4_free_blocks(handle, inode, NULL, new_blocks[i],
522 * need to revoke the block, which is why we don't 396 (i == indirect_blks) ? ar.len : 1, 0);
523 * need to set EXT4_FREE_BLOCKS_METADATA.
524 */
525 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
526 EXT4_FREE_BLOCKS_FORGET);
527 } 397 }
528 for (i = n+1; i < indirect_blks; i++)
529 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
530
531 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
532
533 return err; 398 return err;
534} 399}
535 400
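
To make the new control flow concrete, here is a worked pass through the rewritten ext4_alloc_branch() loop, with block numbers invented for illustration:

	/*
	 * indirect_blks = 2, *blks = 8:
	 *
	 *   i = 0:  new_blocks[0] = ext4_new_meta_blocks(..., goal, ...) -> 1000
	 *           branch[0].key = 1000; no bh (the parent is the inode)
	 *   i = 1:  goal = 1000; new_blocks[1] = ext4_new_meta_blocks()  -> 1001
	 *           branch[1].bh = sb_getblk(sb, 1000), zeroed;
	 *           *branch[1].p = cpu_to_le32(1001) at offsets[1]
	 *   i = 2 (== indirect_blks):
	 *           ar.goal = 1001; new_blocks[2] = ext4_mb_new_blocks() -> 1002,
	 *           ar.len = 8 (a contiguous run when mballoc can manage it);
	 *           branch[2].bh = sb_getblk(sb, 1001), zeroed;
	 *           1002..1009 written back to back starting at offsets[2]
	 *
	 * On failure the loop unwinds from the last i: metadata buffers are
	 * revoked with ext4_forget() and each level freed (the data run with
	 * count ar.len, metadata blocks one at a time).
	 */
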
@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
941 * be able to restart the transaction at a convenient checkpoint to make 806 * sure we don't overflow the journal.
942 * sure we don't overflow the journal. 807 * sure we don't overflow the journal.
943 * 808 *
944 * start_transaction gets us a new handle for a truncate transaction, 809 * Try to extend this transaction for the purposes of truncation. If
945 * and extend_transaction tries to extend the existing one a bit. If
946 * extend fails, we need to propagate the failure up and restart the 810 * extend fails, we need to propagate the failure up and restart the
947 * transaction in the top-level truncate loop. --sct 811 * transaction in the top-level truncate loop. --sct
948 */
949static handle_t *start_transaction(struct inode *inode)
950{
951 handle_t *result;
952
953 result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
954 ext4_blocks_for_truncate(inode));
955 if (!IS_ERR(result))
956 return result;
957
958 ext4_std_error(inode->i_sb, PTR_ERR(result));
959 return result;
960}
961
962/*
963 * Try to extend this transaction for the purposes of truncation.
964 * 812 *
965 * Returns 0 if we managed to create more room. If we can't create more 813 * Returns 0 if we managed to create more room. If we can't create more
966 * room, and the transaction must be restarted we return 1. 814 * room, and the transaction must be restarted we return 1.
@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
1353 } 1201 }
1354} 1202}
1355 1203
1356void ext4_ind_truncate(struct inode *inode) 1204void ext4_ind_truncate(handle_t *handle, struct inode *inode)
1357{ 1205{
1358 handle_t *handle;
1359 struct ext4_inode_info *ei = EXT4_I(inode); 1206 struct ext4_inode_info *ei = EXT4_I(inode);
1360 __le32 *i_data = ei->i_data; 1207 __le32 *i_data = ei->i_data;
1361 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1208 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1362 struct address_space *mapping = inode->i_mapping;
1363 ext4_lblk_t offsets[4]; 1209 ext4_lblk_t offsets[4];
1364 Indirect chain[4]; 1210 Indirect chain[4];
1365 Indirect *partial; 1211 Indirect *partial;
1366 __le32 nr = 0; 1212 __le32 nr = 0;
1367 int n = 0; 1213 int n = 0;
1368 ext4_lblk_t last_block, max_block; 1214 ext4_lblk_t last_block, max_block;
1369 loff_t page_len;
1370 unsigned blocksize = inode->i_sb->s_blocksize; 1215 unsigned blocksize = inode->i_sb->s_blocksize;
1371 int err;
1372
1373 handle = start_transaction(inode);
1374 if (IS_ERR(handle))
1375 return; /* AKPM: return what? */
1376 1216
1377 last_block = (inode->i_size + blocksize-1) 1217 last_block = (inode->i_size + blocksize-1)
1378 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1218 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1379 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1219 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1380 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1220 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1381 1221
1382 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
1383 page_len = PAGE_CACHE_SIZE -
1384 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1385
1386 err = ext4_discard_partial_page_buffers(handle,
1387 mapping, inode->i_size, page_len, 0);
1388
1389 if (err)
1390 goto out_stop;
1391 }
1392
1393 if (last_block != max_block) { 1222 if (last_block != max_block) {
1394 n = ext4_block_to_path(inode, last_block, offsets, NULL); 1223 n = ext4_block_to_path(inode, last_block, offsets, NULL);
1395 if (n == 0) 1224 if (n == 0)
1396 goto out_stop; /* error */ 1225 return;
1397 } 1226 }
1398 1227
1399 /*
1400 * OK. This truncate is going to happen. We add the inode to the
1401 * orphan list, so that if this truncate spans multiple transactions,
1402 * and we crash, we will resume the truncate when the filesystem
1403 * recovers. It also marks the inode dirty, to catch the new size.
1404 *
1405 * Implication: the file must always be in a sane, consistent
1406 * truncatable state while each transaction commits.
1407 */
1408 if (ext4_orphan_add(handle, inode))
1409 goto out_stop;
1410
1411 /*
1412 * From here we block out all ext4_get_block() callers who want to
1413 * modify the block allocation tree.
1414 */
1415 down_write(&ei->i_data_sem);
1416
1417 ext4_discard_preallocations(inode);
1418 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); 1228 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
1419 1229
1420 /* 1230 /*
@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode)
1431 * It is unnecessary to free any data blocks if last_block is 1241 * It is unnecessary to free any data blocks if last_block is
1432 * equal to the indirect block limit. 1242 * equal to the indirect block limit.
1433 */ 1243 */
1434 goto out_unlock; 1244 return;
1435 } else if (n == 1) { /* direct blocks */ 1245 } else if (n == 1) { /* direct blocks */
1436 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 1246 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
1437 i_data + EXT4_NDIR_BLOCKS); 1247 i_data + EXT4_NDIR_BLOCKS);
@@ -1491,31 +1301,6 @@ do_indirects:
1491 case EXT4_TIND_BLOCK: 1301 case EXT4_TIND_BLOCK:
1492 ; 1302 ;
1493 } 1303 }
1494
1495out_unlock:
1496 up_write(&ei->i_data_sem);
1497 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1498 ext4_mark_inode_dirty(handle, inode);
1499
1500 /*
1501 * In a multi-transaction truncate, we only make the final transaction
1502 * synchronous
1503 */
1504 if (IS_SYNC(inode))
1505 ext4_handle_sync(handle);
1506out_stop:
1507 /*
1508 * If this was a simple ftruncate(), and the file will remain alive
1509 * then we need to clear up the orphan record which we created above.
1510 * However, if this was a real unlink then we were called by
1511 * ext4_delete_inode(), and we allow that function to clean up the
1512 * orphan info for us.
1513 */
1514 if (inode->i_nlink)
1515 ext4_orphan_del(handle, inode);
1516
1517 ext4_journal_stop(handle);
1518 trace_ext4_truncate_exit(inode);
1519} 1304}
1520 1305
1521static int free_hole_blocks(handle_t *handle, struct inode *inode, 1306static int free_hole_blocks(handle_t *handle, struct inode *inode,
@@ -1569,8 +1354,8 @@ err:
1569 return ret; 1354 return ret;
1570} 1355}
1571 1356
1572static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 1357int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
1573 ext4_lblk_t first, ext4_lblk_t stop) 1358 ext4_lblk_t first, ext4_lblk_t stop)
1574{ 1359{
1575 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1360 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1576 int level, ret = 0; 1361 int level, ret = 0;
@@ -1604,157 +1389,3 @@ err:
1604 return ret; 1389 return ret;
1605} 1390}
1606 1391
1607int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
1608{
1609 struct inode *inode = file_inode(file);
1610 struct super_block *sb = inode->i_sb;
1611 ext4_lblk_t first_block, stop_block;
1612 struct address_space *mapping = inode->i_mapping;
1613 handle_t *handle = NULL;
1614 loff_t first_page, last_page, page_len;
1615 loff_t first_page_offset, last_page_offset;
1616 int err = 0;
1617
1618 /*
1619 * Write out all dirty pages to avoid race conditions
1620 * Then release them.
1621 */
1622 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
1623 err = filemap_write_and_wait_range(mapping,
1624 offset, offset + length - 1);
1625 if (err)
1626 return err;
1627 }
1628
1629 mutex_lock(&inode->i_mutex);
1630 /* It's not possible punch hole on append only file */
1631 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
1632 err = -EPERM;
1633 goto out_mutex;
1634 }
1635 if (IS_SWAPFILE(inode)) {
1636 err = -ETXTBSY;
1637 goto out_mutex;
1638 }
1639
1640 /* No need to punch hole beyond i_size */
1641 if (offset >= inode->i_size)
1642 goto out_mutex;
1643
1644 /*
1645 * If the hole extends beyond i_size, set the hole
1646 * to end after the page that contains i_size
1647 */
1648 if (offset + length > inode->i_size) {
1649 length = inode->i_size +
1650 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
1651 offset;
1652 }
1653
1654 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1655 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
1656
1657 first_page_offset = first_page << PAGE_CACHE_SHIFT;
1658 last_page_offset = last_page << PAGE_CACHE_SHIFT;
1659
1660 /* Now release the pages */
1661 if (last_page_offset > first_page_offset) {
1662 truncate_pagecache_range(inode, first_page_offset,
1663 last_page_offset - 1);
1664 }
1665
1666 /* Wait for all existing dio workers; newcomers will block on i_mutex */
1667 inode_dio_wait(inode);
1668
1669 handle = start_transaction(inode);
1670 if (IS_ERR(handle))
1671 goto out_mutex;
1672
1673 /*
1674 * Now we need to zero out the non-page-aligned data in the
1675 * pages at the start and tail of the hole, and unmap the buffer
1676 * heads for the block aligned regions of the page that were
1677 * completely zeroed.
1678 */
1679 if (first_page > last_page) {
1680 /*
1681 * If the file space being truncated is contained within a page
1682 * just zero out and unmap the middle of that page
1683 */
1684 err = ext4_discard_partial_page_buffers(handle,
1685 mapping, offset, length, 0);
1686 if (err)
1687 goto out;
1688 } else {
1689 /*
1690 * Zero out and unmap the partial page that contains
1691 * the start of the hole
1692 */
1693 page_len = first_page_offset - offset;
1694 if (page_len > 0) {
1695 err = ext4_discard_partial_page_buffers(handle, mapping,
1696 offset, page_len, 0);
1697 if (err)
1698 goto out;
1699 }
1700
1701 /*
1702 * Zero out and unmap the partial page that contains
1703 * the end of the hole
1704 */
1705 page_len = offset + length - last_page_offset;
1706 if (page_len > 0) {
1707 err = ext4_discard_partial_page_buffers(handle, mapping,
1708 last_page_offset, page_len, 0);
1709 if (err)
1710 goto out;
1711 }
1712 }
1713
1714 /*
1715 * If i_size is contained in the last page, we need to
1716 * unmap and zero the partial page after i_size
1717 */
1718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
1719 inode->i_size % PAGE_CACHE_SIZE != 0) {
1720 page_len = PAGE_CACHE_SIZE -
1721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1722 if (page_len > 0) {
1723 err = ext4_discard_partial_page_buffers(handle,
1724 mapping, inode->i_size, page_len, 0);
1725 if (err)
1726 goto out;
1727 }
1728 }
1729
1730 first_block = (offset + sb->s_blocksize - 1) >>
1731 EXT4_BLOCK_SIZE_BITS(sb);
1732 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
1733
1734 if (first_block >= stop_block)
1735 goto out;
1736
1737 down_write(&EXT4_I(inode)->i_data_sem);
1738 ext4_discard_preallocations(inode);
1739
1740 err = ext4_es_remove_extent(inode, first_block,
1741 stop_block - first_block);
1742 err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
1743
1744 ext4_discard_preallocations(inode);
1745
1746 if (IS_SYNC(inode))
1747 ext4_handle_sync(handle);
1748
1749 up_write(&EXT4_I(inode)->i_data_sem);
1750
1751out:
1752 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1753 ext4_mark_inode_dirty(handle, inode);
1754 ext4_journal_stop(handle);
1755
1756out_mutex:
1757 mutex_unlock(&inode->i_mutex);
1758
1759 return err;
1760}
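
ext4_free_hole_blocks() loses its static qualifier because the series consolidates the two punch-hole entry points (the removed ext4_ext_punch_hole() above and ext4_ind_punch_hole() here) into one path in inode.c. A sketch of the kind of dispatch that export enables; the actual consolidated ext4_punch_hole() may differ in detail:

	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_discard_preallocations(inode);
	err = ext4_es_remove_extent(inode, first_block,
				    stop_block - first_block);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		err = ext4_ext_remove_space(inode, first_block,
					    stop_block - 1);
	else
		err = ext4_free_hole_blocks(handle, inode,
					    first_block, stop_block);
	ext4_discard_preallocations(inode);
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
	up_write(&EXT4_I(inode)->i_data_sem);
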
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c0fd1a123f7d..3e2bf873e8a8 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -19,7 +19,8 @@
19 19
20#define EXT4_XATTR_SYSTEM_DATA "data" 20#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) 21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
22#define EXT4_INLINE_DOTDOT_SIZE 4 22#define EXT4_INLINE_DOTDOT_OFFSET 2
23#define EXT4_INLINE_DOTDOT_SIZE 4
23 24
24int ext4_get_inline_size(struct inode *inode) 25int ext4_get_inline_size(struct inode *inode)
25{ 26{
@@ -1289,6 +1290,120 @@ out:
1289 return ret; 1290 return ret;
1290} 1291}
1291 1292
1293/*
1294 * This function fills a red-black tree with information from an
1295 * inlined dir. It returns the number of directory entries loaded
1296 * into the tree. If there is an error it is returned in err.
1297 */
1298int htree_inlinedir_to_tree(struct file *dir_file,
1299 struct inode *dir, ext4_lblk_t block,
1300 struct dx_hash_info *hinfo,
1301 __u32 start_hash, __u32 start_minor_hash,
1302 int *has_inline_data)
1303{
1304 int err = 0, count = 0;
1305 unsigned int parent_ino;
1306 int pos;
1307 struct ext4_dir_entry_2 *de;
1308 struct inode *inode = file_inode(dir_file);
1309 int ret, inline_size = 0;
1310 struct ext4_iloc iloc;
1311 void *dir_buf = NULL;
1312 struct ext4_dir_entry_2 fake;
1313
1314 ret = ext4_get_inode_loc(inode, &iloc);
1315 if (ret)
1316 return ret;
1317
1318 down_read(&EXT4_I(inode)->xattr_sem);
1319 if (!ext4_has_inline_data(inode)) {
1320 up_read(&EXT4_I(inode)->xattr_sem);
1321 *has_inline_data = 0;
1322 goto out;
1323 }
1324
1325 inline_size = ext4_get_inline_size(inode);
1326 dir_buf = kmalloc(inline_size, GFP_NOFS);
1327 if (!dir_buf) {
1328 ret = -ENOMEM;
1329 up_read(&EXT4_I(inode)->xattr_sem);
1330 goto out;
1331 }
1332
1333 ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
1334 up_read(&EXT4_I(inode)->xattr_sem);
1335 if (ret < 0)
1336 goto out;
1337
1338 pos = 0;
1339 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1340 while (pos < inline_size) {
1341 /*
1342 * As an inlined dir doesn't store any information about '.' and
1343 * only the inode number of '..' is stored, we have to handle
1344 * them differently.
1345 */
1346 if (pos == 0) {
1347 fake.inode = cpu_to_le32(inode->i_ino);
1348 fake.name_len = 1;
1349 strcpy(fake.name, ".");
1350 fake.rec_len = ext4_rec_len_to_disk(
1351 EXT4_DIR_REC_LEN(fake.name_len),
1352 inline_size);
1353 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
1354 de = &fake;
1355 pos = EXT4_INLINE_DOTDOT_OFFSET;
1356 } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
1357 fake.inode = cpu_to_le32(parent_ino);
1358 fake.name_len = 2;
1359 strcpy(fake.name, "..");
1360 fake.rec_len = ext4_rec_len_to_disk(
1361 EXT4_DIR_REC_LEN(fake.name_len),
1362 inline_size);
1363 ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
1364 de = &fake;
1365 pos = EXT4_INLINE_DOTDOT_SIZE;
1366 } else {
1367 de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
1368 pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
1369 if (ext4_check_dir_entry(inode, dir_file, de,
1370 iloc.bh, dir_buf,
1371 inline_size, pos)) {
1372 ret = count;
1373 goto out;
1374 }
1375 }
1376
1377 ext4fs_dirhash(de->name, de->name_len, hinfo);
1378 if ((hinfo->hash < start_hash) ||
1379 ((hinfo->hash == start_hash) &&
1380 (hinfo->minor_hash < start_minor_hash)))
1381 continue;
1382 if (de->inode == 0)
1383 continue;
1384 err = ext4_htree_store_dirent(dir_file,
1385 hinfo->hash, hinfo->minor_hash, de);
1386 if (err) {
1387 count = err;
1388 goto out;
1389 }
1390 count++;
1391 }
1392 ret = count;
1393out:
1394 kfree(dir_buf);
1395 brelse(iloc.bh);
1396 return ret;
1397}
1398
1399/*
1400 * So this function is called when the volume is mkfsed with
1401 * dir_index disabled. In order to keep f_pos persistent
1402 * after we convert from an inlined dir to a block-based one,
1403 * we just pretend that we are a normal dir and return the
1404 * offset as if '.' and '..' really occupied their usual space.
1405 *
1406 */
1292int ext4_read_inline_dir(struct file *filp, 1407int ext4_read_inline_dir(struct file *filp,
1293 void *dirent, filldir_t filldir, 1408 void *dirent, filldir_t filldir,
1294 int *has_inline_data) 1409 int *has_inline_data)
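
Since the inline area stores no '.' entry and only the parent inode number for '..', the new helper manufactures both on the fly. The resulting walk over the inline buffer:

	/*
	 * Iteration order in htree_inlinedir_to_tree():
	 *
	 *   pos == 0: synthesize "." from inode->i_ino, then jump to
	 *             EXT4_INLINE_DOTDOT_OFFSET (2)
	 *   pos == 2: synthesize ".." from the parent ino in the first
	 *             __le32, then jump to EXT4_INLINE_DOTDOT_SIZE (4)
	 *   pos >= 4: real ext4_dir_entry_2 records, laid out back to
	 *             back until pos reaches inline_size
	 */
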
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
1302 int ret, inline_size = 0; 1417 int ret, inline_size = 0;
1303 struct ext4_iloc iloc; 1418 struct ext4_iloc iloc;
1304 void *dir_buf = NULL; 1419 void *dir_buf = NULL;
1420 int dotdot_offset, dotdot_size, extra_offset, extra_size;
1305 1421
1306 ret = ext4_get_inode_loc(inode, &iloc); 1422 ret = ext4_get_inode_loc(inode, &iloc);
1307 if (ret) 1423 if (ret)
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
1330 sb = inode->i_sb; 1446 sb = inode->i_sb;
1331 stored = 0; 1447 stored = 0;
1332 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); 1448 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1449 offset = filp->f_pos;
1333 1450
1334 while (!error && !stored && filp->f_pos < inode->i_size) { 1451 /*
1452 * dotdot_offset and dotdot_size are the real offset and
1453 * size for "." and ".." if the dir were block based, while
1454 * in an inline dir they only take EXT4_INLINE_DOTDOT_SIZE bytes.
1455 * So we use extra_offset and extra_size to translate positions
1456 * during the inline dir iteration.
1457 */
1458 dotdot_offset = EXT4_DIR_REC_LEN(1);
1459 dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
1460 extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
1461 extra_size = extra_offset + inline_size;
1462
1463 while (!error && !stored && filp->f_pos < extra_size) {
1335revalidate: 1464revalidate:
1336 /* 1465 /*
1337 * If the version has changed since the last call to 1466 * If the version has changed since the last call to
@@ -1340,15 +1469,23 @@ revalidate:
1340 * dir to make sure. 1469 * dir to make sure.
1341 */ 1470 */
1342 if (filp->f_version != inode->i_version) { 1471 if (filp->f_version != inode->i_version) {
1343 for (i = 0; 1472 for (i = 0; i < extra_size && i < offset;) {
1344 i < inode->i_size && i < offset;) { 1473 /*
1474 * "." is with offset 0 and
1475 * ".." is dotdot_offset.
1476 */
1345 if (!i) { 1477 if (!i) {
1346 /* skip "." and ".." if needed. */ 1478 i = dotdot_offset;
1347 i += EXT4_INLINE_DOTDOT_SIZE; 1479 continue;
1480 } else if (i == dotdot_offset) {
1481 i = dotdot_size;
1348 continue; 1482 continue;
1349 } 1483 }
1484 /* for other entry, the real offset in
1485 * the buf has to be tuned accordingly.
1486 */
1350 de = (struct ext4_dir_entry_2 *) 1487 de = (struct ext4_dir_entry_2 *)
1351 (dir_buf + i); 1488 (dir_buf + i - extra_offset);
1352 /* It's too expensive to do a full 1489 /* It's too expensive to do a full
1353 * dirent test each time round this 1490 * dirent test each time round this
1354 * loop, but we do have to test at 1491 * loop, but we do have to test at
@@ -1356,43 +1493,47 @@ revalidate:
1356 * failure will be detected in the 1493 * failure will be detected in the
1357 * dirent test below. */ 1494 * dirent test below. */
1358 if (ext4_rec_len_from_disk(de->rec_len, 1495 if (ext4_rec_len_from_disk(de->rec_len,
1359 inline_size) < EXT4_DIR_REC_LEN(1)) 1496 extra_size) < EXT4_DIR_REC_LEN(1))
1360 break; 1497 break;
1361 i += ext4_rec_len_from_disk(de->rec_len, 1498 i += ext4_rec_len_from_disk(de->rec_len,
1362 inline_size); 1499 extra_size);
1363 } 1500 }
1364 offset = i; 1501 offset = i;
1365 filp->f_pos = offset; 1502 filp->f_pos = offset;
1366 filp->f_version = inode->i_version; 1503 filp->f_version = inode->i_version;
1367 } 1504 }
1368 1505
1369 while (!error && filp->f_pos < inode->i_size) { 1506 while (!error && filp->f_pos < extra_size) {
1370 if (filp->f_pos == 0) { 1507 if (filp->f_pos == 0) {
1371 error = filldir(dirent, ".", 1, 0, inode->i_ino, 1508 error = filldir(dirent, ".", 1, 0, inode->i_ino,
1372 DT_DIR); 1509 DT_DIR);
1373 if (error) 1510 if (error)
1374 break; 1511 break;
1375 stored++; 1512 stored++;
1513 filp->f_pos = dotdot_offset;
1514 continue;
1515 }
1376 1516
1377 error = filldir(dirent, "..", 2, 0, parent_ino, 1517 if (filp->f_pos == dotdot_offset) {
1378 DT_DIR); 1518 error = filldir(dirent, "..", 2,
1519 dotdot_offset,
1520 parent_ino, DT_DIR);
1379 if (error) 1521 if (error)
1380 break; 1522 break;
1381 stored++; 1523 stored++;
1382 1524
1383 filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; 1525 filp->f_pos = dotdot_size;
1384 continue; 1526 continue;
1385 } 1527 }
1386 1528
1387 de = (struct ext4_dir_entry_2 *)(dir_buf + offset); 1529 de = (struct ext4_dir_entry_2 *)
1530 (dir_buf + filp->f_pos - extra_offset);
1388 if (ext4_check_dir_entry(inode, filp, de, 1531 if (ext4_check_dir_entry(inode, filp, de,
1389 iloc.bh, dir_buf, 1532 iloc.bh, dir_buf,
1390 inline_size, offset)) { 1533 extra_size, filp->f_pos)) {
1391 ret = stored; 1534 ret = stored;
1392 goto out; 1535 goto out;
1393 } 1536 }
1394 offset += ext4_rec_len_from_disk(de->rec_len,
1395 inline_size);
1396 if (le32_to_cpu(de->inode)) { 1537 if (le32_to_cpu(de->inode)) {
1397 /* We might block in the next section 1538 /* We might block in the next section
1398 * if the data destination is 1539 * if the data destination is
@@ -1415,9 +1556,8 @@ revalidate:
1415 stored++; 1556 stored++;
1416 } 1557 }
1417 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 1558 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
1418 inline_size); 1559 extra_size);
1419 } 1560 }
1420 offset = 0;
1421 } 1561 }
1422out: 1562out:
1423 kfree(dir_buf); 1563 kfree(dir_buf);
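
The offset translation above is easy to verify by hand. Assuming the classic dirent length rounding, EXT4_DIR_REC_LEN(len) = (len + 8 + 3) & ~3 (an assumption stated here, not part of this patch), the constants work out as in this small userspace check:

	#include <stdio.h>

	#define EXT4_DIR_ROUND 3
	#define EXT4_DIR_REC_LEN(len) (((len) + 8 + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)
	#define EXT4_INLINE_DOTDOT_SIZE 4

	int main(void)
	{
		int dotdot_offset = EXT4_DIR_REC_LEN(1);		/* 12 */
		int dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);	/* 24 */
		int extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; /* 20 */

		printf("dotdot_offset=%d dotdot_size=%d extra_offset=%d\n",
		       dotdot_offset, dotdot_size, extra_offset);
		return 0;
	}

So an entry at inline-buffer offset i is published at f_pos = i + 20; the first real entry at inline offset 4 lands at f_pos 24, exactly where it would sit in a block-based directory behind regular "." and ".." records, which is what keeps f_pos stable across a later conversion.
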
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3a5213bc73e..793d44b84d7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
55 __u16 csum_hi = 0; 55 __u16 csum_hi = 0;
56 __u32 csum; 56 __u32 csum;
57 57
58 csum_lo = raw->i_checksum_lo; 58 csum_lo = le16_to_cpu(raw->i_checksum_lo);
59 raw->i_checksum_lo = 0; 59 raw->i_checksum_lo = 0;
60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { 61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
62 csum_hi = raw->i_checksum_hi; 62 csum_hi = le16_to_cpu(raw->i_checksum_hi);
63 raw->i_checksum_hi = 0; 63 raw->i_checksum_hi = 0;
64 } 64 }
65 65
66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, 66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
67 EXT4_INODE_SIZE(inode->i_sb)); 67 EXT4_INODE_SIZE(inode->i_sb));
68 68
69 raw->i_checksum_lo = csum_lo; 69 raw->i_checksum_lo = cpu_to_le16(csum_lo);
70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
72 raw->i_checksum_hi = csum_hi; 72 raw->i_checksum_hi = cpu_to_le16(csum_hi);
73 73
74 return csum; 74 return csum;
75} 75}
@@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode)
210 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 210 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
211 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; 211 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
212 212
213 jbd2_log_start_commit(journal, commit_tid); 213 jbd2_complete_transaction(journal, commit_tid);
214 jbd2_log_wait_commit(journal, commit_tid);
215 filemap_write_and_wait(&inode->i_data); 214 filemap_write_and_wait(&inode->i_data);
216 } 215 }
217 truncate_inode_pages(&inode->i_data, 0); 216 truncate_inode_pages(&inode->i_data, 0);
@@ -1081,20 +1080,42 @@ retry_journal:
1081/* For write_end() in data=journal mode */ 1080/* For write_end() in data=journal mode */
1082static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1081static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1083{ 1082{
1083 int ret;
1084 if (!buffer_mapped(bh) || buffer_freed(bh)) 1084 if (!buffer_mapped(bh) || buffer_freed(bh))
1085 return 0; 1085 return 0;
1086 set_buffer_uptodate(bh); 1086 set_buffer_uptodate(bh);
1087 return ext4_handle_dirty_metadata(handle, NULL, bh); 1087 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1088 clear_buffer_meta(bh);
1089 clear_buffer_prio(bh);
1090 return ret;
1088} 1091}
1089 1092
1090static int ext4_generic_write_end(struct file *file, 1093/*
1091 struct address_space *mapping, 1094 * We need to pick up the new inode size which generic_commit_write gave us
1092 loff_t pos, unsigned len, unsigned copied, 1095 * `file' can be NULL - eg, when called from page_symlink().
1093 struct page *page, void *fsdata) 1096 *
1097 * ext4 never places buffers on inode->i_mapping->private_list. metadata
1098 * buffers are managed internally.
1099 */
1100static int ext4_write_end(struct file *file,
1101 struct address_space *mapping,
1102 loff_t pos, unsigned len, unsigned copied,
1103 struct page *page, void *fsdata)
1094{ 1104{
1095 int i_size_changed = 0;
1096 struct inode *inode = mapping->host;
1097 handle_t *handle = ext4_journal_current_handle(); 1105 handle_t *handle = ext4_journal_current_handle();
1106 struct inode *inode = mapping->host;
1107 int ret = 0, ret2;
1108 int i_size_changed = 0;
1109
1110 trace_ext4_write_end(inode, pos, len, copied);
1111 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
1112 ret = ext4_jbd2_file_inode(handle, inode);
1113 if (ret) {
1114 unlock_page(page);
1115 page_cache_release(page);
1116 goto errout;
1117 }
1118 }
1098 1119
1099 if (ext4_has_inline_data(inode)) 1120 if (ext4_has_inline_data(inode))
1100 copied = ext4_write_inline_data_end(inode, pos, len, 1121 copied = ext4_write_inline_data_end(inode, pos, len,
@@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file,
1105 1126
1106 /* 1127 /*
1107 * No need to use i_size_read() here, the i_size 1128 * No need to use i_size_read() here, the i_size
1108 * cannot change under us because we hold i_mutex. 1129 * cannot change under us because we hold i_mutex.
1109 * 1130 *
1110 * But it's important to update i_size while still holding page lock: 1131 * But it's important to update i_size while still holding page lock:
1111 * page writeout could otherwise come in and zero beyond i_size. 1132 * page writeout could otherwise come in and zero beyond i_size.
@@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file,
1115 i_size_changed = 1; 1136 i_size_changed = 1;
1116 } 1137 }
1117 1138
1118 if (pos + copied > EXT4_I(inode)->i_disksize) { 1139 if (pos + copied > EXT4_I(inode)->i_disksize) {
1119 /* We need to mark inode dirty even if 1140 /* We need to mark inode dirty even if
1120 * new_i_size is less that inode->i_size 1141 * new_i_size is less that inode->i_size
1121 * bu greater than i_disksize.(hint delalloc) 1142 * but greater than i_disksize. (hint delalloc)
1122 */ 1143 */
1123 ext4_update_i_disksize(inode, (pos + copied)); 1144 ext4_update_i_disksize(inode, (pos + copied));
1124 i_size_changed = 1; 1145 i_size_changed = 1;
@@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file,
1135 if (i_size_changed) 1156 if (i_size_changed)
1136 ext4_mark_inode_dirty(handle, inode); 1157 ext4_mark_inode_dirty(handle, inode);
1137 1158
1138 return copied; 1159 if (copied < 0)
1139} 1160 ret = copied;
1140
1141/*
1142 * We need to pick up the new inode size which generic_commit_write gave us
1143 * `file' can be NULL - eg, when called from page_symlink().
1144 *
1145 * ext4 never places buffers on inode->i_mapping->private_list. metadata
1146 * buffers are managed internally.
1147 */
1148static int ext4_ordered_write_end(struct file *file,
1149 struct address_space *mapping,
1150 loff_t pos, unsigned len, unsigned copied,
1151 struct page *page, void *fsdata)
1152{
1153 handle_t *handle = ext4_journal_current_handle();
1154 struct inode *inode = mapping->host;
1155 int ret = 0, ret2;
1156
1157 trace_ext4_ordered_write_end(inode, pos, len, copied);
1158 ret = ext4_jbd2_file_inode(handle, inode);
1159
1160 if (ret == 0) {
1161 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1162 page, fsdata);
1163 copied = ret2;
1164 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1165 /* if we have allocated more blocks and copied
1166 * less. We will have blocks allocated outside
1167 * inode->i_size. So truncate them
1168 */
1169 ext4_orphan_add(handle, inode);
1170 if (ret2 < 0)
1171 ret = ret2;
1172 } else {
1173 unlock_page(page);
1174 page_cache_release(page);
1175 }
1176
1177 ret2 = ext4_journal_stop(handle);
1178 if (!ret)
1179 ret = ret2;
1180
1181 if (pos + len > inode->i_size) {
1182 ext4_truncate_failed_write(inode);
1183 /*
1184 * If truncate failed early the inode might still be
1185 * on the orphan list; we need to make sure the inode
1186 * is removed from the orphan list in that case.
1187 */
1188 if (inode->i_nlink)
1189 ext4_orphan_del(NULL, inode);
1190 }
1191
1192
1193 return ret ? ret : copied;
1194}
1195
1196static int ext4_writeback_write_end(struct file *file,
1197 struct address_space *mapping,
1198 loff_t pos, unsigned len, unsigned copied,
1199 struct page *page, void *fsdata)
1200{
1201 handle_t *handle = ext4_journal_current_handle();
1202 struct inode *inode = mapping->host;
1203 int ret = 0, ret2;
1204
1205 trace_ext4_writeback_write_end(inode, pos, len, copied);
1206 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1207 page, fsdata);
1208 copied = ret2;
1209 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1161 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1210 /* if we have allocated more blocks and copied 1162 /* if we have allocated more blocks and copied
1211 * less. We will have blocks allocated outside 1163 * less. We will have blocks allocated outside
1212 * inode->i_size. So truncate them 1164 * inode->i_size. So truncate them
1213 */ 1165 */
1214 ext4_orphan_add(handle, inode); 1166 ext4_orphan_add(handle, inode);
1215 1167errout:
1216 if (ret2 < 0)
1217 ret = ret2;
1218
1219 ret2 = ext4_journal_stop(handle); 1168 ret2 = ext4_journal_stop(handle);
1220 if (!ret) 1169 if (!ret)
1221 ret = ret2; 1170 ret = ret2;
@@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1538 struct ext4_io_submit io_submit; 1487 struct ext4_io_submit io_submit;
1539 1488
1540 BUG_ON(mpd->next_page <= mpd->first_page); 1489 BUG_ON(mpd->next_page <= mpd->first_page);
1541 memset(&io_submit, 0, sizeof(io_submit)); 1490 ext4_io_submit_init(&io_submit, mpd->wbc);
1491 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1492 if (!io_submit.io_end)
1493 return -ENOMEM;
1542 /* 1494 /*
1543 * We need to start from the first_page to the next_page - 1 1495 * We need to start from the first_page to the next_page - 1
1544 * to make sure we also write the mapped dirty buffer_heads. 1496 * to make sure we also write the mapped dirty buffer_heads.
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1626 pagevec_release(&pvec); 1578 pagevec_release(&pvec);
1627 } 1579 }
1628 ext4_io_submit(&io_submit); 1580 ext4_io_submit(&io_submit);
1581 /* Drop io_end reference we got from init */
1582 ext4_put_io_end_defer(io_submit.io_end);
1629 return ret; 1583 return ret;
1630} 1584}
1631 1585
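
The writeback paths stop zero-initializing io_submit on the stack and instead take an explicit reference-counted io_end. The intended lifetime, roughly (names from the patch; the refcounting detail is my reading of it, not quoted code):

	/*
	 *   io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);  ref = 1
	 *   ext4_bio_write_page(&io_submit, ...);   bios take their own refs
	 *   ext4_io_submit(&io_submit);             submit the attached bio
	 *   ext4_put_io_end_defer(io_submit.io_end);
	 *       drops the initial ref; the io_end is freed, or queued for
	 *       unwritten-extent conversion, once the last bio completes
	 */
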
@@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode)
1670{ 1624{
1671 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1625 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1672 struct super_block *sb = inode->i_sb; 1626 struct super_block *sb = inode->i_sb;
1627 struct ext4_inode_info *ei = EXT4_I(inode);
1673 1628
1674 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", 1629 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1675 EXT4_C2B(EXT4_SB(inode->i_sb), 1630 EXT4_C2B(EXT4_SB(inode->i_sb),
1676 ext4_count_free_clusters(inode->i_sb))); 1631 ext4_count_free_clusters(sb)));
1677 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); 1632 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1678 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", 1633 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1679 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1634 (long long) EXT4_C2B(EXT4_SB(sb),
1680 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1635 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1681 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", 1636 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1682 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1637 (long long) EXT4_C2B(EXT4_SB(sb),
1683 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1638 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1684 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1639 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1685 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1640 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1686 EXT4_I(inode)->i_reserved_data_blocks); 1641 ei->i_reserved_data_blocks);
1687 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", 1642 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1688 EXT4_I(inode)->i_reserved_meta_blocks); 1643 ei->i_reserved_meta_blocks);
1644 ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
1645 ei->i_allocated_meta_blocks);
1689 return; 1646 return;
1690} 1647}
1691 1648
@@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1740 */ 1697 */
1741 map.m_lblk = next; 1698 map.m_lblk = next;
1742 map.m_len = max_blocks; 1699 map.m_len = max_blocks;
1743 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 1700 /*
1701 * We're in delalloc path and it is possible that we're going to
1702 * need more metadata blocks than previously reserved. However
1703 * we must not fail because we're in writeback and there is
1704 * nothing we can do about it so it might result in data loss.
1705 * So use reserved blocks to allocate metadata if possible.
1706 */
1707 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1708 EXT4_GET_BLOCKS_METADATA_NOFAIL;
1744 if (ext4_should_dioread_nolock(mpd->inode)) 1709 if (ext4_should_dioread_nolock(mpd->inode))
1745 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 1710 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1746 if (mpd->b_state & (1 << BH_Delay)) 1711 if (mpd->b_state & (1 << BH_Delay))
1747 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 1712 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1748 1713
1714
1749 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 1715 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1750 if (blks < 0) { 1716 if (blks < 0) {
1751 struct super_block *sb = mpd->inode->i_sb; 1717 struct super_block *sb = mpd->inode->i_sb;
@@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page,
2272 */ 2238 */
2273 return __ext4_journalled_writepage(page, len); 2239 return __ext4_journalled_writepage(page, len);
2274 2240
2275 memset(&io_submit, 0, sizeof(io_submit)); 2241 ext4_io_submit_init(&io_submit, wbc);
2242 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
2243 if (!io_submit.io_end) {
2244 redirty_page_for_writepage(wbc, page);
2245 return -ENOMEM;
2246 }
2276 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 2247 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2277 ext4_io_submit(&io_submit); 2248 ext4_io_submit(&io_submit);
2249 /* Drop io_end reference we got from init */
2250 ext4_put_io_end_defer(io_submit.io_end);
2278 return ret; 2251 return ret;
2279} 2252}
2280 2253
@@ -2661,7 +2634,7 @@ out_writepages:
2661 2634
2662static int ext4_nonda_switch(struct super_block *sb) 2635static int ext4_nonda_switch(struct super_block *sb)
2663{ 2636{
2664 s64 free_blocks, dirty_blocks; 2637 s64 free_clusters, dirty_clusters;
2665 struct ext4_sb_info *sbi = EXT4_SB(sb); 2638 struct ext4_sb_info *sbi = EXT4_SB(sb);
2666 2639
2667 /* 2640 /*
@@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb)
2672 * Delalloc needs an accurate free block accounting. So switch 2645 * to non delalloc when we are near to error range.
2673 * to non delalloc when we are near to error range. 2646 * to non delalloc when we are near to error range.
2674 */ 2647 */
2675 free_blocks = EXT4_C2B(sbi, 2648 free_clusters =
2676 percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 2649 percpu_counter_read_positive(&sbi->s_freeclusters_counter);
2677 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2650 dirty_clusters =
2651 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2678 /* 2652 /*
2679 * Start pushing delalloc when 1/2 of free blocks are dirty. 2653 * Start pushing delalloc when 1/2 of free blocks are dirty.
2680 */ 2654 */
2681 if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) 2655 if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
2682 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 2656 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
2683 2657
2684 if (2 * free_blocks < 3 * dirty_blocks || 2658 if (2 * free_clusters < 3 * dirty_clusters ||
2685 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { 2659 free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
2686 /* 2660 /*
2687 * free block count is less than 150% of dirty blocks 2661 * free block count is less than 150% of dirty blocks
2688	 * or free blocks are less than the watermark	2662	 * or free blocks are less than the watermark
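
To make the two thresholds concrete: writeback is kicked once dirty clusters reach half of the free count, and the switch to non-delalloc triggers when free falls under 150% of dirty or under dirty plus the watermark. A tiny standalone check (the watermark value below is made up, standing in for EXT4_FREECLUSTERS_WATERMARK):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical watermark, not ext4's real definition. */
#define TOY_FREECLUSTERS_WATERMARK 1024

static bool switch_to_nondelalloc(long long free_clusters,
				  long long dirty_clusters)
{
	/* free < 150% of dirty, or free below dirty + watermark */
	return 2 * free_clusters < 3 * dirty_clusters ||
	       free_clusters < dirty_clusters + TOY_FREECLUSTERS_WATERMARK;
}

int main(void)
{
	/* 10000 free vs 6000 dirty stays on delalloc (20000 >= 18000),
	 * but at 7000 dirty the 150% test trips (20000 < 21000). */
	printf("%d\n", switch_to_nondelalloc(10000, 6000));	/* 0 */
	printf("%d\n", switch_to_nondelalloc(10000, 7000));	/* 1 */
	return 0;
}
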
@@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file,
2818 unsigned long start, end; 2792 unsigned long start, end;
2819 int write_mode = (int)(unsigned long)fsdata; 2793 int write_mode = (int)(unsigned long)fsdata;
2820 2794
2821 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2795 if (write_mode == FALL_BACK_TO_NONDELALLOC)
2822 switch (ext4_inode_journal_mode(inode)) { 2796 return ext4_write_end(file, mapping, pos,
2823 case EXT4_INODE_ORDERED_DATA_MODE: 2797 len, copied, page, fsdata);
2824 return ext4_ordered_write_end(file, mapping, pos,
2825 len, copied, page, fsdata);
2826 case EXT4_INODE_WRITEBACK_DATA_MODE:
2827 return ext4_writeback_write_end(file, mapping, pos,
2828 len, copied, page, fsdata);
2829 default:
2830 BUG();
2831 }
2832 }
2833 2798
2834 trace_ext4_da_write_end(inode, pos, len, copied); 2799 trace_ext4_da_write_end(inode, pos, len, copied);
2835 start = pos & (PAGE_CACHE_SIZE - 1); 2800 start = pos & (PAGE_CACHE_SIZE - 1);
@@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3113 struct inode *inode = file_inode(iocb->ki_filp); 3078 struct inode *inode = file_inode(iocb->ki_filp);
3114 ext4_io_end_t *io_end = iocb->private; 3079 ext4_io_end_t *io_end = iocb->private;
3115 3080
3116 /* if not async direct IO or dio with 0 bytes write, just return */ 3081 /* if not async direct IO just return */
3117 if (!io_end || !size) 3082 if (!io_end) {
3118 goto out; 3083 inode_dio_done(inode);
3084 if (is_async)
3085 aio_complete(iocb, ret, 0);
3086 return;
3087 }
3119 3088
3120 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3089 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3121 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3090 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3123 size); 3092 size);
3124 3093
3125 iocb->private = NULL; 3094 iocb->private = NULL;
3126
3127 /* if not aio dio with unwritten extents, just free io and return */
3128 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3129 ext4_free_io_end(io_end);
3130out:
3131 inode_dio_done(inode);
3132 if (is_async)
3133 aio_complete(iocb, ret, 0);
3134 return;
3135 }
3136
3137 io_end->offset = offset; 3095 io_end->offset = offset;
3138 io_end->size = size; 3096 io_end->size = size;
3139 if (is_async) { 3097 if (is_async) {
3140 io_end->iocb = iocb; 3098 io_end->iocb = iocb;
3141 io_end->result = ret; 3099 io_end->result = ret;
3142 } 3100 }
3143 3101 ext4_put_io_end_defer(io_end);
3144 ext4_add_complete_io(io_end);
3145} 3102}
3146 3103
3147/* 3104/*
@@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3175 get_block_t *get_block_func = NULL; 3132 get_block_t *get_block_func = NULL;
3176 int dio_flags = 0; 3133 int dio_flags = 0;
3177 loff_t final_size = offset + count; 3134 loff_t final_size = offset + count;
3135 ext4_io_end_t *io_end = NULL;
3178 3136
3179 /* Use the old path for reads and writes beyond i_size. */ 3137 /* Use the old path for reads and writes beyond i_size. */
3180 if (rw != WRITE || final_size > inode->i_size) 3138 if (rw != WRITE || final_size > inode->i_size)
@@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3213 iocb->private = NULL; 3171 iocb->private = NULL;
3214 ext4_inode_aio_set(inode, NULL); 3172 ext4_inode_aio_set(inode, NULL);
3215 if (!is_sync_kiocb(iocb)) { 3173 if (!is_sync_kiocb(iocb)) {
3216 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3174 io_end = ext4_init_io_end(inode, GFP_NOFS);
3217 if (!io_end) { 3175 if (!io_end) {
3218 ret = -ENOMEM; 3176 ret = -ENOMEM;
3219 goto retake_lock; 3177 goto retake_lock;
3220 } 3178 }
3221 io_end->flag |= EXT4_IO_END_DIRECT; 3179 io_end->flag |= EXT4_IO_END_DIRECT;
3222 iocb->private = io_end; 3180 /*
3181 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3182 */
3183 iocb->private = ext4_get_io_end(io_end);
3223 /* 3184 /*
3224 * we save the io structure for current async direct 3185 * we save the io structure for current async direct
3225 * IO, so that later ext4_map_blocks() could flag the 3186 * IO, so that later ext4_map_blocks() could flag the
@@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3243 NULL, 3204 NULL,
3244 dio_flags); 3205 dio_flags);
3245 3206
3246 if (iocb->private)
3247 ext4_inode_aio_set(inode, NULL);
3248 /* 3207 /*
3249 * The io_end structure takes a reference to the inode, that 3208 * Put our reference to io_end. This can free the io_end structure e.g.
3250	 * structure needs to be destroyed and the reference to the	3209	 * in the sync IO case or in case of error. It can even perform extent
3251	 * inode need to be dropped, when IO is complete, even with 0	3210	 * conversion if all bios we submitted finished before we got here.
3252	 * byte write, or failed.	3211	 * Note that in that case iocb->private can already be set to NULL
3253 * 3212 * here.
3254 * In the successful AIO DIO case, the io_end structure will
3255 * be destroyed and the reference to the inode will be dropped
3256 * after the end_io call back function is called.
3257 *
3258 * In the case there is 0 byte write, or error case, since VFS
3259 * direct IO won't invoke the end_io call back function, we
3260 * need to free the end_io structure here.
3261 */ 3213 */
3262 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3214 if (io_end) {
3263 ext4_free_io_end(iocb->private); 3215 ext4_inode_aio_set(inode, NULL);
3264 iocb->private = NULL; 3216 ext4_put_io_end(io_end);
3265 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3217 /*
3218 * In case of error or no write ext4_end_io_dio() was not
3219 * called so we have to put iocb's reference.
3220 */
3221 if (ret <= 0 && ret != -EIOCBQUEUED) {
3222 WARN_ON(iocb->private != io_end);
3223 ext4_put_io_end(io_end);
3224 iocb->private = NULL;
3225 }
3226 }
3227 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3266 EXT4_STATE_DIO_UNWRITTEN)) { 3228 EXT4_STATE_DIO_UNWRITTEN)) {
3267 int err; 3229 int err;
3268 /* 3230 /*
@@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page)
3334 return __set_page_dirty_nobuffers(page); 3296 return __set_page_dirty_nobuffers(page);
3335} 3297}
3336 3298
3337static const struct address_space_operations ext4_ordered_aops = { 3299static const struct address_space_operations ext4_aops = {
3338 .readpage = ext4_readpage, 3300 .readpage = ext4_readpage,
3339 .readpages = ext4_readpages, 3301 .readpages = ext4_readpages,
3340 .writepage = ext4_writepage, 3302 .writepage = ext4_writepage,
3341 .write_begin = ext4_write_begin, 3303 .write_begin = ext4_write_begin,
3342 .write_end = ext4_ordered_write_end, 3304 .write_end = ext4_write_end,
3343 .bmap = ext4_bmap,
3344 .invalidatepage = ext4_invalidatepage,
3345 .releasepage = ext4_releasepage,
3346 .direct_IO = ext4_direct_IO,
3347 .migratepage = buffer_migrate_page,
3348 .is_partially_uptodate = block_is_partially_uptodate,
3349 .error_remove_page = generic_error_remove_page,
3350};
3351
3352static const struct address_space_operations ext4_writeback_aops = {
3353 .readpage = ext4_readpage,
3354 .readpages = ext4_readpages,
3355 .writepage = ext4_writepage,
3356 .write_begin = ext4_write_begin,
3357 .write_end = ext4_writeback_write_end,
3358 .bmap = ext4_bmap, 3305 .bmap = ext4_bmap,
3359 .invalidatepage = ext4_invalidatepage, 3306 .invalidatepage = ext4_invalidatepage,
3360 .releasepage = ext4_releasepage, 3307 .releasepage = ext4_releasepage,
@@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode)
3399{ 3346{
3400 switch (ext4_inode_journal_mode(inode)) { 3347 switch (ext4_inode_journal_mode(inode)) {
3401 case EXT4_INODE_ORDERED_DATA_MODE: 3348 case EXT4_INODE_ORDERED_DATA_MODE:
3402 if (test_opt(inode->i_sb, DELALLOC)) 3349 ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
3403 inode->i_mapping->a_ops = &ext4_da_aops;
3404 else
3405 inode->i_mapping->a_ops = &ext4_ordered_aops;
3406 break; 3350 break;
3407 case EXT4_INODE_WRITEBACK_DATA_MODE: 3351 case EXT4_INODE_WRITEBACK_DATA_MODE:
3408 if (test_opt(inode->i_sb, DELALLOC)) 3352 ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
3409 inode->i_mapping->a_ops = &ext4_da_aops;
3410 else
3411 inode->i_mapping->a_ops = &ext4_writeback_aops;
3412 break; 3353 break;
3413 case EXT4_INODE_JOURNAL_DATA_MODE: 3354 case EXT4_INODE_JOURNAL_DATA_MODE:
3414 inode->i_mapping->a_ops = &ext4_journalled_aops; 3355 inode->i_mapping->a_ops = &ext4_journalled_aops;
3415 break; 3356 return;
3416 default: 3357 default:
3417 BUG(); 3358 BUG();
3418 } 3359 }
3360 if (test_opt(inode->i_sb, DELALLOC))
3361 inode->i_mapping->a_ops = &ext4_da_aops;
3362 else
3363 inode->i_mapping->a_ops = &ext4_aops;
3419} 3364}
3420 3365
3421 3366
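
Since ordered and writeback journalling now share one aops table, the journal mode only toggles the EXT4_STATE_ORDERED_MODE state bit, while the DELALLOC mount option selects the table. A toy rendering of the consolidated selection (hypothetical names; strings stand in for the aops structs):

#include <stdio.h>

enum jmode { ORDERED, WRITEBACK, JOURNAL };

/* Toy version of the consolidated selection above: the journal mode
 * only sets or clears an "ordered" state bit, and DELALLOC picks
 * between the two remaining tables. */
static const char *pick_aops(enum jmode mode, int delalloc, int *ordered)
{
	switch (mode) {
	case ORDERED:
		*ordered = 1;
		break;
	case WRITEBACK:
		*ordered = 0;
		break;
	case JOURNAL:
		return "ext4_journalled_aops";
	}
	return delalloc ? "ext4_da_aops" : "ext4_aops";
}

int main(void)
{
	int ordered = 0;

	printf("%s ordered=%d\n", pick_aops(ORDERED, 1, &ordered), ordered);
	printf("%s ordered=%d\n", pick_aops(WRITEBACK, 0, &ordered), ordered);
	printf("%s\n", pick_aops(JOURNAL, 0, &ordered));
	return 0;
}
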
@@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode)
3646int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3591int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3647{ 3592{
3648 struct inode *inode = file_inode(file); 3593 struct inode *inode = file_inode(file);
3594 struct super_block *sb = inode->i_sb;
3595 ext4_lblk_t first_block, stop_block;
3596 struct address_space *mapping = inode->i_mapping;
3597 loff_t first_page, last_page, page_len;
3598 loff_t first_page_offset, last_page_offset;
3599 handle_t *handle;
3600 unsigned int credits;
3601 int ret = 0;
3602
3649 if (!S_ISREG(inode->i_mode)) 3603 if (!S_ISREG(inode->i_mode))
3650 return -EOPNOTSUPP; 3604 return -EOPNOTSUPP;
3651 3605
3652 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3606 if (EXT4_SB(sb)->s_cluster_ratio > 1) {
3653 return ext4_ind_punch_hole(file, offset, length);
3654
3655 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3656 /* TODO: Add support for bigalloc file systems */ 3607 /* TODO: Add support for bigalloc file systems */
3657 return -EOPNOTSUPP; 3608 return -EOPNOTSUPP;
3658 } 3609 }
3659 3610
3660 trace_ext4_punch_hole(inode, offset, length); 3611 trace_ext4_punch_hole(inode, offset, length);
3661 3612
3662 return ext4_ext_punch_hole(file, offset, length); 3613 /*
3614	 * Write out all dirty pages to avoid race conditions,
3615	 * then release them.
3616 */
3617 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
3618 ret = filemap_write_and_wait_range(mapping, offset,
3619 offset + length - 1);
3620 if (ret)
3621 return ret;
3622 }
3623
3624 mutex_lock(&inode->i_mutex);
3625	/* It's not possible to punch a hole in an append-only file */
3626 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
3627 ret = -EPERM;
3628 goto out_mutex;
3629 }
3630 if (IS_SWAPFILE(inode)) {
3631 ret = -ETXTBSY;
3632 goto out_mutex;
3633 }
3634
3635 /* No need to punch hole beyond i_size */
3636 if (offset >= inode->i_size)
3637 goto out_mutex;
3638
3639 /*
3640 * If the hole extends beyond i_size, set the hole
3641 * to end after the page that contains i_size
3642 */
3643 if (offset + length > inode->i_size) {
3644 length = inode->i_size +
3645 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
3646 offset;
3647 }
3648
3649 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
3650 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
3651
3652 first_page_offset = first_page << PAGE_CACHE_SHIFT;
3653 last_page_offset = last_page << PAGE_CACHE_SHIFT;
3654
3655 /* Now release the pages */
3656 if (last_page_offset > first_page_offset) {
3657 truncate_pagecache_range(inode, first_page_offset,
3658 last_page_offset - 1);
3659 }
3660
3661	/* Wait for all existing dio workers; newcomers will block on i_mutex */
3662 ext4_inode_block_unlocked_dio(inode);
3663 ret = ext4_flush_unwritten_io(inode);
3664 if (ret)
3665 goto out_dio;
3666 inode_dio_wait(inode);
3667
3668 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3669 credits = ext4_writepage_trans_blocks(inode);
3670 else
3671 credits = ext4_blocks_for_truncate(inode);
3672 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3673 if (IS_ERR(handle)) {
3674 ret = PTR_ERR(handle);
3675 ext4_std_error(sb, ret);
3676 goto out_dio;
3677 }
3678
3679 /*
3680 * Now we need to zero out the non-page-aligned data in the
3681 * pages at the start and tail of the hole, and unmap the
3682 * buffer heads for the block aligned regions of the page that
3683 * were completely zeroed.
3684 */
3685 if (first_page > last_page) {
3686 /*
3687 * If the file space being truncated is contained
3688 * within a page just zero out and unmap the middle of
3689 * that page
3690 */
3691 ret = ext4_discard_partial_page_buffers(handle,
3692 mapping, offset, length, 0);
3693
3694 if (ret)
3695 goto out_stop;
3696 } else {
3697 /*
3698 * zero out and unmap the partial page that contains
3699 * the start of the hole
3700 */
3701 page_len = first_page_offset - offset;
3702 if (page_len > 0) {
3703 ret = ext4_discard_partial_page_buffers(handle, mapping,
3704 offset, page_len, 0);
3705 if (ret)
3706 goto out_stop;
3707 }
3708
3709 /*
3710 * zero out and unmap the partial page that contains
3711 * the end of the hole
3712 */
3713 page_len = offset + length - last_page_offset;
3714 if (page_len > 0) {
3715 ret = ext4_discard_partial_page_buffers(handle, mapping,
3716 last_page_offset, page_len, 0);
3717 if (ret)
3718 goto out_stop;
3719 }
3720 }
3721
3722 /*
3723 * If i_size is contained in the last page, we need to
3724 * unmap and zero the partial page after i_size
3725 */
3726 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3727 inode->i_size % PAGE_CACHE_SIZE != 0) {
3728 page_len = PAGE_CACHE_SIZE -
3729 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3730
3731 if (page_len > 0) {
3732 ret = ext4_discard_partial_page_buffers(handle,
3733 mapping, inode->i_size, page_len, 0);
3734
3735 if (ret)
3736 goto out_stop;
3737 }
3738 }
3739
3740 first_block = (offset + sb->s_blocksize - 1) >>
3741 EXT4_BLOCK_SIZE_BITS(sb);
3742 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
3743
3744 /* If there are no blocks to remove, return now */
3745 if (first_block >= stop_block)
3746 goto out_stop;
3747
3748 down_write(&EXT4_I(inode)->i_data_sem);
3749 ext4_discard_preallocations(inode);
3750
3751 ret = ext4_es_remove_extent(inode, first_block,
3752 stop_block - first_block);
3753 if (ret) {
3754 up_write(&EXT4_I(inode)->i_data_sem);
3755 goto out_stop;
3756 }
3757
3758 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3759 ret = ext4_ext_remove_space(inode, first_block,
3760 stop_block - 1);
3761 else
3762 ret = ext4_free_hole_blocks(handle, inode, first_block,
3763 stop_block);
3764
3765 ext4_discard_preallocations(inode);
3766 up_write(&EXT4_I(inode)->i_data_sem);
3767 if (IS_SYNC(inode))
3768 ext4_handle_sync(handle);
3769 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3770 ext4_mark_inode_dirty(handle, inode);
3771out_stop:
3772 ext4_journal_stop(handle);
3773out_dio:
3774 ext4_inode_resume_unlocked_dio(inode);
3775out_mutex:
3776 mutex_unlock(&inode->i_mutex);
3777 return ret;
3663} 3778}
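
The page arithmetic above is easiest to see with numbers: first_page rounds the hole's start up to a page boundary and last_page rounds its end down, so only whole pages inside the hole are released from the page cache while the partial head and tail are merely zeroed. A standalone walk-through assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define PAGE_SHIFT 12

int main(void)
{
	/* Example hole: offset 5000, length 10000 (end 15000). */
	unsigned long long offset = 5000, length = 10000;
	unsigned long long first_page = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long long last_page = (offset + length) >> PAGE_SHIFT;
	unsigned long long first_page_offset = first_page << PAGE_SHIFT;
	unsigned long long last_page_offset = last_page << PAGE_SHIFT;

	/* Page 2 (bytes 8192..12287) is dropped from the page cache;
	 * [5000, 8192) and [12288, 15000) are only zeroed in place. */
	printf("release pages [%llu, %llu) = bytes [%llu, %llu)\n",
	       first_page, last_page, first_page_offset, last_page_offset);
	printf("zero head [%llu, %llu), tail [%llu, %llu)\n",
	       offset, first_page_offset, last_page_offset, offset + length);
	return 0;
}
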
3664 3779
3665/* 3780/*
@@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3692 */ 3807 */
3693void ext4_truncate(struct inode *inode) 3808void ext4_truncate(struct inode *inode)
3694{ 3809{
3810 struct ext4_inode_info *ei = EXT4_I(inode);
3811 unsigned int credits;
3812 handle_t *handle;
3813 struct address_space *mapping = inode->i_mapping;
3814 loff_t page_len;
3815
3816 /*
3817 * There is a possibility that we're either freeing the inode
3818	 * or it's a completely new inode. In those cases we might not
3819 * have i_mutex locked because it's not necessary.
3820 */
3821 if (!(inode->i_state & (I_NEW|I_FREEING)))
3822 WARN_ON(!mutex_is_locked(&inode->i_mutex));
3695 trace_ext4_truncate_enter(inode); 3823 trace_ext4_truncate_enter(inode);
3696 3824
3697 if (!ext4_can_truncate(inode)) 3825 if (!ext4_can_truncate(inode))
@@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode)
3710 return; 3838 return;
3711 } 3839 }
3712 3840
3841 /*
3842 * finish any pending end_io work so we won't run the risk of
3843 * converting any truncated blocks to initialized later
3844 */
3845 ext4_flush_unwritten_io(inode);
3846
3847 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3848 credits = ext4_writepage_trans_blocks(inode);
3849 else
3850 credits = ext4_blocks_for_truncate(inode);
3851
3852 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3853 if (IS_ERR(handle)) {
3854 ext4_std_error(inode->i_sb, PTR_ERR(handle));
3855 return;
3856 }
3857
3858 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
3859 page_len = PAGE_CACHE_SIZE -
3860 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3861
3862 if (ext4_discard_partial_page_buffers(handle,
3863 mapping, inode->i_size, page_len, 0))
3864 goto out_stop;
3865 }
3866
3867 /*
3868 * We add the inode to the orphan list, so that if this
3869 * truncate spans multiple transactions, and we crash, we will
3870 * resume the truncate when the filesystem recovers. It also
3871 * marks the inode dirty, to catch the new size.
3872 *
3873 * Implication: the file must always be in a sane, consistent
3874 * truncatable state while each transaction commits.
3875 */
3876 if (ext4_orphan_add(handle, inode))
3877 goto out_stop;
3878
3879 down_write(&EXT4_I(inode)->i_data_sem);
3880
3881 ext4_discard_preallocations(inode);
3882
3713 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3883 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3714 ext4_ext_truncate(inode); 3884 ext4_ext_truncate(handle, inode);
3715 else 3885 else
3716 ext4_ind_truncate(inode); 3886 ext4_ind_truncate(handle, inode);
3887
3888 up_write(&ei->i_data_sem);
3889
3890 if (IS_SYNC(inode))
3891 ext4_handle_sync(handle);
3892
3893out_stop:
3894 /*
3895 * If this was a simple ftruncate() and the file will remain alive,
3896 * then we need to clear up the orphan record which we created above.
3897 * However, if this was a real unlink then we were called by
3898 * ext4_delete_inode(), and we allow that function to clean up the
3899 * orphan info for us.
3900 */
3901 if (inode->i_nlink)
3902 ext4_orphan_del(handle, inode);
3903
3904 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3905 ext4_mark_inode_dirty(handle, inode);
3906 ext4_journal_stop(handle);
3717 3907
3718 trace_ext4_truncate_exit(inode); 3908 trace_ext4_truncate_exit(inode);
3719} 3909}
@@ -3821,13 +4011,14 @@ make_io:
3821 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4011 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3822 ext4_fsblk_t b, end, table; 4012 ext4_fsblk_t b, end, table;
3823 unsigned num; 4013 unsigned num;
4014 __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
3824 4015
3825 table = ext4_inode_table(sb, gdp); 4016 table = ext4_inode_table(sb, gdp);
3826 /* s_inode_readahead_blks is always a power of 2 */ 4017 /* s_inode_readahead_blks is always a power of 2 */
3827 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4018 b = block & ~((ext4_fsblk_t) ra_blks - 1);
3828 if (table > b) 4019 if (table > b)
3829 b = table; 4020 b = table;
3830 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4021 end = b + ra_blks;
3831 num = EXT4_INODES_PER_GROUP(sb); 4022 num = EXT4_INODES_PER_GROUP(sb);
3832 if (ext4_has_group_desc_csum(sb)) 4023 if (ext4_has_group_desc_csum(sb))
3833 num -= ext4_itable_unused_count(sb, gdp); 4024 num -= ext4_itable_unused_count(sb, gdp);
@@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4024 * NeilBrown 1999oct15 4215 * NeilBrown 1999oct15
4025 */ 4216 */
4026 if (inode->i_nlink == 0) { 4217 if (inode->i_nlink == 0) {
4027 if (inode->i_mode == 0 || 4218 if ((inode->i_mode == 0 ||
4028 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4219 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
4220 ino != EXT4_BOOT_LOADER_INO) {
4029 /* this inode is deleted */ 4221 /* this inode is deleted */
4030 ret = -ESTALE; 4222 ret = -ESTALE;
4031 goto bad_inode; 4223 goto bad_inode;
@@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4033 /* The only unlinked inodes we let through here have 4225 /* The only unlinked inodes we let through here have
4034 * valid i_mode and are being read by the orphan 4226 * valid i_mode and are being read by the orphan
4035 * recovery code: that's fine, we're about to complete 4227 * recovery code: that's fine, we're about to complete
4036 * the process of deleting those. */ 4228 * the process of deleting those.
4229 * OR it is the EXT4_BOOT_LOADER_INO which is
4230 * not initialized on a new filesystem. */
4037 } 4231 }
4038 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4232 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4039 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4233 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4153 else 4347 else
4154 init_special_inode(inode, inode->i_mode, 4348 init_special_inode(inode, inode->i_mode,
4155 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4349 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4350 } else if (ino == EXT4_BOOT_LOADER_INO) {
4351 make_bad_inode(inode);
4156 } else { 4352 } else {
4157 ret = -EIO; 4353 ret = -EIO;
4158 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 4354 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 721f4d33e148..9491ac0590f7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,9 +17,201 @@
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
20#include "ext4_extents.h"
20 21
21#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) 22#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
22 23
24/**
25 * Swap memory between @a and @b for @len bytes.
26 *
27 * @a: pointer to first memory area
28 * @b: pointer to second memory area
29 * @len: number of bytes to swap
30 *
31 */
32static void memswap(void *a, void *b, size_t len)
33{
34 unsigned char *ap, *bp;
35 unsigned char tmp;
36
37 ap = (unsigned char *)a;
38 bp = (unsigned char *)b;
39 while (len-- > 0) {
40 tmp = *ap;
41 *ap = *bp;
42 *bp = tmp;
43 ap++;
44 bp++;
45 }
46}
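
Because memswap() is a byte-wise exchange, applying it twice restores the original contents, which is exactly the property the error path below leans on when it re-runs swap_inode_data() to revert. A standalone demonstration:

#include <stddef.h>
#include <stdio.h>

/* Same byte-wise exchange as memswap() above, shown standalone. */
static void memswap(void *a, void *b, size_t len)
{
	unsigned char *ap = a, *bp = b, tmp;

	while (len-- > 0) {
		tmp = *ap;
		*ap++ = *bp;
		*bp++ = tmp;
	}
}

int main(void)
{
	long x = 1, y = 2;

	memswap(&x, &y, sizeof(x));
	printf("%ld %ld\n", x, y);	/* 2 1 */
	memswap(&x, &y, sizeof(x));	/* second swap reverts the first, */
	printf("%ld %ld\n", x, y);	/* 1 2 -- what the error path uses */
	return 0;
}
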
47
48/**
49 * Swap i_data and associated attributes between @inode1 and @inode2.
50 * This function is used for the primary swap between inode1 and inode2
51 * and also to revert this primary swap in case of errors.
52 *
53	 * Therefore you have to make sure that calling this method twice
54 * will revert all changes.
55 *
56 * @inode1: pointer to first inode
57 * @inode2: pointer to second inode
58 */
59static void swap_inode_data(struct inode *inode1, struct inode *inode2)
60{
61 loff_t isize;
62 struct ext4_inode_info *ei1;
63 struct ext4_inode_info *ei2;
64
65 ei1 = EXT4_I(inode1);
66 ei2 = EXT4_I(inode2);
67
68 memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
69 memswap(&inode1->i_version, &inode2->i_version,
70 sizeof(inode1->i_version));
71 memswap(&inode1->i_blocks, &inode2->i_blocks,
72 sizeof(inode1->i_blocks));
73 memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
74 memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
75 memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
76
77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
80 memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
81 memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
82
83 isize = i_size_read(inode1);
84 i_size_write(inode1, i_size_read(inode2));
85 i_size_write(inode2, isize);
86}
87
88/**
89	 * Swap the information between the given @inode and the inode
90 * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
91 * important fields of the inodes.
92 *
93 * @sb: the super block of the filesystem
94 * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
95 *
96 */
97static long swap_inode_boot_loader(struct super_block *sb,
98 struct inode *inode)
99{
100 handle_t *handle;
101 int err;
102 struct inode *inode_bl;
103 struct ext4_inode_info *ei;
104 struct ext4_inode_info *ei_bl;
105 struct ext4_sb_info *sbi;
106
107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
108 err = -EINVAL;
109 goto swap_boot_out;
110 }
111
112 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
113 err = -EPERM;
114 goto swap_boot_out;
115 }
116
117 sbi = EXT4_SB(sb);
118 ei = EXT4_I(inode);
119
120 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
121 if (IS_ERR(inode_bl)) {
122 err = PTR_ERR(inode_bl);
123 goto swap_boot_out;
124 }
125 ei_bl = EXT4_I(inode_bl);
126
127 filemap_flush(inode->i_mapping);
128 filemap_flush(inode_bl->i_mapping);
129
130	/* Protect orig inodes against a truncate and make sure
131	 * that only one swap_inode_boot_loader is running. */
132 ext4_inode_double_lock(inode, inode_bl);
133
134 truncate_inode_pages(&inode->i_data, 0);
135 truncate_inode_pages(&inode_bl->i_data, 0);
136
137 /* Wait for all existing dio workers */
138 ext4_inode_block_unlocked_dio(inode);
139 ext4_inode_block_unlocked_dio(inode_bl);
140 inode_dio_wait(inode);
141 inode_dio_wait(inode_bl);
142
143 handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
144 if (IS_ERR(handle)) {
145 err = -EINVAL;
146 goto swap_boot_out;
147 }
148
149 /* Protect extent tree against block allocations via delalloc */
150 ext4_double_down_write_data_sem(inode, inode_bl);
151
152 if (inode_bl->i_nlink == 0) {
153 /* this inode has never been used as a BOOT_LOADER */
154 set_nlink(inode_bl, 1);
155 i_uid_write(inode_bl, 0);
156 i_gid_write(inode_bl, 0);
157 inode_bl->i_flags = 0;
158 ei_bl->i_flags = 0;
159 inode_bl->i_version = 1;
160 i_size_write(inode_bl, 0);
161 inode_bl->i_mode = S_IFREG;
162 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
163 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
164 ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
165 ext4_ext_tree_init(handle, inode_bl);
166 } else
167 memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
168 }
169
170 swap_inode_data(inode, inode_bl);
171
172 inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
173
174 spin_lock(&sbi->s_next_gen_lock);
175 inode->i_generation = sbi->s_next_generation++;
176 inode_bl->i_generation = sbi->s_next_generation++;
177 spin_unlock(&sbi->s_next_gen_lock);
178
179 ext4_discard_preallocations(inode);
180
181 err = ext4_mark_inode_dirty(handle, inode);
182 if (err < 0) {
183 ext4_warning(inode->i_sb,
184 "couldn't mark inode #%lu dirty (err %d)",
185 inode->i_ino, err);
186 /* Revert all changes: */
187 swap_inode_data(inode, inode_bl);
188 } else {
189 err = ext4_mark_inode_dirty(handle, inode_bl);
190 if (err < 0) {
191 ext4_warning(inode_bl->i_sb,
192 "couldn't mark inode #%lu dirty (err %d)",
193 inode_bl->i_ino, err);
194 /* Revert all changes: */
195 swap_inode_data(inode, inode_bl);
196 ext4_mark_inode_dirty(handle, inode);
197 }
198 }
199
200 ext4_journal_stop(handle);
201
202 ext4_double_up_write_data_sem(inode, inode_bl);
203
204 ext4_inode_resume_unlocked_dio(inode);
205 ext4_inode_resume_unlocked_dio(inode_bl);
206
207 ext4_inode_double_unlock(inode, inode_bl);
208
209 iput(inode_bl);
210
211swap_boot_out:
212 return err;
213}
214
23long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 215long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24{ 216{
25 struct inode *inode = file_inode(filp); 217 struct inode *inode = file_inode(filp);
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
83 if (!capable(CAP_SYS_RESOURCE)) 275 if (!capable(CAP_SYS_RESOURCE))
84 goto flags_out; 276 goto flags_out;
85 } 277 }
86 if (oldflags & EXT4_EXTENTS_FL) { 278 if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
87 /* We don't support clearning extent flags */
88 if (!(flags & EXT4_EXTENTS_FL)) {
89 err = -EOPNOTSUPP;
90 goto flags_out;
91 }
92 } else if (flags & EXT4_EXTENTS_FL) {
93 /* migrate the file */
94 migrate = 1; 279 migrate = 1;
95 flags &= ~EXT4_EXTENTS_FL;
96 }
97 280
98 if (flags & EXT4_EOFBLOCKS_FL) { 281 if (flags & EXT4_EOFBLOCKS_FL) {
99 /* we don't support adding EOFBLOCKS flag */ 282 /* we don't support adding EOFBLOCKS flag */
@@ -137,8 +320,13 @@ flags_err:
137 err = ext4_change_inode_journal_flag(inode, jflag); 320 err = ext4_change_inode_journal_flag(inode, jflag);
138 if (err) 321 if (err)
139 goto flags_out; 322 goto flags_out;
140 if (migrate) 323 if (migrate) {
141 err = ext4_ext_migrate(inode); 324 if (flags & EXT4_EXTENTS_FL)
325 err = ext4_ext_migrate(inode);
326 else
327 err = ext4_ind_migrate(inode);
328 }
329
142flags_out: 330flags_out:
143 mutex_unlock(&inode->i_mutex); 331 mutex_unlock(&inode->i_mutex);
144 mnt_drop_write_file(filp); 332 mnt_drop_write_file(filp);
@@ -357,9 +545,13 @@ group_add_out:
357 return err; 545 return err;
358 } 546 }
359 547
548 case EXT4_IOC_SWAP_BOOT:
549 if (!(filp->f_mode & FMODE_WRITE))
550 return -EBADF;
551 return swap_inode_boot_loader(sb, inode);
552
360 case EXT4_IOC_RESIZE_FS: { 553 case EXT4_IOC_RESIZE_FS: {
361 ext4_fsblk_t n_blocks_count; 554 ext4_fsblk_t n_blocks_count;
362 struct super_block *sb = inode->i_sb;
363 int err = 0, err2 = 0; 555 int err = 0, err2 = 0;
364 ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; 556 ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
365 557
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ee6614bdb639..a11ea4d6164c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
405 ext4_clear_bit(bit, addr); 405 ext4_clear_bit(bit, addr);
406} 406}
407 407
408static inline int mb_test_and_clear_bit(int bit, void *addr)
409{
410 addr = mb_correct_addr_and_bit(&bit, addr);
411 return ext4_test_and_clear_bit(bit, addr);
412}
413
408static inline int mb_find_next_zero_bit(void *addr, int max, int start) 414static inline int mb_find_next_zero_bit(void *addr, int max, int start)
409{ 415{
410 int fix = 0, ret, tmpmax; 416 int fix = 0, ret, tmpmax;
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
764 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 770 spin_unlock(&EXT4_SB(sb)->s_bal_lock);
765} 771}
766 772
773static void mb_regenerate_buddy(struct ext4_buddy *e4b)
774{
775 int count;
776 int order = 1;
777 void *buddy;
778
779 while ((buddy = mb_find_buddy(e4b, order++, &count))) {
780 ext4_set_bits(buddy, 0, count);
781 }
782 e4b->bd_info->bb_fragments = 0;
783 memset(e4b->bd_info->bb_counters, 0,
784 sizeof(*e4b->bd_info->bb_counters) *
785 (e4b->bd_sb->s_blocksize_bits + 2));
786
787 ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
788 e4b->bd_bitmap, e4b->bd_group);
789}
790
767/* The buddy information is attached the buddy cache inode 791/* The buddy information is attached the buddy cache inode
768 * for convenience. The information regarding each group 792 * for convenience. The information regarding each group
769 * is loaded via ext4_mb_load_buddy. The information involve 793 * is loaded via ext4_mb_load_buddy. The information involve
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
860 884
861 first_block = page->index * blocks_per_page; 885 first_block = page->index * blocks_per_page;
862 for (i = 0; i < blocks_per_page; i++) { 886 for (i = 0; i < blocks_per_page; i++) {
863 int group;
864
865 group = (first_block + i) >> 1; 887 group = (first_block + i) >> 1;
866 if (group >= ngroups) 888 if (group >= ngroups)
867 break; 889 break;
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1011 struct page *page; 1033 struct page *page;
1012 int ret = 0; 1034 int ret = 0;
1013 1035
1036 might_sleep();
1014 mb_debug(1, "init group %u\n", group); 1037 mb_debug(1, "init group %u\n", group);
1015 this_grp = ext4_get_group_info(sb, group); 1038 this_grp = ext4_get_group_info(sb, group);
1016 /* 1039 /*
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1082 struct ext4_sb_info *sbi = EXT4_SB(sb); 1105 struct ext4_sb_info *sbi = EXT4_SB(sb);
1083 struct inode *inode = sbi->s_buddy_cache; 1106 struct inode *inode = sbi->s_buddy_cache;
1084 1107
1108 might_sleep();
1085 mb_debug(1, "load group %u\n", group); 1109 mb_debug(1, "load group %u\n", group);
1086 1110
1087 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1111 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
1244 } 1268 }
1245} 1269}
1246 1270
1271/* Clear bits in the given range.
1272 * Returns the first zero bit found, if any; -1 otherwise.
1273 */
1274static int mb_test_and_clear_bits(void *bm, int cur, int len)
1275{
1276 __u32 *addr;
1277 int zero_bit = -1;
1278
1279 len = cur + len;
1280 while (cur < len) {
1281 if ((cur & 31) == 0 && (len - cur) >= 32) {
1282 /* fast path: clear whole word at once */
1283 addr = bm + (cur >> 3);
1284 if (*addr != (__u32)(-1) && zero_bit == -1)
1285 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1286 *addr = 0;
1287 cur += 32;
1288 continue;
1289 }
1290 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1291 zero_bit = cur;
1292 cur++;
1293 }
1294
1295 return zero_bit;
1296}
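
The value returned by mb_test_and_clear_bits() doubles as a double-free detector: any zero bit found while clearing means a block in the range was already free, and the caller below reports it and rebuilds the buddy via mb_regenerate_buddy(). A standalone sketch of the fast path's detection step:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* The fast path clears 32 aligned bits at once and only scans
	 * for a stray zero when the word was not all-ones. */
	uint32_t word = UINT32_MAX & ~(UINT32_C(1) << 5); /* bit 5 clear */
	int zero_bit = -1;
	int i;

	if (word != UINT32_MAX)
		for (i = 0; i < 32; i++)
			if (!((word >> i) & 1)) {
				zero_bit = i;	/* already-free block */
				break;
			}
	word = 0;	/* clear the whole word regardless */
	printf("double-free candidate at bit %d, word now %u\n",
	       zero_bit, (unsigned)word);
	return 0;
}
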
1297
1247void ext4_set_bits(void *bm, int cur, int len) 1298void ext4_set_bits(void *bm, int cur, int len)
1248{ 1299{
1249 __u32 *addr; 1300 __u32 *addr;
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
1262 } 1313 }
1263} 1314}
1264 1315
1316/*
1317 * _________________________________________________________________ */
1318
1319static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1320{
1321 if (mb_test_bit(*bit + side, bitmap)) {
1322 mb_clear_bit(*bit, bitmap);
1323 (*bit) -= side;
1324 return 1;
1325 }
1326 else {
1327 (*bit) += side;
1328 mb_set_bit(*bit, bitmap);
1329 return -1;
1330 }
1331}
1332
1333static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1334{
1335 int max;
1336 int order = 1;
1337 void *buddy = mb_find_buddy(e4b, order, &max);
1338
1339 while (buddy) {
1340 void *buddy2;
1341
1342 /* Bits in range [first; last] are known to be set since
1343 * corresponding blocks were allocated. Bits in range
1344	 * (first; last) will stay set because they form buddies on the
1345	 * upper layer. We just deal with the borders if they don't
1346	 * align with the upper layer, and then go up.
1347	 * Releasing an entire group amounts to clearing a
1348	 * single bit of the highest-order buddy.
1349 */
1350
1351 /* Example:
1352 * ---------------------------------
1353 * | 1 | 1 | 1 | 1 |
1354 * ---------------------------------
1355 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1356 * ---------------------------------
1357 * 0 1 2 3 4 5 6 7
1358 * \_____________________/
1359 *
1360	 * Neither [1] nor [6] is aligned to the layer above.
1361	 * Left neighbour [0] is free, so mark it busy,
1362	 * decrease bb_counters and extend range to
1363	 * [0; 6].
1364	 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1365 * mark [6] free, increase bb_counters and shrink range to
1366 * [0; 5].
1367 * Then shift range to [0; 2], go up and do the same.
1368 */
1369
1370
1371 if (first & 1)
1372 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1373 if (!(last & 1))
1374 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1375 if (first > last)
1376 break;
1377 order++;
1378
1379 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1380 mb_clear_bits(buddy, first, last - first + 1);
1381 e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1382 break;
1383 }
1384 first >>= 1;
1385 last >>= 1;
1386 buddy = buddy2;
1387 }
1388}
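
Following the diagram comment with the concrete range [1, 6]: the free left neighbour [0] extends the range, the busy right neighbour [7] shrinks it, and the remaining aligned span moves up one order. A standalone trace of just that walk (border outcomes hard-coded to the diagram's scenario):

#include <stdio.h>

int main(void)
{
	/* Diagram scenario: freeing [1, 6]; neighbour 0 is free and
	 * neighbour 7 is busy, so the left border extends the range
	 * and the right border shrinks it before going up an order. */
	int first = 1, last = 6;

	if (first & 1)		/* unaligned left border */
		first -= 1;	/* free neighbour 0: extend to [0, 6] */
	if (!(last & 1))	/* unaligned right border */
		last -= 1;	/* busy neighbour 7: shrink to [0, 5] */
	first >>= 1;		/* next order now handles [0, 2], */
	last >>= 1;		/* matching the comment above */
	printf("next order: [%d, %d]\n", first, last);
	return 0;
}
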
1389
1265static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1390static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1266 int first, int count) 1391 int first, int count)
1267{ 1392{
1268 int block = 0; 1393 int left_is_free = 0;
1269 int max = 0; 1394 int right_is_free = 0;
1270 int order; 1395 int block;
1271 void *buddy; 1396 int last = first + count - 1;
1272 void *buddy2;
1273 struct super_block *sb = e4b->bd_sb; 1397 struct super_block *sb = e4b->bd_sb;
1274 1398
1275 BUG_ON(first + count > (sb->s_blocksize << 3)); 1399 BUG_ON(last >= (sb->s_blocksize << 3));
1276 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1400 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1277 mb_check_buddy(e4b); 1401 mb_check_buddy(e4b);
1278 mb_free_blocks_double(inode, e4b, first, count); 1402 mb_free_blocks_double(inode, e4b, first, count);
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1281 if (first < e4b->bd_info->bb_first_free) 1405 if (first < e4b->bd_info->bb_first_free)
1282 e4b->bd_info->bb_first_free = first; 1406 e4b->bd_info->bb_first_free = first;
1283 1407
1284 /* let's maintain fragments counter */ 1408 /* access memory sequentially: check left neighbour,
1409 * clear range and then check right neighbour
1410 */
1285 if (first != 0) 1411 if (first != 0)
1286 block = !mb_test_bit(first - 1, e4b->bd_bitmap); 1412 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1287 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1413 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1288 max = !mb_test_bit(first + count, e4b->bd_bitmap); 1414 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1289 if (block && max) 1415 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1290 e4b->bd_info->bb_fragments--;
1291 else if (!block && !max)
1292 e4b->bd_info->bb_fragments++;
1293 1416
1294 /* let's maintain buddy itself */ 1417 if (unlikely(block != -1)) {
1295 while (count-- > 0) { 1418 ext4_fsblk_t blocknr;
1296 block = first++;
1297 order = 0;
1298 1419
1299 if (!mb_test_bit(block, e4b->bd_bitmap)) { 1420 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1300 ext4_fsblk_t blocknr; 1421 blocknr += EXT4_C2B(EXT4_SB(sb), block);
1301 1422 ext4_grp_locked_error(sb, e4b->bd_group,
1302 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1423 inode ? inode->i_ino : 0,
1303 blocknr += EXT4_C2B(EXT4_SB(sb), block); 1424 blocknr,
1304 ext4_grp_locked_error(sb, e4b->bd_group, 1425 "freeing already freed block "
1305 inode ? inode->i_ino : 0, 1426 "(bit %u)", block);
1306 blocknr, 1427 mb_regenerate_buddy(e4b);
1307 "freeing already freed block " 1428 goto done;
1308 "(bit %u)", block); 1429 }
1309 }
1310 mb_clear_bit(block, e4b->bd_bitmap);
1311 e4b->bd_info->bb_counters[order]++;
1312
1313 /* start of the buddy */
1314 buddy = mb_find_buddy(e4b, order, &max);
1315
1316 do {
1317 block &= ~1UL;
1318 if (mb_test_bit(block, buddy) ||
1319 mb_test_bit(block + 1, buddy))
1320 break;
1321
1322 /* both the buddies are free, try to coalesce them */
1323 buddy2 = mb_find_buddy(e4b, order + 1, &max);
1324 1430
1325 if (!buddy2) 1431 /* let's maintain fragments counter */
1326 break; 1432 if (left_is_free && right_is_free)
1433 e4b->bd_info->bb_fragments--;
1434 else if (!left_is_free && !right_is_free)
1435 e4b->bd_info->bb_fragments++;
1327 1436
1328 if (order > 0) { 1437 /* buddy[0] == bd_bitmap is a special case, so handle
1329 /* for special purposes, we don't set 1438 * it right away and let mb_buddy_mark_free stay free of
1330 * free bits in bitmap */ 1439 * zero order checks.
1331	 mb_set_bit(block, buddy);	1440	 * Check if neighbours are to be coalesced,
1332 mb_set_bit(block + 1, buddy); 1441 * adjust bitmap bb_counters and borders appropriately.
1333 } 1442 */
1334 e4b->bd_info->bb_counters[order]--; 1443 if (first & 1) {
1335 e4b->bd_info->bb_counters[order]--; 1444 first += !left_is_free;
1445 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1446 }
1447 if (!(last & 1)) {
1448 last -= !right_is_free;
1449 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1450 }
1336 1451
1337 block = block >> 1; 1452 if (first <= last)
1338 order++; 1453 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1339 e4b->bd_info->bb_counters[order]++;
1340 1454
1341 mb_clear_bit(block, buddy2); 1455done:
1342 buddy = buddy2;
1343 } while (1);
1344 }
1345 mb_set_largest_free_order(sb, e4b->bd_info); 1456 mb_set_largest_free_order(sb, e4b->bd_info);
1346 mb_check_buddy(e4b); 1457 mb_check_buddy(e4b);
1347} 1458}
@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3342 if (pa->pa_type == MB_GROUP_PA) 3453 if (pa->pa_type == MB_GROUP_PA)
3343 grp_blk--; 3454 grp_blk--;
3344 3455
3345 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3456 grp = ext4_get_group_number(sb, grp_blk);
3346 3457
3347 /* 3458 /*
3348 * possible race: 3459 * possible race:
@@ -3807,7 +3918,7 @@ repeat:
3807 3918
3808 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3919 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3809 BUG_ON(pa->pa_type != MB_INODE_PA); 3920 BUG_ON(pa->pa_type != MB_INODE_PA);
3810 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 3921 group = ext4_get_group_number(sb, pa->pa_pstart);
3811 3922
3812 err = ext4_mb_load_buddy(sb, group, &e4b); 3923 err = ext4_mb_load_buddy(sb, group, &e4b);
3813 if (err) { 3924 if (err) {
@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4069 4180
4070 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4181 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4071 4182
4072 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4183 group = ext4_get_group_number(sb, pa->pa_pstart);
4073 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4184 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4074 ext4_error(sb, "Error loading buddy information for %u", 4185 ext4_error(sb, "Error loading buddy information for %u",
4075 group); 4186 group);
@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4217 unsigned int inquota = 0; 4328 unsigned int inquota = 0;
4218 unsigned int reserv_clstrs = 0; 4329 unsigned int reserv_clstrs = 0;
4219 4330
4331 might_sleep();
4220 sb = ar->inode->i_sb; 4332 sb = ar->inode->i_sb;
4221 sbi = EXT4_SB(sb); 4333 sbi = EXT4_SB(sb);
4222 4334
@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4420 node = rb_prev(new_node); 4532 node = rb_prev(new_node);
4421 if (node) { 4533 if (node) {
4422 entry = rb_entry(node, struct ext4_free_data, efd_node); 4534 entry = rb_entry(node, struct ext4_free_data, efd_node);
4423 if (can_merge(entry, new_entry)) { 4535 if (can_merge(entry, new_entry) &&
4536 ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4424 new_entry->efd_start_cluster = entry->efd_start_cluster; 4537 new_entry->efd_start_cluster = entry->efd_start_cluster;
4425 new_entry->efd_count += entry->efd_count; 4538 new_entry->efd_count += entry->efd_count;
4426 rb_erase(node, &(db->bb_free_root)); 4539 rb_erase(node, &(db->bb_free_root));
4427 ext4_journal_callback_del(handle, &entry->efd_jce);
4428 kmem_cache_free(ext4_free_data_cachep, entry); 4540 kmem_cache_free(ext4_free_data_cachep, entry);
4429 } 4541 }
4430 } 4542 }
@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4432 node = rb_next(new_node); 4544 node = rb_next(new_node);
4433 if (node) { 4545 if (node) {
4434 entry = rb_entry(node, struct ext4_free_data, efd_node); 4546 entry = rb_entry(node, struct ext4_free_data, efd_node);
4435 if (can_merge(new_entry, entry)) { 4547 if (can_merge(new_entry, entry) &&
4548 ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4436 new_entry->efd_count += entry->efd_count; 4549 new_entry->efd_count += entry->efd_count;
4437 rb_erase(node, &(db->bb_free_root)); 4550 rb_erase(node, &(db->bb_free_root));
4438 ext4_journal_callback_del(handle, &entry->efd_jce);
4439 kmem_cache_free(ext4_free_data_cachep, entry); 4551 kmem_cache_free(ext4_free_data_cachep, entry);
4440 } 4552 }
4441 } 4553 }
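
The new ext4_journal_callback_try_del() guard makes merging conditional: a neighbouring free-extent entry may only be absorbed if its journal callback can still be cancelled, presumably because once its transaction is committing, the callback will process the entry itself and merging would race with that. A toy model of the merge guard:

#include <stdbool.h>
#include <stdio.h>

/* Toy model: a neighbouring entry may only be absorbed if its
 * pending commit callback can still be cancelled. */
struct toy_entry {
	int start, count;
	bool committing;	/* transaction already in commit phase */
};

static bool toy_try_del(struct toy_entry *e)
{
	return !e->committing;	/* too late to cancel once committing */
}

static bool toy_merge(struct toy_entry *dst, struct toy_entry *src)
{
	if (dst->start + dst->count != src->start || !toy_try_del(src))
		return false;
	dst->count += src->count;
	return true;
}

int main(void)
{
	struct toy_entry a = { 0, 8, false };
	struct toy_entry b = { 8, 4, true };

	printf("merge while committing: %d\n", toy_merge(&a, &b));
	b.committing = false;
	printf("merge afterwards:       %d\n", toy_merge(&a, &b));
	return 0;
}
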
@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4470 int err = 0; 4582 int err = 0;
4471 int ret; 4583 int ret;
4472 4584
4585 might_sleep();
4473 if (bh) { 4586 if (bh) {
4474 if (block) 4587 if (block)
4475 BUG_ON(block != bh->b_blocknr); 4588 BUG_ON(block != bh->b_blocknr);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 480acf4a085f..49e8bdff9163 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
426 return retval; 426 return retval;
427 } 427 }
428 return retval; 428 return retval;
429
430} 429}
431 430
432int ext4_ext_migrate(struct inode *inode) 431int ext4_ext_migrate(struct inode *inode)
@@ -606,3 +605,64 @@ out:
606 605
607 return retval; 606 return retval;
608} 607}
608
609/*
610 * Migrate a simple extent-based inode to use the i_blocks[] array
611 */
612int ext4_ind_migrate(struct inode *inode)
613{
614 struct ext4_extent_header *eh;
615 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
616 struct ext4_inode_info *ei = EXT4_I(inode);
617 struct ext4_extent *ex;
618 unsigned int i, len;
619 ext4_fsblk_t blk;
620 handle_t *handle;
621 int ret;
622
623 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
624 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
625 (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
626 return -EINVAL;
627
628 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
629 EXT4_FEATURE_RO_COMPAT_BIGALLOC))
630 return -EOPNOTSUPP;
631
632 handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
633 if (IS_ERR(handle))
634 return PTR_ERR(handle);
635
636 down_write(&EXT4_I(inode)->i_data_sem);
637 ret = ext4_ext_check_inode(inode);
638 if (ret)
639 goto errout;
640
641 eh = ext_inode_hdr(inode);
642 ex = EXT_FIRST_EXTENT(eh);
643 if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
644 eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
645 ret = -EOPNOTSUPP;
646 goto errout;
647 }
648 if (eh->eh_entries == 0)
649 blk = len = 0;
650 else {
651 len = le16_to_cpu(ex->ee_len);
652 blk = ext4_ext_pblock(ex);
653 if (len > EXT4_NDIR_BLOCKS) {
654 ret = -EOPNOTSUPP;
655 goto errout;
656 }
657 }
658
659 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
660 memset(ei->i_data, 0, sizeof(ei->i_data));
661 for (i=0; i < len; i++)
662 ei->i_data[i] = cpu_to_le32(blk++);
663 ext4_mark_inode_dirty(handle, inode);
664errout:
665 ext4_journal_stop(handle);
666 up_write(&EXT4_I(inode)->i_data_sem);
667 return ret;
668}
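
ext4_ind_migrate() only handles the degenerate tree: depth 0 and at most one extent no longer than EXT4_NDIR_BLOCKS, because only then can the extent's blocks map one-to-one onto the direct pointers in i_data[]. A standalone sketch of that mapping (toy constant mirroring ext4's 12 direct slots):

#include <stdint.h>
#include <stdio.h>

#define TOY_NDIR_BLOCKS 12	/* direct slots, as in ext4's i_data[] */

int main(void)
{
	/* One contiguous extent of len <= 12 starting at pblk maps 1:1
	 * onto the direct block pointers; anything else is rejected
	 * with -EOPNOTSUPP in the real code. */
	uint32_t i_data[15] = { 0 };
	uint32_t pblk = 9000;
	unsigned int len = 5, i;

	if (len > TOY_NDIR_BLOCKS)
		return 1;
	for (i = 0; i < len; i++)
		i_data[i] = pblk + i;
	for (i = 0; i < len; i++)
		printf("i_data[%u] = %u\n", i, i_data[i]);
	return 0;
}
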
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f9b551561d2c..214461e42a05 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -7,7 +7,7 @@
7#include "ext4.h" 7#include "ext4.h"
8 8
9/* Checksumming functions */ 9/* Checksumming functions */
10static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) 10static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
11{ 11{
12 struct ext4_sb_info *sbi = EXT4_SB(sb); 12 struct ext4_sb_info *sbi = EXT4_SB(sb);
13 int offset = offsetof(struct mmp_struct, mmp_checksum); 13 int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
54 lock_buffer(bh); 54 lock_buffer(bh);
55 bh->b_end_io = end_buffer_write_sync; 55 bh->b_end_io = end_buffer_write_sync;
56 get_bh(bh); 56 get_bh(bh);
57 submit_bh(WRITE_SYNC, bh); 57 submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
58 wait_on_buffer(bh); 58 wait_on_buffer(bh);
59 sb_end_write(sb); 59 sb_end_write(sb);
60 if (unlikely(!buffer_uptodate(bh))) 60 if (unlikely(!buffer_uptodate(bh)))
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
86 get_bh(*bh); 86 get_bh(*bh);
87 lock_buffer(*bh); 87 lock_buffer(*bh);
88 (*bh)->b_end_io = end_buffer_read_sync; 88 (*bh)->b_end_io = end_buffer_read_sync;
89 submit_bh(READ_SYNC, *bh); 89 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
90 wait_on_buffer(*bh); 90 wait_on_buffer(*bh);
91 if (!buffer_uptodate(*bh)) { 91 if (!buffer_uptodate(*bh)) {
92 brelse(*bh); 92 brelse(*bh);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 33e1c086858b..3dcbf364022f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
144} 144}
145 145
146/** 146/**
147 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 147 * ext4_double_down_write_data_sem - Acquire two inodes' write lock
148 * of i_data_sem
148 * 149 *
149 * Acquire write lock of i_data_sem of the two inodes 150 * Acquire write lock of i_data_sem of the two inodes
150 */ 151 */
151static void 152void
152double_down_write_data_sem(struct inode *first, struct inode *second) 153ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
153{ 154{
154 if (first < second) { 155 if (first < second) {
155 down_write(&EXT4_I(first)->i_data_sem); 156 down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
162} 163}
163 164
164/** 165/**
165 * double_up_write_data_sem - Release two inodes' write lock of i_data_sem 166 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
166 * 167 *
167	 * @orig_inode: original inode structure whose lock is released first	168	 * @orig_inode: original inode structure whose lock is released first
168	 * @donor_inode: donor inode structure whose lock is released second	169	 * @donor_inode: donor inode structure whose lock is released second
169 * Release write lock of i_data_sem of two inodes (orig and donor). 170 * Release write lock of i_data_sem of two inodes (orig and donor).
170 */ 171 */
171static void 172void
172double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 173ext4_double_up_write_data_sem(struct inode *orig_inode,
174 struct inode *donor_inode)
173{ 175{
174 up_write(&EXT4_I(orig_inode)->i_data_sem); 176 up_write(&EXT4_I(orig_inode)->i_data_sem);
175 up_write(&EXT4_I(donor_inode)->i_data_sem); 177 up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
407 mext_insert_inside_block(o_start, o_end, start_ext, new_ext, 409 mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
408 end_ext, eh, range_to_move); 410 end_ext, eh, range_to_move);
409 411
410 if (depth) { 412 return ext4_ext_dirty(handle, orig_inode, orig_path);
411 ret = ext4_handle_dirty_metadata(handle, orig_inode,
412 orig_path->p_bh);
413 if (ret)
414 return ret;
415 } else {
416 ret = ext4_mark_inode_dirty(handle, orig_inode);
417 if (ret < 0)
418 return ret;
419 }
420
421 return 0;
422} 413}
423 414
424/** 415/**
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
737 donor_off += dext_alen; 728 donor_off += dext_alen;
738 orig_off += dext_alen; 729 orig_off += dext_alen;
739 730
731 BUG_ON(replaced_count > count);
740 /* Already moved the expected blocks */ 732 /* Already moved the expected blocks */
741 if (replaced_count >= count) 733 if (replaced_count >= count)
742 break; 734 break;
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
814 page_cache_release(page[0]); 806 page_cache_release(page[0]);
815 return -ENOMEM; 807 return -ENOMEM;
816 } 808 }
817 809 /*
810	 * grab_cache_page_write_begin() may not wait on the page's writeback if
811	 * the BDI does not demand that. But it is reasonable to be very conservative
812	 * here and explicitly wait on the page's writeback.
813 */
814 wait_on_page_writeback(page[0]);
815 wait_on_page_writeback(page[1]);
818 if (inode1 > inode2) { 816 if (inode1 > inode2) {
819 struct page *tmp; 817 struct page *tmp;
820 tmp = page[0]; 818 tmp = page[0];
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
856 if (buffer_uptodate(bh)) 854 if (buffer_uptodate(bh))
857 continue; 855 continue;
858 if (!buffer_mapped(bh)) { 856 if (!buffer_mapped(bh)) {
859 int err = 0;
860 err = ext4_get_block(inode, block, bh, 0); 857 err = ext4_get_block(inode, block, bh, 0);
861 if (err) { 858 if (err) {
862 SetPageError(page); 859 SetPageError(page);
@@ -976,7 +973,7 @@ again:
976 * necessary, just swap data blocks between orig and donor. 973 * necessary, just swap data blocks between orig and donor.
977 */ 974 */
978 if (uninit) { 975 if (uninit) {
979 double_down_write_data_sem(orig_inode, donor_inode); 976 ext4_double_down_write_data_sem(orig_inode, donor_inode);
980 /* If any of extents in range became initialized we have to 977 /* If any of extents in range became initialized we have to
981 * fallback to data copying */ 978 * fallback to data copying */
982 uninit = mext_check_coverage(orig_inode, orig_blk_offset, 979 uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +987,7 @@ again:
990 goto drop_data_sem; 987 goto drop_data_sem;
991 988
992 if (!uninit) { 989 if (!uninit) {
993 double_up_write_data_sem(orig_inode, donor_inode); 990 ext4_double_up_write_data_sem(orig_inode, donor_inode);
994 goto data_copy; 991 goto data_copy;
995 } 992 }
996 if ((page_has_private(pagep[0]) && 993 if ((page_has_private(pagep[0]) &&
@@ -1004,7 +1001,7 @@ again:
1004 donor_inode, orig_blk_offset, 1001 donor_inode, orig_blk_offset,
1005 block_len_in_page, err); 1002 block_len_in_page, err);
1006 drop_data_sem: 1003 drop_data_sem:
1007 double_up_write_data_sem(orig_inode, donor_inode); 1004 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1008 goto unlock_pages; 1005 goto unlock_pages;
1009 } 1006 }
1010data_copy: 1007data_copy:
@@ -1033,7 +1030,7 @@ data_copy:
1033 } 1030 }
1034 /* Perform all necessary steps similar write_begin()/write_end() 1031 /* Perform all necessary steps similar write_begin()/write_end()
1035 * but keeping in mind that i_size will not change */ 1032 * but keeping in mind that i_size will not change */
1036 *err = __block_write_begin(pagep[0], from, from + replaced_size, 1033 *err = __block_write_begin(pagep[0], from, replaced_size,
1037 ext4_get_block); 1034 ext4_get_block);
1038 if (!*err) 1035 if (!*err)
1039 *err = block_commit_write(pagep[0], from, from + replaced_size); 1036 *err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1065,11 +1062,11 @@ repair_branches:
1065 * Extents are swapped already, but we are not able to copy data. 1062 * Extents are swapped already, but we are not able to copy data.
1066	 * Try to swap extents back to their original places	1063	 * Try to swap extents back to their original places
1067 */ 1064 */
1068 double_down_write_data_sem(orig_inode, donor_inode); 1065 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1069 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, 1066 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
1070 orig_blk_offset, 1067 orig_blk_offset,
1071 block_len_in_page, &err2); 1068 block_len_in_page, &err2);
1072 double_up_write_data_sem(orig_inode, donor_inode); 1069 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1073 if (replaced_count != block_len_in_page) { 1070 if (replaced_count != block_len_in_page) {
1074 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), 1071 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
1075 "Unable to copy data block," 1072 "Unable to copy data block,"
@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode,
1209} 1206}
1210 1207
1211/** 1208/**
1212 * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 1209 * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
1213 * 1210 *
1214 * @inode1: the inode structure 1211 * @inode1: the inode structure
1215 * @inode2: the inode structure 1212 * @inode2: the inode structure
1216 * 1213 *
1217 * Lock two inodes' i_mutex 1214 * Lock two inodes' i_mutex
1218 */ 1215 */
1219static void 1216void
1220mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1217ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
1221{ 1218{
1222 BUG_ON(inode1 == inode2); 1219 BUG_ON(inode1 == inode2);
1223 if (inode1 < inode2) { 1220 if (inode1 < inode2) {
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1230} 1227}
1231 1228
1232/** 1229/**
1233 * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 1230 * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
1234 * 1231 *
1235 * @inode1: the inode that is released first 1232 * @inode1: the inode that is released first
1236 * @inode2: the inode that is released second 1233 * @inode2: the inode that is released second
1237 * 1234 *
1238 */ 1235 */
1239 1236
1240static void 1237void
1241mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1238ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1242{ 1239{
1243 mutex_unlock(&inode1->i_mutex); 1240 mutex_unlock(&inode1->i_mutex);
1244 mutex_unlock(&inode2->i_mutex); 1241 mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1333 return -EINVAL; 1330 return -EINVAL;
1334 } 1331 }
1335 /* Protect orig and donor inodes against a truncate */ 1332 /* Protect orig and donor inodes against a truncate */
1336 mext_inode_double_lock(orig_inode, donor_inode); 1333 ext4_inode_double_lock(orig_inode, donor_inode);
1337 1334
1338 /* Wait for all existing dio workers */ 1335 /* Wait for all existing dio workers */
1339 ext4_inode_block_unlocked_dio(orig_inode); 1336 ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1342 inode_dio_wait(donor_inode); 1339 inode_dio_wait(donor_inode);
1343 1340
1344 /* Protect extent tree against block allocations via delalloc */ 1341 /* Protect extent tree against block allocations via delalloc */
1345 double_down_write_data_sem(orig_inode, donor_inode); 1342 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1346 /* Check the filesystem environment whether move_extent can be done */ 1343 /* Check the filesystem environment whether move_extent can be done */
1347 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1344 ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
1348 donor_start, &len); 1345 donor_start, &len);
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1466 * b. racing with ->readpage, ->write_begin, and ext4_get_block 1463 * b. racing with ->readpage, ->write_begin, and ext4_get_block
1467 * in move_extent_per_page 1464 * in move_extent_per_page
1468 */ 1465 */
1469 double_up_write_data_sem(orig_inode, donor_inode); 1466 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1470 1467
1471 while (orig_page_offset <= seq_end_page) { 1468 while (orig_page_offset <= seq_end_page) {
1472 1469
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1500 block_len_in_page = rest_blocks; 1497 block_len_in_page = rest_blocks;
1501 } 1498 }
1502 1499
1503 double_down_write_data_sem(orig_inode, donor_inode); 1500 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1504 if (ret < 0) 1501 if (ret < 0)
1505 break; 1502 break;
1506 1503
@@ -1538,10 +1535,10 @@ out:
1538 ext4_ext_drop_refs(holecheck_path); 1535 ext4_ext_drop_refs(holecheck_path);
1539 kfree(holecheck_path); 1536 kfree(holecheck_path);
1540 } 1537 }
1541 double_up_write_data_sem(orig_inode, donor_inode); 1538 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1542 ext4_inode_resume_unlocked_dio(orig_inode); 1539 ext4_inode_resume_unlocked_dio(orig_inode);
1543 ext4_inode_resume_unlocked_dio(donor_inode); 1540 ext4_inode_resume_unlocked_dio(donor_inode);
1544 mext_inode_double_unlock(orig_inode, donor_inode); 1541 ext4_inode_double_unlock(orig_inode, donor_inode);
1545 1542
1546 return ret; 1543 return ret;
1547} 1544}
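
A note on the renamed locking helpers above: ext4_inode_double_lock() avoids ABBA deadlocks by always taking the two i_mutexes in address order, as the inode1 < inode2 branch shows. A minimal userspace sketch of the same pattern, using pthread mutexes and illustrative names rather than kernel API:

/* Address-ordered double lock, as in ext4_inode_double_lock(): the
 * object with the lower address is always locked first, so two tasks
 * locking the same pair can never each hold one lock and wait for the
 * other.  Userspace stand-ins, not kernel API. */
#include <pthread.h>

struct obj {
	pthread_mutex_t lock;
};

static void double_lock(struct obj *a, struct obj *b)
{
	if (a > b) {			/* normalize to address order */
		struct obj *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct obj *a, struct obj *b)
{
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct obj x = { PTHREAD_MUTEX_INITIALIZER };
	struct obj y = { PTHREAD_MUTEX_INITIALIZER };

	double_lock(&x, &y);		/* same order as double_lock(&y, &x) */
	double_unlock(&x, &y);
	return 0;
}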
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3825d6aa8336..6653fc35ecb7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
416{ 416{
417 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 417 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
418 struct ext4_inode_info *ei = EXT4_I(inode); 418 struct ext4_inode_info *ei = EXT4_I(inode);
419 __u32 csum, old_csum; 419 __u32 csum;
420 __le32 save_csum;
420 int size; 421 int size;
421 422
422 size = count_offset + (count * sizeof(struct dx_entry)); 423 size = count_offset + (count * sizeof(struct dx_entry));
423 old_csum = t->dt_checksum; 424 save_csum = t->dt_checksum;
424 t->dt_checksum = 0; 425 t->dt_checksum = 0;
425 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); 426 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
426 csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); 427 csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
427 t->dt_checksum = old_csum; 428 t->dt_checksum = save_csum;
428 429
429 return cpu_to_le32(csum); 430 return cpu_to_le32(csum);
430} 431}
@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
971 hinfo.hash_version += 972 hinfo.hash_version +=
972 EXT4_SB(dir->i_sb)->s_hash_unsigned; 973 EXT4_SB(dir->i_sb)->s_hash_unsigned;
973 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 974 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
975 if (ext4_has_inline_data(dir)) {
976 int has_inline_data = 1;
977 count = htree_inlinedir_to_tree(dir_file, dir, 0,
978 &hinfo, start_hash,
979 start_minor_hash,
980 &has_inline_data);
981 if (has_inline_data) {
982 *next_hash = ~0;
983 return count;
984 }
985 }
974 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 986 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
975 start_hash, start_minor_hash); 987 start_hash, start_minor_hash);
976 *next_hash = ~0; 988 *next_hash = ~0;
@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
1455 return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); 1467 return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
1456} 1468}
1457 1469
1458#define S_SHIFT 12
1459static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1460 [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
1461 [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1462 [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
1463 [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
1464 [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1465 [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1466 [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
1467};
1468
1469static inline void ext4_set_de_type(struct super_block *sb,
1470 struct ext4_dir_entry_2 *de,
1471 umode_t mode) {
1472 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1473 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1474}
1475
1476/* 1470/*
1477 * Move count entries from end of map between two memory locations. 1471 * Move count entries from end of map between two memory locations.
1478 * Returns pointer to last entry moved. 1472 * Returns pointer to last entry moved.
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2251 dquot_initialize(dir); 2245 dquot_initialize(dir);
2252 2246
2253 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2247 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2254 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2248 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
2255 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2256retry: 2249retry:
2257 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, 2250 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
2258 NULL, EXT4_HT_DIR, credits); 2251 NULL, EXT4_HT_DIR, credits);
@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
2286 dquot_initialize(dir); 2279 dquot_initialize(dir);
2287 2280
2288 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2281 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2289 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2282 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
2290 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2291retry: 2283retry:
2292 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, 2284 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
2293 NULL, EXT4_HT_DIR, credits); 2285 NULL, EXT4_HT_DIR, credits);
@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2396 dquot_initialize(dir); 2388 dquot_initialize(dir);
2397 2389
2398 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2390 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2399 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2391 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
2400 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2401retry: 2392retry:
2402 inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, 2393 inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
2403 &dentry->d_name, 2394 &dentry->d_name,
@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
2826 * quota blocks, sb is already counted in previous macros). 2817 * quota blocks, sb is already counted in previous macros).
2827 */ 2818 */
2828 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2819 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2829 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2820 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
2830 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2831 } 2821 }
2832retry: 2822retry:
2833 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, 2823 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 047a6de04a0a..5929cd0baa20 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -29,25 +29,19 @@
29#include "xattr.h" 29#include "xattr.h"
30#include "acl.h" 30#include "acl.h"
31 31
32static struct kmem_cache *io_page_cachep, *io_end_cachep; 32static struct kmem_cache *io_end_cachep;
33 33
34int __init ext4_init_pageio(void) 34int __init ext4_init_pageio(void)
35{ 35{
36 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
37 if (io_page_cachep == NULL)
38 return -ENOMEM;
39 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 36 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
40 if (io_end_cachep == NULL) { 37 if (io_end_cachep == NULL)
41 kmem_cache_destroy(io_page_cachep);
42 return -ENOMEM; 38 return -ENOMEM;
43 }
44 return 0; 39 return 0;
45} 40}
46 41
47void ext4_exit_pageio(void) 42void ext4_exit_pageio(void)
48{ 43{
49 kmem_cache_destroy(io_end_cachep); 44 kmem_cache_destroy(io_end_cachep);
50 kmem_cache_destroy(io_page_cachep);
51} 45}
52 46
53/* 47/*
@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode)
67 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 61 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
68} 62}
69 63
70static void put_io_page(struct ext4_io_page *io_page) 64static void ext4_release_io_end(ext4_io_end_t *io_end)
71{ 65{
72 if (atomic_dec_and_test(&io_page->p_count)) { 66 BUG_ON(!list_empty(&io_end->list));
73 end_page_writeback(io_page->p_page); 67 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
74 put_page(io_page->p_page); 68
75 kmem_cache_free(io_page_cachep, io_page); 69 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
76 } 70 wake_up_all(ext4_ioend_wq(io_end->inode));
71 if (io_end->flag & EXT4_IO_END_DIRECT)
72 inode_dio_done(io_end->inode);
73 if (io_end->iocb)
74 aio_complete(io_end->iocb, io_end->result, 0);
75 kmem_cache_free(io_end_cachep, io_end);
77} 76}
78 77
79void ext4_free_io_end(ext4_io_end_t *io) 78static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
80{ 79{
81 int i; 80 struct inode *inode = io_end->inode;
82
83 BUG_ON(!io);
84 BUG_ON(!list_empty(&io->list));
85 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
86 81
87 for (i = 0; i < io->num_io_pages; i++) 82 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
88 put_io_page(io->pages[i]); 83 /* Wake up anyone waiting on unwritten extent conversion */
89 io->num_io_pages = 0; 84 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
90 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 85 wake_up_all(ext4_ioend_wq(inode));
91 wake_up_all(ext4_ioend_wq(io->inode));
92 kmem_cache_free(io_end_cachep, io);
93} 86}
94 87
95/* check a range of space and convert unwritten extents to written. */ 88/* check a range of space and convert unwritten extents to written. */
@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io)
112 "(inode %lu, offset %llu, size %zd, error %d)", 105 "(inode %lu, offset %llu, size %zd, error %d)",
113 inode->i_ino, offset, size, ret); 106 inode->i_ino, offset, size, ret);
114 } 107 }
115 /* Wake up anyone waiting on unwritten extent conversion */ 108 ext4_clear_io_unwritten_flag(io);
116 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 109 ext4_release_io_end(io);
117 wake_up_all(ext4_ioend_wq(inode));
118 if (io->flag & EXT4_IO_END_DIRECT)
119 inode_dio_done(inode);
120 if (io->iocb)
121 aio_complete(io->iocb, io->result, 0);
122 return ret; 110 return ret;
123} 111}
124 112
@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode)
149} 137}
150 138
151/* Add the io_end to per-inode completed end_io list. */ 139/* Add the io_end to per-inode completed end_io list. */
152void ext4_add_complete_io(ext4_io_end_t *io_end) 140static void ext4_add_complete_io(ext4_io_end_t *io_end)
153{ 141{
154 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 142 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
155 struct workqueue_struct *wq; 143 struct workqueue_struct *wq;
@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
186 err = ext4_end_io(io); 174 err = ext4_end_io(io);
187 if (unlikely(!ret && err)) 175 if (unlikely(!ret && err))
188 ret = err; 176 ret = err;
189 io->flag &= ~EXT4_IO_END_UNWRITTEN;
190 ext4_free_io_end(io);
191 } 177 }
192 return ret; 178 return ret;
193} 179}
@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
219 atomic_inc(&EXT4_I(inode)->i_ioend_count); 205 atomic_inc(&EXT4_I(inode)->i_ioend_count);
220 io->inode = inode; 206 io->inode = inode;
221 INIT_LIST_HEAD(&io->list); 207 INIT_LIST_HEAD(&io->list);
208 atomic_set(&io->count, 1);
222 } 209 }
223 return io; 210 return io;
224} 211}
225 212
213void ext4_put_io_end_defer(ext4_io_end_t *io_end)
214{
215 if (atomic_dec_and_test(&io_end->count)) {
216 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
217 ext4_release_io_end(io_end);
218 return;
219 }
220 ext4_add_complete_io(io_end);
221 }
222}
223
224int ext4_put_io_end(ext4_io_end_t *io_end)
225{
226 int err = 0;
227
228 if (atomic_dec_and_test(&io_end->count)) {
229 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
230 err = ext4_convert_unwritten_extents(io_end->inode,
231 io_end->offset, io_end->size);
232 ext4_clear_io_unwritten_flag(io_end);
233 }
234 ext4_release_io_end(io_end);
235 }
236 return err;
237}
238
239ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
240{
241 atomic_inc(&io_end->count);
242 return io_end;
243}
244
226/* 245/*
 227 * Print a buffer I/O error compatible with fs/buffer.c. This 246 * Print a buffer I/O error compatible with fs/buffer.c. This
228 * provides compatibility with dmesg scrapers that look for a specific 247 * provides compatibility with dmesg scrapers that look for a specific
@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error)
243 ext4_io_end_t *io_end = bio->bi_private; 262 ext4_io_end_t *io_end = bio->bi_private;
244 struct inode *inode; 263 struct inode *inode;
245 int i; 264 int i;
265 int blocksize;
246 sector_t bi_sector = bio->bi_sector; 266 sector_t bi_sector = bio->bi_sector;
247 267
248 BUG_ON(!io_end); 268 BUG_ON(!io_end);
269 inode = io_end->inode;
270 blocksize = 1 << inode->i_blkbits;
249 bio->bi_private = NULL; 271 bio->bi_private = NULL;
250 bio->bi_end_io = NULL; 272 bio->bi_end_io = NULL;
251 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 273 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
252 error = 0; 274 error = 0;
253 bio_put(bio); 275 for (i = 0; i < bio->bi_vcnt; i++) {
254 276 struct bio_vec *bvec = &bio->bi_io_vec[i];
255 for (i = 0; i < io_end->num_io_pages; i++) { 277 struct page *page = bvec->bv_page;
256 struct page *page = io_end->pages[i]->p_page;
257 struct buffer_head *bh, *head; 278 struct buffer_head *bh, *head;
258 loff_t offset; 279 unsigned bio_start = bvec->bv_offset;
259 loff_t io_end_offset; 280 unsigned bio_end = bio_start + bvec->bv_len;
281 unsigned under_io = 0;
282 unsigned long flags;
283
284 if (!page)
285 continue;
260 286
261 if (error) { 287 if (error) {
262 SetPageError(page); 288 SetPageError(page);
263 set_bit(AS_EIO, &page->mapping->flags); 289 set_bit(AS_EIO, &page->mapping->flags);
264 head = page_buffers(page);
265 BUG_ON(!head);
266
267 io_end_offset = io_end->offset + io_end->size;
268
269 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
270 bh = head;
271 do {
272 if ((offset >= io_end->offset) &&
273 (offset+bh->b_size <= io_end_offset))
274 buffer_io_error(bh);
275
276 offset += bh->b_size;
277 bh = bh->b_this_page;
278 } while (bh != head);
279 } 290 }
280 291 bh = head = page_buffers(page);
281 put_io_page(io_end->pages[i]); 292 /*
293 * We check all buffers in the page under BH_Uptodate_Lock
 294 * to avoid races with other end_io handlers clearing async_write flags
295 */
296 local_irq_save(flags);
297 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
298 do {
299 if (bh_offset(bh) < bio_start ||
300 bh_offset(bh) + blocksize > bio_end) {
301 if (buffer_async_write(bh))
302 under_io++;
303 continue;
304 }
305 clear_buffer_async_write(bh);
306 if (error)
307 buffer_io_error(bh);
308 } while ((bh = bh->b_this_page) != head);
309 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
310 local_irq_restore(flags);
311 if (!under_io)
312 end_page_writeback(page);
282 } 313 }
283 io_end->num_io_pages = 0; 314 bio_put(bio);
284 inode = io_end->inode;
285 315
286 if (error) { 316 if (error) {
287 io_end->flag |= EXT4_IO_END_ERROR; 317 io_end->flag |= EXT4_IO_END_ERROR;
@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
294 bi_sector >> (inode->i_blkbits - 9)); 324 bi_sector >> (inode->i_blkbits - 9));
295 } 325 }
296 326
297 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 327 ext4_put_io_end_defer(io_end);
298 ext4_free_io_end(io_end);
299 return;
300 }
301
302 ext4_add_complete_io(io_end);
303} 328}
304 329
305void ext4_io_submit(struct ext4_io_submit *io) 330void ext4_io_submit(struct ext4_io_submit *io)
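
The rewritten completion path above clears BH_Async_Write under BH_Uptodate_Lock so that two bios finishing against the same page serialize and only the last one ends page writeback. A userspace model of that scan, assuming a circular buffer list like b_this_page and a plain spin flag standing in for the bit lock:

/* Userspace model of the scan in ext4_end_bio(): every buffer of the
 * page is checked under one per-page spin flag (standing in for
 * BH_Uptodate_Lock) and writeback ends only when no buffer is still
 * marked for async write.  All names are illustrative. */
#include <stdatomic.h>
#include <stdbool.h>

struct buf {
	struct buf *b_this_page;	/* circular list, as in the kernel */
	unsigned offset;		/* byte offset within the page */
	bool async_write;		/* stand-in for the BH_Async_Write bit */
};

struct page_model {
	atomic_flag uptodate_lock;
	bool writeback;
};

static void end_bio_range(struct page_model *pg, struct buf *head,
			  unsigned bio_start, unsigned bio_end,
			  unsigned blocksize)
{
	struct buf *bh = head;
	unsigned under_io = 0;

	while (atomic_flag_test_and_set(&pg->uptodate_lock))
		;			/* spin, like bit_spin_lock() */
	do {
		if (bh->offset < bio_start ||
		    bh->offset + blocksize > bio_end) {
			if (bh->async_write)
				under_io++;	/* another bio still pending */
			continue;	/* advance happens in the condition */
		}
		bh->async_write = false;	/* this bio covered it */
	} while ((bh = bh->b_this_page) != head);
	atomic_flag_clear(&pg->uptodate_lock);

	if (!under_io)
		pg->writeback = false;		/* like end_page_writeback() */
}

int main(void)
{
	struct buf b[2] = {
		{ &b[1], 0,    true },
		{ &b[0], 2048, true },
	};
	struct page_model pg = { ATOMIC_FLAG_INIT, true };

	end_bio_range(&pg, &b[0], 0, 2048, 2048);	/* first half done */
	end_bio_range(&pg, &b[0], 2048, 4096, 2048);	/* page complete */
	return pg.writeback;				/* 0 */
}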
@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io)
313 bio_put(io->io_bio); 338 bio_put(io->io_bio);
314 } 339 }
315 io->io_bio = NULL; 340 io->io_bio = NULL;
316 io->io_op = 0; 341}
342
343void ext4_io_submit_init(struct ext4_io_submit *io,
344 struct writeback_control *wbc)
345{
346 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
347 io->io_bio = NULL;
317 io->io_end = NULL; 348 io->io_end = NULL;
318} 349}
319 350
320static int io_submit_init(struct ext4_io_submit *io, 351static int io_submit_init_bio(struct ext4_io_submit *io,
321 struct inode *inode, 352 struct buffer_head *bh)
322 struct writeback_control *wbc,
323 struct buffer_head *bh)
324{ 353{
325 ext4_io_end_t *io_end;
326 struct page *page = bh->b_page;
327 int nvecs = bio_get_nr_vecs(bh->b_bdev); 354 int nvecs = bio_get_nr_vecs(bh->b_bdev);
328 struct bio *bio; 355 struct bio *bio;
329 356
330 io_end = ext4_init_io_end(inode, GFP_NOFS);
331 if (!io_end)
332 return -ENOMEM;
333 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 357 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
334 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 358 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
335 bio->bi_bdev = bh->b_bdev; 359 bio->bi_bdev = bh->b_bdev;
336 bio->bi_private = io->io_end = io_end;
337 bio->bi_end_io = ext4_end_bio; 360 bio->bi_end_io = ext4_end_bio;
338 361 bio->bi_private = ext4_get_io_end(io->io_end);
339 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 362 if (!io->io_end->size)
340 363 io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
364 + bh_offset(bh);
341 io->io_bio = bio; 365 io->io_bio = bio;
342 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
343 io->io_next_block = bh->b_blocknr; 366 io->io_next_block = bh->b_blocknr;
344 return 0; 367 return 0;
345} 368}
346 369
347static int io_submit_add_bh(struct ext4_io_submit *io, 370static int io_submit_add_bh(struct ext4_io_submit *io,
348 struct ext4_io_page *io_page,
349 struct inode *inode, 371 struct inode *inode,
350 struct writeback_control *wbc,
351 struct buffer_head *bh) 372 struct buffer_head *bh)
352{ 373{
353 ext4_io_end_t *io_end; 374 ext4_io_end_t *io_end;
354 int ret; 375 int ret;
355 376
356 if (buffer_new(bh)) {
357 clear_buffer_new(bh);
358 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
359 }
360
361 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 377 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
362submit_and_retry: 378submit_and_retry:
363 ext4_io_submit(io); 379 ext4_io_submit(io);
364 } 380 }
365 if (io->io_bio == NULL) { 381 if (io->io_bio == NULL) {
366 ret = io_submit_init(io, inode, wbc, bh); 382 ret = io_submit_init_bio(io, bh);
367 if (ret) 383 if (ret)
368 return ret; 384 return ret;
369 } 385 }
370 io_end = io->io_end;
371 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
372 (io_end->pages[io_end->num_io_pages-1] != io_page))
373 goto submit_and_retry;
374 if (buffer_uninit(bh))
375 ext4_set_io_unwritten_flag(inode, io_end);
376 io->io_end->size += bh->b_size;
377 io->io_next_block++;
378 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 386 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
379 if (ret != bh->b_size) 387 if (ret != bh->b_size)
380 goto submit_and_retry; 388 goto submit_and_retry;
381 if ((io_end->num_io_pages == 0) || 389 io_end = io->io_end;
382 (io_end->pages[io_end->num_io_pages-1] != io_page)) { 390 if (test_clear_buffer_uninit(bh))
383 io_end->pages[io_end->num_io_pages++] = io_page; 391 ext4_set_io_unwritten_flag(inode, io_end);
384 atomic_inc(&io_page->p_count); 392 io_end->size += bh->b_size;
385 } 393 io->io_next_block++;
386 return 0; 394 return 0;
387} 395}
388 396
@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
392 struct writeback_control *wbc) 400 struct writeback_control *wbc)
393{ 401{
394 struct inode *inode = page->mapping->host; 402 struct inode *inode = page->mapping->host;
395 unsigned block_start, block_end, blocksize; 403 unsigned block_start, blocksize;
396 struct ext4_io_page *io_page;
397 struct buffer_head *bh, *head; 404 struct buffer_head *bh, *head;
398 int ret = 0; 405 int ret = 0;
406 int nr_submitted = 0;
399 407
400 blocksize = 1 << inode->i_blkbits; 408 blocksize = 1 << inode->i_blkbits;
401 409
402 BUG_ON(!PageLocked(page)); 410 BUG_ON(!PageLocked(page));
403 BUG_ON(PageWriteback(page)); 411 BUG_ON(PageWriteback(page));
404 412
405 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
406 if (!io_page) {
407 redirty_page_for_writepage(wbc, page);
408 unlock_page(page);
409 return -ENOMEM;
410 }
411 io_page->p_page = page;
412 atomic_set(&io_page->p_count, 1);
413 get_page(page);
414 set_page_writeback(page); 413 set_page_writeback(page);
415 ClearPageError(page); 414 ClearPageError(page);
416 415
417 for (bh = head = page_buffers(page), block_start = 0; 416 /*
418 bh != head || !block_start; 417 * In the first loop we prepare and mark buffers to submit. We have to
419 block_start = block_end, bh = bh->b_this_page) { 418 * mark all buffers in the page before submitting so that
 420 419 * end_page_writeback() cannot be called from ext4_end_bio() when IO
421 block_end = block_start + blocksize; 420 * on the first buffer finishes and we are still working on submitting
421 * the second buffer.
422 */
423 bh = head = page_buffers(page);
424 do {
425 block_start = bh_offset(bh);
422 if (block_start >= len) { 426 if (block_start >= len) {
423 /* 427 /*
424 * Comments copied from block_write_full_page_endio: 428 * Comments copied from block_write_full_page_endio:
@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
431 * mapped, and writes to that region are not written 435 * mapped, and writes to that region are not written
432 * out to the file." 436 * out to the file."
433 */ 437 */
434 zero_user_segment(page, block_start, block_end); 438 zero_user_segment(page, block_start,
439 block_start + blocksize);
435 clear_buffer_dirty(bh); 440 clear_buffer_dirty(bh);
436 set_buffer_uptodate(bh); 441 set_buffer_uptodate(bh);
437 continue; 442 continue;
@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
445 ext4_io_submit(io); 450 ext4_io_submit(io);
446 continue; 451 continue;
447 } 452 }
448 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 453 if (buffer_new(bh)) {
454 clear_buffer_new(bh);
455 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
456 }
457 set_buffer_async_write(bh);
458 } while ((bh = bh->b_this_page) != head);
459
460 /* Now submit buffers to write */
461 bh = head = page_buffers(page);
462 do {
463 if (!buffer_async_write(bh))
464 continue;
465 ret = io_submit_add_bh(io, inode, bh);
449 if (ret) { 466 if (ret) {
450 /* 467 /*
451 * We only get here on ENOMEM. Not much else 468 * We only get here on ENOMEM. Not much else
@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
455 redirty_page_for_writepage(wbc, page); 472 redirty_page_for_writepage(wbc, page);
456 break; 473 break;
457 } 474 }
475 nr_submitted++;
458 clear_buffer_dirty(bh); 476 clear_buffer_dirty(bh);
477 } while ((bh = bh->b_this_page) != head);
478
479 /* Error stopped previous loop? Clean up buffers... */
480 if (ret) {
481 do {
482 clear_buffer_async_write(bh);
483 bh = bh->b_this_page;
484 } while (bh != head);
459 } 485 }
460 unlock_page(page); 486 unlock_page(page);
461 /* 487 /* Nothing submitted - we have to end page writeback */
462 * If the page was truncated before we could do the writeback, 488 if (!nr_submitted)
463 * or we had a memory allocation error while trying to write 489 end_page_writeback(page);
464 * the first buffer head, we won't have submitted any pages for
465 * I/O. In that case we need to make sure we've cleared the
466 * PageWriteback bit from the page to prevent the system from
467 * wedging later on.
468 */
469 put_io_page(io_page);
470 return ret; 490 return ret;
471} 491}
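
With struct ext4_io_page gone, an io_end now lives and dies by the reference count added in this file: the submitter holds one reference, each bio takes another via ext4_get_io_end(), and the last put either frees the structure or, for unwritten extents, defers to the conversion path. A toy model of that lifecycle with illustrative names:

/* Toy model of the io_end lifecycle: one reference for the submitter,
 * one per bio, and the final put performs the completion work.
 * Illustrative names; the kernel uses atomic_t and kmem caches. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end {
	atomic_int count;
	int unwritten;	/* stand-in for EXT4_IO_END_UNWRITTEN */
};

static struct io_end *io_end_alloc(void)
{
	struct io_end *io = calloc(1, sizeof(*io));

	atomic_init(&io->count, 1);	/* the submitter's reference */
	return io;
}

static struct io_end *io_end_get(struct io_end *io)
{
	atomic_fetch_add(&io->count, 1);	/* e.g. one ref per bio */
	return io;
}

static void io_end_put(struct io_end *io)
{
	if (atomic_fetch_sub(&io->count, 1) == 1) {
		if (io->unwritten)
			puts("last ref: convert unwritten extents");
		free(io);
	}
}

int main(void)
{
	struct io_end *io = io_end_alloc();

	io->unwritten = 1;
	io_end_get(io);		/* bio takes a reference at submit */
	io_end_put(io);		/* bio completion drops it */
	io_end_put(io);		/* submitter's put does the conversion */
	return 0;
}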
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c169477a62c9..b27c96d01965 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -272,7 +272,7 @@ next_group:
272 if (start_blk >= last_blk) 272 if (start_blk >= last_blk)
273 goto next_group; 273 goto next_group;
274 group_data[bb_index].block_bitmap = start_blk++; 274 group_data[bb_index].block_bitmap = start_blk++;
275 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); 275 group = ext4_get_group_number(sb, start_blk - 1);
276 group -= group_data[0].group; 276 group -= group_data[0].group;
277 group_data[group].free_blocks_count--; 277 group_data[group].free_blocks_count--;
278 if (flexbg_size > 1) 278 if (flexbg_size > 1)
@@ -284,7 +284,7 @@ next_group:
284 if (start_blk >= last_blk) 284 if (start_blk >= last_blk)
285 goto next_group; 285 goto next_group;
286 group_data[ib_index].inode_bitmap = start_blk++; 286 group_data[ib_index].inode_bitmap = start_blk++;
287 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); 287 group = ext4_get_group_number(sb, start_blk - 1);
288 group -= group_data[0].group; 288 group -= group_data[0].group;
289 group_data[group].free_blocks_count--; 289 group_data[group].free_blocks_count--;
290 if (flexbg_size > 1) 290 if (flexbg_size > 1)
@@ -296,7 +296,7 @@ next_group:
296 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) 296 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
297 goto next_group; 297 goto next_group;
298 group_data[it_index].inode_table = start_blk; 298 group_data[it_index].inode_table = start_blk;
 299 ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); 299 group = ext4_get_group_number(sb, start_blk);
300 group -= group_data[0].group; 300 group -= group_data[0].group;
301 group_data[group].free_blocks_count -= 301 group_data[group].free_blocks_count -=
302 EXT4_SB(sb)->s_itb_per_group; 302 EXT4_SB(sb)->s_itb_per_group;
@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
392 ext4_group_t group; 392 ext4_group_t group;
393 int err; 393 int err;
394 394
395 ext4_get_group_no_and_offset(sb, block, &group, NULL); 395 group = ext4_get_group_number(sb, block);
396 start = ext4_group_first_block_no(sb, group); 396 start = ext4_group_first_block_no(sb, group);
397 group -= flex_gd->groups[0].group; 397 group -= flex_gd->groups[0].group;
398 398
@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
1341 1341
1342 /* Update the global fs size fields */ 1342 /* Update the global fs size fields */
1343 sbi->s_groups_count += flex_gd->count; 1343 sbi->s_groups_count += flex_gd->count;
1344 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
1345 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
1344 1346
1345 /* Update the reserved block counts only once the new group is 1347 /* Update the reserved block counts only once the new group is
1346 * active. */ 1348 * active. */
@@ -1879,7 +1881,11 @@ retry:
 1879 /* Nothing needs to be done */ 1881 /* Nothing needs to be done */
1880 return 0; 1882 return 0;
1881 1883
1882 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1884 n_group = ext4_get_group_number(sb, n_blocks_count - 1);
1885 if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
1886 ext4_warning(sb, "resize would cause inodes_count overflow");
1887 return -EINVAL;
1888 }
1883 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); 1889 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1884 1890
1885 n_desc_blocks = num_desc_blocks(sb, n_group + 1); 1891 n_desc_blocks = num_desc_blocks(sb, n_group + 1);
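
The repeated ext4_get_group_number() conversions above work because these callers only need the group index, not the offset within the group. Assuming the usual layout where group g starts at first_data_block + g * blocks_per_group, the helper reduces to one division; a sketch with illustrative field names:

/* Sketch of the group-number arithmetic, assuming group g starts at
 * first_data_block + g * blocks_per_group.  Field names illustrative. */
#include <stdint.h>

struct geom {
	uint64_t first_data_block;
	uint32_t blocks_per_group;
};

static uint32_t get_group_number(const struct geom *g, uint64_t block)
{
	return (uint32_t)((block - g->first_data_block) /
			  g->blocks_per_group);
}

int main(void)
{
	struct geom g = { 1, 8192 };	/* 1k-blocksize style layout */

	/* block 8192 is the last block of group 0 in this layout */
	return get_group_number(&g, 8192) == 0 ? 0 : 1;
}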
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5d6d53578124..dbc7c090c13a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
81static void ext4_destroy_lazyinit_thread(void); 81static void ext4_destroy_lazyinit_thread(void);
82static void ext4_unregister_li_request(struct super_block *sb); 82static void ext4_unregister_li_request(struct super_block *sb);
83static void ext4_clear_request_list(void); 83static void ext4_clear_request_list(void);
84static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
84 85
85#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
86static struct file_system_type ext2_fs_type = { 87static struct file_system_type ext2_fs_type = {
@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
353 struct super_block *sb = journal->j_private; 354 struct super_block *sb = journal->j_private;
354 struct ext4_sb_info *sbi = EXT4_SB(sb); 355 struct ext4_sb_info *sbi = EXT4_SB(sb);
355 int error = is_journal_aborted(journal); 356 int error = is_journal_aborted(journal);
356 struct ext4_journal_cb_entry *jce, *tmp; 357 struct ext4_journal_cb_entry *jce;
357 358
359 BUG_ON(txn->t_state == T_FINISHED);
358 spin_lock(&sbi->s_md_lock); 360 spin_lock(&sbi->s_md_lock);
359 list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { 361 while (!list_empty(&txn->t_private_list)) {
362 jce = list_entry(txn->t_private_list.next,
363 struct ext4_journal_cb_entry, jce_list);
360 list_del_init(&jce->jce_list); 364 list_del_init(&jce->jce_list);
361 spin_unlock(&sbi->s_md_lock); 365 spin_unlock(&sbi->s_md_lock);
362 jce->jce_func(sb, jce, error); 366 jce->jce_func(sb, jce, error);
@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1948 if ((sbi->s_es->s_feature_ro_compat & 1952 if ((sbi->s_es->s_feature_ro_compat &
1949 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { 1953 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
1950 /* Use new metadata_csum algorithm */ 1954 /* Use new metadata_csum algorithm */
1951 __u16 old_csum; 1955 __le16 save_csum;
1952 __u32 csum32; 1956 __u32 csum32;
1953 1957
1954 old_csum = gdp->bg_checksum; 1958 save_csum = gdp->bg_checksum;
1955 gdp->bg_checksum = 0; 1959 gdp->bg_checksum = 0;
1956 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, 1960 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
1957 sizeof(le_group)); 1961 sizeof(le_group));
1958 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, 1962 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
1959 sbi->s_desc_size); 1963 sbi->s_desc_size);
1960 gdp->bg_checksum = old_csum; 1964 gdp->bg_checksum = save_csum;
1961 1965
1962 crc = csum32 & 0xFFFF; 1966 crc = csum32 & 0xFFFF;
1963 goto out; 1967 goto out;
@@ -2379,17 +2383,15 @@ struct ext4_attr {
2379 int offset; 2383 int offset;
2380}; 2384};
2381 2385
2382static int parse_strtoul(const char *buf, 2386static int parse_strtoull(const char *buf,
2383 unsigned long max, unsigned long *value) 2387 unsigned long long max, unsigned long long *value)
2384{ 2388{
2385 char *endp; 2389 int ret;
2386
2387 *value = simple_strtoul(skip_spaces(buf), &endp, 0);
2388 endp = skip_spaces(endp);
2389 if (*endp || *value > max)
2390 return -EINVAL;
2391 2390
2392 return 0; 2391 ret = kstrtoull(skip_spaces(buf), 0, value);
2392 if (!ret && *value > max)
2393 ret = -EINVAL;
2394 return ret;
2393} 2395}
2394 2396
2395static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, 2397static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2431 const char *buf, size_t count) 2433 const char *buf, size_t count)
2432{ 2434{
2433 unsigned long t; 2435 unsigned long t;
2436 int ret;
2434 2437
2435 if (parse_strtoul(buf, 0x40000000, &t)) 2438 ret = kstrtoul(skip_spaces(buf), 0, &t);
2436 return -EINVAL; 2439 if (ret)
2440 return ret;
2437 2441
2438 if (t && !is_power_of_2(t)) 2442 if (t && (!is_power_of_2(t) || t > 0x40000000))
2439 return -EINVAL; 2443 return -EINVAL;
2440 2444
2441 sbi->s_inode_readahead_blks = t; 2445 sbi->s_inode_readahead_blks = t;
@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2456{ 2460{
2457 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2461 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2458 unsigned long t; 2462 unsigned long t;
2463 int ret;
2459 2464
2460 if (parse_strtoul(buf, 0xffffffff, &t)) 2465 ret = kstrtoul(skip_spaces(buf), 0, &t);
2461 return -EINVAL; 2466 if (ret)
2467 return ret;
2462 *ui = t; 2468 *ui = t;
2463 return count; 2469 return count;
2464} 2470}
2465 2471
2472static ssize_t reserved_clusters_show(struct ext4_attr *a,
2473 struct ext4_sb_info *sbi, char *buf)
2474{
2475 return snprintf(buf, PAGE_SIZE, "%llu\n",
2476 (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
2477}
2478
2479static ssize_t reserved_clusters_store(struct ext4_attr *a,
2480 struct ext4_sb_info *sbi,
2481 const char *buf, size_t count)
2482{
2483 unsigned long long val;
2484 int ret;
2485
2486 if (parse_strtoull(buf, -1ULL, &val))
2487 return -EINVAL;
2488 ret = ext4_reserve_clusters(sbi, val);
2489
2490 return ret ? ret : count;
2491}
2492
2466static ssize_t trigger_test_error(struct ext4_attr *a, 2493static ssize_t trigger_test_error(struct ext4_attr *a,
2467 struct ext4_sb_info *sbi, 2494 struct ext4_sb_info *sbi,
2468 const char *buf, size_t count) 2495 const char *buf, size_t count)
@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2500EXT4_RO_ATTR(delayed_allocation_blocks); 2527EXT4_RO_ATTR(delayed_allocation_blocks);
2501EXT4_RO_ATTR(session_write_kbytes); 2528EXT4_RO_ATTR(session_write_kbytes);
2502EXT4_RO_ATTR(lifetime_write_kbytes); 2529EXT4_RO_ATTR(lifetime_write_kbytes);
2530EXT4_RW_ATTR(reserved_clusters);
2503EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2531EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2504 inode_readahead_blks_store, s_inode_readahead_blks); 2532 inode_readahead_blks_store, s_inode_readahead_blks);
2505EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2533EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
2517 ATTR_LIST(delayed_allocation_blocks), 2545 ATTR_LIST(delayed_allocation_blocks),
2518 ATTR_LIST(session_write_kbytes), 2546 ATTR_LIST(session_write_kbytes),
2519 ATTR_LIST(lifetime_write_kbytes), 2547 ATTR_LIST(lifetime_write_kbytes),
2548 ATTR_LIST(reserved_clusters),
2520 ATTR_LIST(inode_readahead_blks), 2549 ATTR_LIST(inode_readahead_blks),
2521 ATTR_LIST(inode_goal), 2550 ATTR_LIST(inode_goal),
2522 ATTR_LIST(mb_stats), 2551 ATTR_LIST(mb_stats),
@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
3192 return 0; 3221 return 0;
3193} 3222}
3194 3223
3224
3225static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
3226{
3227 ext4_fsblk_t resv_clusters;
3228
3229 /*
3230 * By default we reserve 2% or 4096 clusters, whichever is smaller.
 3231 * This should cover situations where we cannot afford to run
 3232 * out of space, such as a punch hole or converting
 3233 * uninitialized extents in the delalloc path. In most cases such
 3234 * an allocation would require 1 or 2 blocks; higher numbers are
 3235 * very rare.
3236 */
3237 resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
3238
3239 do_div(resv_clusters, 50);
3240 resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3241
3242 return resv_clusters;
3243}
3244
3245
3246static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
3247{
3248 ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
3249 sbi->s_cluster_bits;
3250
3251 if (count >= clusters)
3252 return -EINVAL;
3253
3254 atomic64_set(&sbi->s_resv_clusters, count);
3255 return 0;
3256}
3257
3195static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3258static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3196{ 3259{
3197 char *orig_data = kstrdup(data, GFP_KERNEL); 3260 char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3526 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3589 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3527 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3590 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3528 3591
 3592 /* Do we have the standard group size of blocksize * 8 blocks? */
3593 if (sbi->s_blocks_per_group == blocksize << 3)
3594 set_opt2(sb, STD_GROUP_SIZE);
3595
3529 for (i = 0; i < 4; i++) 3596 for (i = 0; i < 4; i++)
3530 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3597 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3531 sbi->s_def_hash_version = es->s_def_hash_version; 3598 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3698 sbi->s_err_report.function = print_daily_error_info; 3765 sbi->s_err_report.function = print_daily_error_info;
3699 sbi->s_err_report.data = (unsigned long) sb; 3766 sbi->s_err_report.data = (unsigned long) sb;
3700 3767
3768 /* Register extent status tree shrinker */
3769 ext4_es_register_shrinker(sb);
3770
3701 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3771 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3702 ext4_count_free_clusters(sb)); 3772 ext4_count_free_clusters(sb));
3703 if (!err) { 3773 if (!err) {
@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3723 sbi->s_max_writeback_mb_bump = 128; 3793 sbi->s_max_writeback_mb_bump = 128;
3724 sbi->s_extent_max_zeroout_kb = 32; 3794 sbi->s_extent_max_zeroout_kb = 32;
3725 3795
3726 /* Register extent status tree shrinker */
3727 ext4_es_register_shrinker(sb);
3728
3729 /* 3796 /*
3730 * set up enough so that it can read an inode 3797 * set up enough so that it can read an inode
3731 */ 3798 */
@@ -3911,6 +3978,13 @@ no_journal:
3911 "available"); 3978 "available");
3912 } 3979 }
3913 3980
3981 err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
3982 if (err) {
3983 ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
3984 "reserved pool", ext4_calculate_resv_clusters(sbi));
3985 goto failed_mount4a;
3986 }
3987
3914 err = ext4_setup_system_zone(sb); 3988 err = ext4_setup_system_zone(sb);
3915 if (err) { 3989 if (err) {
3916 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3990 ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4010,6 +4084,7 @@ failed_mount_wq:
4010 sbi->s_journal = NULL; 4084 sbi->s_journal = NULL;
4011 } 4085 }
4012failed_mount3: 4086failed_mount3:
4087 ext4_es_unregister_shrinker(sb);
4013 del_timer(&sbi->s_err_report); 4088 del_timer(&sbi->s_err_report);
4014 if (sbi->s_flex_groups) 4089 if (sbi->s_flex_groups)
4015 ext4_kvfree(sbi->s_flex_groups); 4090 ext4_kvfree(sbi->s_flex_groups);
@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
4177 goto out_bdev; 4252 goto out_bdev;
4178 } 4253 }
4179 journal->j_private = sb; 4254 journal->j_private = sb;
4180 ll_rw_block(READ, 1, &journal->j_sb_buffer); 4255 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
4181 wait_on_buffer(journal->j_sb_buffer); 4256 wait_on_buffer(journal->j_sb_buffer);
4182 if (!buffer_uptodate(journal->j_sb_buffer)) { 4257 if (!buffer_uptodate(journal->j_sb_buffer)) {
4183 ext4_msg(sb, KERN_ERR, "I/O error on journal device"); 4258 ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4742 struct super_block *sb = dentry->d_sb; 4817 struct super_block *sb = dentry->d_sb;
4743 struct ext4_sb_info *sbi = EXT4_SB(sb); 4818 struct ext4_sb_info *sbi = EXT4_SB(sb);
4744 struct ext4_super_block *es = sbi->s_es; 4819 struct ext4_super_block *es = sbi->s_es;
4745 ext4_fsblk_t overhead = 0; 4820 ext4_fsblk_t overhead = 0, resv_blocks;
4746 u64 fsid; 4821 u64 fsid;
4747 s64 bfree; 4822 s64 bfree;
4823 resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
4748 4824
4749 if (!test_opt(sb, MINIX_DF)) 4825 if (!test_opt(sb, MINIX_DF))
4750 overhead = sbi->s_overhead; 4826 overhead = sbi->s_overhead;
@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4756 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4832 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
 4757 /* prevent underflow when little free space is available */ 4833 /* prevent underflow when little free space is available */
4758 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); 4834 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
4759 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4835 buf->f_bavail = buf->f_bfree -
4760 if (buf->f_bfree < ext4_r_blocks_count(es)) 4836 (ext4_r_blocks_count(es) + resv_blocks);
4837 if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
4761 buf->f_bavail = 0; 4838 buf->f_bavail = 0;
4762 buf->f_files = le32_to_cpu(es->s_inodes_count); 4839 buf->f_files = le32_to_cpu(es->s_inodes_count);
4763 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 4840 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
4945 return PTR_ERR(qf_inode); 5022 return PTR_ERR(qf_inode);
4946 } 5023 }
4947 5024
5025 /* Don't account quota for quota files to avoid recursion */
5026 qf_inode->i_flags |= S_NOQUOTA;
4948 err = dquot_enable(qf_inode, type, format_id, flags); 5027 err = dquot_enable(qf_inode, type, format_id, flags);
4949 iput(qf_inode); 5028 iput(qf_inode);
4950 5029
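
The default reservation in ext4_calculate_resv_clusters() above is 2% of the cluster count (hence the do_div by 50), capped at 4096 clusters. The same computation as a self-contained program:

/* The default reservation: 2% of the cluster count, capped at 4096. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t default_resv_clusters(uint64_t blocks, unsigned cluster_bits)
{
	uint64_t resv = (blocks >> cluster_bits) / 50;	/* 2% of clusters */

	return resv < 4096 ? resv : 4096;		/* whichever is smaller */
}

int main(void)
{
	/* 1 TiB of 4k blocks, no bigalloc: the 4096 cap applies */
	printf("%" PRIu64 "\n", default_resv_clusters(268435456, 0));
	/* a 100 MiB filesystem: the 2% rule applies */
	printf("%" PRIu64 "\n", default_resv_clusters(25600, 0));
	return 0;
}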
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a120b277240..c081e34f717f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
122 struct ext4_xattr_header *hdr) 122 struct ext4_xattr_header *hdr)
123{ 123{
124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
125 __u32 csum, old; 125 __u32 csum;
126 __le32 save_csum;
127 __le64 dsk_block_nr = cpu_to_le64(block_nr);
126 128
127 old = hdr->h_checksum; 129 save_csum = hdr->h_checksum;
128 hdr->h_checksum = 0; 130 hdr->h_checksum = 0;
129 block_nr = cpu_to_le64(block_nr); 131 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
130 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, 132 sizeof(dsk_block_nr));
131 sizeof(block_nr));
132 csum = ext4_chksum(sbi, csum, (__u8 *)hdr, 133 csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
133 EXT4_BLOCK_SIZE(inode->i_sb)); 134 EXT4_BLOCK_SIZE(inode->i_sb));
134 135
135 hdr->h_checksum = old; 136 hdr->h_checksum = save_csum;
136 return cpu_to_le32(csum); 137 return cpu_to_le32(csum);
137} 138}
138 139
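
This hunk repeats the pattern used by the dx and group-descriptor checksum fixes earlier in the series: save the on-disk checksum in a variable of the matching little-endian type, zero the field, checksum the buffer, then restore. A toy version with a stand-in checksum in place of the crc32c that ext4_chksum() wraps:

/* Save/zero/checksum/restore with a type that matches the on-disk
 * field.  The toy chksum() stands in for the crc32c behind
 * ext4_chksum(); le32 stands in for __le32. */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t le32;

struct tail {
	le32 checksum;
	uint32_t payload[4];
};

static uint32_t chksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

static uint32_t tail_csum(struct tail *t)
{
	le32 save_csum = t->checksum;	/* same width and endianness */
	uint32_t csum;

	t->checksum = 0;		/* checksum computed over zero */
	csum = chksum(t, sizeof(*t));
	t->checksum = save_csum;	/* restore the stored value */
	return csum;
}

int main(void)
{
	struct tail t = { 0, { 1, 2, 3, 4 } };
	uint32_t c = tail_csum(&t);

	t.checksum = c;
	/* the result must not depend on what the field currently holds */
	return tail_csum(&t) == c ? 0 : 1;
}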
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index aa25deb5c6cd..c767dbdd7fc4 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -22,6 +22,7 @@
22#define EXT4_XATTR_INDEX_LUSTRE 5 22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7 24#define EXT4_XATTR_INDEX_SYSTEM 7
25#define EXT4_XATTR_INDEX_RICHACL 8
25 26
26struct ext4_xattr_header { 27struct ext4_xattr_header {
27 __le32 h_magic; /* magic number for identification */ 28 __le32 h_magic; /* magic number for identification */
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 750c70148eff..0f53946f13c1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
382 int space_left = 0; 382 int space_left = 0;
383 int first_tag = 0; 383 int first_tag = 0;
384 int tag_flag; 384 int tag_flag;
385 int i, to_free = 0; 385 int i;
386 int tag_bytes = journal_tag_bytes(journal); 386 int tag_bytes = journal_tag_bytes(journal);
387 struct buffer_head *cbh = NULL; /* For transactional checksums */ 387 struct buffer_head *cbh = NULL; /* For transactional checksums */
388 __u32 crc32_sum = ~0; 388 __u32 crc32_sum = ~0;
@@ -1134,7 +1134,7 @@ restart_loop:
1134 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1134 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1135 spin_unlock(&journal->j_history_lock); 1135 spin_unlock(&journal->j_history_lock);
1136 1136
1137 commit_transaction->t_state = T_FINISHED; 1137 commit_transaction->t_state = T_COMMIT_CALLBACK;
1138 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1138 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1139 journal->j_commit_sequence = commit_transaction->t_tid; 1139 journal->j_commit_sequence = commit_transaction->t_tid;
1140 journal->j_committing_transaction = NULL; 1140 journal->j_committing_transaction = NULL;
@@ -1149,38 +1149,44 @@ restart_loop:
1149 journal->j_average_commit_time*3) / 4; 1149 journal->j_average_commit_time*3) / 4;
1150 else 1150 else
1151 journal->j_average_commit_time = commit_time; 1151 journal->j_average_commit_time = commit_time;
1152
1152 write_unlock(&journal->j_state_lock); 1153 write_unlock(&journal->j_state_lock);
1153 1154
1154 if (commit_transaction->t_checkpoint_list == NULL && 1155 if (journal->j_checkpoint_transactions == NULL) {
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1156 journal->j_checkpoint_transactions = commit_transaction;
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1157 commit_transaction->t_cpnext = commit_transaction;
1157 to_free = 1; 1158 commit_transaction->t_cpprev = commit_transaction;
1158 } else { 1159 } else {
1159 if (journal->j_checkpoint_transactions == NULL) { 1160 commit_transaction->t_cpnext =
1160 journal->j_checkpoint_transactions = commit_transaction; 1161 journal->j_checkpoint_transactions;
1161 commit_transaction->t_cpnext = commit_transaction; 1162 commit_transaction->t_cpprev =
1162 commit_transaction->t_cpprev = commit_transaction; 1163 commit_transaction->t_cpnext->t_cpprev;
1163 } else { 1164 commit_transaction->t_cpnext->t_cpprev =
1164 commit_transaction->t_cpnext = 1165 commit_transaction;
1165 journal->j_checkpoint_transactions; 1166 commit_transaction->t_cpprev->t_cpnext =
1166 commit_transaction->t_cpprev =
1167 commit_transaction->t_cpnext->t_cpprev;
1168 commit_transaction->t_cpnext->t_cpprev =
1169 commit_transaction;
1170 commit_transaction->t_cpprev->t_cpnext =
1171 commit_transaction; 1167 commit_transaction;
1172 }
1173 } 1168 }
1174 spin_unlock(&journal->j_list_lock); 1169 spin_unlock(&journal->j_list_lock);
 1175 1170 /* Drop all spin_locks because commit_callback may block.
 1171 * __journal_remove_checkpoint() cannot destroy the transaction
 1172 * under us because it is not marked as T_FINISHED yet */
1176 if (journal->j_commit_callback) 1173 if (journal->j_commit_callback)
1177 journal->j_commit_callback(journal, commit_transaction); 1174 journal->j_commit_callback(journal, commit_transaction);
1178 1175
1179 trace_jbd2_end_commit(journal, commit_transaction); 1176 trace_jbd2_end_commit(journal, commit_transaction);
1180 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1177 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1181 journal->j_commit_sequence, journal->j_tail_sequence); 1178 journal->j_commit_sequence, journal->j_tail_sequence);
1182 if (to_free)
1183 jbd2_journal_free_transaction(commit_transaction);
1184 1179
1180 write_lock(&journal->j_state_lock);
1181 spin_lock(&journal->j_list_lock);
1182 commit_transaction->t_state = T_FINISHED;
1183 /* Recheck checkpoint lists after j_list_lock was dropped */
1184 if (commit_transaction->t_checkpoint_list == NULL &&
1185 commit_transaction->t_checkpoint_io_list == NULL) {
1186 __jbd2_journal_drop_transaction(journal, commit_transaction);
1187 jbd2_journal_free_transaction(commit_transaction);
1188 }
1189 spin_unlock(&journal->j_list_lock);
1190 write_unlock(&journal->j_state_lock);
1185 wake_up(&journal->j_wait_done_commit); 1191 wake_up(&journal->j_wait_done_commit);
1186} 1192}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8b220f1ab54f..f6c5ba027f4f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -708,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
708} 708}
709 709
710/* 710/*
 711 * When this function returns, the transaction corresponding to tid
 712 * will have completed. If the transaction is currently running, start
713 * committing that transaction before waiting for it to complete. If
714 * the transaction id is stale, it is by definition already completed,
715 * so just return SUCCESS.
716 */
717int jbd2_complete_transaction(journal_t *journal, tid_t tid)
718{
719 int need_to_wait = 1;
720
721 read_lock(&journal->j_state_lock);
722 if (journal->j_running_transaction &&
723 journal->j_running_transaction->t_tid == tid) {
724 if (journal->j_commit_request != tid) {
725 /* transaction not yet started, so request it */
726 read_unlock(&journal->j_state_lock);
727 jbd2_log_start_commit(journal, tid);
728 goto wait_commit;
729 }
730 } else if (!(journal->j_committing_transaction &&
731 journal->j_committing_transaction->t_tid == tid))
732 need_to_wait = 0;
733 read_unlock(&journal->j_state_lock);
734 if (!need_to_wait)
735 return 0;
736wait_commit:
737 return jbd2_log_wait_commit(journal, tid);
738}
739EXPORT_SYMBOL(jbd2_complete_transaction);
740
741/*
711 * Log buffer allocation routines: 742 * Log buffer allocation routines:
712 */ 743 */
713 744
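
jbd2_complete_transaction() above folds three cases into one call: kick the commit if the tid is still running and nobody requested it yet, wait while it is running or committing, and return at once for a stale tid. A userspace model of that decision; names are illustrative and the real function does all of this under j_state_lock:

/* Userspace model of the three cases; a journal may have one running
 * and one committing transaction at a time.  Names illustrative. */
#include <stdio.h>

struct journal_model {
	int has_running, has_committing;
	unsigned running_tid, committing_tid;
	unsigned commit_request;	/* last tid asked to commit */
};

/* Returns 1 when the caller still has to wait for tid to commit. */
static int complete_transaction(struct journal_model *j, unsigned tid)
{
	if (j->has_running && j->running_tid == tid) {
		if (j->commit_request != tid) {
			j->commit_request = tid;
			printf("kick commit of tid %u\n", tid);
		}
		return 1;
	}
	if (j->has_committing && j->committing_tid == tid)
		return 1;
	return 0;	/* stale tid: by definition already completed */
}

int main(void)
{
	struct journal_model j = { 1, 0, 7, 0, 6 };

	printf("tid 7: wait=%d\n", complete_transaction(&j, 7));
	printf("tid 5: wait=%d\n", complete_transaction(&j, 5));
	return 0;
}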
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 325bc019ed88..10f524c59ea8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
332 handle_t *handle = jbd2_alloc_handle(GFP_NOFS); 332 handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
333 if (!handle) 333 if (!handle)
334 return NULL; 334 return NULL;
335 memset(handle, 0, sizeof(*handle));
336 handle->h_buffer_credits = nblocks; 335 handle->h_buffer_credits = nblocks;
337 handle->h_ref = 1; 336 handle->h_ref = 1;
338 337
@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
640 int error; 639 int error;
641 char *frozen_buffer = NULL; 640 char *frozen_buffer = NULL;
642 int need_copy = 0; 641 int need_copy = 0;
642 unsigned long start_lock, time_lock;
643 643
644 if (is_handle_aborted(handle)) 644 if (is_handle_aborted(handle))
645 return -EROFS; 645 return -EROFS;
@@ -655,9 +655,16 @@ repeat:
655 655
656 /* @@@ Need to check for errors here at some point. */ 656 /* @@@ Need to check for errors here at some point. */
657 657
658 start_lock = jiffies;
658 lock_buffer(bh); 659 lock_buffer(bh);
659 jbd_lock_bh_state(bh); 660 jbd_lock_bh_state(bh);
660 661
662 /* If it takes too long to lock the buffer, trace it */
663 time_lock = jbd2_time_diff(start_lock, jiffies);
664 if (time_lock > HZ/10)
665 trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
666 jiffies_to_msecs(time_lock));
667
661 /* We now hold the buffer lock so it is safe to query the buffer 668 /* We now hold the buffer lock so it is safe to query the buffer
662 * state. Is the buffer dirty? 669 * state. Is the buffer dirty?
663 * 670 *
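
The new stall check in do_get_write_access() fires when acquiring the buffer lock takes more than HZ/10, i.e. roughly 100ms. The same measure-then-report shape in userspace, with a pthread mutex standing in for lock_buffer() and stderr for the tracepoint:

/* Measure the lock acquisition and report stalls over 100ms, the
 * userspace analogue of the HZ/10 check around lock_buffer(). */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static long elapsed_ms(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) * 1000 +
	       (b.tv_nsec - a.tv_nsec) / 1000000;
}

static void timed_lock(pthread_mutex_t *lock)
{
	struct timespec start, end;

	clock_gettime(CLOCK_MONOTONIC, &start);
	pthread_mutex_lock(lock);
	clock_gettime(CLOCK_MONOTONIC, &end);
	if (elapsed_ms(start, end) > 100)	/* like HZ/10 */
		fprintf(stderr, "lock stall: %ld ms\n",
			elapsed_ms(start, end));
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	timed_lock(&lock);		/* uncontended: nothing reported */
	pthread_mutex_unlock(&lock);
	return 0;
}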
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 4c16c4a88d47..9e52b0626b39 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -34,6 +34,8 @@ enum bh_state_bits {
34 BH_Write_EIO, /* I/O error on write */ 34 BH_Write_EIO, /* I/O error on write */
35 BH_Unwritten, /* Buffer is allocated on disk but not written */ 35 BH_Unwritten, /* Buffer is allocated on disk but not written */
36 BH_Quiet, /* Buffer Error Prinks to be quiet */ 36 BH_Quiet, /* Buffer Error Prinks to be quiet */
37 BH_Meta, /* Buffer contains metadata */
38 BH_Prio, /* Buffer should be submitted with REQ_PRIO */
37 39
38 BH_PrivateStart,/* not a state bit, but the first bit available 40 BH_PrivateStart,/* not a state bit, but the first bit available
39 * for private allocation by other entities 41 * for private allocation by other entities
@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay)
124BUFFER_FNS(Boundary, boundary) 126BUFFER_FNS(Boundary, boundary)
125BUFFER_FNS(Write_EIO, write_io_error) 127BUFFER_FNS(Write_EIO, write_io_error)
126BUFFER_FNS(Unwritten, unwritten) 128BUFFER_FNS(Unwritten, unwritten)
129BUFFER_FNS(Meta, meta)
130BUFFER_FNS(Prio, prio)
127 131
128#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) 132#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
129 133
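
Each BUFFER_FNS() invocation generates a set/clear/test accessor triple for its bit, so the two new lines provide set_buffer_meta(), buffer_prio() and friends. A simplified, non-atomic approximation of the expansion; the kernel versions use set_bit(), clear_bit() and test_bit():

/* Approximate expansion of BUFFER_FNS(); simplified and non-atomic,
 * where the kernel uses set_bit()/clear_bit()/test_bit().  The bit
 * positions below are illustrative. */
struct buffer_head { unsigned long b_state; };

enum { BH_Meta = 8, BH_Prio };

#define BUFFER_FNS(bit, name)						\
static inline void set_buffer_##name(struct buffer_head *bh)		\
{ bh->b_state |= 1UL << BH_##bit; }					\
static inline void clear_buffer_##name(struct buffer_head *bh)		\
{ bh->b_state &= ~(1UL << BH_##bit); }					\
static inline int buffer_##name(const struct buffer_head *bh)		\
{ return (int)((bh->b_state >> BH_##bit) & 1); }

BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)

int main(void)
{
	struct buffer_head bh = { 0 };

	set_buffer_meta(&bh);
	set_buffer_prio(&bh);
	return buffer_meta(&bh) && buffer_prio(&bh) ? 0 : 1;
}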
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 50e5a5e6a712..6e051f472edb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -480,6 +480,7 @@ struct transaction_s
480 T_COMMIT, 480 T_COMMIT,
481 T_COMMIT_DFLUSH, 481 T_COMMIT_DFLUSH,
482 T_COMMIT_JFLUSH, 482 T_COMMIT_JFLUSH,
483 T_COMMIT_CALLBACK,
483 T_FINISHED 484 T_FINISHED
484 } t_state; 485 } t_state;
485 486
@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
1144 1145
1145static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) 1146static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
1146{ 1147{
1147 return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); 1148 return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
1148} 1149}
1149 1150
1150static inline void jbd2_free_handle(handle_t *handle) 1151static inline void jbd2_free_handle(handle_t *handle)
@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
1200int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); 1201int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
1201int jbd2_journal_force_commit_nested(journal_t *journal); 1202int jbd2_journal_force_commit_nested(journal_t *journal);
1202int jbd2_log_wait_commit(journal_t *journal, tid_t tid); 1203int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1204int jbd2_complete_transaction(journal_t *journal, tid_t tid);
1203int jbd2_log_do_checkpoint(journal_t *journal); 1205int jbd2_log_do_checkpoint(journal_t *journal);
1204int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); 1206int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
1205 1207
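
Three small jbd2 API changes land here: commit callbacks get their own T_COMMIT_CALLBACK transaction state; handles now come back from the slab zero-filled (kmem_cache_zalloc()), so any handle field a caller does not set explicitly starts out as zero; and jbd2_complete_transaction() is added for callers that need a specific tid to be durable before returning. A sketch of the expected caller pattern for the latter; the tid source is purely illustrative:

    /* Ensure the transaction identified by commit_tid has committed,
     * starting the commit ourselves if it has not begun yet.
     * commit_tid is hypothetical here; fsync-style paths would use the
     * tid recorded when the inode was last modified. */
    int err = 0;

    if (journal)
        err = jbd2_complete_transaction(journal, commit_tid);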
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h
index c18b46f8aeeb..13a3da25ff07 100644
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -31,21 +31,14 @@ struct journal_head {
31 /* 31 /*
32 * Journalling list for this buffer [jbd_lock_bh_state()] 32 * Journalling list for this buffer [jbd_lock_bh_state()]
33 */ 33 */
34 unsigned b_jlist; 34 unsigned b_jlist:4;
35 35
36 /* 36 /*
37 * This flag signals the buffer has been modified by 37 * This flag signals the buffer has been modified by
38 * the currently running transaction 38 * the currently running transaction
39 * [jbd_lock_bh_state()] 39 * [jbd_lock_bh_state()]
40 */ 40 */
41 unsigned b_modified; 41 unsigned b_modified:1;
42
43 /*
44	 * This field tracks the last transaction id in which this buffer
45 * has been cowed
46 * [jbd_lock_bh_state()]
47 */
48 tid_t b_cow_tid;
49 42
50 /* 43 /*
51 * Copy of the buffer data frozen for writing to the log. 44 * Copy of the buffer data frozen for writing to the log.
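
b_jlist only ever holds the small BJ_* list indexes and b_modified is a boolean, so turning both into bitfields lets them share a single word; together with dropping the apparently unused b_cow_tid (a remnant of the never-merged ext4 snapshots work) this shrinks struct journal_head. A hypothetical illustration of the packing:

    /* Demo only: a 4-bit field comfortably covers the BJ_None..BJ_Types
     * list indexes, and the modified flag needs a single bit, so both
     * pack into one unsigned int instead of occupying two. */
    struct jh_bits_demo {
        unsigned list:4;
        unsigned modified:1;
    };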
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 4ee471003859..d0e686402df8 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end,
257 __entry->pos, __entry->len, __entry->copied) 257 __entry->pos, __entry->len, __entry->copied)
258); 258);
259 259
260DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, 260DEFINE_EVENT(ext4__write_end, ext4_write_end,
261
262 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
263 unsigned int copied),
264
265 TP_ARGS(inode, pos, len, copied)
266);
267
268DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
269 261
270 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 262 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
271 unsigned int copied), 263 unsigned int copied),
@@ -1956,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks,
1956 __entry->to = to; 1948 __entry->to = to;
1957 __entry->partial = partial_cluster; 1949 __entry->partial = partial_cluster;
1958 __entry->ee_pblk = ext4_ext_pblock(ex); 1950 __entry->ee_pblk = ext4_ext_pblock(ex);
1959 __entry->ee_lblk = cpu_to_le32(ex->ee_block); 1951 __entry->ee_lblk = le32_to_cpu(ex->ee_block);
1960 __entry->ee_len = ext4_ext_get_actual_len(ex); 1952 __entry->ee_len = ext4_ext_get_actual_len(ex);
1961 ), 1953 ),
1962 1954
@@ -2060,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space,
2060 2052
2061TRACE_EVENT(ext4_ext_remove_space_done, 2053TRACE_EVENT(ext4_ext_remove_space_done,
2062 TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, 2054 TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
2063 ext4_lblk_t partial, unsigned short eh_entries), 2055 ext4_lblk_t partial, __le16 eh_entries),
2064 2056
2065 TP_ARGS(inode, start, depth, partial, eh_entries), 2057 TP_ARGS(inode, start, depth, partial, eh_entries),
2066 2058
@@ -2079,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done,
2079 __entry->start = start; 2071 __entry->start = start;
2080 __entry->depth = depth; 2072 __entry->depth = depth;
2081 __entry->partial = partial; 2073 __entry->partial = partial;
2082 __entry->eh_entries = eh_entries; 2074 __entry->eh_entries = le16_to_cpu(eh_entries);
2083 ), 2075 ),
2084 2076
2085 TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " 2077 TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
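
Both trace fixes above are endianness corrections: ee_block and eh_entries are stored on disk little-endian (__le32/__le16), so they must be converted to host order with le32_to_cpu()/le16_to_cpu() before being assigned to host-endian trace fields. The old cpu_to_le32() converts in the wrong direction and only looked right on little-endian machines, where both conversions are no-ops. A sketch, with ex standing in for the struct ext4_extent pointer from the surrounding code:

    __le32 on_disk = ex->ee_block;      /* little-endian, as on disk */
    u32 host = le32_to_cpu(on_disk);    /* correct for trace fields */
    /* cpu_to_le32() would swap the wrong way on big-endian hardware. */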
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 070df49e4a1d..c1d1f3eb242d 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock,
358 MINOR(__entry->dev), __entry->write_op) 358 MINOR(__entry->dev), __entry->write_op)
359); 359);
360 360
361TRACE_EVENT(jbd2_lock_buffer_stall,
362
363 TP_PROTO(dev_t dev, unsigned long stall_ms),
364
365 TP_ARGS(dev, stall_ms),
366
367 TP_STRUCT__entry(
368 __field( dev_t, dev )
369 __field(unsigned long, stall_ms )
370 ),
371
372 TP_fast_assign(
373 __entry->dev = dev;
374 __entry->stall_ms = stall_ms;
375 ),
376
377 TP_printk("dev %d,%d stall_ms %lu",
378 MAJOR(__entry->dev), MINOR(__entry->dev),
379 __entry->stall_ms)
380);
381
361#endif /* _TRACE_JBD2_H */ 382#endif /* _TRACE_JBD2_H */
362 383
363/* This part must be outside protection */ 384/* This part must be outside protection */
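
For reference, TRACE_EVENT(jbd2_lock_buffer_stall, ...) generates, among other glue, a trace_jbd2_lock_buffer_stall() inline matching TP_PROTO; the fs/jbd2/transaction.c hunk at the top of this patch calls it when lock_buffer() stalls past 100ms:

    /* Call-site shape, as added earlier in this patch. */
    trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
                                 jiffies_to_msecs(time_lock));

Per the TP_printk format, a record in the trace buffer reads roughly "jbd2_lock_buffer_stall: dev 8,1 stall_ms 143" (device numbers and timing illustrative).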