author		Linus Torvalds <torvalds@linux-foundation.org>	2013-05-01 11:04:12 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-01 11:04:12 -0400
commit		149b306089b88e186942a8d6647028ae6683aaf9
tree		1b7436034261947bae3efad41c55a91a8ef0f68d /fs
parent		b0ca4d0123608cfec73fc689c74295da89fc934e
parent		0d606e2c9fccdd4e67febf1e2da500e1bfe9e045
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
"Mostly performance and bug fixes, plus some cleanups. The one new
feature this merge window is a new ioctl EXT4_IOC_SWAP_BOOT which
allows installation of a hidden inode designed for boot loaders."
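The boot loader inode is inode #5 (EXT4_BOOT_LOADER_INO in the ext4.h hunk below); the new ioctl swaps its blocks with those of an ordinary file. A minimal sketch of how an installer might call it — the helper name, open flags, and error handling here are illustrative assumptions; only the ioctl definition comes from this merge:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	/* matches the definition added to fs/ext4/ext4.h in this merge */
	#define EXT4_IOC_SWAP_BOOT	_IO('f', 17)

	int install_boot_image(const char *path)
	{
		int fd = open(path, O_RDWR);
		if (fd < 0)
			return -1;
		/* swap this file's blocks with the hidden boot loader inode */
		if (ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
			perror("EXT4_IOC_SWAP_BOOT");
			close(fd);
			return -1;
		}
		return close(fd);
	}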
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
ext4: fix type-widening bug in inode table readahead code
ext4: add check for inodes_count overflow in new resize ioctl
ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG
ext4: fix online resizing for ext3-compat file systems
jbd2: trace when lock_buffer in do_get_write_access takes a long time
ext4: mark metadata blocks using bh flags
buffer: add BH_Prio and BH_Meta flags
ext4: mark all metadata I/O with REQ_META
ext4: fix readdir error in case inline_data+^dir_index.
ext4: fix readdir error in the case of inline_data+dir_index
jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset
ext4: mext_insert_extents should update extent block checksum
ext4: move quota initialization out of inode allocation transaction
ext4: reserve xattr index for Rich ACL support
jbd2: reduce journal_head size
ext4: clear buffer_uninit flag when submitting IO
ext4: use io_end for multiple bios
ext4: make ext4_bio_write_page() use BH_Async_Write flags
ext4: Use kstrtoul() instead of parse_strtoul()
ext4: defragmentation code cleanup
...
Diffstat (limited to 'fs')
 fs/buffer.c            |   5
 fs/ext4/Kconfig        |   3
 fs/ext4/balloc.c       |  53
 fs/ext4/dir.c          |  20
 fs/ext4/ext4.h         | 101
 fs/ext4/ext4_extents.h |   5
 fs/ext4/ext4_jbd2.c    |   8
 fs/ext4/ext4_jbd2.h    |  12
 fs/ext4/extents.c      | 522
 fs/ext4/fsync.c        |   3
 fs/ext4/ialloc.c       |  88
 fs/ext4/indirect.c     | 473
 fs/ext4/inline.c       | 178
 fs/ext4/inode.c        | 580
 fs/ext4/ioctl.c        | 218
 fs/ext4/mballoc.c      | 253
 fs/ext4/migrate.c      |  62
 fs/ext4/mmp.c          |   6
 fs/ext4/move_extent.c  |  73
 fs/ext4/namei.c        |  48
 fs/ext4/page-io.c      | 280
 fs/ext4/resize.c       |  16
 fs/ext4/super.c        | 131
 fs/ext4/xattr.c        |  13
 fs/ext4/xattr.h        |   1
 fs/jbd2/commit.c       |  50
 fs/jbd2/journal.c      |  31
 fs/jbd2/transaction.c  |   9
 28 files changed, 1845 insertions(+), 1397 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 10ef81e10b20..bc1fe14aaa3e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	/* Take care of bh's that straddle the end of the device */
 	guard_bh_eod(rw, bio, bh);
 
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
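The new flags only matter if the owning filesystem tags the buffer_head before write-out; _submit_bh() then promotes them to request flags the I/O scheduler can see. A sketch of the calling pattern (ext4 does exactly this in __ext4_handle_dirty_metadata() in the ext4_jbd2.c hunk below):

	/* tag a buffer as (high-priority) metadata before it is written */
	set_buffer_meta(bh);
	set_buffer_prio(bh);
	mark_buffer_dirty(bh);
	/* at submission, _submit_bh() turns these into REQ_META | REQ_PRIO */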
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb9..efea5d5c44ce 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
 	  Enables run-time debugging support for the ext4 filesystem.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+	  with a command such as:
+		echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92e68b33fffd..d0f13eada0ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  */
 
 /*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+				   ext4_fsblk_t block)
+{
+	ext4_group_t group;
+
+	if (test_opt2(sb, STD_GROUP_SIZE))
+		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+			 block) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+	else
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+	return group;
+}
+
+/*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
  */
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-			       ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+				      ext4_fsblk_t block,
+				      ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-	if (actual_group == block_group)
-		return 1;
-	return 0;
+
+	actual_group = ext4_get_group_number(sb, block);
+	return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	trace_ext4_read_block_bitmap_load(sb, block_group);
 	bh->b_end_io = ext4_end_bitmap_read;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
 	return bh;
 verify:
 	ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 				  s64 nclusters, unsigned int flags)
 {
-	s64 free_clusters, dirty_clusters, root_clusters;
+	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
 	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
 	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
 	free_clusters  = percpu_counter_read_positive(fcc);
 	dirty_clusters = percpu_counter_read_positive(dcc);
+	resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
 	/*
 	 * r_blocks_count should always be multiple of the cluster ratio so
 	 * we are safe to do a plane bit shift only.
 	 */
-	root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+	      resv_clusters;
 
-	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+	if (free_clusters - (nclusters + rsv + dirty_clusters) <
 					EXT4_FREECLUSTERS_WATERMARK) {
 		free_clusters  = percpu_counter_sum_positive(fcc);
 		dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 	/* Check whether we have space after accounting for current
 	 * dirty clusters & root reserved clusters.
 	 */
-	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+	if (free_clusters >= (rsv + nclusters + dirty_clusters))
 		return 1;
 
 	/* Hm, nope.  Are (enough) root reserved clusters available? */
 	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
 	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
 	    capable(CAP_SYS_RESOURCE) ||
 	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+		if (free_clusters >= (nclusters + dirty_clusters +
+				      resv_clusters))
+			return 1;
+	}
+	/* No free blocks. Let's see if we can dip into reserved pool */
+	if (flags & EXT4_MB_USE_RESERVED) {
 		if (free_clusters >= (nclusters + dirty_clusters))
 			return 1;
 	}
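A worked example of the new ext4_get_group_number() fast path: on a non-bigalloc filesystem with 4 KiB blocks, EXT4_BLOCK_SIZE_BITS(sb) is 12 and EXT4_CLUSTER_BITS(sb) is 0, so the shift is 12 + 0 + 3 = 15 and every standard group covers 2^15 = 32768 blocks (blocksize * 8, one bitmap bit per block). With s_first_data_block = 0, block 100000 therefore lands in group 100000 >> 15 = 3 with no division; ext4_get_group_no_and_offset() remains the slow path for non-standard group sizes.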
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d8cd1f0f4661..f8d56e4254e0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
 	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+	     ext4_has_inline_data(inode)))
 		return 1;
 
 	return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
 	int ret = 0;
 	int dir_has_error = 0;
 
-	if (ext4_has_inline_data(inode)) {
-		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
-					   &has_inline_data);
-		if (has_inline_data)
-			return ret;
-	}
-
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
 		ext4_clear_inode_flag(file_inode(filp),
 				      EXT4_INODE_INDEX);
 	}
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		ret = ext4_read_inline_dir(filp, dirent, filldir,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b83cd604796..0aabb344b02e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_STREAM_ALLOC		0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS		0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED		0x2000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -196,19 +198,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_DIRECT	0x0004
 
-struct ext4_io_page {
-	struct page	*p_page;
-	atomic_t	p_count;
-};
-
-#define MAX_IO_PAGES 128
-
 /*
  * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place. For AIO/DIO, neither field is filled in.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;	/* for writepages() */
-	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
 	int			io_op;
 	struct bio		*io_bio;
 	ext4_io_end_t		*io_end;
-	struct ext4_io_page	*io_page;
 	sector_t		io_next_block;
 };
 
@@ -403,7 +392,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +546,8 @@ enum {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the
-	   inode allocation semaphore for */
+	/* Caller is from the delayed allocation writeout path
+	 * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
 	unitialized extents if not allocated, split the uninitialized
@@ -571,8 +559,9 @@ enum {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Eventual metadata allocation (due to growing extent tree)
+	 * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
 	/* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 	/* Request will not result in inode size update (user for fallocate) */
@@ -616,6 +605,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -949,7 +939,7 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
 
 /*
- * Mount flags
+ * Mount flags set via mount options or defaults
 */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
@@ -981,8 +971,16 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
 #define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
 						      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
+						      size of blocksize * 8
+						      blocks */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_flags;
 	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
+	atomic64_t s_resv_clusters;
 	kuid_t s_resuid;
 	kgid_t s_resgid;
 	unsigned short s_mount_state;
@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 	return ino == EXT4_ROOT_INO ||
 		ino == EXT4_USR_QUOTA_INO ||
 		ino == EXT4_GRP_QUOTA_INO ||
+		ino == EXT4_BOOT_LOADER_INO ||
 		ino == EXT4_JOURNAL_INO ||
 		ino == EXT4_RESIZE_INO ||
 		(ino >= EXT4_FIRST_INO(sb) &&
@@ -1374,6 +1374,7 @@ enum {
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
+	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)			\
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 */
 #define ERR_BAD_DX_DIR	-75000
 
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
 /*
 * Timeout and state flag for lazy initialization inode thread.
 */
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 				  struct buffer_head *bh);
 
 /* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+					 ext4_fsblk_t blocknr,
+					 ext4_group_t *blockgrpp,
+					 ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+					   ext4_fsblk_t block);
+
 extern void ext4_validate_block_bitmap(struct super_block *sb,
 				       struct ext4_group_desc *desc,
 				       unsigned int block_group,
@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				  unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+				 ext4_lblk_t first, ext4_lblk_t stop);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
 extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 extern int ext4_read_inline_dir(struct file *filp,
 				void *dirent, filldir_t filldir,
 				int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash,
+				   int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
 					const struct qstr *d_name,
 					struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
 extern int ext4_handle_dirty_dirent_node(handle_t *handle,
 					 struct inode *inode,
 					 struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+				struct ext4_dir_entry_2 *de,
+				umode_t mode) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
-				loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+					    struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+					  struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
 extern void ext4_end_io_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
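The ext4_set_de_type() helper added above gives directory-entry creation a single place to derive the on-disk file type from an inode mode. A hypothetical call site (the variables are illustrative, not from this diff):

	struct ext4_dir_entry_2 *de;	/* directory entry being filled in */
	...
	de->inode = cpu_to_le32(inode->i_ino);
	/* stores EXT4_FT_DIR for S_IFDIR, EXT4_FT_REG_FILE for S_IFREG, etc.,
	 * but only when the filesystem has the filetype feature */
	ext4_set_de_type(sb, de, inode->i_mode);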
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 8643ff5bbeb7..51bc821ade90 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 				     0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7058975e3a55..451eb4045330 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
 {
 	journal_t *journal;
 
+	might_sleep();
+
 	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
+	set_buffer_meta(bh);
+	set_buffer_prio(bh);
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
 		if (err) {
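might_sleep() compiles away in production kernels; with CONFIG_DEBUG_ATOMIC_SLEEP it warns whenever the annotated function is reached from atomic context. Annotating the journalling entry points catches callers like this hypothetical (buggy) one on the first call rather than only under rare memory pressure:

	spin_lock(&some_lock);
	/* BUG: starting a handle can block on transaction credits; the new
	 * might_sleep() in __ext4_journal_start_sb() now warns right here */
	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);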
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4c216b1bf20c..c8c6885406db 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
 * block to complete the transaction.
 *
 * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
 	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)	\
-	 ? 27U : 8U)
+	 ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
 * ext4_journal_callback_del: delete a registered callback
 * @handle: active journal transaction handle on which callback was registered
 * @jce: registered journal callback entry to unregister
+ * Return true if object was sucessfully removed
 */
-static inline void ext4_journal_callback_del(handle_t *handle,
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
 					     struct ext4_journal_cb_entry *jce)
 {
+	bool deleted;
 	struct ext4_sb_info *sbi =
 			EXT4_SB(handle->h_transaction->t_journal->j_private);
 
 	spin_lock(&sbi->s_md_lock);
+	deleted = !list_empty(&jce->jce_list);
 	list_del_init(&jce->jce_list);
 	spin_unlock(&sbi->s_md_lock);
+	return deleted;
 }
 
 int
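Returning bool lets concurrent contexts race to remove the same callback entry safely: only the caller that actually unlinked the entry from the list may free the object containing it. A hedged sketch of the caller discipline this enables (the container struct is hypothetical):

	struct my_cb_entry {			/* hypothetical container */
		struct ext4_journal_cb_entry jce;
		/* ... payload ... */
	};

	if (ext4_journal_callback_try_del(handle, &entry->jce))
		kfree(entry);	/* we unlinked it, so we own the freeing */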
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c6d06dcef8b..107936db244e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 *  - ENOMEM
 *  - EIO
 */
-#define ext4_ext_dirty(handle, inode, path) \
-		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
-			    handle_t *handle, struct inode *inode,
-			    struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path)
 {
 	int err;
 	if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
+	eh = path[depth].p_hdr;
 	if (unlikely(path[depth].p_hdr == NULL)) {
 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 		return -EIO;
 	}
 
 	/* try to insert block into found extent and return */
-	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
-		&& ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
-			  ext4_ext_is_uninitialized(newext),
-			  ext4_ext_get_actual_len(newext),
-			  le32_to_cpu(ex->ee_block),
-			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex),
-			  ext4_ext_pblock(ex));
-		err = ext4_ext_get_access(handle, inode, path + depth);
-		if (err)
-			return err;
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
 
 		/*
-		 * ext4_can_extents_be_merged should have checked that either
-		 * both extents are uninitialized, or both aren't. Thus we
-		 * need to check only one of them here.
+		 * Try to see whether we should rather test the extent on
+		 * right from ex, or from the left of ex. This is because
+		 * ext4_ext_find_extent() can return either extent on the
+		 * left, or on the right from the searched position. This
+		 * will make merging more effective.
 		 */
-		if (ext4_ext_is_uninitialized(ex))
-			uninitialized = 1;
-		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
-				  + ext4_ext_get_actual_len(newext));
-		if (uninitialized)
-			ext4_ext_mark_uninitialized(ex);
-		eh = path[depth].p_hdr;
-		nearex = ex;
-		goto merge;
+		if (ex < EXT_LAST_EXTENT(eh) &&
+		    (le32_to_cpu(ex->ee_block) +
+		    ext4_ext_get_actual_len(ex) <
+		    le32_to_cpu(newext->ee_block))) {
+			ex += 1;
+			goto prepend;
+		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+			   (le32_to_cpu(newext->ee_block) +
+			   ext4_ext_get_actual_len(newext) <
+			   le32_to_cpu(ex->ee_block)))
+			ex -= 1;
+
+		/* Try to append newex to the ex */
+		if (ext4_can_extents_be_merged(inode, ex, newext)) {
+			ext_debug("append [%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+				  + ext4_ext_get_actual_len(newext));
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
+
+prepend:
+		/* Try to prepend newex to the ex */
+		if (ext4_can_extents_be_merged(inode, newext, ex)) {
+			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  le32_to_cpu(newext->ee_block),
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_block = newext->ee_block;
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+				  + ext4_ext_get_actual_len(newext));
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
 	}
 
 	depth = ext_depth(inode);
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		flags = EXT4_MB_USE_RESERVED;
 	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
@@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-				 ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+			  ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2667,12 +2726,14 @@ again:
 
 	/*
 	 * Split the extent in two so that 'end' is the last
-	 * block in the first new extent
+	 * block in the first new extent. Also we should not
+	 * fail removing space due to ENOSPC so try to use
+	 * reserved block if that happens.
 	 */
 	err = ext4_split_extent_at(handle, inode, path,
 			end + 1, split_flag,
 			EXT4_GET_BLOCKS_PRE_IO |
-			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+			EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
 	if (err < 0)
 		goto out;
@@ -3147,35 +3208,35 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct inode *inode,
 					   struct ext4_map_blocks *map,
-					   struct ext4_ext_path *path)
+					   struct ext4_ext_path *path,
+					   int flags)
 {
 	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
-	struct ext4_extent *ex;
+	struct ext4_extent *ex, *abut_ex;
 	ext4_lblk_t ee_block, eof_block;
-	unsigned int ee_len, depth;
-	int allocated, max_zeroout = 0;
+	unsigned int ee_len, depth, map_len = map->m_len;
+	int allocated = 0, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+		(unsigned long long)map->m_lblk, map_len);
 
 	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
-	if (eof_block < map->m_lblk + map->m_len)
-		eof_block = map->m_lblk + map->m_len;
+	if (eof_block < map->m_lblk + map_len)
+		eof_block = map->m_lblk + map_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (map->m_lblk - ee_block);
 	zero_ex.ee_len = 0;
 
 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	/*
 	 * Attempt to transfer newly initialized blocks from the currently
-	 * uninitialized extent to its left neighbor. This is much cheaper
+	 * uninitialized extent to its neighbor. This is much cheaper
 	 * than an insertion followed by a merge as those involve costly
-	 * memmove() calls. This is the common case in steady state for
-	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-	 * writes.
+	 * memmove() calls. Transferring to the left is the common case in
+	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+	 * followed by append writes.
 	 *
 	 * Limitations of the current logic:
-	 * - L1: we only deal with writes at the start of the extent.
-	 *   The approach could be extended to writes at the end
-	 *   of the extent but this scenario was deemed less common.
-	 * - L2: we do not deal with writes covering the whole extent.
+	 * - L1: we do not deal with writes covering the whole extent.
 	 *   This would require removing the extent if the transfer
 	 *   is possible.
-	 * - L3: we only attempt to merge with an extent stored in the
+	 * - L2: we only attempt to merge with an extent stored in the
 	 *   same extent tree node.
 	 */
-	if ((map->m_lblk == ee_block) &&	/*L1*/
-		(map->m_len < ee_len) &&	/*L2*/
-		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
-		struct ext4_extent *prev_ex;
+	if ((map->m_lblk == ee_block) &&
+		/* See if we can merge left */
+		(map_len < ee_len) &&		/*L1*/
+		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
 		ext4_lblk_t prev_lblk;
 		ext4_fsblk_t prev_pblk, ee_pblk;
-		unsigned int prev_len, write_len;
+		unsigned int prev_len;
 
-		prev_ex = ex - 1;
-		prev_lblk = le32_to_cpu(prev_ex->ee_block);
-		prev_len = ext4_ext_get_actual_len(prev_ex);
-		prev_pblk = ext4_ext_pblock(prev_ex);
+		abut_ex = ex - 1;
+		prev_lblk = le32_to_cpu(abut_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(abut_ex);
+		prev_pblk = ext4_ext_pblock(abut_ex);
 		ee_pblk = ext4_ext_pblock(ex);
-		write_len = map->m_len;
 
 		/*
-		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
 		 * upon those conditions:
-		 * - C1: prev_ex is initialized,
-		 * - C2: prev_ex is logically abutting ex,
-		 * - C3: prev_ex is physically abutting ex,
-		 * - C4: prev_ex can receive the additional blocks without
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
 		 *   overflowing the (initialized) length limit.
 		 */
-		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
 			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
 			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
-			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
 			err = ext4_ext_get_access(handle, inode, path + depth);
 			if (err)
 				goto out;
 
 			trace_ext4_ext_convert_to_initialized_fastpath(inode,
-				map, ex, prev_ex);
+				map, ex, abut_ex);
 
-			/* Shift the start of ex by 'write_len' blocks */
-			ex->ee_block = cpu_to_le32(ee_block + write_len);
-			ext4_ext_store_pblock(ex, ee_pblk + write_len);
-			ex->ee_len = cpu_to_le16(ee_len - write_len);
+			/* Shift the start of ex by 'map_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + map_len);
+			ext4_ext_store_pblock(ex, ee_pblk + map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
 			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-			/* Extend prev_ex by 'write_len' blocks */
-			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-			/* Mark the block containing both extents as dirty */
-			ext4_ext_dirty(handle, inode, path + depth);
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = map_len;
+		}
+	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+		   (map_len < ee_len) &&	/*L1*/
+		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
+		/* See if we can merge right */
+		ext4_lblk_t next_lblk;
+		ext4_fsblk_t next_pblk, ee_pblk;
+		unsigned int next_len;
+
+		abut_ex = ex + 1;
+		next_lblk = le32_to_cpu(abut_ex->ee_block);
+		next_len = ext4_ext_get_actual_len(abut_ex);
+		next_pblk = ext4_ext_pblock(abut_ex);
+		ee_pblk = ext4_ext_pblock(ex);
 
-			/* Update path to point to the right extent */
-			path[depth].p_ext = prev_ex;
+		/*
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+		 * upon those conditions:
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
+		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
+		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, abut_ex);
+
+			/* Shift the start of abut_ex by 'map_len' blocks */
+			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
+			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
 			/* Result: number of initialized blocks past m_lblk */
-			allocated = write_len;
-			goto out;
+			allocated = map_len;
 		}
 	}
+	if (allocated) {
+		/* Mark the block containing both extents as dirty */
+		ext4_ext_dirty(handle, inode, path + depth);
+
+		/* Update path to point to the right extent */
+		path[depth].p_ext = abut_ex;
+		goto out;
+	} else
+		allocated = ee_len - (map->m_lblk - ee_block);
 
 	WARN_ON(map->m_lblk < ee_block);
 	/*
@@ -3330,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
3330 | } | 3435 | } |
3331 | 3436 | ||
3332 | allocated = ext4_split_extent(handle, inode, path, | 3437 | allocated = ext4_split_extent(handle, inode, path, |
3333 | &split_map, split_flag, 0); | 3438 | &split_map, split_flag, flags); |
3334 | if (allocated < 0) | 3439 | if (allocated < 0) |
3335 | err = allocated; | 3440 | err = allocated; |
3336 | 3441 | ||
@@ -3650,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | |||
3650 | flags, allocated); | 3755 | flags, allocated); |
3651 | ext4_ext_show_leaf(inode, path); | 3756 | ext4_ext_show_leaf(inode, path); |
3652 | 3757 | ||
3758 | /* | ||
3759 | * When writing into uninitialized space, we should not fail to | ||
3760 | * allocate metadata blocks for the new extent block if needed. | ||
3761 | */ | ||
3762 | flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; | ||
3763 | |||
3653 | trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, | 3764 | trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, |
3654 | allocated, newblock); | 3765 | allocated, newblock); |
3655 | 3766 | ||
@@ -3713,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | |||
3713 | } | 3824 | } |
3714 | 3825 | ||
3715 | /* buffered write, writepage time, convert */ | 3826 | /* buffered write, writepage time, convert */
3716 | ret = ext4_ext_convert_to_initialized(handle, inode, map, path); | 3827 | ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); |
3717 | if (ret >= 0) | 3828 | if (ret >= 0) |
3718 | ext4_update_inode_fsync_trans(handle, inode, 1); | 3829 | ext4_update_inode_fsync_trans(handle, inode, 1); |
3719 | out: | 3830 | out: |
@@ -4257,48 +4368,13 @@ out3: | |||
4257 | return err ? err : allocated; | 4368 | return err ? err : allocated; |
4258 | } | 4369 | } |
4259 | 4370 | ||
4260 | void ext4_ext_truncate(struct inode *inode) | 4371 | void ext4_ext_truncate(handle_t *handle, struct inode *inode) |
4261 | { | 4372 | { |
4262 | struct address_space *mapping = inode->i_mapping; | ||
4263 | struct super_block *sb = inode->i_sb; | 4373 | struct super_block *sb = inode->i_sb; |
4264 | ext4_lblk_t last_block; | 4374 | ext4_lblk_t last_block; |
4265 | handle_t *handle; | ||
4266 | loff_t page_len; | ||
4267 | int err = 0; | 4375 | int err = 0; |
4268 | 4376 | ||
4269 | /* | 4377 | /* |
4270 | * finish any pending end_io work so we won't run the risk of | ||
4271 | * converting any truncated blocks to initialized later | ||
4272 | */ | ||
4273 | ext4_flush_unwritten_io(inode); | ||
4274 | |||
4275 | /* | ||
4276 | * probably first extent we're gonna free will be last in block | ||
4277 | */ | ||
4278 | err = ext4_writepage_trans_blocks(inode); | ||
4279 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); | ||
4280 | if (IS_ERR(handle)) | ||
4281 | return; | ||
4282 | |||
4283 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
4284 | page_len = PAGE_CACHE_SIZE - | ||
4285 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
4286 | |||
4287 | err = ext4_discard_partial_page_buffers(handle, | ||
4288 | mapping, inode->i_size, page_len, 0); | ||
4289 | |||
4290 | if (err) | ||
4291 | goto out_stop; | ||
4292 | } | ||
4293 | |||
4294 | if (ext4_orphan_add(handle, inode)) | ||
4295 | goto out_stop; | ||
4296 | |||
4297 | down_write(&EXT4_I(inode)->i_data_sem); | ||
4298 | |||
4299 | ext4_discard_preallocations(inode); | ||
4300 | |||
4301 | /* | ||
4302 | * TODO: optimization is possible here. | 4378 | * TODO: optimization is possible here. |
4303 | * Probably we need not scan at all, | 4379 | * Probably we need not scan at all, |
4304 | * because page truncation is enough. | 4380 | * because page truncation is enough. |
@@ -4313,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode) | |||
4313 | err = ext4_es_remove_extent(inode, last_block, | 4389 | err = ext4_es_remove_extent(inode, last_block, |
4314 | EXT_MAX_BLOCKS - last_block); | 4390 | EXT_MAX_BLOCKS - last_block); |
4315 | err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); | 4391 | err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); |
4316 | |||
4317 | /* In a multi-transaction truncate, we only make the final | ||
4318 | * transaction synchronous. | ||
4319 | */ | ||
4320 | if (IS_SYNC(inode)) | ||
4321 | ext4_handle_sync(handle); | ||
4322 | |||
4323 | up_write(&EXT4_I(inode)->i_data_sem); | ||
4324 | |||
4325 | out_stop: | ||
4326 | /* | ||
4327 | * If this was a simple ftruncate() and the file will remain alive, | ||
4328 | * then we need to clear up the orphan record which we created above. | ||
4329 | * However, if this was a real unlink then we were called by | ||
4330 | * ext4_delete_inode(), and we allow that function to clean up the | ||
4331 | * orphan info for us. | ||
4332 | */ | ||
4333 | if (inode->i_nlink) | ||
4334 | ext4_orphan_del(handle, inode); | ||
4335 | |||
4336 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4337 | ext4_mark_inode_dirty(handle, inode); | ||
4338 | ext4_journal_stop(handle); | ||
4339 | } | 4392 | } |
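With the handle parameter added, the journal, orphan-list, and locking bookkeeping stripped out of ext4_ext_truncate() presumably moves to a shared caller (likely ext4_truncate(), which is outside this excerpt). Reassembling the deleted lines above gives a rough sketch of the caller-side pattern:

    handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
                                ext4_writepage_trans_blocks(inode));
    if (IS_ERR(handle))
            return;
    if (ext4_orphan_add(handle, inode))
            goto out_stop;
    down_write(&EXT4_I(inode)->i_data_sem);
    ext4_discard_preallocations(inode);
    ext4_ext_truncate(handle, inode);       /* new signature */
    /* in a multi-transaction truncate, only the final one is sync */
    if (IS_SYNC(inode))
            ext4_handle_sync(handle);
    up_write(&EXT4_I(inode)->i_data_sem);
out_stop:
    if (inode->i_nlink)
            ext4_orphan_del(handle, inode);
    inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
    ext4_mark_inode_dirty(handle, inode);
    ext4_journal_stop(handle);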
4340 | 4393 | ||
4341 | static void ext4_falloc_update_inode(struct inode *inode, | 4394 | static void ext4_falloc_update_inode(struct inode *inode, |
@@ -4623,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode, | |||
4623 | return (error < 0 ? error : 0); | 4676 | return (error < 0 ? error : 0); |
4624 | } | 4677 | } |
4625 | 4678 | ||
4626 | /* | ||
4627 | * ext4_ext_punch_hole | ||
4628 | * | ||
4629 | * Punches a hole of "length" bytes in a file starting | ||
4630 | * at byte "offset" | ||
4631 | * | ||
4632 | * @inode: The inode of the file to punch a hole in | ||
4633 | * @offset: The starting byte offset of the hole | ||
4634 | * @length: The length of the hole | ||
4635 | * | ||
4636 | * Returns the number of blocks removed or negative on err | ||
4637 | */ | ||
4638 | int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | ||
4639 | { | ||
4640 | struct inode *inode = file_inode(file); | ||
4641 | struct super_block *sb = inode->i_sb; | ||
4642 | ext4_lblk_t first_block, stop_block; | ||
4643 | struct address_space *mapping = inode->i_mapping; | ||
4644 | handle_t *handle; | ||
4645 | loff_t first_page, last_page, page_len; | ||
4646 | loff_t first_page_offset, last_page_offset; | ||
4647 | int credits, err = 0; | ||
4648 | |||
4649 | /* | ||
4650 | * Write out all dirty pages to avoid race conditions | ||
4651 | * Then release them. | ||
4652 | */ | ||
4653 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
4654 | err = filemap_write_and_wait_range(mapping, | ||
4655 | offset, offset + length - 1); | ||
4656 | |||
4657 | if (err) | ||
4658 | return err; | ||
4659 | } | ||
4660 | |||
4661 | mutex_lock(&inode->i_mutex); | ||
4662 | /* It's not possible to punch a hole in an append-only file */ | ||
4663 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { | ||
4664 | err = -EPERM; | ||
4665 | goto out_mutex; | ||
4666 | } | ||
4667 | if (IS_SWAPFILE(inode)) { | ||
4668 | err = -ETXTBSY; | ||
4669 | goto out_mutex; | ||
4670 | } | ||
4671 | |||
4672 | /* No need to punch hole beyond i_size */ | ||
4673 | if (offset >= inode->i_size) | ||
4674 | goto out_mutex; | ||
4675 | |||
4676 | /* | ||
4677 | * If the hole extends beyond i_size, set the hole | ||
4678 | * to end after the page that contains i_size | ||
4679 | */ | ||
4680 | if (offset + length > inode->i_size) { | ||
4681 | length = inode->i_size + | ||
4682 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | ||
4683 | offset; | ||
4684 | } | ||
4685 | |||
4686 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
4687 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | ||
4688 | |||
4689 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | ||
4690 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | ||
4691 | |||
4692 | /* Now release the pages */ | ||
4693 | if (last_page_offset > first_page_offset) { | ||
4694 | truncate_pagecache_range(inode, first_page_offset, | ||
4695 | last_page_offset - 1); | ||
4696 | } | ||
4697 | |||
4698 | /* Wait for all existing dio workers; newcomers will block on i_mutex */ | ||
4699 | ext4_inode_block_unlocked_dio(inode); | ||
4700 | err = ext4_flush_unwritten_io(inode); | ||
4701 | if (err) | ||
4702 | goto out_dio; | ||
4703 | inode_dio_wait(inode); | ||
4704 | |||
4705 | credits = ext4_writepage_trans_blocks(inode); | ||
4706 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||
4707 | if (IS_ERR(handle)) { | ||
4708 | err = PTR_ERR(handle); | ||
4709 | goto out_dio; | ||
4710 | } | ||
4711 | |||
4712 | |||
4713 | /* | ||
4714 | * Now we need to zero out the non-page-aligned data in the | ||
4715 | * pages at the start and tail of the hole, and unmap the buffer | ||
4716 | * heads for the block aligned regions of the page that were | ||
4717 | * completely zeroed. | ||
4718 | */ | ||
4719 | if (first_page > last_page) { | ||
4720 | /* | ||
4721 | * If the file space being truncated is contained within a page | ||
4722 | * just zero out and unmap the middle of that page | ||
4723 | */ | ||
4724 | err = ext4_discard_partial_page_buffers(handle, | ||
4725 | mapping, offset, length, 0); | ||
4726 | |||
4727 | if (err) | ||
4728 | goto out; | ||
4729 | } else { | ||
4730 | /* | ||
4731 | * zero out and unmap the partial page that contains | ||
4732 | * the start of the hole | ||
4733 | */ | ||
4734 | page_len = first_page_offset - offset; | ||
4735 | if (page_len > 0) { | ||
4736 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
4737 | offset, page_len, 0); | ||
4738 | if (err) | ||
4739 | goto out; | ||
4740 | } | ||
4741 | |||
4742 | /* | ||
4743 | * zero out and unmap the partial page that contains | ||
4744 | * the end of the hole | ||
4745 | */ | ||
4746 | page_len = offset + length - last_page_offset; | ||
4747 | if (page_len > 0) { | ||
4748 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
4749 | last_page_offset, page_len, 0); | ||
4750 | if (err) | ||
4751 | goto out; | ||
4752 | } | ||
4753 | } | ||
4754 | |||
4755 | /* | ||
4756 | * If i_size is contained in the last page, we need to | ||
4757 | * unmap and zero the partial page after i_size | ||
4758 | */ | ||
4759 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
4760 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
4761 | |||
4762 | page_len = PAGE_CACHE_SIZE - | ||
4763 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
4764 | |||
4765 | if (page_len > 0) { | ||
4766 | err = ext4_discard_partial_page_buffers(handle, | ||
4767 | mapping, inode->i_size, page_len, 0); | ||
4768 | |||
4769 | if (err) | ||
4770 | goto out; | ||
4771 | } | ||
4772 | } | ||
4773 | |||
4774 | first_block = (offset + sb->s_blocksize - 1) >> | ||
4775 | EXT4_BLOCK_SIZE_BITS(sb); | ||
4776 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
4777 | |||
4778 | /* If there are no blocks to remove, return now */ | ||
4779 | if (first_block >= stop_block) | ||
4780 | goto out; | ||
4781 | |||
4782 | down_write(&EXT4_I(inode)->i_data_sem); | ||
4783 | ext4_discard_preallocations(inode); | ||
4784 | |||
4785 | err = ext4_es_remove_extent(inode, first_block, | ||
4786 | stop_block - first_block); | ||
4787 | err = ext4_ext_remove_space(inode, first_block, stop_block - 1); | ||
4788 | |||
4789 | ext4_discard_preallocations(inode); | ||
4790 | |||
4791 | if (IS_SYNC(inode)) | ||
4792 | ext4_handle_sync(handle); | ||
4793 | |||
4794 | up_write(&EXT4_I(inode)->i_data_sem); | ||
4795 | |||
4796 | out: | ||
4797 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4798 | ext4_mark_inode_dirty(handle, inode); | ||
4799 | ext4_journal_stop(handle); | ||
4800 | out_dio: | ||
4801 | ext4_inode_resume_unlocked_dio(inode); | ||
4802 | out_mutex: | ||
4803 | mutex_unlock(&inode->i_mutex); | ||
4804 | return err; | ||
4805 | } | ||
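The whole extent-mapped punch-hole body is deleted here, and its indirect-mapped twin goes away later in this diff; presumably both are replaced by a single common entry point (likely ext4_punch_hole() in inode.c, not shown in this excerpt) whose core dispatch would look roughly like this sketch:

    if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
            err = ext4_ext_remove_space(inode, first_block,
                                        stop_block - 1);
    else
            err = ext4_free_hole_blocks(handle, inode, first_block,
                                        stop_block);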
4806 | |||
4807 | int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 4679 | int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
4808 | __u64 start, __u64 len) | 4680 | __u64 start, __u64 len) |
4809 | { | 4681 | { |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..e0ba8a408def 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
166 | if (journal->j_flags & JBD2_BARRIER && | 166 | if (journal->j_flags & JBD2_BARRIER && |
167 | !jbd2_trans_will_send_data_barrier(journal, commit_tid)) | 167 | !jbd2_trans_will_send_data_barrier(journal, commit_tid)) |
168 | needs_barrier = true; | 168 | needs_barrier = true; |
169 | jbd2_log_start_commit(journal, commit_tid); | 169 | ret = jbd2_complete_transaction(journal, commit_tid); |
170 | ret = jbd2_log_wait_commit(journal, commit_tid); | ||
171 | if (needs_barrier) { | 170 | if (needs_barrier) { |
172 | err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); | 171 | err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); |
173 | if (!ret) | 172 | if (!ret) |
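jbd2_complete_transaction() collapses the removed start/wait pair into one call. Its definition is outside this excerpt; as a sketch of the assumed semantics (the real helper can also skip waking the journal thread when the tid has already committed):

    int jbd2_complete_transaction(journal_t *journal, tid_t tid)
    {
            jbd2_log_start_commit(journal, tid);    /* no-op if committed */
            return jbd2_log_wait_commit(journal, tid);
    }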
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6c5bb8d993fe..00a818d67b54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
166 | trace_ext4_load_inode_bitmap(sb, block_group); | 166 | trace_ext4_load_inode_bitmap(sb, block_group); |
167 | bh->b_end_io = ext4_end_bitmap_read; | 167 | bh->b_end_io = ext4_end_bitmap_read; |
168 | get_bh(bh); | 168 | get_bh(bh); |
169 | submit_bh(READ, bh); | 169 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
170 | wait_on_buffer(bh); | 170 | wait_on_buffer(bh); |
171 | if (!buffer_uptodate(bh)) { | 171 | if (!buffer_uptodate(bh)) { |
172 | put_bh(bh); | 172 | put_bh(bh); |
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, | |||
666 | ei = EXT4_I(inode); | 666 | ei = EXT4_I(inode); |
667 | sbi = EXT4_SB(sb); | 667 | sbi = EXT4_SB(sb); |
668 | 668 | ||
669 | /* | ||
670 | * Initialize owners and quota early so that we don't have to account | ||
671 | * for the quota initialization worst case in the standard inode-creating | ||
672 | * transaction | ||
673 | */ | ||
674 | if (owner) { | ||
675 | inode->i_mode = mode; | ||
676 | i_uid_write(inode, owner[0]); | ||
677 | i_gid_write(inode, owner[1]); | ||
678 | } else if (test_opt(sb, GRPID)) { | ||
679 | inode->i_mode = mode; | ||
680 | inode->i_uid = current_fsuid(); | ||
681 | inode->i_gid = dir->i_gid; | ||
682 | } else | ||
683 | inode_init_owner(inode, dir, mode); | ||
684 | dquot_initialize(inode); | ||
685 | |||
669 | if (!goal) | 686 | if (!goal) |
670 | goal = sbi->s_inode_goal; | 687 | goal = sbi->s_inode_goal; |
671 | 688 | ||
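Pulling the owner setup and dquot_initialize() ahead of the bitmap scan also explains the companion deletions further down in this hunk (the owner block at its old location, and the late dquot_initialize() just before dquot_alloc_inode()): the quota inodes are now fully initialized before any journal handle exists, so the create transaction's credit estimate (nblocks) no longer has to reserve for the quota-initialization worst case.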
@@ -697,7 +714,7 @@ got_group: | |||
697 | 714 | ||
698 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); | 715 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); |
699 | if (!gdp) | 716 | if (!gdp) |
700 | goto fail; | 717 | goto out; |
701 | 718 | ||
702 | /* | 719 | /* |
703 | * Check free inodes count before loading bitmap. | 720 | * Check free inodes count before loading bitmap. |
@@ -711,7 +728,7 @@ got_group: | |||
711 | brelse(inode_bitmap_bh); | 728 | brelse(inode_bitmap_bh); |
712 | inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); | 729 | inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); |
713 | if (!inode_bitmap_bh) | 730 | if (!inode_bitmap_bh) |
714 | goto fail; | 731 | goto out; |
715 | 732 | ||
716 | repeat_in_this_group: | 733 | repeat_in_this_group: |
717 | ino = ext4_find_next_zero_bit((unsigned long *) | 734 | ino = ext4_find_next_zero_bit((unsigned long *) |
@@ -733,13 +750,16 @@ repeat_in_this_group: | |||
733 | handle_type, nblocks); | 750 | handle_type, nblocks); |
734 | if (IS_ERR(handle)) { | 751 | if (IS_ERR(handle)) { |
735 | err = PTR_ERR(handle); | 752 | err = PTR_ERR(handle); |
736 | goto fail; | 753 | ext4_std_error(sb, err); |
754 | goto out; | ||
737 | } | 755 | } |
738 | } | 756 | } |
739 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); | 757 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); |
740 | err = ext4_journal_get_write_access(handle, inode_bitmap_bh); | 758 | err = ext4_journal_get_write_access(handle, inode_bitmap_bh); |
741 | if (err) | 759 | if (err) { |
742 | goto fail; | 760 | ext4_std_error(sb, err); |
761 | goto out; | ||
762 | } | ||
743 | ext4_lock_group(sb, group); | 763 | ext4_lock_group(sb, group); |
744 | ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); | 764 | ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); |
745 | ext4_unlock_group(sb, group); | 765 | ext4_unlock_group(sb, group); |
@@ -755,8 +775,10 @@ repeat_in_this_group: | |||
755 | got: | 775 | got: |
756 | BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); | 776 | BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); |
757 | err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); | 777 | err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); |
758 | if (err) | 778 | if (err) { |
759 | goto fail; | 779 | ext4_std_error(sb, err); |
780 | goto out; | ||
781 | } | ||
760 | 782 | ||
761 | /* We may have to initialize the block bitmap if it isn't already */ | 783 | /* We may have to initialize the block bitmap if it isn't already */ |
762 | if (ext4_has_group_desc_csum(sb) && | 784 | if (ext4_has_group_desc_csum(sb) && |
@@ -768,7 +790,8 @@ got: | |||
768 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); | 790 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); |
769 | if (err) { | 791 | if (err) { |
770 | brelse(block_bitmap_bh); | 792 | brelse(block_bitmap_bh); |
771 | goto fail; | 793 | ext4_std_error(sb, err); |
794 | goto out; | ||
772 | } | 795 | } |
773 | 796 | ||
774 | BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); | 797 | BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); |
@@ -787,14 +810,18 @@ got: | |||
787 | ext4_unlock_group(sb, group); | 810 | ext4_unlock_group(sb, group); |
788 | brelse(block_bitmap_bh); | 811 | brelse(block_bitmap_bh); |
789 | 812 | ||
790 | if (err) | 813 | if (err) { |
791 | goto fail; | 814 | ext4_std_error(sb, err); |
815 | goto out; | ||
816 | } | ||
792 | } | 817 | } |
793 | 818 | ||
794 | BUFFER_TRACE(group_desc_bh, "get_write_access"); | 819 | BUFFER_TRACE(group_desc_bh, "get_write_access"); |
795 | err = ext4_journal_get_write_access(handle, group_desc_bh); | 820 | err = ext4_journal_get_write_access(handle, group_desc_bh); |
796 | if (err) | 821 | if (err) { |
797 | goto fail; | 822 | ext4_std_error(sb, err); |
823 | goto out; | ||
824 | } | ||
798 | 825 | ||
799 | /* Update the relevant bg descriptor fields */ | 826 | /* Update the relevant bg descriptor fields */ |
800 | if (ext4_has_group_desc_csum(sb)) { | 827 | if (ext4_has_group_desc_csum(sb)) { |
@@ -840,8 +867,10 @@ got: | |||
840 | 867 | ||
841 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); | 868 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); |
842 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); | 869 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); |
843 | if (err) | 870 | if (err) { |
844 | goto fail; | 871 | ext4_std_error(sb, err); |
872 | goto out; | ||
873 | } | ||
845 | 874 | ||
846 | percpu_counter_dec(&sbi->s_freeinodes_counter); | 875 | percpu_counter_dec(&sbi->s_freeinodes_counter); |
847 | if (S_ISDIR(mode)) | 876 | if (S_ISDIR(mode)) |
@@ -851,16 +880,6 @@ got: | |||
851 | flex_group = ext4_flex_group(sbi, group); | 880 | flex_group = ext4_flex_group(sbi, group); |
852 | atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); | 881 | atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); |
853 | } | 882 | } |
854 | if (owner) { | ||
855 | inode->i_mode = mode; | ||
856 | i_uid_write(inode, owner[0]); | ||
857 | i_gid_write(inode, owner[1]); | ||
858 | } else if (test_opt(sb, GRPID)) { | ||
859 | inode->i_mode = mode; | ||
860 | inode->i_uid = current_fsuid(); | ||
861 | inode->i_gid = dir->i_gid; | ||
862 | } else | ||
863 | inode_init_owner(inode, dir, mode); | ||
864 | 883 | ||
865 | inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); | 884 | inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); |
866 | /* This is the optimal IO size (for stat), not the fs block size */ | 885 | /* This is the optimal IO size (for stat), not the fs block size */ |
@@ -889,7 +908,9 @@ got: | |||
889 | * twice. | 908 | * twice. |
890 | */ | 909 | */ |
891 | err = -EIO; | 910 | err = -EIO; |
892 | goto fail; | 911 | ext4_error(sb, "failed to insert inode %lu: doubly allocated?", |
912 | inode->i_ino); | ||
913 | goto out; | ||
893 | } | 914 | } |
894 | spin_lock(&sbi->s_next_gen_lock); | 915 | spin_lock(&sbi->s_next_gen_lock); |
895 | inode->i_generation = sbi->s_next_generation++; | 916 | inode->i_generation = sbi->s_next_generation++; |
@@ -899,7 +920,6 @@ got: | |||
899 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 920 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
900 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { | 921 | EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { |
901 | __u32 csum; | 922 | __u32 csum; |
902 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
903 | __le32 inum = cpu_to_le32(inode->i_ino); | 923 | __le32 inum = cpu_to_le32(inode->i_ino); |
904 | __le32 gen = cpu_to_le32(inode->i_generation); | 924 | __le32 gen = cpu_to_le32(inode->i_generation); |
905 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, | 925 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, |
@@ -918,7 +938,6 @@ got: | |||
918 | ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); | 938 | ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); |
919 | 939 | ||
920 | ret = inode; | 940 | ret = inode; |
921 | dquot_initialize(inode); | ||
922 | err = dquot_alloc_inode(inode); | 941 | err = dquot_alloc_inode(inode); |
923 | if (err) | 942 | if (err) |
924 | goto fail_drop; | 943 | goto fail_drop; |
@@ -952,24 +971,17 @@ got: | |||
952 | 971 | ||
953 | ext4_debug("allocating inode %lu\n", inode->i_ino); | 972 | ext4_debug("allocating inode %lu\n", inode->i_ino); |
954 | trace_ext4_allocate_inode(inode, dir, mode); | 973 | trace_ext4_allocate_inode(inode, dir, mode); |
955 | goto really_out; | ||
956 | fail: | ||
957 | ext4_std_error(sb, err); | ||
958 | out: | ||
959 | iput(inode); | ||
960 | ret = ERR_PTR(err); | ||
961 | really_out: | ||
962 | brelse(inode_bitmap_bh); | 974 | brelse(inode_bitmap_bh); |
963 | return ret; | 975 | return ret; |
964 | 976 | ||
965 | fail_free_drop: | 977 | fail_free_drop: |
966 | dquot_free_inode(inode); | 978 | dquot_free_inode(inode); |
967 | |||
968 | fail_drop: | 979 | fail_drop: |
969 | dquot_drop(inode); | ||
970 | inode->i_flags |= S_NOQUOTA; | ||
971 | clear_nlink(inode); | 980 | clear_nlink(inode); |
972 | unlock_new_inode(inode); | 981 | unlock_new_inode(inode); |
982 | out: | ||
983 | dquot_drop(inode); | ||
984 | inode->i_flags |= S_NOQUOTA; | ||
973 | iput(inode); | 985 | iput(inode); |
974 | brelse(inode_bitmap_bh); | 986 | brelse(inode_bitmap_bh); |
975 | return ERR_PTR(err); | 987 | return ERR_PTR(err); |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a04183127ef0..98be6f697463 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -292,131 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | |||
292 | } | 292 | } |
293 | 293 | ||
294 | /** | 294 | /** |
295 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | ||
296 | * @handle: handle for this transaction | ||
297 | * @inode: inode which needs allocated blocks | ||
298 | * @iblock: the logical block to start allocated at | ||
299 | * @goal: preferred physical block of allocation | ||
300 | * @indirect_blks: the number of blocks need to allocate for indirect | ||
301 | * blocks | ||
302 | * @blks: number of desired blocks | ||
303 | * @new_blocks: on return it will store the new block numbers for | ||
304 | * the indirect blocks(if needed) and the first direct block, | ||
305 | * @err: on return it will store the error code | ||
306 | * | ||
307 | * This function will return the number of blocks allocated as | ||
308 | * requested by the passed-in parameters. | ||
309 | */ | ||
310 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
311 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
312 | int indirect_blks, int blks, | ||
313 | ext4_fsblk_t new_blocks[4], int *err) | ||
314 | { | ||
315 | struct ext4_allocation_request ar; | ||
316 | int target, i; | ||
317 | unsigned long count = 0, blk_allocated = 0; | ||
318 | int index = 0; | ||
319 | ext4_fsblk_t current_block = 0; | ||
320 | int ret = 0; | ||
321 | |||
322 | /* | ||
323 | * Here we try to allocate the requested multiple blocks at once, | ||
324 | * on a best-effort basis. | ||
325 | * To build a branch, we should allocate blocks for | ||
326 | * the indirect blocks(if not allocated yet), and at least | ||
327 | * the first direct block of this branch. That's the | ||
328 | * minimum number of blocks need to allocate(required) | ||
329 | */ | ||
330 | /* first we try to allocate the indirect blocks */ | ||
331 | target = indirect_blks; | ||
332 | while (target > 0) { | ||
333 | count = target; | ||
334 | /* allocating blocks for indirect blocks and direct blocks */ | ||
335 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
336 | 0, &count, err); | ||
337 | if (*err) | ||
338 | goto failed_out; | ||
339 | |||
340 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
341 | EXT4_ERROR_INODE(inode, | ||
342 | "current_block %llu + count %lu > %d!", | ||
343 | current_block, count, | ||
344 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
345 | *err = -EIO; | ||
346 | goto failed_out; | ||
347 | } | ||
348 | |||
349 | target -= count; | ||
350 | /* allocate blocks for indirect blocks */ | ||
351 | while (index < indirect_blks && count) { | ||
352 | new_blocks[index++] = current_block++; | ||
353 | count--; | ||
354 | } | ||
355 | if (count > 0) { | ||
356 | /* | ||
357 | * save the new block number | ||
358 | * for the first direct block | ||
359 | */ | ||
360 | new_blocks[index] = current_block; | ||
361 | WARN(1, KERN_INFO "%s returned more blocks than " | ||
362 | "requested\n", __func__); | ||
363 | break; | ||
364 | } | ||
365 | } | ||
366 | |||
367 | target = blks - count ; | ||
368 | blk_allocated = count; | ||
369 | if (!target) | ||
370 | goto allocated; | ||
371 | /* Now allocate data blocks */ | ||
372 | memset(&ar, 0, sizeof(ar)); | ||
373 | ar.inode = inode; | ||
374 | ar.goal = goal; | ||
375 | ar.len = target; | ||
376 | ar.logical = iblock; | ||
377 | if (S_ISREG(inode->i_mode)) | ||
378 | /* enable in-core preallocation only for regular files */ | ||
379 | ar.flags = EXT4_MB_HINT_DATA; | ||
380 | |||
381 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
382 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
383 | EXT4_ERROR_INODE(inode, | ||
384 | "current_block %llu + ar.len %d > %d!", | ||
385 | current_block, ar.len, | ||
386 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
387 | *err = -EIO; | ||
388 | goto failed_out; | ||
389 | } | ||
390 | |||
391 | if (*err && (target == blks)) { | ||
392 | /* | ||
393 | * if the allocation failed and we didn't allocate | ||
394 | * any blocks before | ||
395 | */ | ||
396 | goto failed_out; | ||
397 | } | ||
398 | if (!*err) { | ||
399 | if (target == blks) { | ||
400 | /* | ||
401 | * save the new block number | ||
402 | * for the first direct block | ||
403 | */ | ||
404 | new_blocks[index] = current_block; | ||
405 | } | ||
406 | blk_allocated += ar.len; | ||
407 | } | ||
408 | allocated: | ||
409 | /* total number of blocks allocated for direct blocks */ | ||
410 | ret = blk_allocated; | ||
411 | *err = 0; | ||
412 | return ret; | ||
413 | failed_out: | ||
414 | for (i = 0; i < index; i++) | ||
415 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | /** | ||
420 | * ext4_alloc_branch - allocate and set up a chain of blocks. | 295 | * ext4_alloc_branch - allocate and set up a chain of blocks. |
421 | * @handle: handle for this transaction | 296 | * @handle: handle for this transaction |
422 | * @inode: owner | 297 | * @inode: owner |
@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
448 | int *blks, ext4_fsblk_t goal, | 323 | int *blks, ext4_fsblk_t goal, |
449 | ext4_lblk_t *offsets, Indirect *branch) | 324 | ext4_lblk_t *offsets, Indirect *branch) |
450 | { | 325 | { |
451 | int blocksize = inode->i_sb->s_blocksize; | 326 | struct ext4_allocation_request ar; |
452 | int i, n = 0; | 327 | struct buffer_head * bh; |
453 | int err = 0; | 328 | ext4_fsblk_t b, new_blocks[4]; |
454 | struct buffer_head *bh; | 329 | __le32 *p; |
455 | int num; | 330 | int i, j, err, len = 1; |
456 | ext4_fsblk_t new_blocks[4]; | ||
457 | ext4_fsblk_t current_block; | ||
458 | |||
459 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
460 | *blks, new_blocks, &err); | ||
461 | if (err) | ||
462 | return err; | ||
463 | 331 | ||
464 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
465 | /* | 332 | /* |
466 | * metadata blocks and data blocks are allocated. | 333 | * Set up for the direct block allocation |
467 | */ | 334 | */ |
468 | for (n = 1; n <= indirect_blks; n++) { | 335 | memset(&ar, 0, sizeof(ar)); |
469 | /* | 336 | ar.inode = inode; |
470 | * Get buffer_head for parent block, zero it out | 337 | ar.len = *blks; |
471 | * and set the pointer to new one, then send | 338 | ar.logical = iblock; |
472 | * parent to disk. | 339 | if (S_ISREG(inode->i_mode)) |
473 | */ | 340 | ar.flags = EXT4_MB_HINT_DATA; |
474 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | 341 | |
342 | for (i = 0; i <= indirect_blks; i++) { | ||
343 | if (i == indirect_blks) { | ||
344 | ar.goal = goal; | ||
345 | new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); | ||
346 | } else | ||
347 | goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, | ||
348 | goal, 0, NULL, &err); | ||
349 | if (err) { | ||
350 | i--; | ||
351 | goto failed; | ||
352 | } | ||
353 | branch[i].key = cpu_to_le32(new_blocks[i]); | ||
354 | if (i == 0) | ||
355 | continue; | ||
356 | |||
357 | bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); | ||
475 | if (unlikely(!bh)) { | 358 | if (unlikely(!bh)) { |
476 | err = -ENOMEM; | 359 | err = -ENOMEM; |
477 | goto failed; | 360 | goto failed; |
478 | } | 361 | } |
479 | |||
480 | branch[n].bh = bh; | ||
481 | lock_buffer(bh); | 362 | lock_buffer(bh); |
482 | BUFFER_TRACE(bh, "call get_create_access"); | 363 | BUFFER_TRACE(bh, "call get_create_access"); |
483 | err = ext4_journal_get_create_access(handle, bh); | 364 | err = ext4_journal_get_create_access(handle, bh); |
484 | if (err) { | 365 | if (err) { |
485 | /* Don't brelse(bh) here; it's done in | ||
486 | * ext4_journal_forget() below */ | ||
487 | unlock_buffer(bh); | 366 | unlock_buffer(bh); |
488 | goto failed; | 367 | goto failed; |
489 | } | 368 | } |
490 | 369 | ||
491 | memset(bh->b_data, 0, blocksize); | 370 | memset(bh->b_data, 0, bh->b_size); |
492 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | 371 | p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; |
493 | branch[n].key = cpu_to_le32(new_blocks[n]); | 372 | b = new_blocks[i]; |
494 | *branch[n].p = branch[n].key; | 373 | |
495 | if (n == indirect_blks) { | 374 | if (i == indirect_blks) |
496 | current_block = new_blocks[n]; | 375 | len = ar.len; |
497 | /* | 376 | for (j = 0; j < len; j++) |
498 | * End of chain, update the last new metablock of | 377 | *p++ = cpu_to_le32(b++); |
499 | * the chain to point to the new allocated | 378 | |
500 | * data blocks numbers | ||
501 | */ | ||
502 | for (i = 1; i < num; i++) | ||
503 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
504 | } | ||
505 | BUFFER_TRACE(bh, "marking uptodate"); | 379 | BUFFER_TRACE(bh, "marking uptodate"); |
506 | set_buffer_uptodate(bh); | 380 | set_buffer_uptodate(bh); |
507 | unlock_buffer(bh); | 381 | unlock_buffer(bh); |
@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
511 | if (err) | 385 | if (err) |
512 | goto failed; | 386 | goto failed; |
513 | } | 387 | } |
514 | *blks = num; | 388 | *blks = ar.len; |
515 | return err; | 389 | return 0; |
516 | failed: | 390 | failed: |
517 | /* Allocation failed, free what we already allocated */ | 391 | for (; i >= 0; i--) { |
518 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | 392 | if (i != indirect_blks && branch[i].bh) |
519 | for (i = 1; i <= n ; i++) { | 393 | ext4_forget(handle, 1, inode, branch[i].bh, |
520 | /* | 394 | branch[i].bh->b_blocknr); |
521 | * branch[i].bh is newly allocated, so there is no | 395 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], |
522 | * need to revoke the block, which is why we don't | 396 | (i == indirect_blks) ? ar.len : 1, 0); |
523 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
524 | */ | ||
525 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
526 | EXT4_FREE_BLOCKS_FORGET); | ||
527 | } | 397 | } |
528 | for (i = n+1; i < indirect_blks; i++) | ||
529 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
530 | |||
531 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
532 | |||
533 | return err; | 398 | return err; |
534 | } | 399 | } |
535 | 400 | ||
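The rewritten ext4_alloc_branch() above folds the old two-helper scheme into a single loop: iterations 0..indirect_blks-1 each take one metadata block from ext4_new_meta_blocks(), with each new block feeding back in as the goal for the next, and the final iteration allocates the run of data blocks through ext4_mb_new_blocks(). The failure path is now symmetric with the allocation: it walks i back down, calling ext4_forget() on each metadata buffer so the journal can revoke it, and frees ar.len blocks for the data run but exactly one block per metadata level.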
@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
941 | * be able to restart the transaction at a convenient checkpoint to make | 806 | * be able to restart the transaction at a convenient checkpoint to make
942 | * sure we don't overflow the journal. | 807 | * sure we don't overflow the journal. |
943 | * | 808 | * |
944 | * start_transaction gets us a new handle for a truncate transaction, | 809 | * Try to extend this transaction for the purposes of truncation. If |
945 | * and extend_transaction tries to extend the existing one a bit. If | ||
946 | * extend fails, we need to propagate the failure up and restart the | 810 | * extend fails, we need to propagate the failure up and restart the |
947 | * transaction in the top-level truncate loop. --sct | 811 | * transaction in the top-level truncate loop. --sct |
948 | */ | ||
949 | static handle_t *start_transaction(struct inode *inode) | ||
950 | { | ||
951 | handle_t *result; | ||
952 | |||
953 | result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, | ||
954 | ext4_blocks_for_truncate(inode)); | ||
955 | if (!IS_ERR(result)) | ||
956 | return result; | ||
957 | |||
958 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
959 | return result; | ||
960 | } | ||
961 | |||
962 | /* | ||
963 | * Try to extend this transaction for the purposes of truncation. | ||
964 | * | 812 | * |
965 | * Returns 0 if we managed to create more room. If we can't create more | 813 | * Returns 0 if we managed to create more room. If we can't create more |
966 | * room, and the transaction must be restarted we return 1. | 814 | * room, and the transaction must be restarted we return 1. |
@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
1353 | } | 1201 | } |
1354 | } | 1202 | } |
1355 | 1203 | ||
1356 | void ext4_ind_truncate(struct inode *inode) | 1204 | void ext4_ind_truncate(handle_t *handle, struct inode *inode) |
1357 | { | 1205 | { |
1358 | handle_t *handle; | ||
1359 | struct ext4_inode_info *ei = EXT4_I(inode); | 1206 | struct ext4_inode_info *ei = EXT4_I(inode); |
1360 | __le32 *i_data = ei->i_data; | 1207 | __le32 *i_data = ei->i_data; |
1361 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 1208 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
1362 | struct address_space *mapping = inode->i_mapping; | ||
1363 | ext4_lblk_t offsets[4]; | 1209 | ext4_lblk_t offsets[4]; |
1364 | Indirect chain[4]; | 1210 | Indirect chain[4]; |
1365 | Indirect *partial; | 1211 | Indirect *partial; |
1366 | __le32 nr = 0; | 1212 | __le32 nr = 0; |
1367 | int n = 0; | 1213 | int n = 0; |
1368 | ext4_lblk_t last_block, max_block; | 1214 | ext4_lblk_t last_block, max_block; |
1369 | loff_t page_len; | ||
1370 | unsigned blocksize = inode->i_sb->s_blocksize; | 1215 | unsigned blocksize = inode->i_sb->s_blocksize; |
1371 | int err; | ||
1372 | |||
1373 | handle = start_transaction(inode); | ||
1374 | if (IS_ERR(handle)) | ||
1375 | return; /* AKPM: return what? */ | ||
1376 | 1216 | ||
1377 | last_block = (inode->i_size + blocksize-1) | 1217 | last_block = (inode->i_size + blocksize-1) |
1378 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 1218 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
1379 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | 1219 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) |
1380 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 1220 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
1381 | 1221 | ||
1382 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
1383 | page_len = PAGE_CACHE_SIZE - | ||
1384 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
1385 | |||
1386 | err = ext4_discard_partial_page_buffers(handle, | ||
1387 | mapping, inode->i_size, page_len, 0); | ||
1388 | |||
1389 | if (err) | ||
1390 | goto out_stop; | ||
1391 | } | ||
1392 | |||
1393 | if (last_block != max_block) { | 1222 | if (last_block != max_block) { |
1394 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | 1223 | n = ext4_block_to_path(inode, last_block, offsets, NULL); |
1395 | if (n == 0) | 1224 | if (n == 0) |
1396 | goto out_stop; /* error */ | 1225 | return; |
1397 | } | 1226 | } |
1398 | 1227 | ||
1399 | /* | ||
1400 | * OK. This truncate is going to happen. We add the inode to the | ||
1401 | * orphan list, so that if this truncate spans multiple transactions, | ||
1402 | * and we crash, we will resume the truncate when the filesystem | ||
1403 | * recovers. It also marks the inode dirty, to catch the new size. | ||
1404 | * | ||
1405 | * Implication: the file must always be in a sane, consistent | ||
1406 | * truncatable state while each transaction commits. | ||
1407 | */ | ||
1408 | if (ext4_orphan_add(handle, inode)) | ||
1409 | goto out_stop; | ||
1410 | |||
1411 | /* | ||
1412 | * From here we block out all ext4_get_block() callers who want to | ||
1413 | * modify the block allocation tree. | ||
1414 | */ | ||
1415 | down_write(&ei->i_data_sem); | ||
1416 | |||
1417 | ext4_discard_preallocations(inode); | ||
1418 | ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); | 1228 | ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); |
1419 | 1229 | ||
1420 | /* | 1230 | /* |
@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode) | |||
1431 | * It is unnecessary to free any data blocks if last_block is | 1241 | * It is unnecessary to free any data blocks if last_block is |
1432 | * equal to the indirect block limit. | 1242 | * equal to the indirect block limit. |
1433 | */ | 1243 | */ |
1434 | goto out_unlock; | 1244 | return; |
1435 | } else if (n == 1) { /* direct blocks */ | 1245 | } else if (n == 1) { /* direct blocks */ |
1436 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 1246 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
1437 | i_data + EXT4_NDIR_BLOCKS); | 1247 | i_data + EXT4_NDIR_BLOCKS); |
@@ -1491,31 +1301,6 @@ do_indirects: | |||
1491 | case EXT4_TIND_BLOCK: | 1301 | case EXT4_TIND_BLOCK: |
1492 | ; | 1302 | ; |
1493 | } | 1303 | } |
1494 | |||
1495 | out_unlock: | ||
1496 | up_write(&ei->i_data_sem); | ||
1497 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1498 | ext4_mark_inode_dirty(handle, inode); | ||
1499 | |||
1500 | /* | ||
1501 | * In a multi-transaction truncate, we only make the final transaction | ||
1502 | * synchronous | ||
1503 | */ | ||
1504 | if (IS_SYNC(inode)) | ||
1505 | ext4_handle_sync(handle); | ||
1506 | out_stop: | ||
1507 | /* | ||
1508 | * If this was a simple ftruncate(), and the file will remain alive | ||
1509 | * then we need to clear up the orphan record which we created above. | ||
1510 | * However, if this was a real unlink then we were called by | ||
1511 | * ext4_delete_inode(), and we allow that function to clean up the | ||
1512 | * orphan info for us. | ||
1513 | */ | ||
1514 | if (inode->i_nlink) | ||
1515 | ext4_orphan_del(handle, inode); | ||
1516 | |||
1517 | ext4_journal_stop(handle); | ||
1518 | trace_ext4_truncate_exit(inode); | ||
1519 | } | 1304 | } |
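ext4_ind_truncate() gets the same surgery as ext4_ext_truncate() earlier in this diff: the journal handle becomes a parameter, and the partial-page zeroing, orphan-list handling, i_data_sem locking, and final journal stop all disappear, presumably into a shared caller. The caller-side sketch shown after the extents truncate hunk applies here as well.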
1520 | 1305 | ||
1521 | static int free_hole_blocks(handle_t *handle, struct inode *inode, | 1306 | static int free_hole_blocks(handle_t *handle, struct inode *inode, |
@@ -1569,8 +1354,8 @@ err: | |||
1569 | return ret; | 1354 | return ret; |
1570 | } | 1355 | } |
1571 | 1356 | ||
1572 | static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, | 1357 | int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, |
1573 | ext4_lblk_t first, ext4_lblk_t stop) | 1358 | ext4_lblk_t first, ext4_lblk_t stop) |
1574 | { | 1359 | { |
1575 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 1360 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
1576 | int level, ret = 0; | 1361 | int level, ret = 0; |
@@ -1604,157 +1389,3 @@ err: | |||
1604 | return ret; | 1389 | return ret; |
1605 | } | 1390 | } |
1606 | 1391 | ||
1607 | int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) | ||
1608 | { | ||
1609 | struct inode *inode = file_inode(file); | ||
1610 | struct super_block *sb = inode->i_sb; | ||
1611 | ext4_lblk_t first_block, stop_block; | ||
1612 | struct address_space *mapping = inode->i_mapping; | ||
1613 | handle_t *handle = NULL; | ||
1614 | loff_t first_page, last_page, page_len; | ||
1615 | loff_t first_page_offset, last_page_offset; | ||
1616 | int err = 0; | ||
1617 | |||
1618 | /* | ||
1619 | * Write out all dirty pages to avoid race conditions | ||
1620 | * Then release them. | ||
1621 | */ | ||
1622 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
1623 | err = filemap_write_and_wait_range(mapping, | ||
1624 | offset, offset + length - 1); | ||
1625 | if (err) | ||
1626 | return err; | ||
1627 | } | ||
1628 | |||
1629 | mutex_lock(&inode->i_mutex); | ||
1631 | /* It's not possible to punch a hole in an append-only file */ | ||
1631 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { | ||
1632 | err = -EPERM; | ||
1633 | goto out_mutex; | ||
1634 | } | ||
1635 | if (IS_SWAPFILE(inode)) { | ||
1636 | err = -ETXTBSY; | ||
1637 | goto out_mutex; | ||
1638 | } | ||
1639 | |||
1640 | /* No need to punch hole beyond i_size */ | ||
1641 | if (offset >= inode->i_size) | ||
1642 | goto out_mutex; | ||
1643 | |||
1644 | /* | ||
1645 | * If the hole extends beyond i_size, set the hole | ||
1646 | * to end after the page that contains i_size | ||
1647 | */ | ||
1648 | if (offset + length > inode->i_size) { | ||
1649 | length = inode->i_size + | ||
1650 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | ||
1651 | offset; | ||
1652 | } | ||
1653 | |||
1654 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1655 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | ||
1656 | |||
1657 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | ||
1658 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | ||
1659 | |||
1660 | /* Now release the pages */ | ||
1661 | if (last_page_offset > first_page_offset) { | ||
1662 | truncate_pagecache_range(inode, first_page_offset, | ||
1663 | last_page_offset - 1); | ||
1664 | } | ||
1665 | |||
1666 | /* Wait for all existing dio workers; newcomers will block on i_mutex */ | ||
1667 | inode_dio_wait(inode); | ||
1668 | |||
1669 | handle = start_transaction(inode); | ||
1670 | if (IS_ERR(handle)) | ||
1671 | goto out_mutex; | ||
1672 | |||
1673 | /* | ||
1674 | * Now we need to zero out the non-page-aligned data in the | ||
1675 | * pages at the start and tail of the hole, and unmap the buffer | ||
1676 | * heads for the block aligned regions of the page that were | ||
1677 | * completely zeroed. | ||
1678 | */ | ||
1679 | if (first_page > last_page) { | ||
1680 | /* | ||
1681 | * If the file space being truncated is contained within a page | ||
1682 | * just zero out and unmap the middle of that page | ||
1683 | */ | ||
1684 | err = ext4_discard_partial_page_buffers(handle, | ||
1685 | mapping, offset, length, 0); | ||
1686 | if (err) | ||
1687 | goto out; | ||
1688 | } else { | ||
1689 | /* | ||
1690 | * Zero out and unmap the partial page that contains | ||
1691 | * the start of the hole | ||
1692 | */ | ||
1693 | page_len = first_page_offset - offset; | ||
1694 | if (page_len > 0) { | ||
1695 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
1696 | offset, page_len, 0); | ||
1697 | if (err) | ||
1698 | goto out; | ||
1699 | } | ||
1700 | |||
1701 | /* | ||
1702 | * Zero out and unmap the partial page that contains | ||
1703 | * the end of the hole | ||
1704 | */ | ||
1705 | page_len = offset + length - last_page_offset; | ||
1706 | if (page_len > 0) { | ||
1707 | err = ext4_discard_partial_page_buffers(handle, mapping, | ||
1708 | last_page_offset, page_len, 0); | ||
1709 | if (err) | ||
1710 | goto out; | ||
1711 | } | ||
1712 | } | ||
1713 | |||
1714 | /* | ||
1715 | * If i_size is contained in the last page, we need to | ||
1716 | * unmap and zero the partial page after i_size | ||
1717 | */ | ||
1718 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
1719 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
1720 | page_len = PAGE_CACHE_SIZE - | ||
1721 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
1722 | if (page_len > 0) { | ||
1723 | err = ext4_discard_partial_page_buffers(handle, | ||
1724 | mapping, inode->i_size, page_len, 0); | ||
1725 | if (err) | ||
1726 | goto out; | ||
1727 | } | ||
1728 | } | ||
1729 | |||
1730 | first_block = (offset + sb->s_blocksize - 1) >> | ||
1731 | EXT4_BLOCK_SIZE_BITS(sb); | ||
1732 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
1733 | |||
1734 | if (first_block >= stop_block) | ||
1735 | goto out; | ||
1736 | |||
1737 | down_write(&EXT4_I(inode)->i_data_sem); | ||
1738 | ext4_discard_preallocations(inode); | ||
1739 | |||
1740 | err = ext4_es_remove_extent(inode, first_block, | ||
1741 | stop_block - first_block); | ||
1742 | err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); | ||
1743 | |||
1744 | ext4_discard_preallocations(inode); | ||
1745 | |||
1746 | if (IS_SYNC(inode)) | ||
1747 | ext4_handle_sync(handle); | ||
1748 | |||
1749 | up_write(&EXT4_I(inode)->i_data_sem); | ||
1750 | |||
1751 | out: | ||
1752 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1753 | ext4_mark_inode_dirty(handle, inode); | ||
1754 | ext4_journal_stop(handle); | ||
1755 | |||
1756 | out_mutex: | ||
1757 | mutex_unlock(&inode->i_mutex); | ||
1758 | |||
1759 | return err; | ||
1760 | } | ||
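This deletion is the indirect-mapped twin of the ext4_ext_punch_hole() removal earlier in the diff; the two bodies were near-duplicates. With ext4_free_hole_blocks() made non-static just above, a single common punch-hole entry point can dispatch on the mapping type, as sketched after the extents-side removal.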
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0fd1a123f7d..3e2bf873e8a8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
@@ -19,7 +19,8 @@ | |||
19 | 19 | ||
20 | #define EXT4_XATTR_SYSTEM_DATA "data" | 20 | #define EXT4_XATTR_SYSTEM_DATA "data" |
21 | #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) | 21 | #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) |
22 | #define EXT4_INLINE_DOTDOT_SIZE 4 | 22 | #define EXT4_INLINE_DOTDOT_OFFSET 2 |
23 | #define EXT4_INLINE_DOTDOT_SIZE 4 | ||
23 | 24 | ||
24 | int ext4_get_inline_size(struct inode *inode) | 25 | int ext4_get_inline_size(struct inode *inode) |
25 | { | 26 | { |
@@ -1289,6 +1290,120 @@ out: | |||
1289 | return ret; | 1290 | return ret; |
1290 | } | 1291 | } |
1291 | 1292 | ||
1293 | /* | ||
1294 | * This function fills a red-black tree with information from an | ||
1295 | * inlined dir. It returns the number of directory entries loaded | ||
1296 | * into the tree. If there is an error, it is returned via the return value. | ||
1297 | */ | ||
1298 | int htree_inlinedir_to_tree(struct file *dir_file, | ||
1299 | struct inode *dir, ext4_lblk_t block, | ||
1300 | struct dx_hash_info *hinfo, | ||
1301 | __u32 start_hash, __u32 start_minor_hash, | ||
1302 | int *has_inline_data) | ||
1303 | { | ||
1304 | int err = 0, count = 0; | ||
1305 | unsigned int parent_ino; | ||
1306 | int pos; | ||
1307 | struct ext4_dir_entry_2 *de; | ||
1308 | struct inode *inode = file_inode(dir_file); | ||
1309 | int ret, inline_size = 0; | ||
1310 | struct ext4_iloc iloc; | ||
1311 | void *dir_buf = NULL; | ||
1312 | struct ext4_dir_entry_2 fake; | ||
1313 | |||
1314 | ret = ext4_get_inode_loc(inode, &iloc); | ||
1315 | if (ret) | ||
1316 | return ret; | ||
1317 | |||
1318 | down_read(&EXT4_I(inode)->xattr_sem); | ||
1319 | if (!ext4_has_inline_data(inode)) { | ||
1320 | up_read(&EXT4_I(inode)->xattr_sem); | ||
1321 | *has_inline_data = 0; | ||
1322 | goto out; | ||
1323 | } | ||
1324 | |||
1325 | inline_size = ext4_get_inline_size(inode); | ||
1326 | dir_buf = kmalloc(inline_size, GFP_NOFS); | ||
1327 | if (!dir_buf) { | ||
1328 | ret = -ENOMEM; | ||
1329 | up_read(&EXT4_I(inode)->xattr_sem); | ||
1330 | goto out; | ||
1331 | } | ||
1332 | |||
1333 | ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); | ||
1334 | up_read(&EXT4_I(inode)->xattr_sem); | ||
1335 | if (ret < 0) | ||
1336 | goto out; | ||
1337 | |||
1338 | pos = 0; | ||
1339 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); | ||
1340 | while (pos < inline_size) { | ||
1341 | /* | ||
1342 | * As inlined dir doesn't store any information about '.' and | ||
1343 | * only the inode number of '..' is stored, we have to handle | ||
1344 | * them differently. | ||
1345 | */ | ||
1346 | if (pos == 0) { | ||
1347 | fake.inode = cpu_to_le32(inode->i_ino); | ||
1348 | fake.name_len = 1; | ||
1349 | strcpy(fake.name, "."); | ||
1350 | fake.rec_len = ext4_rec_len_to_disk( | ||
1351 | EXT4_DIR_REC_LEN(fake.name_len), | ||
1352 | inline_size); | ||
1353 | ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); | ||
1354 | de = &fake; | ||
1355 | pos = EXT4_INLINE_DOTDOT_OFFSET; | ||
1356 | } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { | ||
1357 | fake.inode = cpu_to_le32(parent_ino); | ||
1358 | fake.name_len = 2; | ||
1359 | strcpy(fake.name, ".."); | ||
1360 | fake.rec_len = ext4_rec_len_to_disk( | ||
1361 | EXT4_DIR_REC_LEN(fake.name_len), | ||
1362 | inline_size); | ||
1363 | ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); | ||
1364 | de = &fake; | ||
1365 | pos = EXT4_INLINE_DOTDOT_SIZE; | ||
1366 | } else { | ||
1367 | de = (struct ext4_dir_entry_2 *)(dir_buf + pos); | ||
1368 | pos += ext4_rec_len_from_disk(de->rec_len, inline_size); | ||
1369 | if (ext4_check_dir_entry(inode, dir_file, de, | ||
1370 | iloc.bh, dir_buf, | ||
1371 | inline_size, pos)) { | ||
1372 | ret = count; | ||
1373 | goto out; | ||
1374 | } | ||
1375 | } | ||
1376 | |||
1377 | ext4fs_dirhash(de->name, de->name_len, hinfo); | ||
1378 | if ((hinfo->hash < start_hash) || | ||
1379 | ((hinfo->hash == start_hash) && | ||
1380 | (hinfo->minor_hash < start_minor_hash))) | ||
1381 | continue; | ||
1382 | if (de->inode == 0) | ||
1383 | continue; | ||
1384 | err = ext4_htree_store_dirent(dir_file, | ||
1385 | hinfo->hash, hinfo->minor_hash, de); | ||
1386 | if (err) { | ||
1387 | count = err; | ||
1388 | goto out; | ||
1389 | } | ||
1390 | count++; | ||
1391 | } | ||
1392 | ret = count; | ||
1393 | out: | ||
1394 | kfree(dir_buf); | ||
1395 | brelse(iloc.bh); | ||
1396 | return ret; | ||
1397 | } | ||
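A plausible call site for the new helper — the actual caller is not in this excerpt, though ext4_htree_fill_tree() in namei.c is the natural candidate (sketch; variable names assumed):

    if (ext4_has_inline_data(dir)) {
            int has_inline_data = 1;
            count = htree_inlinedir_to_tree(dir_file, dir, 0, hinfo,
                                            start_hash, start_minor_hash,
                                            &has_inline_data);
            if (has_inline_data)
                    return count;
            /* dir was converted under us; fall back to block-based walk */
    }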
1398 | |||
1399 | /* | ||
1400 | * This function is called when the volume was mkfsed with | ||
1401 | * dir_index disabled. In order to keep f_pos persistent | ||
1402 | * after we convert from an inlined dir to a block-based one, | ||
1403 | * we just pretend that we are a normal dir and return the | ||
1404 | * offset as if '.' and '..' really took up space. | ||
1405 | * | ||
1406 | */ | ||
1292 | int ext4_read_inline_dir(struct file *filp, | 1407 | int ext4_read_inline_dir(struct file *filp, |
1293 | void *dirent, filldir_t filldir, | 1408 | void *dirent, filldir_t filldir, |
1294 | int *has_inline_data) | 1409 | int *has_inline_data) |
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp, | |||
1302 | int ret, inline_size = 0; | 1417 | int ret, inline_size = 0; |
1303 | struct ext4_iloc iloc; | 1418 | struct ext4_iloc iloc; |
1304 | void *dir_buf = NULL; | 1419 | void *dir_buf = NULL; |
1420 | int dotdot_offset, dotdot_size, extra_offset, extra_size; | ||
1305 | 1421 | ||
1306 | ret = ext4_get_inode_loc(inode, &iloc); | 1422 | ret = ext4_get_inode_loc(inode, &iloc); |
1307 | if (ret) | 1423 | if (ret) |
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp, | |||
1330 | sb = inode->i_sb; | 1446 | sb = inode->i_sb; |
1331 | stored = 0; | 1447 | stored = 0; |
1332 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); | 1448 | parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); |
1449 | offset = filp->f_pos; | ||
1333 | 1450 | ||
1334 | while (!error && !stored && filp->f_pos < inode->i_size) { | 1451 | /* |
1452 | * dotdot_offset and dotdot_size are the real offset and | ||
1453 | * size for ".." and "." if the dir were block based, while | ||
1454 | * their real size in an inline dir is only EXT4_INLINE_DOTDOT_SIZE. | ||
1455 | * So we will use extra_offset and extra_size to indicate them | ||
1456 | * during the inline dir iteration. | ||
1457 | */ | ||
1458 | dotdot_offset = EXT4_DIR_REC_LEN(1); | ||
1459 | dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); | ||
1460 | extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; | ||
1461 | extra_size = extra_offset + inline_size; | ||
1462 | |||
1463 | while (!error && !stored && filp->f_pos < extra_size) { | ||
1335 | revalidate: | 1464 | revalidate: |
1336 | /* | 1465 | /* |
1337 | * If the version has changed since the last call to | 1466 | * If the version has changed since the last call to |
@@ -1340,15 +1469,23 @@ revalidate: | |||
1340 | * dir to make sure. | 1469 | * dir to make sure. |
1341 | */ | 1470 | */ |
1342 | if (filp->f_version != inode->i_version) { | 1471 | if (filp->f_version != inode->i_version) { |
1343 | for (i = 0; | 1472 | for (i = 0; i < extra_size && i < offset;) { |
1344 | i < inode->i_size && i < offset;) { | 1473 | /* |
1474 | * "." is with offset 0 and | ||
1475 | * ".." is dotdot_offset. | ||
1476 | */ | ||
1345 | if (!i) { | 1477 | if (!i) { |
1346 | /* skip "." and ".." if needed. */ | 1478 | i = dotdot_offset; |
1347 | i += EXT4_INLINE_DOTDOT_SIZE; | 1479 | continue; |
1480 | } else if (i == dotdot_offset) { | ||
1481 | i = dotdot_size; | ||
1348 | continue; | 1482 | continue; |
1349 | } | 1483 | } |
1484 | /* for other entries, the real offset in | ||
1485 | * the buf has to be adjusted accordingly. | ||
1486 | */ | ||
1350 | de = (struct ext4_dir_entry_2 *) | 1487 | de = (struct ext4_dir_entry_2 *) |
1351 | (dir_buf + i); | 1488 | (dir_buf + i - extra_offset); |
1352 | /* It's too expensive to do a full | 1489 | /* It's too expensive to do a full |
1353 | * dirent test each time round this | 1490 | * dirent test each time round this |
1354 | * loop, but we do have to test at | 1491 | * loop, but we do have to test at |
@@ -1356,43 +1493,47 @@ revalidate: | |||
1356 | * failure will be detected in the | 1493 | * failure will be detected in the |
1357 | * dirent test below. */ | 1494 | * dirent test below. */ |
1358 | if (ext4_rec_len_from_disk(de->rec_len, | 1495 | if (ext4_rec_len_from_disk(de->rec_len, |
1359 | inline_size) < EXT4_DIR_REC_LEN(1)) | 1496 | extra_size) < EXT4_DIR_REC_LEN(1)) |
1360 | break; | 1497 | break; |
1361 | i += ext4_rec_len_from_disk(de->rec_len, | 1498 | i += ext4_rec_len_from_disk(de->rec_len, |
1362 | inline_size); | 1499 | extra_size); |
1363 | } | 1500 | } |
1364 | offset = i; | 1501 | offset = i; |
1365 | filp->f_pos = offset; | 1502 | filp->f_pos = offset; |
1366 | filp->f_version = inode->i_version; | 1503 | filp->f_version = inode->i_version; |
1367 | } | 1504 | } |
1368 | 1505 | ||
1369 | while (!error && filp->f_pos < inode->i_size) { | 1506 | while (!error && filp->f_pos < extra_size) { |
1370 | if (filp->f_pos == 0) { | 1507 | if (filp->f_pos == 0) { |
1371 | error = filldir(dirent, ".", 1, 0, inode->i_ino, | 1508 | error = filldir(dirent, ".", 1, 0, inode->i_ino, |
1372 | DT_DIR); | 1509 | DT_DIR); |
1373 | if (error) | 1510 | if (error) |
1374 | break; | 1511 | break; |
1375 | stored++; | 1512 | stored++; |
1513 | filp->f_pos = dotdot_offset; | ||
1514 | continue; | ||
1515 | } | ||
1376 | 1516 | ||
1377 | error = filldir(dirent, "..", 2, 0, parent_ino, | 1517 | if (filp->f_pos == dotdot_offset) { |
1378 | DT_DIR); | 1518 | error = filldir(dirent, "..", 2, |
1519 | dotdot_offset, | ||
1520 | parent_ino, DT_DIR); | ||
1379 | if (error) | 1521 | if (error) |
1380 | break; | 1522 | break; |
1381 | stored++; | 1523 | stored++; |
1382 | 1524 | ||
1383 | filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; | 1525 | filp->f_pos = dotdot_size; |
1384 | continue; | 1526 | continue; |
1385 | } | 1527 | } |
1386 | 1528 | ||
1387 | de = (struct ext4_dir_entry_2 *)(dir_buf + offset); | 1529 | de = (struct ext4_dir_entry_2 *) |
1530 | (dir_buf + filp->f_pos - extra_offset); | ||
1388 | if (ext4_check_dir_entry(inode, filp, de, | 1531 | if (ext4_check_dir_entry(inode, filp, de, |
1389 | iloc.bh, dir_buf, | 1532 | iloc.bh, dir_buf, |
1390 | inline_size, offset)) { | 1533 | extra_size, filp->f_pos)) { |
1391 | ret = stored; | 1534 | ret = stored; |
1392 | goto out; | 1535 | goto out; |
1393 | } | 1536 | } |
1394 | offset += ext4_rec_len_from_disk(de->rec_len, | ||
1395 | inline_size); | ||
1396 | if (le32_to_cpu(de->inode)) { | 1537 | if (le32_to_cpu(de->inode)) { |
1397 | /* We might block in the next section | 1538 | /* We might block in the next section |
1398 | * if the data destination is | 1539 | * if the data destination is |
@@ -1415,9 +1556,8 @@ revalidate: | |||
1415 | stored++; | 1556 | stored++; |
1416 | } | 1557 | } |
1417 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len, | 1558 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len, |
1418 | inline_size); | 1559 | extra_size); |
1419 | } | 1560 | } |
1420 | offset = 0; | ||
1421 | } | 1561 | } |
1422 | out: | 1562 | out: |
1423 | kfree(dir_buf); | 1563 | kfree(dir_buf); |
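The mapping in the hunk above (the inline_data+dir_index readdir fix) is worth making explicit: f_pos 0 and dotdot_offset are virtual positions for "." and "..", and any other f_pos indexes the inline buffer at f_pos - extra_offset. A minimal sketch of that translation; the three constants are hypothetical stand-ins for the values ext4 derives from EXT4_DIR_REC_LEN(1) and EXT4_DIR_REC_LEN(2), not the real ones.

    #include <stdio.h>

    /* Hypothetical stand-ins for the offsets ext4 computes from
     * EXT4_DIR_REC_LEN(); the real values differ. */
    #define DOTDOT_OFFSET 12   /* f_pos of the virtual ".." entry */
    #define DOTDOT_SIZE   24   /* first f_pos of a real inline entry */
    #define EXTRA_OFFSET  20   /* dotdot_size minus the on-disk dot entries */

    /* Mirror "dir_buf + filp->f_pos - extra_offset" from the patch. */
    static long inline_buf_index(long f_pos)
    {
        if (f_pos == 0 || f_pos == DOTDOT_OFFSET)
            return -1;   /* "." and ".." are synthesized, not in the buf */
        return f_pos - EXTRA_OFFSET;
    }

    int main(void)
    {
        long pos[] = { 0, DOTDOT_OFFSET, DOTDOT_SIZE, DOTDOT_SIZE + 16 };
        int i;

        for (i = 0; i < 4; i++)
            printf("f_pos %2ld -> buf index %ld\n", pos[i],
                   inline_buf_index(pos[i]));
        return 0;
    }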
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3a5213bc73e..793d44b84d7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, | |||
55 | __u16 csum_hi = 0; | 55 | __u16 csum_hi = 0; |
56 | __u32 csum; | 56 | __u32 csum; |
57 | 57 | ||
58 | csum_lo = raw->i_checksum_lo; | 58 | csum_lo = le16_to_cpu(raw->i_checksum_lo); |
59 | raw->i_checksum_lo = 0; | 59 | raw->i_checksum_lo = 0; |
60 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 60 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
61 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { | 61 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { |
62 | csum_hi = raw->i_checksum_hi; | 62 | csum_hi = le16_to_cpu(raw->i_checksum_hi); |
63 | raw->i_checksum_hi = 0; | 63 | raw->i_checksum_hi = 0; |
64 | } | 64 | } |
65 | 65 | ||
66 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, | 66 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, |
67 | EXT4_INODE_SIZE(inode->i_sb)); | 67 | EXT4_INODE_SIZE(inode->i_sb)); |
68 | 68 | ||
69 | raw->i_checksum_lo = csum_lo; | 69 | raw->i_checksum_lo = cpu_to_le16(csum_lo); |
70 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && | 70 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && |
71 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) | 71 | EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) |
72 | raw->i_checksum_hi = csum_hi; | 72 | raw->i_checksum_hi = cpu_to_le16(csum_hi); |
73 | 73 | ||
74 | return csum; | 74 | return csum; |
75 | } | 75 | } |
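The hunk above is an endianness fix: i_checksum_lo/hi are little-endian on disk, so saving and restoring them around the checksum computation must go through le16_to_cpu()/cpu_to_le16(), or a big-endian host writes back swapped bytes. A minimal model of the fixed round-trip, with swap16() standing in for the byte swap those helpers perform on big-endian hosts (they are no-ops on little-endian):

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t swap16(uint16_t v)
    {
        return (uint16_t)((v >> 8) | (v << 8));
    }

    /* Model of the fixed save/zero/restore sequence on a big-endian host. */
    int main(void)
    {
        uint16_t raw = 0x3412;          /* on-disk little-endian field */
        uint16_t saved = swap16(raw);   /* le16_to_cpu(): host sees 0x1234 */

        raw = 0;                        /* zeroed while checksumming */
        raw = swap16(saved);            /* cpu_to_le16(): back to 0x3412 */
        printf("restored field: 0x%04x\n", (unsigned)raw);
        return 0;
    }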
@@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode) | |||
210 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 210 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
211 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; | 211 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; |
212 | 212 | ||
213 | jbd2_log_start_commit(journal, commit_tid); | 213 | jbd2_complete_transaction(journal, commit_tid); |
214 | jbd2_log_wait_commit(journal, commit_tid); | ||
215 | filemap_write_and_wait(&inode->i_data); | 214 | filemap_write_and_wait(&inode->i_data); |
216 | } | 215 | } |
217 | truncate_inode_pages(&inode->i_data, 0); | 216 | truncate_inode_pages(&inode->i_data, 0); |
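jbd2_complete_transaction() folds the old start-commit-then-wait pair into one call which, as far as this series shows, only kicks a commit when the target transaction is still running and otherwise just waits for (or skips past) one already under way. A toy model of that shortcut; the enum and helper below are invented for illustration and are not jbd2's API:

    #include <stdio.h>

    enum tx_state { TX_RUNNING, TX_COMMITTING, TX_COMMITTED };

    /* Only start a commit when the transaction is still running; in every
     * case end up with the transaction committed before returning. */
    static void complete_transaction(enum tx_state *st)
    {
        if (*st == TX_RUNNING) {
            printf("starting commit\n");
            *st = TX_COMMITTING;
        }
        if (*st == TX_COMMITTING) {
            printf("waiting for commit\n");
            *st = TX_COMMITTED;
        }
    }

    int main(void)
    {
        enum tx_state st = TX_COMMITTED;  /* already done */
        complete_transaction(&st);        /* no-op: no redundant commit */
        return 0;
    }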
@@ -1081,20 +1080,42 @@ retry_journal: | |||
1081 | /* For write_end() in data=journal mode */ | 1080 | /* For write_end() in data=journal mode */ |
1082 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | 1081 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) |
1083 | { | 1082 | { |
1083 | int ret; | ||
1084 | if (!buffer_mapped(bh) || buffer_freed(bh)) | 1084 | if (!buffer_mapped(bh) || buffer_freed(bh)) |
1085 | return 0; | 1085 | return 0; |
1086 | set_buffer_uptodate(bh); | 1086 | set_buffer_uptodate(bh); |
1087 | return ext4_handle_dirty_metadata(handle, NULL, bh); | 1087 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); |
1088 | clear_buffer_meta(bh); | ||
1089 | clear_buffer_prio(bh); | ||
1090 | return ret; | ||
1088 | } | 1091 | } |
1089 | 1092 | ||
1090 | static int ext4_generic_write_end(struct file *file, | 1093 | /* |
1091 | struct address_space *mapping, | 1094 | * We need to pick up the new inode size which generic_commit_write gave us |
1092 | loff_t pos, unsigned len, unsigned copied, | 1095 | * `file' can be NULL - eg, when called from page_symlink(). |
1093 | struct page *page, void *fsdata) | 1096 | * |
1097 | * ext4 never places buffers on inode->i_mapping->private_list. metadata | ||
1098 | * buffers are managed internally. | ||
1099 | */ | ||
1100 | static int ext4_write_end(struct file *file, | ||
1101 | struct address_space *mapping, | ||
1102 | loff_t pos, unsigned len, unsigned copied, | ||
1103 | struct page *page, void *fsdata) | ||
1094 | { | 1104 | { |
1095 | int i_size_changed = 0; | ||
1096 | struct inode *inode = mapping->host; | ||
1097 | handle_t *handle = ext4_journal_current_handle(); | 1105 | handle_t *handle = ext4_journal_current_handle(); |
1106 | struct inode *inode = mapping->host; | ||
1107 | int ret = 0, ret2; | ||
1108 | int i_size_changed = 0; | ||
1109 | |||
1110 | trace_ext4_write_end(inode, pos, len, copied); | ||
1111 | if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { | ||
1112 | ret = ext4_jbd2_file_inode(handle, inode); | ||
1113 | if (ret) { | ||
1114 | unlock_page(page); | ||
1115 | page_cache_release(page); | ||
1116 | goto errout; | ||
1117 | } | ||
1118 | } | ||
1098 | 1119 | ||
1099 | if (ext4_has_inline_data(inode)) | 1120 | if (ext4_has_inline_data(inode)) |
1100 | copied = ext4_write_inline_data_end(inode, pos, len, | 1121 | copied = ext4_write_inline_data_end(inode, pos, len, |
@@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file, | |||
1105 | 1126 | ||
1106 | /* | 1127 | /* |
1107 | * No need to use i_size_read() here, the i_size | 1128 | * No need to use i_size_read() here, the i_size |
1108 | * cannot change under us because we hold i_mutex. | 1129 | * cannot change under us because we hold i_mutex. |
1109 | * | 1130 | * |
1110 | * But it's important to update i_size while still holding page lock: | 1131 | * But it's important to update i_size while still holding page lock: |
1111 | * page writeout could otherwise come in and zero beyond i_size. | 1132 | * page writeout could otherwise come in and zero beyond i_size. |
@@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file, | |||
1115 | i_size_changed = 1; | 1136 | i_size_changed = 1; |
1116 | } | 1137 | } |
1117 | 1138 | ||
1118 | if (pos + copied > EXT4_I(inode)->i_disksize) { | 1139 | if (pos + copied > EXT4_I(inode)->i_disksize) { |
1119 | /* We need to mark inode dirty even if | 1140 | /* We need to mark inode dirty even if |
1120 | * new_i_size is less than inode->i_size | 1141 | * new_i_size is less than inode->i_size |
1121 | * bu greater than i_disksize.(hint delalloc) | 1142 | * but greater than i_disksize. (hint delalloc) |
1122 | */ | 1143 | */ |
1123 | ext4_update_i_disksize(inode, (pos + copied)); | 1144 | ext4_update_i_disksize(inode, (pos + copied)); |
1124 | i_size_changed = 1; | 1145 | i_size_changed = 1; |
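The retained test deserves spelling out: the inode must be marked dirty whenever pos + copied moves past i_disksize, even when the result is still below i_size, because delalloc can leave i_size ahead of what is actually on disk. A small worked check; the function name is invented for the sketch:

    #include <stdio.h>

    /* Returns 1 when a write covering [pos, pos+copied) must dirty the
     * inode, mirroring the i_size/i_disksize tests in ext4_write_end(). */
    static int must_mark_dirty(long long pos, long long copied,
                               long long i_size, long long i_disksize)
    {
        long long end = pos + copied;

        return end > i_size || end > i_disksize;
    }

    int main(void)
    {
        /* delalloc hint: end is below i_size but beyond what is on disk */
        printf("%d\n", must_mark_dirty(4096, 1024, 16384, 4096));  /* 1 */
        printf("%d\n", must_mark_dirty(0, 512, 16384, 16384));     /* 0 */
        return 0;
    }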
@@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file, | |||
1135 | if (i_size_changed) | 1156 | if (i_size_changed) |
1136 | ext4_mark_inode_dirty(handle, inode); | 1157 | ext4_mark_inode_dirty(handle, inode); |
1137 | 1158 | ||
1138 | return copied; | 1159 | if (copied < 0) |
1139 | } | 1160 | ret = copied; |
1140 | |||
1141 | /* | ||
1142 | * We need to pick up the new inode size which generic_commit_write gave us | ||
1143 | * `file' can be NULL - eg, when called from page_symlink(). | ||
1144 | * | ||
1145 | * ext4 never places buffers on inode->i_mapping->private_list. metadata | ||
1146 | * buffers are managed internally. | ||
1147 | */ | ||
1148 | static int ext4_ordered_write_end(struct file *file, | ||
1149 | struct address_space *mapping, | ||
1150 | loff_t pos, unsigned len, unsigned copied, | ||
1151 | struct page *page, void *fsdata) | ||
1152 | { | ||
1153 | handle_t *handle = ext4_journal_current_handle(); | ||
1154 | struct inode *inode = mapping->host; | ||
1155 | int ret = 0, ret2; | ||
1156 | |||
1157 | trace_ext4_ordered_write_end(inode, pos, len, copied); | ||
1158 | ret = ext4_jbd2_file_inode(handle, inode); | ||
1159 | |||
1160 | if (ret == 0) { | ||
1161 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | ||
1162 | page, fsdata); | ||
1163 | copied = ret2; | ||
1164 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | ||
1165 | /* if we have allocated more blocks and copied | ||
1166 | * less. We will have blocks allocated outside | ||
1167 | * inode->i_size. So truncate them | ||
1168 | */ | ||
1169 | ext4_orphan_add(handle, inode); | ||
1170 | if (ret2 < 0) | ||
1171 | ret = ret2; | ||
1172 | } else { | ||
1173 | unlock_page(page); | ||
1174 | page_cache_release(page); | ||
1175 | } | ||
1176 | |||
1177 | ret2 = ext4_journal_stop(handle); | ||
1178 | if (!ret) | ||
1179 | ret = ret2; | ||
1180 | |||
1181 | if (pos + len > inode->i_size) { | ||
1182 | ext4_truncate_failed_write(inode); | ||
1183 | /* | ||
1184 | * If truncate failed early the inode might still be | ||
1185 | * on the orphan list; we need to make sure the inode | ||
1186 | * is removed from the orphan list in that case. | ||
1187 | */ | ||
1188 | if (inode->i_nlink) | ||
1189 | ext4_orphan_del(NULL, inode); | ||
1190 | } | ||
1191 | |||
1192 | |||
1193 | return ret ? ret : copied; | ||
1194 | } | ||
1195 | |||
1196 | static int ext4_writeback_write_end(struct file *file, | ||
1197 | struct address_space *mapping, | ||
1198 | loff_t pos, unsigned len, unsigned copied, | ||
1199 | struct page *page, void *fsdata) | ||
1200 | { | ||
1201 | handle_t *handle = ext4_journal_current_handle(); | ||
1202 | struct inode *inode = mapping->host; | ||
1203 | int ret = 0, ret2; | ||
1204 | |||
1205 | trace_ext4_writeback_write_end(inode, pos, len, copied); | ||
1206 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | ||
1207 | page, fsdata); | ||
1208 | copied = ret2; | ||
1209 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) | 1161 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1210 | /* if we have allocated more blocks and copied | 1162 | /* if we have allocated more blocks and copied |
1211 | * less. We will have blocks allocated outside | 1163 | * less. We will have blocks allocated outside |
1212 | * inode->i_size. So truncate them | 1164 | * inode->i_size. So truncate them |
1213 | */ | 1165 | */ |
1214 | ext4_orphan_add(handle, inode); | 1166 | ext4_orphan_add(handle, inode); |
1215 | 1167 | errout: | |
1216 | if (ret2 < 0) | ||
1217 | ret = ret2; | ||
1218 | |||
1219 | ret2 = ext4_journal_stop(handle); | 1168 | ret2 = ext4_journal_stop(handle); |
1220 | if (!ret) | 1169 | if (!ret) |
1221 | ret = ret2; | 1170 | ret = ret2; |
@@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1538 | struct ext4_io_submit io_submit; | 1487 | struct ext4_io_submit io_submit; |
1539 | 1488 | ||
1540 | BUG_ON(mpd->next_page <= mpd->first_page); | 1489 | BUG_ON(mpd->next_page <= mpd->first_page); |
1541 | memset(&io_submit, 0, sizeof(io_submit)); | 1490 | ext4_io_submit_init(&io_submit, mpd->wbc); |
1491 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
1492 | if (!io_submit.io_end) | ||
1493 | return -ENOMEM; | ||
1542 | /* | 1494 | /* |
1543 | * We need to start from the first_page to the next_page - 1 | 1495 | * We need to start from the first_page to the next_page - 1 |
1544 | * to make sure we also write the mapped dirty buffer_heads. | 1496 | * to make sure we also write the mapped dirty buffer_heads. |
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1626 | pagevec_release(&pvec); | 1578 | pagevec_release(&pvec); |
1627 | } | 1579 | } |
1628 | ext4_io_submit(&io_submit); | 1580 | ext4_io_submit(&io_submit); |
1581 | /* Drop io_end reference we got from init */ | ||
1582 | ext4_put_io_end_defer(io_submit.io_end); | ||
1629 | return ret; | 1583 | return ret; |
1630 | } | 1584 | } |
1631 | 1585 | ||
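The writeback path now takes an io_end reference from ext4_init_io_end() and drops it with ext4_put_io_end_defer() after submission, so the structure survives until the last bio referencing it completes. A generic sketch of that get/put-with-deferred-cleanup pattern; the struct and helpers below are illustrative only, not ext4's:

    #include <stdio.h>
    #include <stdlib.h>

    struct io_end {
        int refs;
    };

    static struct io_end *io_end_init(void)
    {
        struct io_end *io = calloc(1, sizeof(*io));

        if (io)
            io->refs = 1;          /* reference owned by the submitter */
        return io;
    }

    static struct io_end *io_end_get(struct io_end *io)
    {
        io->refs++;                /* e.g. one extra ref per in-flight bio */
        return io;
    }

    static void io_end_put(struct io_end *io)
    {
        if (--io->refs == 0) {
            printf("last ref: run completion work\n");
            free(io);
        }
    }

    int main(void)
    {
        struct io_end *io = io_end_init();

        if (!io)
            return 1;
        io_end_get(io);            /* a bio grabs a reference */
        io_end_put(io);            /* submitter drops its init reference */
        io_end_put(io);            /* bio completion frees the structure */
        return 0;
    }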
@@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
1670 | { | 1624 | { |
1671 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1625 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1672 | struct super_block *sb = inode->i_sb; | 1626 | struct super_block *sb = inode->i_sb; |
1627 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1673 | 1628 | ||
1674 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", | 1629 | ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", |
1675 | EXT4_C2B(EXT4_SB(inode->i_sb), | 1630 | EXT4_C2B(EXT4_SB(inode->i_sb), |
1676 | ext4_count_free_clusters(inode->i_sb))); | 1631 | ext4_count_free_clusters(sb))); |
1677 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); | 1632 | ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); |
1678 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", | 1633 | ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", |
1679 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), | 1634 | (long long) EXT4_C2B(EXT4_SB(sb), |
1680 | percpu_counter_sum(&sbi->s_freeclusters_counter))); | 1635 | percpu_counter_sum(&sbi->s_freeclusters_counter))); |
1681 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", | 1636 | ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", |
1682 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), | 1637 | (long long) EXT4_C2B(EXT4_SB(sb), |
1683 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); | 1638 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); |
1684 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); | 1639 | ext4_msg(sb, KERN_CRIT, "Block reservation details"); |
1685 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", | 1640 | ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", |
1686 | EXT4_I(inode)->i_reserved_data_blocks); | 1641 | ei->i_reserved_data_blocks); |
1687 | ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", | 1642 | ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", |
1688 | EXT4_I(inode)->i_reserved_meta_blocks); | 1643 | ei->i_reserved_meta_blocks); |
1644 | ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", | ||
1645 | ei->i_allocated_meta_blocks); | ||
1689 | return; | 1646 | return; |
1690 | } | 1647 | } |
1691 | 1648 | ||
@@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | |||
1740 | */ | 1697 | */ |
1741 | map.m_lblk = next; | 1698 | map.m_lblk = next; |
1742 | map.m_len = max_blocks; | 1699 | map.m_len = max_blocks; |
1743 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE; | 1700 | /* |
1701 | * We're in delalloc path and it is possible that we're going to | ||
1702 | * need more metadata blocks than previously reserved. However, | ||
1703 | * we must not fail because we're in writeback and there is | ||
1704 | * nothing we can do about it, so failing might result in data loss. | ||
1705 | * So use reserved blocks to allocate metadata if possible. | ||
1706 | */ | ||
1707 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE | | ||
1708 | EXT4_GET_BLOCKS_METADATA_NOFAIL; | ||
1744 | if (ext4_should_dioread_nolock(mpd->inode)) | 1709 | if (ext4_should_dioread_nolock(mpd->inode)) |
1745 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | 1710 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; |
1746 | if (mpd->b_state & (1 << BH_Delay)) | 1711 | if (mpd->b_state & (1 << BH_Delay)) |
1747 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; | 1712 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; |
1748 | 1713 | ||
1714 | |||
1749 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); | 1715 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); |
1750 | if (blks < 0) { | 1716 | if (blks < 0) { |
1751 | struct super_block *sb = mpd->inode->i_sb; | 1717 | struct super_block *sb = mpd->inode->i_sb; |
@@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page, | |||
2272 | */ | 2238 | */ |
2273 | return __ext4_journalled_writepage(page, len); | 2239 | return __ext4_journalled_writepage(page, len); |
2274 | 2240 | ||
2275 | memset(&io_submit, 0, sizeof(io_submit)); | 2241 | ext4_io_submit_init(&io_submit, wbc); |
2242 | io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
2243 | if (!io_submit.io_end) { | ||
2244 | redirty_page_for_writepage(wbc, page); | ||
2245 | return -ENOMEM; | ||
2246 | } | ||
2276 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); | 2247 | ret = ext4_bio_write_page(&io_submit, page, len, wbc); |
2277 | ext4_io_submit(&io_submit); | 2248 | ext4_io_submit(&io_submit); |
2249 | /* Drop io_end reference we got from init */ | ||
2250 | ext4_put_io_end_defer(io_submit.io_end); | ||
2278 | return ret; | 2251 | return ret; |
2279 | } | 2252 | } |
2280 | 2253 | ||
@@ -2661,7 +2634,7 @@ out_writepages: | |||
2661 | 2634 | ||
2662 | static int ext4_nonda_switch(struct super_block *sb) | 2635 | static int ext4_nonda_switch(struct super_block *sb) |
2663 | { | 2636 | { |
2664 | s64 free_blocks, dirty_blocks; | 2637 | s64 free_clusters, dirty_clusters; |
2665 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2638 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2666 | 2639 | ||
2667 | /* | 2640 | /* |
@@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2672 | * Delalloc need an accurate free block accounting. So switch | 2645 | * Delalloc need an accurate free block accounting. So switch |
2673 | * to non delalloc when we are near to error range. | 2646 | * to non delalloc when we are near to error range. |
2674 | */ | 2647 | */ |
2675 | free_blocks = EXT4_C2B(sbi, | 2648 | free_clusters = |
2676 | percpu_counter_read_positive(&sbi->s_freeclusters_counter)); | 2649 | percpu_counter_read_positive(&sbi->s_freeclusters_counter); |
2677 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | 2650 | dirty_clusters = |
2651 | percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | ||
2678 | /* | 2652 | /* |
2679 | * Start pushing delalloc when 1/2 of free blocks are dirty. | 2653 | * Start pushing delalloc when 1/2 of free blocks are dirty. |
2680 | */ | 2654 | */ |
2681 | if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) | 2655 | if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) |
2682 | try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); | 2656 | try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); |
2683 | 2657 | ||
2684 | if (2 * free_blocks < 3 * dirty_blocks || | 2658 | if (2 * free_clusters < 3 * dirty_clusters || |
2685 | free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { | 2659 | free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { |
2686 | /* | 2660 | /* |
2687 | * free block count is less than 150% of dirty blocks | 2661 | * free block count is less than 150% of dirty blocks |
2688 | * or free blocks is less than watermark | 2662 | * or free blocks is less than watermark |
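Two thresholds are in play above: writeback is nudged once dirty clusters exceed half of the free count, and delalloc is abandoned once free falls below 150% of dirty or below dirty plus a watermark. A worked check with sample numbers; the watermark value here is made up:

    #include <stdio.h>

    #define WATERMARK 1024  /* stand-in for EXT4_FREECLUSTERS_WATERMARK */

    static void nonda_check(long long free, long long dirty)
    {
        if (dirty && free < 2 * dirty)
            printf("free=%lld dirty=%lld: push writeback\n", free, dirty);
        if (2 * free < 3 * dirty || free < dirty + WATERMARK)
            printf("free=%lld dirty=%lld: fall back to non-delalloc\n",
                   free, dirty);
    }

    int main(void)
    {
        nonda_check(10000, 2000);   /* healthy: nothing triggers */
        nonda_check(10000, 6000);   /* dirty > free/2: writeback only */
        nonda_check(8000, 6000);    /* free < 150% of dirty: both fire */
        return 0;
    }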
@@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file, | |||
2818 | unsigned long start, end; | 2792 | unsigned long start, end; |
2819 | int write_mode = (int)(unsigned long)fsdata; | 2793 | int write_mode = (int)(unsigned long)fsdata; |
2820 | 2794 | ||
2821 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 2795 | if (write_mode == FALL_BACK_TO_NONDELALLOC) |
2822 | switch (ext4_inode_journal_mode(inode)) { | 2796 | return ext4_write_end(file, mapping, pos, |
2823 | case EXT4_INODE_ORDERED_DATA_MODE: | 2797 | len, copied, page, fsdata); |
2824 | return ext4_ordered_write_end(file, mapping, pos, | ||
2825 | len, copied, page, fsdata); | ||
2826 | case EXT4_INODE_WRITEBACK_DATA_MODE: | ||
2827 | return ext4_writeback_write_end(file, mapping, pos, | ||
2828 | len, copied, page, fsdata); | ||
2829 | default: | ||
2830 | BUG(); | ||
2831 | } | ||
2832 | } | ||
2833 | 2798 | ||
2834 | trace_ext4_da_write_end(inode, pos, len, copied); | 2799 | trace_ext4_da_write_end(inode, pos, len, copied); |
2835 | start = pos & (PAGE_CACHE_SIZE - 1); | 2800 | start = pos & (PAGE_CACHE_SIZE - 1); |
@@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3113 | struct inode *inode = file_inode(iocb->ki_filp); | 3078 | struct inode *inode = file_inode(iocb->ki_filp); |
3114 | ext4_io_end_t *io_end = iocb->private; | 3079 | ext4_io_end_t *io_end = iocb->private; |
3115 | 3080 | ||
3116 | /* if not async direct IO or dio with 0 bytes write, just return */ | 3081 | /* if not async direct IO just return */ |
3117 | if (!io_end || !size) | 3082 | if (!io_end) { |
3118 | goto out; | 3083 | inode_dio_done(inode); |
3084 | if (is_async) | ||
3085 | aio_complete(iocb, ret, 0); | ||
3086 | return; | ||
3087 | } | ||
3119 | 3088 | ||
3120 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3089 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
3121 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 3090 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
@@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3123 | size); | 3092 | size); |
3124 | 3093 | ||
3125 | iocb->private = NULL; | 3094 | iocb->private = NULL; |
3126 | |||
3127 | /* if not aio dio with unwritten extents, just free io and return */ | ||
3128 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
3129 | ext4_free_io_end(io_end); | ||
3130 | out: | ||
3131 | inode_dio_done(inode); | ||
3132 | if (is_async) | ||
3133 | aio_complete(iocb, ret, 0); | ||
3134 | return; | ||
3135 | } | ||
3136 | |||
3137 | io_end->offset = offset; | 3095 | io_end->offset = offset; |
3138 | io_end->size = size; | 3096 | io_end->size = size; |
3139 | if (is_async) { | 3097 | if (is_async) { |
3140 | io_end->iocb = iocb; | 3098 | io_end->iocb = iocb; |
3141 | io_end->result = ret; | 3099 | io_end->result = ret; |
3142 | } | 3100 | } |
3143 | 3101 | ext4_put_io_end_defer(io_end); | |
3144 | ext4_add_complete_io(io_end); | ||
3145 | } | 3102 | } |
3146 | 3103 | ||
3147 | /* | 3104 | /* |
@@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3175 | get_block_t *get_block_func = NULL; | 3132 | get_block_t *get_block_func = NULL; |
3176 | int dio_flags = 0; | 3133 | int dio_flags = 0; |
3177 | loff_t final_size = offset + count; | 3134 | loff_t final_size = offset + count; |
3135 | ext4_io_end_t *io_end = NULL; | ||
3178 | 3136 | ||
3179 | /* Use the old path for reads and writes beyond i_size. */ | 3137 | /* Use the old path for reads and writes beyond i_size. */ |
3180 | if (rw != WRITE || final_size > inode->i_size) | 3138 | if (rw != WRITE || final_size > inode->i_size) |
@@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3213 | iocb->private = NULL; | 3171 | iocb->private = NULL; |
3214 | ext4_inode_aio_set(inode, NULL); | 3172 | ext4_inode_aio_set(inode, NULL); |
3215 | if (!is_sync_kiocb(iocb)) { | 3173 | if (!is_sync_kiocb(iocb)) { |
3216 | ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); | 3174 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
3217 | if (!io_end) { | 3175 | if (!io_end) { |
3218 | ret = -ENOMEM; | 3176 | ret = -ENOMEM; |
3219 | goto retake_lock; | 3177 | goto retake_lock; |
3220 | } | 3178 | } |
3221 | io_end->flag |= EXT4_IO_END_DIRECT; | 3179 | io_end->flag |= EXT4_IO_END_DIRECT; |
3222 | iocb->private = io_end; | 3180 | /* |
3181 | * Grab reference for DIO. Will be dropped in ext4_end_io_dio() | ||
3182 | */ | ||
3183 | iocb->private = ext4_get_io_end(io_end); | ||
3223 | /* | 3184 | /* |
3224 | * we save the io structure for current async direct | 3185 | * we save the io structure for current async direct |
3225 | * IO, so that later ext4_map_blocks() could flag the | 3186 | * IO, so that later ext4_map_blocks() could flag the |
@@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3243 | NULL, | 3204 | NULL, |
3244 | dio_flags); | 3205 | dio_flags); |
3245 | 3206 | ||
3246 | if (iocb->private) | ||
3247 | ext4_inode_aio_set(inode, NULL); | ||
3248 | /* | 3207 | /* |
3249 | * The io_end structure takes a reference to the inode, that | 3208 | * Put our reference to io_end. This can free the io_end structure e.g. |
3250 | * structure needs to be destroyed and the reference to the | 3209 | * in sync IO case or in case of error. It can even perform extent |
3251 | * inode need to be dropped, when IO is complete, even with 0 | 3210 | * conversion if all bios we submitted finished before we got here. |
3252 | * byte write, or failed. | 3211 | * Note that in that case iocb->private can be already set to NULL |
3253 | * | 3212 | * here. |
3254 | * In the successful AIO DIO case, the io_end structure will | ||
3255 | * be destroyed and the reference to the inode will be dropped | ||
3256 | * after the end_io call back function is called. | ||
3257 | * | ||
3258 | * In the case there is 0 byte write, or error case, since VFS | ||
3259 | * direct IO won't invoke the end_io call back function, we | ||
3260 | * need to free the end_io structure here. | ||
3261 | */ | 3213 | */ |
3262 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | 3214 | if (io_end) { |
3263 | ext4_free_io_end(iocb->private); | 3215 | ext4_inode_aio_set(inode, NULL); |
3264 | iocb->private = NULL; | 3216 | ext4_put_io_end(io_end); |
3265 | } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3217 | /* |
3218 | * In case of error or no write ext4_end_io_dio() was not | ||
3219 | * called so we have to put iocb's reference. | ||
3220 | */ | ||
3221 | if (ret <= 0 && ret != -EIOCBQUEUED) { | ||
3222 | WARN_ON(iocb->private != io_end); | ||
3223 | ext4_put_io_end(io_end); | ||
3224 | iocb->private = NULL; | ||
3225 | } | ||
3226 | } | ||
3227 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | ||
3266 | EXT4_STATE_DIO_UNWRITTEN)) { | 3228 | EXT4_STATE_DIO_UNWRITTEN)) { |
3267 | int err; | 3229 | int err; |
3268 | /* | 3230 | /* |
@@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
3334 | return __set_page_dirty_nobuffers(page); | 3296 | return __set_page_dirty_nobuffers(page); |
3335 | } | 3297 | } |
3336 | 3298 | ||
3337 | static const struct address_space_operations ext4_ordered_aops = { | 3299 | static const struct address_space_operations ext4_aops = { |
3338 | .readpage = ext4_readpage, | 3300 | .readpage = ext4_readpage, |
3339 | .readpages = ext4_readpages, | 3301 | .readpages = ext4_readpages, |
3340 | .writepage = ext4_writepage, | 3302 | .writepage = ext4_writepage, |
3341 | .write_begin = ext4_write_begin, | 3303 | .write_begin = ext4_write_begin, |
3342 | .write_end = ext4_ordered_write_end, | 3304 | .write_end = ext4_write_end, |
3343 | .bmap = ext4_bmap, | ||
3344 | .invalidatepage = ext4_invalidatepage, | ||
3345 | .releasepage = ext4_releasepage, | ||
3346 | .direct_IO = ext4_direct_IO, | ||
3347 | .migratepage = buffer_migrate_page, | ||
3348 | .is_partially_uptodate = block_is_partially_uptodate, | ||
3349 | .error_remove_page = generic_error_remove_page, | ||
3350 | }; | ||
3351 | |||
3352 | static const struct address_space_operations ext4_writeback_aops = { | ||
3353 | .readpage = ext4_readpage, | ||
3354 | .readpages = ext4_readpages, | ||
3355 | .writepage = ext4_writepage, | ||
3356 | .write_begin = ext4_write_begin, | ||
3357 | .write_end = ext4_writeback_write_end, | ||
3358 | .bmap = ext4_bmap, | 3305 | .bmap = ext4_bmap, |
3359 | .invalidatepage = ext4_invalidatepage, | 3306 | .invalidatepage = ext4_invalidatepage, |
3360 | .releasepage = ext4_releasepage, | 3307 | .releasepage = ext4_releasepage, |
@@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode) | |||
3399 | { | 3346 | { |
3400 | switch (ext4_inode_journal_mode(inode)) { | 3347 | switch (ext4_inode_journal_mode(inode)) { |
3401 | case EXT4_INODE_ORDERED_DATA_MODE: | 3348 | case EXT4_INODE_ORDERED_DATA_MODE: |
3402 | if (test_opt(inode->i_sb, DELALLOC)) | 3349 | ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); |
3403 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
3404 | else | ||
3405 | inode->i_mapping->a_ops = &ext4_ordered_aops; | ||
3406 | break; | 3350 | break; |
3407 | case EXT4_INODE_WRITEBACK_DATA_MODE: | 3351 | case EXT4_INODE_WRITEBACK_DATA_MODE: |
3408 | if (test_opt(inode->i_sb, DELALLOC)) | 3352 | ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); |
3409 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
3410 | else | ||
3411 | inode->i_mapping->a_ops = &ext4_writeback_aops; | ||
3412 | break; | 3353 | break; |
3413 | case EXT4_INODE_JOURNAL_DATA_MODE: | 3354 | case EXT4_INODE_JOURNAL_DATA_MODE: |
3414 | inode->i_mapping->a_ops = &ext4_journalled_aops; | 3355 | inode->i_mapping->a_ops = &ext4_journalled_aops; |
3415 | break; | 3356 | return; |
3416 | default: | 3357 | default: |
3417 | BUG(); | 3358 | BUG(); |
3418 | } | 3359 | } |
3360 | if (test_opt(inode->i_sb, DELALLOC)) | ||
3361 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
3362 | else | ||
3363 | inode->i_mapping->a_ops = &ext4_aops; | ||
3419 | } | 3364 | } |
3420 | 3365 | ||
3421 | 3366 | ||
@@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode) | |||
3646 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | 3591 | int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) |
3647 | { | 3592 | { |
3648 | struct inode *inode = file_inode(file); | 3593 | struct inode *inode = file_inode(file); |
3594 | struct super_block *sb = inode->i_sb; | ||
3595 | ext4_lblk_t first_block, stop_block; | ||
3596 | struct address_space *mapping = inode->i_mapping; | ||
3597 | loff_t first_page, last_page, page_len; | ||
3598 | loff_t first_page_offset, last_page_offset; | ||
3599 | handle_t *handle; | ||
3600 | unsigned int credits; | ||
3601 | int ret = 0; | ||
3602 | |||
3649 | if (!S_ISREG(inode->i_mode)) | 3603 | if (!S_ISREG(inode->i_mode)) |
3650 | return -EOPNOTSUPP; | 3604 | return -EOPNOTSUPP; |
3651 | 3605 | ||
3652 | if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3606 | if (EXT4_SB(sb)->s_cluster_ratio > 1) { |
3653 | return ext4_ind_punch_hole(file, offset, length); | ||
3654 | |||
3655 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { | ||
3656 | /* TODO: Add support for bigalloc file systems */ | 3607 | /* TODO: Add support for bigalloc file systems */ |
3657 | return -EOPNOTSUPP; | 3608 | return -EOPNOTSUPP; |
3658 | } | 3609 | } |
3659 | 3610 | ||
3660 | trace_ext4_punch_hole(inode, offset, length); | 3611 | trace_ext4_punch_hole(inode, offset, length); |
3661 | 3612 | ||
3662 | return ext4_ext_punch_hole(file, offset, length); | 3613 | /* |
3614 | * Write out all dirty pages to avoid race conditions | ||
3615 | * Then release them. | ||
3616 | */ | ||
3617 | if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||
3618 | ret = filemap_write_and_wait_range(mapping, offset, | ||
3619 | offset + length - 1); | ||
3620 | if (ret) | ||
3621 | return ret; | ||
3622 | } | ||
3623 | |||
3624 | mutex_lock(&inode->i_mutex); | ||
3626 | /* It's not possible to punch a hole in an append-only file */ | ||
3626 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { | ||
3627 | ret = -EPERM; | ||
3628 | goto out_mutex; | ||
3629 | } | ||
3630 | if (IS_SWAPFILE(inode)) { | ||
3631 | ret = -ETXTBSY; | ||
3632 | goto out_mutex; | ||
3633 | } | ||
3634 | |||
3635 | /* No need to punch hole beyond i_size */ | ||
3636 | if (offset >= inode->i_size) | ||
3637 | goto out_mutex; | ||
3638 | |||
3639 | /* | ||
3640 | * If the hole extends beyond i_size, set the hole | ||
3641 | * to end after the page that contains i_size | ||
3642 | */ | ||
3643 | if (offset + length > inode->i_size) { | ||
3644 | length = inode->i_size + | ||
3645 | PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - | ||
3646 | offset; | ||
3647 | } | ||
3648 | |||
3649 | first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
3650 | last_page = (offset + length) >> PAGE_CACHE_SHIFT; | ||
3651 | |||
3652 | first_page_offset = first_page << PAGE_CACHE_SHIFT; | ||
3653 | last_page_offset = last_page << PAGE_CACHE_SHIFT; | ||
3654 | |||
3655 | /* Now release the pages */ | ||
3656 | if (last_page_offset > first_page_offset) { | ||
3657 | truncate_pagecache_range(inode, first_page_offset, | ||
3658 | last_page_offset - 1); | ||
3659 | } | ||
3660 | |||
3661 | /* Wait all existing dio workers, newcomers will block on i_mutex */ | ||
3662 | ext4_inode_block_unlocked_dio(inode); | ||
3663 | ret = ext4_flush_unwritten_io(inode); | ||
3664 | if (ret) | ||
3665 | goto out_dio; | ||
3666 | inode_dio_wait(inode); | ||
3667 | |||
3668 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3669 | credits = ext4_writepage_trans_blocks(inode); | ||
3670 | else | ||
3671 | credits = ext4_blocks_for_truncate(inode); | ||
3672 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||
3673 | if (IS_ERR(handle)) { | ||
3674 | ret = PTR_ERR(handle); | ||
3675 | ext4_std_error(sb, ret); | ||
3676 | goto out_dio; | ||
3677 | } | ||
3678 | |||
3679 | /* | ||
3680 | * Now we need to zero out the non-page-aligned data in the | ||
3681 | * pages at the start and tail of the hole, and unmap the | ||
3682 | * buffer heads for the block aligned regions of the page that | ||
3683 | * were completely zeroed. | ||
3684 | */ | ||
3685 | if (first_page > last_page) { | ||
3686 | /* | ||
3687 | * If the file space being truncated is contained | ||
3688 | * within a page, just zero out and unmap the middle of | ||
3689 | * that page | ||
3690 | */ | ||
3691 | ret = ext4_discard_partial_page_buffers(handle, | ||
3692 | mapping, offset, length, 0); | ||
3693 | |||
3694 | if (ret) | ||
3695 | goto out_stop; | ||
3696 | } else { | ||
3697 | /* | ||
3698 | * zero out and unmap the partial page that contains | ||
3699 | * the start of the hole | ||
3700 | */ | ||
3701 | page_len = first_page_offset - offset; | ||
3702 | if (page_len > 0) { | ||
3703 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3704 | offset, page_len, 0); | ||
3705 | if (ret) | ||
3706 | goto out_stop; | ||
3707 | } | ||
3708 | |||
3709 | /* | ||
3710 | * zero out and unmap the partial page that contains | ||
3711 | * the end of the hole | ||
3712 | */ | ||
3713 | page_len = offset + length - last_page_offset; | ||
3714 | if (page_len > 0) { | ||
3715 | ret = ext4_discard_partial_page_buffers(handle, mapping, | ||
3716 | last_page_offset, page_len, 0); | ||
3717 | if (ret) | ||
3718 | goto out_stop; | ||
3719 | } | ||
3720 | } | ||
3721 | |||
3722 | /* | ||
3723 | * If i_size is contained in the last page, we need to | ||
3724 | * unmap and zero the partial page after i_size | ||
3725 | */ | ||
3726 | if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && | ||
3727 | inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
3728 | page_len = PAGE_CACHE_SIZE - | ||
3729 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3730 | |||
3731 | if (page_len > 0) { | ||
3732 | ret = ext4_discard_partial_page_buffers(handle, | ||
3733 | mapping, inode->i_size, page_len, 0); | ||
3734 | |||
3735 | if (ret) | ||
3736 | goto out_stop; | ||
3737 | } | ||
3738 | } | ||
3739 | |||
3740 | first_block = (offset + sb->s_blocksize - 1) >> | ||
3741 | EXT4_BLOCK_SIZE_BITS(sb); | ||
3742 | stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); | ||
3743 | |||
3744 | /* If there are no blocks to remove, return now */ | ||
3745 | if (first_block >= stop_block) | ||
3746 | goto out_stop; | ||
3747 | |||
3748 | down_write(&EXT4_I(inode)->i_data_sem); | ||
3749 | ext4_discard_preallocations(inode); | ||
3750 | |||
3751 | ret = ext4_es_remove_extent(inode, first_block, | ||
3752 | stop_block - first_block); | ||
3753 | if (ret) { | ||
3754 | up_write(&EXT4_I(inode)->i_data_sem); | ||
3755 | goto out_stop; | ||
3756 | } | ||
3757 | |||
3758 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3759 | ret = ext4_ext_remove_space(inode, first_block, | ||
3760 | stop_block - 1); | ||
3761 | else | ||
3762 | ret = ext4_free_hole_blocks(handle, inode, first_block, | ||
3763 | stop_block); | ||
3764 | |||
3765 | ext4_discard_preallocations(inode); | ||
3766 | up_write(&EXT4_I(inode)->i_data_sem); | ||
3767 | if (IS_SYNC(inode)) | ||
3768 | ext4_handle_sync(handle); | ||
3769 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
3770 | ext4_mark_inode_dirty(handle, inode); | ||
3771 | out_stop: | ||
3772 | ext4_journal_stop(handle); | ||
3773 | out_dio: | ||
3774 | ext4_inode_resume_unlocked_dio(inode); | ||
3775 | out_mutex: | ||
3776 | mutex_unlock(&inode->i_mutex); | ||
3777 | return ret; | ||
3663 | } | 3778 | } |
3664 | 3779 | ||
3665 | /* | 3780 | /* |
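In the new ext4_punch_hole() above, first_page rounds the hole start up to a page boundary and last_page rounds the end down, so only the whole pages in between are dropped from the page cache while the ragged edges are zeroed via ext4_discard_partial_page_buffers(). A worked example of that arithmetic, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1L << PAGE_SHIFT)

    int main(void)
    {
        long long offset = 3000, length = 10000;   /* hole [3000, 13000) */

        long long first_page = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT; /* 1 */
        long long last_page  = (offset + length) >> PAGE_SHIFT;        /* 3 */

        long long first_page_offset = first_page << PAGE_SHIFT;  /* 4096  */
        long long last_page_offset  = last_page << PAGE_SHIFT;   /* 12288 */

        /* Whole pages [4096, 12288) are truncated from the page cache;
         * [3000, 4096) and [12288, 13000) are zeroed in place. */
        printf("truncate [%lld, %lld), zero head %lld bytes, tail %lld bytes\n",
               first_page_offset, last_page_offset,
               first_page_offset - offset,
               offset + length - last_page_offset);
        return 0;
    }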
@@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3692 | */ | 3807 | */ |
3693 | void ext4_truncate(struct inode *inode) | 3808 | void ext4_truncate(struct inode *inode) |
3694 | { | 3809 | { |
3810 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3811 | unsigned int credits; | ||
3812 | handle_t *handle; | ||
3813 | struct address_space *mapping = inode->i_mapping; | ||
3814 | loff_t page_len; | ||
3815 | |||
3816 | /* | ||
3817 | * There is a possibility that we're either freeing the inode | ||
3818 | * or it is a completely new inode. In those cases we might not | ||
3819 | * have i_mutex locked because it's not necessary. | ||
3820 | */ | ||
3821 | if (!(inode->i_state & (I_NEW|I_FREEING))) | ||
3822 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); | ||
3695 | trace_ext4_truncate_enter(inode); | 3823 | trace_ext4_truncate_enter(inode); |
3696 | 3824 | ||
3697 | if (!ext4_can_truncate(inode)) | 3825 | if (!ext4_can_truncate(inode)) |
@@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode) | |||
3710 | return; | 3838 | return; |
3711 | } | 3839 | } |
3712 | 3840 | ||
3841 | /* | ||
3842 | * finish any pending end_io work so we won't run the risk of | ||
3843 | * converting any truncated blocks to initialized later | ||
3844 | */ | ||
3845 | ext4_flush_unwritten_io(inode); | ||
3846 | |||
3847 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3848 | credits = ext4_writepage_trans_blocks(inode); | ||
3849 | else | ||
3850 | credits = ext4_blocks_for_truncate(inode); | ||
3851 | |||
3852 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||
3853 | if (IS_ERR(handle)) { | ||
3854 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | ||
3855 | return; | ||
3856 | } | ||
3857 | |||
3858 | if (inode->i_size % PAGE_CACHE_SIZE != 0) { | ||
3859 | page_len = PAGE_CACHE_SIZE - | ||
3860 | (inode->i_size & (PAGE_CACHE_SIZE - 1)); | ||
3861 | |||
3862 | if (ext4_discard_partial_page_buffers(handle, | ||
3863 | mapping, inode->i_size, page_len, 0)) | ||
3864 | goto out_stop; | ||
3865 | } | ||
3866 | |||
3867 | /* | ||
3868 | * We add the inode to the orphan list, so that if this | ||
3869 | * truncate spans multiple transactions, and we crash, we will | ||
3870 | * resume the truncate when the filesystem recovers. It also | ||
3871 | * marks the inode dirty, to catch the new size. | ||
3872 | * | ||
3873 | * Implication: the file must always be in a sane, consistent | ||
3874 | * truncatable state while each transaction commits. | ||
3875 | */ | ||
3876 | if (ext4_orphan_add(handle, inode)) | ||
3877 | goto out_stop; | ||
3878 | |||
3879 | down_write(&EXT4_I(inode)->i_data_sem); | ||
3880 | |||
3881 | ext4_discard_preallocations(inode); | ||
3882 | |||
3713 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3883 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3714 | ext4_ext_truncate(inode); | 3884 | ext4_ext_truncate(handle, inode); |
3715 | else | 3885 | else |
3716 | ext4_ind_truncate(inode); | 3886 | ext4_ind_truncate(handle, inode); |
3887 | |||
3888 | up_write(&ei->i_data_sem); | ||
3889 | |||
3890 | if (IS_SYNC(inode)) | ||
3891 | ext4_handle_sync(handle); | ||
3892 | |||
3893 | out_stop: | ||
3894 | /* | ||
3895 | * If this was a simple ftruncate() and the file will remain alive, | ||
3896 | * then we need to clear up the orphan record which we created above. | ||
3897 | * However, if this was a real unlink then we were called by | ||
3898 | * ext4_delete_inode(), and we allow that function to clean up the | ||
3899 | * orphan info for us. | ||
3900 | */ | ||
3901 | if (inode->i_nlink) | ||
3902 | ext4_orphan_del(handle, inode); | ||
3903 | |||
3904 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
3905 | ext4_mark_inode_dirty(handle, inode); | ||
3906 | ext4_journal_stop(handle); | ||
3717 | 3907 | ||
3718 | trace_ext4_truncate_exit(inode); | 3908 | trace_ext4_truncate_exit(inode); |
3719 | } | 3909 | } |
@@ -3821,13 +4011,14 @@ make_io: | |||
3821 | if (EXT4_SB(sb)->s_inode_readahead_blks) { | 4011 | if (EXT4_SB(sb)->s_inode_readahead_blks) { |
3822 | ext4_fsblk_t b, end, table; | 4012 | ext4_fsblk_t b, end, table; |
3823 | unsigned num; | 4013 | unsigned num; |
4014 | __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; | ||
3824 | 4015 | ||
3825 | table = ext4_inode_table(sb, gdp); | 4016 | table = ext4_inode_table(sb, gdp); |
3826 | /* s_inode_readahead_blks is always a power of 2 */ | 4017 | /* s_inode_readahead_blks is always a power of 2 */ |
3827 | b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); | 4018 | b = block & ~((ext4_fsblk_t) ra_blks - 1); |
3828 | if (table > b) | 4019 | if (table > b) |
3829 | b = table; | 4020 | b = table; |
3830 | end = b + EXT4_SB(sb)->s_inode_readahead_blks; | 4021 | end = b + ra_blks; |
3831 | num = EXT4_INODES_PER_GROUP(sb); | 4022 | num = EXT4_INODES_PER_GROUP(sb); |
3832 | if (ext4_has_group_desc_csum(sb)) | 4023 | if (ext4_has_group_desc_csum(sb)) |
3833 | num -= ext4_itable_unused_count(sb, gdp); | 4024 | num -= ext4_itable_unused_count(sb, gdp); |
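This is the "type-widening bug in inode table readahead" fix from the pull: s_inode_readahead_blks is 32-bit, so ~(ra_blks - 1) used to produce a 32-bit mask that silently cleared the upper half of a 64-bit block number on filesystems beyond 2^32 blocks. A minimal demonstration of the widening:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t ra_blks = 32;                    /* always a power of two */
        uint64_t block = 0x1234567890ULL;         /* block number above 4G */

        uint64_t buggy = block & ~(ra_blks - 1);  /* mask is only 32 bits */
        uint64_t fixed = block & ~((uint64_t)ra_blks - 1);

        printf("buggy: 0x%llx\nfixed: 0x%llx\n",
               (unsigned long long)buggy, (unsigned long long)fixed);
        return 0;
    }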
@@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4024 | * NeilBrown 1999oct15 | 4215 | * NeilBrown 1999oct15 |
4025 | */ | 4216 | */ |
4026 | if (inode->i_nlink == 0) { | 4217 | if (inode->i_nlink == 0) { |
4027 | if (inode->i_mode == 0 || | 4218 | if ((inode->i_mode == 0 || |
4028 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { | 4219 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && |
4220 | ino != EXT4_BOOT_LOADER_INO) { | ||
4029 | /* this inode is deleted */ | 4221 | /* this inode is deleted */ |
4030 | ret = -ESTALE; | 4222 | ret = -ESTALE; |
4031 | goto bad_inode; | 4223 | goto bad_inode; |
@@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4033 | /* The only unlinked inodes we let through here have | 4225 | /* The only unlinked inodes we let through here have |
4034 | * valid i_mode and are being read by the orphan | 4226 | * valid i_mode and are being read by the orphan |
4035 | * recovery code: that's fine, we're about to complete | 4227 | * recovery code: that's fine, we're about to complete |
4036 | * the process of deleting those. */ | 4228 | * the process of deleting those. |
4229 | * OR it is the EXT4_BOOT_LOADER_INO which is | ||
4230 | * not initialized on a new filesystem. */ | ||
4037 | } | 4231 | } |
4038 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); | 4232 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); |
4039 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); | 4233 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); |
@@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4153 | else | 4347 | else |
4154 | init_special_inode(inode, inode->i_mode, | 4348 | init_special_inode(inode, inode->i_mode, |
4155 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | 4349 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); |
4350 | } else if (ino == EXT4_BOOT_LOADER_INO) { | ||
4351 | make_bad_inode(inode); | ||
4156 | } else { | 4352 | } else { |
4157 | ret = -EIO; | 4353 | ret = -EIO; |
4158 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); | 4354 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 721f4d33e148..9491ac0590f7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -17,9 +17,201 @@ | |||
17 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
18 | #include "ext4_jbd2.h" | 18 | #include "ext4_jbd2.h" |
19 | #include "ext4.h" | 19 | #include "ext4.h" |
20 | #include "ext4_extents.h" | ||
20 | 21 | ||
21 | #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) | 22 | #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) |
22 | 23 | ||
24 | /** | ||
25 | * Swap memory between @a and @b for @len bytes. | ||
26 | * | ||
27 | * @a: pointer to first memory area | ||
28 | * @b: pointer to second memory area | ||
29 | * @len: number of bytes to swap | ||
30 | * | ||
31 | */ | ||
32 | static void memswap(void *a, void *b, size_t len) | ||
33 | { | ||
34 | unsigned char *ap, *bp; | ||
35 | unsigned char tmp; | ||
36 | |||
37 | ap = (unsigned char *)a; | ||
38 | bp = (unsigned char *)b; | ||
39 | while (len-- > 0) { | ||
40 | tmp = *ap; | ||
41 | *ap = *bp; | ||
42 | *bp = tmp; | ||
43 | ap++; | ||
44 | bp++; | ||
45 | } | ||
46 | } | ||
47 | |||
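memswap() above exchanges two buffers byte by byte, which is what lets swap_inode_data() trade whole fields (the i_data array, sizes, timestamps) in place and, applied a second time, restore them. A userspace copy of the helper exercised on two structs:

    #include <stdio.h>
    #include <string.h>

    static void memswap(void *a, void *b, size_t len)
    {
        unsigned char *ap = a, *bp = b;

        while (len-- > 0) {
            unsigned char tmp = *ap;

            *ap++ = *bp;
            *bp++ = tmp;
        }
    }

    struct blob { int id; char tag[8]; };

    int main(void)
    {
        struct blob x = { 1, "boot" }, y = { 2, "file" };

        memswap(&x, &y, sizeof(x));   /* swapping twice restores both */
        printf("x = {%d, %s}, y = {%d, %s}\n", x.id, x.tag, y.id, y.tag);
        return 0;
    }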
48 | /** | ||
49 | * Swap i_data and associated attributes between @inode1 and @inode2. | ||
50 | * This function is used for the primary swap between inode1 and inode2 | ||
51 | * and also to revert this primary swap in case of errors. | ||
52 | * | ||
53 | * Therefore you have to make sure that calling this function twice | ||
54 | * will revert all changes. | ||
55 | * | ||
56 | * @inode1: pointer to first inode | ||
57 | * @inode2: pointer to second inode | ||
58 | */ | ||
59 | static void swap_inode_data(struct inode *inode1, struct inode *inode2) | ||
60 | { | ||
61 | loff_t isize; | ||
62 | struct ext4_inode_info *ei1; | ||
63 | struct ext4_inode_info *ei2; | ||
64 | |||
65 | ei1 = EXT4_I(inode1); | ||
66 | ei2 = EXT4_I(inode2); | ||
67 | |||
68 | memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); | ||
69 | memswap(&inode1->i_version, &inode2->i_version, | ||
70 | sizeof(inode1->i_version)); | ||
71 | memswap(&inode1->i_blocks, &inode2->i_blocks, | ||
72 | sizeof(inode1->i_blocks)); | ||
73 | memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); | ||
74 | memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); | ||
75 | memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); | ||
76 | |||
77 | memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); | ||
78 | memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); | ||
79 | memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); | ||
80 | memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); | ||
81 | memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); | ||
82 | |||
83 | isize = i_size_read(inode1); | ||
84 | i_size_write(inode1, i_size_read(inode2)); | ||
85 | i_size_write(inode2, isize); | ||
86 | } | ||
87 | |||
88 | /** | ||
89 | * Swap the information between the given @inode and the inode | ||
90 | * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other | ||
91 | * important fields of the inodes. | ||
92 | * | ||
93 | * @sb: the super block of the filesystem | ||
94 | * @inode: the inode to swap with EXT4_BOOT_LOADER_INO | ||
95 | * | ||
96 | */ | ||
97 | static long swap_inode_boot_loader(struct super_block *sb, | ||
98 | struct inode *inode) | ||
99 | { | ||
100 | handle_t *handle; | ||
101 | int err; | ||
102 | struct inode *inode_bl; | ||
103 | struct ext4_inode_info *ei; | ||
104 | struct ext4_inode_info *ei_bl; | ||
105 | struct ext4_sb_info *sbi; | ||
106 | |||
107 | if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { | ||
108 | err = -EINVAL; | ||
109 | goto swap_boot_out; | ||
110 | } | ||
111 | |||
112 | if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { | ||
113 | err = -EPERM; | ||
114 | goto swap_boot_out; | ||
115 | } | ||
116 | |||
117 | sbi = EXT4_SB(sb); | ||
118 | ei = EXT4_I(inode); | ||
119 | |||
120 | inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); | ||
121 | if (IS_ERR(inode_bl)) { | ||
122 | err = PTR_ERR(inode_bl); | ||
123 | goto swap_boot_out; | ||
124 | } | ||
125 | ei_bl = EXT4_I(inode_bl); | ||
126 | |||
127 | filemap_flush(inode->i_mapping); | ||
128 | filemap_flush(inode_bl->i_mapping); | ||
129 | |||
130 | /* Protect the original inodes against truncate and make sure | ||
131 | * that only one swap_inode_boot_loader is running. */ | ||
132 | ext4_inode_double_lock(inode, inode_bl); | ||
133 | |||
134 | truncate_inode_pages(&inode->i_data, 0); | ||
135 | truncate_inode_pages(&inode_bl->i_data, 0); | ||
136 | |||
137 | /* Wait for all existing dio workers */ | ||
138 | ext4_inode_block_unlocked_dio(inode); | ||
139 | ext4_inode_block_unlocked_dio(inode_bl); | ||
140 | inode_dio_wait(inode); | ||
141 | inode_dio_wait(inode_bl); | ||
142 | |||
143 | handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); | ||
144 | if (IS_ERR(handle)) { | ||
145 | err = -EINVAL; | ||
146 | goto swap_boot_out; | ||
147 | } | ||
148 | |||
149 | /* Protect extent tree against block allocations via delalloc */ | ||
150 | ext4_double_down_write_data_sem(inode, inode_bl); | ||
151 | |||
152 | if (inode_bl->i_nlink == 0) { | ||
153 | /* this inode has never been used as a BOOT_LOADER */ | ||
154 | set_nlink(inode_bl, 1); | ||
155 | i_uid_write(inode_bl, 0); | ||
156 | i_gid_write(inode_bl, 0); | ||
157 | inode_bl->i_flags = 0; | ||
158 | ei_bl->i_flags = 0; | ||
159 | inode_bl->i_version = 1; | ||
160 | i_size_write(inode_bl, 0); | ||
161 | inode_bl->i_mode = S_IFREG; | ||
162 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, | ||
163 | EXT4_FEATURE_INCOMPAT_EXTENTS)) { | ||
164 | ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); | ||
165 | ext4_ext_tree_init(handle, inode_bl); | ||
166 | } else | ||
167 | memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); | ||
168 | } | ||
169 | |||
170 | swap_inode_data(inode, inode_bl); | ||
171 | |||
172 | inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); | ||
173 | |||
174 | spin_lock(&sbi->s_next_gen_lock); | ||
175 | inode->i_generation = sbi->s_next_generation++; | ||
176 | inode_bl->i_generation = sbi->s_next_generation++; | ||
177 | spin_unlock(&sbi->s_next_gen_lock); | ||
178 | |||
179 | ext4_discard_preallocations(inode); | ||
180 | |||
181 | err = ext4_mark_inode_dirty(handle, inode); | ||
182 | if (err < 0) { | ||
183 | ext4_warning(inode->i_sb, | ||
184 | "couldn't mark inode #%lu dirty (err %d)", | ||
185 | inode->i_ino, err); | ||
186 | /* Revert all changes: */ | ||
187 | swap_inode_data(inode, inode_bl); | ||
188 | } else { | ||
189 | err = ext4_mark_inode_dirty(handle, inode_bl); | ||
190 | if (err < 0) { | ||
191 | ext4_warning(inode_bl->i_sb, | ||
192 | "couldn't mark inode #%lu dirty (err %d)", | ||
193 | inode_bl->i_ino, err); | ||
194 | /* Revert all changes: */ | ||
195 | swap_inode_data(inode, inode_bl); | ||
196 | ext4_mark_inode_dirty(handle, inode); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | ext4_journal_stop(handle); | ||
201 | |||
202 | ext4_double_up_write_data_sem(inode, inode_bl); | ||
203 | |||
204 | ext4_inode_resume_unlocked_dio(inode); | ||
205 | ext4_inode_resume_unlocked_dio(inode_bl); | ||
206 | |||
207 | ext4_inode_double_unlock(inode, inode_bl); | ||
208 | |||
209 | iput(inode_bl); | ||
210 | |||
211 | swap_boot_out: | ||
212 | return err; | ||
213 | } | ||
214 | |||
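From userspace the new operation is a single ioctl on a writable descriptor for the file whose blocks should become the boot loader inode's. A hedged sketch; the _IO('f', 17) encoding below is my reading of this series' ext4.h change, so verify it against your headers before relying on it:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    /* Assumed encoding of EXT4_IOC_SWAP_BOOT from fs/ext4/ext4.h at the
     * time of this series -- double-check your kernel headers. */
    #define EXT4_IOC_SWAP_BOOT _IO('f', 17)

    int main(int argc, char **argv)
    {
        int fd;

        if (argc != 2) {
            fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
            return 1;
        }
        fd = open(argv[1], O_RDWR);   /* FMODE_WRITE is required */
        if (fd < 0 || ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
            perror("swap boot");
            return 1;
        }
        close(fd);
        return 0;
    }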
23 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 215 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
24 | { | 216 | { |
25 | struct inode *inode = file_inode(filp); | 217 | struct inode *inode = file_inode(filp); |
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
83 | if (!capable(CAP_SYS_RESOURCE)) | 275 | if (!capable(CAP_SYS_RESOURCE)) |
84 | goto flags_out; | 276 | goto flags_out; |
85 | } | 277 | } |
86 | if (oldflags & EXT4_EXTENTS_FL) { | 278 | if ((flags ^ oldflags) & EXT4_EXTENTS_FL) |
87 | /* We don't support clearing extent flags */ | ||
88 | if (!(flags & EXT4_EXTENTS_FL)) { | ||
89 | err = -EOPNOTSUPP; | ||
90 | goto flags_out; | ||
91 | } | ||
92 | } else if (flags & EXT4_EXTENTS_FL) { | ||
93 | /* migrate the file */ | ||
94 | migrate = 1; | 279 | migrate = 1; |
95 | flags &= ~EXT4_EXTENTS_FL; | ||
96 | } | ||
97 | 280 | ||
98 | if (flags & EXT4_EOFBLOCKS_FL) { | 281 | if (flags & EXT4_EOFBLOCKS_FL) { |
99 | /* we don't support adding EOFBLOCKS flag */ | 282 | /* we don't support adding EOFBLOCKS flag */ |
@@ -137,8 +320,13 @@ flags_err: | |||
137 | err = ext4_change_inode_journal_flag(inode, jflag); | 320 | err = ext4_change_inode_journal_flag(inode, jflag); |
138 | if (err) | 321 | if (err) |
139 | goto flags_out; | 322 | goto flags_out; |
140 | if (migrate) | 323 | if (migrate) { |
141 | err = ext4_ext_migrate(inode); | 324 | if (flags & EXT4_EXTENTS_FL) |
325 | err = ext4_ext_migrate(inode); | ||
326 | else | ||
327 | err = ext4_ind_migrate(inode); | ||
328 | } | ||
329 | |||
142 | flags_out: | 330 | flags_out: |
143 | mutex_unlock(&inode->i_mutex); | 331 | mutex_unlock(&inode->i_mutex); |
144 | mnt_drop_write_file(filp); | 332 | mnt_drop_write_file(filp); |
@@ -357,9 +545,13 @@ group_add_out: | |||
357 | return err; | 545 | return err; |
358 | } | 546 | } |
359 | 547 | ||
548 | case EXT4_IOC_SWAP_BOOT: | ||
549 | if (!(filp->f_mode & FMODE_WRITE)) | ||
550 | return -EBADF; | ||
551 | return swap_inode_boot_loader(sb, inode); | ||
552 | |||
360 | case EXT4_IOC_RESIZE_FS: { | 553 | case EXT4_IOC_RESIZE_FS: { |
361 | ext4_fsblk_t n_blocks_count; | 554 | ext4_fsblk_t n_blocks_count; |
362 | struct super_block *sb = inode->i_sb; | ||
363 | int err = 0, err2 = 0; | 555 | int err = 0, err2 = 0; |
364 | ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; | 556 | ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; |
365 | 557 | ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ee6614bdb639..a11ea4d6164c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) | |||
405 | ext4_clear_bit(bit, addr); | 405 | ext4_clear_bit(bit, addr); |
406 | } | 406 | } |
407 | 407 | ||
408 | static inline int mb_test_and_clear_bit(int bit, void *addr) | ||
409 | { | ||
410 | addr = mb_correct_addr_and_bit(&bit, addr); | ||
411 | return ext4_test_and_clear_bit(bit, addr); | ||
412 | } | ||
413 | |||
408 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) | 414 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) |
409 | { | 415 | { |
410 | int fix = 0, ret, tmpmax; | 416 | int fix = 0, ret, tmpmax; |
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, | |||
764 | spin_unlock(&EXT4_SB(sb)->s_bal_lock); | 770 | spin_unlock(&EXT4_SB(sb)->s_bal_lock); |
765 | } | 771 | } |
766 | 772 | ||
773 | static void mb_regenerate_buddy(struct ext4_buddy *e4b) | ||
774 | { | ||
775 | int count; | ||
776 | int order = 1; | ||
777 | void *buddy; | ||
778 | |||
779 | while ((buddy = mb_find_buddy(e4b, order++, &count))) { | ||
780 | ext4_set_bits(buddy, 0, count); | ||
781 | } | ||
782 | e4b->bd_info->bb_fragments = 0; | ||
783 | memset(e4b->bd_info->bb_counters, 0, | ||
784 | sizeof(*e4b->bd_info->bb_counters) * | ||
785 | (e4b->bd_sb->s_blocksize_bits + 2)); | ||
786 | |||
787 | ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, | ||
788 | e4b->bd_bitmap, e4b->bd_group); | ||
789 | } | ||
790 | |||
767 | /* The buddy information is attached the buddy cache inode | 791 | /* The buddy information is attached the buddy cache inode |
768 | * for convenience. The information regarding each group | 792 | * for convenience. The information regarding each group |
769 | * is loaded via ext4_mb_load_buddy. The information involve | 793 | * is loaded via ext4_mb_load_buddy. The information involve |
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
860 | 884 | ||
861 | first_block = page->index * blocks_per_page; | 885 | first_block = page->index * blocks_per_page; |
862 | for (i = 0; i < blocks_per_page; i++) { | 886 | for (i = 0; i < blocks_per_page; i++) { |
863 | int group; | ||
864 | |||
865 | group = (first_block + i) >> 1; | 887 | group = (first_block + i) >> 1; |
866 | if (group >= ngroups) | 888 | if (group >= ngroups) |
867 | break; | 889 | break; |
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1011 | struct page *page; | 1033 | struct page *page; |
1012 | int ret = 0; | 1034 | int ret = 0; |
1013 | 1035 | ||
1036 | might_sleep(); | ||
1014 | mb_debug(1, "init group %u\n", group); | 1037 | mb_debug(1, "init group %u\n", group); |
1015 | this_grp = ext4_get_group_info(sb, group); | 1038 | this_grp = ext4_get_group_info(sb, group); |
1016 | /* | 1039 | /* |
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1082 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1105 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1083 | struct inode *inode = sbi->s_buddy_cache; | 1106 | struct inode *inode = sbi->s_buddy_cache; |
1084 | 1107 | ||
1108 | might_sleep(); | ||
1085 | mb_debug(1, "load group %u\n", group); | 1109 | mb_debug(1, "load group %u\n", group); |
1086 | 1110 | ||
1087 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | 1111 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; |
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len) | |||
1244 | } | 1268 | } |
1245 | } | 1269 | } |
1246 | 1270 | ||
1271 | /* Clear bits in the given range; | ||
1272 | * returns the first zero bit found, if any, and -1 otherwise. | ||
1273 | */ | ||
1274 | static int mb_test_and_clear_bits(void *bm, int cur, int len) | ||
1275 | { | ||
1276 | __u32 *addr; | ||
1277 | int zero_bit = -1; | ||
1278 | |||
1279 | len = cur + len; | ||
1280 | while (cur < len) { | ||
1281 | if ((cur & 31) == 0 && (len - cur) >= 32) { | ||
1282 | /* fast path: clear whole word at once */ | ||
1283 | addr = bm + (cur >> 3); | ||
1284 | if (*addr != (__u32)(-1) && zero_bit == -1) | ||
1285 | zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); | ||
1286 | *addr = 0; | ||
1287 | cur += 32; | ||
1288 | continue; | ||
1289 | } | ||
1290 | if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) | ||
1291 | zero_bit = cur; | ||
1292 | cur++; | ||
1293 | } | ||
1294 | |||
1295 | return zero_bit; | ||
1296 | } | ||
1297 | |||
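The new mb_test_and_clear_bits() clears a whole aligned 32-bit word per iteration and only falls back to single-bit work at the unaligned edges, while remembering the first bit that was already zero so the caller can detect a double free in one pass. A minimal user-space sketch of the same idea (plain C; the helper name and the linear scan for the zero bit are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Clear bits [cur, cur + len) in bm; return the first bit that was
 * already zero, or -1 if every bit in the range was set. */
static int test_and_clear_bits(uint32_t *bm, int cur, int len)
{
        int zero_bit = -1;
        int end = cur + len;

        while (cur < end) {
                if ((cur & 31) == 0 && end - cur >= 32) {
                        /* Fast path: clear a whole 32-bit word. */
                        uint32_t *addr = bm + (cur >> 5);

                        if (*addr != UINT32_MAX && zero_bit == -1) {
                                uint32_t w = *addr;
                                int b = 0;

                                while (w & 1) {         /* find the zero */
                                        w >>= 1;
                                        b++;
                                }
                                zero_bit = cur + b;
                        }
                        *addr = 0;
                        cur += 32;
                        continue;
                }
                /* Slow path: the unaligned edges, one bit at a time. */
                if (!((bm[cur >> 5] >> (cur & 31)) & 1) && zero_bit == -1)
                        zero_bit = cur;
                bm[cur >> 5] &= ~(1u << (cur & 31));
                cur++;
        }
        return zero_bit;
}

int main(void)
{
        uint32_t bm[2] = { UINT32_MAX, UINT32_MAX };

        bm[1] &= ~(1u << 3);            /* bit 35 is already free */
        printf("%d\n", test_and_clear_bits(bm, 0, 64)); /* prints 35 */
        return 0;
}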
1247 | void ext4_set_bits(void *bm, int cur, int len) | 1298 | void ext4_set_bits(void *bm, int cur, int len) |
1248 | { | 1299 | { |
1249 | __u32 *addr; | 1300 | __u32 *addr; |
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len) | |||
1262 | } | 1313 | } |
1263 | } | 1314 | } |
1264 | 1315 | ||
1316 | /* | ||
1317 | * _________________________________________________________________ */ | ||
1318 | |||
1319 | static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) | ||
1320 | { | ||
1321 | if (mb_test_bit(*bit + side, bitmap)) { | ||
1322 | mb_clear_bit(*bit, bitmap); | ||
1323 | (*bit) -= side; | ||
1324 | return 1; | ||
1325 | } | ||
1326 | else { | ||
1327 | (*bit) += side; | ||
1328 | mb_set_bit(*bit, bitmap); | ||
1329 | return -1; | ||
1330 | } | ||
1331 | } | ||
1332 | |||
1333 | static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) | ||
1334 | { | ||
1335 | int max; | ||
1336 | int order = 1; | ||
1337 | void *buddy = mb_find_buddy(e4b, order, &max); | ||
1338 | |||
1339 | while (buddy) { | ||
1340 | void *buddy2; | ||
1341 | |||
1342 | /* Bits in range [first; last] are known to be set since the | ||
1343 | * corresponding blocks were allocated. Bits in range | ||
1344 | * (first; last) will stay set because they form buddies on the | ||
1345 | * upper layer. We only deal with the borders if they don't | ||
1346 | * align with the upper layer, and then go up. | ||
1347 | * Releasing an entire group comes down to clearing a | ||
1348 | * single bit of the highest-order buddy. | ||
1349 | */ | ||
1350 | |||
1351 | /* Example: | ||
1352 | * --------------------------------- | ||
1353 | * | 1 | 1 | 1 | 1 | | ||
1354 | * --------------------------------- | ||
1355 | * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | ||
1356 | * --------------------------------- | ||
1357 | * 0 1 2 3 4 5 6 7 | ||
1358 | * \_____________________/ | ||
1359 | * | ||
1360 | * Neither [1] nor [6] is aligned to the layer above. | ||
1361 | * Left neighbour [0] is free, so mark it busy, | ||
1362 | * decrease bb_counters and extend the range to | ||
1363 | * [0; 6]. | ||
1364 | * Right neighbour [7] is busy. It can't be coalesced with [6], so | ||
1365 | * mark [6] free, increase bb_counters and shrink the range to | ||
1366 | * [0; 5]. | ||
1367 | * Then shift range to [0; 2], go up and do the same. | ||
1368 | */ | ||
1369 | |||
1370 | |||
1371 | if (first & 1) | ||
1372 | e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); | ||
1373 | if (!(last & 1)) | ||
1374 | e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); | ||
1375 | if (first > last) | ||
1376 | break; | ||
1377 | order++; | ||
1378 | |||
1379 | if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { | ||
1380 | mb_clear_bits(buddy, first, last - first + 1); | ||
1381 | e4b->bd_info->bb_counters[order - 1] += last - first + 1; | ||
1382 | break; | ||
1383 | } | ||
1384 | first >>= 1; | ||
1385 | last >>= 1; | ||
1386 | buddy = buddy2; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
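The diagram above describes mb_buddy_mark_free() working on a whole range at once; the loop it replaces (visible in mb_free_blocks() below) coalesced one block at a time instead. A self-contained sketch of that classic per-block buddy coalescing, under the same convention that a cleared bit means "free at this order" (sizes and names are illustrative):

#include <stdio.h>

#define MAX_ORDER 4                     /* 16 order-0 blocks */

/* One bitmap per order; bit i set = chunk i busy at that order. */
static unsigned busy[MAX_ORDER + 1];

static int  test_busy(int order, int i)  { return (busy[order] >> i) & 1; }
static void set_busy(int order, int i)   { busy[order] |=  1u << i; }
static void clear_busy(int order, int i) { busy[order] &= ~(1u << i); }

/* Free one order-0 block, merging with its buddy while that is free --
 * the per-block walk that mb_buddy_mark_free() generalises to ranges. */
static void free_block(int block)
{
        int order = 0;

        clear_busy(order, block);
        while (order < MAX_ORDER) {
                int buddy = block ^ 1;

                if (test_busy(order, buddy))
                        break;          /* buddy busy: cannot merge */
                /* Both halves free: fold them into one chunk above. */
                set_busy(order, block);
                set_busy(order, buddy);
                block >>= 1;
                order++;
                clear_busy(order, block);
        }
}

int main(void)
{
        busy[0] = 0xffff; busy[1] = 0xff; busy[2] = 0xf;
        busy[3] = 0x3;    busy[4] = 0x1;        /* all busy initially */
        free_block(2);
        free_block(3);
        printf("order-1 chunk 1 free: %d\n", !test_busy(1, 1)); /* 1 */
        return 0;
}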
1265 | static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | 1390 | static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, |
1266 | int first, int count) | 1391 | int first, int count) |
1267 | { | 1392 | { |
1268 | int block = 0; | 1393 | int left_is_free = 0; |
1269 | int max = 0; | 1394 | int right_is_free = 0; |
1270 | int order; | 1395 | int block; |
1271 | void *buddy; | 1396 | int last = first + count - 1; |
1272 | void *buddy2; | ||
1273 | struct super_block *sb = e4b->bd_sb; | 1397 | struct super_block *sb = e4b->bd_sb; |
1274 | 1398 | ||
1275 | BUG_ON(first + count > (sb->s_blocksize << 3)); | 1399 | BUG_ON(last >= (sb->s_blocksize << 3)); |
1276 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); | 1400 | assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); |
1277 | mb_check_buddy(e4b); | 1401 | mb_check_buddy(e4b); |
1278 | mb_free_blocks_double(inode, e4b, first, count); | 1402 | mb_free_blocks_double(inode, e4b, first, count); |
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1281 | if (first < e4b->bd_info->bb_first_free) | 1405 | if (first < e4b->bd_info->bb_first_free) |
1282 | e4b->bd_info->bb_first_free = first; | 1406 | e4b->bd_info->bb_first_free = first; |
1283 | 1407 | ||
1284 | /* let's maintain fragments counter */ | 1408 | /* Access memory sequentially: check the left neighbour, |
1409 | * clear the range, then check the right neighbour. | ||
1410 | */ | ||
1285 | if (first != 0) | 1411 | if (first != 0) |
1286 | block = !mb_test_bit(first - 1, e4b->bd_bitmap); | 1412 | left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); |
1287 | if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) | 1413 | block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); |
1288 | max = !mb_test_bit(first + count, e4b->bd_bitmap); | 1414 | if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) |
1289 | if (block && max) | 1415 | right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); |
1290 | e4b->bd_info->bb_fragments--; | ||
1291 | else if (!block && !max) | ||
1292 | e4b->bd_info->bb_fragments++; | ||
1293 | 1416 | ||
1294 | /* let's maintain buddy itself */ | 1417 | if (unlikely(block != -1)) { |
1295 | while (count-- > 0) { | 1418 | ext4_fsblk_t blocknr; |
1296 | block = first++; | ||
1297 | order = 0; | ||
1298 | 1419 | ||
1299 | if (!mb_test_bit(block, e4b->bd_bitmap)) { | 1420 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); |
1300 | ext4_fsblk_t blocknr; | 1421 | blocknr += EXT4_C2B(EXT4_SB(sb), block); |
1301 | 1422 | ext4_grp_locked_error(sb, e4b->bd_group, | |
1302 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); | 1423 | inode ? inode->i_ino : 0, |
1303 | blocknr += EXT4_C2B(EXT4_SB(sb), block); | 1424 | blocknr, |
1304 | ext4_grp_locked_error(sb, e4b->bd_group, | 1425 | "freeing already freed block " |
1305 | inode ? inode->i_ino : 0, | 1426 | "(bit %u)", block); |
1306 | blocknr, | 1427 | mb_regenerate_buddy(e4b); |
1307 | "freeing already freed block " | 1428 | goto done; |
1308 | "(bit %u)", block); | 1429 | } |
1309 | } | ||
1310 | mb_clear_bit(block, e4b->bd_bitmap); | ||
1311 | e4b->bd_info->bb_counters[order]++; | ||
1312 | |||
1313 | /* start of the buddy */ | ||
1314 | buddy = mb_find_buddy(e4b, order, &max); | ||
1315 | |||
1316 | do { | ||
1317 | block &= ~1UL; | ||
1318 | if (mb_test_bit(block, buddy) || | ||
1319 | mb_test_bit(block + 1, buddy)) | ||
1320 | break; | ||
1321 | |||
1322 | /* both the buddies are free, try to coalesce them */ | ||
1323 | buddy2 = mb_find_buddy(e4b, order + 1, &max); | ||
1324 | 1430 | ||
1325 | if (!buddy2) | 1431 | /* let's maintain fragments counter */ |
1326 | break; | 1432 | if (left_is_free && right_is_free) |
1433 | e4b->bd_info->bb_fragments--; | ||
1434 | else if (!left_is_free && !right_is_free) | ||
1435 | e4b->bd_info->bb_fragments++; | ||
1327 | 1436 | ||
1328 | if (order > 0) { | 1437 | /* buddy[0] == bd_bitmap is a special case, so handle |
1329 | /* for special purposes, we don't set | 1438 | * it right away and let mb_buddy_mark_free stay free of |
1330 | * free bits in bitmap */ | 1439 | * zero-order checks. |
1331 | mb_set_bit(block, buddy); | 1440 | * Check if neighbours are to be coalesced, |
1332 | mb_set_bit(block + 1, buddy); | 1441 | * adjust bitmap bb_counters and borders appropriately. |
1333 | } | 1442 | */ |
1334 | e4b->bd_info->bb_counters[order]--; | 1443 | if (first & 1) { |
1335 | e4b->bd_info->bb_counters[order]--; | 1444 | first += !left_is_free; |
1445 | e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; | ||
1446 | } | ||
1447 | if (!(last & 1)) { | ||
1448 | last -= !right_is_free; | ||
1449 | e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; | ||
1450 | } | ||
1336 | 1451 | ||
1337 | block = block >> 1; | 1452 | if (first <= last) |
1338 | order++; | 1453 | mb_buddy_mark_free(e4b, first >> 1, last >> 1); |
1339 | e4b->bd_info->bb_counters[order]++; | ||
1340 | 1454 | ||
1341 | mb_clear_bit(block, buddy2); | 1455 | done: |
1342 | buddy = buddy2; | ||
1343 | } while (1); | ||
1344 | } | ||
1345 | mb_set_largest_free_order(sb, e4b->bd_info); | 1456 | mb_set_largest_free_order(sb, e4b->bd_info); |
1346 | mb_check_buddy(e4b); | 1457 | mb_check_buddy(e4b); |
1347 | } | 1458 | } |
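The rewritten mb_free_blocks() derives the fragment-count delta purely from the two border neighbours: merging with free space on both sides removes a fragment, freeing inside busy space adds one, and touching exactly one free side leaves the count unchanged. A user-space sketch of that bookkeeping (bit set = busy; names are illustrative):

#include <stdio.h>

/* Fragment-count bookkeeping when range [first, last] of a block
 * bitmap (bit set = busy) is about to be freed; nbits is the bitmap
 * size. Mirrors the left_is_free/right_is_free logic above. */
static int update_fragments(const unsigned char *bm, int nbits,
                            int first, int last, int fragments)
{
        int left_is_free = first > 0 &&
                !((bm[(first - 1) / 8] >> ((first - 1) % 8)) & 1);
        int right_is_free = last + 1 < nbits &&
                !((bm[(last + 1) / 8] >> ((last + 1) % 8)) & 1);

        if (left_is_free && right_is_free)
                return fragments - 1;   /* joined two free fragments */
        if (!left_is_free && !right_is_free)
                return fragments + 1;   /* created a new free fragment */
        return fragments;               /* extended an existing one */
}

int main(void)
{
        unsigned char bm[2] = { 0xff, 0xff };   /* 16 bits, all busy */
        printf("%d\n", update_fragments(bm, 16, 4, 7, 0));      /* 1 */
        return 0;
}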
@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, | |||
3342 | if (pa->pa_type == MB_GROUP_PA) | 3453 | if (pa->pa_type == MB_GROUP_PA) |
3343 | grp_blk--; | 3454 | grp_blk--; |
3344 | 3455 | ||
3345 | ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); | 3456 | grp = ext4_get_group_number(sb, grp_blk); |
3346 | 3457 | ||
3347 | /* | 3458 | /* |
3348 | * possible race: | 3459 | * possible race: |
@@ -3807,7 +3918,7 @@ repeat: | |||
3807 | 3918 | ||
3808 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { | 3919 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { |
3809 | BUG_ON(pa->pa_type != MB_INODE_PA); | 3920 | BUG_ON(pa->pa_type != MB_INODE_PA); |
3810 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | 3921 | group = ext4_get_group_number(sb, pa->pa_pstart); |
3811 | 3922 | ||
3812 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3923 | err = ext4_mb_load_buddy(sb, group, &e4b); |
3813 | if (err) { | 3924 | if (err) { |
@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, | |||
4069 | 4180 | ||
4070 | list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { | 4181 | list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { |
4071 | 4182 | ||
4072 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | 4183 | group = ext4_get_group_number(sb, pa->pa_pstart); |
4073 | if (ext4_mb_load_buddy(sb, group, &e4b)) { | 4184 | if (ext4_mb_load_buddy(sb, group, &e4b)) { |
4074 | ext4_error(sb, "Error loading buddy information for %u", | 4185 | ext4_error(sb, "Error loading buddy information for %u", |
4075 | group); | 4186 | group); |
@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4217 | unsigned int inquota = 0; | 4328 | unsigned int inquota = 0; |
4218 | unsigned int reserv_clstrs = 0; | 4329 | unsigned int reserv_clstrs = 0; |
4219 | 4330 | ||
4331 | might_sleep(); | ||
4220 | sb = ar->inode->i_sb; | 4332 | sb = ar->inode->i_sb; |
4221 | sbi = EXT4_SB(sb); | 4333 | sbi = EXT4_SB(sb); |
4222 | 4334 | ||
@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4420 | node = rb_prev(new_node); | 4532 | node = rb_prev(new_node); |
4421 | if (node) { | 4533 | if (node) { |
4422 | entry = rb_entry(node, struct ext4_free_data, efd_node); | 4534 | entry = rb_entry(node, struct ext4_free_data, efd_node); |
4423 | if (can_merge(entry, new_entry)) { | 4535 | if (can_merge(entry, new_entry) && |
4536 | ext4_journal_callback_try_del(handle, &entry->efd_jce)) { | ||
4424 | new_entry->efd_start_cluster = entry->efd_start_cluster; | 4537 | new_entry->efd_start_cluster = entry->efd_start_cluster; |
4425 | new_entry->efd_count += entry->efd_count; | 4538 | new_entry->efd_count += entry->efd_count; |
4426 | rb_erase(node, &(db->bb_free_root)); | 4539 | rb_erase(node, &(db->bb_free_root)); |
4427 | ext4_journal_callback_del(handle, &entry->efd_jce); | ||
4428 | kmem_cache_free(ext4_free_data_cachep, entry); | 4540 | kmem_cache_free(ext4_free_data_cachep, entry); |
4429 | } | 4541 | } |
4430 | } | 4542 | } |
@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4432 | node = rb_next(new_node); | 4544 | node = rb_next(new_node); |
4433 | if (node) { | 4545 | if (node) { |
4434 | entry = rb_entry(node, struct ext4_free_data, efd_node); | 4546 | entry = rb_entry(node, struct ext4_free_data, efd_node); |
4435 | if (can_merge(new_entry, entry)) { | 4547 | if (can_merge(new_entry, entry) && |
4548 | ext4_journal_callback_try_del(handle, &entry->efd_jce)) { | ||
4436 | new_entry->efd_count += entry->efd_count; | 4549 | new_entry->efd_count += entry->efd_count; |
4437 | rb_erase(node, &(db->bb_free_root)); | 4550 | rb_erase(node, &(db->bb_free_root)); |
4438 | ext4_journal_callback_del(handle, &entry->efd_jce); | ||
4439 | kmem_cache_free(ext4_free_data_cachep, entry); | 4551 | kmem_cache_free(ext4_free_data_cachep, entry); |
4440 | } | 4552 | } |
4441 | } | 4553 | } |
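Both hunks make the rb-tree merge conditional on ext4_journal_callback_try_del() succeeding: an adjacent free-extent entry may only be absorbed if its commit callback has not already started running; otherwise the two entries must stay separate. The adjacency test itself, in isolation (a sketch; the journal-callback condition is not modelled):

#include <stdio.h>

struct free_extent {
        unsigned start;         /* first cluster of the extent */
        unsigned count;         /* clusters in the extent */
};

/* Absorb b into a when they are physically adjacent, as the rb-tree
 * insertion above does with its predecessor and successor entries. */
static int try_merge(struct free_extent *a, const struct free_extent *b)
{
        if (a->start + a->count != b->start)
                return 0;       /* not adjacent: keep both entries */
        a->count += b->count;
        return 1;
}

int main(void)
{
        struct free_extent a = { 100, 8 }, b = { 108, 4 };

        if (try_merge(&a, &b))
                printf("merged: start=%u count=%u\n", a.start, a.count);
        return 0;
}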
@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
4470 | int err = 0; | 4582 | int err = 0; |
4471 | int ret; | 4583 | int ret; |
4472 | 4584 | ||
4585 | might_sleep(); | ||
4473 | if (bh) { | 4586 | if (bh) { |
4474 | if (block) | 4587 | if (block) |
4475 | BUG_ON(block != bh->b_blocknr); | 4588 | BUG_ON(block != bh->b_blocknr); |
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 480acf4a085f..49e8bdff9163 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) | |||
426 | return retval; | 426 | return retval; |
427 | } | 427 | } |
428 | return retval; | 428 | return retval; |
429 | |||
430 | } | 429 | } |
431 | 430 | ||
432 | int ext4_ext_migrate(struct inode *inode) | 431 | int ext4_ext_migrate(struct inode *inode) |
@@ -606,3 +605,64 @@ out: | |||
606 | 605 | ||
607 | return retval; | 606 | return retval; |
608 | } | 607 | } |
608 | |||
609 | /* | ||
610 | * Migrate a simple extent-based inode to use the i_blocks[] array | ||
611 | */ | ||
612 | int ext4_ind_migrate(struct inode *inode) | ||
613 | { | ||
614 | struct ext4_extent_header *eh; | ||
615 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
616 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
617 | struct ext4_extent *ex; | ||
618 | unsigned int i, len; | ||
619 | ext4_fsblk_t blk; | ||
620 | handle_t *handle; | ||
621 | int ret; | ||
622 | |||
623 | if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, | ||
624 | EXT4_FEATURE_INCOMPAT_EXTENTS) || | ||
625 | (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | ||
626 | return -EINVAL; | ||
627 | |||
628 | if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, | ||
629 | EXT4_FEATURE_RO_COMPAT_BIGALLOC)) | ||
630 | return -EOPNOTSUPP; | ||
631 | |||
632 | handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); | ||
633 | if (IS_ERR(handle)) | ||
634 | return PTR_ERR(handle); | ||
635 | |||
636 | down_write(&EXT4_I(inode)->i_data_sem); | ||
637 | ret = ext4_ext_check_inode(inode); | ||
638 | if (ret) | ||
639 | goto errout; | ||
640 | |||
641 | eh = ext_inode_hdr(inode); | ||
642 | ex = EXT_FIRST_EXTENT(eh); | ||
643 | if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || | ||
644 | eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { | ||
645 | ret = -EOPNOTSUPP; | ||
646 | goto errout; | ||
647 | } | ||
648 | if (eh->eh_entries == 0) | ||
649 | blk = len = 0; | ||
650 | else { | ||
651 | len = le16_to_cpu(ex->ee_len); | ||
652 | blk = ext4_ext_pblock(ex); | ||
653 | if (len > EXT4_NDIR_BLOCKS) { | ||
654 | ret = -EOPNOTSUPP; | ||
655 | goto errout; | ||
656 | } | ||
657 | } | ||
658 | |||
659 | ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); | ||
660 | memset(ei->i_data, 0, sizeof(ei->i_data)); | ||
661 | for (i=0; i < len; i++) | ||
662 | ei->i_data[i] = cpu_to_le32(blk++); | ||
663 | ext4_mark_inode_dirty(handle, inode); | ||
664 | errout: | ||
665 | ext4_journal_stop(handle); | ||
666 | up_write(&EXT4_I(inode)->i_data_sem); | ||
667 | return ret; | ||
668 | } | ||
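ext4_ind_migrate() can only succeed when the file is described by at most one extent of at most EXT4_NDIR_BLOCKS blocks, because that is all the classic direct-block array can express; the extent is then unrolled into consecutive block numbers. A small sketch of that unrolling, assuming a 12-slot direct array as in the ext2/3 layout (the kernel additionally byte-swaps with cpu_to_le32, omitted here):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NDIR_BLOCKS 12          /* direct slots, as in the ext2/3 layout */

/* Unroll one physical extent (start, len) into direct block pointers.
 * Returns -1 when the extent cannot be expressed with direct blocks. */
static int extent_to_direct(uint32_t start, unsigned len,
                            uint32_t i_data[NDIR_BLOCKS + 3])
{
        unsigned i;

        if (len > NDIR_BLOCKS)
                return -1;      /* would need indirect blocks */
        memset(i_data, 0, (NDIR_BLOCKS + 3) * sizeof(uint32_t));
        for (i = 0; i < len; i++)
                i_data[i] = start + i;  /* consecutive physical blocks */
        return 0;
}

int main(void)
{
        uint32_t i_data[NDIR_BLOCKS + 3];

        if (extent_to_direct(1000, 4, i_data) == 0)
                printf("logical block 3 -> physical %u\n",
                       (unsigned)i_data[3]);
        return 0;
}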
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f9b551561d2c..214461e42a05 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c | |||
@@ -7,7 +7,7 @@ | |||
7 | #include "ext4.h" | 7 | #include "ext4.h" |
8 | 8 | ||
9 | /* Checksumming functions */ | 9 | /* Checksumming functions */ |
10 | static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) | 10 | static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) |
11 | { | 11 | { |
12 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 12 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
13 | int offset = offsetof(struct mmp_struct, mmp_checksum); | 13 | int offset = offsetof(struct mmp_struct, mmp_checksum); |
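ext4_mmp_csum() checksums only the bytes that precede mmp_checksum, using offsetof() so the field cannot feed into its own checksum. A user-space sketch of the pattern; toy_csum() here merely stands in for the crc32c that ext4_chksum() actually computes, and the struct layout is illustrative:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct mmp_like {
        uint32_t magic;
        uint32_t seq;
        uint32_t checksum;      /* must not feed into its own checksum */
};

/* Toy rolling checksum standing in for the crc32c behind ext4_chksum(). */
static uint32_t toy_csum(const void *p, size_t len)
{
        const uint8_t *b = p;
        uint32_t c = 0;

        while (len--)
                c = c * 31 + *b++;
        return c;
}

int main(void)
{
        struct mmp_like m = { 0x004d4d50, 7, 0 };
        /* Checksum only the bytes that precede the checksum field. */
        size_t n = offsetof(struct mmp_like, checksum);

        m.checksum = toy_csum(&m, n);
        printf("csum=%08x\n", (unsigned)m.checksum);
        return 0;
}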
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) | |||
54 | lock_buffer(bh); | 54 | lock_buffer(bh); |
55 | bh->b_end_io = end_buffer_write_sync; | 55 | bh->b_end_io = end_buffer_write_sync; |
56 | get_bh(bh); | 56 | get_bh(bh); |
57 | submit_bh(WRITE_SYNC, bh); | 57 | submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); |
58 | wait_on_buffer(bh); | 58 | wait_on_buffer(bh); |
59 | sb_end_write(sb); | 59 | sb_end_write(sb); |
60 | if (unlikely(!buffer_uptodate(bh))) | 60 | if (unlikely(!buffer_uptodate(bh))) |
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, | |||
86 | get_bh(*bh); | 86 | get_bh(*bh); |
87 | lock_buffer(*bh); | 87 | lock_buffer(*bh); |
88 | (*bh)->b_end_io = end_buffer_read_sync; | 88 | (*bh)->b_end_io = end_buffer_read_sync; |
89 | submit_bh(READ_SYNC, *bh); | 89 | submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); |
90 | wait_on_buffer(*bh); | 90 | wait_on_buffer(*bh); |
91 | if (!buffer_uptodate(*bh)) { | 91 | if (!buffer_uptodate(*bh)) { |
92 | brelse(*bh); | 92 | brelse(*bh); |
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 33e1c086858b..3dcbf364022f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | |||
144 | } | 144 | } |
145 | 145 | ||
146 | /** | 146 | /** |
147 | * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem | 147 | * ext4_double_down_write_data_sem - Acquire two inodes' write lock |
148 | * of i_data_sem | ||
148 | * | 149 | * |
149 | * Acquire write lock of i_data_sem of the two inodes | 150 | * Acquire write lock of i_data_sem of the two inodes |
150 | */ | 151 | */ |
151 | static void | 152 | void |
152 | double_down_write_data_sem(struct inode *first, struct inode *second) | 153 | ext4_double_down_write_data_sem(struct inode *first, struct inode *second) |
153 | { | 154 | { |
154 | if (first < second) { | 155 | if (first < second) { |
155 | down_write(&EXT4_I(first)->i_data_sem); | 156 | down_write(&EXT4_I(first)->i_data_sem); |
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second) | |||
162 | } | 163 | } |
163 | 164 | ||
164 | /** | 165 | /** |
165 | * double_up_write_data_sem - Release two inodes' write lock of i_data_sem | 166 | * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem |
166 | * | 167 | * |
167 | * @orig_inode: original inode structure to be released its lock first | 168 | * @orig_inode: original inode structure to be released its lock first |
168 | * @donor_inode: donor inode structure to be released its lock second | 169 | * @donor_inode: donor inode structure to be released its lock second |
169 | * Release write lock of i_data_sem of two inodes (orig and donor). | 170 | * Release write lock of i_data_sem of two inodes (orig and donor). |
170 | */ | 171 | */ |
171 | static void | 172 | void |
172 | double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) | 173 | ext4_double_up_write_data_sem(struct inode *orig_inode, |
174 | struct inode *donor_inode) | ||
173 | { | 175 | { |
174 | up_write(&EXT4_I(orig_inode)->i_data_sem); | 176 | up_write(&EXT4_I(orig_inode)->i_data_sem); |
175 | up_write(&EXT4_I(donor_inode)->i_data_sem); | 177 | up_write(&EXT4_I(donor_inode)->i_data_sem); |
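Both helpers avoid an AB-BA deadlock by always taking the two semaphores in pointer order, so any two tasks locking the same pair agree on the order. The same discipline in portable user-space form with pthreads (a sketch, not the kernel API):

#include <pthread.h>

/* Take two locks in a globally consistent (address) order so two tasks
 * locking the same pair can never deadlock -- the rule the ext4 helpers
 * apply to the two inodes' i_data_sem. (Ordering unrelated pointers is
 * technically unspecified in ISO C, but it is the standard idiom.) */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        pthread_mutex_unlock(b);
}

int main(void)                  /* build with -lpthread */
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&m1, &m2);
        double_unlock(&m1, &m2);
        return 0;
}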
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, | |||
407 | mext_insert_inside_block(o_start, o_end, start_ext, new_ext, | 409 | mext_insert_inside_block(o_start, o_end, start_ext, new_ext, |
408 | end_ext, eh, range_to_move); | 410 | end_ext, eh, range_to_move); |
409 | 411 | ||
410 | if (depth) { | 412 | return ext4_ext_dirty(handle, orig_inode, orig_path); |
411 | ret = ext4_handle_dirty_metadata(handle, orig_inode, | ||
412 | orig_path->p_bh); | ||
413 | if (ret) | ||
414 | return ret; | ||
415 | } else { | ||
416 | ret = ext4_mark_inode_dirty(handle, orig_inode); | ||
417 | if (ret < 0) | ||
418 | return ret; | ||
419 | } | ||
420 | |||
421 | return 0; | ||
422 | } | 413 | } |
423 | 414 | ||
424 | /** | 415 | /** |
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
737 | donor_off += dext_alen; | 728 | donor_off += dext_alen; |
738 | orig_off += dext_alen; | 729 | orig_off += dext_alen; |
739 | 730 | ||
731 | BUG_ON(replaced_count > count); | ||
740 | /* Already moved the expected blocks */ | 732 | /* Already moved the expected blocks */ |
741 | if (replaced_count >= count) | 733 | if (replaced_count >= count) |
742 | break; | 734 | break; |
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, | |||
814 | page_cache_release(page[0]); | 806 | page_cache_release(page[0]); |
815 | return -ENOMEM; | 807 | return -ENOMEM; |
816 | } | 808 | } |
817 | 809 | /* | |
810 | * grab_cache_page_write_begin() may not wait on the page's writeback | ||
811 | * if the BDI does not demand it. But it is reasonable to be | ||
812 | * conservative here and wait on the page's writeback explicitly. | ||
813 | */ | ||
814 | wait_on_page_writeback(page[0]); | ||
815 | wait_on_page_writeback(page[1]); | ||
818 | if (inode1 > inode2) { | 816 | if (inode1 > inode2) { |
819 | struct page *tmp; | 817 | struct page *tmp; |
820 | tmp = page[0]; | 818 | tmp = page[0]; |
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) | |||
856 | if (buffer_uptodate(bh)) | 854 | if (buffer_uptodate(bh)) |
857 | continue; | 855 | continue; |
858 | if (!buffer_mapped(bh)) { | 856 | if (!buffer_mapped(bh)) { |
859 | int err = 0; | ||
860 | err = ext4_get_block(inode, block, bh, 0); | 857 | err = ext4_get_block(inode, block, bh, 0); |
861 | if (err) { | 858 | if (err) { |
862 | SetPageError(page); | 859 | SetPageError(page); |
@@ -976,7 +973,7 @@ again: | |||
976 | * necessary, just swap data blocks between orig and donor. | 973 | * necessary, just swap data blocks between orig and donor. |
977 | */ | 974 | */ |
978 | if (uninit) { | 975 | if (uninit) { |
979 | double_down_write_data_sem(orig_inode, donor_inode); | 976 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
980 | /* If any of extents in range became initialized we have to | 977 | /* If any of extents in range became initialized we have to |
981 | * fallback to data copying */ | 978 | * fallback to data copying */ |
982 | uninit = mext_check_coverage(orig_inode, orig_blk_offset, | 979 | uninit = mext_check_coverage(orig_inode, orig_blk_offset, |
@@ -990,7 +987,7 @@ again: | |||
990 | goto drop_data_sem; | 987 | goto drop_data_sem; |
991 | 988 | ||
992 | if (!uninit) { | 989 | if (!uninit) { |
993 | double_up_write_data_sem(orig_inode, donor_inode); | 990 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
994 | goto data_copy; | 991 | goto data_copy; |
995 | } | 992 | } |
996 | if ((page_has_private(pagep[0]) && | 993 | if ((page_has_private(pagep[0]) && |
@@ -1004,7 +1001,7 @@ again: | |||
1004 | donor_inode, orig_blk_offset, | 1001 | donor_inode, orig_blk_offset, |
1005 | block_len_in_page, err); | 1002 | block_len_in_page, err); |
1006 | drop_data_sem: | 1003 | drop_data_sem: |
1007 | double_up_write_data_sem(orig_inode, donor_inode); | 1004 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1008 | goto unlock_pages; | 1005 | goto unlock_pages; |
1009 | } | 1006 | } |
1010 | data_copy: | 1007 | data_copy: |
@@ -1033,7 +1030,7 @@ data_copy: | |||
1033 | } | 1030 | } |
1034 | /* Perform all necessary steps similar write_begin()/write_end() | 1031 | /* Perform all necessary steps similar write_begin()/write_end() |
1035 | * but keeping in mind that i_size will not change */ | 1032 | * but keeping in mind that i_size will not change */ |
1036 | *err = __block_write_begin(pagep[0], from, from + replaced_size, | 1033 | *err = __block_write_begin(pagep[0], from, replaced_size, |
1037 | ext4_get_block); | 1034 | ext4_get_block); |
1038 | if (!*err) | 1035 | if (!*err) |
1039 | *err = block_commit_write(pagep[0], from, from + replaced_size); | 1036 | *err = block_commit_write(pagep[0], from, from + replaced_size); |
@@ -1065,11 +1062,11 @@ repair_branches: | |||
1065 | * Extents are swapped already, but we are not able to copy data. | 1062 | * Extents are swapped already, but we are not able to copy data. |
1066 | * Try to swap the extents back to their original places. | 1063 |
1067 | */ | 1064 | */ |
1068 | double_down_write_data_sem(orig_inode, donor_inode); | 1065 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
1069 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, | 1066 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, |
1070 | orig_blk_offset, | 1067 | orig_blk_offset, |
1071 | block_len_in_page, &err2); | 1068 | block_len_in_page, &err2); |
1072 | double_up_write_data_sem(orig_inode, donor_inode); | 1069 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1073 | if (replaced_count != block_len_in_page) { | 1070 | if (replaced_count != block_len_in_page) { |
1074 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), | 1071 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), |
1075 | "Unable to copy data block," | 1072 | "Unable to copy data block," |
@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode, | |||
1209 | } | 1206 | } |
1210 | 1207 | ||
1211 | /** | 1208 | /** |
1212 | * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 | 1209 | * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 |
1213 | * | 1210 | * |
1214 | * @inode1: the inode structure | 1211 | * @inode1: the inode structure |
1215 | * @inode2: the inode structure | 1212 | * @inode2: the inode structure |
1216 | * | 1213 | * |
1217 | * Lock two inodes' i_mutex | 1214 | * Lock two inodes' i_mutex |
1218 | */ | 1215 | */ |
1219 | static void | 1216 | void |
1220 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | 1217 | ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) |
1221 | { | 1218 | { |
1222 | BUG_ON(inode1 == inode2); | 1219 | BUG_ON(inode1 == inode2); |
1223 | if (inode1 < inode2) { | 1220 | if (inode1 < inode2) { |
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | |||
1230 | } | 1227 | } |
1231 | 1228 | ||
1232 | /** | 1229 | /** |
1233 | * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 | 1230 | * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 |
1234 | * | 1231 | * |
1235 | * @inode1: the inode that is released first | 1232 | * @inode1: the inode that is released first |
1236 | * @inode2: the inode that is released second | 1233 | * @inode2: the inode that is released second |
1237 | * | 1234 | * |
1238 | */ | 1235 | */ |
1239 | 1236 | ||
1240 | static void | 1237 | void |
1241 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) | 1238 | ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) |
1242 | { | 1239 | { |
1243 | mutex_unlock(&inode1->i_mutex); | 1240 | mutex_unlock(&inode1->i_mutex); |
1244 | mutex_unlock(&inode2->i_mutex); | 1241 | mutex_unlock(&inode2->i_mutex); |
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1333 | return -EINVAL; | 1330 | return -EINVAL; |
1334 | } | 1331 | } |
1335 | /* Protect orig and donor inodes against a truncate */ | 1332 | /* Protect orig and donor inodes against a truncate */ |
1336 | mext_inode_double_lock(orig_inode, donor_inode); | 1333 | ext4_inode_double_lock(orig_inode, donor_inode); |
1337 | 1334 | ||
1338 | /* Wait for all existing dio workers */ | 1335 | /* Wait for all existing dio workers */ |
1339 | ext4_inode_block_unlocked_dio(orig_inode); | 1336 | ext4_inode_block_unlocked_dio(orig_inode); |
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1342 | inode_dio_wait(donor_inode); | 1339 | inode_dio_wait(donor_inode); |
1343 | 1340 | ||
1344 | /* Protect extent tree against block allocations via delalloc */ | 1341 | /* Protect extent tree against block allocations via delalloc */ |
1345 | double_down_write_data_sem(orig_inode, donor_inode); | 1342 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
1346 | /* Check the filesystem environment whether move_extent can be done */ | 1343 | /* Check the filesystem environment whether move_extent can be done */ |
1347 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, | 1344 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, |
1348 | donor_start, &len); | 1345 | donor_start, &len); |
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1466 | * b. racing with ->readpage, ->write_begin, and ext4_get_block | 1463 | * b. racing with ->readpage, ->write_begin, and ext4_get_block |
1467 | * in move_extent_per_page | 1464 | * in move_extent_per_page |
1468 | */ | 1465 | */ |
1469 | double_up_write_data_sem(orig_inode, donor_inode); | 1466 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1470 | 1467 | ||
1471 | while (orig_page_offset <= seq_end_page) { | 1468 | while (orig_page_offset <= seq_end_page) { |
1472 | 1469 | ||
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1500 | block_len_in_page = rest_blocks; | 1497 | block_len_in_page = rest_blocks; |
1501 | } | 1498 | } |
1502 | 1499 | ||
1503 | double_down_write_data_sem(orig_inode, donor_inode); | 1500 | ext4_double_down_write_data_sem(orig_inode, donor_inode); |
1504 | if (ret < 0) | 1501 | if (ret < 0) |
1505 | break; | 1502 | break; |
1506 | 1503 | ||
@@ -1538,10 +1535,10 @@ out: | |||
1538 | ext4_ext_drop_refs(holecheck_path); | 1535 | ext4_ext_drop_refs(holecheck_path); |
1539 | kfree(holecheck_path); | 1536 | kfree(holecheck_path); |
1540 | } | 1537 | } |
1541 | double_up_write_data_sem(orig_inode, donor_inode); | 1538 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
1542 | ext4_inode_resume_unlocked_dio(orig_inode); | 1539 | ext4_inode_resume_unlocked_dio(orig_inode); |
1543 | ext4_inode_resume_unlocked_dio(donor_inode); | 1540 | ext4_inode_resume_unlocked_dio(donor_inode); |
1544 | mext_inode_double_unlock(orig_inode, donor_inode); | 1541 | ext4_inode_double_unlock(orig_inode, donor_inode); |
1545 | 1542 | ||
1546 | return ret; | 1543 | return ret; |
1547 | } | 1544 | } |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3825d6aa8336..6653fc35ecb7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, | |||
416 | { | 416 | { |
417 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 417 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
418 | struct ext4_inode_info *ei = EXT4_I(inode); | 418 | struct ext4_inode_info *ei = EXT4_I(inode); |
419 | __u32 csum, old_csum; | 419 | __u32 csum; |
420 | __le32 save_csum; | ||
420 | int size; | 421 | int size; |
421 | 422 | ||
422 | size = count_offset + (count * sizeof(struct dx_entry)); | 423 | size = count_offset + (count * sizeof(struct dx_entry)); |
423 | old_csum = t->dt_checksum; | 424 | save_csum = t->dt_checksum; |
424 | t->dt_checksum = 0; | 425 | t->dt_checksum = 0; |
425 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); | 426 | csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); |
426 | csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); | 427 | csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); |
427 | t->dt_checksum = old_csum; | 428 | t->dt_checksum = save_csum; |
428 | 429 | ||
429 | return cpu_to_le32(csum); | 430 | return cpu_to_le32(csum); |
430 | } | 431 | } |
@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | |||
971 | hinfo.hash_version += | 972 | hinfo.hash_version += |
972 | EXT4_SB(dir->i_sb)->s_hash_unsigned; | 973 | EXT4_SB(dir->i_sb)->s_hash_unsigned; |
973 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | 974 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; |
975 | if (ext4_has_inline_data(dir)) { | ||
976 | int has_inline_data = 1; | ||
977 | count = htree_inlinedir_to_tree(dir_file, dir, 0, | ||
978 | &hinfo, start_hash, | ||
979 | start_minor_hash, | ||
980 | &has_inline_data); | ||
981 | if (has_inline_data) { | ||
982 | *next_hash = ~0; | ||
983 | return count; | ||
984 | } | ||
985 | } | ||
974 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, | 986 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, |
975 | start_hash, start_minor_hash); | 987 | start_hash, start_minor_hash); |
976 | *next_hash = ~0; | 988 | *next_hash = ~0; |
@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child) | |||
1455 | return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); | 1467 | return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); |
1456 | } | 1468 | } |
1457 | 1469 | ||
1458 | #define S_SHIFT 12 | ||
1459 | static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { | ||
1460 | [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, | ||
1461 | [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, | ||
1462 | [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, | ||
1463 | [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, | ||
1464 | [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, | ||
1465 | [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, | ||
1466 | [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, | ||
1467 | }; | ||
1468 | |||
1469 | static inline void ext4_set_de_type(struct super_block *sb, | ||
1470 | struct ext4_dir_entry_2 *de, | ||
1471 | umode_t mode) { | ||
1472 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) | ||
1473 | de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; | ||
1474 | } | ||
1475 | |||
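The lookup table dropped from namei.c here maps the S_IFMT bits of a mode to a one-byte on-disk file type by shifting them down 12 bits, which leaves a compact 0..14 index (presumably it is relocated so the inline-directory code added in this series can share it). The trick in stand-alone form (the type codes are illustrative, not the EXT4_FT_* values):

#include <stdio.h>
#include <sys/stat.h>

#define S_SHIFT 12

/* The S_IFMT bits shifted down 12 give a compact 0..14 index, so a
 * 15-entry table maps a mode to a one-byte file-type code. */
static const unsigned char type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG  >> S_SHIFT] = 1,
        [S_IFDIR  >> S_SHIFT] = 2,
        [S_IFCHR  >> S_SHIFT] = 3,
        [S_IFBLK  >> S_SHIFT] = 4,
        [S_IFIFO  >> S_SHIFT] = 5,
        [S_IFSOCK >> S_SHIFT] = 6,
        [S_IFLNK  >> S_SHIFT] = 7,
};

int main(void)
{
        mode_t mode = S_IFDIR | 0755;

        printf("type=%d\n", type_by_mode[(mode & S_IFMT) >> S_SHIFT]);
        return 0;
}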
1476 | /* | 1470 | /* |
1477 | * Move count entries from end of map between two memory locations. | 1471 | * Move count entries from end of map between two memory locations. |
1478 | * Returns pointer to last entry moved. | 1472 | * Returns pointer to last entry moved. |
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
2251 | dquot_initialize(dir); | 2245 | dquot_initialize(dir); |
2252 | 2246 | ||
2253 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2247 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2254 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2248 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); |
2255 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
2256 | retry: | 2249 | retry: |
2257 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, | 2250 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, |
2258 | NULL, EXT4_HT_DIR, credits); | 2251 | NULL, EXT4_HT_DIR, credits); |
@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, | |||
2286 | dquot_initialize(dir); | 2279 | dquot_initialize(dir); |
2287 | 2280 | ||
2288 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2281 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2289 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2282 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); |
2290 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
2291 | retry: | 2283 | retry: |
2292 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, | 2284 | inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, |
2293 | NULL, EXT4_HT_DIR, credits); | 2285 | NULL, EXT4_HT_DIR, credits); |
@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
2396 | dquot_initialize(dir); | 2388 | dquot_initialize(dir); |
2397 | 2389 | ||
2398 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2390 | credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2399 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2391 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); |
2400 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); | ||
2401 | retry: | 2392 | retry: |
2402 | inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, | 2393 | inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, |
2403 | &dentry->d_name, | 2394 | &dentry->d_name, |
@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir, | |||
2826 | * quota blocks, sb is already counted in previous macros). | 2817 | * quota blocks, sb is already counted in previous macros). |
2827 | */ | 2818 | */ |
2828 | credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + | 2819 | credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
2829 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | 2820 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; |
2830 | EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | ||
2831 | } | 2821 | } |
2832 | retry: | 2822 | retry: |
2833 | inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, | 2823 | inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 047a6de04a0a..5929cd0baa20 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -29,25 +29,19 @@ | |||
29 | #include "xattr.h" | 29 | #include "xattr.h" |
30 | #include "acl.h" | 30 | #include "acl.h" |
31 | 31 | ||
32 | static struct kmem_cache *io_page_cachep, *io_end_cachep; | 32 | static struct kmem_cache *io_end_cachep; |
33 | 33 | ||
34 | int __init ext4_init_pageio(void) | 34 | int __init ext4_init_pageio(void) |
35 | { | 35 | { |
36 | io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); | ||
37 | if (io_page_cachep == NULL) | ||
38 | return -ENOMEM; | ||
39 | io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); | 36 | io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); |
40 | if (io_end_cachep == NULL) { | 37 | if (io_end_cachep == NULL) |
41 | kmem_cache_destroy(io_page_cachep); | ||
42 | return -ENOMEM; | 38 | return -ENOMEM; |
43 | } | ||
44 | return 0; | 39 | return 0; |
45 | } | 40 | } |
46 | 41 | ||
47 | void ext4_exit_pageio(void) | 42 | void ext4_exit_pageio(void) |
48 | { | 43 | { |
49 | kmem_cache_destroy(io_end_cachep); | 44 | kmem_cache_destroy(io_end_cachep); |
50 | kmem_cache_destroy(io_page_cachep); | ||
51 | } | 45 | } |
52 | 46 | ||
53 | /* | 47 | /* |
@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode) | |||
67 | cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); | 61 | cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); |
68 | } | 62 | } |
69 | 63 | ||
70 | static void put_io_page(struct ext4_io_page *io_page) | 64 | static void ext4_release_io_end(ext4_io_end_t *io_end) |
71 | { | 65 | { |
72 | if (atomic_dec_and_test(&io_page->p_count)) { | 66 | BUG_ON(!list_empty(&io_end->list)); |
73 | end_page_writeback(io_page->p_page); | 67 | BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); |
74 | put_page(io_page->p_page); | 68 | |
75 | kmem_cache_free(io_page_cachep, io_page); | 69 | if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) |
76 | } | 70 | wake_up_all(ext4_ioend_wq(io_end->inode)); |
71 | if (io_end->flag & EXT4_IO_END_DIRECT) | ||
72 | inode_dio_done(io_end->inode); | ||
73 | if (io_end->iocb) | ||
74 | aio_complete(io_end->iocb, io_end->result, 0); | ||
75 | kmem_cache_free(io_end_cachep, io_end); | ||
77 | } | 76 | } |
78 | 77 | ||
79 | void ext4_free_io_end(ext4_io_end_t *io) | 78 | static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) |
80 | { | 79 | { |
81 | int i; | 80 | struct inode *inode = io_end->inode; |
82 | |||
83 | BUG_ON(!io); | ||
84 | BUG_ON(!list_empty(&io->list)); | ||
85 | BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); | ||
86 | 81 | ||
87 | for (i = 0; i < io->num_io_pages; i++) | 82 | io_end->flag &= ~EXT4_IO_END_UNWRITTEN; |
88 | put_io_page(io->pages[i]); | 83 | /* Wake up anyone waiting on unwritten extent conversion */ |
89 | io->num_io_pages = 0; | 84 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) |
90 | if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) | 85 | wake_up_all(ext4_ioend_wq(inode)); |
91 | wake_up_all(ext4_ioend_wq(io->inode)); | ||
92 | kmem_cache_free(io_end_cachep, io); | ||
93 | } | 86 | } |
94 | 87 | ||
95 | /* check a range of space and convert unwritten extents to written. */ | 88 | /* check a range of space and convert unwritten extents to written. */ |
@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io) | |||
112 | "(inode %lu, offset %llu, size %zd, error %d)", | 105 | "(inode %lu, offset %llu, size %zd, error %d)", |
113 | inode->i_ino, offset, size, ret); | 106 | inode->i_ino, offset, size, ret); |
114 | } | 107 | } |
115 | /* Wake up anyone waiting on unwritten extent conversion */ | 108 | ext4_clear_io_unwritten_flag(io); |
116 | if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) | 109 | ext4_release_io_end(io); |
117 | wake_up_all(ext4_ioend_wq(inode)); | ||
118 | if (io->flag & EXT4_IO_END_DIRECT) | ||
119 | inode_dio_done(inode); | ||
120 | if (io->iocb) | ||
121 | aio_complete(io->iocb, io->result, 0); | ||
122 | return ret; | 110 | return ret; |
123 | } | 111 | } |
124 | 112 | ||
@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode) | |||
149 | } | 137 | } |
150 | 138 | ||
151 | /* Add the io_end to per-inode completed end_io list. */ | 139 | /* Add the io_end to per-inode completed end_io list. */ |
152 | void ext4_add_complete_io(ext4_io_end_t *io_end) | 140 | static void ext4_add_complete_io(ext4_io_end_t *io_end) |
153 | { | 141 | { |
154 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); | 142 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); |
155 | struct workqueue_struct *wq; | 143 | struct workqueue_struct *wq; |
@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) | |||
186 | err = ext4_end_io(io); | 174 | err = ext4_end_io(io); |
187 | if (unlikely(!ret && err)) | 175 | if (unlikely(!ret && err)) |
188 | ret = err; | 176 | ret = err; |
189 | io->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
190 | ext4_free_io_end(io); | ||
191 | } | 177 | } |
192 | return ret; | 178 | return ret; |
193 | } | 179 | } |
@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | |||
219 | atomic_inc(&EXT4_I(inode)->i_ioend_count); | 205 | atomic_inc(&EXT4_I(inode)->i_ioend_count); |
220 | io->inode = inode; | 206 | io->inode = inode; |
221 | INIT_LIST_HEAD(&io->list); | 207 | INIT_LIST_HEAD(&io->list); |
208 | atomic_set(&io->count, 1); | ||
222 | } | 209 | } |
223 | return io; | 210 | return io; |
224 | } | 211 | } |
225 | 212 | ||
213 | void ext4_put_io_end_defer(ext4_io_end_t *io_end) | ||
214 | { | ||
215 | if (atomic_dec_and_test(&io_end->count)) { | ||
216 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { | ||
217 | ext4_release_io_end(io_end); | ||
218 | return; | ||
219 | } | ||
220 | ext4_add_complete_io(io_end); | ||
221 | } | ||
222 | } | ||
223 | |||
224 | int ext4_put_io_end(ext4_io_end_t *io_end) | ||
225 | { | ||
226 | int err = 0; | ||
227 | |||
228 | if (atomic_dec_and_test(&io_end->count)) { | ||
229 | if (io_end->flag & EXT4_IO_END_UNWRITTEN) { | ||
230 | err = ext4_convert_unwritten_extents(io_end->inode, | ||
231 | io_end->offset, io_end->size); | ||
232 | ext4_clear_io_unwritten_flag(io_end); | ||
233 | } | ||
234 | ext4_release_io_end(io_end); | ||
235 | } | ||
236 | return err; | ||
237 | } | ||
238 | |||
239 | ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) | ||
240 | { | ||
241 | atomic_inc(&io_end->count); | ||
242 | return io_end; | ||
243 | } | ||
244 | |||
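The new get/put pair turns an io_end into a reference-counted object: the submission path holds one reference, each bio takes another via ext4_get_io_end(), and only the final put releases or completes it. A minimal C11 sketch of the same lifetime rule (names are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Reference-counted completion object in the style of the io_end
 * get/put pair above: the submitter holds one reference, every bio
 * takes another, and only the final put completes and frees. */
struct io_end_like {
        atomic_int count;
};

static struct io_end_like *io_get(struct io_end_like *io)
{
        atomic_fetch_add(&io->count, 1);
        return io;
}

static void io_put(struct io_end_like *io)
{
        if (atomic_fetch_sub(&io->count, 1) == 1) {
                printf("last reference dropped: completing\n");
                free(io);
        }
}

int main(void)
{
        struct io_end_like *io = malloc(sizeof(*io));

        atomic_init(&io->count, 1);     /* submitter's reference */
        io_get(io);                     /* a bio takes a reference */
        io_put(io);                     /* the bio completes */
        io_put(io);                     /* submitter drops its own */
        return 0;
}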
226 | /* | 245 | /* |
227 | * Print a buffer I/O error compatible with fs/buffer.c. This | 246 |
228 | * provides compatibility with dmesg scrapers that look for a specific | 247 | * provides compatibility with dmesg scrapers that look for a specific |
@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
243 | ext4_io_end_t *io_end = bio->bi_private; | 262 | ext4_io_end_t *io_end = bio->bi_private; |
244 | struct inode *inode; | 263 | struct inode *inode; |
245 | int i; | 264 | int i; |
265 | int blocksize; | ||
246 | sector_t bi_sector = bio->bi_sector; | 266 | sector_t bi_sector = bio->bi_sector; |
247 | 267 | ||
248 | BUG_ON(!io_end); | 268 | BUG_ON(!io_end); |
269 | inode = io_end->inode; | ||
270 | blocksize = 1 << inode->i_blkbits; | ||
249 | bio->bi_private = NULL; | 271 | bio->bi_private = NULL; |
250 | bio->bi_end_io = NULL; | 272 | bio->bi_end_io = NULL; |
251 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 273 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
252 | error = 0; | 274 | error = 0; |
253 | bio_put(bio); | 275 | for (i = 0; i < bio->bi_vcnt; i++) { |
254 | 276 | struct bio_vec *bvec = &bio->bi_io_vec[i]; | |
255 | for (i = 0; i < io_end->num_io_pages; i++) { | 277 | struct page *page = bvec->bv_page; |
256 | struct page *page = io_end->pages[i]->p_page; | ||
257 | struct buffer_head *bh, *head; | 278 | struct buffer_head *bh, *head; |
258 | loff_t offset; | 279 | unsigned bio_start = bvec->bv_offset; |
259 | loff_t io_end_offset; | 280 | unsigned bio_end = bio_start + bvec->bv_len; |
281 | unsigned under_io = 0; | ||
282 | unsigned long flags; | ||
283 | |||
284 | if (!page) | ||
285 | continue; | ||
260 | 286 | ||
261 | if (error) { | 287 | if (error) { |
262 | SetPageError(page); | 288 | SetPageError(page); |
263 | set_bit(AS_EIO, &page->mapping->flags); | 289 | set_bit(AS_EIO, &page->mapping->flags); |
264 | head = page_buffers(page); | ||
265 | BUG_ON(!head); | ||
266 | |||
267 | io_end_offset = io_end->offset + io_end->size; | ||
268 | |||
269 | offset = (sector_t) page->index << PAGE_CACHE_SHIFT; | ||
270 | bh = head; | ||
271 | do { | ||
272 | if ((offset >= io_end->offset) && | ||
273 | (offset+bh->b_size <= io_end_offset)) | ||
274 | buffer_io_error(bh); | ||
275 | |||
276 | offset += bh->b_size; | ||
277 | bh = bh->b_this_page; | ||
278 | } while (bh != head); | ||
279 | } | 290 | } |
280 | 291 | bh = head = page_buffers(page); | |
281 | put_io_page(io_end->pages[i]); | 292 | /* |
293 | * We check all buffers in the page under BH_Uptodate_Lock | ||
294 | * to avoid races with other end io clearing async_write flags | ||
295 | */ | ||
296 | local_irq_save(flags); | ||
297 | bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | ||
298 | do { | ||
299 | if (bh_offset(bh) < bio_start || | ||
300 | bh_offset(bh) + blocksize > bio_end) { | ||
301 | if (buffer_async_write(bh)) | ||
302 | under_io++; | ||
303 | continue; | ||
304 | } | ||
305 | clear_buffer_async_write(bh); | ||
306 | if (error) | ||
307 | buffer_io_error(bh); | ||
308 | } while ((bh = bh->b_this_page) != head); | ||
309 | bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
310 | local_irq_restore(flags); | ||
311 | if (!under_io) | ||
312 | end_page_writeback(page); | ||
282 | } | 313 | } |
283 | io_end->num_io_pages = 0; | 314 | bio_put(bio); |
284 | inode = io_end->inode; | ||
285 | 315 | ||
286 | if (error) { | 316 | if (error) { |
287 | io_end->flag |= EXT4_IO_END_ERROR; | 317 | io_end->flag |= EXT4_IO_END_ERROR; |
@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
294 | bi_sector >> (inode->i_blkbits - 9)); | 324 | bi_sector >> (inode->i_blkbits - 9)); |
295 | } | 325 | } |
296 | 326 | ||
297 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 327 | ext4_put_io_end_defer(io_end); |
298 | ext4_free_io_end(io_end); | ||
299 | return; | ||
300 | } | ||
301 | |||
302 | ext4_add_complete_io(io_end); | ||
303 | } | 328 | } |
304 | 329 | ||
305 | void ext4_io_submit(struct ext4_io_submit *io) | 330 | void ext4_io_submit(struct ext4_io_submit *io) |
@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io) | |||
313 | bio_put(io->io_bio); | 338 | bio_put(io->io_bio); |
314 | } | 339 | } |
315 | io->io_bio = NULL; | 340 | io->io_bio = NULL; |
316 | io->io_op = 0; | 341 | } |
342 | |||
343 | void ext4_io_submit_init(struct ext4_io_submit *io, | ||
344 | struct writeback_control *wbc) | ||
345 | { | ||
346 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
347 | io->io_bio = NULL; | ||
317 | io->io_end = NULL; | 348 | io->io_end = NULL; |
318 | } | 349 | } |
319 | 350 | ||
320 | static int io_submit_init(struct ext4_io_submit *io, | 351 | static int io_submit_init_bio(struct ext4_io_submit *io, |
321 | struct inode *inode, | 352 | struct buffer_head *bh) |
322 | struct writeback_control *wbc, | ||
323 | struct buffer_head *bh) | ||
324 | { | 353 | { |
325 | ext4_io_end_t *io_end; | ||
326 | struct page *page = bh->b_page; | ||
327 | int nvecs = bio_get_nr_vecs(bh->b_bdev); | 354 | int nvecs = bio_get_nr_vecs(bh->b_bdev); |
328 | struct bio *bio; | 355 | struct bio *bio; |
329 | 356 | ||
330 | io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
331 | if (!io_end) | ||
332 | return -ENOMEM; | ||
333 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); | 357 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
334 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 358 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
335 | bio->bi_bdev = bh->b_bdev; | 359 | bio->bi_bdev = bh->b_bdev; |
336 | bio->bi_private = io->io_end = io_end; | ||
337 | bio->bi_end_io = ext4_end_bio; | 360 | bio->bi_end_io = ext4_end_bio; |
338 | 361 | bio->bi_private = ext4_get_io_end(io->io_end); | |
339 | io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); | 362 | if (!io->io_end->size) |
340 | 363 | io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) | |
364 | + bh_offset(bh); | ||
341 | io->io_bio = bio; | 365 | io->io_bio = bio; |
342 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | ||
343 | io->io_next_block = bh->b_blocknr; | 366 | io->io_next_block = bh->b_blocknr; |
344 | return 0; | 367 | return 0; |
345 | } | 368 | } |
346 | 369 | ||
347 | static int io_submit_add_bh(struct ext4_io_submit *io, | 370 | static int io_submit_add_bh(struct ext4_io_submit *io, |
348 | struct ext4_io_page *io_page, | ||
349 | struct inode *inode, | 371 | struct inode *inode, |
350 | struct writeback_control *wbc, | ||
351 | struct buffer_head *bh) | 372 | struct buffer_head *bh) |
352 | { | 373 | { |
353 | ext4_io_end_t *io_end; | 374 | ext4_io_end_t *io_end; |
354 | int ret; | 375 | int ret; |
355 | 376 | ||
356 | if (buffer_new(bh)) { | ||
357 | clear_buffer_new(bh); | ||
358 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
359 | } | ||
360 | |||
361 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { | 377 | if (io->io_bio && bh->b_blocknr != io->io_next_block) { |
362 | submit_and_retry: | 378 | submit_and_retry: |
363 | ext4_io_submit(io); | 379 | ext4_io_submit(io); |
364 | } | 380 | } |
365 | if (io->io_bio == NULL) { | 381 | if (io->io_bio == NULL) { |
366 | ret = io_submit_init(io, inode, wbc, bh); | 382 | ret = io_submit_init_bio(io, bh); |
367 | if (ret) | 383 | if (ret) |
368 | return ret; | 384 | return ret; |
369 | } | 385 | } |
370 | io_end = io->io_end; | ||
371 | if ((io_end->num_io_pages >= MAX_IO_PAGES) && | ||
372 | (io_end->pages[io_end->num_io_pages-1] != io_page)) | ||
373 | goto submit_and_retry; | ||
374 | if (buffer_uninit(bh)) | ||
375 | ext4_set_io_unwritten_flag(inode, io_end); | ||
376 | io->io_end->size += bh->b_size; | ||
377 | io->io_next_block++; | ||
378 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); | 386 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); |
379 | if (ret != bh->b_size) | 387 | if (ret != bh->b_size) |
380 | goto submit_and_retry; | 388 | goto submit_and_retry; |
381 | if ((io_end->num_io_pages == 0) || | 389 | io_end = io->io_end; |
382 | (io_end->pages[io_end->num_io_pages-1] != io_page)) { | 390 | if (test_clear_buffer_uninit(bh)) |
383 | io_end->pages[io_end->num_io_pages++] = io_page; | 391 | ext4_set_io_unwritten_flag(inode, io_end); |
384 | atomic_inc(&io_page->p_count); | 392 | io_end->size += bh->b_size; |
385 | } | 393 | io->io_next_block++; |
386 | return 0; | 394 | return 0; |
387 | } | 395 | } |
388 | 396 | ||
@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
392 | struct writeback_control *wbc) | 400 | struct writeback_control *wbc) |
393 | { | 401 | { |
394 | struct inode *inode = page->mapping->host; | 402 | struct inode *inode = page->mapping->host; |
395 | unsigned block_start, block_end, blocksize; | 403 | unsigned block_start, blocksize; |
396 | struct ext4_io_page *io_page; | ||
397 | struct buffer_head *bh, *head; | 404 | struct buffer_head *bh, *head; |
398 | int ret = 0; | 405 | int ret = 0; |
406 | int nr_submitted = 0; | ||
399 | 407 | ||
400 | blocksize = 1 << inode->i_blkbits; | 408 | blocksize = 1 << inode->i_blkbits; |
401 | 409 | ||
402 | BUG_ON(!PageLocked(page)); | 410 | BUG_ON(!PageLocked(page)); |
403 | BUG_ON(PageWriteback(page)); | 411 | BUG_ON(PageWriteback(page)); |
404 | 412 | ||
405 | io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); | ||
406 | if (!io_page) { | ||
407 | redirty_page_for_writepage(wbc, page); | ||
408 | unlock_page(page); | ||
409 | return -ENOMEM; | ||
410 | } | ||
411 | io_page->p_page = page; | ||
412 | atomic_set(&io_page->p_count, 1); | ||
413 | get_page(page); | ||
414 | set_page_writeback(page); | 413 | set_page_writeback(page); |
415 | ClearPageError(page); | 414 | ClearPageError(page); |
416 | 415 | ||
417 | for (bh = head = page_buffers(page), block_start = 0; | 416 | /* |
418 | bh != head || !block_start; | 417 | * In the first loop we prepare and mark buffers to submit. We have to |
419 | block_start = block_end, bh = bh->b_this_page) { | 418 | * mark all buffers in the page before submitting so that |
420 | 419 | * end_page_writeback() cannot be called from ext4_bio_end_io() when IO | |
421 | block_end = block_start + blocksize; | 420 | * on the first buffer finishes and we are still working on submitting |
421 | * the second buffer. | ||
422 | */ | ||
423 | bh = head = page_buffers(page); | ||
424 | do { | ||
425 | block_start = bh_offset(bh); | ||
422 | if (block_start >= len) { | 426 | if (block_start >= len) { |
423 | /* | 427 | /* |
424 | * Comments copied from block_write_full_page_endio: | 428 | * Comments copied from block_write_full_page_endio: |
@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
431 | * mapped, and writes to that region are not written | 435 | * mapped, and writes to that region are not written |
432 | * out to the file." | 436 | * out to the file." |
433 | */ | 437 | */ |
434 | zero_user_segment(page, block_start, block_end); | 438 | zero_user_segment(page, block_start, |
439 | block_start + blocksize); | ||
435 | clear_buffer_dirty(bh); | 440 | clear_buffer_dirty(bh); |
436 | set_buffer_uptodate(bh); | 441 | set_buffer_uptodate(bh); |
437 | continue; | 442 | continue; |
@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
445 | ext4_io_submit(io); | 450 | ext4_io_submit(io); |
446 | continue; | 451 | continue; |
447 | } | 452 | } |
448 | ret = io_submit_add_bh(io, io_page, inode, wbc, bh); | 453 | if (buffer_new(bh)) { |
454 | clear_buffer_new(bh); | ||
455 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
456 | } | ||
457 | set_buffer_async_write(bh); | ||
458 | } while ((bh = bh->b_this_page) != head); | ||
459 | |||
460 | /* Now submit buffers to write */ | ||
461 | bh = head = page_buffers(page); | ||
462 | do { | ||
463 | if (!buffer_async_write(bh)) | ||
464 | continue; | ||
465 | ret = io_submit_add_bh(io, inode, bh); | ||
449 | if (ret) { | 466 | if (ret) { |
450 | /* | 467 | /* |
451 | * We only get here on ENOMEM. Not much else | 468 | * We only get here on ENOMEM. Not much else |
@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
455 | redirty_page_for_writepage(wbc, page); | 472 | redirty_page_for_writepage(wbc, page); |
456 | break; | 473 | break; |
457 | } | 474 | } |
475 | nr_submitted++; | ||
458 | clear_buffer_dirty(bh); | 476 | clear_buffer_dirty(bh); |
477 | } while ((bh = bh->b_this_page) != head); | ||
478 | |||
479 | /* Error stopped previous loop? Clean up buffers... */ | ||
480 | if (ret) { | ||
481 | do { | ||
482 | clear_buffer_async_write(bh); | ||
483 | bh = bh->b_this_page; | ||
484 | } while (bh != head); | ||
459 | } | 485 | } |
460 | unlock_page(page); | 486 | unlock_page(page); |
461 | /* | 487 | /* Nothing submitted - we have to end page writeback */ |
462 | * If the page was truncated before we could do the writeback, | 488 | if (!nr_submitted) |
463 | * or we had a memory allocation error while trying to write | 489 | end_page_writeback(page); |
464 | * the first buffer head, we won't have submitted any pages for | ||
465 | * I/O. In that case we need to make sure we've cleared the | ||
466 | * PageWriteback bit from the page to prevent the system from | ||
467 | * wedging later on. | ||
468 | */ | ||
469 | put_io_page(io_page); | ||
470 | return ret; | 490 | return ret; |
471 | } | 491 | } |
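
The two-loop structure above is the point of this hunk: every dirty buffer on the page is flagged with BH_Async_Write before any bio is submitted, so a fast completion on the first buffer cannot end page writeback while later buffers are still being queued. A minimal user-space model of the pattern (struct buf, submit_buf() and the flag are stand-ins for the kernel's buffer_head machinery, not its API):

    #include <stdbool.h>

    struct buf {
        bool async_write;          /* models BH_Async_Write */
        bool dirty;
        struct buf *next;          /* circular list, models b_this_page */
    };

    /* Stand-in for io_submit_add_bh(); pretend it may fail with ENOMEM. */
    static int submit_buf(struct buf *b) { (void)b; return 0; }

    /* Returns the number of buffers submitted, or a negative error. */
    static int write_page(struct buf *head)
    {
        struct buf *b = head;
        int ret = 0, nr_submitted = 0;

        /* Pass 1: mark every buffer before any I/O is issued, so an early
         * completion cannot see an unmarked buffer and end page writeback. */
        do {
            if (b->dirty)
                b->async_write = true;
        } while ((b = b->next) != head);

        /* Pass 2: submit the marked buffers. */
        b = head;
        do {
            if (!b->async_write)
                continue;          /* in a do/while, jumps to the condition */
            ret = submit_buf(b);
            if (ret)
                break;             /* stop on ENOMEM, clean up below */
            nr_submitted++;
            b->dirty = false;
        } while ((b = b->next) != head);

        /* Error stopped the loop: unmark the buffers we never submitted. */
        if (ret) {
            do {
                b->async_write = false;
                b = b->next;
            } while (b != head);
            return ret;
        }
        return nr_submitted;       /* 0 => caller must end page writeback */
    }

Note how the error path only unmarks buffers from the failure point around to the head, mirroring the cleanup loop in the hunk; buffers already submitted get their flag cleared by I/O completion instead.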
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c169477a62c9..b27c96d01965 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -272,7 +272,7 @@ next_group: | |||
272 | if (start_blk >= last_blk) | 272 | if (start_blk >= last_blk) |
273 | goto next_group; | 273 | goto next_group; |
274 | group_data[bb_index].block_bitmap = start_blk++; | 274 | group_data[bb_index].block_bitmap = start_blk++; |
275 | ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); | 275 | group = ext4_get_group_number(sb, start_blk - 1); |
276 | group -= group_data[0].group; | 276 | group -= group_data[0].group; |
277 | group_data[group].free_blocks_count--; | 277 | group_data[group].free_blocks_count--; |
278 | if (flexbg_size > 1) | 278 | if (flexbg_size > 1) |
@@ -284,7 +284,7 @@ next_group: | |||
284 | if (start_blk >= last_blk) | 284 | if (start_blk >= last_blk) |
285 | goto next_group; | 285 | goto next_group; |
286 | group_data[ib_index].inode_bitmap = start_blk++; | 286 | group_data[ib_index].inode_bitmap = start_blk++; |
287 | ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); | 287 | group = ext4_get_group_number(sb, start_blk - 1); |
288 | group -= group_data[0].group; | 288 | group -= group_data[0].group; |
289 | group_data[group].free_blocks_count--; | 289 | group_data[group].free_blocks_count--; |
290 | if (flexbg_size > 1) | 290 | if (flexbg_size > 1) |
@@ -296,7 +296,7 @@ next_group: | |||
296 | if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) | 296 | if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) |
297 | goto next_group; | 297 | goto next_group; |
298 | group_data[it_index].inode_table = start_blk; | 298 | group_data[it_index].inode_table = start_blk; |
299 | ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); | 299 | group = ext4_get_group_number(sb, start_blk - 1); |
300 | group -= group_data[0].group; | 300 | group -= group_data[0].group; |
301 | group_data[group].free_blocks_count -= | 301 | group_data[group].free_blocks_count -= |
302 | EXT4_SB(sb)->s_itb_per_group; | 302 | EXT4_SB(sb)->s_itb_per_group; |
@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, | |||
392 | ext4_group_t group; | 392 | ext4_group_t group; |
393 | int err; | 393 | int err; |
394 | 394 | ||
395 | ext4_get_group_no_and_offset(sb, block, &group, NULL); | 395 | group = ext4_get_group_number(sb, block); |
396 | start = ext4_group_first_block_no(sb, group); | 396 | start = ext4_group_first_block_no(sb, group); |
397 | group -= flex_gd->groups[0].group; | 397 | group -= flex_gd->groups[0].group; |
398 | 398 | ||
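
ext4_get_group_number() is a convenience wrapper for the many callers that passed NULL for the offset to ext4_get_group_no_and_offset(). Assuming the usual ext4 layout, the arithmetic reduces to one division; a sketch only, not the kernel implementation, which also gains a shift-based fast path when the standard group size of blocksize * 8 is in use (set up by the STD_GROUP_SIZE hunk in super.c further down):

    #include <stdint.h>

    /* The group containing a block, assuming groups of blocks_per_group
     * blocks starting at first_data_block. */
    static uint32_t group_number(uint64_t block, uint64_t first_data_block,
                                 uint32_t blocks_per_group)
    {
        return (uint32_t)((block - first_data_block) / blocks_per_group);
    }

For example, with first_data_block = 0 and blocks_per_group = 32768 (the 4 KiB-blocksize default), block 100000 lands in group 3.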
@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb, | |||
1341 | 1341 | ||
1342 | /* Update the global fs size fields */ | 1342 | /* Update the global fs size fields */ |
1343 | sbi->s_groups_count += flex_gd->count; | 1343 | sbi->s_groups_count += flex_gd->count; |
1344 | sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, | ||
1345 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | ||
1344 | 1346 | ||
1345 | /* Update the reserved block counts only once the new group is | 1347 | /* Update the reserved block counts only once the new group is |
1346 | * active. */ | 1348 | * active. */ |
@@ -1879,7 +1881,11 @@ retry: | |||
1879 | /* Nothing to do */ | 1881 | /* Nothing to do */ |
1880 | return 0; | 1882 | return 0; |
1881 | 1883 | ||
1882 | ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); | 1884 | n_group = ext4_get_group_number(sb, n_blocks_count - 1); |
1885 | if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { | ||
1886 | ext4_warning(sb, "resize would cause inodes_count overflow"); | ||
1887 | return -EINVAL; | ||
1888 | } | ||
1883 | ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); | 1889 | ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); |
1884 | 1890 | ||
1885 | n_desc_blocks = num_desc_blocks(sb, n_group + 1); | 1891 | n_desc_blocks = num_desc_blocks(sb, n_group + 1); |
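
The new guard refuses any resize whose final group would push the 32-bit s_inodes_count past 0xFFFFFFFF; dividing the limit by EXT4_INODES_PER_GROUP() keeps the comparison itself overflow-free. A standalone restatement of the check:

    #include <stdbool.h>
    #include <stdint.h>

    /* True if a filesystem ending at group last_group would need more
     * inodes than a 32-bit s_inodes_count can hold. Dividing instead of
     * multiplying keeps the check itself from overflowing. */
    static bool inodes_count_overflows(uint32_t last_group,
                                       uint32_t inodes_per_group)
    {
        return last_group > UINT32_MAX / inodes_per_group;
    }

With the common 8192 inodes per group, this rejects any resize that would end past group 524287 (UINT32_MAX / 8192).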
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5d6d53578124..dbc7c090c13a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); | |||
81 | static void ext4_destroy_lazyinit_thread(void); | 81 | static void ext4_destroy_lazyinit_thread(void); |
82 | static void ext4_unregister_li_request(struct super_block *sb); | 82 | static void ext4_unregister_li_request(struct super_block *sb); |
83 | static void ext4_clear_request_list(void); | 83 | static void ext4_clear_request_list(void); |
84 | static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); | ||
84 | 85 | ||
85 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) | 86 | #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) |
86 | static struct file_system_type ext2_fs_type = { | 87 | static struct file_system_type ext2_fs_type = { |
@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) | |||
353 | struct super_block *sb = journal->j_private; | 354 | struct super_block *sb = journal->j_private; |
354 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 355 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
355 | int error = is_journal_aborted(journal); | 356 | int error = is_journal_aborted(journal); |
356 | struct ext4_journal_cb_entry *jce, *tmp; | 357 | struct ext4_journal_cb_entry *jce; |
357 | 358 | ||
359 | BUG_ON(txn->t_state == T_FINISHED); | ||
358 | spin_lock(&sbi->s_md_lock); | 360 | spin_lock(&sbi->s_md_lock); |
359 | list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { | 361 | while (!list_empty(&txn->t_private_list)) { |
362 | jce = list_entry(txn->t_private_list.next, | ||
363 | struct ext4_journal_cb_entry, jce_list); | ||
360 | list_del_init(&jce->jce_list); | 364 | list_del_init(&jce->jce_list); |
361 | spin_unlock(&sbi->s_md_lock); | 365 | spin_unlock(&sbi->s_md_lock); |
362 | jce->jce_func(sb, jce, error); | 366 | jce->jce_func(sb, jce, error); |
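
list_for_each_entry_safe() is only safe while the list lock is held for the whole walk; here s_md_lock is dropped around each jce_func() call, so a saved next pointer could be freed behind our back. Re-reading the list head under the lock on each iteration avoids that. A compact pthread model of the pattern (the node type and callback are stand-ins):

    #include <pthread.h>
    #include <stddef.h>

    struct node {
        struct node *next;
        void (*func)(struct node *);   /* stand-in for jce->jce_func() */
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Unlink one entry at a time under the lock, then drop the lock to
     * run the callback. Unlike a *_safe iteration, no saved next pointer
     * can go stale while the lock is released. */
    static void drain(struct node **head)
    {
        pthread_mutex_lock(&lock);
        while (*head != NULL) {
            struct node *n = *head;
            *head = n->next;                 /* unlink under the lock */
            pthread_mutex_unlock(&lock);
            n->func(n);                      /* may sleep or free n */
            pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
    }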
@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | |||
1948 | if ((sbi->s_es->s_feature_ro_compat & | 1952 | if ((sbi->s_es->s_feature_ro_compat & |
1949 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { | 1953 | cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { |
1950 | /* Use new metadata_csum algorithm */ | 1954 | /* Use new metadata_csum algorithm */ |
1951 | __u16 old_csum; | 1955 | __le16 save_csum; |
1952 | __u32 csum32; | 1956 | __u32 csum32; |
1953 | 1957 | ||
1954 | old_csum = gdp->bg_checksum; | 1958 | save_csum = gdp->bg_checksum; |
1955 | gdp->bg_checksum = 0; | 1959 | gdp->bg_checksum = 0; |
1956 | csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, | 1960 | csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, |
1957 | sizeof(le_group)); | 1961 | sizeof(le_group)); |
1958 | csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, | 1962 | csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, |
1959 | sbi->s_desc_size); | 1963 | sbi->s_desc_size); |
1960 | gdp->bg_checksum = old_csum; | 1964 | gdp->bg_checksum = save_csum; |
1961 | 1965 | ||
1962 | crc = csum32 & 0xFFFF; | 1966 | crc = csum32 & 0xFFFF; |
1963 | goto out; | 1967 | goto out; |
@@ -2379,17 +2383,15 @@ struct ext4_attr { | |||
2379 | int offset; | 2383 | int offset; |
2380 | }; | 2384 | }; |
2381 | 2385 | ||
2382 | static int parse_strtoul(const char *buf, | 2386 | static int parse_strtoull(const char *buf, |
2383 | unsigned long max, unsigned long *value) | 2387 | unsigned long long max, unsigned long long *value) |
2384 | { | 2388 | { |
2385 | char *endp; | 2389 | int ret; |
2386 | |||
2387 | *value = simple_strtoul(skip_spaces(buf), &endp, 0); | ||
2388 | endp = skip_spaces(endp); | ||
2389 | if (*endp || *value > max) | ||
2390 | return -EINVAL; | ||
2391 | 2390 | ||
2392 | return 0; | 2391 | ret = kstrtoull(skip_spaces(buf), 0, value); |
2392 | if (!ret && *value > max) | ||
2393 | ret = -EINVAL; | ||
2394 | return ret; | ||
2393 | } | 2395 | } |
2394 | 2396 | ||
2395 | static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, | 2397 | static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, |
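
parse_strtoull() keeps the old helper's shape but delegates to kstrtoull(), which, unlike simple_strtoul(), rejects trailing garbage and reports overflow. A user-space equivalent built on strtoull(), with errno standing in for kstrtoull()'s return codes (kstrtoull also tolerates a single trailing newline, which is handy for sysfs writes):

    #include <ctype.h>
    #include <errno.h>
    #include <stdlib.h>

    /* Parse an unsigned value <= max; returns 0, -EINVAL or -ERANGE,
     * mirroring the sysfs helper's contract. */
    static int parse_u64(const char *buf, unsigned long long max,
                         unsigned long long *value)
    {
        char *end;

        while (isspace((unsigned char)*buf))
            buf++;                      /* skip_spaces() equivalent */
        errno = 0;
        *value = strtoull(buf, &end, 0);
        if (errno == ERANGE)
            return -ERANGE;
        if (end == buf || (*end && *end != '\n'))
            return -EINVAL;             /* no digits, or trailing garbage */
        if (*value > max)
            return -EINVAL;
        return 0;
    }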
@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
2431 | const char *buf, size_t count) | 2433 | const char *buf, size_t count) |
2432 | { | 2434 | { |
2433 | unsigned long t; | 2435 | unsigned long t; |
2436 | int ret; | ||
2434 | 2437 | ||
2435 | if (parse_strtoul(buf, 0x40000000, &t)) | 2438 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
2436 | return -EINVAL; | 2439 | if (ret) |
2440 | return ret; | ||
2437 | 2441 | ||
2438 | if (t && !is_power_of_2(t)) | 2442 | if (t && (!is_power_of_2(t) || t > 0x40000000)) |
2439 | return -EINVAL; | 2443 | return -EINVAL; |
2440 | 2444 | ||
2441 | sbi->s_inode_readahead_blks = t; | 2445 | sbi->s_inode_readahead_blks = t; |
@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, | |||
2456 | { | 2460 | { |
2457 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); | 2461 | unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); |
2458 | unsigned long t; | 2462 | unsigned long t; |
2463 | int ret; | ||
2459 | 2464 | ||
2460 | if (parse_strtoul(buf, 0xffffffff, &t)) | 2465 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
2461 | return -EINVAL; | 2466 | if (ret) |
2467 | return ret; | ||
2462 | *ui = t; | 2468 | *ui = t; |
2463 | return count; | 2469 | return count; |
2464 | } | 2470 | } |
2465 | 2471 | ||
2472 | static ssize_t reserved_clusters_show(struct ext4_attr *a, | ||
2473 | struct ext4_sb_info *sbi, char *buf) | ||
2474 | { | ||
2475 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
2476 | (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); | ||
2477 | } | ||
2478 | |||
2479 | static ssize_t reserved_clusters_store(struct ext4_attr *a, | ||
2480 | struct ext4_sb_info *sbi, | ||
2481 | const char *buf, size_t count) | ||
2482 | { | ||
2483 | unsigned long long val; | ||
2484 | int ret; | ||
2485 | |||
2486 | if (parse_strtoull(buf, -1ULL, &val)) | ||
2487 | return -EINVAL; | ||
2488 | ret = ext4_reserve_clusters(sbi, val); | ||
2489 | |||
2490 | return ret ? ret : count; | ||
2491 | } | ||
2492 | |||
2466 | static ssize_t trigger_test_error(struct ext4_attr *a, | 2493 | static ssize_t trigger_test_error(struct ext4_attr *a, |
2467 | struct ext4_sb_info *sbi, | 2494 | struct ext4_sb_info *sbi, |
2468 | const char *buf, size_t count) | 2495 | const char *buf, size_t count) |
@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) | |||
2500 | EXT4_RO_ATTR(delayed_allocation_blocks); | 2527 | EXT4_RO_ATTR(delayed_allocation_blocks); |
2501 | EXT4_RO_ATTR(session_write_kbytes); | 2528 | EXT4_RO_ATTR(session_write_kbytes); |
2502 | EXT4_RO_ATTR(lifetime_write_kbytes); | 2529 | EXT4_RO_ATTR(lifetime_write_kbytes); |
2530 | EXT4_RW_ATTR(reserved_clusters); | ||
2503 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, | 2531 | EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, |
2504 | inode_readahead_blks_store, s_inode_readahead_blks); | 2532 | inode_readahead_blks_store, s_inode_readahead_blks); |
2505 | EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); | 2533 | EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); |
@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = { | |||
2517 | ATTR_LIST(delayed_allocation_blocks), | 2545 | ATTR_LIST(delayed_allocation_blocks), |
2518 | ATTR_LIST(session_write_kbytes), | 2546 | ATTR_LIST(session_write_kbytes), |
2519 | ATTR_LIST(lifetime_write_kbytes), | 2547 | ATTR_LIST(lifetime_write_kbytes), |
2548 | ATTR_LIST(reserved_clusters), | ||
2520 | ATTR_LIST(inode_readahead_blks), | 2549 | ATTR_LIST(inode_readahead_blks), |
2521 | ATTR_LIST(inode_goal), | 2550 | ATTR_LIST(inode_goal), |
2522 | ATTR_LIST(mb_stats), | 2551 | ATTR_LIST(mb_stats), |
@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb) | |||
3192 | return 0; | 3221 | return 0; |
3193 | } | 3222 | } |
3194 | 3223 | ||
3224 | |||
3225 | static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) | ||
3226 | { | ||
3227 | ext4_fsblk_t resv_clusters; | ||
3228 | |||
3229 | /* | ||
3230 | * By default we reserve 2% or 4096 clusters, whichever is smaller. | ||
3231 | * This should cover the situations where we cannot afford to run | ||
3232 | * out of space, such as a punch hole or converting | ||
3233 | * uninitialized extents in the delalloc path. In most cases such an | ||
3234 | * allocation would require 1 or 2 blocks; higher numbers are | ||
3235 | * very rare. | ||
3236 | */ | ||
3237 | resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; | ||
3238 | |||
3239 | do_div(resv_clusters, 50); | ||
3240 | resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); | ||
3241 | |||
3242 | return resv_clusters; | ||
3243 | } | ||
3244 | |||
3245 | |||
3246 | static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) | ||
3247 | { | ||
3248 | ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> | ||
3249 | sbi->s_cluster_bits; | ||
3250 | |||
3251 | if (count >= clusters) | ||
3252 | return -EINVAL; | ||
3253 | |||
3254 | atomic64_set(&sbi->s_resv_clusters, count); | ||
3255 | return 0; | ||
3256 | } | ||
3257 | |||
3195 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 3258 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
3196 | { | 3259 | { |
3197 | char *orig_data = kstrdup(data, GFP_KERNEL); | 3260 | char *orig_data = kstrdup(data, GFP_KERNEL); |
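
Worked through: a 1 TiB filesystem with 4 KiB clusters has 268,435,456 clusters; 2% of that is about 5.37 million, so the 4096-cluster (16 MiB) cap wins. Only filesystems under 204,800 clusters (800 MiB at that cluster size) reserve less than the cap. The default computation in miniature:

    #include <stdint.h>

    /* The default: 2% of the cluster count, capped at 4096 clusters. */
    static uint64_t default_resv_clusters(uint64_t clusters)
    {
        uint64_t resv = clusters / 50;
        return resv < 4096 ? resv : 4096;
    }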
@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3526 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); | 3589 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); |
3527 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); | 3590 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); |
3528 | 3591 | ||
3592 | /* Do we have the standard group size of blocksize * 8 blocks? */ | ||
3593 | if (sbi->s_blocks_per_group == blocksize << 3) | ||
3594 | set_opt2(sb, STD_GROUP_SIZE); | ||
3595 | |||
3529 | for (i = 0; i < 4; i++) | 3596 | for (i = 0; i < 4; i++) |
3530 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 3597 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
3531 | sbi->s_def_hash_version = es->s_def_hash_version; | 3598 | sbi->s_def_hash_version = es->s_def_hash_version; |
@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3698 | sbi->s_err_report.function = print_daily_error_info; | 3765 | sbi->s_err_report.function = print_daily_error_info; |
3699 | sbi->s_err_report.data = (unsigned long) sb; | 3766 | sbi->s_err_report.data = (unsigned long) sb; |
3700 | 3767 | ||
3768 | /* Register extent status tree shrinker */ | ||
3769 | ext4_es_register_shrinker(sb); | ||
3770 | |||
3701 | err = percpu_counter_init(&sbi->s_freeclusters_counter, | 3771 | err = percpu_counter_init(&sbi->s_freeclusters_counter, |
3702 | ext4_count_free_clusters(sb)); | 3772 | ext4_count_free_clusters(sb)); |
3703 | if (!err) { | 3773 | if (!err) { |
@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3723 | sbi->s_max_writeback_mb_bump = 128; | 3793 | sbi->s_max_writeback_mb_bump = 128; |
3724 | sbi->s_extent_max_zeroout_kb = 32; | 3794 | sbi->s_extent_max_zeroout_kb = 32; |
3725 | 3795 | ||
3726 | /* Register extent status tree shrinker */ | ||
3727 | ext4_es_register_shrinker(sb); | ||
3728 | |||
3729 | /* | 3796 | /* |
3730 | * set up enough so that it can read an inode | 3797 | * set up enough so that it can read an inode |
3731 | */ | 3798 | */ |
@@ -3911,6 +3978,13 @@ no_journal: | |||
3911 | "available"); | 3978 | "available"); |
3912 | } | 3979 | } |
3913 | 3980 | ||
3981 | err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); | ||
3982 | if (err) { | ||
3983 | ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " | ||
3984 | "reserved pool", ext4_calculate_resv_clusters(sbi)); | ||
3985 | goto failed_mount4a; | ||
3986 | } | ||
3987 | |||
3914 | err = ext4_setup_system_zone(sb); | 3988 | err = ext4_setup_system_zone(sb); |
3915 | if (err) { | 3989 | if (err) { |
3916 | ext4_msg(sb, KERN_ERR, "failed to initialize system " | 3990 | ext4_msg(sb, KERN_ERR, "failed to initialize system " |
@@ -4010,6 +4084,7 @@ failed_mount_wq: | |||
4010 | sbi->s_journal = NULL; | 4084 | sbi->s_journal = NULL; |
4011 | } | 4085 | } |
4012 | failed_mount3: | 4086 | failed_mount3: |
4087 | ext4_es_unregister_shrinker(sb); | ||
4013 | del_timer(&sbi->s_err_report); | 4088 | del_timer(&sbi->s_err_report); |
4014 | if (sbi->s_flex_groups) | 4089 | if (sbi->s_flex_groups) |
4015 | ext4_kvfree(sbi->s_flex_groups); | 4090 | ext4_kvfree(sbi->s_flex_groups); |
@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
4177 | goto out_bdev; | 4252 | goto out_bdev; |
4178 | } | 4253 | } |
4179 | journal->j_private = sb; | 4254 | journal->j_private = sb; |
4180 | ll_rw_block(READ, 1, &journal->j_sb_buffer); | 4255 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); |
4181 | wait_on_buffer(journal->j_sb_buffer); | 4256 | wait_on_buffer(journal->j_sb_buffer); |
4182 | if (!buffer_uptodate(journal->j_sb_buffer)) { | 4257 | if (!buffer_uptodate(journal->j_sb_buffer)) { |
4183 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); | 4258 | ext4_msg(sb, KERN_ERR, "I/O error on journal device"); |
@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
4742 | struct super_block *sb = dentry->d_sb; | 4817 | struct super_block *sb = dentry->d_sb; |
4743 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4818 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4744 | struct ext4_super_block *es = sbi->s_es; | 4819 | struct ext4_super_block *es = sbi->s_es; |
4745 | ext4_fsblk_t overhead = 0; | 4820 | ext4_fsblk_t overhead = 0, resv_blocks; |
4746 | u64 fsid; | 4821 | u64 fsid; |
4747 | s64 bfree; | 4822 | s64 bfree; |
4823 | resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); | ||
4748 | 4824 | ||
4749 | if (!test_opt(sb, MINIX_DF)) | 4825 | if (!test_opt(sb, MINIX_DF)) |
4750 | overhead = sbi->s_overhead; | 4826 | overhead = sbi->s_overhead; |
@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
4756 | percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); | 4832 | percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); |
4757 | /* prevent underflow in case little free space is available */ | 4833 | /* prevent underflow in case little free space is available */ |
4758 | buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); | 4834 | buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); |
4759 | buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); | 4835 | buf->f_bavail = buf->f_bfree - |
4760 | if (buf->f_bfree < ext4_r_blocks_count(es)) | 4836 | (ext4_r_blocks_count(es) + resv_blocks); |
4837 | if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) | ||
4761 | buf->f_bavail = 0; | 4838 | buf->f_bavail = 0; |
4762 | buf->f_files = le32_to_cpu(es->s_inodes_count); | 4839 | buf->f_files = le32_to_cpu(es->s_inodes_count); |
4763 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); | 4840 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); |
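
With the reserved pool in place, the blocks reported as available to unprivileged users shrink by both reservations, and the clamp keeps f_bavail from wrapping when free space is nearly exhausted. The computation, restated:

    #include <stdint.h>

    /* f_bavail as computed above: free blocks minus the root reservation
     * and the cluster-based reserved pool, clamped at zero. */
    static uint64_t bavail(uint64_t bfree, uint64_t r_blocks,
                           uint64_t resv_blocks)
    {
        uint64_t reserved = r_blocks + resv_blocks;
        return bfree > reserved ? bfree - reserved : 0;
    }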
@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, | |||
4945 | return PTR_ERR(qf_inode); | 5022 | return PTR_ERR(qf_inode); |
4946 | } | 5023 | } |
4947 | 5024 | ||
5025 | /* Don't account quota for quota files to avoid recursion */ | ||
5026 | qf_inode->i_flags |= S_NOQUOTA; | ||
4948 | err = dquot_enable(qf_inode, type, format_id, flags); | 5027 | err = dquot_enable(qf_inode, type, format_id, flags); |
4949 | iput(qf_inode); | 5028 | iput(qf_inode); |
4950 | 5029 | ||
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a120b277240..c081e34f717f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, | |||
122 | struct ext4_xattr_header *hdr) | 122 | struct ext4_xattr_header *hdr) |
123 | { | 123 | { |
124 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 124 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
125 | __u32 csum, old; | 125 | __u32 csum; |
126 | __le32 save_csum; | ||
127 | __le64 dsk_block_nr = cpu_to_le64(block_nr); | ||
126 | 128 | ||
127 | old = hdr->h_checksum; | 129 | save_csum = hdr->h_checksum; |
128 | hdr->h_checksum = 0; | 130 | hdr->h_checksum = 0; |
129 | block_nr = cpu_to_le64(block_nr); | 131 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, |
130 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, | 132 | sizeof(dsk_block_nr)); |
131 | sizeof(block_nr)); | ||
132 | csum = ext4_chksum(sbi, csum, (__u8 *)hdr, | 133 | csum = ext4_chksum(sbi, csum, (__u8 *)hdr, |
133 | EXT4_BLOCK_SIZE(inode->i_sb)); | 134 | EXT4_BLOCK_SIZE(inode->i_sb)); |
134 | 135 | ||
135 | hdr->h_checksum = old; | 136 | hdr->h_checksum = save_csum; |
136 | return cpu_to_le32(csum); | 137 | return cpu_to_le32(csum); |
137 | } | 138 | } |
138 | 139 | ||
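
This is the same fix as the group-descriptor checksum hunk in super.c: keep the saved checksum in a local of the field's exact little-endian type, zero the field, checksum the block, then restore. A user-space model, with a toy checksum standing in for ext4_chksum() and a fixed-size header standing in for the on-disk block:

    #include <stddef.h>
    #include <stdint.h>

    struct block_header {
        uint32_t magic;
        uint32_t checksum;      /* stored in on-disk byte order */
        uint8_t  payload[56];
    };

    /* Toy stand-in for ext4_chksum(): any checksum over a byte range. */
    static uint32_t chksum(uint32_t seed, const uint8_t *p, size_t len)
    {
        while (len--)
            seed = seed * 31 + *p++;
        return seed;
    }

    /* Zero the checksum field, checksum the whole block, restore it --
     * so the stored value never feeds into its own computation. */
    static uint32_t block_csum(struct block_header *hdr, uint32_t seed)
    {
        uint32_t save = hdr->checksum;   /* keep the exact stored bytes */
        uint32_t csum;

        hdr->checksum = 0;
        csum = chksum(seed, (const uint8_t *)hdr, sizeof(*hdr));
        hdr->checksum = save;
        return csum;
    }

Storing a __le16 or __le32 through a __u32 local, as the old code did, invites silent width and byte-order conversions that sparse flags; the saved copy should never be interpreted, only put back.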
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index aa25deb5c6cd..c767dbdd7fc4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #define EXT4_XATTR_INDEX_LUSTRE 5 | 22 | #define EXT4_XATTR_INDEX_LUSTRE 5 |
23 | #define EXT4_XATTR_INDEX_SECURITY 6 | 23 | #define EXT4_XATTR_INDEX_SECURITY 6 |
24 | #define EXT4_XATTR_INDEX_SYSTEM 7 | 24 | #define EXT4_XATTR_INDEX_SYSTEM 7 |
25 | #define EXT4_XATTR_INDEX_RICHACL 8 | ||
25 | 26 | ||
26 | struct ext4_xattr_header { | 27 | struct ext4_xattr_header { |
27 | __le32 h_magic; /* magic number for identification */ | 28 | __le32 h_magic; /* magic number for identification */ |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
382 | int space_left = 0; | 382 | int space_left = 0; |
383 | int first_tag = 0; | 383 | int first_tag = 0; |
384 | int tag_flag; | 384 | int tag_flag; |
385 | int i, to_free = 0; | 385 | int i; |
386 | int tag_bytes = journal_tag_bytes(journal); | 386 | int tag_bytes = journal_tag_bytes(journal); |
387 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | 387 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
388 | __u32 crc32_sum = ~0; | 388 | __u32 crc32_sum = ~0; |
@@ -1134,7 +1134,7 @@ restart_loop: | |||
1134 | journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; | 1134 | journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; |
1135 | spin_unlock(&journal->j_history_lock); | 1135 | spin_unlock(&journal->j_history_lock); |
1136 | 1136 | ||
1137 | commit_transaction->t_state = T_FINISHED; | 1137 | commit_transaction->t_state = T_COMMIT_CALLBACK; |
1138 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 1138 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
1139 | journal->j_commit_sequence = commit_transaction->t_tid; | 1139 | journal->j_commit_sequence = commit_transaction->t_tid; |
1140 | journal->j_committing_transaction = NULL; | 1140 | journal->j_committing_transaction = NULL; |
@@ -1149,38 +1149,44 @@ restart_loop: | |||
1149 | journal->j_average_commit_time*3) / 4; | 1149 | journal->j_average_commit_time*3) / 4; |
1150 | else | 1150 | else |
1151 | journal->j_average_commit_time = commit_time; | 1151 | journal->j_average_commit_time = commit_time; |
1152 | |||
1152 | write_unlock(&journal->j_state_lock); | 1153 | write_unlock(&journal->j_state_lock); |
1153 | 1154 | ||
1154 | if (commit_transaction->t_checkpoint_list == NULL && | 1155 | if (journal->j_checkpoint_transactions == NULL) { |
1155 | commit_transaction->t_checkpoint_io_list == NULL) { | 1156 | journal->j_checkpoint_transactions = commit_transaction; |
1156 | __jbd2_journal_drop_transaction(journal, commit_transaction); | 1157 | commit_transaction->t_cpnext = commit_transaction; |
1157 | to_free = 1; | 1158 | commit_transaction->t_cpprev = commit_transaction; |
1158 | } else { | 1159 | } else { |
1159 | if (journal->j_checkpoint_transactions == NULL) { | 1160 | commit_transaction->t_cpnext = |
1160 | journal->j_checkpoint_transactions = commit_transaction; | 1161 | journal->j_checkpoint_transactions; |
1161 | commit_transaction->t_cpnext = commit_transaction; | 1162 | commit_transaction->t_cpprev = |
1162 | commit_transaction->t_cpprev = commit_transaction; | 1163 | commit_transaction->t_cpnext->t_cpprev; |
1163 | } else { | 1164 | commit_transaction->t_cpnext->t_cpprev = |
1164 | commit_transaction->t_cpnext = | 1165 | commit_transaction; |
1165 | journal->j_checkpoint_transactions; | 1166 | commit_transaction->t_cpprev->t_cpnext = |
1166 | commit_transaction->t_cpprev = | ||
1167 | commit_transaction->t_cpnext->t_cpprev; | ||
1168 | commit_transaction->t_cpnext->t_cpprev = | ||
1169 | commit_transaction; | ||
1170 | commit_transaction->t_cpprev->t_cpnext = | ||
1171 | commit_transaction; | 1167 | commit_transaction; |
1172 | } | ||
1173 | } | 1168 | } |
1174 | spin_unlock(&journal->j_list_lock); | 1169 | spin_unlock(&journal->j_list_lock); |
1175 | 1170 | /* Drop all spin_locks because commit_callback may block. | |
1171 | * __journal_remove_checkpoint() cannot destroy the transaction | ||
1172 | * under us because it is not marked as T_FINISHED yet */ | ||
1176 | if (journal->j_commit_callback) | 1173 | if (journal->j_commit_callback) |
1177 | journal->j_commit_callback(journal, commit_transaction); | 1174 | journal->j_commit_callback(journal, commit_transaction); |
1178 | 1175 | ||
1179 | trace_jbd2_end_commit(journal, commit_transaction); | 1176 | trace_jbd2_end_commit(journal, commit_transaction); |
1180 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", | 1177 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", |
1181 | journal->j_commit_sequence, journal->j_tail_sequence); | 1178 | journal->j_commit_sequence, journal->j_tail_sequence); |
1182 | if (to_free) | ||
1183 | jbd2_journal_free_transaction(commit_transaction); | ||
1184 | 1179 | ||
1180 | write_lock(&journal->j_state_lock); | ||
1181 | spin_lock(&journal->j_list_lock); | ||
1182 | commit_transaction->t_state = T_FINISHED; | ||
1183 | /* Recheck checkpoint lists after j_list_lock was dropped */ | ||
1184 | if (commit_transaction->t_checkpoint_list == NULL && | ||
1185 | commit_transaction->t_checkpoint_io_list == NULL) { | ||
1186 | __jbd2_journal_drop_transaction(journal, commit_transaction); | ||
1187 | jbd2_journal_free_transaction(commit_transaction); | ||
1188 | } | ||
1189 | spin_unlock(&journal->j_list_lock); | ||
1190 | write_unlock(&journal->j_state_lock); | ||
1185 | wake_up(&journal->j_wait_done_commit); | 1191 | wake_up(&journal->j_wait_done_commit); |
1186 | } | 1192 | } |
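
Checkpoint code may only drop a transaction once it is T_FINISHED, so parking it in T_COMMIT_CALLBACK while the callback runs keeps it alive without holding any spinlock across the callback. A reduced sketch of the ordering (jbd2 defines several more intermediate states; this names only the ones relevant here):

    /* Illustrative ordering only; the real states live in
     * include/linux/jbd2.h and the transitions in the code above. */
    enum tx_state {
        T_RUNNING,
        T_COMMIT,           /* writing data and metadata to the log */
        T_COMMIT_CALLBACK,  /* new: callbacks run, tx must not be freed */
        T_FINISHED,         /* only now may checkpoint code drop it */
    };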
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8b220f1ab54f..f6c5ba027f4f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -708,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) | |||
708 | } | 708 | } |
709 | 709 | ||
710 | /* | 710 | /* |
711 | * When this function returns, the transaction corresponding to tid | ||
712 | * will be completed. If the transaction is currently running, start | ||
713 | * committing that transaction before waiting for it to complete. If | ||
714 | * the transaction id is stale, it is by definition already completed, | ||
715 | * so just return SUCCESS. | ||
716 | */ | ||
717 | int jbd2_complete_transaction(journal_t *journal, tid_t tid) | ||
718 | { | ||
719 | int need_to_wait = 1; | ||
720 | |||
721 | read_lock(&journal->j_state_lock); | ||
722 | if (journal->j_running_transaction && | ||
723 | journal->j_running_transaction->t_tid == tid) { | ||
724 | if (journal->j_commit_request != tid) { | ||
725 | /* transaction not yet started, so request it */ | ||
726 | read_unlock(&journal->j_state_lock); | ||
727 | jbd2_log_start_commit(journal, tid); | ||
728 | goto wait_commit; | ||
729 | } | ||
730 | } else if (!(journal->j_committing_transaction && | ||
731 | journal->j_committing_transaction->t_tid == tid)) | ||
732 | need_to_wait = 0; | ||
733 | read_unlock(&journal->j_state_lock); | ||
734 | if (!need_to_wait) | ||
735 | return 0; | ||
736 | wait_commit: | ||
737 | return jbd2_log_wait_commit(journal, tid); | ||
738 | } | ||
739 | EXPORT_SYMBOL(jbd2_complete_transaction); | ||
740 | |||
741 | /* | ||
711 | * Log buffer allocation routines: | 742 | * Log buffer allocation routines: |
712 | */ | 743 | */ |
713 | 744 | ||
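
jbd2_complete_transaction() gives filesystems a single "make tid durable" call: stale tids return immediately, a running-but-unrequested commit is kicked off first, and everything else just waits. A user-space model of that branch structure (journal_view, jtid_t and the enum are stand-ins for the jbd2 types):

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t jtid_t;

    struct journal_view {
        bool   has_running, has_committing;
        jtid_t running_tid, committing_tid, commit_request;
    };

    enum action { NOTHING, START_THEN_WAIT, WAIT_ONLY };

    /* Mirrors the branches above: a stale tid is already durable; a
     * running transaction nobody asked to commit needs the commit kicked
     * off first; a running or committing match is simply waited on. */
    static enum action classify(const struct journal_view *j, jtid_t tid)
    {
        if (j->has_running && j->running_tid == tid) {
            if (j->commit_request != tid)
                return START_THEN_WAIT;
            return WAIT_ONLY;
        }
        if (j->has_committing && j->committing_tid == tid)
            return WAIT_ONLY;
        return NOTHING;
    }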
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 325bc019ed88..10f524c59ea8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) | |||
332 | handle_t *handle = jbd2_alloc_handle(GFP_NOFS); | 332 | handle_t *handle = jbd2_alloc_handle(GFP_NOFS); |
333 | if (!handle) | 333 | if (!handle) |
334 | return NULL; | 334 | return NULL; |
335 | memset(handle, 0, sizeof(*handle)); | ||
336 | handle->h_buffer_credits = nblocks; | 335 | handle->h_buffer_credits = nblocks; |
337 | handle->h_ref = 1; | 336 | handle->h_ref = 1; |
338 | 337 | ||
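
Dropping the memset() is safe only because jbd2_alloc_handle() now uses kmem_cache_zalloc(), which is kmem_cache_alloc() plus zeroing. The user-space analogue of the before and after:

    #include <stdlib.h>
    #include <string.h>

    struct handle { int buffer_credits; int ref; };

    /* Before: allocate, then zero by hand. */
    static struct handle *new_handle_old(void)
    {
        struct handle *h = malloc(sizeof(*h));
        if (h)
            memset(h, 0, sizeof(*h));
        return h;
    }

    /* After: the allocator zeroes for us, as kmem_cache_zalloc() does. */
    static struct handle *new_handle_new(void)
    {
        return calloc(1, sizeof(struct handle));
    }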
@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, | |||
640 | int error; | 639 | int error; |
641 | char *frozen_buffer = NULL; | 640 | char *frozen_buffer = NULL; |
642 | int need_copy = 0; | 641 | int need_copy = 0; |
642 | unsigned long start_lock, time_lock; | ||
643 | 643 | ||
644 | if (is_handle_aborted(handle)) | 644 | if (is_handle_aborted(handle)) |
645 | return -EROFS; | 645 | return -EROFS; |
@@ -655,9 +655,16 @@ repeat: | |||
655 | 655 | ||
656 | /* @@@ Need to check for errors here at some point. */ | 656 | /* @@@ Need to check for errors here at some point. */ |
657 | 657 | ||
658 | start_lock = jiffies; | ||
658 | lock_buffer(bh); | 659 | lock_buffer(bh); |
659 | jbd_lock_bh_state(bh); | 660 | jbd_lock_bh_state(bh); |
660 | 661 | ||
662 | /* If it takes too long to lock the buffer, trace it */ | ||
663 | time_lock = jbd2_time_diff(start_lock, jiffies); | ||
664 | if (time_lock > HZ/10) | ||
665 | trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, | ||
666 | jiffies_to_msecs(time_lock)); | ||
667 | |||
661 | /* We now hold the buffer lock so it is safe to query the buffer | 668 | /* We now hold the buffer lock so it is safe to query the buffer |
662 | * state. Is the buffer dirty? | 669 | * state. Is the buffer dirty? |
663 | * | 670 | * |
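
The stall check brackets only the acquisition: take a timestamp, block, take another, and report if the wait exceeded HZ/10. The same shape in user space, with clock_gettime() and stderr standing in for jiffies and the new trace_jbd2_lock_buffer_stall tracepoint:

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    /* Time the lock acquisition and report stalls over 100 ms (HZ/10). */
    static void lock_with_stall_report(pthread_mutex_t *m)
    {
        struct timespec t0, t1;
        long ms;

        clock_gettime(CLOCK_MONOTONIC, &t0);
        pthread_mutex_lock(m);
        clock_gettime(CLOCK_MONOTONIC, &t1);

        ms = (t1.tv_sec - t0.tv_sec) * 1000L +
             (t1.tv_nsec - t0.tv_nsec) / 1000000L;
        if (ms > 100)
            fprintf(stderr, "lock stall: %ld ms\n", ms);
    }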