diff options
author | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 01:06:28 -0500 |
---|---|---|
committer | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 01:06:28 -0500 |
commit | f43dc23d5ea91fca257be02138a255f02d98e806 (patch) | |
tree | b29722f6e965316e90ac97abf79923ced250dc21 /fs/ext4/inode.c | |
parent | f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff) | |
parent | 4162cf64973df51fc885825bc9ca4d055891c49f (diff) |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts:
arch/sh/kernel/cpu/sh2/setup-sh7619.c
arch/sh/kernel/cpu/sh2a/setup-mxg.c
arch/sh/kernel/cpu/sh2a/setup-sh7201.c
arch/sh/kernel/cpu/sh2a/setup-sh7203.c
arch/sh/kernel/cpu/sh2a/setup-sh7206.c
arch/sh/kernel/cpu/sh3/setup-sh7705.c
arch/sh/kernel/cpu/sh3/setup-sh770x.c
arch/sh/kernel/cpu/sh3/setup-sh7710.c
arch/sh/kernel/cpu/sh3/setup-sh7720.c
arch/sh/kernel/cpu/sh4/setup-sh4-202.c
arch/sh/kernel/cpu/sh4/setup-sh7750.c
arch/sh/kernel/cpu/sh4/setup-sh7760.c
arch/sh/kernel/cpu/sh4a/setup-sh7343.c
arch/sh/kernel/cpu/sh4a/setup-sh7366.c
arch/sh/kernel/cpu/sh4a/setup-sh7722.c
arch/sh/kernel/cpu/sh4a/setup-sh7723.c
arch/sh/kernel/cpu/sh4a/setup-sh7724.c
arch/sh/kernel/cpu/sh4a/setup-sh7763.c
arch/sh/kernel/cpu/sh4a/setup-sh7770.c
arch/sh/kernel/cpu/sh4a/setup-sh7780.c
arch/sh/kernel/cpu/sh4a/setup-sh7785.c
arch/sh/kernel/cpu/sh4a/setup-sh7786.c
arch/sh/kernel/cpu/sh4a/setup-shx3.c
arch/sh/kernel/cpu/sh5/setup-sh5.c
drivers/serial/sh-sci.c
drivers/serial/sh-sci.h
include/linux/serial_sci.h
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 2741 |
1 files changed, 1603 insertions, 1138 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c17ae275af4..e80fc513eacc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -37,6 +37,10 @@ | |||
37 | #include <linux/namei.h> | 37 | #include <linux/namei.h> |
38 | #include <linux/uio.h> | 38 | #include <linux/uio.h> |
39 | #include <linux/bio.h> | 39 | #include <linux/bio.h> |
40 | #include <linux/workqueue.h> | ||
41 | #include <linux/kernel.h> | ||
42 | #include <linux/slab.h> | ||
43 | #include <linux/ratelimit.h> | ||
40 | 44 | ||
41 | #include "ext4_jbd2.h" | 45 | #include "ext4_jbd2.h" |
42 | #include "xattr.h" | 46 | #include "xattr.h" |
@@ -50,13 +54,27 @@ | |||
50 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | 54 | static inline int ext4_begin_ordered_truncate(struct inode *inode, |
51 | loff_t new_size) | 55 | loff_t new_size) |
52 | { | 56 | { |
53 | return jbd2_journal_begin_ordered_truncate( | 57 | trace_ext4_begin_ordered_truncate(inode, new_size); |
54 | EXT4_SB(inode->i_sb)->s_journal, | 58 | /* |
55 | &EXT4_I(inode)->jinode, | 59 | * If jinode is zero, then we never opened the file for |
56 | new_size); | 60 | * writing, so there's no need to call |
61 | * jbd2_journal_begin_ordered_truncate() since there's no | ||
62 | * outstanding writes we need to flush. | ||
63 | */ | ||
64 | if (!EXT4_I(inode)->jinode) | ||
65 | return 0; | ||
66 | return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), | ||
67 | EXT4_I(inode)->jinode, | ||
68 | new_size); | ||
57 | } | 69 | } |
58 | 70 | ||
59 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | 71 | static void ext4_invalidatepage(struct page *page, unsigned long offset); |
72 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, | ||
73 | struct buffer_head *bh_result, int create); | ||
74 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); | ||
75 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); | ||
76 | static int __ext4_journalled_writepage(struct page *page, unsigned int len); | ||
77 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); | ||
60 | 78 | ||
61 | /* | 79 | /* |
62 | * Test whether an inode is a fast symlink. | 80 | * Test whether an inode is a fast symlink. |
@@ -70,60 +88,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) | |||
70 | } | 88 | } |
71 | 89 | ||
72 | /* | 90 | /* |
73 | * The ext4 forget function must perform a revoke if we are freeing data | ||
74 | * which has been journaled. Metadata (eg. indirect blocks) must be | ||
75 | * revoked in all cases. | ||
76 | * | ||
77 | * "bh" may be NULL: a metadata block may have been freed from memory | ||
78 | * but there may still be a record of it in the journal, and that record | ||
79 | * still needs to be revoked. | ||
80 | * | ||
81 | * If the handle isn't valid we're not journaling so there's nothing to do. | ||
82 | */ | ||
83 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | ||
84 | struct buffer_head *bh, ext4_fsblk_t blocknr) | ||
85 | { | ||
86 | int err; | ||
87 | |||
88 | if (!ext4_handle_valid(handle)) | ||
89 | return 0; | ||
90 | |||
91 | might_sleep(); | ||
92 | |||
93 | BUFFER_TRACE(bh, "enter"); | ||
94 | |||
95 | jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " | ||
96 | "data mode %x\n", | ||
97 | bh, is_metadata, inode->i_mode, | ||
98 | test_opt(inode->i_sb, DATA_FLAGS)); | ||
99 | |||
100 | /* Never use the revoke function if we are doing full data | ||
101 | * journaling: there is no need to, and a V1 superblock won't | ||
102 | * support it. Otherwise, only skip the revoke on un-journaled | ||
103 | * data blocks. */ | ||
104 | |||
105 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || | ||
106 | (!is_metadata && !ext4_should_journal_data(inode))) { | ||
107 | if (bh) { | ||
108 | BUFFER_TRACE(bh, "call jbd2_journal_forget"); | ||
109 | return ext4_journal_forget(handle, bh); | ||
110 | } | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * data!=journal && (is_metadata || should_journal_data(inode)) | ||
116 | */ | ||
117 | BUFFER_TRACE(bh, "call ext4_journal_revoke"); | ||
118 | err = ext4_journal_revoke(handle, blocknr, bh); | ||
119 | if (err) | ||
120 | ext4_abort(inode->i_sb, __func__, | ||
121 | "error %d when attempting revoke", err); | ||
122 | BUFFER_TRACE(bh, "exit"); | ||
123 | return err; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Work out how many blocks we need to proceed with the next chunk of a | 91 | * Work out how many blocks we need to proceed with the next chunk of a |
128 | * truncate transaction. | 92 | * truncate transaction. |
129 | */ | 93 | */ |
@@ -194,21 +158,44 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | |||
194 | * so before we call here everything must be consistently dirtied against | 158 | * so before we call here everything must be consistently dirtied against |
195 | * this transaction. | 159 | * this transaction. |
196 | */ | 160 | */ |
197 | static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | 161 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, |
162 | int nblocks) | ||
198 | { | 163 | { |
164 | int ret; | ||
165 | |||
166 | /* | ||
167 | * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this | ||
168 | * moment, get_block can be called only for blocks inside i_size since | ||
169 | * page cache has been already dropped and writes are blocked by | ||
170 | * i_mutex. So we can safely drop the i_data_sem here. | ||
171 | */ | ||
199 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | 172 | BUG_ON(EXT4_JOURNAL(inode) == NULL); |
200 | jbd_debug(2, "restarting handle %p\n", handle); | 173 | jbd_debug(2, "restarting handle %p\n", handle); |
201 | return ext4_journal_restart(handle, blocks_for_truncate(inode)); | 174 | up_write(&EXT4_I(inode)->i_data_sem); |
175 | ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); | ||
176 | down_write(&EXT4_I(inode)->i_data_sem); | ||
177 | ext4_discard_preallocations(inode); | ||
178 | |||
179 | return ret; | ||
202 | } | 180 | } |
203 | 181 | ||
204 | /* | 182 | /* |
205 | * Called at the last iput() if i_nlink is zero. | 183 | * Called at the last iput() if i_nlink is zero. |
206 | */ | 184 | */ |
207 | void ext4_delete_inode(struct inode *inode) | 185 | void ext4_evict_inode(struct inode *inode) |
208 | { | 186 | { |
209 | handle_t *handle; | 187 | handle_t *handle; |
210 | int err; | 188 | int err; |
211 | 189 | ||
190 | trace_ext4_evict_inode(inode); | ||
191 | if (inode->i_nlink) { | ||
192 | truncate_inode_pages(&inode->i_data, 0); | ||
193 | goto no_delete; | ||
194 | } | ||
195 | |||
196 | if (!is_bad_inode(inode)) | ||
197 | dquot_initialize(inode); | ||
198 | |||
212 | if (ext4_should_order_data(inode)) | 199 | if (ext4_should_order_data(inode)) |
213 | ext4_begin_ordered_truncate(inode, 0); | 200 | ext4_begin_ordered_truncate(inode, 0); |
214 | truncate_inode_pages(&inode->i_data, 0); | 201 | truncate_inode_pages(&inode->i_data, 0); |
@@ -233,7 +220,7 @@ void ext4_delete_inode(struct inode *inode) | |||
233 | inode->i_size = 0; | 220 | inode->i_size = 0; |
234 | err = ext4_mark_inode_dirty(handle, inode); | 221 | err = ext4_mark_inode_dirty(handle, inode); |
235 | if (err) { | 222 | if (err) { |
236 | ext4_warning(inode->i_sb, __func__, | 223 | ext4_warning(inode->i_sb, |
237 | "couldn't mark inode dirty (err %d)", err); | 224 | "couldn't mark inode dirty (err %d)", err); |
238 | goto stop_handle; | 225 | goto stop_handle; |
239 | } | 226 | } |
@@ -251,10 +238,11 @@ void ext4_delete_inode(struct inode *inode) | |||
251 | if (err > 0) | 238 | if (err > 0) |
252 | err = ext4_journal_restart(handle, 3); | 239 | err = ext4_journal_restart(handle, 3); |
253 | if (err != 0) { | 240 | if (err != 0) { |
254 | ext4_warning(inode->i_sb, __func__, | 241 | ext4_warning(inode->i_sb, |
255 | "couldn't extend journal (err %d)", err); | 242 | "couldn't extend journal (err %d)", err); |
256 | stop_handle: | 243 | stop_handle: |
257 | ext4_journal_stop(handle); | 244 | ext4_journal_stop(handle); |
245 | ext4_orphan_del(NULL, inode); | ||
258 | goto no_delete; | 246 | goto no_delete; |
259 | } | 247 | } |
260 | } | 248 | } |
@@ -279,13 +267,13 @@ void ext4_delete_inode(struct inode *inode) | |||
279 | */ | 267 | */ |
280 | if (ext4_mark_inode_dirty(handle, inode)) | 268 | if (ext4_mark_inode_dirty(handle, inode)) |
281 | /* If that failed, just do the required in-core inode clear. */ | 269 | /* If that failed, just do the required in-core inode clear. */ |
282 | clear_inode(inode); | 270 | ext4_clear_inode(inode); |
283 | else | 271 | else |
284 | ext4_free_inode(handle, inode); | 272 | ext4_free_inode(handle, inode); |
285 | ext4_journal_stop(handle); | 273 | ext4_journal_stop(handle); |
286 | return; | 274 | return; |
287 | no_delete: | 275 | no_delete: |
288 | clear_inode(inode); /* We must guarantee clearing of inode... */ | 276 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ |
289 | } | 277 | } |
290 | 278 | ||
291 | typedef struct { | 279 | typedef struct { |
@@ -343,9 +331,7 @@ static int ext4_block_to_path(struct inode *inode, | |||
343 | int n = 0; | 331 | int n = 0; |
344 | int final = 0; | 332 | int final = 0; |
345 | 333 | ||
346 | if (i_block < 0) { | 334 | if (i_block < direct_blocks) { |
347 | ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); | ||
348 | } else if (i_block < direct_blocks) { | ||
349 | offsets[n++] = i_block; | 335 | offsets[n++] = i_block; |
350 | final = direct_blocks; | 336 | final = direct_blocks; |
351 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | 337 | } else if ((i_block -= direct_blocks) < indirect_blocks) { |
@@ -364,8 +350,7 @@ static int ext4_block_to_path(struct inode *inode, | |||
364 | offsets[n++] = i_block & (ptrs - 1); | 350 | offsets[n++] = i_block & (ptrs - 1); |
365 | final = ptrs; | 351 | final = ptrs; |
366 | } else { | 352 | } else { |
367 | ext4_warning(inode->i_sb, "ext4_block_to_path", | 353 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", |
368 | "block %lu > max in inode %lu", | ||
369 | i_block + direct_blocks + | 354 | i_block + direct_blocks + |
370 | indirect_blocks + double_blocks, inode->i_ino); | 355 | indirect_blocks + double_blocks, inode->i_ino); |
371 | } | 356 | } |
@@ -374,9 +359,11 @@ static int ext4_block_to_path(struct inode *inode, | |||
374 | return n; | 359 | return n; |
375 | } | 360 | } |
376 | 361 | ||
377 | static int __ext4_check_blockref(const char *function, struct inode *inode, | 362 | static int __ext4_check_blockref(const char *function, unsigned int line, |
363 | struct inode *inode, | ||
378 | __le32 *p, unsigned int max) | 364 | __le32 *p, unsigned int max) |
379 | { | 365 | { |
366 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
380 | __le32 *bref = p; | 367 | __le32 *bref = p; |
381 | unsigned int blk; | 368 | unsigned int blk; |
382 | 369 | ||
@@ -385,9 +372,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, | |||
385 | if (blk && | 372 | if (blk && |
386 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | 373 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), |
387 | blk, 1))) { | 374 | blk, 1))) { |
388 | ext4_error(inode->i_sb, function, | 375 | es->s_last_error_block = cpu_to_le64(blk); |
389 | "invalid block reference %u " | 376 | ext4_error_inode(inode, function, line, blk, |
390 | "in inode #%lu", blk, inode->i_ino); | 377 | "invalid block"); |
391 | return -EIO; | 378 | return -EIO; |
392 | } | 379 | } |
393 | } | 380 | } |
@@ -396,11 +383,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, | |||
396 | 383 | ||
397 | 384 | ||
398 | #define ext4_check_indirect_blockref(inode, bh) \ | 385 | #define ext4_check_indirect_blockref(inode, bh) \ |
399 | __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ | 386 | __ext4_check_blockref(__func__, __LINE__, inode, \ |
387 | (__le32 *)(bh)->b_data, \ | ||
400 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | 388 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) |
401 | 389 | ||
402 | #define ext4_check_inode_blockref(inode) \ | 390 | #define ext4_check_inode_blockref(inode) \ |
403 | __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ | 391 | __ext4_check_blockref(__func__, __LINE__, inode, \ |
392 | EXT4_I(inode)->i_data, \ | ||
404 | EXT4_NDIR_BLOCKS) | 393 | EXT4_NDIR_BLOCKS) |
405 | 394 | ||
406 | /** | 395 | /** |
@@ -553,19 +542,25 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | |||
553 | * | 542 | * |
554 | * Normally this function find the preferred place for block allocation, | 543 | * Normally this function find the preferred place for block allocation, |
555 | * returns it. | 544 | * returns it. |
545 | * Because this is only used for non-extent files, we limit the block nr | ||
546 | * to 32 bits. | ||
556 | */ | 547 | */ |
557 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | 548 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, |
558 | Indirect *partial) | 549 | Indirect *partial) |
559 | { | 550 | { |
551 | ext4_fsblk_t goal; | ||
552 | |||
560 | /* | 553 | /* |
561 | * XXX need to get goal block from mballoc's data structures | 554 | * XXX need to get goal block from mballoc's data structures |
562 | */ | 555 | */ |
563 | 556 | ||
564 | return ext4_find_near(inode, partial); | 557 | goal = ext4_find_near(inode, partial); |
558 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
559 | return goal; | ||
565 | } | 560 | } |
566 | 561 | ||
567 | /** | 562 | /** |
568 | * ext4_blks_to_allocate: Look up the block map and count the number | 563 | * ext4_blks_to_allocate - Look up the block map and count the number |
569 | * of direct blocks need to be allocated for the given branch. | 564 | * of direct blocks need to be allocated for the given branch. |
570 | * | 565 | * |
571 | * @branch: chain of indirect blocks | 566 | * @branch: chain of indirect blocks |
@@ -604,13 +599,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | |||
604 | 599 | ||
605 | /** | 600 | /** |
606 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | 601 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch |
602 | * @handle: handle for this transaction | ||
603 | * @inode: inode which needs allocated blocks | ||
604 | * @iblock: the logical block to start allocated at | ||
605 | * @goal: preferred physical block of allocation | ||
607 | * @indirect_blks: the number of blocks need to allocate for indirect | 606 | * @indirect_blks: the number of blocks need to allocate for indirect |
608 | * blocks | 607 | * blocks |
609 | * | 608 | * @blks: number of desired blocks |
610 | * @new_blocks: on return it will store the new block numbers for | 609 | * @new_blocks: on return it will store the new block numbers for |
611 | * the indirect blocks(if needed) and the first direct block, | 610 | * the indirect blocks(if needed) and the first direct block, |
612 | * @blks: on return it will store the total number of allocated | 611 | * @err: on return it will store the error code |
613 | * direct blocks | 612 | * |
613 | * This function will return the number of blocks allocated as | ||
614 | * requested by the passed-in parameters. | ||
614 | */ | 615 | */ |
615 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | 616 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, |
616 | ext4_lblk_t iblock, ext4_fsblk_t goal, | 617 | ext4_lblk_t iblock, ext4_fsblk_t goal, |
@@ -642,6 +643,15 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
642 | if (*err) | 643 | if (*err) |
643 | goto failed_out; | 644 | goto failed_out; |
644 | 645 | ||
646 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
647 | EXT4_ERROR_INODE(inode, | ||
648 | "current_block %llu + count %lu > %d!", | ||
649 | current_block, count, | ||
650 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
651 | *err = -EIO; | ||
652 | goto failed_out; | ||
653 | } | ||
654 | |||
645 | target -= count; | 655 | target -= count; |
646 | /* allocate blocks for indirect blocks */ | 656 | /* allocate blocks for indirect blocks */ |
647 | while (index < indirect_blks && count) { | 657 | while (index < indirect_blks && count) { |
@@ -676,6 +686,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
676 | ar.flags = EXT4_MB_HINT_DATA; | 686 | ar.flags = EXT4_MB_HINT_DATA; |
677 | 687 | ||
678 | current_block = ext4_mb_new_blocks(handle, &ar, err); | 688 | current_block = ext4_mb_new_blocks(handle, &ar, err); |
689 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
690 | EXT4_ERROR_INODE(inode, | ||
691 | "current_block %llu + ar.len %d > %d!", | ||
692 | current_block, ar.len, | ||
693 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
694 | *err = -EIO; | ||
695 | goto failed_out; | ||
696 | } | ||
679 | 697 | ||
680 | if (*err && (target == blks)) { | 698 | if (*err && (target == blks)) { |
681 | /* | 699 | /* |
@@ -701,15 +719,17 @@ allocated: | |||
701 | return ret; | 719 | return ret; |
702 | failed_out: | 720 | failed_out: |
703 | for (i = 0; i < index; i++) | 721 | for (i = 0; i < index; i++) |
704 | ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); | 722 | ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); |
705 | return ret; | 723 | return ret; |
706 | } | 724 | } |
707 | 725 | ||
708 | /** | 726 | /** |
709 | * ext4_alloc_branch - allocate and set up a chain of blocks. | 727 | * ext4_alloc_branch - allocate and set up a chain of blocks. |
728 | * @handle: handle for this transaction | ||
710 | * @inode: owner | 729 | * @inode: owner |
711 | * @indirect_blks: number of allocated indirect blocks | 730 | * @indirect_blks: number of allocated indirect blocks |
712 | * @blks: number of allocated direct blocks | 731 | * @blks: number of allocated direct blocks |
732 | * @goal: preferred place for allocation | ||
713 | * @offsets: offsets (in the blocks) to store the pointers to next. | 733 | * @offsets: offsets (in the blocks) to store the pointers to next. |
714 | * @branch: place to store the chain in. | 734 | * @branch: place to store the chain in. |
715 | * | 735 | * |
@@ -759,13 +779,19 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
759 | * parent to disk. | 779 | * parent to disk. |
760 | */ | 780 | */ |
761 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | 781 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); |
782 | if (unlikely(!bh)) { | ||
783 | err = -EIO; | ||
784 | goto failed; | ||
785 | } | ||
786 | |||
762 | branch[n].bh = bh; | 787 | branch[n].bh = bh; |
763 | lock_buffer(bh); | 788 | lock_buffer(bh); |
764 | BUFFER_TRACE(bh, "call get_create_access"); | 789 | BUFFER_TRACE(bh, "call get_create_access"); |
765 | err = ext4_journal_get_create_access(handle, bh); | 790 | err = ext4_journal_get_create_access(handle, bh); |
766 | if (err) { | 791 | if (err) { |
792 | /* Don't brelse(bh) here; it's done in | ||
793 | * ext4_journal_forget() below */ | ||
767 | unlock_buffer(bh); | 794 | unlock_buffer(bh); |
768 | brelse(bh); | ||
769 | goto failed; | 795 | goto failed; |
770 | } | 796 | } |
771 | 797 | ||
@@ -796,20 +822,27 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
796 | return err; | 822 | return err; |
797 | failed: | 823 | failed: |
798 | /* Allocation failed, free what we already allocated */ | 824 | /* Allocation failed, free what we already allocated */ |
825 | ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); | ||
799 | for (i = 1; i <= n ; i++) { | 826 | for (i = 1; i <= n ; i++) { |
800 | BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); | 827 | /* |
801 | ext4_journal_forget(handle, branch[i].bh); | 828 | * branch[i].bh is newly allocated, so there is no |
829 | * need to revoke the block, which is why we don't | ||
830 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
831 | */ | ||
832 | ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, | ||
833 | EXT4_FREE_BLOCKS_FORGET); | ||
802 | } | 834 | } |
803 | for (i = 0; i < indirect_blks; i++) | 835 | for (i = n+1; i < indirect_blks; i++) |
804 | ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); | 836 | ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); |
805 | 837 | ||
806 | ext4_free_blocks(handle, inode, new_blocks[i], num, 0); | 838 | ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); |
807 | 839 | ||
808 | return err; | 840 | return err; |
809 | } | 841 | } |
810 | 842 | ||
811 | /** | 843 | /** |
812 | * ext4_splice_branch - splice the allocated branch onto inode. | 844 | * ext4_splice_branch - splice the allocated branch onto inode. |
845 | * @handle: handle for this transaction | ||
813 | * @inode: owner | 846 | * @inode: owner |
814 | * @block: (logical) number of block we are adding | 847 | * @block: (logical) number of block we are adding |
815 | * @chain: chain of indirect blocks (with a missing link - see | 848 | * @chain: chain of indirect blocks (with a missing link - see |
@@ -882,20 +915,24 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, | |||
882 | 915 | ||
883 | err_out: | 916 | err_out: |
884 | for (i = 1; i <= num; i++) { | 917 | for (i = 1; i <= num; i++) { |
885 | BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); | 918 | /* |
886 | ext4_journal_forget(handle, where[i].bh); | 919 | * branch[i].bh is newly allocated, so there is no |
887 | ext4_free_blocks(handle, inode, | 920 | * need to revoke the block, which is why we don't |
888 | le32_to_cpu(where[i-1].key), 1, 0); | 921 | * need to set EXT4_FREE_BLOCKS_METADATA. |
922 | */ | ||
923 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
924 | EXT4_FREE_BLOCKS_FORGET); | ||
889 | } | 925 | } |
890 | ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); | 926 | ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), |
927 | blks, 0); | ||
891 | 928 | ||
892 | return err; | 929 | return err; |
893 | } | 930 | } |
894 | 931 | ||
895 | /* | 932 | /* |
896 | * The ext4_ind_get_blocks() function handles non-extents inodes | 933 | * The ext4_ind_map_blocks() function handles non-extents inodes |
897 | * (i.e., using the traditional indirect/double-indirect i_blocks | 934 | * (i.e., using the traditional indirect/double-indirect i_blocks |
898 | * scheme) for ext4_get_blocks(). | 935 | * scheme) for ext4_map_blocks(). |
899 | * | 936 | * |
900 | * Allocation strategy is simple: if we have to allocate something, we will | 937 | * Allocation strategy is simple: if we have to allocate something, we will |
901 | * have to go the whole way to leaf. So let's do it before attaching anything | 938 | * have to go the whole way to leaf. So let's do it before attaching anything |
@@ -920,9 +957,8 @@ err_out: | |||
920 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | 957 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system |
921 | * blocks. | 958 | * blocks. |
922 | */ | 959 | */ |
923 | static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | 960 | static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, |
924 | ext4_lblk_t iblock, unsigned int maxblocks, | 961 | struct ext4_map_blocks *map, |
925 | struct buffer_head *bh_result, | ||
926 | int flags) | 962 | int flags) |
927 | { | 963 | { |
928 | int err = -EIO; | 964 | int err = -EIO; |
@@ -936,9 +972,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | |||
936 | int count = 0; | 972 | int count = 0; |
937 | ext4_fsblk_t first_block = 0; | 973 | ext4_fsblk_t first_block = 0; |
938 | 974 | ||
939 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | 975 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); |
940 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | 976 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); |
941 | depth = ext4_block_to_path(inode, iblock, offsets, | 977 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, |
942 | &blocks_to_boundary); | 978 | &blocks_to_boundary); |
943 | 979 | ||
944 | if (depth == 0) | 980 | if (depth == 0) |
@@ -949,10 +985,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | |||
949 | /* Simplest case - block found, no allocation needed */ | 985 | /* Simplest case - block found, no allocation needed */ |
950 | if (!partial) { | 986 | if (!partial) { |
951 | first_block = le32_to_cpu(chain[depth - 1].key); | 987 | first_block = le32_to_cpu(chain[depth - 1].key); |
952 | clear_buffer_new(bh_result); | ||
953 | count++; | 988 | count++; |
954 | /*map more blocks*/ | 989 | /*map more blocks*/ |
955 | while (count < maxblocks && count <= blocks_to_boundary) { | 990 | while (count < map->m_len && count <= blocks_to_boundary) { |
956 | ext4_fsblk_t blk; | 991 | ext4_fsblk_t blk; |
957 | 992 | ||
958 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | 993 | blk = le32_to_cpu(*(chain[depth-1].p + count)); |
@@ -972,7 +1007,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | |||
972 | /* | 1007 | /* |
973 | * Okay, we need to do block allocation. | 1008 | * Okay, we need to do block allocation. |
974 | */ | 1009 | */ |
975 | goal = ext4_find_goal(inode, iblock, partial); | 1010 | goal = ext4_find_goal(inode, map->m_lblk, partial); |
976 | 1011 | ||
977 | /* the number of blocks need to allocate for [d,t]indirect blocks */ | 1012 | /* the number of blocks need to allocate for [d,t]indirect blocks */ |
978 | indirect_blks = (chain + depth) - partial - 1; | 1013 | indirect_blks = (chain + depth) - partial - 1; |
@@ -982,11 +1017,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | |||
982 | * direct blocks to allocate for this branch. | 1017 | * direct blocks to allocate for this branch. |
983 | */ | 1018 | */ |
984 | count = ext4_blks_to_allocate(partial, indirect_blks, | 1019 | count = ext4_blks_to_allocate(partial, indirect_blks, |
985 | maxblocks, blocks_to_boundary); | 1020 | map->m_len, blocks_to_boundary); |
986 | /* | 1021 | /* |
987 | * Block out ext4_truncate while we alter the tree | 1022 | * Block out ext4_truncate while we alter the tree |
988 | */ | 1023 | */ |
989 | err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, | 1024 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, |
990 | &count, goal, | 1025 | &count, goal, |
991 | offsets + (partial - chain), partial); | 1026 | offsets + (partial - chain), partial); |
992 | 1027 | ||
@@ -998,16 +1033,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, | |||
998 | * may need to return -EAGAIN upwards in the worst case. --sct | 1033 | * may need to return -EAGAIN upwards in the worst case. --sct |
999 | */ | 1034 | */ |
1000 | if (!err) | 1035 | if (!err) |
1001 | err = ext4_splice_branch(handle, inode, iblock, | 1036 | err = ext4_splice_branch(handle, inode, map->m_lblk, |
1002 | partial, indirect_blks, count); | 1037 | partial, indirect_blks, count); |
1003 | else | 1038 | if (err) |
1004 | goto cleanup; | 1039 | goto cleanup; |
1005 | 1040 | ||
1006 | set_buffer_new(bh_result); | 1041 | map->m_flags |= EXT4_MAP_NEW; |
1042 | |||
1043 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1007 | got_it: | 1044 | got_it: |
1008 | map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); | 1045 | map->m_flags |= EXT4_MAP_MAPPED; |
1046 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
1047 | map->m_len = count; | ||
1009 | if (count > blocks_to_boundary) | 1048 | if (count > blocks_to_boundary) |
1010 | set_buffer_boundary(bh_result); | 1049 | map->m_flags |= EXT4_MAP_BOUNDARY; |
1011 | err = count; | 1050 | err = count; |
1012 | /* Clean up and exit */ | 1051 | /* Clean up and exit */ |
1013 | partial = chain + depth - 1; /* the whole chain */ | 1052 | partial = chain + depth - 1; /* the whole chain */ |
@@ -1017,125 +1056,207 @@ cleanup: | |||
1017 | brelse(partial->bh); | 1056 | brelse(partial->bh); |
1018 | partial--; | 1057 | partial--; |
1019 | } | 1058 | } |
1020 | BUFFER_TRACE(bh_result, "returned"); | ||
1021 | out: | 1059 | out: |
1022 | return err; | 1060 | return err; |
1023 | } | 1061 | } |
1024 | 1062 | ||
1025 | qsize_t ext4_get_reserved_space(struct inode *inode) | 1063 | #ifdef CONFIG_QUOTA |
1064 | qsize_t *ext4_get_reserved_space(struct inode *inode) | ||
1026 | { | 1065 | { |
1027 | unsigned long long total; | 1066 | return &EXT4_I(inode)->i_reserved_quota; |
1028 | |||
1029 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1030 | total = EXT4_I(inode)->i_reserved_data_blocks + | ||
1031 | EXT4_I(inode)->i_reserved_meta_blocks; | ||
1032 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1033 | |||
1034 | return total; | ||
1035 | } | 1067 | } |
1068 | #endif | ||
1069 | |||
1036 | /* | 1070 | /* |
1037 | * Calculate the number of metadata blocks need to reserve | 1071 | * Calculate the number of metadata blocks need to reserve |
1038 | * to allocate @blocks for non extent file based file | 1072 | * to allocate a new block at @lblocks for non extent file based file |
1039 | */ | 1073 | */ |
1040 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) | 1074 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, |
1075 | sector_t lblock) | ||
1041 | { | 1076 | { |
1042 | int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 1077 | struct ext4_inode_info *ei = EXT4_I(inode); |
1043 | int ind_blks, dind_blks, tind_blks; | 1078 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); |
1044 | 1079 | int blk_bits; | |
1045 | /* number of new indirect blocks needed */ | ||
1046 | ind_blks = (blocks + icap - 1) / icap; | ||
1047 | 1080 | ||
1048 | dind_blks = (ind_blks + icap - 1) / icap; | 1081 | if (lblock < EXT4_NDIR_BLOCKS) |
1082 | return 0; | ||
1049 | 1083 | ||
1050 | tind_blks = 1; | 1084 | lblock -= EXT4_NDIR_BLOCKS; |
1051 | 1085 | ||
1052 | return ind_blks + dind_blks + tind_blks; | 1086 | if (ei->i_da_metadata_calc_len && |
1087 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
1088 | ei->i_da_metadata_calc_len++; | ||
1089 | return 0; | ||
1090 | } | ||
1091 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
1092 | ei->i_da_metadata_calc_len = 1; | ||
1093 | blk_bits = order_base_2(lblock); | ||
1094 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
1053 | } | 1095 | } |
1054 | 1096 | ||
1055 | /* | 1097 | /* |
1056 | * Calculate the number of metadata blocks need to reserve | 1098 | * Calculate the number of metadata blocks need to reserve |
1057 | * to allocate given number of blocks | 1099 | * to allocate a block located at @lblock |
1058 | */ | 1100 | */ |
1059 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | 1101 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) |
1060 | { | 1102 | { |
1061 | if (!blocks) | 1103 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
1062 | return 0; | 1104 | return ext4_ext_calc_metadata_amount(inode, lblock); |
1063 | |||
1064 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
1065 | return ext4_ext_calc_metadata_amount(inode, blocks); | ||
1066 | 1105 | ||
1067 | return ext4_indirect_calc_metadata_amount(inode, blocks); | 1106 | return ext4_indirect_calc_metadata_amount(inode, lblock); |
1068 | } | 1107 | } |
1069 | 1108 | ||
1070 | static void ext4_da_update_reserve_space(struct inode *inode, int used) | 1109 | /* |
1110 | * Called with i_data_sem down, which is important since we can call | ||
1111 | * ext4_discard_preallocations() from here. | ||
1112 | */ | ||
1113 | void ext4_da_update_reserve_space(struct inode *inode, | ||
1114 | int used, int quota_claim) | ||
1071 | { | 1115 | { |
1072 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1116 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1073 | int total, mdb, mdb_free; | 1117 | struct ext4_inode_info *ei = EXT4_I(inode); |
1074 | |||
1075 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1076 | /* recalculate the number of metablocks still need to be reserved */ | ||
1077 | total = EXT4_I(inode)->i_reserved_data_blocks - used; | ||
1078 | mdb = ext4_calc_metadata_amount(inode, total); | ||
1079 | 1118 | ||
1080 | /* figure out how many metablocks to release */ | 1119 | spin_lock(&ei->i_block_reservation_lock); |
1081 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | 1120 | trace_ext4_da_update_reserve_space(inode, used); |
1082 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | 1121 | if (unlikely(used > ei->i_reserved_data_blocks)) { |
1122 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " | ||
1123 | "with only %d reserved data blocks\n", | ||
1124 | __func__, inode->i_ino, used, | ||
1125 | ei->i_reserved_data_blocks); | ||
1126 | WARN_ON(1); | ||
1127 | used = ei->i_reserved_data_blocks; | ||
1128 | } | ||
1083 | 1129 | ||
1084 | if (mdb_free) { | 1130 | /* Update per-inode reservations */ |
1085 | /* Account for allocated meta_blocks */ | 1131 | ei->i_reserved_data_blocks -= used; |
1086 | mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; | 1132 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; |
1133 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | ||
1134 | used + ei->i_allocated_meta_blocks); | ||
1135 | ei->i_allocated_meta_blocks = 0; | ||
1087 | 1136 | ||
1088 | /* update fs dirty blocks counter */ | 1137 | if (ei->i_reserved_data_blocks == 0) { |
1089 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); | 1138 | /* |
1090 | EXT4_I(inode)->i_allocated_meta_blocks = 0; | 1139 | * We can release all of the reserved metadata blocks |
1091 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | 1140 | * only when we have written all of the delayed |
1141 | * allocation blocks. | ||
1142 | */ | ||
1143 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | ||
1144 | ei->i_reserved_meta_blocks); | ||
1145 | ei->i_reserved_meta_blocks = 0; | ||
1146 | ei->i_da_metadata_calc_len = 0; | ||
1092 | } | 1147 | } |
1093 | |||
1094 | /* update per-inode reservations */ | ||
1095 | BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); | ||
1096 | EXT4_I(inode)->i_reserved_data_blocks -= used; | ||
1097 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1148 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1098 | 1149 | ||
1099 | /* | 1150 | /* Update quota subsystem for data blocks */ |
1100 | * free those over-booking quota for metadata blocks | 1151 | if (quota_claim) |
1101 | */ | 1152 | dquot_claim_block(inode, used); |
1102 | if (mdb_free) | 1153 | else { |
1103 | vfs_dq_release_reservation_block(inode, mdb_free); | 1154 | /* |
1155 | * We did fallocate with an offset that is already delayed | ||
1156 | * allocated. So on delayed allocated writeback we should | ||
1157 | * not re-claim the quota for fallocated blocks. | ||
1158 | */ | ||
1159 | dquot_release_reservation_block(inode, used); | ||
1160 | } | ||
1104 | 1161 | ||
1105 | /* | 1162 | /* |
1106 | * If we have done all the pending block allocations and if | 1163 | * If we have done all the pending block allocations and if |
1107 | * there aren't any writers on the inode, we can discard the | 1164 | * there aren't any writers on the inode, we can discard the |
1108 | * inode's preallocations. | 1165 | * inode's preallocations. |
1109 | */ | 1166 | */ |
1110 | if (!total && (atomic_read(&inode->i_writecount) == 0)) | 1167 | if ((ei->i_reserved_data_blocks == 0) && |
1168 | (atomic_read(&inode->i_writecount) == 0)) | ||
1111 | ext4_discard_preallocations(inode); | 1169 | ext4_discard_preallocations(inode); |
1112 | } | 1170 | } |
1113 | 1171 | ||
1114 | static int check_block_validity(struct inode *inode, sector_t logical, | 1172 | static int __check_block_validity(struct inode *inode, const char *func, |
1115 | sector_t phys, int len) | 1173 | unsigned int line, |
1174 | struct ext4_map_blocks *map) | ||
1116 | { | 1175 | { |
1117 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { | 1176 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, |
1118 | ext4_error(inode->i_sb, "check_block_validity", | 1177 | map->m_len)) { |
1119 | "inode #%lu logical block %llu mapped to %llu " | 1178 | ext4_error_inode(inode, func, line, map->m_pblk, |
1120 | "(size %d)", inode->i_ino, | 1179 | "lblock %lu mapped to illegal pblock " |
1121 | (unsigned long long) logical, | 1180 | "(length %d)", (unsigned long) map->m_lblk, |
1122 | (unsigned long long) phys, len); | 1181 | map->m_len); |
1123 | WARN_ON(1); | ||
1124 | return -EIO; | 1182 | return -EIO; |
1125 | } | 1183 | } |
1126 | return 0; | 1184 | return 0; |
1127 | } | 1185 | } |
1128 | 1186 | ||
1187 | #define check_block_validity(inode, map) \ | ||
1188 | __check_block_validity((inode), __func__, __LINE__, (map)) | ||
1189 | |||
1190 | /* | ||
1191 | * Return the number of contiguous dirty pages in a given inode | ||
1192 | * starting at page frame idx. | ||
1193 | */ | ||
1194 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | ||
1195 | unsigned int max_pages) | ||
1196 | { | ||
1197 | struct address_space *mapping = inode->i_mapping; | ||
1198 | pgoff_t index; | ||
1199 | struct pagevec pvec; | ||
1200 | pgoff_t num = 0; | ||
1201 | int i, nr_pages, done = 0; | ||
1202 | |||
1203 | if (max_pages == 0) | ||
1204 | return 0; | ||
1205 | pagevec_init(&pvec, 0); | ||
1206 | while (!done) { | ||
1207 | index = idx; | ||
1208 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
1209 | PAGECACHE_TAG_DIRTY, | ||
1210 | (pgoff_t)PAGEVEC_SIZE); | ||
1211 | if (nr_pages == 0) | ||
1212 | break; | ||
1213 | for (i = 0; i < nr_pages; i++) { | ||
1214 | struct page *page = pvec.pages[i]; | ||
1215 | struct buffer_head *bh, *head; | ||
1216 | |||
1217 | lock_page(page); | ||
1218 | if (unlikely(page->mapping != mapping) || | ||
1219 | !PageDirty(page) || | ||
1220 | PageWriteback(page) || | ||
1221 | page->index != idx) { | ||
1222 | done = 1; | ||
1223 | unlock_page(page); | ||
1224 | break; | ||
1225 | } | ||
1226 | if (page_has_buffers(page)) { | ||
1227 | bh = head = page_buffers(page); | ||
1228 | do { | ||
1229 | if (!buffer_delay(bh) && | ||
1230 | !buffer_unwritten(bh)) | ||
1231 | done = 1; | ||
1232 | bh = bh->b_this_page; | ||
1233 | } while (!done && (bh != head)); | ||
1234 | } | ||
1235 | unlock_page(page); | ||
1236 | if (done) | ||
1237 | break; | ||
1238 | idx++; | ||
1239 | num++; | ||
1240 | if (num >= max_pages) { | ||
1241 | done = 1; | ||
1242 | break; | ||
1243 | } | ||
1244 | } | ||
1245 | pagevec_release(&pvec); | ||
1246 | } | ||
1247 | return num; | ||
1248 | } | ||
1249 | |||
1129 | /* | 1250 | /* |
1130 | * The ext4_get_blocks() function tries to look up the requested blocks, | 1251 | * The ext4_map_blocks() function tries to look up the requested blocks, |
1131 | * and returns if the blocks are already mapped. | 1252 | * and returns if the blocks are already mapped. |
1132 | * | 1253 | * |
1133 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks | 1254 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks |
1134 | * and store the allocated blocks in the result buffer head and mark it | 1255 | * and store the allocated blocks in the result buffer head and mark it |
1135 | * mapped. | 1256 | * mapped. |
1136 | * | 1257 | * |
1137 | * If file type is extents based, it will call ext4_ext_get_blocks(), | 1258 | * If file type is extents based, it will call ext4_ext_map_blocks(), |
1138 | * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping | 1259 | * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping |
1139 | * based files | 1260 | * based files |
1140 | * | 1261 | * |
1141 | * On success, it returns the number of blocks being mapped or allocate. | 1262 | * On success, it returns the number of blocks being mapped or allocate. |
@@ -1148,32 +1269,29 @@ static int check_block_validity(struct inode *inode, sector_t logical, | |||
1148 | * | 1269 | * |
1149 | * It returns the error in case of allocation failure. | 1270 | * It returns the error in case of allocation failure. |
1150 | */ | 1271 | */ |
1151 | int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | 1272 | int ext4_map_blocks(handle_t *handle, struct inode *inode, |
1152 | unsigned int max_blocks, struct buffer_head *bh, | 1273 | struct ext4_map_blocks *map, int flags) |
1153 | int flags) | ||
1154 | { | 1274 | { |
1155 | int retval; | 1275 | int retval; |
1156 | 1276 | ||
1157 | clear_buffer_mapped(bh); | 1277 | map->m_flags = 0; |
1158 | clear_buffer_unwritten(bh); | 1278 | ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," |
1159 | 1279 | "logical block %lu\n", inode->i_ino, flags, map->m_len, | |
1280 | (unsigned long) map->m_lblk); | ||
1160 | /* | 1281 | /* |
1161 | * Try to see if we can get the block without requesting a new | 1282 | * Try to see if we can get the block without requesting a new |
1162 | * file system block. | 1283 | * file system block. |
1163 | */ | 1284 | */ |
1164 | down_read((&EXT4_I(inode)->i_data_sem)); | 1285 | down_read((&EXT4_I(inode)->i_data_sem)); |
1165 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 1286 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
1166 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | 1287 | retval = ext4_ext_map_blocks(handle, inode, map, 0); |
1167 | bh, 0); | ||
1168 | } else { | 1288 | } else { |
1169 | retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, | 1289 | retval = ext4_ind_map_blocks(handle, inode, map, 0); |
1170 | bh, 0); | ||
1171 | } | 1290 | } |
1172 | up_read((&EXT4_I(inode)->i_data_sem)); | 1291 | up_read((&EXT4_I(inode)->i_data_sem)); |
1173 | 1292 | ||
1174 | if (retval > 0 && buffer_mapped(bh)) { | 1293 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
1175 | int ret = check_block_validity(inode, block, | 1294 | int ret = check_block_validity(inode, map); |
1176 | bh->b_blocknr, retval); | ||
1177 | if (ret != 0) | 1295 | if (ret != 0) |
1178 | return ret; | 1296 | return ret; |
1179 | } | 1297 | } |
@@ -1189,7 +1307,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1189 | * ext4_ext_get_block() returns th create = 0 | 1307 | * ext4_ext_get_block() returns th create = 0 |
1190 | * with buffer head unmapped. | 1308 | * with buffer head unmapped. |
1191 | */ | 1309 | */ |
1192 | if (retval > 0 && buffer_mapped(bh)) | 1310 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) |
1193 | return retval; | 1311 | return retval; |
1194 | 1312 | ||
1195 | /* | 1313 | /* |
@@ -1202,7 +1320,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1202 | * of BH_Unwritten and BH_Mapped flags being simultaneously | 1320 | * of BH_Unwritten and BH_Mapped flags being simultaneously |
1203 | * set on the buffer_head. | 1321 | * set on the buffer_head. |
1204 | */ | 1322 | */ |
1205 | clear_buffer_unwritten(bh); | 1323 | map->m_flags &= ~EXT4_MAP_UNWRITTEN; |
1206 | 1324 | ||
1207 | /* | 1325 | /* |
1208 | * New blocks allocate and/or writing to uninitialized extent | 1326 | * New blocks allocate and/or writing to uninitialized extent |
@@ -1219,43 +1337,41 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1219 | * avoid double accounting | 1337 | * avoid double accounting |
1220 | */ | 1338 | */ |
1221 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | 1339 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
1222 | EXT4_I(inode)->i_delalloc_reserved_flag = 1; | 1340 | ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); |
1223 | /* | 1341 | /* |
1224 | * We need to check for EXT4 here because migrate | 1342 | * We need to check for EXT4 here because migrate |
1225 | * could have changed the inode type in between | 1343 | * could have changed the inode type in between |
1226 | */ | 1344 | */ |
1227 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 1345 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
1228 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | 1346 | retval = ext4_ext_map_blocks(handle, inode, map, flags); |
1229 | bh, flags); | ||
1230 | } else { | 1347 | } else { |
1231 | retval = ext4_ind_get_blocks(handle, inode, block, | 1348 | retval = ext4_ind_map_blocks(handle, inode, map, flags); |
1232 | max_blocks, bh, flags); | ||
1233 | 1349 | ||
1234 | if (retval > 0 && buffer_new(bh)) { | 1350 | if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { |
1235 | /* | 1351 | /* |
1236 | * We allocated new blocks which will result in | 1352 | * We allocated new blocks which will result in |
1237 | * i_data's format changing. Force the migrate | 1353 | * i_data's format changing. Force the migrate |
1238 | * to fail by clearing migrate flags | 1354 | * to fail by clearing migrate flags |
1239 | */ | 1355 | */ |
1240 | EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & | 1356 | ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); |
1241 | ~EXT4_EXT_MIGRATE; | ||
1242 | } | 1357 | } |
1243 | } | ||
1244 | 1358 | ||
1359 | /* | ||
1360 | * Update reserved blocks/metadata blocks after successful | ||
1361 | * block allocation which had been deferred till now. We don't | ||
1362 | * support fallocate for non extent files. So we can update | ||
1363 | * reserve space here. | ||
1364 | */ | ||
1365 | if ((retval > 0) && | ||
1366 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) | ||
1367 | ext4_da_update_reserve_space(inode, retval, 1); | ||
1368 | } | ||
1245 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | 1369 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
1246 | EXT4_I(inode)->i_delalloc_reserved_flag = 0; | 1370 | ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); |
1247 | |||
1248 | /* | ||
1249 | * Update reserved blocks/metadata blocks after successful | ||
1250 | * block allocation which had been deferred till now. | ||
1251 | */ | ||
1252 | if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) | ||
1253 | ext4_da_update_reserve_space(inode, retval); | ||
1254 | 1371 | ||
1255 | up_write((&EXT4_I(inode)->i_data_sem)); | 1372 | up_write((&EXT4_I(inode)->i_data_sem)); |
1256 | if (retval > 0 && buffer_mapped(bh)) { | 1373 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
1257 | int ret = check_block_validity(inode, block, | 1374 | int ret = check_block_validity(inode, map); |
1258 | bh->b_blocknr, retval); | ||
1259 | if (ret != 0) | 1375 | if (ret != 0) |
1260 | return ret; | 1376 | return ret; |
1261 | } | 1377 | } |
@@ -1265,109 +1381,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1265 | /* Maximum number of blocks we map for direct IO at once. */ | 1381 | /* Maximum number of blocks we map for direct IO at once. */ |
1266 | #define DIO_MAX_BLOCKS 4096 | 1382 | #define DIO_MAX_BLOCKS 4096 |
1267 | 1383 | ||
1268 | int ext4_get_block(struct inode *inode, sector_t iblock, | 1384 | static int _ext4_get_block(struct inode *inode, sector_t iblock, |
1269 | struct buffer_head *bh_result, int create) | 1385 | struct buffer_head *bh, int flags) |
1270 | { | 1386 | { |
1271 | handle_t *handle = ext4_journal_current_handle(); | 1387 | handle_t *handle = ext4_journal_current_handle(); |
1388 | struct ext4_map_blocks map; | ||
1272 | int ret = 0, started = 0; | 1389 | int ret = 0, started = 0; |
1273 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
1274 | int dio_credits; | 1390 | int dio_credits; |
1275 | 1391 | ||
1276 | if (create && !handle) { | 1392 | map.m_lblk = iblock; |
1393 | map.m_len = bh->b_size >> inode->i_blkbits; | ||
1394 | |||
1395 | if (flags && !handle) { | ||
1277 | /* Direct IO write... */ | 1396 | /* Direct IO write... */ |
1278 | if (max_blocks > DIO_MAX_BLOCKS) | 1397 | if (map.m_len > DIO_MAX_BLOCKS) |
1279 | max_blocks = DIO_MAX_BLOCKS; | 1398 | map.m_len = DIO_MAX_BLOCKS; |
1280 | dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); | 1399 | dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); |
1281 | handle = ext4_journal_start(inode, dio_credits); | 1400 | handle = ext4_journal_start(inode, dio_credits); |
1282 | if (IS_ERR(handle)) { | 1401 | if (IS_ERR(handle)) { |
1283 | ret = PTR_ERR(handle); | 1402 | ret = PTR_ERR(handle); |
1284 | goto out; | 1403 | return ret; |
1285 | } | 1404 | } |
1286 | started = 1; | 1405 | started = 1; |
1287 | } | 1406 | } |
1288 | 1407 | ||
1289 | ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, | 1408 | ret = ext4_map_blocks(handle, inode, &map, flags); |
1290 | create ? EXT4_GET_BLOCKS_CREATE : 0); | ||
1291 | if (ret > 0) { | 1409 | if (ret > 0) { |
1292 | bh_result->b_size = (ret << inode->i_blkbits); | 1410 | map_bh(bh, inode->i_sb, map.m_pblk); |
1411 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | ||
1412 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; | ||
1293 | ret = 0; | 1413 | ret = 0; |
1294 | } | 1414 | } |
1295 | if (started) | 1415 | if (started) |
1296 | ext4_journal_stop(handle); | 1416 | ext4_journal_stop(handle); |
1297 | out: | ||
1298 | return ret; | 1417 | return ret; |
1299 | } | 1418 | } |
1300 | 1419 | ||
1420 | int ext4_get_block(struct inode *inode, sector_t iblock, | ||
1421 | struct buffer_head *bh, int create) | ||
1422 | { | ||
1423 | return _ext4_get_block(inode, iblock, bh, | ||
1424 | create ? EXT4_GET_BLOCKS_CREATE : 0); | ||
1425 | } | ||
1426 | |||
1301 | /* | 1427 | /* |
1302 | * `handle' can be NULL if create is zero | 1428 | * `handle' can be NULL if create is zero |
1303 | */ | 1429 | */ |
1304 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | 1430 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, |
1305 | ext4_lblk_t block, int create, int *errp) | 1431 | ext4_lblk_t block, int create, int *errp) |
1306 | { | 1432 | { |
1307 | struct buffer_head dummy; | 1433 | struct ext4_map_blocks map; |
1434 | struct buffer_head *bh; | ||
1308 | int fatal = 0, err; | 1435 | int fatal = 0, err; |
1309 | int flags = 0; | ||
1310 | 1436 | ||
1311 | J_ASSERT(handle != NULL || create == 0); | 1437 | J_ASSERT(handle != NULL || create == 0); |
1312 | 1438 | ||
1313 | dummy.b_state = 0; | 1439 | map.m_lblk = block; |
1314 | dummy.b_blocknr = -1000; | 1440 | map.m_len = 1; |
1315 | buffer_trace_init(&dummy.b_history); | 1441 | err = ext4_map_blocks(handle, inode, &map, |
1316 | if (create) | 1442 | create ? EXT4_GET_BLOCKS_CREATE : 0); |
1317 | flags |= EXT4_GET_BLOCKS_CREATE; | 1443 | |
1318 | err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); | 1444 | if (err < 0) |
1319 | /* | 1445 | *errp = err; |
1320 | * ext4_get_blocks() returns number of blocks mapped. 0 in | 1446 | if (err <= 0) |
1321 | * case of a HOLE. | 1447 | return NULL; |
1322 | */ | 1448 | *errp = 0; |
1323 | if (err > 0) { | 1449 | |
1324 | if (err > 1) | 1450 | bh = sb_getblk(inode->i_sb, map.m_pblk); |
1325 | WARN_ON(1); | 1451 | if (!bh) { |
1326 | err = 0; | 1452 | *errp = -EIO; |
1453 | return NULL; | ||
1327 | } | 1454 | } |
1328 | *errp = err; | 1455 | if (map.m_flags & EXT4_MAP_NEW) { |
1329 | if (!err && buffer_mapped(&dummy)) { | 1456 | J_ASSERT(create != 0); |
1330 | struct buffer_head *bh; | 1457 | J_ASSERT(handle != NULL); |
1331 | bh = sb_getblk(inode->i_sb, dummy.b_blocknr); | ||
1332 | if (!bh) { | ||
1333 | *errp = -EIO; | ||
1334 | goto err; | ||
1335 | } | ||
1336 | if (buffer_new(&dummy)) { | ||
1337 | J_ASSERT(create != 0); | ||
1338 | J_ASSERT(handle != NULL); | ||
1339 | 1458 | ||
1340 | /* | 1459 | /* |
1341 | * Now that we do not always journal data, we should | 1460 | * Now that we do not always journal data, we should |
1342 | * keep in mind whether this should always journal the | 1461 | * keep in mind whether this should always journal the |
1343 | * new buffer as metadata. For now, regular file | 1462 | * new buffer as metadata. For now, regular file |
1344 | * writes use ext4_get_block instead, so it's not a | 1463 | * writes use ext4_get_block instead, so it's not a |
1345 | * problem. | 1464 | * problem. |
1346 | */ | 1465 | */ |
1347 | lock_buffer(bh); | 1466 | lock_buffer(bh); |
1348 | BUFFER_TRACE(bh, "call get_create_access"); | 1467 | BUFFER_TRACE(bh, "call get_create_access"); |
1349 | fatal = ext4_journal_get_create_access(handle, bh); | 1468 | fatal = ext4_journal_get_create_access(handle, bh); |
1350 | if (!fatal && !buffer_uptodate(bh)) { | 1469 | if (!fatal && !buffer_uptodate(bh)) { |
1351 | memset(bh->b_data, 0, inode->i_sb->s_blocksize); | 1470 | memset(bh->b_data, 0, inode->i_sb->s_blocksize); |
1352 | set_buffer_uptodate(bh); | 1471 | set_buffer_uptodate(bh); |
1353 | } | ||
1354 | unlock_buffer(bh); | ||
1355 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
1356 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
1357 | if (!fatal) | ||
1358 | fatal = err; | ||
1359 | } else { | ||
1360 | BUFFER_TRACE(bh, "not a new buffer"); | ||
1361 | } | ||
1362 | if (fatal) { | ||
1363 | *errp = fatal; | ||
1364 | brelse(bh); | ||
1365 | bh = NULL; | ||
1366 | } | 1472 | } |
1367 | return bh; | 1473 | unlock_buffer(bh); |
1474 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
1475 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
1476 | if (!fatal) | ||
1477 | fatal = err; | ||
1478 | } else { | ||
1479 | BUFFER_TRACE(bh, "not a new buffer"); | ||
1368 | } | 1480 | } |
1369 | err: | 1481 | if (fatal) { |
1370 | return NULL; | 1482 | *errp = fatal; |
1483 | brelse(bh); | ||
1484 | bh = NULL; | ||
1485 | } | ||
1486 | return bh; | ||
1371 | } | 1487 | } |
1372 | 1488 | ||
1373 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | 1489 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, |
@@ -1448,11 +1564,39 @@ static int walk_page_buffers(handle_t *handle, | |||
1448 | static int do_journal_get_write_access(handle_t *handle, | 1564 | static int do_journal_get_write_access(handle_t *handle, |
1449 | struct buffer_head *bh) | 1565 | struct buffer_head *bh) |
1450 | { | 1566 | { |
1567 | int dirty = buffer_dirty(bh); | ||
1568 | int ret; | ||
1569 | |||
1451 | if (!buffer_mapped(bh) || buffer_freed(bh)) | 1570 | if (!buffer_mapped(bh) || buffer_freed(bh)) |
1452 | return 0; | 1571 | return 0; |
1453 | return ext4_journal_get_write_access(handle, bh); | 1572 | /* |
1573 | * __block_write_begin() could have dirtied some buffers. Clean | ||
1574 | * the dirty bit as jbd2_journal_get_write_access() could complain | ||
1575 | * otherwise about fs integrity issues. Setting of the dirty bit | ||
1576 | * by __block_write_begin() isn't a real problem here as we clear | ||
1577 | * the bit before releasing a page lock and thus writeback cannot | ||
1578 | * ever write the buffer. | ||
1579 | */ | ||
1580 | if (dirty) | ||
1581 | clear_buffer_dirty(bh); | ||
1582 | ret = ext4_journal_get_write_access(handle, bh); | ||
1583 | if (!ret && dirty) | ||
1584 | ret = ext4_handle_dirty_metadata(handle, NULL, bh); | ||
1585 | return ret; | ||
1454 | } | 1586 | } |
1455 | 1587 | ||
1588 | /* | ||
1589 | * Truncate blocks that were not used by write. We have to truncate the | ||
1590 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
1591 | */ | ||
1592 | static void ext4_truncate_failed_write(struct inode *inode) | ||
1593 | { | ||
1594 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
1595 | ext4_truncate(inode); | ||
1596 | } | ||
1597 | |||
1598 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, | ||
1599 | struct buffer_head *bh_result, int create); | ||
1456 | static int ext4_write_begin(struct file *file, struct address_space *mapping, | 1600 | static int ext4_write_begin(struct file *file, struct address_space *mapping, |
1457 | loff_t pos, unsigned len, unsigned flags, | 1601 | loff_t pos, unsigned len, unsigned flags, |
1458 | struct page **pagep, void **fsdata) | 1602 | struct page **pagep, void **fsdata) |
@@ -1494,8 +1638,10 @@ retry: | |||
1494 | } | 1638 | } |
1495 | *pagep = page; | 1639 | *pagep = page; |
1496 | 1640 | ||
1497 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 1641 | if (ext4_should_dioread_nolock(inode)) |
1498 | ext4_get_block); | 1642 | ret = __block_write_begin(page, pos, len, ext4_get_block_write); |
1643 | else | ||
1644 | ret = __block_write_begin(page, pos, len, ext4_get_block); | ||
1499 | 1645 | ||
1500 | if (!ret && ext4_should_journal_data(inode)) { | 1646 | if (!ret && ext4_should_journal_data(inode)) { |
1501 | ret = walk_page_buffers(handle, page_buffers(page), | 1647 | ret = walk_page_buffers(handle, page_buffers(page), |
@@ -1506,21 +1652,21 @@ retry: | |||
1506 | unlock_page(page); | 1652 | unlock_page(page); |
1507 | page_cache_release(page); | 1653 | page_cache_release(page); |
1508 | /* | 1654 | /* |
1509 | * block_write_begin may have instantiated a few blocks | 1655 | * __block_write_begin may have instantiated a few blocks |
1510 | * outside i_size. Trim these off again. Don't need | 1656 | * outside i_size. Trim these off again. Don't need |
1511 | * i_size_read because we hold i_mutex. | 1657 | * i_size_read because we hold i_mutex. |
1512 | * | 1658 | * |
1513 | * Add inode to orphan list in case we crash before | 1659 | * Add inode to orphan list in case we crash before |
1514 | * truncate finishes | 1660 | * truncate finishes |
1515 | */ | 1661 | */ |
1516 | if (pos + len > inode->i_size) | 1662 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1517 | ext4_orphan_add(handle, inode); | 1663 | ext4_orphan_add(handle, inode); |
1518 | 1664 | ||
1519 | ext4_journal_stop(handle); | 1665 | ext4_journal_stop(handle); |
1520 | if (pos + len > inode->i_size) { | 1666 | if (pos + len > inode->i_size) { |
1521 | vmtruncate(inode, inode->i_size); | 1667 | ext4_truncate_failed_write(inode); |
1522 | /* | 1668 | /* |
1523 | * If vmtruncate failed early the inode might | 1669 | * If truncate failed early the inode might |
1524 | * still be on the orphan list; we need to | 1670 | * still be on the orphan list; we need to |
1525 | * make sure the inode is removed from the | 1671 | * make sure the inode is removed from the |
1526 | * orphan list in that case. | 1672 | * orphan list in that case. |
@@ -1614,7 +1760,7 @@ static int ext4_ordered_write_end(struct file *file, | |||
1614 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | 1760 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, |
1615 | page, fsdata); | 1761 | page, fsdata); |
1616 | copied = ret2; | 1762 | copied = ret2; |
1617 | if (pos + len > inode->i_size) | 1763 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1618 | /* if we have allocated more blocks and copied | 1764 | /* if we have allocated more blocks and copied |
1619 | * less. We will have blocks allocated outside | 1765 | * less. We will have blocks allocated outside |
1620 | * inode->i_size. So truncate them | 1766 | * inode->i_size. So truncate them |
@@ -1628,9 +1774,9 @@ static int ext4_ordered_write_end(struct file *file, | |||
1628 | ret = ret2; | 1774 | ret = ret2; |
1629 | 1775 | ||
1630 | if (pos + len > inode->i_size) { | 1776 | if (pos + len > inode->i_size) { |
1631 | vmtruncate(inode, inode->i_size); | 1777 | ext4_truncate_failed_write(inode); |
1632 | /* | 1778 | /* |
1633 | * If vmtruncate failed early the inode might still be | 1779 | * If truncate failed early the inode might still be |
1634 | * on the orphan list; we need to make sure the inode | 1780 | * on the orphan list; we need to make sure the inode |
1635 | * is removed from the orphan list in that case. | 1781 | * is removed from the orphan list in that case. |
1636 | */ | 1782 | */ |
@@ -1655,7 +1801,7 @@ static int ext4_writeback_write_end(struct file *file, | |||
1655 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | 1801 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, |
1656 | page, fsdata); | 1802 | page, fsdata); |
1657 | copied = ret2; | 1803 | copied = ret2; |
1658 | if (pos + len > inode->i_size) | 1804 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1659 | /* if we have allocated more blocks and copied | 1805 | /* if we have allocated more blocks and copied |
1660 | * less. We will have blocks allocated outside | 1806 | * less. We will have blocks allocated outside |
1661 | * inode->i_size. So truncate them | 1807 | * inode->i_size. So truncate them |
@@ -1670,9 +1816,9 @@ static int ext4_writeback_write_end(struct file *file, | |||
1670 | ret = ret2; | 1816 | ret = ret2; |
1671 | 1817 | ||
1672 | if (pos + len > inode->i_size) { | 1818 | if (pos + len > inode->i_size) { |
1673 | vmtruncate(inode, inode->i_size); | 1819 | ext4_truncate_failed_write(inode); |
1674 | /* | 1820 | /* |
1675 | * If vmtruncate failed early the inode might still be | 1821 | * If truncate failed early the inode might still be |
1676 | * on the orphan list; we need to make sure the inode | 1822 | * on the orphan list; we need to make sure the inode |
1677 | * is removed from the orphan list in that case. | 1823 | * is removed from the orphan list in that case. |
1678 | */ | 1824 | */ |
@@ -1712,7 +1858,7 @@ static int ext4_journalled_write_end(struct file *file, | |||
1712 | new_i_size = pos + copied; | 1858 | new_i_size = pos + copied; |
1713 | if (new_i_size > inode->i_size) | 1859 | if (new_i_size > inode->i_size) |
1714 | i_size_write(inode, pos+copied); | 1860 | i_size_write(inode, pos+copied); |
1715 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | 1861 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
1716 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 1862 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
1717 | ext4_update_i_disksize(inode, new_i_size); | 1863 | ext4_update_i_disksize(inode, new_i_size); |
1718 | ret2 = ext4_mark_inode_dirty(handle, inode); | 1864 | ret2 = ext4_mark_inode_dirty(handle, inode); |
@@ -1722,7 +1868,7 @@ static int ext4_journalled_write_end(struct file *file, | |||
1722 | 1868 | ||
1723 | unlock_page(page); | 1869 | unlock_page(page); |
1724 | page_cache_release(page); | 1870 | page_cache_release(page); |
1725 | if (pos + len > inode->i_size) | 1871 | if (pos + len > inode->i_size && ext4_can_truncate(inode)) |
1726 | /* if we have allocated more blocks and copied | 1872 | /* if we have allocated more blocks and copied |
1727 | * less. We will have blocks allocated outside | 1873 | * less. We will have blocks allocated outside |
1728 | * inode->i_size. So truncate them | 1874 | * inode->i_size. So truncate them |
@@ -1733,9 +1879,9 @@ static int ext4_journalled_write_end(struct file *file, | |||
1733 | if (!ret) | 1879 | if (!ret) |
1734 | ret = ret2; | 1880 | ret = ret2; |
1735 | if (pos + len > inode->i_size) { | 1881 | if (pos + len > inode->i_size) { |
1736 | vmtruncate(inode, inode->i_size); | 1882 | ext4_truncate_failed_write(inode); |
1737 | /* | 1883 | /* |
1738 | * If vmtruncate failed early the inode might still be | 1884 | * If truncate failed early the inode might still be |
1739 | * on the orphan list; we need to make sure the inode | 1885 | * on the orphan list; we need to make sure the inode |
1740 | * is removed from the orphan list in that case. | 1886 | * is removed from the orphan list in that case. |
1741 | */ | 1887 | */ |
@@ -1746,11 +1892,16 @@ static int ext4_journalled_write_end(struct file *file, | |||
1746 | return ret ? ret : copied; | 1892 | return ret ? ret : copied; |
1747 | } | 1893 | } |
1748 | 1894 | ||
1749 | static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | 1895 | /* |
1896 | * Reserve a single block located at lblock | ||
1897 | */ | ||
1898 | static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) | ||
1750 | { | 1899 | { |
1751 | int retries = 0; | 1900 | int retries = 0; |
1752 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1901 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1753 | unsigned long md_needed, mdblocks, total = 0; | 1902 | struct ext4_inode_info *ei = EXT4_I(inode); |
1903 | unsigned long md_needed; | ||
1904 | int ret; | ||
1754 | 1905 | ||
1755 | /* | 1906 | /* |
1756 | * recalculate the amount of metadata blocks to reserve | 1907 | * recalculate the amount of metadata blocks to reserve |
@@ -1758,86 +1909,84 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | |||
1758 | * worse case is one extent per block | 1909 | * worse case is one extent per block |
1759 | */ | 1910 | */ |
1760 | repeat: | 1911 | repeat: |
1761 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1912 | spin_lock(&ei->i_block_reservation_lock); |
1762 | total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; | 1913 | md_needed = ext4_calc_metadata_amount(inode, lblock); |
1763 | mdblocks = ext4_calc_metadata_amount(inode, total); | 1914 | trace_ext4_da_reserve_space(inode, md_needed); |
1764 | BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); | 1915 | spin_unlock(&ei->i_block_reservation_lock); |
1765 | |||
1766 | md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; | ||
1767 | total = md_needed + nrblocks; | ||
1768 | 1916 | ||
1769 | /* | 1917 | /* |
1770 | * Make quota reservation here to prevent quota overflow | 1918 | * We will charge metadata quota at writeout time; this saves |
1771 | * later. Real quota accounting is done at pages writeout | 1919 | * us from metadata over-estimation, though we may go over by |
1772 | * time. | 1920 | * a small amount in the end. Here we just reserve for data. |
1773 | */ | 1921 | */ |
1774 | if (vfs_dq_reserve_block(inode, total)) { | 1922 | ret = dquot_reserve_block(inode, 1); |
1775 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1923 | if (ret) |
1776 | return -EDQUOT; | 1924 | return ret; |
1777 | } | 1925 | /* |
1778 | 1926 | * We do still charge estimated metadata to the sb though; | |
1779 | if (ext4_claim_free_blocks(sbi, total)) { | 1927 | * we cannot afford to run out of free blocks. |
1780 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1928 | */ |
1929 | if (ext4_claim_free_blocks(sbi, md_needed + 1)) { | ||
1930 | dquot_release_reservation_block(inode, 1); | ||
1781 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | 1931 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { |
1782 | yield(); | 1932 | yield(); |
1783 | goto repeat; | 1933 | goto repeat; |
1784 | } | 1934 | } |
1785 | vfs_dq_release_reservation_block(inode, total); | ||
1786 | return -ENOSPC; | 1935 | return -ENOSPC; |
1787 | } | 1936 | } |
1788 | EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | 1937 | spin_lock(&ei->i_block_reservation_lock); |
1789 | EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; | 1938 | ei->i_reserved_data_blocks++; |
1939 | ei->i_reserved_meta_blocks += md_needed; | ||
1940 | spin_unlock(&ei->i_block_reservation_lock); | ||
1790 | 1941 | ||
1791 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1792 | return 0; /* success */ | 1942 | return 0; /* success */ |
1793 | } | 1943 | } |
1794 | 1944 | ||
1795 | static void ext4_da_release_space(struct inode *inode, int to_free) | 1945 | static void ext4_da_release_space(struct inode *inode, int to_free) |
1796 | { | 1946 | { |
1797 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1947 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1798 | int total, mdb, mdb_free, release; | 1948 | struct ext4_inode_info *ei = EXT4_I(inode); |
1799 | 1949 | ||
1800 | if (!to_free) | 1950 | if (!to_free) |
1801 | return; /* Nothing to release, exit */ | 1951 | return; /* Nothing to release, exit */ |
1802 | 1952 | ||
1803 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1953 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
1804 | 1954 | ||
1805 | if (!EXT4_I(inode)->i_reserved_data_blocks) { | 1955 | trace_ext4_da_release_space(inode, to_free); |
1956 | if (unlikely(to_free > ei->i_reserved_data_blocks)) { | ||
1806 | /* | 1957 | /* |
1807 | * if there is no reserved blocks, but we try to free some | 1958 | * if there aren't enough reserved blocks, then the |
1808 | * then the counter is messed up somewhere. | 1959 | * counter is messed up somewhere. Since this |
1809 | * but since this function is called from invalidate | 1960 | * function is called from invalidate page, it's |
1810 | * page, it's harmless to return without any action | 1961 | * harmless to return without any action. |
1811 | */ | 1962 | */ |
1812 | printk(KERN_INFO "ext4 delalloc try to release %d reserved " | 1963 | ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " |
1813 | "blocks for inode %lu, but there is no reserved " | 1964 | "ino %lu, to_free %d with only %d reserved " |
1814 | "data blocks\n", to_free, inode->i_ino); | 1965 | "data blocks\n", inode->i_ino, to_free, |
1815 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1966 | ei->i_reserved_data_blocks); |
1816 | return; | 1967 | WARN_ON(1); |
1968 | to_free = ei->i_reserved_data_blocks; | ||
1817 | } | 1969 | } |
1970 | ei->i_reserved_data_blocks -= to_free; | ||
1818 | 1971 | ||
1819 | /* recalculate the number of metablocks still need to be reserved */ | 1972 | if (ei->i_reserved_data_blocks == 0) { |
1820 | total = EXT4_I(inode)->i_reserved_data_blocks - to_free; | 1973 | /* |
1821 | mdb = ext4_calc_metadata_amount(inode, total); | 1974 | * We can release all of the reserved metadata blocks |
1822 | 1975 | * only when we have written all of the delayed | |
1823 | /* figure out how many metablocks to release */ | 1976 | * allocation blocks. |
1824 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | 1977 | */ |
1825 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | 1978 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, |
1826 | 1979 | ei->i_reserved_meta_blocks); | |
1827 | release = to_free + mdb_free; | 1980 | ei->i_reserved_meta_blocks = 0; |
1828 | 1981 | ei->i_da_metadata_calc_len = 0; | |
1829 | /* update fs dirty blocks counter for truncate case */ | 1982 | } |
1830 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); | ||
1831 | 1983 | ||
1832 | /* update per-inode reservations */ | 1984 | /* update fs dirty data blocks counter */ |
1833 | BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); | 1985 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); |
1834 | EXT4_I(inode)->i_reserved_data_blocks -= to_free; | ||
1835 | 1986 | ||
1836 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
1837 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | ||
1838 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1987 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1839 | 1988 | ||
1840 | vfs_dq_release_reservation_block(inode, release); | 1989 | dquot_release_reservation_block(inode, to_free); |
1841 | } | 1990 | } |
1842 | 1991 | ||
1843 | static void ext4_da_page_release_reservation(struct page *page, | 1992 | static void ext4_da_page_release_reservation(struct page *page, |
@@ -1865,18 +2014,6 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1865 | * Delayed allocation stuff | 2014 | * Delayed allocation stuff |
1866 | */ | 2015 | */ |
1867 | 2016 | ||
1868 | struct mpage_da_data { | ||
1869 | struct inode *inode; | ||
1870 | sector_t b_blocknr; /* start block number of extent */ | ||
1871 | size_t b_size; /* size of extent */ | ||
1872 | unsigned long b_state; /* state of the extent */ | ||
1873 | unsigned long first_page, next_page; /* extent of pages */ | ||
1874 | struct writeback_control *wbc; | ||
1875 | int io_done; | ||
1876 | int pages_written; | ||
1877 | int retval; | ||
1878 | }; | ||
1879 | |||
1880 | /* | 2017 | /* |
1881 | * mpage_da_submit_io - walks through extent of pages and try to write | 2018 | * mpage_da_submit_io - walks through extent of pages and try to write |
1882 | * them with writepage() call back | 2019 | * them with writepage() call back |
@@ -1890,16 +2027,23 @@ struct mpage_da_data { | |||
1890 | * | 2027 | * |
1891 | * As pages are already locked by write_cache_pages(), we can't use it | 2028 | * As pages are already locked by write_cache_pages(), we can't use it |
1892 | */ | 2029 | */ |
1893 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | 2030 | static int mpage_da_submit_io(struct mpage_da_data *mpd, |
2031 | struct ext4_map_blocks *map) | ||
1894 | { | 2032 | { |
1895 | long pages_skipped; | ||
1896 | struct pagevec pvec; | 2033 | struct pagevec pvec; |
1897 | unsigned long index, end; | 2034 | unsigned long index, end; |
1898 | int ret = 0, err, nr_pages, i; | 2035 | int ret = 0, err, nr_pages, i; |
1899 | struct inode *inode = mpd->inode; | 2036 | struct inode *inode = mpd->inode; |
1900 | struct address_space *mapping = inode->i_mapping; | 2037 | struct address_space *mapping = inode->i_mapping; |
2038 | loff_t size = i_size_read(inode); | ||
2039 | unsigned int len, block_start; | ||
2040 | struct buffer_head *bh, *page_bufs = NULL; | ||
2041 | int journal_data = ext4_should_journal_data(inode); | ||
2042 | sector_t pblock = 0, cur_logical = 0; | ||
2043 | struct ext4_io_submit io_submit; | ||
1901 | 2044 | ||
1902 | BUG_ON(mpd->next_page <= mpd->first_page); | 2045 | BUG_ON(mpd->next_page <= mpd->first_page); |
2046 | memset(&io_submit, 0, sizeof(io_submit)); | ||
1903 | /* | 2047 | /* |
1904 | * We need to start from the first_page to the next_page - 1 | 2048 | * We need to start from the first_page to the next_page - 1 |
1905 | * to make sure we also write the mapped dirty buffer_heads. | 2049 | * to make sure we also write the mapped dirty buffer_heads. |
@@ -1915,139 +2059,109 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | |||
1915 | if (nr_pages == 0) | 2059 | if (nr_pages == 0) |
1916 | break; | 2060 | break; |
1917 | for (i = 0; i < nr_pages; i++) { | 2061 | for (i = 0; i < nr_pages; i++) { |
2062 | int commit_write = 0, redirty_page = 0; | ||
1918 | struct page *page = pvec.pages[i]; | 2063 | struct page *page = pvec.pages[i]; |
1919 | 2064 | ||
1920 | index = page->index; | 2065 | index = page->index; |
1921 | if (index > end) | 2066 | if (index > end) |
1922 | break; | 2067 | break; |
2068 | |||
2069 | if (index == size >> PAGE_CACHE_SHIFT) | ||
2070 | len = size & ~PAGE_CACHE_MASK; | ||
2071 | else | ||
2072 | len = PAGE_CACHE_SIZE; | ||
2073 | if (map) { | ||
2074 | cur_logical = index << (PAGE_CACHE_SHIFT - | ||
2075 | inode->i_blkbits); | ||
2076 | pblock = map->m_pblk + (cur_logical - | ||
2077 | map->m_lblk); | ||
2078 | } | ||
1923 | index++; | 2079 | index++; |
1924 | 2080 | ||
1925 | BUG_ON(!PageLocked(page)); | 2081 | BUG_ON(!PageLocked(page)); |
1926 | BUG_ON(PageWriteback(page)); | 2082 | BUG_ON(PageWriteback(page)); |
1927 | 2083 | ||
1928 | pages_skipped = mpd->wbc->pages_skipped; | ||
1929 | err = mapping->a_ops->writepage(page, mpd->wbc); | ||
1930 | if (!err && (pages_skipped == mpd->wbc->pages_skipped)) | ||
1931 | /* | ||
1932 | * have successfully written the page | ||
1933 | * without skipping the same | ||
1934 | */ | ||
1935 | mpd->pages_written++; | ||
1936 | /* | 2084 | /* |
1937 | * In error case, we have to continue because | 2085 | * If the page does not have buffers (for |
1938 | * remaining pages are still locked | 2086 | * whatever reason), try to create them using |
1939 | * XXX: unlock and re-dirty them? | 2087 | * __block_write_begin. If this fails, |
2088 | * redirty the page and move on. | ||
1940 | */ | 2089 | */ |
1941 | if (ret == 0) | 2090 | if (!page_has_buffers(page)) { |
1942 | ret = err; | 2091 | if (__block_write_begin(page, 0, len, |
1943 | } | 2092 | noalloc_get_block_write)) { |
1944 | pagevec_release(&pvec); | 2093 | redirty_page: |
1945 | } | 2094 | redirty_page_for_writepage(mpd->wbc, |
1946 | return ret; | 2095 | page); |
1947 | } | 2096 | unlock_page(page); |
1948 | 2097 | continue; | |
1949 | /* | 2098 | } |
1950 | * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers | 2099 | commit_write = 1; |
1951 | * | 2100 | } |
1952 | * @mpd->inode - inode to walk through | ||
1953 | * @exbh->b_blocknr - first block on a disk | ||
1954 | * @exbh->b_size - amount of space in bytes | ||
1955 | * @logical - first logical block to start assignment with | ||
1956 | * | ||
1957 | * the function goes through all passed space and put actual disk | ||
1958 | * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten | ||
1959 | */ | ||
1960 | static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | ||
1961 | struct buffer_head *exbh) | ||
1962 | { | ||
1963 | struct inode *inode = mpd->inode; | ||
1964 | struct address_space *mapping = inode->i_mapping; | ||
1965 | int blocks = exbh->b_size >> inode->i_blkbits; | ||
1966 | sector_t pblock = exbh->b_blocknr, cur_logical; | ||
1967 | struct buffer_head *head, *bh; | ||
1968 | pgoff_t index, end; | ||
1969 | struct pagevec pvec; | ||
1970 | int nr_pages, i; | ||
1971 | |||
1972 | index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1973 | end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1974 | cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1975 | |||
1976 | pagevec_init(&pvec, 0); | ||
1977 | |||
1978 | while (index <= end) { | ||
1979 | /* XXX: optimize tail */ | ||
1980 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
1981 | if (nr_pages == 0) | ||
1982 | break; | ||
1983 | for (i = 0; i < nr_pages; i++) { | ||
1984 | struct page *page = pvec.pages[i]; | ||
1985 | |||
1986 | index = page->index; | ||
1987 | if (index > end) | ||
1988 | break; | ||
1989 | index++; | ||
1990 | |||
1991 | BUG_ON(!PageLocked(page)); | ||
1992 | BUG_ON(PageWriteback(page)); | ||
1993 | BUG_ON(!page_has_buffers(page)); | ||
1994 | |||
1995 | bh = page_buffers(page); | ||
1996 | head = bh; | ||
1997 | |||
1998 | /* skip blocks out of the range */ | ||
1999 | do { | ||
2000 | if (cur_logical >= logical) | ||
2001 | break; | ||
2002 | cur_logical++; | ||
2003 | } while ((bh = bh->b_this_page) != head); | ||
2004 | 2101 | ||
2102 | bh = page_bufs = page_buffers(page); | ||
2103 | block_start = 0; | ||
2005 | do { | 2104 | do { |
2006 | if (cur_logical >= logical + blocks) | 2105 | if (!bh) |
2007 | break; | 2106 | goto redirty_page; |
2008 | 2107 | if (map && (cur_logical >= map->m_lblk) && | |
2009 | if (buffer_delay(bh) || | 2108 | (cur_logical <= (map->m_lblk + |
2010 | buffer_unwritten(bh)) { | 2109 | (map->m_len - 1)))) { |
2011 | |||
2012 | BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); | ||
2013 | |||
2014 | if (buffer_delay(bh)) { | 2110 | if (buffer_delay(bh)) { |
2015 | clear_buffer_delay(bh); | 2111 | clear_buffer_delay(bh); |
2016 | bh->b_blocknr = pblock; | 2112 | bh->b_blocknr = pblock; |
2017 | } else { | ||
2018 | /* | ||
2019 | * unwritten already should have | ||
2020 | * blocknr assigned. Verify that | ||
2021 | */ | ||
2022 | clear_buffer_unwritten(bh); | ||
2023 | BUG_ON(bh->b_blocknr != pblock); | ||
2024 | } | 2113 | } |
2114 | if (buffer_unwritten(bh) || | ||
2115 | buffer_mapped(bh)) | ||
2116 | BUG_ON(bh->b_blocknr != pblock); | ||
2117 | if (map->m_flags & EXT4_MAP_UNINIT) | ||
2118 | set_buffer_uninit(bh); | ||
2119 | clear_buffer_unwritten(bh); | ||
2120 | } | ||
2025 | 2121 | ||
2026 | } else if (buffer_mapped(bh)) | 2122 | /* redirty page if block allocation undone */ |
2027 | BUG_ON(bh->b_blocknr != pblock); | 2123 | if (buffer_delay(bh) || buffer_unwritten(bh)) |
2028 | 2124 | redirty_page = 1; | |
2125 | bh = bh->b_this_page; | ||
2126 | block_start += bh->b_size; | ||
2029 | cur_logical++; | 2127 | cur_logical++; |
2030 | pblock++; | 2128 | pblock++; |
2031 | } while ((bh = bh->b_this_page) != head); | 2129 | } while (bh != page_bufs); |
2032 | } | ||
2033 | pagevec_release(&pvec); | ||
2034 | } | ||
2035 | } | ||
2036 | 2130 | ||
2131 | if (redirty_page) | ||
2132 | goto redirty_page; | ||
2037 | 2133 | ||
2038 | /* | 2134 | if (commit_write) |
2039 | * __unmap_underlying_blocks - just a helper function to unmap | 2135 | /* mark the buffer_heads as dirty & uptodate */ |
2040 | * set of blocks described by @bh | 2136 | block_commit_write(page, 0, len); |
2041 | */ | ||
2042 | static inline void __unmap_underlying_blocks(struct inode *inode, | ||
2043 | struct buffer_head *bh) | ||
2044 | { | ||
2045 | struct block_device *bdev = inode->i_sb->s_bdev; | ||
2046 | int blocks, i; | ||
2047 | 2137 | ||
2048 | blocks = bh->b_size >> inode->i_blkbits; | 2138 | /* |
2049 | for (i = 0; i < blocks; i++) | 2139 | * Delalloc doesn't support data journalling, |
2050 | unmap_underlying_metadata(bdev, bh->b_blocknr + i); | 2140 | * but eventually maybe we'll lift this |
2141 | * restriction. | ||
2142 | */ | ||
2143 | if (unlikely(journal_data && PageChecked(page))) | ||
2144 | err = __ext4_journalled_writepage(page, len); | ||
2145 | else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) | ||
2146 | err = ext4_bio_write_page(&io_submit, page, | ||
2147 | len, mpd->wbc); | ||
2148 | else | ||
2149 | err = block_write_full_page(page, | ||
2150 | noalloc_get_block_write, mpd->wbc); | ||
2151 | |||
2152 | if (!err) | ||
2153 | mpd->pages_written++; | ||
2154 | /* | ||
2155 | * In error case, we have to continue because | ||
2156 | * remaining pages are still locked | ||
2157 | */ | ||
2158 | if (ret == 0) | ||
2159 | ret = err; | ||
2160 | } | ||
2161 | pagevec_release(&pvec); | ||
2162 | } | ||
2163 | ext4_io_submit(&io_submit); | ||
2164 | return ret; | ||
2051 | } | 2165 | } |
2052 | 2166 | ||
2053 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, | 2167 | static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, |
@@ -2068,17 +2182,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, | |||
2068 | break; | 2182 | break; |
2069 | for (i = 0; i < nr_pages; i++) { | 2183 | for (i = 0; i < nr_pages; i++) { |
2070 | struct page *page = pvec.pages[i]; | 2184 | struct page *page = pvec.pages[i]; |
2071 | index = page->index; | 2185 | if (page->index > end) |
2072 | if (index > end) | ||
2073 | break; | 2186 | break; |
2074 | index++; | ||
2075 | |||
2076 | BUG_ON(!PageLocked(page)); | 2187 | BUG_ON(!PageLocked(page)); |
2077 | BUG_ON(PageWriteback(page)); | 2188 | BUG_ON(PageWriteback(page)); |
2078 | block_invalidatepage(page, 0); | 2189 | block_invalidatepage(page, 0); |
2079 | ClearPageUptodate(page); | 2190 | ClearPageUptodate(page); |
2080 | unlock_page(page); | 2191 | unlock_page(page); |
2081 | } | 2192 | } |
2193 | index = pvec.pages[nr_pages - 1]->index + 1; | ||
2194 | pagevec_release(&pvec); | ||
2082 | } | 2195 | } |
2083 | return; | 2196 | return; |
2084 | } | 2197 | } |
@@ -2086,57 +2199,54 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, | |||
2086 | static void ext4_print_free_blocks(struct inode *inode) | 2199 | static void ext4_print_free_blocks(struct inode *inode) |
2087 | { | 2200 | { |
2088 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2201 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
2089 | printk(KERN_EMERG "Total free blocks count %lld\n", | 2202 | printk(KERN_CRIT "Total free blocks count %lld\n", |
2090 | ext4_count_free_blocks(inode->i_sb)); | 2203 | ext4_count_free_blocks(inode->i_sb)); |
2091 | printk(KERN_EMERG "Free/Dirty block details\n"); | 2204 | printk(KERN_CRIT "Free/Dirty block details\n"); |
2092 | printk(KERN_EMERG "free_blocks=%lld\n", | 2205 | printk(KERN_CRIT "free_blocks=%lld\n", |
2093 | (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); | 2206 | (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); |
2094 | printk(KERN_EMERG "dirty_blocks=%lld\n", | 2207 | printk(KERN_CRIT "dirty_blocks=%lld\n", |
2095 | (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); | 2208 | (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); |
2096 | printk(KERN_EMERG "Block reservation details\n"); | 2209 | printk(KERN_CRIT "Block reservation details\n"); |
2097 | printk(KERN_EMERG "i_reserved_data_blocks=%u\n", | 2210 | printk(KERN_CRIT "i_reserved_data_blocks=%u\n", |
2098 | EXT4_I(inode)->i_reserved_data_blocks); | 2211 | EXT4_I(inode)->i_reserved_data_blocks); |
2099 | printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", | 2212 | printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", |
2100 | EXT4_I(inode)->i_reserved_meta_blocks); | 2213 | EXT4_I(inode)->i_reserved_meta_blocks); |
2101 | return; | 2214 | return; |
2102 | } | 2215 | } |
2103 | 2216 | ||
2104 | /* | 2217 | /* |
2105 | * mpage_da_map_blocks - go through given space | 2218 | * mpage_da_map_and_submit - go through given space, map them |
2219 | * if necessary, and then submit them for I/O | ||
2106 | * | 2220 | * |
2107 | * @mpd - bh describing space | 2221 | * @mpd - bh describing space |
2108 | * | 2222 | * |
2109 | * The function skips space we know is already mapped to disk blocks. | 2223 | * The function skips space we know is already mapped to disk blocks. |
2110 | * | 2224 | * |
2111 | */ | 2225 | */ |
2112 | static int mpage_da_map_blocks(struct mpage_da_data *mpd) | 2226 | static void mpage_da_map_and_submit(struct mpage_da_data *mpd) |
2113 | { | 2227 | { |
2114 | int err, blks, get_blocks_flags; | 2228 | int err, blks, get_blocks_flags; |
2115 | struct buffer_head new; | 2229 | struct ext4_map_blocks map, *mapp = NULL; |
2116 | sector_t next = mpd->b_blocknr; | 2230 | sector_t next = mpd->b_blocknr; |
2117 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; | 2231 | unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; |
2118 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; | 2232 | loff_t disksize = EXT4_I(mpd->inode)->i_disksize; |
2119 | handle_t *handle = NULL; | 2233 | handle_t *handle = NULL; |
2120 | 2234 | ||
2121 | /* | 2235 | /* |
2122 | * We consider only non-mapped and non-allocated blocks | 2236 | * If the blocks are mapped already, or we couldn't accumulate |
2237 | * any blocks, then proceed immediately to the submission stage. | ||
2123 | */ | 2238 | */ |
2124 | if ((mpd->b_state & (1 << BH_Mapped)) && | 2239 | if ((mpd->b_size == 0) || |
2125 | !(mpd->b_state & (1 << BH_Delay)) && | 2240 | ((mpd->b_state & (1 << BH_Mapped)) && |
2126 | !(mpd->b_state & (1 << BH_Unwritten))) | 2241 | !(mpd->b_state & (1 << BH_Delay)) && |
2127 | return 0; | 2242 | !(mpd->b_state & (1 << BH_Unwritten)))) |
2128 | 2243 | goto submit_io; | |
2129 | /* | ||
2130 | * If we didn't accumulate anything to write simply return | ||
2131 | */ | ||
2132 | if (!mpd->b_size) | ||
2133 | return 0; | ||
2134 | 2244 | ||
2135 | handle = ext4_journal_current_handle(); | 2245 | handle = ext4_journal_current_handle(); |
2136 | BUG_ON(!handle); | 2246 | BUG_ON(!handle); |
2137 | 2247 | ||
2138 | /* | 2248 | /* |
2139 | * Call ext4_get_blocks() to allocate any delayed allocation | 2249 | * Call ext4_map_blocks() to allocate any delayed allocation |
2140 | * blocks, or to convert an uninitialized extent to be | 2250 | * blocks, or to convert an uninitialized extent to be |
2141 | * initialized (in the case where we have written into | 2251 | * initialized (in the case where we have written into |
2142 | * one or more preallocated blocks). | 2252 | * one or more preallocated blocks). |
@@ -2145,35 +2255,40 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
2145 | * indicate that we are on the delayed allocation path. This | 2255 | * indicate that we are on the delayed allocation path. This |
2146 | * affects functions in many different parts of the allocation | 2256 | * affects functions in many different parts of the allocation |
2147 | * call path. This flag exists primarily because we don't | 2257 | * call path. This flag exists primarily because we don't |
2148 | * want to change *many* call functions, so ext4_get_blocks() | 2258 | * want to change *many* call functions, so ext4_map_blocks() |
2149 | * will set the magic i_delalloc_reserved_flag once the | 2259 | * will set the EXT4_STATE_DELALLOC_RESERVED flag once the |
2150 | * inode's allocation semaphore is taken. | 2260 | * inode's allocation semaphore is taken. |
2151 | * | 2261 | * |
2152 | * If the blocks in questions were delalloc blocks, set | 2262 | * If the blocks in questions were delalloc blocks, set |
2153 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting | 2263 | * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting |
2154 | * variables are updated after the blocks have been allocated. | 2264 | * variables are updated after the blocks have been allocated. |
2155 | */ | 2265 | */ |
2156 | new.b_state = 0; | 2266 | map.m_lblk = next; |
2157 | get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | | 2267 | map.m_len = max_blocks; |
2158 | EXT4_GET_BLOCKS_DELALLOC_RESERVE); | 2268 | get_blocks_flags = EXT4_GET_BLOCKS_CREATE; |
2269 | if (ext4_should_dioread_nolock(mpd->inode)) | ||
2270 | get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||
2159 | if (mpd->b_state & (1 << BH_Delay)) | 2271 | if (mpd->b_state & (1 << BH_Delay)) |
2160 | get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; | 2272 | get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; |
2161 | blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, | 2273 | |
2162 | &new, get_blocks_flags); | 2274 | blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); |
2163 | if (blks < 0) { | 2275 | if (blks < 0) { |
2276 | struct super_block *sb = mpd->inode->i_sb; | ||
2277 | |||
2164 | err = blks; | 2278 | err = blks; |
2165 | /* | 2279 | /* |
2166 | * If get block returns with error we simply | 2280 | * If get block returns EAGAIN or ENOSPC and there |
2167 | * return. Later writepage will redirty the page and | 2281 | * appears to be free blocks we will call |
2168 | * writepages will find the dirty page again | 2282 | * ext4_writepage() for all of the pages which will |
2283 | * just redirty the pages. | ||
2169 | */ | 2284 | */ |
2170 | if (err == -EAGAIN) | 2285 | if (err == -EAGAIN) |
2171 | return 0; | 2286 | goto submit_io; |
2172 | 2287 | ||
2173 | if (err == -ENOSPC && | 2288 | if (err == -ENOSPC && |
2174 | ext4_count_free_blocks(mpd->inode->i_sb)) { | 2289 | ext4_count_free_blocks(sb)) { |
2175 | mpd->retval = err; | 2290 | mpd->retval = err; |
2176 | return 0; | 2291 | goto submit_io; |
2177 | } | 2292 | } |
2178 | 2293 | ||
2179 | /* | 2294 | /* |
@@ -2183,41 +2298,39 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
2183 | * writepage and writepages will again try to write | 2298 | * writepage and writepages will again try to write |
2184 | * the same. | 2299 | * the same. |
2185 | */ | 2300 | */ |
2186 | printk(KERN_EMERG "%s block allocation failed for inode %lu " | 2301 | if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { |
2187 | "at logical offset %llu with max blocks " | 2302 | ext4_msg(sb, KERN_CRIT, |
2188 | "%zd with error %d\n", | 2303 | "delayed block allocation failed for inode %lu " |
2189 | __func__, mpd->inode->i_ino, | 2304 | "at logical offset %llu with max blocks %zd " |
2190 | (unsigned long long)next, | 2305 | "with error %d", mpd->inode->i_ino, |
2191 | mpd->b_size >> mpd->inode->i_blkbits, err); | 2306 | (unsigned long long) next, |
2192 | printk(KERN_EMERG "This should not happen.!! " | 2307 | mpd->b_size >> mpd->inode->i_blkbits, err); |
2193 | "Data will be lost\n"); | 2308 | ext4_msg(sb, KERN_CRIT, |
2194 | if (err == -ENOSPC) { | 2309 | "This should not happen!! Data will be lost\n"); |
2195 | ext4_print_free_blocks(mpd->inode); | 2310 | if (err == -ENOSPC) |
2311 | ext4_print_free_blocks(mpd->inode); | ||
2196 | } | 2312 | } |
2197 | /* invalidate all the pages */ | 2313 | /* invalidate all the pages */ |
2198 | ext4_da_block_invalidatepages(mpd, next, | 2314 | ext4_da_block_invalidatepages(mpd, next, |
2199 | mpd->b_size >> mpd->inode->i_blkbits); | 2315 | mpd->b_size >> mpd->inode->i_blkbits); |
2200 | return err; | 2316 | return; |
2201 | } | 2317 | } |
2202 | BUG_ON(blks == 0); | 2318 | BUG_ON(blks == 0); |
2203 | 2319 | ||
2204 | new.b_size = (blks << mpd->inode->i_blkbits); | 2320 | mapp = &map; |
2321 | if (map.m_flags & EXT4_MAP_NEW) { | ||
2322 | struct block_device *bdev = mpd->inode->i_sb->s_bdev; | ||
2323 | int i; | ||
2205 | 2324 | ||
2206 | if (buffer_new(&new)) | 2325 | for (i = 0; i < map.m_len; i++) |
2207 | __unmap_underlying_blocks(mpd->inode, &new); | 2326 | unmap_underlying_metadata(bdev, map.m_pblk + i); |
2208 | 2327 | } | |
2209 | /* | ||
2210 | * If blocks are delayed marked, we need to | ||
2211 | * put actual blocknr and drop delayed bit | ||
2212 | */ | ||
2213 | if ((mpd->b_state & (1 << BH_Delay)) || | ||
2214 | (mpd->b_state & (1 << BH_Unwritten))) | ||
2215 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
2216 | 2328 | ||
2217 | if (ext4_should_order_data(mpd->inode)) { | 2329 | if (ext4_should_order_data(mpd->inode)) { |
2218 | err = ext4_jbd2_file_inode(handle, mpd->inode); | 2330 | err = ext4_jbd2_file_inode(handle, mpd->inode); |
2219 | if (err) | 2331 | if (err) |
2220 | return err; | 2332 | /* This only happens if the journal is aborted */ |
2333 | return; | ||
2221 | } | 2334 | } |
2222 | 2335 | ||
2223 | /* | 2336 | /* |
@@ -2228,10 +2341,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
2228 | disksize = i_size_read(mpd->inode); | 2341 | disksize = i_size_read(mpd->inode); |
2229 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { | 2342 | if (disksize > EXT4_I(mpd->inode)->i_disksize) { |
2230 | ext4_update_i_disksize(mpd->inode, disksize); | 2343 | ext4_update_i_disksize(mpd->inode, disksize); |
2231 | return ext4_mark_inode_dirty(handle, mpd->inode); | 2344 | err = ext4_mark_inode_dirty(handle, mpd->inode); |
2345 | if (err) | ||
2346 | ext4_error(mpd->inode->i_sb, | ||
2347 | "Failed to mark inode %lu dirty", | ||
2348 | mpd->inode->i_ino); | ||
2232 | } | 2349 | } |
2233 | 2350 | ||
2234 | return 0; | 2351 | submit_io: |
2352 | mpage_da_submit_io(mpd, mapp); | ||
2353 | mpd->io_done = 1; | ||
2235 | } | 2354 | } |
2236 | 2355 | ||
2237 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ | 2356 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ |
@@ -2253,8 +2372,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | |||
2253 | sector_t next; | 2372 | sector_t next; |
2254 | int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; | 2373 | int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; |
2255 | 2374 | ||
2375 | /* | ||
2376 | * XXX Don't go larger than mballoc is willing to allocate | ||
2377 | * This is a stopgap solution. We eventually need to fold | ||
2378 | * mpage_da_submit_io() into this function and then call | ||
2379 | * ext4_map_blocks() multiple times in a loop | ||
2380 | */ | ||
2381 | if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) | ||
2382 | goto flush_it; | ||
2383 | |||
2256 | /* check if the reserved journal credits might overflow */ | 2384 | /* check if the reserved journal credits might overflow */ |
2257 | if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { | 2385 | if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { |
2258 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | 2386 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { |
2259 | /* | 2387 | /* |
2260 | * With non-extent format we are limited by the journal | 2388 | * With non-extent format we are limited by the journal |
@@ -2299,21 +2427,13 @@ flush_it: | |||
2299 | * We couldn't merge the block to our extent, so we | 2427 | * We couldn't merge the block to our extent, so we |
2300 | * need to flush current extent and start new one | 2428 | * need to flush current extent and start new one |
2301 | */ | 2429 | */ |
2302 | if (mpage_da_map_blocks(mpd) == 0) | 2430 | mpage_da_map_and_submit(mpd); |
2303 | mpage_da_submit_io(mpd); | ||
2304 | mpd->io_done = 1; | ||
2305 | return; | 2431 | return; |
2306 | } | 2432 | } |
2307 | 2433 | ||
2308 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | 2434 | static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) |
2309 | { | 2435 | { |
2310 | /* | 2436 | return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); |
2311 | * unmapped buffer is possible for holes. | ||
2312 | * delay buffer is possible with delayed allocation. | ||
2313 | * We also need to consider unwritten buffer as unmapped. | ||
2314 | */ | ||
2315 | return (!buffer_mapped(bh) || buffer_delay(bh) || | ||
2316 | buffer_unwritten(bh)) && buffer_dirty(bh); | ||
2317 | } | 2437 | } |
2318 | 2438 | ||
2319 | /* | 2439 | /* |
@@ -2326,39 +2446,26 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | |||
2326 | * The function finds extents of pages and scan them for all blocks. | 2446 | * The function finds extents of pages and scan them for all blocks. |
2327 | */ | 2447 | */ |
2328 | static int __mpage_da_writepage(struct page *page, | 2448 | static int __mpage_da_writepage(struct page *page, |
2329 | struct writeback_control *wbc, void *data) | 2449 | struct writeback_control *wbc, |
2450 | struct mpage_da_data *mpd) | ||
2330 | { | 2451 | { |
2331 | struct mpage_da_data *mpd = data; | ||
2332 | struct inode *inode = mpd->inode; | 2452 | struct inode *inode = mpd->inode; |
2333 | struct buffer_head *bh, *head; | 2453 | struct buffer_head *bh, *head; |
2334 | sector_t logical; | 2454 | sector_t logical; |
2335 | 2455 | ||
2336 | if (mpd->io_done) { | ||
2337 | /* | ||
2338 | * Rest of the page in the page_vec | ||
2339 | * redirty then and skip then. We will | ||
2340 | * try to to write them again after | ||
2341 | * starting a new transaction | ||
2342 | */ | ||
2343 | redirty_page_for_writepage(wbc, page); | ||
2344 | unlock_page(page); | ||
2345 | return MPAGE_DA_EXTENT_TAIL; | ||
2346 | } | ||
2347 | /* | 2456 | /* |
2348 | * Can we merge this page to current extent? | 2457 | * Can we merge this page to current extent? |
2349 | */ | 2458 | */ |
2350 | if (mpd->next_page != page->index) { | 2459 | if (mpd->next_page != page->index) { |
2351 | /* | 2460 | /* |
2352 | * Nope, we can't. So, we map non-allocated blocks | 2461 | * Nope, we can't. So, we map non-allocated blocks |
2353 | * and start IO on them using writepage() | 2462 | * and start IO on them |
2354 | */ | 2463 | */ |
2355 | if (mpd->next_page != mpd->first_page) { | 2464 | if (mpd->next_page != mpd->first_page) { |
2356 | if (mpage_da_map_blocks(mpd) == 0) | 2465 | mpage_da_map_and_submit(mpd); |
2357 | mpage_da_submit_io(mpd); | ||
2358 | /* | 2466 | /* |
2359 | * skip rest of the page in the page_vec | 2467 | * skip rest of the page in the page_vec |
2360 | */ | 2468 | */ |
2361 | mpd->io_done = 1; | ||
2362 | redirty_page_for_writepage(wbc, page); | 2469 | redirty_page_for_writepage(wbc, page); |
2363 | unlock_page(page); | 2470 | unlock_page(page); |
2364 | return MPAGE_DA_EXTENT_TAIL; | 2471 | return MPAGE_DA_EXTENT_TAIL; |
@@ -2398,9 +2505,9 @@ static int __mpage_da_writepage(struct page *page, | |||
2398 | * We need to try to allocate | 2505 | * We need to try to allocate |
2399 | * unmapped blocks in the same page. | 2506 | * unmapped blocks in the same page. |
2400 | * Otherwise we won't make progress | 2507 | * Otherwise we won't make progress |
2401 | * with the page in ext4_da_writepage | 2508 | * with the page in ext4_writepage |
2402 | */ | 2509 | */ |
2403 | if (ext4_bh_unmapped_or_delay(NULL, bh)) { | 2510 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { |
2404 | mpage_add_bh_to_extent(mpd, logical, | 2511 | mpage_add_bh_to_extent(mpd, logical, |
2405 | bh->b_size, | 2512 | bh->b_size, |
2406 | bh->b_state); | 2513 | bh->b_state); |
@@ -2438,8 +2545,9 @@ static int __mpage_da_writepage(struct page *page, | |||
2438 | * initialized properly. | 2545 | * initialized properly. |
2439 | */ | 2546 | */ |
2440 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | 2547 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, |
2441 | struct buffer_head *bh_result, int create) | 2548 | struct buffer_head *bh, int create) |
2442 | { | 2549 | { |
2550 | struct ext4_map_blocks map; | ||
2443 | int ret = 0; | 2551 | int ret = 0; |
2444 | sector_t invalid_block = ~((sector_t) 0xffff); | 2552 | sector_t invalid_block = ~((sector_t) 0xffff); |
2445 | 2553 | ||
@@ -2447,165 +2555,228 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |||
2447 | invalid_block = ~0; | 2555 | invalid_block = ~0; |
2448 | 2556 | ||
2449 | BUG_ON(create == 0); | 2557 | BUG_ON(create == 0); |
2450 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | 2558 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); |
2559 | |||
2560 | map.m_lblk = iblock; | ||
2561 | map.m_len = 1; | ||
2451 | 2562 | ||
2452 | /* | 2563 | /* |
2453 | * first, we need to know whether the block is allocated already | 2564 | * first, we need to know whether the block is allocated already |
2454 | * preallocated blocks are unmapped but should treated | 2565 | * preallocated blocks are unmapped but should treated |
2455 | * the same as allocated blocks. | 2566 | * the same as allocated blocks. |
2456 | */ | 2567 | */ |
2457 | ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); | 2568 | ret = ext4_map_blocks(NULL, inode, &map, 0); |
2458 | if ((ret == 0) && !buffer_delay(bh_result)) { | 2569 | if (ret < 0) |
2459 | /* the block isn't (pre)allocated yet, let's reserve space */ | 2570 | return ret; |
2571 | if (ret == 0) { | ||
2572 | if (buffer_delay(bh)) | ||
2573 | return 0; /* Not sure this could or should happen */ | ||
2460 | /* | 2574 | /* |
2461 | * XXX: __block_prepare_write() unmaps passed block, | 2575 | * XXX: __block_write_begin() unmaps passed block, is it OK? |
2462 | * is it OK? | ||
2463 | */ | 2576 | */ |
2464 | ret = ext4_da_reserve_space(inode, 1); | 2577 | ret = ext4_da_reserve_space(inode, iblock); |
2465 | if (ret) | 2578 | if (ret) |
2466 | /* not enough space to reserve */ | 2579 | /* not enough space to reserve */ |
2467 | return ret; | 2580 | return ret; |
2468 | 2581 | ||
2469 | map_bh(bh_result, inode->i_sb, invalid_block); | 2582 | map_bh(bh, inode->i_sb, invalid_block); |
2470 | set_buffer_new(bh_result); | 2583 | set_buffer_new(bh); |
2471 | set_buffer_delay(bh_result); | 2584 | set_buffer_delay(bh); |
2472 | } else if (ret > 0) { | 2585 | return 0; |
2473 | bh_result->b_size = (ret << inode->i_blkbits); | ||
2474 | if (buffer_unwritten(bh_result)) { | ||
2475 | /* A delayed write to unwritten bh should | ||
2476 | * be marked new and mapped. Mapped ensures | ||
2477 | * that we don't do get_block multiple times | ||
2478 | * when we write to the same offset and new | ||
2479 | * ensures that we do proper zero out for | ||
2480 | * partial write. | ||
2481 | */ | ||
2482 | set_buffer_new(bh_result); | ||
2483 | set_buffer_mapped(bh_result); | ||
2484 | } | ||
2485 | ret = 0; | ||
2486 | } | 2586 | } |
2487 | 2587 | ||
2488 | return ret; | 2588 | map_bh(bh, inode->i_sb, map.m_pblk); |
2589 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | ||
2590 | |||
2591 | if (buffer_unwritten(bh)) { | ||
2592 | /* A delayed write to unwritten bh should be marked | ||
2593 | * new and mapped. Mapped ensures that we don't do | ||
2594 | * get_block multiple times when we write to the same | ||
2595 | * offset and new ensures that we do proper zero out | ||
2596 | * for partial write. | ||
2597 | */ | ||
2598 | set_buffer_new(bh); | ||
2599 | set_buffer_mapped(bh); | ||
2600 | } | ||
2601 | return 0; | ||
2489 | } | 2602 | } |
2490 | 2603 | ||
2491 | /* | 2604 | /* |
2492 | * This function is used as a standard get_block_t callback function | 2605 | * This function is used as a standard get_block_t callback function |
2493 | * when there is no desire to allocate any blocks. It is used as a | 2606 | * when there is no desire to allocate any blocks. It is used as a |
2494 | * callback function for block_prepare_write(), nobh_writepage(), and | 2607 | * callback function for block_write_begin() and block_write_full_page(). |
2495 | * block_write_full_page(). These functions should only try to map a | 2608 | * These functions should only try to map a single block at a time. |
2496 | * single block at a time. | ||
2497 | * | 2609 | * |
2498 | * Since this function doesn't do block allocations even if the caller | 2610 | * Since this function doesn't do block allocations even if the caller |
2499 | * requests it by passing in create=1, it is critically important that | 2611 | * requests it by passing in create=1, it is critically important that |
2500 | * any caller checks to make sure that any buffer heads are returned | 2612 | * any caller checks to make sure that any buffer heads are returned |
2501 | * by this function are either all already mapped or marked for | 2613 | * by this function are either all already mapped or marked for |
2502 | * delayed allocation before calling nobh_writepage() or | 2614 | * delayed allocation before calling block_write_full_page(). Otherwise, |
2503 | * block_write_full_page(). Otherwise, b_blocknr could be left | 2615 | * b_blocknr could be left uninitialized, and the page write functions will |
2504 | * uninitialized, and the page write functions will be taken by | 2616 | * be taken by surprise. |
2505 | * surprise. | ||
2506 | */ | 2617 | */ |
2507 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, | 2618 | static int noalloc_get_block_write(struct inode *inode, sector_t iblock, |
2508 | struct buffer_head *bh_result, int create) | 2619 | struct buffer_head *bh_result, int create) |
2509 | { | 2620 | { |
2621 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | ||
2622 | return _ext4_get_block(inode, iblock, bh_result, 0); | ||
2623 | } | ||
2624 | |||
2625 | static int bget_one(handle_t *handle, struct buffer_head *bh) | ||
2626 | { | ||
2627 | get_bh(bh); | ||
2628 | return 0; | ||
2629 | } | ||
2630 | |||
2631 | static int bput_one(handle_t *handle, struct buffer_head *bh) | ||
2632 | { | ||
2633 | put_bh(bh); | ||
2634 | return 0; | ||
2635 | } | ||
2636 | |||
2637 | static int __ext4_journalled_writepage(struct page *page, | ||
2638 | unsigned int len) | ||
2639 | { | ||
2640 | struct address_space *mapping = page->mapping; | ||
2641 | struct inode *inode = mapping->host; | ||
2642 | struct buffer_head *page_bufs; | ||
2643 | handle_t *handle = NULL; | ||
2510 | int ret = 0; | 2644 | int ret = 0; |
2511 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 2645 | int err; |
2512 | 2646 | ||
2513 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | 2647 | ClearPageChecked(page); |
2648 | page_bufs = page_buffers(page); | ||
2649 | BUG_ON(!page_bufs); | ||
2650 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); | ||
2651 | /* As soon as we unlock the page, it can go away, but we have | ||
2652 | * references to buffers so we are safe */ | ||
2653 | unlock_page(page); | ||
2514 | 2654 | ||
2515 | /* | 2655 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); |
2516 | * we don't want to do block allocation in writepage | 2656 | if (IS_ERR(handle)) { |
2517 | * so call get_block_wrap with create = 0 | 2657 | ret = PTR_ERR(handle); |
2518 | */ | 2658 | goto out; |
2519 | ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); | ||
2520 | BUG_ON(create && ret == 0); | ||
2521 | if (ret > 0) { | ||
2522 | bh_result->b_size = (ret << inode->i_blkbits); | ||
2523 | ret = 0; | ||
2524 | } | 2659 | } |
2660 | |||
2661 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, | ||
2662 | do_journal_get_write_access); | ||
2663 | |||
2664 | err = walk_page_buffers(handle, page_bufs, 0, len, NULL, | ||
2665 | write_end_fn); | ||
2666 | if (ret == 0) | ||
2667 | ret = err; | ||
2668 | err = ext4_journal_stop(handle); | ||
2669 | if (!ret) | ||
2670 | ret = err; | ||
2671 | |||
2672 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); | ||
2673 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | ||
2674 | out: | ||
2525 | return ret; | 2675 | return ret; |
2526 | } | 2676 | } |
2527 | 2677 | ||
2678 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); | ||
2679 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); | ||
2680 | |||
2528 | /* | 2681 | /* |
2682 | * Note that we don't need to start a transaction unless we're journaling data | ||
2683 | * because we should have holes filled from ext4_page_mkwrite(). We even don't | ||
2684 | * need to file the inode to the transaction's list in ordered mode because if | ||
2685 | * we are writing back data added by write(), the inode is already there and if | ||
2686 | * we are writing back data modified via mmap(), no one guarantees in which | ||
2687 | * transaction the data will hit the disk. In case we are journaling data, we | ||
2688 | * cannot start transaction directly because transaction start ranks above page | ||
2689 | * lock so we have to do some magic. | ||
2690 | * | ||
2529 | * This function can get called via... | 2691 | * This function can get called via... |
2530 | * - ext4_da_writepages after taking page lock (have journal handle) | 2692 | * - ext4_da_writepages after taking page lock (have journal handle) |
2531 | * - journal_submit_inode_data_buffers (no journal handle) | 2693 | * - journal_submit_inode_data_buffers (no journal handle) |
2532 | * - shrink_page_list via pdflush (no journal handle) | 2694 | * - shrink_page_list via pdflush (no journal handle) |
2533 | * - grab_page_cache when doing write_begin (have journal handle) | 2695 | * - grab_page_cache when doing write_begin (have journal handle) |
2696 | * | ||
2697 | * We don't do any block allocation in this function. If we have page with | ||
2698 | * multiple blocks we need to write those buffer_heads that are mapped. This | ||
2699 | * is important for mmaped based write. So if we do with blocksize 1K | ||
2700 | * truncate(f, 1024); | ||
2701 | * a = mmap(f, 0, 4096); | ||
2702 | * a[0] = 'a'; | ||
2703 | * truncate(f, 4096); | ||
2704 | * we have in the page first buffer_head mapped via page_mkwrite call back | ||
2705 | * but other buffer_heads would be unmapped but dirty(dirty done via the | ||
2706 | * do_wp_page). So writepage should write the first block. If we modify | ||
2707 | * the mmap area beyond 1024 we will again get a page_fault and the | ||
2708 | * page_mkwrite callback will do the block allocation and mark the | ||
2709 | * buffer_heads mapped. | ||
2710 | * | ||
2711 | * We redirty the page if we have any buffer_heads that is either delay or | ||
2712 | * unwritten in the page. | ||
2713 | * | ||
2714 | * We can get recursively called as shown below. | ||
2715 | * | ||
2716 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | ||
2717 | * ext4_writepage() | ||
2718 | * | ||
2719 | * But since we don't do any block allocation we should not deadlock. | ||
2720 | * Page also have the dirty flag cleared so we don't get recursive page_lock. | ||
2534 | */ | 2721 | */ |
2535 | static int ext4_da_writepage(struct page *page, | 2722 | static int ext4_writepage(struct page *page, |
2536 | struct writeback_control *wbc) | 2723 | struct writeback_control *wbc) |
2537 | { | 2724 | { |
2538 | int ret = 0; | 2725 | int ret = 0, commit_write = 0; |
2539 | loff_t size; | 2726 | loff_t size; |
2540 | unsigned int len; | 2727 | unsigned int len; |
2541 | struct buffer_head *page_bufs; | 2728 | struct buffer_head *page_bufs = NULL; |
2542 | struct inode *inode = page->mapping->host; | 2729 | struct inode *inode = page->mapping->host; |
2543 | 2730 | ||
2544 | trace_ext4_da_writepage(inode, page); | 2731 | trace_ext4_writepage(inode, page); |
2545 | size = i_size_read(inode); | 2732 | size = i_size_read(inode); |
2546 | if (page->index == size >> PAGE_CACHE_SHIFT) | 2733 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2547 | len = size & ~PAGE_CACHE_MASK; | 2734 | len = size & ~PAGE_CACHE_MASK; |
2548 | else | 2735 | else |
2549 | len = PAGE_CACHE_SIZE; | 2736 | len = PAGE_CACHE_SIZE; |
2550 | 2737 | ||
2551 | if (page_has_buffers(page)) { | 2738 | /* |
2552 | page_bufs = page_buffers(page); | 2739 | * If the page does not have buffers (for whatever reason), |
2553 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | 2740 | * try to create them using __block_write_begin. If this |
2554 | ext4_bh_unmapped_or_delay)) { | 2741 | * fails, redirty the page and move on. |
2555 | /* | 2742 | */ |
2556 | * We don't want to do block allocation | 2743 | if (!page_has_buffers(page)) { |
2557 | * So redirty the page and return | 2744 | if (__block_write_begin(page, 0, len, |
2558 | * We may reach here when we do a journal commit | 2745 | noalloc_get_block_write)) { |
2559 | * via journal_submit_inode_data_buffers. | 2746 | redirty_page: |
2560 | * If we don't have mapping block we just ignore | ||
2561 | * them. We can also reach here via shrink_page_list | ||
2562 | */ | ||
2563 | redirty_page_for_writepage(wbc, page); | 2747 | redirty_page_for_writepage(wbc, page); |
2564 | unlock_page(page); | 2748 | unlock_page(page); |
2565 | return 0; | 2749 | return 0; |
2566 | } | 2750 | } |
2567 | } else { | 2751 | commit_write = 1; |
2752 | } | ||
2753 | page_bufs = page_buffers(page); | ||
2754 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | ||
2755 | ext4_bh_delay_or_unwritten)) { | ||
2568 | /* | 2756 | /* |
2569 | * The test for page_has_buffers() is subtle: | 2757 | * We don't want to do block allocation, so redirty |
2570 | * We know the page is dirty but it lost buffers. That means | 2758 | * the page and return. We may reach here when we do |
2571 | * that at some moment in time after write_begin()/write_end() | 2759 | * a journal commit via journal_submit_inode_data_buffers. |
2572 | * has been called all buffers have been clean and thus they | 2760 | * We can also reach here via shrink_page_list |
2573 | * must have been written at least once. So they are all | ||
2574 | * mapped and we can happily proceed with mapping them | ||
2575 | * and writing the page. | ||
2576 | * | ||
2577 | * Try to initialize the buffer_heads and check whether | ||
2578 | * all are mapped and non delay. We don't want to | ||
2579 | * do block allocation here. | ||
2580 | */ | 2761 | */ |
2581 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | 2762 | goto redirty_page; |
2582 | noalloc_get_block_write); | ||
2583 | if (!ret) { | ||
2584 | page_bufs = page_buffers(page); | ||
2585 | /* check whether all are mapped and non delay */ | ||
2586 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | ||
2587 | ext4_bh_unmapped_or_delay)) { | ||
2588 | redirty_page_for_writepage(wbc, page); | ||
2589 | unlock_page(page); | ||
2590 | return 0; | ||
2591 | } | ||
2592 | } else { | ||
2593 | /* | ||
2594 | * We can't do block allocation here | ||
2595 | * so just redity the page and unlock | ||
2596 | * and return | ||
2597 | */ | ||
2598 | redirty_page_for_writepage(wbc, page); | ||
2599 | unlock_page(page); | ||
2600 | return 0; | ||
2601 | } | ||
2602 | /* now mark the buffer_heads as dirty and uptodate */ | ||
2603 | block_commit_write(page, 0, PAGE_CACHE_SIZE); | ||
2604 | } | 2763 | } |
2764 | if (commit_write) | ||
2765 | /* now mark the buffer_heads as dirty and uptodate */ | ||
2766 | block_commit_write(page, 0, len); | ||
2605 | 2767 | ||
2606 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | 2768 | if (PageChecked(page) && ext4_should_journal_data(inode)) |
2607 | ret = nobh_writepage(page, noalloc_get_block_write, wbc); | 2769 | /* |
2608 | else | 2770 | * It's mmapped pagecache. Add buffers and journal it. There |
2771 | * doesn't seem much point in redirtying the page here. | ||
2772 | */ | ||
2773 | return __ext4_journalled_writepage(page, len); | ||
2774 | |||
2775 | if (buffer_uninit(page_bufs)) { | ||
2776 | ext4_set_bh_endio(page_bufs, inode); | ||
2777 | ret = block_write_full_page_endio(page, noalloc_get_block_write, | ||
2778 | wbc, ext4_end_io_buffer_write); | ||
2779 | } else | ||
2609 | ret = block_write_full_page(page, noalloc_get_block_write, | 2780 | ret = block_write_full_page(page, noalloc_get_block_write, |
2610 | wbc); | 2781 | wbc); |
2611 | 2782 | ||
@@ -2630,13 +2801,140 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) | |||
2630 | * number of contiguous block. So we will limit | 2801 | * number of contiguous block. So we will limit |
2631 | * number of contiguous block to a sane value | 2802 | * number of contiguous block to a sane value |
2632 | */ | 2803 | */ |
2633 | if (!(inode->i_flags & EXT4_EXTENTS_FL) && | 2804 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && |
2634 | (max_blocks > EXT4_MAX_TRANS_DATA)) | 2805 | (max_blocks > EXT4_MAX_TRANS_DATA)) |
2635 | max_blocks = EXT4_MAX_TRANS_DATA; | 2806 | max_blocks = EXT4_MAX_TRANS_DATA; |
2636 | 2807 | ||
2637 | return ext4_chunk_trans_blocks(inode, max_blocks); | 2808 | return ext4_chunk_trans_blocks(inode, max_blocks); |
2638 | } | 2809 | } |
2639 | 2810 | ||
2811 | /* | ||
2812 | * write_cache_pages_da - walk the list of dirty pages of the given | ||
2813 | * address space and call the callback function (which usually writes | ||
2814 | * the pages). | ||
2815 | * | ||
2816 | * This is a forked version of write_cache_pages(). Differences: | ||
2817 | * Range cyclic is ignored. | ||
2818 | * no_nrwrite_index_update is always presumed true | ||
2819 | */ | ||
2820 | static int write_cache_pages_da(struct address_space *mapping, | ||
2821 | struct writeback_control *wbc, | ||
2822 | struct mpage_da_data *mpd, | ||
2823 | pgoff_t *done_index) | ||
2824 | { | ||
2825 | int ret = 0; | ||
2826 | int done = 0; | ||
2827 | struct pagevec pvec; | ||
2828 | unsigned nr_pages; | ||
2829 | pgoff_t index; | ||
2830 | pgoff_t end; /* Inclusive */ | ||
2831 | long nr_to_write = wbc->nr_to_write; | ||
2832 | int tag; | ||
2833 | |||
2834 | pagevec_init(&pvec, 0); | ||
2835 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
2836 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
2837 | |||
2838 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
2839 | tag = PAGECACHE_TAG_TOWRITE; | ||
2840 | else | ||
2841 | tag = PAGECACHE_TAG_DIRTY; | ||
2842 | |||
2843 | *done_index = index; | ||
2844 | while (!done && (index <= end)) { | ||
2845 | int i; | ||
2846 | |||
2847 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | ||
2848 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | ||
2849 | if (nr_pages == 0) | ||
2850 | break; | ||
2851 | |||
2852 | for (i = 0; i < nr_pages; i++) { | ||
2853 | struct page *page = pvec.pages[i]; | ||
2854 | |||
2855 | /* | ||
2856 | * At this point, the page may be truncated or | ||
2857 | * invalidated (changing page->mapping to NULL), or | ||
2858 | * even swizzled back from swapper_space to tmpfs file | ||
2859 | * mapping. However, page->index will not change | ||
2860 | * because we have a reference on the page. | ||
2861 | */ | ||
2862 | if (page->index > end) { | ||
2863 | done = 1; | ||
2864 | break; | ||
2865 | } | ||
2866 | |||
2867 | *done_index = page->index + 1; | ||
2868 | |||
2869 | lock_page(page); | ||
2870 | |||
2871 | /* | ||
2872 | * Page truncated or invalidated. We can freely skip it | ||
2873 | * then, even for data integrity operations: the page | ||
2874 | * has disappeared concurrently, so there could be no | ||
2875 | * real expectation of this data integrity operation | ||
2876 | * even if there is now a new, dirty page at the same | ||
2877 | * pagecache address. | ||
2878 | */ | ||
2879 | if (unlikely(page->mapping != mapping)) { | ||
2880 | continue_unlock: | ||
2881 | unlock_page(page); | ||
2882 | continue; | ||
2883 | } | ||
2884 | |||
2885 | if (!PageDirty(page)) { | ||
2886 | /* someone wrote it for us */ | ||
2887 | goto continue_unlock; | ||
2888 | } | ||
2889 | |||
2890 | if (PageWriteback(page)) { | ||
2891 | if (wbc->sync_mode != WB_SYNC_NONE) | ||
2892 | wait_on_page_writeback(page); | ||
2893 | else | ||
2894 | goto continue_unlock; | ||
2895 | } | ||
2896 | |||
2897 | BUG_ON(PageWriteback(page)); | ||
2898 | if (!clear_page_dirty_for_io(page)) | ||
2899 | goto continue_unlock; | ||
2900 | |||
2901 | ret = __mpage_da_writepage(page, wbc, mpd); | ||
2902 | if (unlikely(ret)) { | ||
2903 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | ||
2904 | unlock_page(page); | ||
2905 | ret = 0; | ||
2906 | } else { | ||
2907 | done = 1; | ||
2908 | break; | ||
2909 | } | ||
2910 | } | ||
2911 | |||
2912 | if (nr_to_write > 0) { | ||
2913 | nr_to_write--; | ||
2914 | if (nr_to_write == 0 && | ||
2915 | wbc->sync_mode == WB_SYNC_NONE) { | ||
2916 | /* | ||
2917 | * We stop writing back only if we are | ||
2918 | * not doing integrity sync. In case of | ||
2919 | * integrity sync we have to keep going | ||
2920 | * because someone may be concurrently | ||
2921 | * dirtying pages, and we might have | ||
2922 | * synced a lot of newly appeared dirty | ||
2923 | * pages, but have not synced all of the | ||
2924 | * old dirty pages. | ||
2925 | */ | ||
2926 | done = 1; | ||
2927 | break; | ||
2928 | } | ||
2929 | } | ||
2930 | } | ||
2931 | pagevec_release(&pvec); | ||
2932 | cond_resched(); | ||
2933 | } | ||
2934 | return ret; | ||
2935 | } | ||
2936 | |||
2937 | |||
2640 | static int ext4_da_writepages(struct address_space *mapping, | 2938 | static int ext4_da_writepages(struct address_space *mapping, |
2641 | struct writeback_control *wbc) | 2939 | struct writeback_control *wbc) |
2642 | { | 2940 | { |
@@ -2645,12 +2943,16 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2645 | handle_t *handle = NULL; | 2943 | handle_t *handle = NULL; |
2646 | struct mpage_da_data mpd; | 2944 | struct mpage_da_data mpd; |
2647 | struct inode *inode = mapping->host; | 2945 | struct inode *inode = mapping->host; |
2648 | int no_nrwrite_index_update; | ||
2649 | int pages_written = 0; | 2946 | int pages_written = 0; |
2650 | long pages_skipped; | 2947 | long pages_skipped; |
2948 | unsigned int max_pages; | ||
2651 | int range_cyclic, cycled = 1, io_done = 0; | 2949 | int range_cyclic, cycled = 1, io_done = 0; |
2652 | int needed_blocks, ret = 0, nr_to_writebump = 0; | 2950 | int needed_blocks, ret = 0; |
2951 | long desired_nr_to_write, nr_to_writebump = 0; | ||
2952 | loff_t range_start = wbc->range_start; | ||
2653 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2953 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2954 | pgoff_t done_index = 0; | ||
2955 | pgoff_t end; | ||
2654 | 2956 | ||
2655 | trace_ext4_da_writepages(inode, wbc); | 2957 | trace_ext4_da_writepages(inode, wbc); |
2656 | 2958 | ||
@@ -2675,16 +2977,6 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2675 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | 2977 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
2676 | return -EROFS; | 2978 | return -EROFS; |
2677 | 2979 | ||
2678 | /* | ||
2679 | * Make sure nr_to_write is >= sbi->s_mb_stream_request | ||
2680 | * This make sure small files blocks are allocated in | ||
2681 | * single attempt. This ensure that small files | ||
2682 | * get less fragmented. | ||
2683 | */ | ||
2684 | if (wbc->nr_to_write < sbi->s_mb_stream_request) { | ||
2685 | nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; | ||
2686 | wbc->nr_to_write = sbi->s_mb_stream_request; | ||
2687 | } | ||
2688 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2980 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2689 | range_whole = 1; | 2981 | range_whole = 1; |
2690 | 2982 | ||
@@ -2696,21 +2988,54 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2696 | wbc->range_start = index << PAGE_CACHE_SHIFT; | 2988 | wbc->range_start = index << PAGE_CACHE_SHIFT; |
2697 | wbc->range_end = LLONG_MAX; | 2989 | wbc->range_end = LLONG_MAX; |
2698 | wbc->range_cyclic = 0; | 2990 | wbc->range_cyclic = 0; |
2699 | } else | 2991 | end = -1; |
2992 | } else { | ||
2700 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2993 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2994 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
2995 | } | ||
2996 | |||
2997 | /* | ||
2998 | * This works around two forms of stupidity. The first is in | ||
2999 | * the writeback code, which caps the maximum number of pages | ||
3000 | * written to be 1024 pages. This is wrong on multiple | ||
3001 | * levels; different architectures have a different page size, | ||
3002 | * which changes the maximum amount of data which gets | ||
3003 | * written. Secondly, 4 megabytes is way too small. XFS | ||
3004 | * forces this value to be 16 megabytes by multiplying | ||
3005 | * nr_to_write parameter by four, and then relies on its | ||
3006 | * allocator to allocate larger extents to make them | ||
3007 | * contiguous. Unfortunately this brings us to the second | ||
3008 | * stupidity, which is that ext4's mballoc code only allocates | ||
3009 | * at most 2048 blocks. So we force contiguous writes up to | ||
3010 | * the number of dirty blocks in the inode, or | ||
3011 | * sbi->s_max_writeback_mb_bump, whichever is smaller. | ||
3012 | */ | ||
3013 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | ||
3014 | if (!range_cyclic && range_whole) { | ||
3015 | if (wbc->nr_to_write == LONG_MAX) | ||
3016 | desired_nr_to_write = wbc->nr_to_write; | ||
3017 | else | ||
3018 | desired_nr_to_write = wbc->nr_to_write * 8; | ||
3019 | } else | ||
3020 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, | ||
3021 | max_pages); | ||
3022 | if (desired_nr_to_write > max_pages) | ||
3023 | desired_nr_to_write = max_pages; | ||
3024 | |||
3025 | if (wbc->nr_to_write < desired_nr_to_write) { | ||
3026 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | ||
3027 | wbc->nr_to_write = desired_nr_to_write; | ||
3028 | } | ||
2701 | 3029 | ||
2702 | mpd.wbc = wbc; | 3030 | mpd.wbc = wbc; |
2703 | mpd.inode = mapping->host; | 3031 | mpd.inode = mapping->host; |
2704 | 3032 | ||
2705 | /* | ||
2706 | * we don't want write_cache_pages to update | ||
2707 | * nr_to_write and writeback_index | ||
2708 | */ | ||
2709 | no_nrwrite_index_update = wbc->no_nrwrite_index_update; | ||
2710 | wbc->no_nrwrite_index_update = 1; | ||
2711 | pages_skipped = wbc->pages_skipped; | 3033 | pages_skipped = wbc->pages_skipped; |
2712 | 3034 | ||
2713 | retry: | 3035 | retry: |
3036 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
3037 | tag_pages_for_writeback(mapping, index, end); | ||
3038 | |||
2714 | while (!ret && wbc->nr_to_write > 0) { | 3039 | while (!ret && wbc->nr_to_write > 0) { |
2715 | 3040 | ||
2716 | /* | 3041 | /* |
@@ -2726,10 +3051,9 @@ retry: | |||
2726 | handle = ext4_journal_start(inode, needed_blocks); | 3051 | handle = ext4_journal_start(inode, needed_blocks); |
2727 | if (IS_ERR(handle)) { | 3052 | if (IS_ERR(handle)) { |
2728 | ret = PTR_ERR(handle); | 3053 | ret = PTR_ERR(handle); |
2729 | printk(KERN_CRIT "%s: jbd2_start: " | 3054 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2730 | "%ld pages, ino %lu; err %d\n", __func__, | 3055 | "%ld pages, ino %lu; err %d", __func__, |
2731 | wbc->nr_to_write, inode->i_ino, ret); | 3056 | wbc->nr_to_write, inode->i_ino, ret); |
2732 | dump_stack(); | ||
2733 | goto out_writepages; | 3057 | goto out_writepages; |
2734 | } | 3058 | } |
2735 | 3059 | ||
@@ -2750,19 +3074,17 @@ retry: | |||
2750 | mpd.io_done = 0; | 3074 | mpd.io_done = 0; |
2751 | mpd.pages_written = 0; | 3075 | mpd.pages_written = 0; |
2752 | mpd.retval = 0; | 3076 | mpd.retval = 0; |
2753 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, | 3077 | ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); |
2754 | &mpd); | ||
2755 | /* | 3078 | /* |
2756 | * If we have a contigous extent of pages and we | 3079 | * If we have a contiguous extent of pages and we |
2757 | * haven't done the I/O yet, map the blocks and submit | 3080 | * haven't done the I/O yet, map the blocks and submit |
2758 | * them for I/O. | 3081 | * them for I/O. |
2759 | */ | 3082 | */ |
2760 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { | 3083 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { |
2761 | if (mpage_da_map_blocks(&mpd) == 0) | 3084 | mpage_da_map_and_submit(&mpd); |
2762 | mpage_da_submit_io(&mpd); | ||
2763 | mpd.io_done = 1; | ||
2764 | ret = MPAGE_DA_EXTENT_TAIL; | 3085 | ret = MPAGE_DA_EXTENT_TAIL; |
2765 | } | 3086 | } |
3087 | trace_ext4_da_write_pages(inode, &mpd); | ||
2766 | wbc->nr_to_write -= mpd.pages_written; | 3088 | wbc->nr_to_write -= mpd.pages_written; |
2767 | 3089 | ||
2768 | ext4_journal_stop(handle); | 3090 | ext4_journal_stop(handle); |
@@ -2800,24 +3122,23 @@ retry: | |||
2800 | goto retry; | 3122 | goto retry; |
2801 | } | 3123 | } |
2802 | if (pages_skipped != wbc->pages_skipped) | 3124 | if (pages_skipped != wbc->pages_skipped) |
2803 | printk(KERN_EMERG "This should not happen leaving %s " | 3125 | ext4_msg(inode->i_sb, KERN_CRIT, |
2804 | "with nr_to_write = %ld ret = %d\n", | 3126 | "This should not happen leaving %s " |
2805 | __func__, wbc->nr_to_write, ret); | 3127 | "with nr_to_write = %ld ret = %d", |
3128 | __func__, wbc->nr_to_write, ret); | ||
2806 | 3129 | ||
2807 | /* Update index */ | 3130 | /* Update index */ |
2808 | index += pages_written; | ||
2809 | wbc->range_cyclic = range_cyclic; | 3131 | wbc->range_cyclic = range_cyclic; |
2810 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 3132 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
2811 | /* | 3133 | /* |
2812 | * set the writeback_index so that range_cyclic | 3134 | * set the writeback_index so that range_cyclic |
2813 | * mode will write it back later | 3135 | * mode will write it back later |
2814 | */ | 3136 | */ |
2815 | mapping->writeback_index = index; | 3137 | mapping->writeback_index = done_index; |
2816 | 3138 | ||
2817 | out_writepages: | 3139 | out_writepages: |
2818 | if (!no_nrwrite_index_update) | ||
2819 | wbc->no_nrwrite_index_update = 0; | ||
2820 | wbc->nr_to_write -= nr_to_writebump; | 3140 | wbc->nr_to_write -= nr_to_writebump; |
3141 | wbc->range_start = range_start; | ||
2821 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | 3142 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); |
2822 | return ret; | 3143 | return ret; |
2823 | } | 3144 | } |
@@ -2841,11 +3162,18 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2841 | if (2 * free_blocks < 3 * dirty_blocks || | 3162 | if (2 * free_blocks < 3 * dirty_blocks || |
2842 | free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { | 3163 | free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { |
2843 | /* | 3164 | /* |
2844 | * free block count is less that 150% of dirty blocks | 3165 | * free block count is less than 150% of dirty blocks |
2845 | * or free blocks is less that watermark | 3166 | * or free blocks is less than watermark |
2846 | */ | 3167 | */ |
2847 | return 1; | 3168 | return 1; |
2848 | } | 3169 | } |
3170 | /* | ||
3171 | * Even if we don't switch but are nearing capacity, | ||
3172 | * start pushing delalloc when 1/2 of free blocks are dirty. | ||
3173 | */ | ||
3174 | if (free_blocks < 2 * dirty_blocks) | ||
3175 | writeback_inodes_sb_if_idle(sb); | ||
3176 | |||
2849 | return 0; | 3177 | return 0; |
2850 | } | 3178 | } |
2851 | 3179 | ||
@@ -2856,13 +3184,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |||
2856 | int ret, retries = 0; | 3184 | int ret, retries = 0; |
2857 | struct page *page; | 3185 | struct page *page; |
2858 | pgoff_t index; | 3186 | pgoff_t index; |
2859 | unsigned from, to; | ||
2860 | struct inode *inode = mapping->host; | 3187 | struct inode *inode = mapping->host; |
2861 | handle_t *handle; | 3188 | handle_t *handle; |
2862 | 3189 | ||
2863 | index = pos >> PAGE_CACHE_SHIFT; | 3190 | index = pos >> PAGE_CACHE_SHIFT; |
2864 | from = pos & (PAGE_CACHE_SIZE - 1); | ||
2865 | to = from + len; | ||
2866 | 3191 | ||
2867 | if (ext4_nonda_switch(inode->i_sb)) { | 3192 | if (ext4_nonda_switch(inode->i_sb)) { |
2868 | *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; | 3193 | *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; |
@@ -2895,8 +3220,7 @@ retry: | |||
2895 | } | 3220 | } |
2896 | *pagep = page; | 3221 | *pagep = page; |
2897 | 3222 | ||
2898 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 3223 | ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); |
2899 | ext4_da_get_block_prep); | ||
2900 | if (ret < 0) { | 3224 | if (ret < 0) { |
2901 | unlock_page(page); | 3225 | unlock_page(page); |
2902 | ext4_journal_stop(handle); | 3226 | ext4_journal_stop(handle); |
@@ -2907,7 +3231,7 @@ retry: | |||
2907 | * i_size_read because we hold i_mutex. | 3231 | * i_size_read because we hold i_mutex. |
2908 | */ | 3232 | */ |
2909 | if (pos + len > inode->i_size) | 3233 | if (pos + len > inode->i_size) |
2910 | vmtruncate(inode, inode->i_size); | 3234 | ext4_truncate_failed_write(inode); |
2911 | } | 3235 | } |
2912 | 3236 | ||
2913 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 3237 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
@@ -3030,6 +3354,8 @@ out: | |||
3030 | */ | 3354 | */ |
3031 | int ext4_alloc_da_blocks(struct inode *inode) | 3355 | int ext4_alloc_da_blocks(struct inode *inode) |
3032 | { | 3356 | { |
3357 | trace_ext4_alloc_da_blocks(inode); | ||
3358 | |||
3033 | if (!EXT4_I(inode)->i_reserved_data_blocks && | 3359 | if (!EXT4_I(inode)->i_reserved_data_blocks && |
3034 | !EXT4_I(inode)->i_reserved_meta_blocks) | 3360 | !EXT4_I(inode)->i_reserved_meta_blocks) |
3035 | return 0; | 3361 | return 0; |
@@ -3098,7 +3424,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
3098 | filemap_write_and_wait(mapping); | 3424 | filemap_write_and_wait(mapping); |
3099 | } | 3425 | } |
3100 | 3426 | ||
3101 | if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { | 3427 | if (EXT4_JOURNAL(inode) && |
3428 | ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { | ||
3102 | /* | 3429 | /* |
3103 | * This is a REALLY heavyweight approach, but the use of | 3430 | * This is a REALLY heavyweight approach, but the use of |
3104 | * bmap on dirty files is expected to be extremely rare: | 3431 | * bmap on dirty files is expected to be extremely rare: |
@@ -3117,7 +3444,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
3117 | * everything they get. | 3444 | * everything they get. |
3118 | */ | 3445 | */ |
3119 | 3446 | ||
3120 | EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; | 3447 | ext4_clear_inode_state(inode, EXT4_STATE_JDATA); |
3121 | journal = EXT4_JOURNAL(inode); | 3448 | journal = EXT4_JOURNAL(inode); |
3122 | jbd2_journal_lock_updates(journal); | 3449 | jbd2_journal_lock_updates(journal); |
3123 | err = jbd2_journal_flush(journal); | 3450 | err = jbd2_journal_flush(journal); |
@@ -3130,222 +3457,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
3130 | return generic_block_bmap(mapping, block, ext4_get_block); | 3457 | return generic_block_bmap(mapping, block, ext4_get_block); |
3131 | } | 3458 | } |
3132 | 3459 | ||
3133 | static int bget_one(handle_t *handle, struct buffer_head *bh) | ||
3134 | { | ||
3135 | get_bh(bh); | ||
3136 | return 0; | ||
3137 | } | ||
3138 | |||
3139 | static int bput_one(handle_t *handle, struct buffer_head *bh) | ||
3140 | { | ||
3141 | put_bh(bh); | ||
3142 | return 0; | ||
3143 | } | ||
3144 | |||
3145 | /* | ||
3146 | * Note that we don't need to start a transaction unless we're journaling data | ||
3147 | * because we should have holes filled from ext4_page_mkwrite(). We even don't | ||
3148 | * need to file the inode to the transaction's list in ordered mode because if | ||
3149 | * we are writing back data added by write(), the inode is already there and if | ||
3150 | * we are writing back data modified via mmap(), noone guarantees in which | ||
3151 | * transaction the data will hit the disk. In case we are journaling data, we | ||
3152 | * cannot start transaction directly because transaction start ranks above page | ||
3153 | * lock so we have to do some magic. | ||
3154 | * | ||
3155 | * In all journaling modes block_write_full_page() will start the I/O. | ||
3156 | * | ||
3157 | * Problem: | ||
3158 | * | ||
3159 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | ||
3160 | * ext4_writepage() | ||
3161 | * | ||
3162 | * Similar for: | ||
3163 | * | ||
3164 | * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... | ||
3165 | * | ||
3166 | * Same applies to ext4_get_block(). We will deadlock on various things like | ||
3167 | * lock_journal and i_data_sem | ||
3168 | * | ||
3169 | * Setting PF_MEMALLOC here doesn't work - too many internal memory | ||
3170 | * allocations fail. | ||
3171 | * | ||
3172 | * 16May01: If we're reentered then journal_current_handle() will be | ||
3173 | * non-zero. We simply *return*. | ||
3174 | * | ||
3175 | * 1 July 2001: @@@ FIXME: | ||
3176 | * In journalled data mode, a data buffer may be metadata against the | ||
3177 | * current transaction. But the same file is part of a shared mapping | ||
3178 | * and someone does a writepage() on it. | ||
3179 | * | ||
3180 | * We will move the buffer onto the async_data list, but *after* it has | ||
3181 | * been dirtied. So there's a small window where we have dirty data on | ||
3182 | * BJ_Metadata. | ||
3183 | * | ||
3184 | * Note that this only applies to the last partial page in the file. The | ||
3185 | * bit which block_write_full_page() uses prepare/commit for. (That's | ||
3186 | * broken code anyway: it's wrong for msync()). | ||
3187 | * | ||
3188 | * It's a rare case: affects the final partial page, for journalled data | ||
3189 | * where the file is subject to bith write() and writepage() in the same | ||
3190 | * transction. To fix it we'll need a custom block_write_full_page(). | ||
3191 | * We'll probably need that anyway for journalling writepage() output. | ||
3192 | * | ||
3193 | * We don't honour synchronous mounts for writepage(). That would be | ||
3194 | * disastrous. Any write() or metadata operation will sync the fs for | ||
3195 | * us. | ||
3196 | * | ||
3197 | */ | ||
3198 | static int __ext4_normal_writepage(struct page *page, | ||
3199 | struct writeback_control *wbc) | ||
3200 | { | ||
3201 | struct inode *inode = page->mapping->host; | ||
3202 | |||
3203 | if (test_opt(inode->i_sb, NOBH)) | ||
3204 | return nobh_writepage(page, noalloc_get_block_write, wbc); | ||
3205 | else | ||
3206 | return block_write_full_page(page, noalloc_get_block_write, | ||
3207 | wbc); | ||
3208 | } | ||
3209 | |||
3210 | static int ext4_normal_writepage(struct page *page, | ||
3211 | struct writeback_control *wbc) | ||
3212 | { | ||
3213 | struct inode *inode = page->mapping->host; | ||
3214 | loff_t size = i_size_read(inode); | ||
3215 | loff_t len; | ||
3216 | |||
3217 | trace_ext4_normal_writepage(inode, page); | ||
3218 | J_ASSERT(PageLocked(page)); | ||
3219 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
3220 | len = size & ~PAGE_CACHE_MASK; | ||
3221 | else | ||
3222 | len = PAGE_CACHE_SIZE; | ||
3223 | |||
3224 | if (page_has_buffers(page)) { | ||
3225 | /* if page has buffers it should all be mapped | ||
3226 | * and allocated. If there are not buffers attached | ||
3227 | * to the page we know the page is dirty but it lost | ||
3228 | * buffers. That means that at some moment in time | ||
3229 | * after write_begin() / write_end() has been called | ||
3230 | * all buffers have been clean and thus they must have been | ||
3231 | * written at least once. So they are all mapped and we can | ||
3232 | * happily proceed with mapping them and writing the page. | ||
3233 | */ | ||
3234 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
3235 | ext4_bh_unmapped_or_delay)); | ||
3236 | } | ||
3237 | |||
3238 | if (!ext4_journal_current_handle()) | ||
3239 | return __ext4_normal_writepage(page, wbc); | ||
3240 | |||
3241 | redirty_page_for_writepage(wbc, page); | ||
3242 | unlock_page(page); | ||
3243 | return 0; | ||
3244 | } | ||
3245 | |||
3246 | static int __ext4_journalled_writepage(struct page *page, | ||
3247 | struct writeback_control *wbc) | ||
3248 | { | ||
3249 | struct address_space *mapping = page->mapping; | ||
3250 | struct inode *inode = mapping->host; | ||
3251 | struct buffer_head *page_bufs; | ||
3252 | handle_t *handle = NULL; | ||
3253 | int ret = 0; | ||
3254 | int err; | ||
3255 | |||
3256 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | ||
3257 | noalloc_get_block_write); | ||
3258 | if (ret != 0) | ||
3259 | goto out_unlock; | ||
3260 | |||
3261 | page_bufs = page_buffers(page); | ||
3262 | walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, | ||
3263 | bget_one); | ||
3264 | /* As soon as we unlock the page, it can go away, but we have | ||
3265 | * references to buffers so we are safe */ | ||
3266 | unlock_page(page); | ||
3267 | |||
3268 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | ||
3269 | if (IS_ERR(handle)) { | ||
3270 | ret = PTR_ERR(handle); | ||
3271 | goto out; | ||
3272 | } | ||
3273 | |||
3274 | ret = walk_page_buffers(handle, page_bufs, 0, | ||
3275 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | ||
3276 | |||
3277 | err = walk_page_buffers(handle, page_bufs, 0, | ||
3278 | PAGE_CACHE_SIZE, NULL, write_end_fn); | ||
3279 | if (ret == 0) | ||
3280 | ret = err; | ||
3281 | err = ext4_journal_stop(handle); | ||
3282 | if (!ret) | ||
3283 | ret = err; | ||
3284 | |||
3285 | walk_page_buffers(handle, page_bufs, 0, | ||
3286 | PAGE_CACHE_SIZE, NULL, bput_one); | ||
3287 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | ||
3288 | goto out; | ||
3289 | |||
3290 | out_unlock: | ||
3291 | unlock_page(page); | ||
3292 | out: | ||
3293 | return ret; | ||
3294 | } | ||
3295 | |||
3296 | static int ext4_journalled_writepage(struct page *page, | ||
3297 | struct writeback_control *wbc) | ||
3298 | { | ||
3299 | struct inode *inode = page->mapping->host; | ||
3300 | loff_t size = i_size_read(inode); | ||
3301 | loff_t len; | ||
3302 | |||
3303 | trace_ext4_journalled_writepage(inode, page); | ||
3304 | J_ASSERT(PageLocked(page)); | ||
3305 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
3306 | len = size & ~PAGE_CACHE_MASK; | ||
3307 | else | ||
3308 | len = PAGE_CACHE_SIZE; | ||
3309 | |||
3310 | if (page_has_buffers(page)) { | ||
3311 | /* if page has buffers it should all be mapped | ||
3312 | * and allocated. If there are not buffers attached | ||
3313 | * to the page we know the page is dirty but it lost | ||
3314 | * buffers. That means that at some moment in time | ||
3315 | * after write_begin() / write_end() has been called | ||
3316 | * all buffers have been clean and thus they must have been | ||
3317 | * written at least once. So they are all mapped and we can | ||
3318 | * happily proceed with mapping them and writing the page. | ||
3319 | */ | ||
3320 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
3321 | ext4_bh_unmapped_or_delay)); | ||
3322 | } | ||
3323 | |||
3324 | if (ext4_journal_current_handle()) | ||
3325 | goto no_write; | ||
3326 | |||
3327 | if (PageChecked(page)) { | ||
3328 | /* | ||
3329 | * It's mmapped pagecache. Add buffers and journal it. There | ||
3330 | * doesn't seem much point in redirtying the page here. | ||
3331 | */ | ||
3332 | ClearPageChecked(page); | ||
3333 | return __ext4_journalled_writepage(page, wbc); | ||
3334 | } else { | ||
3335 | /* | ||
3336 | * It may be a page full of checkpoint-mode buffers. We don't | ||
3337 | * really know unless we go poke around in the buffer_heads. | ||
3338 | * But block_write_full_page will do the right thing. | ||
3339 | */ | ||
3340 | return block_write_full_page(page, noalloc_get_block_write, | ||
3341 | wbc); | ||
3342 | } | ||
3343 | no_write: | ||
3344 | redirty_page_for_writepage(wbc, page); | ||
3345 | unlock_page(page); | ||
3346 | return 0; | ||
3347 | } | ||
3348 | |||
3349 | static int ext4_readpage(struct file *file, struct page *page) | 3460 | static int ext4_readpage(struct file *file, struct page *page) |
3350 | { | 3461 | { |
3351 | return mpage_readpage(page, ext4_get_block); | 3462 | return mpage_readpage(page, ext4_get_block); |
@@ -3358,11 +3469,36 @@ ext4_readpages(struct file *file, struct address_space *mapping, | |||
3358 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); | 3469 | return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); |
3359 | } | 3470 | } |
3360 | 3471 | ||
3472 | static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) | ||
3473 | { | ||
3474 | struct buffer_head *head, *bh; | ||
3475 | unsigned int curr_off = 0; | ||
3476 | |||
3477 | if (!page_has_buffers(page)) | ||
3478 | return; | ||
3479 | head = bh = page_buffers(page); | ||
3480 | do { | ||
3481 | if (offset <= curr_off && test_clear_buffer_uninit(bh) | ||
3482 | && bh->b_private) { | ||
3483 | ext4_free_io_end(bh->b_private); | ||
3484 | bh->b_private = NULL; | ||
3485 | bh->b_end_io = NULL; | ||
3486 | } | ||
3487 | curr_off = curr_off + bh->b_size; | ||
3488 | bh = bh->b_this_page; | ||
3489 | } while (bh != head); | ||
3490 | } | ||
3491 | |||
3361 | static void ext4_invalidatepage(struct page *page, unsigned long offset) | 3492 | static void ext4_invalidatepage(struct page *page, unsigned long offset) |
3362 | { | 3493 | { |
3363 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 3494 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3364 | 3495 | ||
3365 | /* | 3496 | /* |
3497 | * free any io_end structure allocated for buffers to be discarded | ||
3498 | */ | ||
3499 | if (ext4_should_dioread_nolock(page->mapping->host)) | ||
3500 | ext4_invalidatepage_free_endio(page, offset); | ||
3501 | /* | ||
3366 | * If it's a full truncate we just forget about the pending dirtying | 3502 | * If it's a full truncate we just forget about the pending dirtying |
3367 | */ | 3503 | */ |
3368 | if (offset == 0) | 3504 | if (offset == 0) |
@@ -3388,6 +3524,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3388 | } | 3524 | } |
3389 | 3525 | ||
3390 | /* | 3526 | /* |
3527 | * O_DIRECT for ext3 (or indirect map) based files | ||
3528 | * | ||
3391 | * If the O_DIRECT write will extend the file then add this inode to the | 3529 | * If the O_DIRECT write will extend the file then add this inode to the |
3392 | * orphan list. So recovery will truncate it back to the original size | 3530 | * orphan list. So recovery will truncate it back to the original size |
3393 | * if the machine crashes during the write. | 3531 | * if the machine crashes during the write. |
@@ -3396,7 +3534,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3396 | * crashes then stale disk data _may_ be exposed inside the file. But current | 3534 | * crashes then stale disk data _may_ be exposed inside the file. But current |
3397 | * VFS code falls back into buffered path in that case so we are safe. | 3535 | * VFS code falls back into buffered path in that case so we are safe. |
3398 | */ | 3536 | */ |
3399 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | 3537 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, |
3400 | const struct iovec *iov, loff_t offset, | 3538 | const struct iovec *iov, loff_t offset, |
3401 | unsigned long nr_segs) | 3539 | unsigned long nr_segs) |
3402 | { | 3540 | { |
@@ -3407,6 +3545,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | |||
3407 | ssize_t ret; | 3545 | ssize_t ret; |
3408 | int orphan = 0; | 3546 | int orphan = 0; |
3409 | size_t count = iov_length(iov, nr_segs); | 3547 | size_t count = iov_length(iov, nr_segs); |
3548 | int retries = 0; | ||
3410 | 3549 | ||
3411 | if (rw == WRITE) { | 3550 | if (rw == WRITE) { |
3412 | loff_t final_size = offset + count; | 3551 | loff_t final_size = offset + count; |
@@ -3429,10 +3568,29 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | |||
3429 | } | 3568 | } |
3430 | } | 3569 | } |
3431 | 3570 | ||
3432 | ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, | 3571 | retry: |
3572 | if (rw == READ && ext4_should_dioread_nolock(inode)) | ||
3573 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
3574 | inode->i_sb->s_bdev, iov, | ||
3575 | offset, nr_segs, | ||
3576 | ext4_get_block, NULL, NULL, 0); | ||
3577 | else { | ||
3578 | ret = blockdev_direct_IO(rw, iocb, inode, | ||
3579 | inode->i_sb->s_bdev, iov, | ||
3433 | offset, nr_segs, | 3580 | offset, nr_segs, |
3434 | ext4_get_block, NULL); | 3581 | ext4_get_block, NULL); |
3435 | 3582 | ||
3583 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
3584 | loff_t isize = i_size_read(inode); | ||
3585 | loff_t end = offset + iov_length(iov, nr_segs); | ||
3586 | |||
3587 | if (end > isize) | ||
3588 | vmtruncate(inode, isize); | ||
3589 | } | ||
3590 | } | ||
3591 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
3592 | goto retry; | ||
3593 | |||
3436 | if (orphan) { | 3594 | if (orphan) { |
3437 | int err; | 3595 | int err; |
3438 | 3596 | ||
@@ -3443,6 +3601,9 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | |||
3443 | * but cannot extend i_size. Bail out and pretend | 3601 | * but cannot extend i_size. Bail out and pretend |
3444 | * the write failed... */ | 3602 | * the write failed... */ |
3445 | ret = PTR_ERR(handle); | 3603 | ret = PTR_ERR(handle); |
3604 | if (inode->i_nlink) | ||
3605 | ext4_orphan_del(NULL, inode); | ||
3606 | |||
3446 | goto out; | 3607 | goto out; |
3447 | } | 3608 | } |
3448 | if (inode->i_nlink) | 3609 | if (inode->i_nlink) |
@@ -3471,6 +3632,254 @@ out: | |||
3471 | } | 3632 | } |
3472 | 3633 | ||
3473 | /* | 3634 | /* |
3635 | * ext4_get_block used when preparing for a DIO write or buffer write. | ||
3636 | * We allocate an uninitialized extent if blocks haven't been allocated. | ||
3637 | * The extent will be converted to initialized after the IO is complete. | ||
3638 | */ | ||
3639 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, | ||
3640 | struct buffer_head *bh_result, int create) | ||
3641 | { | ||
3642 | ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", | ||
3643 | inode->i_ino, create); | ||
3644 | return _ext4_get_block(inode, iblock, bh_result, | ||
3645 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||
3646 | } | ||
3647 | |||
3648 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | ||
3649 | ssize_t size, void *private, int ret, | ||
3650 | bool is_async) | ||
3651 | { | ||
3652 | ext4_io_end_t *io_end = iocb->private; | ||
3653 | struct workqueue_struct *wq; | ||
3654 | unsigned long flags; | ||
3655 | struct ext4_inode_info *ei; | ||
3656 | |||
3657 | /* if not async direct IO or dio with 0 bytes write, just return */ | ||
3658 | if (!io_end || !size) | ||
3659 | goto out; | ||
3660 | |||
3661 | ext_debug("ext4_end_io_dio(): io_end 0x%p" | ||
3662 | "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", | ||
3663 | iocb->private, io_end->inode->i_ino, iocb, offset, | ||
3664 | size); | ||
3665 | |||
3666 | /* if not aio dio with unwritten extents, just free io and return */ | ||
3667 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
3668 | ext4_free_io_end(io_end); | ||
3669 | iocb->private = NULL; | ||
3670 | out: | ||
3671 | if (is_async) | ||
3672 | aio_complete(iocb, ret, 0); | ||
3673 | return; | ||
3674 | } | ||
3675 | |||
3676 | io_end->offset = offset; | ||
3677 | io_end->size = size; | ||
3678 | if (is_async) { | ||
3679 | io_end->iocb = iocb; | ||
3680 | io_end->result = ret; | ||
3681 | } | ||
3682 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
3683 | |||
3684 | /* Add the io_end to per-inode completed aio dio list*/ | ||
3685 | ei = EXT4_I(io_end->inode); | ||
3686 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||
3687 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | ||
3688 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
3689 | |||
3690 | /* queue the work to convert unwritten extents to written */ | ||
3691 | queue_work(wq, &io_end->work); | ||
3692 | iocb->private = NULL; | ||
3693 | } | ||
3694 | |||
3695 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | ||
3696 | { | ||
3697 | ext4_io_end_t *io_end = bh->b_private; | ||
3698 | struct workqueue_struct *wq; | ||
3699 | struct inode *inode; | ||
3700 | unsigned long flags; | ||
3701 | |||
3702 | if (!test_clear_buffer_uninit(bh) || !io_end) | ||
3703 | goto out; | ||
3704 | |||
3705 | if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { | ||
3706 | printk("sb umounted, discard end_io request for inode %lu\n", | ||
3707 | io_end->inode->i_ino); | ||
3708 | ext4_free_io_end(io_end); | ||
3709 | goto out; | ||
3710 | } | ||
3711 | |||
3712 | io_end->flag = EXT4_IO_END_UNWRITTEN; | ||
3713 | inode = io_end->inode; | ||
3714 | |||
3715 | /* Add the io_end to per-inode completed io list*/ | ||
3716 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
3717 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); | ||
3718 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
3719 | |||
3720 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; | ||
3721 | /* queue the work to convert unwritten extents to written */ | ||
3722 | queue_work(wq, &io_end->work); | ||
3723 | out: | ||
3724 | bh->b_private = NULL; | ||
3725 | bh->b_end_io = NULL; | ||
3726 | clear_buffer_uninit(bh); | ||
3727 | end_buffer_async_write(bh, uptodate); | ||
3728 | } | ||
3729 | |||
3730 | static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) | ||
3731 | { | ||
3732 | ext4_io_end_t *io_end; | ||
3733 | struct page *page = bh->b_page; | ||
3734 | loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; | ||
3735 | size_t size = bh->b_size; | ||
3736 | |||
3737 | retry: | ||
3738 | io_end = ext4_init_io_end(inode, GFP_ATOMIC); | ||
3739 | if (!io_end) { | ||
3740 | pr_warning_ratelimited("%s: allocation fail\n", __func__); | ||
3741 | schedule(); | ||
3742 | goto retry; | ||
3743 | } | ||
3744 | io_end->offset = offset; | ||
3745 | io_end->size = size; | ||
3746 | /* | ||
3747 | * We need to hold a reference to the page to make sure it | ||
3748 | * doesn't get evicted before ext4_end_io_work() has a chance | ||
3750 | * to convert the extent from unwritten to written. | ||
3750 | */ | ||
3751 | io_end->page = page; | ||
3752 | get_page(io_end->page); | ||
3753 | |||
3754 | bh->b_private = io_end; | ||
3755 | bh->b_end_io = ext4_end_io_buffer_write; | ||
3756 | return 0; | ||
3757 | } | ||
3758 | |||
3759 | /* | ||
3760 | * For ext4 extent files, ext4 will do direct-io writes to holes, | ||
3761 | * preallocated extents, and writes that extend the file, with no need to | ||
3762 | * fall back to buffered IO. | ||
3763 | * | ||
3764 | * For holes, we fallocate those blocks and mark them as uninitialized. | ||
3765 | * If those blocks were preallocated, we make sure they are split, but | ||
3766 | * still keep the range to write as uninitialized. | ||
3767 | * | ||
3768 | * The unwritten extents will be converted to written when DIO is completed. | ||
3769 | * For async direct IO, since the IO may still be pending on return, we | ||
3770 | * set up an end_io callback function, which will do the conversion | ||
3771 | * when the async direct IO has completed. | ||
3772 | * | ||
3773 | * If the O_DIRECT write will extend the file then add this inode to the | ||
3774 | * orphan list. So recovery will truncate it back to the original size | ||
3775 | * if the machine crashes during the write. | ||
3776 | * | ||
3777 | */ | ||
3778 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | ||
3779 | const struct iovec *iov, loff_t offset, | ||
3780 | unsigned long nr_segs) | ||
3781 | { | ||
3782 | struct file *file = iocb->ki_filp; | ||
3783 | struct inode *inode = file->f_mapping->host; | ||
3784 | ssize_t ret; | ||
3785 | size_t count = iov_length(iov, nr_segs); | ||
3786 | |||
3787 | loff_t final_size = offset + count; | ||
3788 | if (rw == WRITE && final_size <= inode->i_size) { | ||
3789 | /* | ||
3790 | * We could direct write to holes and fallocate. | ||
3791 | * | ||
3792 | * Allocated blocks to fill the hole are marked as uninitialized | ||
3793 | * to prevent parallel buffered reads from exposing stale data | ||
3794 | * before DIO completes the data IO. | ||
3795 | * | ||
3796 | * As to previously fallocated extents, ext4 get_block | ||
3797 | * will just simply mark the buffer mapped but still | ||
3798 | * keep the extents uninitialized. | ||
3799 | * | ||
3800 | * for non AIO case, we will convert those unwritten extents | ||
3801 | * to written after return back from blockdev_direct_IO. | ||
3802 | * | ||
3803 | * for async DIO, the conversion needs to be deferred until | ||
3804 | * the IO is completed. The ext4 end_io callback function | ||
3805 | * will be called to take care of the conversion work. | ||
3806 | * Here for async case, we allocate an io_end structure to | ||
3807 | * hook to the iocb. | ||
3808 | */ | ||
3809 | iocb->private = NULL; | ||
3810 | EXT4_I(inode)->cur_aio_dio = NULL; | ||
3811 | if (!is_sync_kiocb(iocb)) { | ||
3812 | iocb->private = ext4_init_io_end(inode, GFP_NOFS); | ||
3813 | if (!iocb->private) | ||
3814 | return -ENOMEM; | ||
3815 | /* | ||
3816 | * we save the io structure for current async | ||
3817 | * direct IO, so that later ext4_map_blocks() | ||
3818 | * could flag the io structure whether there | ||
3819 | * are unwritten extents that need to be converted | ||
3820 | * when IO is completed. | ||
3821 | */ | ||
3822 | EXT4_I(inode)->cur_aio_dio = iocb->private; | ||
3823 | } | ||
3824 | |||
3825 | ret = blockdev_direct_IO(rw, iocb, inode, | ||
3826 | inode->i_sb->s_bdev, iov, | ||
3827 | offset, nr_segs, | ||
3828 | ext4_get_block_write, | ||
3829 | ext4_end_io_dio); | ||
3830 | if (iocb->private) | ||
3831 | EXT4_I(inode)->cur_aio_dio = NULL; | ||
3832 | /* | ||
3833 | * The io_end structure takes a reference to the inode, | ||
3834 | * that structure needs to be destroyed and the | ||
3835 | * reference to the inode need to be dropped, when IO is | ||
3836 | * complete, even with 0 byte write, or failed. | ||
3837 | * | ||
3838 | * In the successful AIO DIO case, the io_end structure will be | ||
3839 | * destroyed and the reference to the inode will be dropped | ||
3840 | * after the end_io call back function is called. | ||
3841 | * | ||
3842 | * In the case there is 0 byte write, or error case, since | ||
3843 | * VFS direct IO won't invoke the end_io call back function, | ||
3844 | * we need to free the end_io structure here. | ||
3845 | */ | ||
3846 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | ||
3847 | ext4_free_io_end(iocb->private); | ||
3848 | iocb->private = NULL; | ||
3849 | } else if (ret > 0 && ext4_test_inode_state(inode, | ||
3850 | EXT4_STATE_DIO_UNWRITTEN)) { | ||
3851 | int err; | ||
3852 | /* | ||
3853 | * for non AIO case, since the IO is already | ||
3854 | * completed, we could do the conversion right here | ||
3855 | */ | ||
3856 | err = ext4_convert_unwritten_extents(inode, | ||
3857 | offset, ret); | ||
3858 | if (err < 0) | ||
3859 | ret = err; | ||
3860 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | ||
3861 | } | ||
3862 | return ret; | ||
3863 | } | ||
3864 | |||
3865 | /* for writes beyond the end of file, we fall back to the old way */ | ||
3866 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
3867 | } | ||
3868 | |||
3869 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
3870 | const struct iovec *iov, loff_t offset, | ||
3871 | unsigned long nr_segs) | ||
3872 | { | ||
3873 | struct file *file = iocb->ki_filp; | ||
3874 | struct inode *inode = file->f_mapping->host; | ||
3875 | |||
3876 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
3877 | return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
3878 | |||
3879 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
3880 | } | ||
3881 | |||
3882 | /* | ||
3474 | * Pages can be marked dirty completely asynchronously from ext4's journalling | 3883 | * Pages can be marked dirty completely asynchronously from ext4's journalling |
3475 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do | 3884 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do |
3476 | * much here because ->set_page_dirty is called under VFS locks. The page is | 3885 | * much here because ->set_page_dirty is called under VFS locks. The page is |
@@ -3492,7 +3901,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
3492 | static const struct address_space_operations ext4_ordered_aops = { | 3901 | static const struct address_space_operations ext4_ordered_aops = { |
3493 | .readpage = ext4_readpage, | 3902 | .readpage = ext4_readpage, |
3494 | .readpages = ext4_readpages, | 3903 | .readpages = ext4_readpages, |
3495 | .writepage = ext4_normal_writepage, | 3904 | .writepage = ext4_writepage, |
3496 | .sync_page = block_sync_page, | 3905 | .sync_page = block_sync_page, |
3497 | .write_begin = ext4_write_begin, | 3906 | .write_begin = ext4_write_begin, |
3498 | .write_end = ext4_ordered_write_end, | 3907 | .write_end = ext4_ordered_write_end, |
@@ -3502,12 +3911,13 @@ static const struct address_space_operations ext4_ordered_aops = { | |||
3502 | .direct_IO = ext4_direct_IO, | 3911 | .direct_IO = ext4_direct_IO, |
3503 | .migratepage = buffer_migrate_page, | 3912 | .migratepage = buffer_migrate_page, |
3504 | .is_partially_uptodate = block_is_partially_uptodate, | 3913 | .is_partially_uptodate = block_is_partially_uptodate, |
3914 | .error_remove_page = generic_error_remove_page, | ||
3505 | }; | 3915 | }; |
3506 | 3916 | ||
3507 | static const struct address_space_operations ext4_writeback_aops = { | 3917 | static const struct address_space_operations ext4_writeback_aops = { |
3508 | .readpage = ext4_readpage, | 3918 | .readpage = ext4_readpage, |
3509 | .readpages = ext4_readpages, | 3919 | .readpages = ext4_readpages, |
3510 | .writepage = ext4_normal_writepage, | 3920 | .writepage = ext4_writepage, |
3511 | .sync_page = block_sync_page, | 3921 | .sync_page = block_sync_page, |
3512 | .write_begin = ext4_write_begin, | 3922 | .write_begin = ext4_write_begin, |
3513 | .write_end = ext4_writeback_write_end, | 3923 | .write_end = ext4_writeback_write_end, |
@@ -3517,12 +3927,13 @@ static const struct address_space_operations ext4_writeback_aops = { | |||
3517 | .direct_IO = ext4_direct_IO, | 3927 | .direct_IO = ext4_direct_IO, |
3518 | .migratepage = buffer_migrate_page, | 3928 | .migratepage = buffer_migrate_page, |
3519 | .is_partially_uptodate = block_is_partially_uptodate, | 3929 | .is_partially_uptodate = block_is_partially_uptodate, |
3930 | .error_remove_page = generic_error_remove_page, | ||
3520 | }; | 3931 | }; |
3521 | 3932 | ||
3522 | static const struct address_space_operations ext4_journalled_aops = { | 3933 | static const struct address_space_operations ext4_journalled_aops = { |
3523 | .readpage = ext4_readpage, | 3934 | .readpage = ext4_readpage, |
3524 | .readpages = ext4_readpages, | 3935 | .readpages = ext4_readpages, |
3525 | .writepage = ext4_journalled_writepage, | 3936 | .writepage = ext4_writepage, |
3526 | .sync_page = block_sync_page, | 3937 | .sync_page = block_sync_page, |
3527 | .write_begin = ext4_write_begin, | 3938 | .write_begin = ext4_write_begin, |
3528 | .write_end = ext4_journalled_write_end, | 3939 | .write_end = ext4_journalled_write_end, |
@@ -3531,12 +3942,13 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
3531 | .invalidatepage = ext4_invalidatepage, | 3942 | .invalidatepage = ext4_invalidatepage, |
3532 | .releasepage = ext4_releasepage, | 3943 | .releasepage = ext4_releasepage, |
3533 | .is_partially_uptodate = block_is_partially_uptodate, | 3944 | .is_partially_uptodate = block_is_partially_uptodate, |
3945 | .error_remove_page = generic_error_remove_page, | ||
3534 | }; | 3946 | }; |
3535 | 3947 | ||
3536 | static const struct address_space_operations ext4_da_aops = { | 3948 | static const struct address_space_operations ext4_da_aops = { |
3537 | .readpage = ext4_readpage, | 3949 | .readpage = ext4_readpage, |
3538 | .readpages = ext4_readpages, | 3950 | .readpages = ext4_readpages, |
3539 | .writepage = ext4_da_writepage, | 3951 | .writepage = ext4_writepage, |
3540 | .writepages = ext4_da_writepages, | 3952 | .writepages = ext4_da_writepages, |
3541 | .sync_page = block_sync_page, | 3953 | .sync_page = block_sync_page, |
3542 | .write_begin = ext4_da_write_begin, | 3954 | .write_begin = ext4_da_write_begin, |
@@ -3547,6 +3959,7 @@ static const struct address_space_operations ext4_da_aops = { | |||
3547 | .direct_IO = ext4_direct_IO, | 3959 | .direct_IO = ext4_direct_IO, |
3548 | .migratepage = buffer_migrate_page, | 3960 | .migratepage = buffer_migrate_page, |
3549 | .is_partially_uptodate = block_is_partially_uptodate, | 3961 | .is_partially_uptodate = block_is_partially_uptodate, |
3962 | .error_remove_page = generic_error_remove_page, | ||
3550 | }; | 3963 | }; |
3551 | 3964 | ||
3552 | void ext4_set_aops(struct inode *inode) | 3965 | void ext4_set_aops(struct inode *inode) |
@@ -3583,7 +3996,8 @@ int ext4_block_truncate_page(handle_t *handle, | |||
3583 | struct page *page; | 3996 | struct page *page; |
3584 | int err = 0; | 3997 | int err = 0; |
3585 | 3998 | ||
3586 | page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); | 3999 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, |
4000 | mapping_gfp_mask(mapping) & ~__GFP_FS); | ||
3587 | if (!page) | 4001 | if (!page) |
3588 | return -EINVAL; | 4002 | return -EINVAL; |
3589 | 4003 | ||
@@ -3591,17 +4005,6 @@ int ext4_block_truncate_page(handle_t *handle, | |||
3591 | length = blocksize - (offset & (blocksize - 1)); | 4005 | length = blocksize - (offset & (blocksize - 1)); |
3592 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 4006 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); |
3593 | 4007 | ||
3594 | /* | ||
3595 | * For "nobh" option, we can only work if we don't need to | ||
3596 | * read-in the page - otherwise we create buffers to do the IO. | ||
3597 | */ | ||
3598 | if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && | ||
3599 | ext4_should_writeback_data(inode) && PageUptodate(page)) { | ||
3600 | zero_user(page, offset, length); | ||
3601 | set_page_dirty(page); | ||
3602 | goto unlock; | ||
3603 | } | ||
3604 | |||
3605 | if (!page_has_buffers(page)) | 4008 | if (!page_has_buffers(page)) |
3606 | create_empty_buffers(page, blocksize, 0); | 4009 | create_empty_buffers(page, blocksize, 0); |
3607 | 4010 | ||
@@ -3658,7 +4061,7 @@ int ext4_block_truncate_page(handle_t *handle, | |||
3658 | if (ext4_should_journal_data(inode)) { | 4061 | if (ext4_should_journal_data(inode)) { |
3659 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 4062 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
3660 | } else { | 4063 | } else { |
3661 | if (ext4_should_order_data(inode)) | 4064 | if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) |
3662 | err = ext4_jbd2_file_inode(handle, inode); | 4065 | err = ext4_jbd2_file_inode(handle, inode); |
3663 | mark_buffer_dirty(bh); | 4066 | mark_buffer_dirty(bh); |
3664 | } | 4067 | } |
@@ -3725,7 +4128,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth, | |||
3725 | int k, err; | 4128 | int k, err; |
3726 | 4129 | ||
3727 | *top = 0; | 4130 | *top = 0; |
3728 | /* Make k index the deepest non-null offest + 1 */ | 4131 | /* Make k index the deepest non-null offset + 1 */ |
3729 | for (k = depth; k > 1 && !offsets[k-1]; k--) | 4132 | for (k = depth; k > 1 && !offsets[k-1]; k--) |
3730 | ; | 4133 | ; |
3731 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | 4134 | partial = ext4_get_branch(inode, k, offsets, chain, &err); |
@@ -3774,47 +4177,58 @@ no_top: | |||
3774 | * We release `count' blocks on disk, but (last - first) may be greater | 4177 | * We release `count' blocks on disk, but (last - first) may be greater |
3775 | * than `count' because there can be holes in there. | 4178 | * than `count' because there can be holes in there. |
3776 | */ | 4179 | */ |
3777 | static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | 4180 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, |
3778 | struct buffer_head *bh, | 4181 | struct buffer_head *bh, |
3779 | ext4_fsblk_t block_to_free, | 4182 | ext4_fsblk_t block_to_free, |
3780 | unsigned long count, __le32 *first, | 4183 | unsigned long count, __le32 *first, |
3781 | __le32 *last) | 4184 | __le32 *last) |
3782 | { | 4185 | { |
3783 | __le32 *p; | 4186 | __le32 *p; |
4187 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
4188 | int err; | ||
4189 | |||
4190 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
4191 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
4192 | |||
4193 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
4194 | count)) { | ||
4195 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
4196 | "blocks %llu len %lu", | ||
4197 | (unsigned long long) block_to_free, count); | ||
4198 | return 1; | ||
4199 | } | ||
4200 | |||
3784 | if (try_to_extend_transaction(handle, inode)) { | 4201 | if (try_to_extend_transaction(handle, inode)) { |
3785 | if (bh) { | 4202 | if (bh) { |
3786 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 4203 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
3787 | ext4_handle_dirty_metadata(handle, inode, bh); | 4204 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
4205 | if (unlikely(err)) { | ||
4206 | ext4_std_error(inode->i_sb, err); | ||
4207 | return 1; | ||
4208 | } | ||
4209 | } | ||
4210 | err = ext4_mark_inode_dirty(handle, inode); | ||
4211 | if (unlikely(err)) { | ||
4212 | ext4_std_error(inode->i_sb, err); | ||
4213 | return 1; | ||
4214 | } | ||
4215 | err = ext4_truncate_restart_trans(handle, inode, | ||
4216 | blocks_for_truncate(inode)); | ||
4217 | if (unlikely(err)) { | ||
4218 | ext4_std_error(inode->i_sb, err); | ||
4219 | return 1; | ||
3788 | } | 4220 | } |
3789 | ext4_mark_inode_dirty(handle, inode); | ||
3790 | ext4_journal_test_restart(handle, inode); | ||
3791 | if (bh) { | 4221 | if (bh) { |
3792 | BUFFER_TRACE(bh, "retaking write access"); | 4222 | BUFFER_TRACE(bh, "retaking write access"); |
3793 | ext4_journal_get_write_access(handle, bh); | 4223 | ext4_journal_get_write_access(handle, bh); |
3794 | } | 4224 | } |
3795 | } | 4225 | } |
3796 | 4226 | ||
3797 | /* | 4227 | for (p = first; p < last; p++) |
3798 | * Any buffers which are on the journal will be in memory. We | 4228 | *p = 0; |
3799 | * find them on the hash table so jbd2_journal_revoke() will | ||
3800 | * run jbd2_journal_forget() on them. We've already detached | ||
3801 | * each block from the file, so bforget() in | ||
3802 | * jbd2_journal_forget() should be safe. | ||
3803 | * | ||
3804 | * AKPM: turn on bforget in jbd2_journal_forget()!!! | ||
3805 | */ | ||
3806 | for (p = first; p < last; p++) { | ||
3807 | u32 nr = le32_to_cpu(*p); | ||
3808 | if (nr) { | ||
3809 | struct buffer_head *tbh; | ||
3810 | 4229 | ||
3811 | *p = 0; | 4230 | ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); |
3812 | tbh = sb_find_get_block(inode->i_sb, nr); | 4231 | return 0; |
3813 | ext4_forget(handle, 0, inode, tbh, nr); | ||
3814 | } | ||
3815 | } | ||
3816 | |||
3817 | ext4_free_blocks(handle, inode, block_to_free, count, 0); | ||
3818 | } | 4232 | } |
3819 | 4233 | ||
3820 | /** | 4234 | /** |
@@ -3870,9 +4284,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
3870 | } else if (nr == block_to_free + count) { | 4284 | } else if (nr == block_to_free + count) { |
3871 | count++; | 4285 | count++; |
3872 | } else { | 4286 | } else { |
3873 | ext4_clear_blocks(handle, inode, this_bh, | 4287 | if (ext4_clear_blocks(handle, inode, this_bh, |
3874 | block_to_free, | 4288 | block_to_free, count, |
3875 | count, block_to_free_p, p); | 4289 | block_to_free_p, p)) |
4290 | break; | ||
3876 | block_to_free = nr; | 4291 | block_to_free = nr; |
3877 | block_to_free_p = p; | 4292 | block_to_free_p = p; |
3878 | count = 1; | 4293 | count = 1; |
@@ -3896,11 +4311,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
3896 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | 4311 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) |
3897 | ext4_handle_dirty_metadata(handle, inode, this_bh); | 4312 | ext4_handle_dirty_metadata(handle, inode, this_bh); |
3898 | else | 4313 | else |
3899 | ext4_error(inode->i_sb, __func__, | 4314 | EXT4_ERROR_INODE(inode, |
3900 | "circular indirect block detected, " | 4315 | "circular indirect block detected at " |
3901 | "inode=%lu, block=%llu", | 4316 | "block %llu", |
3902 | inode->i_ino, | 4317 | (unsigned long long) this_bh->b_blocknr); |
3903 | (unsigned long long) this_bh->b_blocknr); | ||
3904 | } | 4318 | } |
3905 | } | 4319 | } |
3906 | 4320 | ||
@@ -3936,6 +4350,15 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3936 | if (!nr) | 4350 | if (!nr) |
3937 | continue; /* A hole */ | 4351 | continue; /* A hole */ |
3938 | 4352 | ||
4353 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
4354 | nr, 1)) { | ||
4355 | EXT4_ERROR_INODE(inode, | ||
4356 | "invalid indirect mapped " | ||
4357 | "block %lu (level %d)", | ||
4358 | (unsigned long) nr, depth); | ||
4359 | break; | ||
4360 | } | ||
4361 | |||
3939 | /* Go read the buffer for the next level down */ | 4362 | /* Go read the buffer for the next level down */ |
3940 | bh = sb_bread(inode->i_sb, nr); | 4363 | bh = sb_bread(inode->i_sb, nr); |
3941 | 4364 | ||
@@ -3944,9 +4367,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3944 | * (should be rare). | 4367 | * (should be rare). |
3945 | */ | 4368 | */ |
3946 | if (!bh) { | 4369 | if (!bh) { |
3947 | ext4_error(inode->i_sb, "ext4_free_branches", | 4370 | EXT4_ERROR_INODE_BLOCK(inode, nr, |
3948 | "Read failure, inode=%lu, block=%llu", | 4371 | "Read failure"); |
3949 | inode->i_ino, nr); | ||
3950 | continue; | 4372 | continue; |
3951 | } | 4373 | } |
3952 | 4374 | ||
@@ -3956,27 +4378,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3956 | (__le32 *) bh->b_data, | 4378 | (__le32 *) bh->b_data, |
3957 | (__le32 *) bh->b_data + addr_per_block, | 4379 | (__le32 *) bh->b_data + addr_per_block, |
3958 | depth); | 4380 | depth); |
3959 | 4381 | brelse(bh); | |
3960 | /* | ||
3961 | * We've probably journalled the indirect block several | ||
3962 | * times during the truncate. But it's no longer | ||
3963 | * needed and we now drop it from the transaction via | ||
3964 | * jbd2_journal_revoke(). | ||
3965 | * | ||
3966 | * That's easy if it's exclusively part of this | ||
3967 | * transaction. But if it's part of the committing | ||
3968 | * transaction then jbd2_journal_forget() will simply | ||
3969 | * brelse() it. That means that if the underlying | ||
3970 | * block is reallocated in ext4_get_block(), | ||
3971 | * unmap_underlying_metadata() will find this block | ||
3972 | * and will try to get rid of it. damn, damn. | ||
3973 | * | ||
3974 | * If this block has already been committed to the | ||
3975 | * journal, a revoke record will be written. And | ||
3976 | * revoke records must be emitted *before* clearing | ||
3977 | * this block's bit in the bitmaps. | ||
3978 | */ | ||
3979 | ext4_forget(handle, 1, inode, bh, bh->b_blocknr); | ||
3980 | 4382 | ||
3981 | /* | 4383 | /* |
3982 | * Everything below this this pointer has been | 4384 | * Everything below this this pointer has been |
@@ -3998,10 +4400,24 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3998 | return; | 4400 | return; |
3999 | if (try_to_extend_transaction(handle, inode)) { | 4401 | if (try_to_extend_transaction(handle, inode)) { |
4000 | ext4_mark_inode_dirty(handle, inode); | 4402 | ext4_mark_inode_dirty(handle, inode); |
4001 | ext4_journal_test_restart(handle, inode); | 4403 | ext4_truncate_restart_trans(handle, inode, |
4404 | blocks_for_truncate(inode)); | ||
4002 | } | 4405 | } |
4003 | 4406 | ||
4004 | ext4_free_blocks(handle, inode, nr, 1, 1); | 4407 | /* |
4408 | * The forget flag here is critical because if | ||
4409 | * we are journaling (and not doing data | ||
4410 | * journaling), we have to make sure a revoke | ||
4411 | * record is written to prevent the journal | ||
4412 | * replay from overwriting the (former) | ||
4413 | * indirect block if it gets reallocated as a | ||
4414 | * data block. This must happen in the same | ||
4415 | * transaction where the data blocks are | ||
4416 | * actually freed. | ||
4417 | */ | ||
4418 | ext4_free_blocks(handle, inode, 0, nr, 1, | ||
4419 | EXT4_FREE_BLOCKS_METADATA| | ||
4420 | EXT4_FREE_BLOCKS_FORGET); | ||
4005 | 4421 | ||
4006 | if (parent_bh) { | 4422 | if (parent_bh) { |
4007 | /* | 4423 | /* |
@@ -4086,11 +4502,12 @@ void ext4_truncate(struct inode *inode) | |||
4086 | if (!ext4_can_truncate(inode)) | 4502 | if (!ext4_can_truncate(inode)) |
4087 | return; | 4503 | return; |
4088 | 4504 | ||
4089 | if (ei->i_disksize && inode->i_size == 0 && | 4505 | ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); |
4090 | !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | ||
4091 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; | ||
4092 | 4506 | ||
4093 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 4507 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
4508 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); | ||
4509 | |||
4510 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | ||
4094 | ext4_ext_truncate(inode); | 4511 | ext4_ext_truncate(inode); |
4095 | return; | 4512 | return; |
4096 | } | 4513 | } |
@@ -4258,9 +4675,8 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
4258 | 4675 | ||
4259 | bh = sb_getblk(sb, block); | 4676 | bh = sb_getblk(sb, block); |
4260 | if (!bh) { | 4677 | if (!bh) { |
4261 | ext4_error(sb, "ext4_get_inode_loc", "unable to read " | 4678 | EXT4_ERROR_INODE_BLOCK(inode, block, |
4262 | "inode block - inode=%lu, block=%llu", | 4679 | "unable to read itable block"); |
4263 | inode->i_ino, block); | ||
4264 | return -EIO; | 4680 | return -EIO; |
4265 | } | 4681 | } |
4266 | if (!buffer_uptodate(bh)) { | 4682 | if (!buffer_uptodate(bh)) { |
@@ -4358,9 +4774,8 @@ make_io: | |||
4358 | submit_bh(READ_META, bh); | 4774 | submit_bh(READ_META, bh); |
4359 | wait_on_buffer(bh); | 4775 | wait_on_buffer(bh); |
4360 | if (!buffer_uptodate(bh)) { | 4776 | if (!buffer_uptodate(bh)) { |
4361 | ext4_error(sb, __func__, | 4777 | EXT4_ERROR_INODE_BLOCK(inode, block, |
4362 | "unable to read inode block - inode=%lu, " | 4778 | "unable to read itable block"); |
4363 | "block=%llu", inode->i_ino, block); | ||
4364 | brelse(bh); | 4779 | brelse(bh); |
4365 | return -EIO; | 4780 | return -EIO; |
4366 | } | 4781 | } |
@@ -4374,7 +4789,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) | |||
4374 | { | 4789 | { |
4375 | /* We have all inode data except xattrs in memory here. */ | 4790 | /* We have all inode data except xattrs in memory here. */ |
4376 | return __ext4_get_inode_loc(inode, iloc, | 4791 | return __ext4_get_inode_loc(inode, iloc, |
4377 | !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); | 4792 | !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); |
4378 | } | 4793 | } |
4379 | 4794 | ||
4380 | void ext4_set_inode_flags(struct inode *inode) | 4795 | void ext4_set_inode_flags(struct inode *inode) |
@@ -4397,20 +4812,26 @@ void ext4_set_inode_flags(struct inode *inode) | |||
4397 | /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ | 4812 | /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ |
4398 | void ext4_get_inode_flags(struct ext4_inode_info *ei) | 4813 | void ext4_get_inode_flags(struct ext4_inode_info *ei) |
4399 | { | 4814 | { |
4400 | unsigned int flags = ei->vfs_inode.i_flags; | 4815 | unsigned int vfs_fl; |
4401 | 4816 | unsigned long old_fl, new_fl; | |
4402 | ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| | 4817 | |
4403 | EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); | 4818 | do { |
4404 | if (flags & S_SYNC) | 4819 | vfs_fl = ei->vfs_inode.i_flags; |
4405 | ei->i_flags |= EXT4_SYNC_FL; | 4820 | old_fl = ei->i_flags; |
4406 | if (flags & S_APPEND) | 4821 | new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| |
4407 | ei->i_flags |= EXT4_APPEND_FL; | 4822 | EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| |
4408 | if (flags & S_IMMUTABLE) | 4823 | EXT4_DIRSYNC_FL); |
4409 | ei->i_flags |= EXT4_IMMUTABLE_FL; | 4824 | if (vfs_fl & S_SYNC) |
4410 | if (flags & S_NOATIME) | 4825 | new_fl |= EXT4_SYNC_FL; |
4411 | ei->i_flags |= EXT4_NOATIME_FL; | 4826 | if (vfs_fl & S_APPEND) |
4412 | if (flags & S_DIRSYNC) | 4827 | new_fl |= EXT4_APPEND_FL; |
4413 | ei->i_flags |= EXT4_DIRSYNC_FL; | 4828 | if (vfs_fl & S_IMMUTABLE) |
4829 | new_fl |= EXT4_IMMUTABLE_FL; | ||
4830 | if (vfs_fl & S_NOATIME) | ||
4831 | new_fl |= EXT4_NOATIME_FL; | ||
4832 | if (vfs_fl & S_DIRSYNC) | ||
4833 | new_fl |= EXT4_DIRSYNC_FL; | ||
4834 | } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); | ||
4414 | } | 4835 | } |
4415 | 4836 | ||
4416 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, | 4837 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, |
@@ -4425,7 +4846,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, | |||
4425 | /* we are using combined 48 bit field */ | 4846 | /* we are using combined 48 bit field */ |
4426 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | | 4847 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | |
4427 | le32_to_cpu(raw_inode->i_blocks_lo); | 4848 | le32_to_cpu(raw_inode->i_blocks_lo); |
4428 | if (ei->i_flags & EXT4_HUGE_FILE_FL) { | 4849 | if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { |
4429 | /* i_blocks represent file system block size */ | 4850 | /* i_blocks represent file system block size */ |
4430 | return i_blocks << (inode->i_blkbits - 9); | 4851 | return i_blocks << (inode->i_blkbits - 9); |
4431 | } else { | 4852 | } else { |
@@ -4441,8 +4862,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4441 | struct ext4_iloc iloc; | 4862 | struct ext4_iloc iloc; |
4442 | struct ext4_inode *raw_inode; | 4863 | struct ext4_inode *raw_inode; |
4443 | struct ext4_inode_info *ei; | 4864 | struct ext4_inode_info *ei; |
4444 | struct buffer_head *bh; | ||
4445 | struct inode *inode; | 4865 | struct inode *inode; |
4866 | journal_t *journal = EXT4_SB(sb)->s_journal; | ||
4446 | long ret; | 4867 | long ret; |
4447 | int block; | 4868 | int block; |
4448 | 4869 | ||
@@ -4453,15 +4874,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4453 | return inode; | 4874 | return inode; |
4454 | 4875 | ||
4455 | ei = EXT4_I(inode); | 4876 | ei = EXT4_I(inode); |
4456 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 4877 | iloc.bh = 0; |
4457 | ei->i_acl = EXT4_ACL_NOT_CACHED; | ||
4458 | ei->i_default_acl = EXT4_ACL_NOT_CACHED; | ||
4459 | #endif | ||
4460 | 4878 | ||
4461 | ret = __ext4_get_inode_loc(inode, &iloc, 0); | 4879 | ret = __ext4_get_inode_loc(inode, &iloc, 0); |
4462 | if (ret < 0) | 4880 | if (ret < 0) |
4463 | goto bad_inode; | 4881 | goto bad_inode; |
4464 | bh = iloc.bh; | ||
4465 | raw_inode = ext4_raw_inode(&iloc); | 4882 | raw_inode = ext4_raw_inode(&iloc); |
4466 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | 4883 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); |
4467 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); | 4884 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); |
@@ -4472,7 +4889,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4472 | } | 4889 | } |
4473 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); | 4890 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); |
4474 | 4891 | ||
4475 | ei->i_state = 0; | 4892 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ |
4476 | ei->i_dir_start_lookup = 0; | 4893 | ei->i_dir_start_lookup = 0; |
4477 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); | 4894 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); |
4478 | /* We now have enough fields to check if the inode was active or not. | 4895 | /* We now have enough fields to check if the inode was active or not. |
@@ -4484,7 +4901,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4484 | if (inode->i_mode == 0 || | 4901 | if (inode->i_mode == 0 || |
4485 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { | 4902 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { |
4486 | /* this inode is deleted */ | 4903 | /* this inode is deleted */ |
4487 | brelse(bh); | ||
4488 | ret = -ESTALE; | 4904 | ret = -ESTALE; |
4489 | goto bad_inode; | 4905 | goto bad_inode; |
4490 | } | 4906 | } |
@@ -4501,6 +4917,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4501 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; | 4917 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; |
4502 | inode->i_size = ext4_isize(raw_inode); | 4918 | inode->i_size = ext4_isize(raw_inode); |
4503 | ei->i_disksize = inode->i_size; | 4919 | ei->i_disksize = inode->i_size; |
4920 | #ifdef CONFIG_QUOTA | ||
4921 | ei->i_reserved_quota = 0; | ||
4922 | #endif | ||
4504 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); | 4923 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); |
4505 | ei->i_block_group = iloc.block_group; | 4924 | ei->i_block_group = iloc.block_group; |
4506 | ei->i_last_alloc_group = ~0; | 4925 | ei->i_last_alloc_group = ~0; |
@@ -4512,11 +4931,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4512 | ei->i_data[block] = raw_inode->i_block[block]; | 4931 | ei->i_data[block] = raw_inode->i_block[block]; |
4513 | INIT_LIST_HEAD(&ei->i_orphan); | 4932 | INIT_LIST_HEAD(&ei->i_orphan); |
4514 | 4933 | ||
4934 | /* | ||
4935 | * Set transaction id's of transactions that have to be committed | ||
4936 | * to finish f[data]sync. We set them to currently running transaction | ||
4937 | * as we cannot be sure that the inode or some of its metadata isn't | ||
4938 | * part of the transaction - the inode could have been reclaimed and | ||
4939 | * now it is reread from disk. | ||
4940 | */ | ||
4941 | if (journal) { | ||
4942 | transaction_t *transaction; | ||
4943 | tid_t tid; | ||
4944 | |||
4945 | read_lock(&journal->j_state_lock); | ||
4946 | if (journal->j_running_transaction) | ||
4947 | transaction = journal->j_running_transaction; | ||
4948 | else | ||
4949 | transaction = journal->j_committing_transaction; | ||
4950 | if (transaction) | ||
4951 | tid = transaction->t_tid; | ||
4952 | else | ||
4953 | tid = journal->j_commit_sequence; | ||
4954 | read_unlock(&journal->j_state_lock); | ||
4955 | ei->i_sync_tid = tid; | ||
4956 | ei->i_datasync_tid = tid; | ||
4957 | } | ||
4958 | |||
4515 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | 4959 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
4516 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); | 4960 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); |
4517 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > | 4961 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > |
4518 | EXT4_INODE_SIZE(inode->i_sb)) { | 4962 | EXT4_INODE_SIZE(inode->i_sb)) { |
4519 | brelse(bh); | ||
4520 | ret = -EIO; | 4963 | ret = -EIO; |
4521 | goto bad_inode; | 4964 | goto bad_inode; |
4522 | } | 4965 | } |
@@ -4529,7 +4972,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4529 | EXT4_GOOD_OLD_INODE_SIZE + | 4972 | EXT4_GOOD_OLD_INODE_SIZE + |
4530 | ei->i_extra_isize; | 4973 | ei->i_extra_isize; |
4531 | if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) | 4974 | if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) |
4532 | ei->i_state |= EXT4_STATE_XATTR; | 4975 | ext4_set_inode_state(inode, EXT4_STATE_XATTR); |
4533 | } | 4976 | } |
4534 | } else | 4977 | } else |
4535 | ei->i_extra_isize = 0; | 4978 | ei->i_extra_isize = 0; |
@@ -4548,16 +4991,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4548 | 4991 | ||
4549 | ret = 0; | 4992 | ret = 0; |
4550 | if (ei->i_file_acl && | 4993 | if (ei->i_file_acl && |
4551 | ((ei->i_file_acl < | 4994 | !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { |
4552 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + | 4995 | EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", |
4553 | EXT4_SB(sb)->s_gdb_count)) || | 4996 | ei->i_file_acl); |
4554 | (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { | ||
4555 | ext4_error(sb, __func__, | ||
4556 | "bad extended attribute block %llu in inode #%lu", | ||
4557 | ei->i_file_acl, inode->i_ino); | ||
4558 | ret = -EIO; | 4997 | ret = -EIO; |
4559 | goto bad_inode; | 4998 | goto bad_inode; |
4560 | } else if (ei->i_flags & EXT4_EXTENTS_FL) { | 4999 | } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
4561 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 5000 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
4562 | (S_ISLNK(inode->i_mode) && | 5001 | (S_ISLNK(inode->i_mode) && |
4563 | !ext4_inode_is_fast_symlink(inode))) | 5002 | !ext4_inode_is_fast_symlink(inode))) |
@@ -4569,10 +5008,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4569 | /* Validate block references which are part of inode */ | 5008 | /* Validate block references which are part of inode */ |
4570 | ret = ext4_check_inode_blockref(inode); | 5009 | ret = ext4_check_inode_blockref(inode); |
4571 | } | 5010 | } |
4572 | if (ret) { | 5011 | if (ret) |
4573 | brelse(bh); | ||
4574 | goto bad_inode; | 5012 | goto bad_inode; |
4575 | } | ||
4576 | 5013 | ||
4577 | if (S_ISREG(inode->i_mode)) { | 5014 | if (S_ISREG(inode->i_mode)) { |
4578 | inode->i_op = &ext4_file_inode_operations; | 5015 | inode->i_op = &ext4_file_inode_operations; |
@@ -4600,11 +5037,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4600 | init_special_inode(inode, inode->i_mode, | 5037 | init_special_inode(inode, inode->i_mode, |
4601 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | 5038 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); |
4602 | } else { | 5039 | } else { |
4603 | brelse(bh); | ||
4604 | ret = -EIO; | 5040 | ret = -EIO; |
4605 | ext4_error(inode->i_sb, __func__, | 5041 | EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); |
4606 | "bogus i_mode (%o) for inode=%lu", | ||
4607 | inode->i_mode, inode->i_ino); | ||
4608 | goto bad_inode; | 5042 | goto bad_inode; |
4609 | } | 5043 | } |
4610 | brelse(iloc.bh); | 5044 | brelse(iloc.bh); |
@@ -4613,6 +5047,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4613 | return inode; | 5047 | return inode; |
4614 | 5048 | ||
4615 | bad_inode: | 5049 | bad_inode: |
5050 | brelse(iloc.bh); | ||
4616 | iget_failed(inode); | 5051 | iget_failed(inode); |
4617 | return ERR_PTR(ret); | 5052 | return ERR_PTR(ret); |
4618 | } | 5053 | } |
@@ -4632,7 +5067,7 @@ static int ext4_inode_blocks_set(handle_t *handle, | |||
4632 | */ | 5067 | */ |
4633 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 5068 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
4634 | raw_inode->i_blocks_high = 0; | 5069 | raw_inode->i_blocks_high = 0; |
4635 | ei->i_flags &= ~EXT4_HUGE_FILE_FL; | 5070 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
4636 | return 0; | 5071 | return 0; |
4637 | } | 5072 | } |
4638 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) | 5073 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) |
@@ -4645,9 +5080,9 @@ static int ext4_inode_blocks_set(handle_t *handle, | |||
4645 | */ | 5080 | */ |
4646 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 5081 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
4647 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | 5082 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); |
4648 | ei->i_flags &= ~EXT4_HUGE_FILE_FL; | 5083 | ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
4649 | } else { | 5084 | } else { |
4650 | ei->i_flags |= EXT4_HUGE_FILE_FL; | 5085 | ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); |
4651 | /* i_block is stored in file system block size */ | 5086 | /* i_block is stored in file system block size */ |
4652 | i_blocks = i_blocks >> (inode->i_blkbits - 9); | 5087 | i_blocks = i_blocks >> (inode->i_blkbits - 9); |
4653 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | 5088 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); |
@@ -4674,7 +5109,7 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4674 | 5109 | ||
4675 | /* For fields not tracked in the in-memory inode, | 5110 | /* For fields not tracked in the in-memory inode, |
4676 | * initialise them to zero for new inodes. */ | 5111 | * initialise them to zero for new inodes. */ |
4677 | if (ei->i_state & EXT4_STATE_NEW) | 5112 | if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) |
4678 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); | 5113 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); |
4679 | 5114 | ||
4680 | ext4_get_inode_flags(ei); | 5115 | ext4_get_inode_flags(ei); |
@@ -4713,8 +5148,7 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4713 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) | 5148 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) |
4714 | goto out_brelse; | 5149 | goto out_brelse; |
4715 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | 5150 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); |
4716 | /* clear the migrate flag in the raw_inode */ | 5151 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); |
4717 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); | ||
4718 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 5152 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
4719 | cpu_to_le32(EXT4_OS_HURD)) | 5153 | cpu_to_le32(EXT4_OS_HURD)) |
4720 | raw_inode->i_file_acl_high = | 5154 | raw_inode->i_file_acl_high = |
@@ -4739,7 +5173,7 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4739 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); | 5173 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); |
4740 | sb->s_dirt = 1; | 5174 | sb->s_dirt = 1; |
4741 | ext4_handle_sync(handle); | 5175 | ext4_handle_sync(handle); |
4742 | err = ext4_handle_dirty_metadata(handle, inode, | 5176 | err = ext4_handle_dirty_metadata(handle, NULL, |
4743 | EXT4_SB(sb)->s_sbh); | 5177 | EXT4_SB(sb)->s_sbh); |
4744 | } | 5178 | } |
4745 | } | 5179 | } |
@@ -4768,11 +5202,12 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4768 | } | 5202 | } |
4769 | 5203 | ||
4770 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 5204 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
4771 | rc = ext4_handle_dirty_metadata(handle, inode, bh); | 5205 | rc = ext4_handle_dirty_metadata(handle, NULL, bh); |
4772 | if (!err) | 5206 | if (!err) |
4773 | err = rc; | 5207 | err = rc; |
4774 | ei->i_state &= ~EXT4_STATE_NEW; | 5208 | ext4_clear_inode_state(inode, EXT4_STATE_NEW); |
4775 | 5209 | ||
5210 | ext4_update_inode_fsync_trans(handle, inode, 0); | ||
4776 | out_brelse: | 5211 | out_brelse: |
4777 | brelse(bh); | 5212 | brelse(bh); |
4778 | ext4_std_error(inode->i_sb, err); | 5213 | ext4_std_error(inode->i_sb, err); |
@@ -4814,21 +5249,40 @@ out_brelse: | |||
4814 | * `stuff()' is running, and the new i_size will be lost. Plus the inode | 5249 | * `stuff()' is running, and the new i_size will be lost. Plus the inode |
4815 | * will no longer be on the superblock's dirty inode list. | 5250 | * will no longer be on the superblock's dirty inode list. |
4816 | */ | 5251 | */ |
4817 | int ext4_write_inode(struct inode *inode, int wait) | 5252 | int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) |
4818 | { | 5253 | { |
5254 | int err; | ||
5255 | |||
4819 | if (current->flags & PF_MEMALLOC) | 5256 | if (current->flags & PF_MEMALLOC) |
4820 | return 0; | 5257 | return 0; |
4821 | 5258 | ||
4822 | if (ext4_journal_current_handle()) { | 5259 | if (EXT4_SB(inode->i_sb)->s_journal) { |
4823 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); | 5260 | if (ext4_journal_current_handle()) { |
4824 | dump_stack(); | 5261 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); |
4825 | return -EIO; | 5262 | dump_stack(); |
4826 | } | 5263 | return -EIO; |
5264 | } | ||
4827 | 5265 | ||
4828 | if (!wait) | 5266 | if (wbc->sync_mode != WB_SYNC_ALL) |
4829 | return 0; | 5267 | return 0; |
5268 | |||
5269 | err = ext4_force_commit(inode->i_sb); | ||
5270 | } else { | ||
5271 | struct ext4_iloc iloc; | ||
4830 | 5272 | ||
4831 | return ext4_force_commit(inode->i_sb); | 5273 | err = __ext4_get_inode_loc(inode, &iloc, 0); |
5274 | if (err) | ||
5275 | return err; | ||
5276 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
5277 | sync_dirty_buffer(iloc.bh); | ||
5278 | if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { | ||
5279 | EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, | ||
5280 | "IO error syncing inode"); | ||
5281 | err = -EIO; | ||
5282 | } | ||
5283 | brelse(iloc.bh); | ||
5284 | } | ||
5285 | return err; | ||
4832 | } | 5286 | } |
4833 | 5287 | ||
4834 | /* | 5288 | /* |
@@ -4859,25 +5313,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4859 | { | 5313 | { |
4860 | struct inode *inode = dentry->d_inode; | 5314 | struct inode *inode = dentry->d_inode; |
4861 | int error, rc = 0; | 5315 | int error, rc = 0; |
5316 | int orphan = 0; | ||
4862 | const unsigned int ia_valid = attr->ia_valid; | 5317 | const unsigned int ia_valid = attr->ia_valid; |
4863 | 5318 | ||
4864 | error = inode_change_ok(inode, attr); | 5319 | error = inode_change_ok(inode, attr); |
4865 | if (error) | 5320 | if (error) |
4866 | return error; | 5321 | return error; |
4867 | 5322 | ||
5323 | if (is_quota_modification(inode, attr)) | ||
5324 | dquot_initialize(inode); | ||
4868 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || | 5325 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || |
4869 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { | 5326 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { |
4870 | handle_t *handle; | 5327 | handle_t *handle; |
4871 | 5328 | ||
4872 | /* (user+group)*(old+new) structure, inode write (sb, | 5329 | /* (user+group)*(old+new) structure, inode write (sb, |
4873 | * inode block, ? - but truncate inode update has it) */ | 5330 | * inode block, ? - but truncate inode update has it) */ |
4874 | handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ | 5331 | handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ |
4875 | EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); | 5332 | EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); |
4876 | if (IS_ERR(handle)) { | 5333 | if (IS_ERR(handle)) { |
4877 | error = PTR_ERR(handle); | 5334 | error = PTR_ERR(handle); |
4878 | goto err_out; | 5335 | goto err_out; |
4879 | } | 5336 | } |
4880 | error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; | 5337 | error = dquot_transfer(inode, attr); |
4881 | if (error) { | 5338 | if (error) { |
4882 | ext4_journal_stop(handle); | 5339 | ext4_journal_stop(handle); |
4883 | return error; | 5340 | return error; |
@@ -4893,18 +5350,18 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4893 | } | 5350 | } |
4894 | 5351 | ||
4895 | if (attr->ia_valid & ATTR_SIZE) { | 5352 | if (attr->ia_valid & ATTR_SIZE) { |
4896 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { | 5353 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { |
4897 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 5354 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
4898 | 5355 | ||
4899 | if (attr->ia_size > sbi->s_bitmap_maxbytes) { | 5356 | if (attr->ia_size > sbi->s_bitmap_maxbytes) |
4900 | error = -EFBIG; | 5357 | return -EFBIG; |
4901 | goto err_out; | ||
4902 | } | ||
4903 | } | 5358 | } |
4904 | } | 5359 | } |
4905 | 5360 | ||
4906 | if (S_ISREG(inode->i_mode) && | 5361 | if (S_ISREG(inode->i_mode) && |
4907 | attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { | 5362 | attr->ia_valid & ATTR_SIZE && |
5363 | (attr->ia_size < inode->i_size || | ||
5364 | (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { | ||
4908 | handle_t *handle; | 5365 | handle_t *handle; |
4909 | 5366 | ||
4910 | handle = ext4_journal_start(inode, 3); | 5367 | handle = ext4_journal_start(inode, 3); |
@@ -4912,8 +5369,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4912 | error = PTR_ERR(handle); | 5369 | error = PTR_ERR(handle); |
4913 | goto err_out; | 5370 | goto err_out; |
4914 | } | 5371 | } |
4915 | 5372 | if (ext4_handle_valid(handle)) { | |
4916 | error = ext4_orphan_add(handle, inode); | 5373 | error = ext4_orphan_add(handle, inode); |
5374 | orphan = 1; | ||
5375 | } | ||
4917 | EXT4_I(inode)->i_disksize = attr->ia_size; | 5376 | EXT4_I(inode)->i_disksize = attr->ia_size; |
4918 | rc = ext4_mark_inode_dirty(handle, inode); | 5377 | rc = ext4_mark_inode_dirty(handle, inode); |
4919 | if (!error) | 5378 | if (!error) |
@@ -4931,18 +5390,30 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
4931 | goto err_out; | 5390 | goto err_out; |
4932 | } | 5391 | } |
4933 | ext4_orphan_del(handle, inode); | 5392 | ext4_orphan_del(handle, inode); |
5393 | orphan = 0; | ||
4934 | ext4_journal_stop(handle); | 5394 | ext4_journal_stop(handle); |
4935 | goto err_out; | 5395 | goto err_out; |
4936 | } | 5396 | } |
4937 | } | 5397 | } |
5398 | /* ext4_truncate will clear the flag */ | ||
5399 | if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) | ||
5400 | ext4_truncate(inode); | ||
4938 | } | 5401 | } |
4939 | 5402 | ||
4940 | rc = inode_setattr(inode, attr); | 5403 | if ((attr->ia_valid & ATTR_SIZE) && |
5404 | attr->ia_size != i_size_read(inode)) | ||
5405 | rc = vmtruncate(inode, attr->ia_size); | ||
4941 | 5406 | ||
4942 | /* If inode_setattr's call to ext4_truncate failed to get a | 5407 | if (!rc) { |
4943 | * transaction handle at all, we need to clean up the in-core | 5408 | setattr_copy(inode, attr); |
4944 | * orphan list manually. */ | 5409 | mark_inode_dirty(inode); |
4945 | if (inode->i_nlink) | 5410 | } |
5411 | |||
5412 | /* | ||
5413 | * If the call to ext4_truncate failed to get a transaction handle at | ||
5414 | * all, we need to clean up the in-core orphan list manually. | ||
5415 | */ | ||
5416 | if (orphan && inode->i_nlink) | ||
4946 | ext4_orphan_del(NULL, inode); | 5417 | ext4_orphan_del(NULL, inode); |
4947 | 5418 | ||
4948 | if (!rc && (ia_valid & ATTR_MODE)) | 5419 | if (!rc && (ia_valid & ATTR_MODE)) |
@@ -4974,9 +5445,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
4974 | * will return the blocks that include the delayed allocation | 5445 | * will return the blocks that include the delayed allocation |
4975 | * blocks for this file. | 5446 | * blocks for this file. |
4976 | */ | 5447 | */ |
4977 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
4978 | delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; | 5448 | delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; |
4979 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
4980 | 5449 | ||
4981 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | 5450 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; |
4982 | return 0; | 5451 | return 0; |
@@ -5009,7 +5478,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | |||
5009 | 5478 | ||
5010 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 5479 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5011 | { | 5480 | { |
5012 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | 5481 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
5013 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); | 5482 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); |
5014 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 5483 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); |
5015 | } | 5484 | } |
@@ -5020,12 +5489,12 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
5020 | * worst case, the index blocks spread over different block groups | 5489 | * worst case, the index blocks spread over different block groups |
5021 | * | 5490 | * |
5022 | * If datablocks are discontiguous, they may be spread over | 5491 | * If datablocks are discontiguous, they may be spread over |
5023 | * different block groups too. If they are contiugous, with flexbg, | 5492 | * different block groups too. If they are contiguous, with flexbg, |
5024 | * they could still cross a block group boundary. | 5493 | * they could still cross a block group boundary. |
5025 | * | 5494 | * |
5026 | * Also account for superblock, inode, quota and xattr blocks | 5495 | * Also account for superblock, inode, quota and xattr blocks |
5027 | */ | 5496 | */ |
5028 | int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 5497 | static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5029 | { | 5498 | { |
5030 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); | 5499 | ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); |
5031 | int gdpblocks; | 5500 | int gdpblocks; |
@@ -5096,7 +5565,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) | |||
5096 | * Calculate the journal credits for a chunk of data modification. | 5565 | * Calculate the journal credits for a chunk of data modification. |
5097 | * | 5566 | * |
5098 | * This is called from DIO, fallocate or whoever calling | 5567 | * This is called from DIO, fallocate or whoever calling |
5099 | * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. | 5568 | * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. |
5100 | * | 5569 | * |
5101 | * journal buffers for data blocks are not included here, as DIO | 5570 | * journal buffers for data blocks are not included here, as DIO |
5102 | * and fallocate do not need to journal data buffers. | 5571 | * and fallocate do not need to journal data buffers. |
@@ -5162,7 +5631,6 @@ static int ext4_expand_extra_isize(struct inode *inode, | |||
5162 | { | 5631 | { |
5163 | struct ext4_inode *raw_inode; | 5632 | struct ext4_inode *raw_inode; |
5164 | struct ext4_xattr_ibody_header *header; | 5633 | struct ext4_xattr_ibody_header *header; |
5165 | struct ext4_xattr_entry *entry; | ||
5166 | 5634 | ||
5167 | if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) | 5635 | if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) |
5168 | return 0; | 5636 | return 0; |
@@ -5170,11 +5638,10 @@ static int ext4_expand_extra_isize(struct inode *inode, | |||
5170 | raw_inode = ext4_raw_inode(&iloc); | 5638 | raw_inode = ext4_raw_inode(&iloc); |
5171 | 5639 | ||
5172 | header = IHDR(inode, raw_inode); | 5640 | header = IHDR(inode, raw_inode); |
5173 | entry = IFIRST(header); | ||
5174 | 5641 | ||
5175 | /* No extended attributes present */ | 5642 | /* No extended attributes present */ |
5176 | if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || | 5643 | if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || |
5177 | header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { | 5644 | header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { |
5178 | memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, | 5645 | memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, |
5179 | new_extra_isize); | 5646 | new_extra_isize); |
5180 | EXT4_I(inode)->i_extra_isize = new_extra_isize; | 5647 | EXT4_I(inode)->i_extra_isize = new_extra_isize; |
@@ -5215,10 +5682,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
5215 | int err, ret; | 5682 | int err, ret; |
5216 | 5683 | ||
5217 | might_sleep(); | 5684 | might_sleep(); |
5685 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); | ||
5218 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 5686 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
5219 | if (ext4_handle_valid(handle) && | 5687 | if (ext4_handle_valid(handle) && |
5220 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && | 5688 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && |
5221 | !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { | 5689 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { |
5222 | /* | 5690 | /* |
5223 | * We need extra buffer credits since we may write into EA block | 5691 | * We need extra buffer credits since we may write into EA block |
5224 | * with this same handle. If journal_extend fails, then it will | 5692 | * with this same handle. If journal_extend fails, then it will |
@@ -5232,10 +5700,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
5232 | sbi->s_want_extra_isize, | 5700 | sbi->s_want_extra_isize, |
5233 | iloc, handle); | 5701 | iloc, handle); |
5234 | if (ret) { | 5702 | if (ret) { |
5235 | EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; | 5703 | ext4_set_inode_state(inode, |
5704 | EXT4_STATE_NO_EXPAND); | ||
5236 | if (mnt_count != | 5705 | if (mnt_count != |
5237 | le16_to_cpu(sbi->s_es->s_mnt_count)) { | 5706 | le16_to_cpu(sbi->s_es->s_mnt_count)) { |
5238 | ext4_warning(inode->i_sb, __func__, | 5707 | ext4_warning(inode->i_sb, |
5239 | "Unable to expand inode %lu. Delete" | 5708 | "Unable to expand inode %lu. Delete" |
5240 | " some EAs or run e2fsck.", | 5709 | " some EAs or run e2fsck.", |
5241 | inode->i_ino); | 5710 | inode->i_ino); |
@@ -5257,7 +5726,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
5257 | * i_size has been changed by generic_commit_write() and we thus need | 5726 | * i_size has been changed by generic_commit_write() and we thus need |
5258 | * to include the updated inode in the current transaction. | 5727 | * to include the updated inode in the current transaction. |
5259 | * | 5728 | * |
5260 | * Also, vfs_dq_alloc_block() will always dirty the inode when blocks | 5729 | * Also, dquot_alloc_block() will always dirty the inode when blocks |
5261 | * are allocated to the file. | 5730 | * are allocated to the file. |
5262 | * | 5731 | * |
5263 | * If the inode is marked synchronous, we don't honour that here - doing | 5732 | * If the inode is marked synchronous, we don't honour that here - doing |
@@ -5266,27 +5735,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
5266 | */ | 5735 | */ |
5267 | void ext4_dirty_inode(struct inode *inode) | 5736 | void ext4_dirty_inode(struct inode *inode) |
5268 | { | 5737 | { |
5269 | handle_t *current_handle = ext4_journal_current_handle(); | ||
5270 | handle_t *handle; | 5738 | handle_t *handle; |
5271 | 5739 | ||
5272 | if (!ext4_handle_valid(current_handle)) { | ||
5273 | ext4_mark_inode_dirty(current_handle, inode); | ||
5274 | return; | ||
5275 | } | ||
5276 | |||
5277 | handle = ext4_journal_start(inode, 2); | 5740 | handle = ext4_journal_start(inode, 2); |
5278 | if (IS_ERR(handle)) | 5741 | if (IS_ERR(handle)) |
5279 | goto out; | 5742 | goto out; |
5280 | if (current_handle && | 5743 | |
5281 | current_handle->h_transaction != handle->h_transaction) { | 5744 | ext4_mark_inode_dirty(handle, inode); |
5282 | /* This task has a transaction open against a different fs */ | 5745 | |
5283 | printk(KERN_EMERG "%s: transactions do not match!\n", | ||
5284 | __func__); | ||
5285 | } else { | ||
5286 | jbd_debug(5, "marking dirty. outer handle=%p\n", | ||
5287 | current_handle); | ||
5288 | ext4_mark_inode_dirty(handle, inode); | ||
5289 | } | ||
5290 | ext4_journal_stop(handle); | 5746 | ext4_journal_stop(handle); |
5291 | out: | 5747 | out: |
5292 | return; | 5748 | return; |
@@ -5312,7 +5768,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) | |||
5312 | err = jbd2_journal_get_write_access(handle, iloc.bh); | 5768 | err = jbd2_journal_get_write_access(handle, iloc.bh); |
5313 | if (!err) | 5769 | if (!err) |
5314 | err = ext4_handle_dirty_metadata(handle, | 5770 | err = ext4_handle_dirty_metadata(handle, |
5315 | inode, | 5771 | NULL, |
5316 | iloc.bh); | 5772 | iloc.bh); |
5317 | brelse(iloc.bh); | 5773 | brelse(iloc.bh); |
5318 | } | 5774 | } |
@@ -5356,9 +5812,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
5356 | */ | 5812 | */ |
5357 | 5813 | ||
5358 | if (val) | 5814 | if (val) |
5359 | EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; | 5815 | ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
5360 | else | 5816 | else |
5361 | EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; | 5817 | ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); |
5362 | ext4_set_aops(inode); | 5818 | ext4_set_aops(inode); |
5363 | 5819 | ||
5364 | jbd2_journal_unlock_updates(journal); | 5820 | jbd2_journal_unlock_updates(journal); |
@@ -5413,12 +5869,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
5413 | else | 5869 | else |
5414 | len = PAGE_CACHE_SIZE; | 5870 | len = PAGE_CACHE_SIZE; |
5415 | 5871 | ||
5872 | lock_page(page); | ||
5873 | /* | ||
5874 | * Return if we have all the buffers mapped. This avoids | |
5875 | * the need to call write_begin/write_end, which does a | |
5876 | * journal_start/journal_stop that can block and take | |
5877 | * a long time. | |
5878 | */ | ||
5416 | if (page_has_buffers(page)) { | 5879 | if (page_has_buffers(page)) { |
5417 | /* return if we have all the buffers mapped */ | ||
5418 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | 5880 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, |
5419 | ext4_bh_unmapped)) | 5881 | ext4_bh_unmapped)) { |
5882 | unlock_page(page); | ||
5420 | goto out_unlock; | 5883 | goto out_unlock; |
5884 | } | ||
5421 | } | 5885 | } |
5886 | unlock_page(page); | ||
5422 | /* | 5887 | /* |
5423 | * OK, we need to fill the hole... Do write_begin write_end | 5888 | * OK, we need to fill the hole... Do write_begin write_end |
5424 | * to do block allocation/reservation. We are not holding | 5889 | * to do block allocation/reservation. We are not holding |