author      Linus Torvalds <torvalds@linux-foundation.org>  2011-08-01 19:56:03 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>  2011-08-01 19:56:03 -0400
commit      60ad4466821a96913a9b567115e194ed1087c2d7 (patch)
tree        cd488ba72a60f856b85a467763fb633cbe7ef2d9 /fs/ext4/inode.c
parent      1b8e94993c4752d98c33903aa836acc15f7e6d5c (diff)
parent      79a77c5ac34cc27ccbfbdf7113b41cdd93534eab (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (60 commits)
ext4: prevent memory leaks from ext4_mb_init_backend() on error path
ext4: use EXT4_BAD_INO for buddy cache to avoid colliding with valid inode #
ext4: use ext4_msg() instead of printk in mballoc
ext4: use ext4_kvzalloc()/ext4_kvmalloc() for s_group_desc and s_group_info
ext4: introduce ext4_kvmalloc(), ext4_kzalloc(), and ext4_kvfree()
ext4: use the correct error exit path in ext4_init_inode_table()
ext4: add missing kfree() on error return path in add_new_gdb()
ext4: change umode_t in tracepoint headers to be an explicit __u16
ext4: fix races in ext4_sync_parent()
ext4: Fix overflow caused by missing cast in ext4_fallocate()
ext4: add action of moving index in ext4_ext_rm_idx for Punch Hole
ext4: simplify parameters of reserve_backup_gdb()
ext4: simplify parameters of add_new_gdb()
ext4: remove lock_buffer in bclean() and setup_new_group_blocks()
ext4: simplify journal handling in setup_new_group_blocks()
ext4: let setup_new_group_blocks() set multiple bits at a time
ext4: fix a typo in ext4_group_extend()
ext4: let ext4_group_add_blocks() handle 0 blocks quickly
ext4: let ext4_group_add_blocks() return an error code
ext4: rename ext4_add_groupblocks() to ext4_group_add_blocks()
...
Fix up conflict in fs/ext4/inode.c: commit aacfc19c626e ("fs: simplify
the blockdev_direct_IO prototype") had changed the ext4_ind_direct_IO()
function for the new simplified calling convention, while commit
dae1e52cb126 ("ext4: move ext4_ind_* functions from inode.c to
indirect.c") moved the function to another file.
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--  fs/ext4/inode.c  1596
1 file changed, 37 insertions(+), 1559 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e5191f9f398..d47264cafee0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,10 +12,6 @@
  *
  * Copyright (C) 1991, 1992 Linus Torvalds
  *
- * Goal-directed block allocation by Stephen Tweedie
- *      (sct@redhat.com), 1993, 1998
- * Big-endian to little-endian byte-swapping/bitmaps by
- *      David S. Miller (davem@caip.rutgers.edu), 1995
  * 64-bit file support on 64-bit platforms by Jakub Jelinek
  *      (jj@sunsite.ms.mff.cuni.cz)
  *
@@ -47,6 +43,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "ext4_extents.h"
+#include "truncate.h"
 
 #include <trace/events/ext4.h>
 
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 }
 
 /*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
-{
-        ext4_lblk_t needed;
-
-        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
-
-        /* Give ourselves just enough room to cope with inodes in which
-         * i_blocks is corrupt: we've seen disk corruptions in the past
-         * which resulted in random data in an inode which looked enough
-         * like a regular file for ext4 to try to delete it.  Things
-         * will go a bit crazy if that happens, but at least we should
-         * try not to panic the whole kernel. */
-        if (needed < 2)
-                needed = 2;
-
-        /* But we need to bound the transaction so we don't overflow the
-         * journal. */
-        if (needed > EXT4_MAX_TRANS_DATA)
-                needed = EXT4_MAX_TRANS_DATA;
-
-        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
-}
-
-/*
- * Truncate transactions can be complex and absolutely huge.  So we need to
- * be able to restart the transaction at a convenient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit.  If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
-        handle_t *result;
-
-        result = ext4_journal_start(inode, blocks_for_truncate(inode));
-        if (!IS_ERR(result))
-                return result;
-
-        ext4_std_error(inode->i_sb, PTR_ERR(result));
-        return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room.  If we can't create more
- * room, and the transaction must be restarted we return 1.
- */
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
-{
-        if (!ext4_handle_valid(handle))
-                return 0;
-        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
-                return 0;
-        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
-                return 0;
-        return 1;
-}
-
-/*
  * Restart the transaction associated with *handle.  This does a commit,
  * so before we call here everything must be consistently dirtied against
  * this transaction.
@@ -190,6 +121,33 @@ void ext4_evict_inode(struct inode *inode)
 
         trace_ext4_evict_inode(inode);
         if (inode->i_nlink) {
+                /*
+                 * When journalling data dirty buffers are tracked only in the
+                 * journal. So although mm thinks everything is clean and
+                 * ready for reaping the inode might still have some pages to
+                 * write in the running transaction or waiting to be
+                 * checkpointed. Thus calling jbd2_journal_invalidatepage()
+                 * (via truncate_inode_pages()) to discard these buffers can
+                 * cause data loss. Also even if we did not discard these
+                 * buffers, we would have no way to find them after the inode
+                 * is reaped and thus user could see stale data if he tries to
+                 * read them before the transaction is checkpointed. So be
+                 * careful and force everything to disk here... We use
+                 * ei->i_datasync_tid to store the newest transaction
+                 * containing inode's data.
+                 *
+                 * Note that directories do not have this problem because they
+                 * don't use page cache.
+                 */
+                if (ext4_should_journal_data(inode) &&
+                    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+                        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+                        tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+                        jbd2_log_start_commit(journal, commit_tid);
+                        jbd2_log_wait_commit(journal, commit_tid);
+                        filemap_write_and_wait(&inode->i_data);
+                }
                 truncate_inode_pages(&inode->i_data, 0);
                 goto no_delete;
         }
@@ -204,7 +162,7 @@ void ext4_evict_inode(struct inode *inode)
         if (is_bad_inode(inode))
                 goto no_delete;
 
-        handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
+        handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
         if (IS_ERR(handle)) {
                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
                 /*
@@ -277,793 +235,6 @@ no_delete:
         ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 }
 
-typedef struct {
-        __le32  *p;
-        __le32  key;
-        struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
-{
-        p->key = *(p->p = v);
-        p->bh = bh;
-}
-
-/**
- * ext4_block_to_path - parse the block number into array of offsets
- * @inode: inode in question (we are only interested in its superblock)
- * @i_block: block number to be parsed
- * @offsets: array to store the offsets in
- * @boundary: set this non-zero if the referred-to block is likely to be
- *        followed (on disk) by an indirect block.
- *
- * To store the locations of file's data ext4 uses a data structure common
- * for UNIX filesystems - tree of pointers anchored in the inode, with
- * data blocks at leaves and indirect blocks in intermediate nodes.
- * This function translates the block number into path in that tree -
- * return value is the path length and @offsets[n] is the offset of
- * pointer to (n+1)th node in the nth one. If @block is out of range
- * (negative or too large) warning is printed and zero returned.
- *
- * Note: function doesn't find node addresses, so no IO is needed. All
- * we need to know is the capacity of indirect blocks (taken from the
- * inode->i_sb).
- */
-
-/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
- */
-
-static int ext4_block_to_path(struct inode *inode,
-                              ext4_lblk_t i_block,
-                              ext4_lblk_t offsets[4], int *boundary)
-{
-        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
-        const long direct_blocks = EXT4_NDIR_BLOCKS,
-                indirect_blocks = ptrs,
-                double_blocks = (1 << (ptrs_bits * 2));
-        int n = 0;
-        int final = 0;
-
-        if (i_block < direct_blocks) {
-                offsets[n++] = i_block;
-                final = direct_blocks;
-        } else if ((i_block -= direct_blocks) < indirect_blocks) {
-                offsets[n++] = EXT4_IND_BLOCK;
-                offsets[n++] = i_block;
-                final = ptrs;
-        } else if ((i_block -= indirect_blocks) < double_blocks) {
-                offsets[n++] = EXT4_DIND_BLOCK;
-                offsets[n++] = i_block >> ptrs_bits;
-                offsets[n++] = i_block & (ptrs - 1);
-                final = ptrs;
-        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
-                offsets[n++] = EXT4_TIND_BLOCK;
-                offsets[n++] = i_block >> (ptrs_bits * 2);
-                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
-                offsets[n++] = i_block & (ptrs - 1);
-                final = ptrs;
-        } else {
-                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
-                             i_block + direct_blocks +
-                             indirect_blocks + double_blocks, inode->i_ino);
-        }
-        if (boundary)
-                *boundary = final - 1 - (i_block & (ptrs - 1));
-        return n;
-}
-
-static int __ext4_check_blockref(const char *function, unsigned int line,
-                                 struct inode *inode,
-                                 __le32 *p, unsigned int max)
-{
-        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-        __le32 *bref = p;
-        unsigned int blk;
-
-        while (bref < p+max) {
-                blk = le32_to_cpu(*bref++);
-                if (blk &&
-                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
-                                                    blk, 1))) {
-                        es->s_last_error_block = cpu_to_le64(blk);
-                        ext4_error_inode(inode, function, line, blk,
-                                         "invalid block");
-                        return -EIO;
-                }
-        }
-        return 0;
-}
-
-
-#define ext4_check_indirect_blockref(inode, bh)                         \
-        __ext4_check_blockref(__func__, __LINE__, inode,                \
-                              (__le32 *)(bh)->b_data,                   \
-                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))
-
-#define ext4_check_inode_blockref(inode)                                \
-        __ext4_check_blockref(__func__, __LINE__, inode,                \
-                              EXT4_I(inode)->i_data,                    \
-                              EXT4_NDIR_BLOCKS)
-
-/**
- * ext4_get_branch - read the chain of indirect blocks leading to data
- * @inode: inode in question
- * @depth: depth of the chain (1 - direct pointer, etc.)
- * @offsets: offsets of pointers in inode/indirect blocks
- * @chain: place to store the result
- * @err: here we store the error value
- *
- * Function fills the array of triples <key, p, bh> and returns %NULL
- * if everything went OK or the pointer to the last filled triple
- * (incomplete one) otherwise. Upon the return chain[i].key contains
- * the number of (i+1)-th block in the chain (as it is stored in memory,
- * i.e. little-endian 32-bit), chain[i].p contains the address of that
- * number (it points into struct inode for i==0 and into the bh->b_data
- * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- * block for i>0 and NULL for i==0. In other words, it holds the block
- * numbers of the chain, addresses they were taken from (and where we can
- * verify that chain did not change) and buffer_heads hosting these
- * numbers.
- *
- * Function stops when it stumbles upon zero pointer (absent block)
- *      (pointer to last triple returned, *@err == 0)
- * or when it gets an IO error reading an indirect block
- *      (ditto, *@err == -EIO)
- * or when it reads all @depth-1 indirect blocks successfully and finds
- * the whole chain, all way to the data (returns %NULL, *err == 0).
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem)
- */
-static Indirect *ext4_get_branch(struct inode *inode, int depth,
-                                 ext4_lblk_t *offsets,
-                                 Indirect chain[4], int *err)
-{
-        struct super_block *sb = inode->i_sb;
-        Indirect *p = chain;
-        struct buffer_head *bh;
-
-        *err = 0;
-        /* i_data is not going away, no lock needed */
-        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
-        if (!p->key)
-                goto no_block;
-        while (--depth) {
-                bh = sb_getblk(sb, le32_to_cpu(p->key));
-                if (unlikely(!bh))
-                        goto failure;
-
-                if (!bh_uptodate_or_lock(bh)) {
-                        if (bh_submit_read(bh) < 0) {
-                                put_bh(bh);
-                                goto failure;
-                        }
-                        /* validate block references */
-                        if (ext4_check_indirect_blockref(inode, bh)) {
-                                put_bh(bh);
-                                goto failure;
-                        }
-                }
-
-                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
-                /* Reader: end */
-                if (!p->key)
-                        goto no_block;
-        }
-        return NULL;
-
-failure:
-        *err = -EIO;
-no_block:
-        return p;
-}
-
-/**
- * ext4_find_near - find a place for allocation with sufficient locality
- * @inode: owner
- * @ind: descriptor of indirect block.
- *
- * This function returns the preferred place for block allocation.
- * It is used when heuristic for sequential allocation fails.
- * Rules are:
- *   + if there is a block to the left of our position - allocate near it.
- *   + if pointer will live in indirect block - allocate near that block.
- *   + if pointer will live in inode - allocate in the same
- *     cylinder group.
- *
- * In the latter case we colour the starting block by the caller's PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group. The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- * Caller must make sure that @ind is valid and will stay that way.
- */
-static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
-{
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
-        __le32 *p;
-        ext4_fsblk_t bg_start;
-        ext4_fsblk_t last_block;
-        ext4_grpblk_t colour;
-        ext4_group_t block_group;
-        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
-
-        /* Try to find previous block */
-        for (p = ind->p - 1; p >= start; p--) {
-                if (*p)
-                        return le32_to_cpu(*p);
-        }
-
-        /* No such thing, so let's try location of indirect block */
-        if (ind->bh)
-                return ind->bh->b_blocknr;
-
-        /*
-         * It is going to be referred to from the inode itself? OK, just put it
-         * into the same cylinder group then.
-         */
-        block_group = ei->i_block_group;
-        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
-                block_group &= ~(flex_size-1);
-                if (S_ISREG(inode->i_mode))
-                        block_group++;
-        }
-        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
-        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
-        /*
-         * If we are doing delayed allocation, we don't need to take
-         * colour into account.
-         */
-        if (test_opt(inode->i_sb, DELALLOC))
-                return bg_start;
-
-        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
-                colour = (current->pid % 16) *
-                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
-        else
-                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
-        return bg_start + colour;
-}
-
-/**
- * ext4_find_goal - find a preferred place for allocation.
- * @inode: owner
- * @block: block we want
- * @partial: pointer to the last triple within a chain
- *
- * Normally this function finds the preferred place for block allocation
- * and returns it.
- * Because this is only used for non-extent files, we limit the block nr
- * to 32 bits.
- */
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
-                                   Indirect *partial)
-{
-        ext4_fsblk_t goal;
-
-        /*
-         * XXX need to get goal block from mballoc's data structures
-         */
-
-        goal = ext4_find_near(inode, partial);
-        goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
-        return goal;
-}
-
-/**
- * ext4_blks_to_allocate - Look up the block map and count the number
- * of direct blocks that need to be allocated for the given branch.
- *
- * @branch: chain of indirect blocks
- * @k: number of blocks need for indirect blocks
- * @blks: number of data blocks to be mapped.
- * @blocks_to_boundary: the offset in the indirect block
- *
- * return the total number of blocks to be allocated, including the
- * direct and indirect blocks.
- */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
-                                 int blocks_to_boundary)
-{
-        unsigned int count = 0;
-
-        /*
-         * Simple case, [t,d]Indirect block(s) has not been allocated yet,
-         * then it's clear blocks on that path have not been allocated
-         */
-        if (k > 0) {
-                /* right now we don't handle cross boundary allocation */
-                if (blks < blocks_to_boundary + 1)
-                        count += blks;
-                else
-                        count += blocks_to_boundary + 1;
-                return count;
-        }
-
-        count++;
-        while (count < blks && count <= blocks_to_boundary &&
-               le32_to_cpu(*(branch[0].p + count)) == 0) {
-                count++;
-        }
-        return count;
-}
-
-/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @handle: handle for this transaction
- * @inode: inode which needs allocated blocks
- * @iblock: the logical block to start allocated at
- * @goal: preferred physical block of allocation
- * @indirect_blks: the number of blocks need to allocate for indirect
- *                 blocks
- * @blks: number of desired blocks
- * @new_blocks: on return it will store the new block numbers for
- *      the indirect blocks(if needed) and the first direct block,
- * @err: on return it will store the error code
- *
- * This function will return the number of blocks allocated as
- * requested by the passed-in parameters.
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-                             ext4_lblk_t iblock, ext4_fsblk_t goal,
-                             int indirect_blks, int blks,
-                             ext4_fsblk_t new_blocks[4], int *err)
-{
-        struct ext4_allocation_request ar;
-        int target, i;
-        unsigned long count = 0, blk_allocated = 0;
-        int index = 0;
-        ext4_fsblk_t current_block = 0;
-        int ret = 0;
-
-        /*
-         * Here we try to allocate the requested multiple blocks at once,
-         * on a best-effort basis.
-         * To build a branch, we should allocate blocks for
-         * the indirect blocks(if not allocated yet), and at least
-         * the first direct block of this branch.  That's the
-         * minimum number of blocks need to allocate (required)
-         */
-        /* first we try to allocate the indirect blocks */
-        target = indirect_blks;
-        while (target > 0) {
-                count = target;
-                /* allocating blocks for indirect blocks and direct blocks */
-                current_block = ext4_new_meta_blocks(handle, inode, goal,
-                                                     0, &count, err);
-                if (*err)
-                        goto failed_out;
-
-                if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
-                        EXT4_ERROR_INODE(inode,
-                                         "current_block %llu + count %lu > %d!",
-                                         current_block, count,
-                                         EXT4_MAX_BLOCK_FILE_PHYS);
-                        *err = -EIO;
-                        goto failed_out;
-                }
-
-                target -= count;
-                /* allocate blocks for indirect blocks */
-                while (index < indirect_blks && count) {
-                        new_blocks[index++] = current_block++;
-                        count--;
-                }
-                if (count > 0) {
-                        /*
-                         * save the new block number
-                         * for the first direct block
-                         */
-                        new_blocks[index] = current_block;
-                        printk(KERN_INFO "%s returned more blocks than "
-                                                "requested\n", __func__);
-                        WARN_ON(1);
-                        break;
-                }
-        }
-
-        target = blks - count;
-        blk_allocated = count;
-        if (!target)
-                goto allocated;
-        /* Now allocate data blocks */
-        memset(&ar, 0, sizeof(ar));
-        ar.inode = inode;
-        ar.goal = goal;
-        ar.len = target;
-        ar.logical = iblock;
-        if (S_ISREG(inode->i_mode))
-                /* enable in-core preallocation only for regular files */
-                ar.flags = EXT4_MB_HINT_DATA;
-
-        current_block = ext4_mb_new_blocks(handle, &ar, err);
-        if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
-                EXT4_ERROR_INODE(inode,
-                                 "current_block %llu + ar.len %d > %d!",
-                                 current_block, ar.len,
-                                 EXT4_MAX_BLOCK_FILE_PHYS);
-                *err = -EIO;
-                goto failed_out;
-        }
-
-        if (*err && (target == blks)) {
-                /*
-                 * if the allocation failed and we didn't allocate
-                 * any blocks before
-                 */
-                goto failed_out;
-        }
-        if (!*err) {
-                if (target == blks) {
-                        /*
-                         * save the new block number
-                         * for the first direct block
-                         */
-                        new_blocks[index] = current_block;
-                }
-                blk_allocated += ar.len;
-        }
-allocated:
-        /* total number of blocks allocated for direct blocks */
-        ret = blk_allocated;
-        *err = 0;
-        return ret;
-failed_out:
-        for (i = 0; i < index; i++)
-                ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-        return ret;
-}
-
-/**
- * ext4_alloc_branch - allocate and set up a chain of blocks.
- * @handle: handle for this transaction
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @goal: preferred place for allocation
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
- *
- * This function allocates blocks, zeroes out all but the last one,
- * links them into chain and (if we are synchronous) writes them to disk.
- * In other words, it prepares a branch that can be spliced onto the
- * inode. It stores the information about that chain in the branch[], in
- * the same format as ext4_get_branch() would do. We are calling it after
- * we had read the existing part of chain and partial points to the last
- * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext4_get_block(), except that in one
- * place chain is disconnected - *branch->p is still zero (we did not
- * set the last link), but branch->key contains the number that should
- * be placed into *branch->p to fill that gap.
- *
- * If allocation fails we free all blocks we've allocated (and forget
- * their buffer_heads) and return the error value from the failed
- * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- * as described above and return 0.
- */
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-                             ext4_lblk_t iblock, int indirect_blks,
-                             int *blks, ext4_fsblk_t goal,
-                             ext4_lblk_t *offsets, Indirect *branch)
-{
-        int blocksize = inode->i_sb->s_blocksize;
-        int i, n = 0;
-        int err = 0;
-        struct buffer_head *bh;
-        int num;
-        ext4_fsblk_t new_blocks[4];
-        ext4_fsblk_t current_block;
-
-        num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
-                                *blks, new_blocks, &err);
-        if (err)
-                return err;
-
-        branch[0].key = cpu_to_le32(new_blocks[0]);
-        /*
-         * metadata blocks and data blocks are allocated.
-         */
-        for (n = 1; n <= indirect_blks; n++) {
-                /*
-                 * Get buffer_head for parent block, zero it out
-                 * and set the pointer to new one, then send
-                 * parent to disk.
-                 */
-                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
-                if (unlikely(!bh)) {
-                        err = -EIO;
-                        goto failed;
-                }
-
-                branch[n].bh = bh;
-                lock_buffer(bh);
-                BUFFER_TRACE(bh, "call get_create_access");
-                err = ext4_journal_get_create_access(handle, bh);
-                if (err) {
-                        /* Don't brelse(bh) here; it's done in
-                         * ext4_journal_forget() below */
-                        unlock_buffer(bh);
-                        goto failed;
-                }
-
-                memset(bh->b_data, 0, blocksize);
-                branch[n].p = (__le32 *) bh->b_data + offsets[n];
-                branch[n].key = cpu_to_le32(new_blocks[n]);
-                *branch[n].p = branch[n].key;
-                if (n == indirect_blks) {
-                        current_block = new_blocks[n];
-                        /*
-                         * End of chain, update the last new metablock of
-                         * the chain to point to the newly allocated
-                         * data block numbers
-                         */
-                        for (i = 1; i < num; i++)
-                                *(branch[n].p + i) = cpu_to_le32(++current_block);
-                }
-                BUFFER_TRACE(bh, "marking uptodate");
-                set_buffer_uptodate(bh);
-                unlock_buffer(bh);
-
-                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                err = ext4_handle_dirty_metadata(handle, inode, bh);
-                if (err)
-                        goto failed;
-        }
-        *blks = num;
-        return err;
-failed:
-        /* Allocation failed, free what we already allocated */
-        ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
-        for (i = 1; i <= n; i++) {
-                /*
-                 * branch[i].bh is newly allocated, so there is no
-                 * need to revoke the block, which is why we don't
-                 * need to set EXT4_FREE_BLOCKS_METADATA.
-                 */
-                ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
-                                 EXT4_FREE_BLOCKS_FORGET);
-        }
-        for (i = n+1; i < indirect_blks; i++)
-                ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-
-        ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
-
-        return err;
-}
-
-/**
- * ext4_splice_branch - splice the allocated branch onto inode.
- * @handle: handle for this transaction
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *      ext4_alloc_branch)
- * @where: location of missing link
- * @num:   number of indirect blocks we are adding
- * @blks:  number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-                              ext4_lblk_t block, Indirect *where, int num,
-                              int blks)
-{
-        int i;
-        int err = 0;
-        ext4_fsblk_t current_block;
-
-        /*
-         * If we're splicing into a [td]indirect block (as opposed to the
-         * inode) then we need to get write access to the [td]indirect block
-         * before the splice.
-         */
-        if (where->bh) {
-                BUFFER_TRACE(where->bh, "get_write_access");
-                err = ext4_journal_get_write_access(handle, where->bh);
-                if (err)
-                        goto err_out;
-        }
-        /* That's it */
-
-        *where->p = where->key;
-
-        /*
-         * Update the host buffer_head or inode to point to the just-allocated
-         * direct blocks
-         */
-        if (num == 0 && blks > 1) {
-                current_block = le32_to_cpu(where->key) + 1;
-                for (i = 1; i < blks; i++)
-                        *(where->p + i) = cpu_to_le32(current_block++);
-        }
-
-        /* We are done with atomic stuff, now do the rest of housekeeping */
-        /* had we spliced it onto indirect block? */
-        if (where->bh) {
-                /*
-                 * If we spliced it onto an indirect block, we haven't
-                 * altered the inode.  Note however that if it is being spliced
-                 * onto an indirect block at the very end of the file (the
-                 * file is growing) then we *will* alter the inode to reflect
-                 * the new i_size.  But that is not done here - it is done in
-                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
-                 */
-                jbd_debug(5, "splicing indirect only\n");
-                BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
-                err = ext4_handle_dirty_metadata(handle, inode, where->bh);
-                if (err)
-                        goto err_out;
-        } else {
-                /*
-                 * OK, we spliced it into the inode itself on a direct block.
-                 */
-                ext4_mark_inode_dirty(handle, inode);
-                jbd_debug(5, "splicing direct\n");
-        }
-        return err;
-
-err_out:
-        for (i = 1; i <= num; i++) {
-                /*
-                 * branch[i].bh is newly allocated, so there is no
-                 * need to revoke the block, which is why we don't
-                 * need to set EXT4_FREE_BLOCKS_METADATA.
-                 */
-                ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
-                                 EXT4_FREE_BLOCKS_FORGET);
-        }
-        ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
-                         blks, 0);
-
-        return err;
-}
-
-/*
- * The ext4_ind_map_blocks() function handles non-extents inodes
- * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_map_blocks().
- *
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- *
- * The ext4_ind_get_blocks() function should be called with
- * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
- * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
- * blocks.
- */
-static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
-                               struct ext4_map_blocks *map,
-                               int flags)
-{
-        int err = -EIO;
-        ext4_lblk_t offsets[4];
-        Indirect chain[4];
-        Indirect *partial;
-        ext4_fsblk_t goal;
-        int indirect_blks;
-        int blocks_to_boundary = 0;
-        int depth;
-        int count = 0;
-        ext4_fsblk_t first_block = 0;
-
-        trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
-        J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
-        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
-        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
-                                   &blocks_to_boundary);
-
-        if (depth == 0)
-                goto out;
-
-        partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-
-        /* Simplest case - block found, no allocation needed */
-        if (!partial) {
-                first_block = le32_to_cpu(chain[depth - 1].key);
-                count++;
-                /* map more blocks */
-                while (count < map->m_len && count <= blocks_to_boundary) {
-                        ext4_fsblk_t blk;
-
-                        blk = le32_to_cpu(*(chain[depth-1].p + count));
-
-                        if (blk == first_block + count)
-                                count++;
-                        else
-                                break;
-                }
-                goto got_it;
-        }
-
-        /* Next simple case - plain lookup or failed read of indirect block */
-        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
-                goto cleanup;
-
-        /*
-         * Okay, we need to do block allocation.
-         */
-        goal = ext4_find_goal(inode, map->m_lblk, partial);
-
-        /* the number of blocks need to allocate for [d,t]indirect blocks */
-        indirect_blks = (chain + depth) - partial - 1;
-
-        /*
-         * Next look up the indirect map to count the total number of
-         * direct blocks to allocate for this branch.
-         */
-        count = ext4_blks_to_allocate(partial, indirect_blks,
-                                      map->m_len, blocks_to_boundary);
-        /*
-         * Block out ext4_truncate while we alter the tree
-         */
-        err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
-                                &count, goal,
-                                offsets + (partial - chain), partial);
-
-        /*
-         * The ext4_splice_branch call will free and forget any buffers
-         * on the new chain if there is a failure, but that risks using
-         * up transaction credits, especially for bitmaps where the
-         * credits cannot be returned.  Can we handle this somehow?  We
-         * may need to return -EAGAIN upwards in the worst case. --sct
-         */
-        if (!err)
-                err = ext4_splice_branch(handle, inode, map->m_lblk,
-                                         partial, indirect_blks, count);
-        if (err)
-                goto cleanup;
-
-        map->m_flags |= EXT4_MAP_NEW;
-
-        ext4_update_inode_fsync_trans(handle, inode, 1);
-got_it:
-        map->m_flags |= EXT4_MAP_MAPPED;
-        map->m_pblk = le32_to_cpu(chain[depth-1].key);
-        map->m_len = count;
-        if (count > blocks_to_boundary)
-                map->m_flags |= EXT4_MAP_BOUNDARY;
-        err = count;
-        /* Clean up and exit */
-        partial = chain + depth - 1;    /* the whole chain */
-cleanup:
-        while (partial > chain) {
-                BUFFER_TRACE(partial->bh, "call brelse");
-                brelse(partial->bh);
-                partial--;
-        }
-out:
-        trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
-                                       map->m_pblk, map->m_len, err);
-        return err;
-}
-
 #ifdef CONFIG_QUOTA
 qsize_t *ext4_get_reserved_space(struct inode *inode)
 {
@@ -1073,33 +244,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
 
 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate a new block at @lblocks for non extent file based file
- */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode,
-                                              sector_t lblock)
-{
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
-        int blk_bits;
-
-        if (lblock < EXT4_NDIR_BLOCKS)
-                return 0;
-
-        lblock -= EXT4_NDIR_BLOCKS;
-
-        if (ei->i_da_metadata_calc_len &&
-            (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
-                ei->i_da_metadata_calc_len++;
-                return 0;
-        }
-        ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
-        ei->i_da_metadata_calc_len = 1;
-        blk_bits = order_base_2(lblock);
-        return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
-}
-
-/*
- * Calculate the number of metadata blocks need to reserve
  * to allocate a block located at @lblock
  */
 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
@@ -1107,7 +251,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 return ext4_ext_calc_metadata_amount(inode, lblock);
 
-        return ext4_indirect_calc_metadata_amount(inode, lblock);
+        return ext4_ind_calc_metadata_amount(inode, lblock);
 }
 
 /*
@@ -1589,16 +733,6 @@ static int do_journal_get_write_access(handle_t *handle,
         return ret;
 }
 
-/*
- * Truncate blocks that were not used by write. We have to truncate the
- * pagecache as well so that corresponding buffers get properly unmapped.
- */
-static void ext4_truncate_failed_write(struct inode *inode)
-{
-        truncate_inode_pages(inode->i_mapping, inode->i_size);
-        ext4_truncate(inode);
-}
-
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                    struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
@@ -1863,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file,
         if (new_i_size > inode->i_size)
                 i_size_write(inode, pos+copied);
         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
         if (new_i_size > EXT4_I(inode)->i_disksize) {
                 ext4_update_i_disksize(inode, new_i_size);
                 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2571,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page,
                                              write_end_fn);
         if (ret == 0)
                 ret = err;
+        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
         err = ext4_journal_stop(handle);
         if (!ret)
                 ret = err;
@@ -3450,112 +2586,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 /*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is instantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
-                                  const struct iovec *iov, loff_t offset,
-                                  unsigned long nr_segs)
-{
-        struct file *file = iocb->ki_filp;
-        struct inode *inode = file->f_mapping->host;
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        handle_t *handle;
-        ssize_t ret;
-        int orphan = 0;
-        size_t count = iov_length(iov, nr_segs);
-        int retries = 0;
-
-        if (rw == WRITE) {
-                loff_t final_size = offset + count;
-
-                if (final_size > inode->i_size) {
-                        /* Credits for sb + inode write */
-                        handle = ext4_journal_start(inode, 2);
-                        if (IS_ERR(handle)) {
-                                ret = PTR_ERR(handle);
-                                goto out;
-                        }
-                        ret = ext4_orphan_add(handle, inode);
-                        if (ret) {
-                                ext4_journal_stop(handle);
-                                goto out;
-                        }
-                        orphan = 1;
-                        ei->i_disksize = inode->i_size;
-                        ext4_journal_stop(handle);
-                }
-        }
-
-retry:
-        if (rw == READ && ext4_should_dioread_nolock(inode))
-                ret = __blockdev_direct_IO(rw, iocb, inode,
-                                           inode->i_sb->s_bdev, iov,
-                                           offset, nr_segs,
-                                           ext4_get_block, NULL, NULL, 0);
-        else {
-                ret = blockdev_direct_IO(rw, iocb, inode, iov,
-                                         offset, nr_segs, ext4_get_block);
-
-                if (unlikely((rw & WRITE) && ret < 0)) {
-                        loff_t isize = i_size_read(inode);
-                        loff_t end = offset + iov_length(iov, nr_segs);
-
-                        if (end > isize)
-                                ext4_truncate_failed_write(inode);
-                }
-        }
-        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-                goto retry;
-
-        if (orphan) {
-                int err;
-
-                /* Credits for sb + inode write */
-                handle = ext4_journal_start(inode, 2);
-                if (IS_ERR(handle)) {
-                        /* This is really bad luck. We've written the data
-                         * but cannot extend i_size. Bail out and pretend
-                         * the write failed... */
-                        ret = PTR_ERR(handle);
-                        if (inode->i_nlink)
-                                ext4_orphan_del(NULL, inode);
-
-                        goto out;
-                }
-                if (inode->i_nlink)
-                        ext4_orphan_del(handle, inode);
-                if (ret > 0) {
-                        loff_t end = offset + ret;
-                        if (end > inode->i_size) {
-                                ei->i_disksize = end;
-                                i_size_write(inode, end);
-                                /*
-                                 * We're going to return a positive `ret'
-                                 * here due to non-zero-length I/O, so there's
-                                 * no way of reporting error returns from
-                                 * ext4_mark_inode_dirty() to userspace.  So
-                                 * ignore it.
-                                 */
-                                ext4_mark_inode_dirty(handle, inode);
-                        }
-                }
-                err = ext4_journal_stop(handle);
-                if (ret == 0)
-                        ret = err;
-        }
-out:
-        return ret;
-}
-
-/*
  * ext4_get_block used when preparing for a DIO write or buffer write.
  * We allocate an uninitialized extent if blocks haven't been allocated.
  * The extent will be converted to initialized after the IO is complete.
@@ -4033,383 +3063,6 @@ unlock: | |||
4033 | return err; | 3063 | return err; |
4034 | } | 3064 | } |
4035 | 3065 | ||
4036 | /* | ||
4037 | * Probably it should be a library function... search for first non-zero word | ||
4038 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
4039 | * Linus? | ||
4040 | */ | ||
4041 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
4042 | { | ||
4043 | while (p < q) | ||
4044 | if (*p++) | ||
4045 | return 0; | ||
4046 | return 1; | ||
4047 | } | ||
4048 | |||
4049 | /** | ||
4050 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
4051 | * @inode: inode in question | ||
4052 | * @depth: depth of the affected branch | ||
4053 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
4054 | * @chain: place to store the pointers to partial indirect blocks | ||
4055 | * @top: place to the (detached) top of branch | ||
4056 | * | ||
4057 | * This is a helper function used by ext4_truncate(). | ||
4058 | * | ||
4059 | * When we do truncate() we may have to clean the ends of several | ||
4060 | * indirect blocks but leave the blocks themselves alive. Block is | ||
4061 | * partially truncated if some data below the new i_size is referred | ||
4062 | * from it (and it is on the path to the first completely truncated | ||
4063 | * data block, indeed). We have to free the top of that path along | ||
4064 | * with everything to the right of the path. Since no allocation | ||
4065 | * past the truncation point is possible until ext4_truncate() | ||
4066 | * finishes, we may safely do the latter, but top of branch may | ||
4067 | * require special attention - pageout below the truncation point | ||
4068 | * might try to populate it. | ||
4069 | * | ||
4070 | * We atomically detach the top of branch from the tree, store the | ||
4071 | * block number of its root in *@top, pointers to buffer_heads of | ||
4072 | * partially truncated blocks - in @chain[].bh and pointers to | ||
4073 | * their last elements that should not be removed - in | ||
4074 | * @chain[].p. Return value is the pointer to last filled element | ||
4075 | * of @chain. | ||
4076 | * | ||
4077 | * The work left to caller to do the actual freeing of subtrees: | ||
4078 | * a) free the subtree starting from *@top | ||
4079 | * b) free the subtrees whose roots are stored in | ||
4080 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
4081 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
4082 | * (no partially truncated stuff there). */ | ||
4083 | |||
4084 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
4085 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
4086 | __le32 *top) | ||
4087 | { | ||
4088 | Indirect *partial, *p; | ||
4089 | int k, err; | ||
4090 | |||
4091 | *top = 0; | ||
4092 | /* Make k index the deepest non-null offset + 1 */ | ||
4093 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
4094 | ; | ||
4095 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
4096 | /* Writer: pointers */ | ||
4097 | if (!partial) | ||
4098 | partial = chain + k-1; | ||
4099 | /* | ||
4100 | * If the branch acquired continuation since we've looked at it - | ||
4101 | * fine, it should all survive and (new) top doesn't belong to us. | ||
4102 | */ | ||
4103 | if (!partial->key && *partial->p) | ||
4104 | /* Writer: end */ | ||
4105 | goto no_top; | ||
4106 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
4107 | ; | ||
4108 | /* | ||
4109 | * OK, we've found the last block that must survive. The rest of our | ||
4110 | * branch should be detached before unlocking. However, if that rest | ||
4111 | * of branch is all ours and does not grow immediately from the inode | ||
4112 | * it's easier to cheat and just decrement partial->p. | ||
4113 | */ | ||
4114 | if (p == chain + k - 1 && p > chain) { | ||
4115 | p->p--; | ||
4116 | } else { | ||
4117 | *top = *p->p; | ||
4118 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
4119 | #if 0 | ||
4120 | *p->p = 0; | ||
4121 | #endif | ||
4122 | } | ||
4123 | /* Writer: end */ | ||
4124 | |||
4125 | while (partial > p) { | ||
4126 | brelse(partial->bh); | ||
4127 | partial--; | ||
4128 | } | ||
4129 | no_top: | ||
4130 | return partial; | ||
4131 | } | ||
4132 | |||
4133 | /* | ||
4134 | * Zero a number of block pointers in either an inode or an indirect block. | ||
4135 | * If we restart the transaction we must again get write access to the | ||
4136 | * indirect block for further modification. | ||
4137 | * | ||
4138 | * We release `count' blocks on disk, but (last - first) may be greater | ||
4139 | * than `count' because there can be holes in there. | ||
4140 | * | ||
4141 | * Return 0 on success, 1 on invalid block range | ||
4142 | * and < 0 on fatal error. | ||
4143 | */ | ||
4144 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
4145 | struct buffer_head *bh, | ||
4146 | ext4_fsblk_t block_to_free, | ||
4147 | unsigned long count, __le32 *first, | ||
4148 | __le32 *last) | ||
4149 | { | ||
4150 | __le32 *p; | ||
4151 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
4152 | int err; | ||
4153 | |||
4154 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
4155 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
4156 | |||
4157 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
4158 | count)) { | ||
4159 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
4160 | "blocks %llu len %lu", | ||
4161 | (unsigned long long) block_to_free, count); | ||
4162 | return 1; | ||
4163 | } | ||
4164 | |||
4165 | if (try_to_extend_transaction(handle, inode)) { | ||
4166 | if (bh) { | ||
4167 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
4168 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
4169 | if (unlikely(err)) | ||
4170 | goto out_err; | ||
4171 | } | ||
4172 | err = ext4_mark_inode_dirty(handle, inode); | ||
4173 | if (unlikely(err)) | ||
4174 | goto out_err; | ||
4175 | err = ext4_truncate_restart_trans(handle, inode, | ||
4176 | blocks_for_truncate(inode)); | ||
4177 | if (unlikely(err)) | ||
4178 | goto out_err; | ||
4179 | if (bh) { | ||
4180 | BUFFER_TRACE(bh, "retaking write access"); | ||
4181 | err = ext4_journal_get_write_access(handle, bh); | ||
4182 | if (unlikely(err)) | ||
4183 | goto out_err; | ||
4184 | } | ||
4185 | } | ||
4186 | |||
4187 | for (p = first; p < last; p++) | ||
4188 | *p = 0; | ||
4189 | |||
4190 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
4191 | return 0; | ||
4192 | out_err: | ||
4193 | ext4_std_error(inode->i_sb, err); | ||
4194 | return err; | ||
4195 | } | ||
4196 | |||
4197 | /** | ||
4198 | * ext4_free_data - free a list of data blocks | ||
4199 | * @handle: handle for this transaction | ||
4200 | * @inode: inode we are dealing with | ||
4201 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
4202 | * @first: array of block numbers | ||
4203 | * @last: points immediately past the end of array | ||
4204 | * | ||
4205 | * We are freeing all blocks referred to from that array (numbers are stored | ||
4206 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
4207 | * | ||
4208 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
4209 | * blocks are contiguous then releasing them at one time will only affect one | ||
4210 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
4211 | * actually use a lot of journal space. | ||
4212 | * | ||
4213 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
4214 | * block pointers. | ||
4215 | */ | ||
4216 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
4217 | struct buffer_head *this_bh, | ||
4218 | __le32 *first, __le32 *last) | ||
4219 | { | ||
4220 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
4221 | unsigned long count = 0; /* Number of blocks in the run */ | ||
4222 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
4223 | corresponding to | ||
4224 | block_to_free */ | ||
4225 | ext4_fsblk_t nr; /* Current block # */ | ||
4226 | __le32 *p; /* Pointer into inode/ind | ||
4227 | for current block */ | ||
4228 | int err = 0; | ||
4229 | |||
4230 | if (this_bh) { /* For indirect block */ | ||
4231 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
4232 | err = ext4_journal_get_write_access(handle, this_bh); | ||
4233 | /* Important: if we can't update the indirect pointers | ||
4234 | * to the blocks, we can't free them. */ | ||
4235 | if (err) | ||
4236 | return; | ||
4237 | } | ||
4238 | |||
4239 | for (p = first; p < last; p++) { | ||
4240 | nr = le32_to_cpu(*p); | ||
4241 | if (nr) { | ||
4242 | /* accumulate blocks to free if they're contiguous */ | ||
4243 | if (count == 0) { | ||
4244 | block_to_free = nr; | ||
4245 | block_to_free_p = p; | ||
4246 | count = 1; | ||
4247 | } else if (nr == block_to_free + count) { | ||
4248 | count++; | ||
4249 | } else { | ||
4250 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
4251 | block_to_free, count, | ||
4252 | block_to_free_p, p); | ||
4253 | if (err) | ||
4254 | break; | ||
4255 | block_to_free = nr; | ||
4256 | block_to_free_p = p; | ||
4257 | count = 1; | ||
4258 | } | ||
4259 | } | ||
4260 | } | ||
4261 | |||
4262 | if (!err && count > 0) | ||
4263 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
4264 | count, block_to_free_p, p); | ||
4265 | if (err < 0) | ||
4266 | /* fatal error */ | ||
4267 | return; | ||
4268 | |||
4269 | if (this_bh) { | ||
4270 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
4271 | |||
4272 | /* | ||
4273 | * The buffer head should have an attached journal head at this | ||
4274 | * point. However, if the data is corrupted and an indirect | ||
4275 | * block pointed to itself, it would have been detached when | ||
4276 | * the block was cleared. Check for this instead of OOPSing. | ||
4277 | */ | ||
4278 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
4279 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
4280 | else | ||
4281 | EXT4_ERROR_INODE(inode, | ||
4282 | "circular indirect block detected at " | ||
4283 | "block %llu", | ||
4284 | (unsigned long long) this_bh->b_blocknr); | ||
4285 | } | ||
4286 | } | ||
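The run-merging logic above is self-contained enough to exercise in user space. A minimal sketch follows; flush_run(), the slots[] sample data, and main() are invented for this demo and are not kernel code:

	#include <stdio.h>
	#include <stdint.h>

	/* Stand-in for ext4_clear_blocks(): just report the run. */
	static void flush_run(uint64_t start, unsigned long count)
	{
		printf("free %lu block(s) starting at %llu\n",
		       count, (unsigned long long) start);
	}

	int main(void)
	{
		uint32_t slots[] = { 100, 101, 0, 102, 500, 501 };
		uint64_t run_start = 0;
		unsigned long count = 0;

		for (size_t i = 0; i < sizeof(slots) / sizeof(slots[0]); i++) {
			uint32_t nr = slots[i];
			if (!nr)
				continue;	/* a hole: does not break the run */
			if (count == 0) {
				run_start = nr;	/* start a new run */
				count = 1;
			} else if (nr == run_start + count) {
				count++;	/* extends the current run */
			} else {
				flush_run(run_start, count);
				run_start = nr;
				count = 1;
			}
		}
		if (count)
			flush_run(run_start, count);	/* final run */
		return 0;
	}

This prints "free 3 block(s) starting at 100" and then "free 2 block(s) starting at 500", mirroring how ext4_free_data() batches contiguous blocks into single ext4_free_blocks() calls so that each call touches only one or two bitmap blocks' worth of journal space.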
4287 | |||
4288 | /** | ||
4289 | * ext4_free_branches - free an array of branches | ||
4290 | * @handle: JBD handle for this transaction | ||
4291 | * @inode: inode we are dealing with | ||
4292 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
4293 | * @first: array of block numbers | ||
4294 | * @last: pointer immediately past the end of array | ||
4295 | * @depth: depth of the branches to free | ||
4296 | * | ||
4297 | * We are freeing all blocks referred to from these branches (numbers are | ||
4298 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
4299 | * appropriately. | ||
4300 | */ | ||
4301 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
4302 | struct buffer_head *parent_bh, | ||
4303 | __le32 *first, __le32 *last, int depth) | ||
4304 | { | ||
4305 | ext4_fsblk_t nr; | ||
4306 | __le32 *p; | ||
4307 | |||
4308 | if (ext4_handle_is_aborted(handle)) | ||
4309 | return; | ||
4310 | |||
4311 | if (depth--) { | ||
4312 | struct buffer_head *bh; | ||
4313 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4314 | p = last; | ||
4315 | while (--p >= first) { | ||
4316 | nr = le32_to_cpu(*p); | ||
4317 | if (!nr) | ||
4318 | continue; /* A hole */ | ||
4319 | |||
4320 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
4321 | nr, 1)) { | ||
4322 | EXT4_ERROR_INODE(inode, | ||
4323 | "invalid indirect mapped " | ||
4324 | "block %lu (level %d)", | ||
4325 | (unsigned long) nr, depth); | ||
4326 | break; | ||
4327 | } | ||
4328 | |||
4329 | /* Go read the buffer for the next level down */ | ||
4330 | bh = sb_bread(inode->i_sb, nr); | ||
4331 | |||
4332 | /* | ||
4333 | * A read failure? Report error and clear slot | ||
4334 | * (should be rare). | ||
4335 | */ | ||
4336 | if (!bh) { | ||
4337 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
4338 | "Read failure"); | ||
4339 | continue; | ||
4340 | } | ||
4341 | |||
4342 | /* This zaps the entire block. Bottom up. */ | ||
4343 | BUFFER_TRACE(bh, "free child branches"); | ||
4344 | ext4_free_branches(handle, inode, bh, | ||
4345 | (__le32 *) bh->b_data, | ||
4346 | (__le32 *) bh->b_data + addr_per_block, | ||
4347 | depth); | ||
4348 | brelse(bh); | ||
4349 | |||
4350 | /* | ||
4351 | * Everything below this pointer has been | ||
4352 | * released. Now let this top-of-subtree go. | ||
4353 | * | ||
4354 | * We want the freeing of this indirect block to be | ||
4355 | * atomic in the journal with the updating of the | ||
4356 | * bitmap block which owns it. So make some room in | ||
4357 | * the journal. | ||
4358 | * | ||
4359 | * We zero the parent pointer *after* freeing its | ||
4360 | * pointee in the bitmaps, so if extend_transaction() | ||
4361 | * for some reason fails to put the bitmap changes and | ||
4362 | * the release into the same transaction, recovery | ||
4363 | * will merely complain about releasing a free block, | ||
4364 | * rather than leaking blocks. | ||
4365 | */ | ||
4366 | if (ext4_handle_is_aborted(handle)) | ||
4367 | return; | ||
4368 | if (try_to_extend_transaction(handle, inode)) { | ||
4369 | ext4_mark_inode_dirty(handle, inode); | ||
4370 | ext4_truncate_restart_trans(handle, inode, | ||
4371 | blocks_for_truncate(inode)); | ||
4372 | } | ||
4373 | |||
4374 | /* | ||
4375 | * The forget flag here is critical because if | ||
4376 | * we are journaling (and not doing data | ||
4377 | * journaling), we have to make sure a revoke | ||
4378 | * record is written to prevent the journal | ||
4379 | * replay from overwriting the (former) | ||
4380 | * indirect block if it gets reallocated as a | ||
4381 | * data block. This must happen in the same | ||
4382 | * transaction where the data blocks are | ||
4383 | * actually freed. | ||
4384 | */ | ||
4385 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
4386 | EXT4_FREE_BLOCKS_METADATA| | ||
4387 | EXT4_FREE_BLOCKS_FORGET); | ||
4388 | |||
4389 | if (parent_bh) { | ||
4390 | /* | ||
4391 | * The block which we have just freed is | ||
4392 | * pointed to by an indirect block: journal it | ||
4393 | */ | ||
4394 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
4395 | if (!ext4_journal_get_write_access(handle, | ||
4396 | parent_bh)){ | ||
4397 | *p = 0; | ||
4398 | BUFFER_TRACE(parent_bh, | ||
4399 | "call ext4_handle_dirty_metadata"); | ||
4400 | ext4_handle_dirty_metadata(handle, | ||
4401 | inode, | ||
4402 | parent_bh); | ||
4403 | } | ||
4404 | } | ||
4405 | } | ||
4406 | } else { | ||
4407 | /* We have reached the bottom of the tree. */ | ||
4408 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
4409 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
4410 | } | ||
4411 | } | ||
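Some arithmetic to put the depth parameter in perspective: with 4 KiB blocks and 4-byte entries, EXT4_ADDR_PER_BLOCK(sb) is 4096 / 4 = 1024, so a branch covers up to 1024 data blocks at depth 1, 1024^2 at depth 2, and 1024^3 at depth 3 (4 TiB worth of 4 KiB blocks). That is why the recursion above never goes more than three levels deep, and why the EXT4_FREE_BLOCKS_FORGET revoke record matters: without it, journal replay could rewrite stale indirect-block contents over a block that has since been reallocated as file data.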
4412 | |||
4413 | int ext4_can_truncate(struct inode *inode) | 3066 | int ext4_can_truncate(struct inode *inode) |
4414 | { | 3067 | { |
4415 | if (S_ISREG(inode->i_mode)) | 3068 | if (S_ISREG(inode->i_mode)) |
@@ -4476,19 +3129,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4476 | */ | 3129 | */ |
4477 | void ext4_truncate(struct inode *inode) | 3130 | void ext4_truncate(struct inode *inode) |
4478 | { | 3131 | { |
4479 | handle_t *handle; | ||
4480 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
4481 | __le32 *i_data = ei->i_data; | ||
4482 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4483 | struct address_space *mapping = inode->i_mapping; | ||
4484 | ext4_lblk_t offsets[4]; | ||
4485 | Indirect chain[4]; | ||
4486 | Indirect *partial; | ||
4487 | __le32 nr = 0; | ||
4488 | int n = 0; | ||
4489 | ext4_lblk_t last_block, max_block; | ||
4490 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
4491 | |||
4492 | trace_ext4_truncate_enter(inode); | 3132 | trace_ext4_truncate_enter(inode); |
4493 | 3133 | ||
4494 | if (!ext4_can_truncate(inode)) | 3134 | if (!ext4_can_truncate(inode)) |
@@ -4499,149 +3139,11 @@ void ext4_truncate(struct inode *inode) | |||
4499 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | 3139 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
4500 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); | 3140 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); |
4501 | 3141 | ||
4502 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 3142 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
4503 | ext4_ext_truncate(inode); | 3143 | ext4_ext_truncate(inode); |
4504 | trace_ext4_truncate_exit(inode); | 3144 | else |
4505 | return; | 3145 | ext4_ind_truncate(inode); |
4506 | } | ||
4507 | |||
4508 | handle = start_transaction(inode); | ||
4509 | if (IS_ERR(handle)) | ||
4510 | return; /* AKPM: return what? */ | ||
4511 | |||
4512 | last_block = (inode->i_size + blocksize-1) | ||
4513 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4514 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
4515 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4516 | |||
4517 | if (inode->i_size & (blocksize - 1)) | ||
4518 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
4519 | goto out_stop; | ||
4520 | |||
4521 | if (last_block != max_block) { | ||
4522 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
4523 | if (n == 0) | ||
4524 | goto out_stop; /* error */ | ||
4525 | } | ||
4526 | |||
4527 | /* | ||
4528 | * OK. This truncate is going to happen. We add the inode to the | ||
4529 | * orphan list, so that if this truncate spans multiple transactions, | ||
4530 | * and we crash, we will resume the truncate when the filesystem | ||
4531 | * recovers. It also marks the inode dirty, to catch the new size. | ||
4532 | * | ||
4533 | * Implication: the file must always be in a sane, consistent | ||
4534 | * truncatable state while each transaction commits. | ||
4535 | */ | ||
4536 | if (ext4_orphan_add(handle, inode)) | ||
4537 | goto out_stop; | ||
4538 | |||
4539 | /* | ||
4540 | * From here we block out all ext4_get_block() callers who want to | ||
4541 | * modify the block allocation tree. | ||
4542 | */ | ||
4543 | down_write(&ei->i_data_sem); | ||
4544 | |||
4545 | ext4_discard_preallocations(inode); | ||
4546 | |||
4547 | /* | ||
4548 | * The orphan list entry will now protect us from any crash which | ||
4549 | * occurs before the truncate completes, so it is now safe to propagate | ||
4550 | * the new, shorter inode size (held for now in i_size) into the | ||
4551 | * on-disk inode. We do this via i_disksize, which is the value which | ||
4552 | * ext4 *really* writes onto the disk inode. | ||
4553 | */ | ||
4554 | ei->i_disksize = inode->i_size; | ||
4555 | |||
4556 | if (last_block == max_block) { | ||
4557 | /* | ||
4558 | * It is unnecessary to free any data blocks if last_block is | ||
4559 | * equal to the indirect block limit. | ||
4560 | */ | ||
4561 | goto out_unlock; | ||
4562 | } else if (n == 1) { /* direct blocks */ | ||
4563 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
4564 | i_data + EXT4_NDIR_BLOCKS); | ||
4565 | goto do_indirects; | ||
4566 | } | ||
4567 | |||
4568 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
4569 | /* Kill the top of shared branch (not detached) */ | ||
4570 | if (nr) { | ||
4571 | if (partial == chain) { | ||
4572 | /* Shared branch grows from the inode */ | ||
4573 | ext4_free_branches(handle, inode, NULL, | ||
4574 | &nr, &nr+1, (chain+n-1) - partial); | ||
4575 | *partial->p = 0; | ||
4576 | /* | ||
4577 | * We mark the inode dirty prior to restart, | ||
4578 | * and prior to stop. No need for it here. | ||
4579 | */ | ||
4580 | } else { | ||
4581 | /* Shared branch grows from an indirect block */ | ||
4582 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
4583 | ext4_free_branches(handle, inode, partial->bh, | ||
4584 | partial->p, | ||
4585 | partial->p+1, (chain+n-1) - partial); | ||
4586 | } | ||
4587 | } | ||
4588 | /* Clear the ends of indirect blocks on the shared branch */ | ||
4589 | while (partial > chain) { | ||
4590 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
4591 | (__le32*)partial->bh->b_data+addr_per_block, | ||
4592 | (chain+n-1) - partial); | ||
4593 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
4594 | brelse(partial->bh); | ||
4595 | partial--; | ||
4596 | } | ||
4597 | do_indirects: | ||
4598 | /* Kill the remaining (whole) subtrees */ | ||
4599 | switch (offsets[0]) { | ||
4600 | default: | ||
4601 | nr = i_data[EXT4_IND_BLOCK]; | ||
4602 | if (nr) { | ||
4603 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
4604 | i_data[EXT4_IND_BLOCK] = 0; | ||
4605 | } | ||
4606 | case EXT4_IND_BLOCK: | ||
4607 | nr = i_data[EXT4_DIND_BLOCK]; | ||
4608 | if (nr) { | ||
4609 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
4610 | i_data[EXT4_DIND_BLOCK] = 0; | ||
4611 | } | ||
4612 | case EXT4_DIND_BLOCK: | ||
4613 | nr = i_data[EXT4_TIND_BLOCK]; | ||
4614 | if (nr) { | ||
4615 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
4616 | i_data[EXT4_TIND_BLOCK] = 0; | ||
4617 | } | ||
4618 | case EXT4_TIND_BLOCK: | ||
4619 | ; | ||
4620 | } | ||
4621 | |||
4622 | out_unlock: | ||
4623 | up_write(&ei->i_data_sem); | ||
4624 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4625 | ext4_mark_inode_dirty(handle, inode); | ||
4626 | |||
4627 | /* | ||
4628 | * In a multi-transaction truncate, we only make the final transaction | ||
4629 | * synchronous | ||
4630 | */ | ||
4631 | if (IS_SYNC(inode)) | ||
4632 | ext4_handle_sync(handle); | ||
4633 | out_stop: | ||
4634 | /* | ||
4635 | * If this was a simple ftruncate(), and the file will remain alive | ||
4636 | * then we need to clear up the orphan record which we created above. | ||
4637 | * However, if this was a real unlink then we were called by | ||
4638 | * ext4_delete_inode(), and we allow that function to clean up the | ||
4639 | * orphan info for us. | ||
4640 | */ | ||
4641 | if (inode->i_nlink) | ||
4642 | ext4_orphan_del(handle, inode); | ||
4643 | 3146 | ||
4644 | ext4_journal_stop(handle); | ||
4645 | trace_ext4_truncate_exit(inode); | 3147 | trace_ext4_truncate_exit(inode); |
4646 | } | 3148 | } |
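One subtlety in the removed do_indirects: switch above: the missing break statements are deliberate fall-through, not a bug. offsets[0] records the top-level slot containing the truncation point, and every whole subtree after that slot must be freed. As a worked example, if the new size falls inside the single-indirect region, then offsets[0] == EXT4_IND_BLOCK; the shared part of the IND subtree has already been trimmed on the ext4_find_shared() path, so the switch enters at case EXT4_IND_BLOCK and frees the entire double-indirect subtree (depth 2), then falls through to free the entire triple-indirect subtree (depth 3). This logic now lives behind the ext4_ind_truncate() call visible in the new code on the right.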
4647 | 3149 | ||
@@ -5012,7 +3514,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
5012 | (S_ISLNK(inode->i_mode) && | 3514 | (S_ISLNK(inode->i_mode) && |
5013 | !ext4_inode_is_fast_symlink(inode))) { | 3515 | !ext4_inode_is_fast_symlink(inode))) { |
5014 | /* Validate block references which are part of inode */ | 3516 | /* Validate block references which are part of inode */ |
5015 | ret = ext4_check_inode_blockref(inode); | 3517 | ret = ext4_ind_check_inode(inode); |
5016 | } | 3518 | } |
5017 | if (ret) | 3519 | if (ret) |
5018 | goto bad_inode; | 3520 | goto bad_inode; |
@@ -5459,34 +3961,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
5459 | return 0; | 3961 | return 0; |
5460 | } | 3962 | } |
5461 | 3963 | ||
5462 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | ||
5463 | int chunk) | ||
5464 | { | ||
5465 | int indirects; | ||
5466 | |||
5467 | /* if nrblocks are contiguous */ | ||
5468 | if (chunk) { | ||
5469 | /* | ||
5470 | * With N contiguous data blocks, we need at most | ||
5471 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
5472 | * 2 dindirect blocks, and 1 tindirect block | ||
5473 | */ | ||
5474 | return DIV_ROUND_UP(nrblocks, | ||
5475 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
5476 | } | ||
5477 | /* | ||
5478 | * If nrblocks are not contiguous then, in the worst case, each block | ||
5479 | * touches an indirect block, and each indirect block touches a double | ||
5480 | * indirect block, plus a triple indirect block | ||
5481 | */ | ||
5482 | indirects = nrblocks * 2 + 1; | ||
5483 | return indirects; | ||
5484 | } | ||
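The contiguous-case arithmetic above is worth a concrete number. A user-space sketch of the same computation (contig_trans_blocks() and main() are invented for illustration; the 1024 figure assumes 4 KiB blocks with 4-byte entries):

	#include <stdio.h>

	/* Worst-case journal credits for mapping nrblocks contiguous blocks;
	 * mirrors DIV_ROUND_UP(nrblocks, addr_per_block) + 4 above, where the
	 * +4 accounts for one extra indirect block from straddling a boundary,
	 * two dindirect blocks, and one tindirect block. */
	static int contig_trans_blocks(int nrblocks, int addr_per_block)
	{
		return (nrblocks + addr_per_block - 1) / addr_per_block + 4;
	}

	int main(void)
	{
		printf("%d\n", contig_trans_blocks(2048, 1024));	/* 6 */
		printf("%d\n", 2048 * 2 + 1);	/* 4097: discontiguous worst case */
		return 0;
	}

So mapping 2048 contiguous blocks costs 6 credits, versus nrblocks * 2 + 1 = 4097 in the fully discontiguous case, which is why callers care whether a chunk is contiguous.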
5485 | |||
5486 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 3964 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5487 | { | 3965 | { |
5488 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 3966 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
5489 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); | 3967 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); |
5490 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 3968 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); |
5491 | } | 3969 | } |
5492 | 3970 | ||