author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/ext4
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/Makefile         |    5
-rw-r--r--  fs/ext4/acl.c            |   13
-rw-r--r--  fs/ext4/acl.h            |    2
-rw-r--r--  fs/ext4/balloc.c         |  157
-rw-r--r--  fs/ext4/block_validity.c |    7
-rw-r--r--  fs/ext4/dir.c            |   58
-rw-r--r--  fs/ext4/ext4.h           |  347
-rw-r--r--  fs/ext4/ext4_extents.h   |   82
-rw-r--r--  fs/ext4/ext4_jbd2.c      |   14
-rw-r--r--  fs/ext4/ext4_jbd2.h      |   18
-rw-r--r--  fs/ext4/extents.c        | 2024
-rw-r--r--  fs/ext4/file.c           |  129
-rw-r--r--  fs/ext4/fsync.c          |  142
-rw-r--r--  fs/ext4/ialloc.c         |  147
-rw-r--r--  fs/ext4/inode.c          | 1198
-rw-r--r--  fs/ext4/ioctl.c          |   39
-rw-r--r--  fs/ext4/mballoc.c        |  893
-rw-r--r--  fs/ext4/mballoc.h        |    8
-rw-r--r--  fs/ext4/migrate.c        |   18
-rw-r--r--  fs/ext4/mmp.c            |  351
-rw-r--r--  fs/ext4/move_extent.c    |   35
-rw-r--r--  fs/ext4/namei.c          |  236
-rw-r--r--  fs/ext4/page-io.c        |  417
-rw-r--r--  fs/ext4/resize.c         |  125
-rw-r--r--  fs/ext4/super.c          | 1231
-rw-r--r--  fs/ext4/xattr.c          |   40
-rw-r--r--  fs/ext4/xattr.h          |   14
-rw-r--r--  fs/ext4/xattr_security.c |    5
28 files changed, 5144 insertions(+), 2611 deletions(-)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,9 +4,10 @@
 
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ead..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext4_check_acl(struct inode *inode, int mask)
+ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
 
+	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
@@ -426,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
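
The new IPERM_FLAG_RCU branch above follows the VFS RCU-walk contract: while the walk holds only RCU, a permission callback may not block. A minimal sketch of that contract, assuming the 2.6.38-era helper names used in the hunk (IPERM_FLAG_RCU, negative_cached_acl()):

	/* Hedged sketch of the RCU-walk contract ext4_check_acl() now honours. */
	static int sketch_check_acl(struct inode *inode, int mask, unsigned int flags)
	{
		if (flags & IPERM_FLAG_RCU) {
			/* May not block here: answer only from the ACL cache. */
			if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
				return -ECHILD;	/* unknown: VFS retries in ref-walk */
			return -EAGAIN;		/* cached "no ACL": generic check decides */
		}
		/* Ref-walk path (may block) reads the ACL from disk, as above. */
		return -EAGAIN;
	}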
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac4..dec821168fd4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_check_acl(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int, unsigned int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
 #include "ext4_jbd2.h"
 #include "mballoc.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -171,7 +173,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * less than the blocksize * 8 ( which is the size
 		 * of bitmap ), set rest of the block bitmap to 1
 		 */
-		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+		ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+				     bh->b_data);
 	}
 	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -341,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	 * We do it here so the bitmap uptodate bit
 	 * get set with buffer lock held.
 	 */
+	trace_ext4_read_block_bitmap_load(sb, block_group);
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
@@ -358,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:		handle to this transaction
- * @sb:			super block
- * @block:		start physcial block to add to the block group
- * @count:		number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata. Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -489,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -503,11 +384,6 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -518,7 +394,9 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope.  Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -527,9 +405,9 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -543,14 +421,14 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  *
  * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
  * it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
  * return TRUE.
  *
  * if the total number of retries exceed three times, return FALSE.
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -573,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -583,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
@@ -591,7 +471,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	 * Account for the allocated meta blocks.  We will never
 	 * fail EDQUOT for metdata, but we do account for it.
 	 */
-	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+	if (!(*errp) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
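
With the flags argument now threaded through ext4_has_free_blocks(), a caller that must not fail on a nearly-full filesystem can opt into the root-reserved pool. A hedged usage sketch against the new ext4_new_meta_blocks() signature (handle, inode and goal are assumed to exist in the caller):

	unsigned long count = 1;
	int err;
	ext4_fsblk_t block;

	/* EXT4_MB_USE_ROOT_BLOCKS lets this allocation dip into the
	 * root-reserved blocks checked in ext4_has_free_blocks() above. */
	block = ext4_new_meta_blocks(handle, inode, goal,
				     EXT4_MB_USE_ROOT_BLOCKS, &count, &err);
	if (!block)
		return err;	/* 0 return means failure, *errp holds why */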
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
 
 static struct kmem_cache *ext4_system_zone_cachep;
 
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
-	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
-					     SLAB_RECLAIM_ACCOUNT);
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
 	if (ext4_system_zone_cachep == NULL)
 		return -ENOMEM;
 	return 0;
 }
 
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
 	kmem_cache_destroy(ext4_system_zone_cachep);
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
 				  struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= ext4_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext4_readdir,		/* we take BKL. needed?*/
 	.unlocked_ioctl = ext4_ioctl,
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 	return (ext4_filetype_table[filetype]);
 }
 
-
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
-			   struct inode *dir,
+			   struct inode *dir, struct file *filp,
 			   struct ext4_dir_entry_2 *de,
 			   struct buffer_head *bh,
 			   unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	const int rlen = ext4_rec_len_from_disk(de->rec_len,
 						dir->i_sb->s_blocksize);
 
-	if (rlen < EXT4_DIR_REC_LEN(1))
+	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
 		error_msg = "rec_len is smaller than minimal";
-	else if (rlen % 4 != 0)
+	else if (unlikely(rlen % 4 != 0))
 		error_msg = "rec_len % 4 != 0";
-	else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+	else if (unlikely(((char *) de - bh->b_data) + rlen >
+			  dir->i_sb->s_blocksize))
 		error_msg = "directory entry across blocks";
-	else if (le32_to_cpu(de->inode) >
-			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+	else if (unlikely(le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
+	else
+		return 0;
 
-	if (error_msg != NULL)
-		ext4_error_inode(dir, function, line, bh->b_blocknr,
-			"bad entry in directory: %s - "
-			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-			error_msg, (unsigned) (offset%bh->b_size), offset,
-			le32_to_cpu(de->inode),
-			rlen, de->name_len);
-	return error_msg == NULL ? 1 : 0;
+	if (filp)
+		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset%bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+	else
+		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset%bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+
+	return 1;
 }
 
 static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				EXT4_ERROR_INODE(inode, "directory "
-					   "contains a hole at offset %Lu",
+				EXT4_ERROR_FILE(filp, 0,
+						"directory contains a "
+						"hole at offset %llu",
 					   (unsigned long long) filp->f_pos);
 				dir_has_error = 1;
 			}
@@ -194,8 +210,8 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry(inode, de,
-						  bh, offset)) {
+			if (ext4_check_dir_entry(inode, filp, de,
+						 bh, offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
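
The sense of the check is inverted by this commit (0 now means a good entry), so callers branch on nonzero. Roughly, per the hunks above and ext4_readdir()'s existing recovery path:

	if (ext4_check_dir_entry(inode, filp, de, bh, offset)) {
		/* Bad entry: skip f_pos ahead to the next block boundary,
		 * which is the recovery step ext4_readdir() already used. */
		filp->f_pos = (filp->f_pos | (sb->s_blocksize - 1)) + 1;
		break;	/* simplified; the real loop releases bh and continues */
	}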
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..1921392cd708 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)			\
 	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 
-#define EXT4_ERROR_FILE(file, fmt, a...)	\
-	ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+#define EXT4_ERROR_FILE(file, block, fmt, a...)				\
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -168,7 +169,20 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	EXT4_IO_UNWRITTEN	0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define	EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_ERROR	0x0002
+
+struct ext4_io_page {
+	struct page	*p_page;
+	atomic_t	p_count;
+};
+
+#define MAX_IO_PAGES 128
+
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
@@ -179,13 +193,25 @@ typedef struct ext4_io_end {
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	int			num_io_pages;
+	struct ext4_io_page	*pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	struct ext4_io_page	*io_page;
+	sector_t		io_next_block;
+};
+
 /*
  * Special inodes numbers
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -205,6 +231,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE		1024
 #define	EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
@@ -488,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -537,23 +568,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
-
-/*
- *  Mount options
- */
-struct ext4_mount_options {
-	unsigned long s_mount_opt;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned long s_commit_interval;
-	u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-	int s_jquota_fmt;
-	char *s_qf_names[MAXQUOTAS];
-#endif
-};
-
-/* Max physical block we can addres w/o extents */
+/* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
 /*
@@ -685,6 +700,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
 		ext4_decode_extra_time(&(inode)->xtime,			       \
 				       raw_inode->xtime ## _extra);	       \
+	else								       \
+		(inode)->xtime.tv_nsec = 0;				       \
 } while (0)
 
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)			       \
@@ -695,6 +712,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
 		ext4_decode_extra_time(&(einode)->xtime,		       \
 				       raw_inode->xtime ## _extra);	       \
+	else								       \
+		(einode)->xtime.tv_nsec = 0;				       \
 } while (0)
 
 #define i_disk_version osd1.linux1.l_i_version
@@ -726,12 +745,13 @@ do { \
 
 /*
  * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
  */
 struct ext4_ext_cache {
 	ext4_fsblk_t	ec_start;
 	ext4_lblk_t	ec_block;
 	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
 };
 
 /*
@@ -750,10 +770,12 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
+	ext4_lblk_t	i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
 	unsigned long	i_state_flags;		/* Dynamic state flags */
+#endif
 	unsigned long	i_flags;
 
-	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
@@ -796,7 +818,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
-	struct jbd2_inode jinode;
+	struct jbd2_inode *jinode;
 
 	struct ext4_ext_cache i_cached_extent;
 	/*
@@ -816,14 +838,12 @@ struct ext4_inode_info {
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-	sector_t i_da_metadata_calc_last_lblock;
+	ext4_lblk_t i_da_metadata_calc_last_lblock;
 	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
-	spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -832,8 +852,12 @@ struct ext4_inode_info {
 	/* completed IOs that might need unwritten extents handling */
 	struct list_head i_completed_io_list;
 	spinlock_t i_completed_io_lock;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+
+	spinlock_t i_block_reservation_lock;
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -885,24 +909,35 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
-#define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
-#define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
+						~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
+						EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
 					 EXT4_MOUNT_##opt)
 
-#define ext4_set_bit			ext2_set_bit
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
+#define ext4_set_bit			__test_and_set_bit_le
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
-#define ext4_clear_bit			ext2_clear_bit
+#define ext4_clear_bit			__test_and_clear_bit_le
 #define ext4_clear_bit_atomic		ext2_clear_bit_atomic
-#define ext4_test_bit			ext2_test_bit
-#define ext4_find_first_zero_bit	ext2_find_first_zero_bit
-#define ext4_find_next_zero_bit		ext2_find_next_zero_bit
-#define ext4_find_next_bit		ext2_find_next_bit
+#define ext4_test_bit			test_bit_le
+#define ext4_find_first_zero_bit	find_first_zero_bit_le
+#define ext4_find_next_zero_bit		find_next_zero_bit_le
+#define ext4_find_next_bit		find_next_bit_le
 
 /*
  * Maximal mount counts between two filesystem checks
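
set_opt()/clear_opt() now take the superblock instead of the raw option word, and the *_opt2 variants address the new s_mount_opt2 field. A hedged usage sketch (MBLK_IO_SUBMIT is one of the flags defined above; FOO stands in for any hypothetical EXT4_MOUNT2_* bit):

	/* Old callers wrote: set_opt(sbi->s_mount_opt, DELALLOC);  Now: */
	set_opt(sb, DELALLOC);
	if (test_opt(sb, MBLK_IO_SUBMIT))
		clear_opt(sb, MBLK_IO_SUBMIT);
	/* Bits in the second option word go through the *_opt2 macros: */
	set_opt2(sb, FOO);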
@@ -1000,7 +1035,7 @@ struct ext4_super_block {
 	__le16  s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1060,6 +1095,7 @@ struct ext4_sb_info {
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head **s_group_desc;
 	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -1087,7 +1123,6 @@ struct ext4_sb_info {
 	struct completion s_kobj_unregister;
 
 	/* Journaling */
-	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
@@ -1116,14 +1151,14 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
 	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 
@@ -1141,7 +1176,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 
 	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
 	atomic_t s_bal_allocated;	/* in blocks */
@@ -1172,6 +1206,14 @@ struct ext4_sb_info {
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1210,24 +1252,39 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 };
 
-#define EXT4_INODE_BIT_FNS(name, field)					\
+#define EXT4_INODE_BIT_FNS(name, field, offset)				\
 static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
 {									\
-	return test_bit(bit, &EXT4_I(inode)->i_##field);		\
+	return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);	\
 }									\
 static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
 {									\
-	set_bit(bit, &EXT4_I(inode)->i_##field);			\
+	set_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }									\
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 {									\
-	clear_bit(bit, &EXT4_I(inode)->i_##field);			\
+	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }
 
-EXT4_INODE_BIT_FNS(flag, flags)
-EXT4_INODE_BIT_FNS(state, state_flags)
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	(ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	/* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
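
The extra offset argument is what lets 64-bit builds drop i_state_flags: dynamic state bits simply live in the upper half of i_flags (offset 32), saving a word per inode, while 32-bit builds keep the separate word. Call sites are unchanged either way:

	/* On 64-bit this expands to test_bit(bit + 32, &EXT4_I(inode)->i_flags);
	 * on 32-bit it stays test_bit(bit, &EXT4_I(inode)->i_state_flags). */
	if (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN))
		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);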
@@ -1294,6 +1351,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1307,13 +1365,29 @@ EXT4_INODE_BIT_FNS(state, state_flags)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
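
These *_SUPP masks let one driver mount ext2/ext3/ext4 images with the feature envelope of the chosen personality; mount-time code rejects a filesystem whose incompat bits fall outside the mask. A hedged sketch using the EXT4_HAS_INCOMPAT_FEATURE() helper defined elsewhere in this header:

	/* Refuse the mount if the image needs an incompat feature we lack. */
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
		ext4_msg(sb, KERN_ERR,
			 "couldn't mount: unsupported optional features");
		return 0;
	}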
@@ -1533,7 +1607,97 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT			10
+#define EXT4_DEF_LI_MAX_START_DELAY		5
+#define EXT4_LAZYINIT_QUIT			0x0001
+#define EXT4_LAZYINIT_RUNNING			0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
+
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+	__le32	mmp_magic;		/* Magic number for MMP */
+	__le32	mmp_seq;		/* Sequence no. updated periodically */
+
+	/*
+	 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+	 * purposes and do not affect the correctness of the algorithm
+	 */
+	__le64	mmp_time;		/* Time last updated */
+	char	mmp_nodename[64];	/* Node which last updated MMP block */
+	char	mmp_bdevname[32];	/* Bdev which last updated MMP block */
+
+	/*
+	 * mmp_check_interval is used to verify if the MMP block has been
+	 * updated on the block device. The value is updated based on the
+	 * maximum time to write the MMP block during an update cycle.
+	 */
+	__le16	mmp_check_interval;
+
+	__le16	mmp_pad1;
+	__le32	mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+	struct buffer_head *bh; /* bh from initial read_mmp_block() */
+	struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
 
 /*
  * Function prototypes
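
The comments above describe the whole MMP protocol: a live node keeps bumping mmp_seq every update interval, and an opener concludes the filesystem is busy if the sequence moves while it watches. A hedged sketch of that open-time decision (the real logic lives in the new fs/ext4/mmp.c; this assumes the caller re-reads the buffer before the final comparison, e.g. via the read_mmp_block() reader mentioned in struct mmpd_data):

	static int sketch_mmp_busy(struct super_block *sb, struct buffer_head *bh)
	{
		struct mmp_struct *mmp = (struct mmp_struct *)bh->b_data;
		u32 seq = le32_to_cpu(mmp->mmp_seq);
		unsigned long wait;

		if (seq == EXT4_MMP_SEQ_CLEAN)
			return 0;	/* last user unmounted cleanly */
		if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
			return 1;	/* fsck or unknown code: never safe */

		/* Watch one (clamped) check interval: a live node keeps
		 * bumping mmp_seq, so a changed value means the fs is in
		 * use elsewhere. */
		wait = EXT4_MMP_CHECK_MULT * le16_to_cpu(mmp->mmp_check_interval);
		if (wait < EXT4_MMP_MIN_CHECK_INTERVAL)
			wait = EXT4_MMP_MIN_CHECK_INTERVAL;
		schedule_timeout_interruptible(wait * HZ);

		return le32_to_cpu(mmp->mmp_seq) != seq; /* assumes bh re-read */
	}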
@@ -1559,11 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1581,10 +1746,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct file *,
 				  struct ext4_dir_entry_2 *,
 				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
-	__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+#define ext4_check_dir_entry(dir, filp, de, bh, offset)			\
+	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+					(de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1592,6 +1759,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1605,11 +1773,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1786,17 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-						ext4_group_t, int);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 					ext4_lblk_t, int, int *);
@@ -1646,24 +1813,25 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1696,8 +1864,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
 			ext4_fsblk_t, const char *, ...)
 	__attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-			    const char *, ...)
-	__attribute__ ((format (printf, 4, 5)));
+			    ext4_fsblk_t, const char *, ...)
+	__attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -1712,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 		       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			   const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+						       __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -1960,6 +2132,7 @@ extern const struct file_operations ext4_dir_operations;
1960/* file.c */ 2132/* file.c */
1961extern const struct inode_operations ext4_file_inode_operations; 2133extern const struct inode_operations ext4_file_inode_operations;
1962extern const struct file_operations ext4_file_operations; 2134extern const struct file_operations ext4_file_operations;
2135extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
1963 2136
1964/* namei.c */ 2137/* namei.c */
1965extern const struct inode_operations ext4_dir_inode_operations; 2138extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2146,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1973/* block_validity */ 2146/* block_validity */
1974extern void ext4_release_system_zone(struct super_block *sb); 2147extern void ext4_release_system_zone(struct super_block *sb);
1975extern int ext4_setup_system_zone(struct super_block *sb); 2148extern int ext4_setup_system_zone(struct super_block *sb);
1976extern int __init init_ext4_system_zone(void); 2149extern int __init ext4_init_system_zone(void);
1977extern void exit_ext4_system_zone(void); 2150extern void ext4_exit_system_zone(void);
1978extern int ext4_data_block_valid(struct ext4_sb_info *sbi, 2151extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1979 ext4_fsblk_t start_blk, 2152 ext4_fsblk_t start_blk,
1980 unsigned int count); 2153 unsigned int count);
@@ -1987,9 +2160,11 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1987extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 2160extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1988 struct ext4_map_blocks *map, int flags); 2161 struct ext4_map_blocks *map, int flags);
1989extern void ext4_ext_truncate(struct inode *); 2162extern void ext4_ext_truncate(struct inode *);
2163extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
2164 loff_t length);
1990extern void ext4_ext_init(struct super_block *); 2165extern void ext4_ext_init(struct super_block *);
1991extern void ext4_ext_release(struct super_block *); 2166extern void ext4_ext_release(struct super_block *);
1992extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 2167extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
1993 loff_t len); 2168 loff_t len);
1994extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 2169extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1995 ssize_t len); 2170 ssize_t len);
@@ -2002,6 +2177,21 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2002 __u64 start_orig, __u64 start_donor, 2177 __u64 start_orig, __u64 start_donor,
2003 __u64 len, __u64 *moved_len); 2178 __u64 len, __u64 *moved_len);
2004 2179
2180/* page-io.c */
2181extern int __init ext4_init_pageio(void);
2182extern void ext4_exit_pageio(void);
2183extern void ext4_ioend_wait(struct inode *);
2184extern void ext4_free_io_end(ext4_io_end_t *io);
2185extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2186extern int ext4_end_io_nolock(ext4_io_end_t *io);
2187extern void ext4_io_submit(struct ext4_io_submit *io);
2188extern int ext4_bio_write_page(struct ext4_io_submit *io,
2189 struct page *page,
2190 int len,
2191 struct writeback_control *wbc);
2192
2193/* mmp.c */
2194extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2005 2195
2006/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2196/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
2007enum ext4_state_bits { 2197enum ext4_state_bits {
@@ -2031,6 +2221,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2031 2221
2032#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2222#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2033 2223
2224/* For ioend & aio unwritten conversion wait queues */
2225#define EXT4_WQ_HASH_SZ 37
2226#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2227 EXT4_WQ_HASH_SZ])
2228#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2229 EXT4_WQ_HASH_SZ])
2230extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2231extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2232
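A user-space sketch of the scheme these two macros encode: the address of a kernel object, reduced modulo a small prime, selects one queue from a fixed pool, so ioend/AIO waiters need no per-inode queue allocation. All names and types below are illustrative stand-ins, not kernel API.

#include <stdio.h>

#define WQ_HASH_SZ 37	/* small prime, mirroring EXT4_WQ_HASH_SZ */

struct waitqueue { int unused; };
static struct waitqueue wq_pool[WQ_HASH_SZ];

/* map an object's address onto one queue in the pool */
static struct waitqueue *wq_for(const void *obj)
{
	return &wq_pool[(unsigned long)obj % WQ_HASH_SZ];
}

int main(void)
{
	int a, b;

	printf("a -> slot %ld\n", (long)(wq_for(&a) - wq_pool));
	printf("b -> slot %ld\n", (long)(wq_for(&b) - wq_pool));
	return 0;
}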
2034#endif /* __KERNEL__ */ 2233#endif /* __KERNEL__ */
2035 2234
2036#endif /* _EXT4_H */ 2235#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..095c36f3b612 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,17 +119,13 @@ struct ext4_ext_path {
119 * structure for external API 119 * structure for external API
120 */ 120 */
121 121
122#define EXT4_EXT_CACHE_NO 0
123#define EXT4_EXT_CACHE_GAP 1
124#define EXT4_EXT_CACHE_EXTENT 2
125
126/* 122/*
127 * to be called by ext4_ext_walk_space() 123 * to be called by ext4_ext_walk_space()
128 * negative retcode - error 124 * negative retcode - error
129 * positive retcode - signal for ext4_ext_walk_space(), see below 125 * positive retcode - signal for ext4_ext_walk_space(), see below
130 * callback must return valid extent (passed or newly created) 126 * callback must return valid extent (passed or newly created)
131 */ 127 */
132typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, 128typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
133 struct ext4_ext_cache *, 129 struct ext4_ext_cache *,
134 struct ext4_extent *, void *); 130 struct ext4_extent *, void *);
135 131
@@ -137,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
137#define EXT_BREAK 1 133#define EXT_BREAK 1
138#define EXT_REPEAT 2 134#define EXT_REPEAT 2
139 135
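A hypothetical callback under the new ext_prepare_callback signature, assuming EXT_CONTINUE is the zero "keep walking" value defined next to EXT_BREAK/EXT_REPEAT, and that a cached gap is reported with ec_start == 0 (as ext4_ext_walk_space() does once ec_type is gone):

/* stop the walk at the first mapped extent and hand it back to the caller */
static int first_extent_cb(struct inode *inode, ext4_lblk_t next,
			   struct ext4_ext_cache *cex,
			   struct ext4_extent *ex, void *data)
{
	if (cex->ec_start != 0) {	/* a real extent, not a gap */
		*(struct ext4_ext_cache *)data = *cex;
		return EXT_BREAK;	/* positive retcode: stop walking */
	}
	return EXT_CONTINUE;
}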
140/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ 136/*
141#define EXT_MAX_BLOCK 0xffffffff 137 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
138 * __le32.
139 */
140#define EXT_MAX_BLOCKS 0xffffffff
142 141
143/* 142/*
144 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an 143 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
@@ -197,7 +196,7 @@ static inline unsigned short ext_depth(struct inode *inode)
197static inline void 196static inline void
198ext4_ext_invalidate_cache(struct inode *inode) 197ext4_ext_invalidate_cache(struct inode *inode)
199{ 198{
200 EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; 199 EXT4_I(inode)->i_cached_extent.ec_len = 0;
201} 200}
202 201
203static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) 202static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -225,11 +224,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 224 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 225}
227 226
227/*
228 * ext4_ext_pblock:
229 * combine low and high parts of physical block number into ext4_fsblk_t
230 */
231static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
232{
233 ext4_fsblk_t block;
234
235 block = le32_to_cpu(ex->ee_start_lo);
236 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
237 return block;
238}
239
240/*
241 * ext4_idx_pblock:
242 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
243 */
244static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
245{
246 ext4_fsblk_t block;
247
248 block = le32_to_cpu(ix->ei_leaf_lo);
249 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
250 return block;
251}
252
253/*
254 * ext4_ext_store_pblock:
255 * stores a large physical block number into an extent struct,
256 * breaking it into parts
257 */
258static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
259 ext4_fsblk_t pb)
260{
261 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
262 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
263 0xffff);
264}
265
266/*
267 * ext4_idx_store_pblock:
268 * stores a large physical block number into an index struct,
269 * breaking it into parts
270 */
271static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
272 ext4_fsblk_t pb)
273{
274 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
275 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
276 0xffff);
277}
278
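A self-contained round trip of the lo/hi split used by the four helpers above; plain stdint types stand in for the little-endian on-disk fields, and the double shift (pb >> 31) >> 1 mirrors the kernel's way of shifting by 32 bits (presumably kept to stay safe if the block type were ever only 32 bits wide):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t pb = 0x0000123456789abcULL;	/* a 48-bit physical block number */
	uint32_t lo = (uint32_t)(pb & 0xffffffff);
	uint16_t hi = (uint16_t)(((pb >> 31) >> 1) & 0xffff);
	uint64_t back = (uint64_t)lo | (((uint64_t)hi << 31) << 1);

	assert(back == pb);	/* lossless for any block number below 2^48 */
	return 0;
}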
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, 279extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks); 280 ext4_lblk_t lblocks);
230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
233extern int ext4_extent_tree_init(handle_t *, struct inode *); 281extern int ext4_extent_tree_init(handle_t *, struct inode *);
234extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 282extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
235 int num, 283 int num,
@@ -237,19 +285,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
237extern int ext4_can_extents_be_merged(struct inode *inode, 285extern int ext4_can_extents_be_merged(struct inode *inode,
238 struct ext4_extent *ex1, 286 struct ext4_extent *ex1,
239 struct ext4_extent *ex2); 287 struct ext4_extent *ex2);
240extern int ext4_ext_try_to_merge(struct inode *inode,
241 struct ext4_ext_path *path,
242 struct ext4_extent *);
243extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
244extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); 288extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
245extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
246 ext_prepare_callback, void *);
247extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 289extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
248 struct ext4_ext_path *); 290 struct ext4_ext_path *);
249extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
250 ext4_lblk_t *, ext4_fsblk_t *);
251extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
252 ext4_lblk_t *, ext4_fsblk_t *);
253extern void ext4_ext_drop_refs(struct ext4_ext_path *); 291extern void ext4_ext_drop_refs(struct ext4_ext_path *);
254extern int ext4_ext_check_inode(struct inode *inode); 292extern int ext4_ext_check_inode(struct inode *inode);
255#endif /* _EXT4_EXTENTS */ 293#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 handle_t *handle, struct buffer_head *bh)
11{
12 int err = 0;
13
14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err)
17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err);
19 }
20 return err;
21}
22
23int __ext4_journal_get_write_access(const char *where, unsigned int line, 9int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 handle_t *handle, struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
25{ 11{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
86 86
87#ifdef CONFIG_QUOTA 87#ifdef CONFIG_QUOTA
88/* Amount of blocks needed for quota update - we know that the structure was 88/* Amount of blocks needed for quota update - we know that the structure was
89 * allocated so we need to update only inode+data */ 89 * allocated so we need to update only the data block */
90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) 90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91/* Amount of blocks needed for quota insert/delete - we do some block writes 91/* Amount of blocks needed for quota insert/delete - we do some block writes
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn, 126 const char *err_fn,
127 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
128 128
129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
130 handle_t *handle, struct buffer_head *bh);
131
132int __ext4_journal_get_write_access(const char *where, unsigned int line, 129int __ext4_journal_get_write_access(const char *where, unsigned int line,
133 handle_t *handle, struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
134 131
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
146int __ext4_handle_dirty_super(const char *where, unsigned int line, 143int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb); 144 handle_t *handle, struct super_block *sb);
148 145
149#define ext4_journal_get_undo_access(handle, bh) \
150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
151#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
@@ -202,13 +197,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
202 return 1; 197 return 1;
203} 198}
204 199
205static inline void ext4_journal_release_buffer(handle_t *handle,
206 struct buffer_head *bh)
207{
208 if (ext4_handle_valid(handle))
209 jbd2_journal_release_buffer(handle, bh);
210}
211
212static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 200static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
213{ 201{
214 return ext4_journal_start_sb(inode->i_sb, nblocks); 202 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -253,7 +241,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
253static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 241static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
254{ 242{
255 if (ext4_handle_valid(handle)) 243 if (ext4_handle_valid(handle))
256 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 244 return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
257 return 0; 245 return 0;
258} 246}
259 247
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..f815cc81e7a2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,54 +44,14 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47#include <trace/events/ext4.h>
47 48
48/* 49static int ext4_split_extent(handle_t *handle,
49 * ext_pblock: 50 struct inode *inode,
50 * combine low and high parts of physical block number into ext4_fsblk_t 51 struct ext4_ext_path *path,
51 */ 52 struct ext4_map_blocks *map,
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex) 53 int split_flag,
53{ 54 int flags);
54 ext4_fsblk_t block;
55
56 block = le32_to_cpu(ex->ee_start_lo);
57 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58 return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67 ext4_fsblk_t block;
68
69 block = le32_to_cpu(ix->ei_leaf_lo);
70 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71 return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
95 55
96static int ext4_ext_truncate_extend_restart(handle_t *handle, 56static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode, 57 struct inode *inode,
@@ -166,10 +126,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
166 struct ext4_extent *ex; 126 struct ext4_extent *ex;
167 depth = path->p_depth; 127 depth = path->p_depth;
168 128
169 /* try to predict block placement */ 129 /*
130 * Try to predict block placement assuming that we are
131 * filling in a file which will eventually be
132 * non-sparse --- i.e., in the case of libbfd writing
133 * an ELF object's sections out-of-order but in a way
134 * that eventually results in a contiguous object or
135 * executable file, or some database extending a table
136 * space file. However, this is actually somewhat
137 * non-ideal if we are writing a sparse file such as
138 * qemu or KVM writing a raw image file that is going
139 * to stay fairly sparse, since it will end up
140 * fragmenting the file system's free space. Maybe we
141 * should have some heuristics or some way to allow
142 * userspace to pass a hint to the file system,
143 * especially if the latter case turns out to be
144 * common.
145 */
170 ex = path[depth].p_ext; 146 ex = path[depth].p_ext;
171 if (ex) 147 if (ex) {
172 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); 148 ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
149 ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
150
151 if (block > ext_block)
152 return ext_pblk + (block - ext_block);
153 else
154 return ext_pblk - (ext_block - block);
155 }
173 156
174 /* it looks like index is empty; 157 /* it looks like index is empty;
175 * try to find starting block from index itself */ 158 * try to find starting block from index itself */
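Restated with plain integer types, the new branch above keeps the requested block at the same logical-to-physical distance as the neighbouring extent, and now handles targets on either side of the extent's start (the removed one-liner wrapped around when block sat to the left). A hypothetical standalone version:

#include <stdint.h>

/* keep the same logical->physical offset as a nearby extent */
uint64_t goal_near_extent(uint64_t ext_pblk, uint32_t ext_lblk, uint32_t block)
{
	if (block > ext_lblk)
		return ext_pblk + (block - ext_lblk);	/* target to the right */
	return ext_pblk - (ext_lblk - block);		/* target to the left */
}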
@@ -216,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
216static ext4_fsblk_t 199static ext4_fsblk_t
217ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, 200ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
218 struct ext4_ext_path *path, 201 struct ext4_ext_path *path,
219 struct ext4_extent *ex, int *err) 202 struct ext4_extent *ex, int *err, unsigned int flags)
220{ 203{
221 ext4_fsblk_t goal, newblock; 204 ext4_fsblk_t goal, newblock;
222 205
223 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 206 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
224 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); 207 newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
208 NULL, err);
225 return newblock; 209 return newblock;
226} 210}
227 211
@@ -292,7 +276,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
292 * to allocate @blocks 276 * to allocate @blocks
293 * Worst case is one block per extent 277
294 */ 278 */
295int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) 279int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
296{ 280{
297 struct ext4_inode_info *ei = EXT4_I(inode); 281 struct ext4_inode_info *ei = EXT4_I(inode);
298 int idxs, num = 0; 282 int idxs, num = 0;
@@ -354,7 +338,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
354 338
355static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 339static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
356{ 340{
357 ext4_fsblk_t block = ext_pblock(ext); 341 ext4_fsblk_t block = ext4_ext_pblock(ext);
358 int len = ext4_ext_get_actual_len(ext); 342 int len = ext4_ext_get_actual_len(ext);
359 343
360 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 344 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +347,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
363static int ext4_valid_extent_idx(struct inode *inode, 347static int ext4_valid_extent_idx(struct inode *inode,
364 struct ext4_extent_idx *ext_idx) 348 struct ext4_extent_idx *ext_idx)
365{ 349{
366 ext4_fsblk_t block = idx_pblock(ext_idx); 350 ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
367 351
368 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); 352 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
369} 353}
@@ -463,13 +447,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
463 for (k = 0; k <= l; k++, path++) { 447 for (k = 0; k <= l; k++, path++) {
464 if (path->p_idx) { 448 if (path->p_idx) {
465 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 449 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
466 idx_pblock(path->p_idx)); 450 ext4_idx_pblock(path->p_idx));
467 } else if (path->p_ext) { 451 } else if (path->p_ext) {
468 ext_debug(" %d:[%d]%d:%llu ", 452 ext_debug(" %d:[%d]%d:%llu ",
469 le32_to_cpu(path->p_ext->ee_block), 453 le32_to_cpu(path->p_ext->ee_block),
470 ext4_ext_is_uninitialized(path->p_ext), 454 ext4_ext_is_uninitialized(path->p_ext),
471 ext4_ext_get_actual_len(path->p_ext), 455 ext4_ext_get_actual_len(path->p_ext),
472 ext_pblock(path->p_ext)); 456 ext4_ext_pblock(path->p_ext));
473 } else 457 } else
474 ext_debug(" []"); 458 ext_debug(" []");
475 } 459 }
@@ -494,13 +478,47 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
494 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 478 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
495 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 479 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
496 ext4_ext_is_uninitialized(ex), 480 ext4_ext_is_uninitialized(ex),
497 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 481 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
498 } 482 }
499 ext_debug("\n"); 483 ext_debug("\n");
500} 484}
485
486static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
487 ext4_fsblk_t newblock, int level)
488{
489 int depth = ext_depth(inode);
490 struct ext4_extent *ex;
491
492 if (depth != level) {
493 struct ext4_extent_idx *idx;
494 idx = path[level].p_idx;
495 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
496 ext_debug("%d: move %d:%llu in new index %llu\n", level,
497 le32_to_cpu(idx->ei_block),
498 ext4_idx_pblock(idx),
499 newblock);
500 idx++;
501 }
502
503 return;
504 }
505
506 ex = path[depth].p_ext;
507 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
508 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
509 le32_to_cpu(ex->ee_block),
510 ext4_ext_pblock(ex),
511 ext4_ext_is_uninitialized(ex),
512 ext4_ext_get_actual_len(ex),
513 newblock);
514 ex++;
515 }
516}
517
501#else 518#else
502#define ext4_ext_show_path(inode, path) 519#define ext4_ext_show_path(inode, path)
503#define ext4_ext_show_leaf(inode, path) 520#define ext4_ext_show_leaf(inode, path)
521#define ext4_ext_show_move(inode, path, newblock, level)
504#endif 522#endif
505 523
506void ext4_ext_drop_refs(struct ext4_ext_path *path) 524void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -545,7 +563,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
545 563
546 path->p_idx = l - 1; 564 path->p_idx = l - 1;
547 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 565 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
548 idx_pblock(path->p_idx)); 566 ext4_idx_pblock(path->p_idx));
549 567
550#ifdef CHECK_BINSEARCH 568#ifdef CHECK_BINSEARCH
551 { 569 {
@@ -614,7 +632,7 @@ ext4_ext_binsearch(struct inode *inode,
614 path->p_ext = l - 1; 632 path->p_ext = l - 1;
615 ext_debug(" -> %d:%llu:[%d]%d ", 633 ext_debug(" -> %d:%llu:[%d]%d ",
616 le32_to_cpu(path->p_ext->ee_block), 634 le32_to_cpu(path->p_ext->ee_block),
617 ext_pblock(path->p_ext), 635 ext4_ext_pblock(path->p_ext),
618 ext4_ext_is_uninitialized(path->p_ext), 636 ext4_ext_is_uninitialized(path->p_ext),
619 ext4_ext_get_actual_len(path->p_ext)); 637 ext4_ext_get_actual_len(path->p_ext));
620 638
@@ -682,7 +700,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
682 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 700 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
683 701
684 ext4_ext_binsearch_idx(inode, path + ppos, block); 702 ext4_ext_binsearch_idx(inode, path + ppos, block);
685 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 703 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
686 path[ppos].p_depth = i; 704 path[ppos].p_depth = i;
687 path[ppos].p_ext = NULL; 705 path[ppos].p_ext = NULL;
688 706
@@ -690,6 +708,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
690 if (unlikely(!bh)) 708 if (unlikely(!bh))
691 goto err; 709 goto err;
692 if (!bh_uptodate_or_lock(bh)) { 710 if (!bh_uptodate_or_lock(bh)) {
711 trace_ext4_ext_load_extent(inode, block,
712 path[ppos].p_block);
693 if (bh_submit_read(bh) < 0) { 713 if (bh_submit_read(bh) < 0) {
694 put_bh(bh); 714 put_bh(bh);
695 goto err; 715 goto err;
@@ -721,7 +741,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
721 ext4_ext_binsearch(inode, path + ppos, block); 741 ext4_ext_binsearch(inode, path + ppos, block);
722 /* if not an empty leaf */ 742 /* if not an empty leaf */
723 if (path[ppos].p_ext) 743 if (path[ppos].p_ext)
724 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 744 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
725 745
726 ext4_ext_show_path(inode, path); 746 ext4_ext_show_path(inode, path);
727 747
@@ -739,9 +759,9 @@ err:
739 * insert new index [@logical;@ptr] into the block at @curp; 759 * insert new index [@logical;@ptr] into the block at @curp;
740 * check where to insert: before @curp or after @curp 760 * check where to insert: before @curp or after @curp
741 */ 761 */
742int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 762static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
743 struct ext4_ext_path *curp, 763 struct ext4_ext_path *curp,
744 int logical, ext4_fsblk_t ptr) 764 int logical, ext4_fsblk_t ptr)
745{ 765{
746 struct ext4_extent_idx *ix; 766 struct ext4_extent_idx *ix;
747 int len, err; 767 int len, err;
@@ -814,14 +834,14 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
814 * - initializes subtree 834 * - initializes subtree
815 */ 835 */
816static int ext4_ext_split(handle_t *handle, struct inode *inode, 836static int ext4_ext_split(handle_t *handle, struct inode *inode,
817 struct ext4_ext_path *path, 837 unsigned int flags,
818 struct ext4_extent *newext, int at) 838 struct ext4_ext_path *path,
839 struct ext4_extent *newext, int at)
819{ 840{
820 struct buffer_head *bh = NULL; 841 struct buffer_head *bh = NULL;
821 int depth = ext_depth(inode); 842 int depth = ext_depth(inode);
822 struct ext4_extent_header *neh; 843 struct ext4_extent_header *neh;
823 struct ext4_extent_idx *fidx; 844 struct ext4_extent_idx *fidx;
824 struct ext4_extent *ex;
825 int i = at, k, m, a; 845 int i = at, k, m, a;
826 ext4_fsblk_t newblock, oldblock; 846 ext4_fsblk_t newblock, oldblock;
827 __le32 border; 847 __le32 border;
@@ -869,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
869 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 889 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
870 for (a = 0; a < depth - at; a++) { 890 for (a = 0; a < depth - at; a++) {
871 newblock = ext4_ext_new_meta_block(handle, inode, path, 891 newblock = ext4_ext_new_meta_block(handle, inode, path,
872 newext, &err); 892 newext, &err, flags);
873 if (newblock == 0) 893 if (newblock == 0)
874 goto cleanup; 894 goto cleanup;
875 ablocks[a] = newblock; 895 ablocks[a] = newblock;
@@ -898,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
898 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 918 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
899 neh->eh_magic = EXT4_EXT_MAGIC; 919 neh->eh_magic = EXT4_EXT_MAGIC;
900 neh->eh_depth = 0; 920 neh->eh_depth = 0;
901 ex = EXT_FIRST_EXTENT(neh);
902 921
903 /* move remainder of path[depth] to the new leaf */ 922 /* move remainder of path[depth] to the new leaf */
904 if (unlikely(path[depth].p_hdr->eh_entries != 923 if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -910,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
910 goto cleanup; 929 goto cleanup;
911 } 930 }
912 /* start copy from next extent */ 931 /* start copy from next extent */
913 /* TODO: we could do it by single memmove */ 932 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
914 m = 0; 933 ext4_ext_show_move(inode, path, newblock, depth);
915 path[depth].p_ext++;
916 while (path[depth].p_ext <=
917 EXT_MAX_EXTENT(path[depth].p_hdr)) {
918 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
919 le32_to_cpu(path[depth].p_ext->ee_block),
920 ext_pblock(path[depth].p_ext),
921 ext4_ext_is_uninitialized(path[depth].p_ext),
922 ext4_ext_get_actual_len(path[depth].p_ext),
923 newblock);
924 /*memmove(ex++, path[depth].p_ext++,
925 sizeof(struct ext4_extent));
926 neh->eh_entries++;*/
927 path[depth].p_ext++;
928 m++;
929 }
930 if (m) { 934 if (m) {
931 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); 935 struct ext4_extent *ex;
936 ex = EXT_FIRST_EXTENT(neh);
937 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
932 le16_add_cpu(&neh->eh_entries, m); 938 le16_add_cpu(&neh->eh_entries, m);
933 } 939 }
934 940
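The rewritten copy above boils down to: count the entries sitting after the current position by pointer subtraction, then move them with one memmove instead of a per-entry loop. A small generic model of that pattern (all names hypothetical):

#include <string.h>

struct entry { int payload; };

/* move every entry strictly after *cur, up to *last, into dst; returns count */
int move_tail(struct entry *dst, struct entry *cur, struct entry *last)
{
	int m = (int)(last - cur);	/* number of entries after cur */

	if (m > 0)
		memmove(dst, cur + 1, sizeof(*dst) * m);
	return m;
}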
@@ -990,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
990 996
991 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 997 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
992 i, newblock, le32_to_cpu(border), oldblock); 998 i, newblock, le32_to_cpu(border), oldblock);
993 /* copy indexes */
994 m = 0;
995 path[i].p_idx++;
996 999
997 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 /* move remainder of path[i] to the new index block */
998 EXT_MAX_INDEX(path[i].p_hdr));
999 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1001 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1000 EXT_LAST_INDEX(path[i].p_hdr))) { 1002 EXT_LAST_INDEX(path[i].p_hdr))) {
1001 EXT4_ERROR_INODE(inode, 1003 EXT4_ERROR_INODE(inode,
@@ -1004,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1004 err = -EIO; 1006 err = -EIO;
1005 goto cleanup; 1007 goto cleanup;
1006 } 1008 }
1007 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1009 /* start copy indexes */
1008 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1010 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1009 le32_to_cpu(path[i].p_idx->ei_block), 1011 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1010 idx_pblock(path[i].p_idx), 1012 EXT_MAX_INDEX(path[i].p_hdr));
1011 newblock); 1013 ext4_ext_show_move(inode, path, newblock, i);
1012 /*memmove(++fidx, path[i].p_idx++,
1013 sizeof(struct ext4_extent_idx));
1014 neh->eh_entries++;
1015 BUG_ON(neh->eh_entries > neh->eh_max);*/
1016 path[i].p_idx++;
1017 m++;
1018 }
1019 if (m) { 1014 if (m) {
1020 memmove(++fidx, path[i].p_idx - m, 1015 memmove(++fidx, path[i].p_idx,
1021 sizeof(struct ext4_extent_idx) * m); 1016 sizeof(struct ext4_extent_idx) * m);
1022 le16_add_cpu(&neh->eh_entries, m); 1017 le16_add_cpu(&neh->eh_entries, m);
1023 } 1018 }
@@ -1060,7 +1055,7 @@ cleanup:
1060 for (i = 0; i < depth; i++) { 1055 for (i = 0; i < depth; i++) {
1061 if (!ablocks[i]) 1056 if (!ablocks[i])
1062 continue; 1057 continue;
1063 ext4_free_blocks(handle, inode, 0, ablocks[i], 1, 1058 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1064 EXT4_FREE_BLOCKS_METADATA); 1059 EXT4_FREE_BLOCKS_METADATA);
1065 } 1060 }
1066 } 1061 }
@@ -1078,8 +1073,9 @@ cleanup:
1078 * just created block 1073 * just created block
1079 */ 1074 */
1080static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1075static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1081 struct ext4_ext_path *path, 1076 unsigned int flags,
1082 struct ext4_extent *newext) 1077 struct ext4_ext_path *path,
1078 struct ext4_extent *newext)
1083{ 1079{
1084 struct ext4_ext_path *curp = path; 1080 struct ext4_ext_path *curp = path;
1085 struct ext4_extent_header *neh; 1081 struct ext4_extent_header *neh;
@@ -1087,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1087 ext4_fsblk_t newblock; 1083 ext4_fsblk_t newblock;
1088 int err = 0; 1084 int err = 0;
1089 1085
1090 newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); 1086 newblock = ext4_ext_new_meta_block(handle, inode, path,
1087 newext, &err, flags);
1091 if (newblock == 0) 1088 if (newblock == 0)
1092 return err; 1089 return err;
1093 1090
@@ -1146,7 +1143,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1146 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1143 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1147 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1144 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1148 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1145 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149 idx_pblock(EXT_FIRST_INDEX(neh))); 1146 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1150 1147
1151 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1148 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1152 err = ext4_ext_dirty(handle, inode, curp); 1149 err = ext4_ext_dirty(handle, inode, curp);
@@ -1162,8 +1159,9 @@ out:
1162 * if no free index is found, then it requests in-depth growing. 1159 * if no free index is found, then it requests in-depth growing.
1163 */ 1160 */
1164static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1161static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1165 struct ext4_ext_path *path, 1162 unsigned int flags,
1166 struct ext4_extent *newext) 1163 struct ext4_ext_path *path,
1164 struct ext4_extent *newext)
1167{ 1165{
1168 struct ext4_ext_path *curp; 1166 struct ext4_ext_path *curp;
1169 int depth, i, err = 0; 1167 int depth, i, err = 0;
@@ -1183,7 +1181,7 @@ repeat:
1183 if (EXT_HAS_FREE_INDEX(curp)) { 1181 if (EXT_HAS_FREE_INDEX(curp)) {
1184 /* if we found index with free entry, then use that 1182 /* if we found index with free entry, then use that
1185 * entry: create all needed subtree and add new leaf */ 1183 * entry: create all needed subtree and add new leaf */
1186 err = ext4_ext_split(handle, inode, path, newext, i); 1184 err = ext4_ext_split(handle, inode, flags, path, newext, i);
1187 if (err) 1185 if (err)
1188 goto out; 1186 goto out;
1189 1187
@@ -1196,7 +1194,8 @@ repeat:
1196 err = PTR_ERR(path); 1194 err = PTR_ERR(path);
1197 } else { 1195 } else {
1198 /* tree is full, time to grow in depth */ 1196 /* tree is full, time to grow in depth */
1199 err = ext4_ext_grow_indepth(handle, inode, path, newext); 1197 err = ext4_ext_grow_indepth(handle, inode, flags,
1198 path, newext);
1200 if (err) 1199 if (err)
1201 goto out; 1200 goto out;
1202 1201
@@ -1232,9 +1231,9 @@ out:
1232 * returns 0 at @phys 1231 * returns 0 at @phys
1233 * return value contains 0 (success) or error code 1232 * return value contains 0 (success) or error code
1234 */ 1233 */
1235int 1234static int ext4_ext_search_left(struct inode *inode,
1236ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, 1235 struct ext4_ext_path *path,
1237 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1236 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1238{ 1237{
1239 struct ext4_extent_idx *ix; 1238 struct ext4_extent_idx *ix;
1240 struct ext4_extent *ex; 1239 struct ext4_extent *ex;
@@ -1286,7 +1285,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1286 } 1285 }
1287 1286
1288 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1287 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1289 *phys = ext_pblock(ex) + ee_len - 1; 1288 *phys = ext4_ext_pblock(ex) + ee_len - 1;
1290 return 0; 1289 return 0;
1291} 1290}
1292 1291
@@ -1297,9 +1296,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1297 * returns 0 at @phys 1296 * returns 0 at @phys
1298 * return value contains 0 (success) or error code 1297 * return value contains 0 (success) or error code
1299 */ 1298 */
1300int 1299static int ext4_ext_search_right(struct inode *inode,
1301ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, 1300 struct ext4_ext_path *path,
1302 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1301 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1303{ 1302{
1304 struct buffer_head *bh = NULL; 1303 struct buffer_head *bh = NULL;
1305 struct ext4_extent_header *eh; 1304 struct ext4_extent_header *eh;
@@ -1342,7 +1341,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1342 } 1341 }
1343 } 1342 }
1344 *logical = le32_to_cpu(ex->ee_block); 1343 *logical = le32_to_cpu(ex->ee_block);
1345 *phys = ext_pblock(ex); 1344 *phys = ext4_ext_pblock(ex);
1346 return 0; 1345 return 0;
1347 } 1346 }
1348 1347
@@ -1357,7 +1356,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1357 /* next allocated block in this leaf */ 1356 /* next allocated block in this leaf */
1358 ex++; 1357 ex++;
1359 *logical = le32_to_cpu(ex->ee_block); 1358 *logical = le32_to_cpu(ex->ee_block);
1360 *phys = ext_pblock(ex); 1359 *phys = ext4_ext_pblock(ex);
1361 return 0; 1360 return 0;
1362 } 1361 }
1363 1362
@@ -1376,7 +1375,7 @@ got_index:
1376 * follow it and find the closest allocated 1375 * follow it and find the closest allocated
1377 * block to the right */ 1376 * block to the right */
1378 ix++; 1377 ix++;
1379 block = idx_pblock(ix); 1378 block = ext4_idx_pblock(ix);
1380 while (++depth < path->p_depth) { 1379 while (++depth < path->p_depth) {
1381 bh = sb_bread(inode->i_sb, block); 1380 bh = sb_bread(inode->i_sb, block);
1382 if (bh == NULL) 1381 if (bh == NULL)
@@ -1388,7 +1387,7 @@ got_index:
1388 return -EIO; 1387 return -EIO;
1389 } 1388 }
1390 ix = EXT_FIRST_INDEX(eh); 1389 ix = EXT_FIRST_INDEX(eh);
1391 block = idx_pblock(ix); 1390 block = ext4_idx_pblock(ix);
1392 put_bh(bh); 1391 put_bh(bh);
1393 } 1392 }
1394 1393
@@ -1402,14 +1401,14 @@ got_index:
1402 } 1401 }
1403 ex = EXT_FIRST_EXTENT(eh); 1402 ex = EXT_FIRST_EXTENT(eh);
1404 *logical = le32_to_cpu(ex->ee_block); 1403 *logical = le32_to_cpu(ex->ee_block);
1405 *phys = ext_pblock(ex); 1404 *phys = ext4_ext_pblock(ex);
1406 put_bh(bh); 1405 put_bh(bh);
1407 return 0; 1406 return 0;
1408} 1407}
1409 1408
1410/* 1409/*
1411 * ext4_ext_next_allocated_block: 1410 * ext4_ext_next_allocated_block:
1412 * returns allocated block in subsequent extent or EXT_MAX_BLOCK. 1411 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1413 * NOTE: it considers block number from index entry as 1412 * NOTE: it considers block number from index entry as
1414 * allocated block. Thus, index entries have to be consistent 1413 * allocated block. Thus, index entries have to be consistent
1415 * with leaves. 1414 * with leaves.
@@ -1423,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1423 depth = path->p_depth; 1422 depth = path->p_depth;
1424 1423
1425 if (depth == 0 && path->p_ext == NULL) 1424 if (depth == 0 && path->p_ext == NULL)
1426 return EXT_MAX_BLOCK; 1425 return EXT_MAX_BLOCKS;
1427 1426
1428 while (depth >= 0) { 1427 while (depth >= 0) {
1429 if (depth == path->p_depth) { 1428 if (depth == path->p_depth) {
@@ -1440,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1440 depth--; 1439 depth--;
1441 } 1440 }
1442 1441
1443 return EXT_MAX_BLOCK; 1442 return EXT_MAX_BLOCKS;
1444} 1443}
1445 1444
1446/* 1445/*
1447 * ext4_ext_next_leaf_block: 1446 * ext4_ext_next_leaf_block:
1448 * returns first allocated block from next leaf or EXT_MAX_BLOCK 1447 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1449 */ 1448 */
1450static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, 1449static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1451 struct ext4_ext_path *path) 1450 struct ext4_ext_path *path)
@@ -1457,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1457 1456
1458 /* zero-tree has no leaf blocks at all */ 1457 /* zero-tree has no leaf blocks at all */
1459 if (depth == 0) 1458 if (depth == 0)
1460 return EXT_MAX_BLOCK; 1459 return EXT_MAX_BLOCKS;
1461 1460
1462 /* go to index block */ 1461 /* go to index block */
1463 depth--; 1462 depth--;
@@ -1470,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1470 depth--; 1469 depth--;
1471 } 1470 }
1472 1471
1473 return EXT_MAX_BLOCK; 1472 return EXT_MAX_BLOCKS;
1474} 1473}
1475 1474
1476/* 1475/*
@@ -1573,7 +1572,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1573 return 0; 1572 return 0;
1574#endif 1573#endif
1575 1574
1576 if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) 1575 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1577 return 1; 1576 return 1;
1578 return 0; 1577 return 0;
1579} 1578}
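In plain terms, two extents qualify for merging only when they are contiguous in both address spaces at once; the logical check happens earlier in the function and the line above supplies the physical one. A simplified model (the real code also caps the combined length and treats uninitialized extents specially):

#include <stdbool.h>
#include <stdint.h>

struct xext {		/* simplified stand-in for struct ext4_extent */
	uint32_t lblk;	/* first logical block */
	uint64_t pblk;	/* first physical block */
	uint16_t len;	/* number of blocks */
};

bool mergeable(const struct xext *a, const struct xext *b)
{
	return a->lblk + a->len == b->lblk &&	/* logically contiguous */
	       a->pblk + a->len == b->pblk;	/* physically contiguous */
}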
@@ -1585,9 +1584,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1585 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1584 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1586 * 1 if they got merged. 1585 * 1 if they got merged.
1587 */ 1586 */
1588int ext4_ext_try_to_merge(struct inode *inode, 1587static int ext4_ext_try_to_merge_right(struct inode *inode,
1589 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1590 struct ext4_extent *ex) 1589 struct ext4_extent *ex)
1591{ 1590{
1592 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1593 unsigned int depth, len; 1592 unsigned int depth, len;
@@ -1625,6 +1624,31 @@ int ext4_ext_try_to_merge(struct inode *inode,
1625} 1624}
1626 1625
1627/* 1626/*
1627 * This function tries to merge the @ex extent with its neighbours in the tree.
1628 * Returns 1 if it merged left, else 0.
1629 */
1630static int ext4_ext_try_to_merge(struct inode *inode,
1631 struct ext4_ext_path *path,
1632 struct ext4_extent *ex) {
1633 struct ext4_extent_header *eh;
1634 unsigned int depth;
1635 int merge_done = 0;
1636 int ret = 0;
1637
1638 depth = ext_depth(inode);
1639 BUG_ON(path[depth].p_hdr == NULL);
1640 eh = path[depth].p_hdr;
1641
1642 if (ex > EXT_FIRST_EXTENT(eh))
1643 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1644
1645 if (!merge_done)
1646 ret = ext4_ext_try_to_merge_right(inode, path, ex);
1647
1648 return ret;
1649}
1650
1651/*
1628 * check if a portion of the "newext" extent overlaps with an 1652 * check if a portion of the "newext" extent overlaps with an
1629 * existing extent. 1653 * existing extent.
1630 * 1654 *
@@ -1632,9 +1656,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1632 * such that there will be no overlap, and then returns 1. 1656 * such that there will be no overlap, and then returns 1.
1633 * If there is no overlap found, it returns 0. 1657 * If there is no overlap found, it returns 0.
1634 */ 1658 */
1635unsigned int ext4_ext_check_overlap(struct inode *inode, 1659static unsigned int ext4_ext_check_overlap(struct inode *inode,
1636 struct ext4_extent *newext, 1660 struct ext4_extent *newext,
1637 struct ext4_ext_path *path) 1661 struct ext4_ext_path *path)
1638{ 1662{
1639 ext4_lblk_t b1, b2; 1663 ext4_lblk_t b1, b2;
1640 unsigned int depth, len1; 1664 unsigned int depth, len1;
@@ -1653,13 +1677,13 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1653 */ 1677 */
1654 if (b2 < b1) { 1678 if (b2 < b1) {
1655 b2 = ext4_ext_next_allocated_block(path); 1679 b2 = ext4_ext_next_allocated_block(path);
1656 if (b2 == EXT_MAX_BLOCK) 1680 if (b2 == EXT_MAX_BLOCKS)
1657 goto out; 1681 goto out;
1658 } 1682 }
1659 1683
1660 /* check for wrap through zero on extent logical start block*/ 1684 /* check for wrap through zero on extent logical start block*/
1661 if (b1 + len1 < b1) { 1685 if (b1 + len1 < b1) {
1662 len1 = EXT_MAX_BLOCK - b1; 1686 len1 = EXT_MAX_BLOCKS - b1;
1663 newext->ee_len = cpu_to_le16(len1); 1687 newext->ee_len = cpu_to_le16(len1);
1664 ret = 1; 1688 ret = 1;
1665 } 1689 }
@@ -1690,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1690 int depth, len, err; 1714 int depth, len, err;
1691 ext4_lblk_t next; 1715 ext4_lblk_t next;
1692 unsigned uninitialized = 0; 1716 unsigned uninitialized = 0;
1717 int flags = 0;
1693 1718
1694 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1719 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1695 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1720 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1706,11 +1731,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1706 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1731 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1707 && ext4_can_extents_be_merged(inode, ex, newext)) { 1732 && ext4_can_extents_be_merged(inode, ex, newext)) {
1708 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1733 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1709 ext4_ext_is_uninitialized(newext), 1734 ext4_ext_is_uninitialized(newext),
1710 ext4_ext_get_actual_len(newext), 1735 ext4_ext_get_actual_len(newext),
1711 le32_to_cpu(ex->ee_block), 1736 le32_to_cpu(ex->ee_block),
1712 ext4_ext_is_uninitialized(ex), 1737 ext4_ext_is_uninitialized(ex),
1713 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1738 ext4_ext_get_actual_len(ex),
1739 ext4_ext_pblock(ex));
1714 err = ext4_ext_get_access(handle, inode, path + depth); 1740 err = ext4_ext_get_access(handle, inode, path + depth);
1715 if (err) 1741 if (err)
1716 return err; 1742 return err;
@@ -1741,7 +1767,7 @@ repeat:
1741 fex = EXT_LAST_EXTENT(eh); 1767 fex = EXT_LAST_EXTENT(eh);
1742 next = ext4_ext_next_leaf_block(inode, path); 1768 next = ext4_ext_next_leaf_block(inode, path);
1743 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) 1769 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1744 && next != EXT_MAX_BLOCK) { 1770 && next != EXT_MAX_BLOCKS) {
1745 ext_debug("next leaf block - %d\n", next); 1771 ext_debug("next leaf block - %d\n", next);
1746 BUG_ON(npath != NULL); 1772 BUG_ON(npath != NULL);
1747 npath = ext4_ext_find_extent(inode, next, NULL); 1773 npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1750,7 +1776,7 @@ repeat:
1750 BUG_ON(npath->p_depth != path->p_depth); 1776 BUG_ON(npath->p_depth != path->p_depth);
1751 eh = npath[depth].p_hdr; 1777 eh = npath[depth].p_hdr;
1752 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1778 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1753 ext_debug("next leaf isnt full(%d)\n", 1779 ext_debug("next leaf isn't full(%d)\n",
1754 le16_to_cpu(eh->eh_entries)); 1780 le16_to_cpu(eh->eh_entries));
1755 path = npath; 1781 path = npath;
1756 goto repeat; 1782 goto repeat;
@@ -1763,7 +1789,9 @@ repeat:
1763 * There is no free space in the found leaf. 1789 * There is no free space in the found leaf.
1764 * We're gonna add a new leaf in the tree. 1790 * We're gonna add a new leaf in the tree.
1765 */ 1791 */
1766 err = ext4_ext_create_new_leaf(handle, inode, path, newext); 1792 if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1793 flags = EXT4_MB_USE_ROOT_BLOCKS;
1794 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1767 if (err) 1795 if (err)
1768 goto cleanup; 1796 goto cleanup;
1769 depth = ext_depth(inode); 1797 depth = ext_depth(inode);
@@ -1780,7 +1808,7 @@ has_space:
1780 /* there is no extent in this leaf, create first one */ 1808 /* there is no extent in this leaf, create first one */
1781 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1809 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1782 le32_to_cpu(newext->ee_block), 1810 le32_to_cpu(newext->ee_block),
1783 ext_pblock(newext), 1811 ext4_ext_pblock(newext),
1784 ext4_ext_is_uninitialized(newext), 1812 ext4_ext_is_uninitialized(newext),
1785 ext4_ext_get_actual_len(newext)); 1813 ext4_ext_get_actual_len(newext));
1786 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1814 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1822,7 @@ has_space:
1794 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " 1822 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1795 "move %d from 0x%p to 0x%p\n", 1823 "move %d from 0x%p to 0x%p\n",
1796 le32_to_cpu(newext->ee_block), 1824 le32_to_cpu(newext->ee_block),
1797 ext_pblock(newext), 1825 ext4_ext_pblock(newext),
1798 ext4_ext_is_uninitialized(newext), 1826 ext4_ext_is_uninitialized(newext),
1799 ext4_ext_get_actual_len(newext), 1827 ext4_ext_get_actual_len(newext),
1800 nearex, len, nearex + 1, nearex + 2); 1828 nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1836,7 @@ has_space:
1808 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " 1836 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1809 "move %d from 0x%p to 0x%p\n", 1837 "move %d from 0x%p to 0x%p\n",
1810 le32_to_cpu(newext->ee_block), 1838 le32_to_cpu(newext->ee_block),
1811 ext_pblock(newext), 1839 ext4_ext_pblock(newext),
1812 ext4_ext_is_uninitialized(newext), 1840 ext4_ext_is_uninitialized(newext),
1813 ext4_ext_get_actual_len(newext), 1841 ext4_ext_get_actual_len(newext),
1814 nearex, len, nearex + 1, nearex + 2); 1842 nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1847,7 @@ has_space:
1819 le16_add_cpu(&eh->eh_entries, 1); 1847 le16_add_cpu(&eh->eh_entries, 1);
1820 nearex = path[depth].p_ext; 1848 nearex = path[depth].p_ext;
1821 nearex->ee_block = newext->ee_block; 1849 nearex->ee_block = newext->ee_block;
1822 ext4_ext_store_pblock(nearex, ext_pblock(newext)); 1850 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1823 nearex->ee_len = newext->ee_len; 1851 nearex->ee_len = newext->ee_len;
1824 1852
1825merge: 1853merge:
@@ -1845,9 +1873,9 @@ cleanup:
1845 return err; 1873 return err;
1846} 1874}
1847 1875
1848int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1876static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1849 ext4_lblk_t num, ext_prepare_callback func, 1877 ext4_lblk_t num, ext_prepare_callback func,
1850 void *cbdata) 1878 void *cbdata)
1851{ 1879{
1852 struct ext4_ext_path *path = NULL; 1880 struct ext4_ext_path *path = NULL;
1853 struct ext4_ext_cache cbex; 1881 struct ext4_ext_cache cbex;
@@ -1859,7 +1887,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1859 BUG_ON(func == NULL); 1887 BUG_ON(func == NULL);
1860 BUG_ON(inode == NULL); 1888 BUG_ON(inode == NULL);
1861 1889
1862 while (block < last && block != EXT_MAX_BLOCK) { 1890 while (block < last && block != EXT_MAX_BLOCKS) {
1863 num = last - block; 1891 num = last - block;
1864 /* find extent for this block */ 1892 /* find extent for this block */
1865 down_read(&EXT4_I(inode)->i_data_sem); 1893 down_read(&EXT4_I(inode)->i_data_sem);
@@ -1919,12 +1947,10 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1919 cbex.ec_block = start; 1947 cbex.ec_block = start;
1920 cbex.ec_len = end - start; 1948 cbex.ec_len = end - start;
1921 cbex.ec_start = 0; 1949 cbex.ec_start = 0;
1922 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1923 } else { 1950 } else {
1924 cbex.ec_block = le32_to_cpu(ex->ee_block); 1951 cbex.ec_block = le32_to_cpu(ex->ee_block);
1925 cbex.ec_len = ext4_ext_get_actual_len(ex); 1952 cbex.ec_len = ext4_ext_get_actual_len(ex);
1926 cbex.ec_start = ext_pblock(ex); 1953 cbex.ec_start = ext4_ext_pblock(ex);
1927 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1928 } 1954 }
1929 1955
1930 if (unlikely(cbex.ec_len == 0)) { 1956 if (unlikely(cbex.ec_len == 0)) {
@@ -1932,7 +1958,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1932 err = -EIO; 1958 err = -EIO;
1933 break; 1959 break;
1934 } 1960 }
1935 err = func(inode, path, &cbex, ex, cbdata); 1961 err = func(inode, next, &cbex, ex, cbdata);
1936 ext4_ext_drop_refs(path); 1962 ext4_ext_drop_refs(path);
1937 1963
1938 if (err < 0) 1964 if (err < 0)
@@ -1964,13 +1990,12 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1964 1990
1965static void 1991static void
1966ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1992ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1967 __u32 len, ext4_fsblk_t start, int type) 1993 __u32 len, ext4_fsblk_t start)
1968{ 1994{
1969 struct ext4_ext_cache *cex; 1995 struct ext4_ext_cache *cex;
1970 BUG_ON(len == 0); 1996 BUG_ON(len == 0);
1971 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1997 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1972 cex = &EXT4_I(inode)->i_cached_extent; 1998 cex = &EXT4_I(inode)->i_cached_extent;
1973 cex->ec_type = type;
1974 cex->ec_block = block; 1999 cex->ec_block = block;
1975 cex->ec_len = len; 2000 cex->ec_len = len;
1976 cex->ec_start = start; 2001 cex->ec_start = start;
@@ -1995,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1995 if (ex == NULL) { 2020 if (ex == NULL) {
1996 /* there is no extent yet, so gap is [0;-] */ 2021 /* there is no extent yet, so gap is [0;-] */
1997 lblock = 0; 2022 lblock = 0;
1998 len = EXT_MAX_BLOCK; 2023 len = EXT_MAX_BLOCKS;
1999 ext_debug("cache gap(whole file):"); 2024 ext_debug("cache gap(whole file):");
2000 } else if (block < le32_to_cpu(ex->ee_block)) { 2025 } else if (block < le32_to_cpu(ex->ee_block)) {
2001 lblock = block; 2026 lblock = block;
@@ -2023,43 +2048,90 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2023 } 2048 }
2024 2049
2025 ext_debug(" -> %u:%lu\n", lblock, len); 2050 ext_debug(" -> %u:%lu\n", lblock, len);
2026 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); 2051 ext4_ext_put_in_cache(inode, lblock, len, 0);
2027} 2052}
2028 2053
2029static int 2054/*
2030ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2055 * ext4_ext_in_cache()
2031 struct ext4_extent *ex) 2056 * Checks to see if the given block is in the cache.
2032{ 2057 * If it is, the cached extent is stored in the given
2058 * cache extent pointer. If the cached extent is a hole,
2059 * this routine should be used instead of
2060 * ext4_ext_in_cache if the calling function needs to
2061 * know the size of the hole.
2062 *
2063 * @inode: The file's inode
2064 * @block: The block to look for in the cache
2065 * @ex: Pointer where the cached extent will be stored
2066 * if it contains block
2067 *
2068 * Return 0 if cache is invalid; 1 if the cache is valid
2069 */
2070static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2071 struct ext4_ext_cache *ex) {
2033 struct ext4_ext_cache *cex; 2072 struct ext4_ext_cache *cex;
2034 int ret = EXT4_EXT_CACHE_NO; 2073 struct ext4_sb_info *sbi;
2074 int ret = 0;
2035 2075
2036 /* 2076 /*
2037 * We borrow i_block_reservation_lock to protect i_cached_extent 2077 * We borrow i_block_reservation_lock to protect i_cached_extent
2038 */ 2078 */
2039 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2079 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2040 cex = &EXT4_I(inode)->i_cached_extent; 2080 cex = &EXT4_I(inode)->i_cached_extent;
2081 sbi = EXT4_SB(inode->i_sb);
2041 2082
2042 /* has cache valid data? */ 2083 /* has cache valid data? */
2043 if (cex->ec_type == EXT4_EXT_CACHE_NO) 2084 if (cex->ec_len == 0)
2044 goto errout; 2085 goto errout;
2045 2086
2046 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2047 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2048 if (in_range(block, cex->ec_block, cex->ec_len)) { 2087 if (in_range(block, cex->ec_block, cex->ec_len)) {
2049 ex->ee_block = cpu_to_le32(cex->ec_block); 2088 memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2050 ext4_ext_store_pblock(ex, cex->ec_start);
2051 ex->ee_len = cpu_to_le16(cex->ec_len);
2052 ext_debug("%u cached by %u:%u:%llu\n", 2089 ext_debug("%u cached by %u:%u:%llu\n",
2053 block, 2090 block,
2054 cex->ec_block, cex->ec_len, cex->ec_start); 2091 cex->ec_block, cex->ec_len, cex->ec_start);
2055 ret = cex->ec_type; 2092 ret = 1;
2056 } 2093 }
2057errout: 2094errout:
2095 if (!ret)
2096 sbi->extent_cache_misses++;
2097 else
2098 sbi->extent_cache_hits++;
2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2099 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2059 return ret; 2100 return ret;
2060} 2101}
2061 2102
2062/* 2103/*
2104 * ext4_ext_in_cache()
2105 * Checks to see if the given block is in the cache.
2106 * If it is, the cached extent is stored in the given
2107 * extent pointer.
2108 *
 2109 * @inode: The file's inode
2110 * @block: The block to look for in the cache
2111 * @ex: Pointer where the cached extent will be stored
2112 * if it contains block
2113 *
2114 * Return 0 if cache is invalid; 1 if the cache is valid
2115 */
2116static int
2117ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2118 struct ext4_extent *ex)
2119{
2120 struct ext4_ext_cache cex;
2121 int ret = 0;
2122
2123 if (ext4_ext_check_cache(inode, block, &cex)) {
2124 ex->ee_block = cpu_to_le32(cex.ec_block);
2125 ext4_ext_store_pblock(ex, cex.ec_start);
2126 ex->ee_len = cpu_to_le16(cex.ec_len);
2127 ret = 1;
2128 }
2129
2130 return ret;
2131}
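/*
 * Illustrative sketch (hypothetical helper, not part of this patch):
 * a caller that needs the size of a cached hole uses
 * ext4_ext_check_cache() directly, since the ext4_extent-based
 * wrapper above cannot represent holes.
 */
static ext4_lblk_t sketch_cached_hole_len(struct inode *inode,
					  ext4_lblk_t lblk)
{
	struct ext4_ext_cache cex;

	if (!ext4_ext_check_cache(inode, lblk, &cex))
		return 0;	/* nothing cached for this block */
	if (cex.ec_start != 0)
		return 0;	/* cached entry is a real extent, not a hole */
	/* remaining length of the cached hole, starting at lblk */
	return cex.ec_len - (lblk - cex.ec_block);
}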
2132
2133
2134/*
2063 * ext4_ext_rm_idx: 2135 * ext4_ext_rm_idx:
2064 * removes index from the index block. 2136 * removes index from the index block.
2065 * It's used in truncate case only, thus all requests are for 2137 * It's used in truncate case only, thus all requests are for
@@ -2073,7 +2145,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2073 2145
2074 /* free index block */ 2146 /* free index block */
2075 path--; 2147 path--;
2076 leaf = idx_pblock(path->p_idx); 2148 leaf = ext4_idx_pblock(path->p_idx);
2077 if (unlikely(path->p_hdr->eh_entries == 0)) { 2149 if (unlikely(path->p_hdr->eh_entries == 0)) {
2078 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2150 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2079 return -EIO; 2151 return -EIO;
@@ -2086,7 +2158,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2086 if (err) 2158 if (err)
2087 return err; 2159 return err;
2088 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2160 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2089 ext4_free_blocks(handle, inode, 0, leaf, 1, 2161 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2090 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2162 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2091 return err; 2163 return err;
2092} 2164}
@@ -2181,13 +2253,21 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2181 ext4_fsblk_t start; 2253 ext4_fsblk_t start;
2182 2254
2183 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2255 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2184 start = ext_pblock(ex) + ee_len - num; 2256 start = ext4_ext_pblock(ex) + ee_len - num;
2185 ext_debug("free last %u blocks starting %llu\n", num, start); 2257 ext_debug("free last %u blocks starting %llu\n", num, start);
2186 ext4_free_blocks(handle, inode, 0, start, num, flags); 2258 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2187 } else if (from == le32_to_cpu(ex->ee_block) 2259 } else if (from == le32_to_cpu(ex->ee_block)
2188 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2260 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2189 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2261 /* head removal */
2190 from, to, le32_to_cpu(ex->ee_block), ee_len); 2262 ext4_lblk_t num;
2263 ext4_fsblk_t start;
2264
2265 num = to - from;
2266 start = ext4_ext_pblock(ex);
2267
2268 ext_debug("free first %u blocks starting %llu\n", num, start);
 2269 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2270
2191 } else { 2271 } else {
2192 printk(KERN_INFO "strange request: removal(2) " 2272 printk(KERN_INFO "strange request: removal(2) "
2193 "%u-%u from %u:%u\n", 2273 "%u-%u from %u:%u\n",
@@ -2196,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2196 return 0; 2276 return 0;
2197} 2277}
2198 2278
2279
2280/*
 2281 * ext4_ext_rm_leaf() removes the extents associated with the
 2282 * blocks appearing between "start" and "end", and splits the extents
 2283 * if "start" and "end" appear in the same extent
 2284 *
 2285 * @handle: The journal handle
 2286 * @inode: The file's inode
2287 * @path: The path to the leaf
2288 * @start: The first block to remove
2289 * @end: The last block to remove
2290 */
2199static int 2291static int
2200ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2292ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2201 struct ext4_ext_path *path, ext4_lblk_t start) 2293 struct ext4_ext_path *path, ext4_lblk_t start,
2294 ext4_lblk_t end)
2202{ 2295{
2203 int err = 0, correct_index = 0; 2296 int err = 0, correct_index = 0;
2204 int depth = ext_depth(inode), credits; 2297 int depth = ext_depth(inode), credits;
@@ -2209,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2209 unsigned short ex_ee_len; 2302 unsigned short ex_ee_len;
2210 unsigned uninitialized = 0; 2303 unsigned uninitialized = 0;
2211 struct ext4_extent *ex; 2304 struct ext4_extent *ex;
2305 struct ext4_map_blocks map;
2212 2306
2213 /* the header must be checked already in ext4_ext_remove_space() */ 2307 /* the header must be checked already in ext4_ext_remove_space() */
2214 ext_debug("truncate since %u in leaf\n", start); 2308 ext_debug("truncate since %u in leaf\n", start);
@@ -2238,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2238 path[depth].p_ext = ex; 2332 path[depth].p_ext = ex;
2239 2333
2240 a = ex_ee_block > start ? ex_ee_block : start; 2334 a = ex_ee_block > start ? ex_ee_block : start;
2241 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 2335 b = ex_ee_block+ex_ee_len - 1 < end ?
2242 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2336 ex_ee_block+ex_ee_len - 1 : end;
2243 2337
2244 ext_debug(" border %u:%u\n", a, b); 2338 ext_debug(" border %u:%u\n", a, b);
2245 2339
2246 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2340 /* If this extent is beyond the end of the hole, skip it */
2247 block = 0; 2341 if (end <= ex_ee_block) {
2248 num = 0; 2342 ex--;
2249 BUG(); 2343 ex_ee_block = le32_to_cpu(ex->ee_block);
2344 ex_ee_len = ext4_ext_get_actual_len(ex);
2345 continue;
2346 } else if (a != ex_ee_block &&
2347 b != ex_ee_block + ex_ee_len - 1) {
2348 /*
2349 * If this is a truncate, then this condition should
2350 * never happen because at least one of the end points
2351 * needs to be on the edge of the extent.
2352 */
2353 if (end == EXT_MAX_BLOCKS - 1) {
2354 ext_debug(" bad truncate %u:%u\n",
2355 start, end);
2356 block = 0;
2357 num = 0;
2358 err = -EIO;
2359 goto out;
2360 }
2361 /*
2362 * else this is a hole punch, so the extent needs to
2363 * be split since neither edge of the hole is on the
2364 * extent edge
2365 */
 2366 else {
2367 map.m_pblk = ext4_ext_pblock(ex);
2368 map.m_lblk = ex_ee_block;
2369 map.m_len = b - ex_ee_block;
2370
2371 err = ext4_split_extent(handle,
2372 inode, path, &map, 0,
2373 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2374 EXT4_GET_BLOCKS_PRE_IO);
2375
2376 if (err < 0)
2377 goto out;
2378
2379 ex_ee_len = ext4_ext_get_actual_len(ex);
2380
2381 b = ex_ee_block+ex_ee_len - 1 < end ?
2382 ex_ee_block+ex_ee_len - 1 : end;
2383
2384 /* Then remove tail of this extent */
2385 block = ex_ee_block;
2386 num = a - block;
2387 }
2250 } else if (a != ex_ee_block) { 2388 } else if (a != ex_ee_block) {
2251 /* remove tail of the extent */ 2389 /* remove tail of the extent */
2252 block = ex_ee_block; 2390 block = ex_ee_block;
2253 num = a - block; 2391 num = a - block;
2254 } else if (b != ex_ee_block + ex_ee_len - 1) { 2392 } else if (b != ex_ee_block + ex_ee_len - 1) {
2255 /* remove head of the extent */ 2393 /* remove head of the extent */
2256 block = a; 2394 block = b;
2257 num = b - a; 2395 num = ex_ee_block + ex_ee_len - b;
2258 /* there is no "make a hole" API yet */ 2396
2259 BUG(); 2397 /*
2398 * If this is a truncate, this condition
2399 * should never happen
2400 */
2401 if (end == EXT_MAX_BLOCKS - 1) {
2402 ext_debug(" bad truncate %u:%u\n",
2403 start, end);
2404 err = -EIO;
2405 goto out;
2406 }
2260 } else { 2407 } else {
2261 /* remove whole extent: excellent! */ 2408 /* remove whole extent: excellent! */
2262 block = ex_ee_block; 2409 block = ex_ee_block;
2263 num = 0; 2410 num = 0;
2264 BUG_ON(a != ex_ee_block); 2411 if (a != ex_ee_block) {
2265 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2412 ext_debug(" bad truncate %u:%u\n",
2413 start, end);
2414 err = -EIO;
2415 goto out;
2416 }
2417
2418 if (b != ex_ee_block + ex_ee_len - 1) {
2419 ext_debug(" bad truncate %u:%u\n",
2420 start, end);
2421 err = -EIO;
2422 goto out;
2423 }
2266 } 2424 }
2267 2425
2268 /* 2426 /*
@@ -2293,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2293 if (num == 0) { 2451 if (num == 0) {
2294 /* this extent is removed; mark slot entirely unused */ 2452 /* this extent is removed; mark slot entirely unused */
2295 ext4_ext_store_pblock(ex, 0); 2453 ext4_ext_store_pblock(ex, 0);
2296 le16_add_cpu(&eh->eh_entries, -1); 2454 } else if (block != ex_ee_block) {
2455 /*
2456 * If this was a head removal, then we need to update
2457 * the physical block since it is now at a different
2458 * location
2459 */
2460 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2297 } 2461 }
2298 2462
2299 ex->ee_block = cpu_to_le32(block); 2463 ex->ee_block = cpu_to_le32(block);
@@ -2309,8 +2473,29 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2309 if (err) 2473 if (err)
2310 goto out; 2474 goto out;
2311 2475
2476 /*
2477 * If the extent was completely released,
2478 * we need to remove it from the leaf
2479 */
2480 if (num == 0) {
2481 if (end != EXT_MAX_BLOCKS - 1) {
2482 /*
2483 * For hole punching, we need to scoot all the
2484 * extents up when an extent is removed so that
 2485 * we don't have blank extents in the middle
2486 */
2487 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2488 sizeof(struct ext4_extent));
2489
2490 /* Now get rid of the one at the end */
2491 memset(EXT_LAST_EXTENT(eh), 0,
2492 sizeof(struct ext4_extent));
2493 }
2494 le16_add_cpu(&eh->eh_entries, -1);
2495 }
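/*
 * Illustration (hypothetical leaf): with extents [A][B][C][D] and B
 * fully released by a hole punch, the memmove above shifts C and D
 * left, giving [A][C][D][D]; the memset clears the stale tail to
 * [A][C][D][0]; and le16_add_cpu() drops eh_entries from 4 to 3.
 */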
2496
2312 ext_debug("new extent: %u:%u:%llu\n", block, num, 2497 ext_debug("new extent: %u:%u:%llu\n", block, num,
2313 ext_pblock(ex)); 2498 ext4_ext_pblock(ex));
2314 ex--; 2499 ex--;
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2500 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2501 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2349,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2349 return 1; 2534 return 1;
2350} 2535}
2351 2536
2352static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2538 ext4_lblk_t end)
2353{ 2539{
2354 struct super_block *sb = inode->i_sb; 2540 struct super_block *sb = inode->i_sb;
2355 int depth = ext_depth(inode); 2541 int depth = ext_depth(inode);
@@ -2388,7 +2574,8 @@ again:
2388 while (i >= 0 && err == 0) { 2574 while (i >= 0 && err == 0) {
2389 if (i == depth) { 2575 if (i == depth) {
2390 /* this is leaf block */ 2576 /* this is leaf block */
2391 err = ext4_ext_rm_leaf(handle, inode, path, start); 2577 err = ext4_ext_rm_leaf(handle, inode, path,
2578 start, end);
2392 /* root level has p_bh == NULL, brelse() eats this */ 2579 /* root level has p_bh == NULL, brelse() eats this */
2393 brelse(path[i].p_bh); 2580 brelse(path[i].p_bh);
2394 path[i].p_bh = NULL; 2581 path[i].p_bh = NULL;
@@ -2421,9 +2608,9 @@ again:
2421 struct buffer_head *bh; 2608 struct buffer_head *bh;
2422 /* go to the next level */ 2609 /* go to the next level */
2423 ext_debug("move to level %d (block %llu)\n", 2610 ext_debug("move to level %d (block %llu)\n",
2424 i + 1, idx_pblock(path[i].p_idx)); 2611 i + 1, ext4_idx_pblock(path[i].p_idx));
2425 memset(path + i + 1, 0, sizeof(*path)); 2612 memset(path + i + 1, 0, sizeof(*path));
2426 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2613 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2427 if (!bh) { 2614 if (!bh) {
2428 /* should we reset i_size? */ 2615 /* should we reset i_size? */
2429 err = -EIO; 2616 err = -EIO;
@@ -2535,84 +2722,217 @@ void ext4_ext_release(struct super_block *sb)
2535#endif 2722#endif
2536} 2723}
2537 2724
2538static void bi_complete(struct bio *bio, int error)
2539{
2540 complete((struct completion *)bio->bi_private);
2541}
2542
2543/* FIXME!! we need to try to merge to left or right after zero-out */ 2725/* FIXME!! we need to try to merge to left or right after zero-out */
2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2726static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2545{ 2727{
2728 ext4_fsblk_t ee_pblock;
2729 unsigned int ee_len;
2546 int ret; 2730 int ret;
2547 struct bio *bio;
2548 int blkbits, blocksize;
2549 sector_t ee_pblock;
2550 struct completion event;
2551 unsigned int ee_len, len, done, offset;
2552 2731
2553
2554 blkbits = inode->i_blkbits;
2555 blocksize = inode->i_sb->s_blocksize;
2556 ee_len = ext4_ext_get_actual_len(ex); 2732 ee_len = ext4_ext_get_actual_len(ex);
2557 ee_pblock = ext_pblock(ex); 2733 ee_pblock = ext4_ext_pblock(ex);
2734
2735 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2736 if (ret > 0)
2737 ret = 0;
2738
2739 return ret;
2740}
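/*
 * Sketch of what the helper used above does (an assumption about the
 * generic block-layer API, shown only for illustration): it converts
 * filesystem blocks to 512-byte sectors and lets the block layer
 * zero them, replacing the hand-rolled bio loop this patch removes.
 */
static inline int sketch_zeroout(struct super_block *sb,
				 sector_t block, sector_t nr_blocks)
{
	return blkdev_issue_zeroout(sb->s_bdev,
			block << (sb->s_blocksize_bits - 9),
			nr_blocks << (sb->s_blocksize_bits - 9),
			GFP_NOFS);
}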
2741
2742/*
 2743 * Flags used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
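/*
 * Example (illustrative): a DIO write into an uninitialized extent
 * wants both halves of any split to stay uninitialized, and allows
 * falling back to zeroout if the split hits ENOSPC:
 *
 *	split_flag = EXT4_EXT_MAY_ZEROOUT |
 *		     EXT4_EXT_MARK_UNINIT1 |
 *		     EXT4_EXT_MARK_UNINIT2;
 */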
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
 2756 * @split: the logical block where the extent is split.
 2757 * @split_flag: indicates if the extent can be zeroed out when the split fails,
 2758 * and the states (init or uninit) of the new extents.
 2759 * @flags: flags used to insert the new extent into the extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
 2763 * of which are determined by split_flag.
2764 *
2765 * There are two cases:
 2766 * a> the extent is split into two extents.
 2767 * b> split is not needed, and the extent is just marked.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extents_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2558 2787
2559 /* convert ee_pblock to 512 byte sectors */ 2788 ext4_ext_show_leaf(inode, path);
2560 ee_pblock = ee_pblock << (blkbits - 9); 2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2561 2795
2562 while (ee_len > 0) { 2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2563 2797
2564 if (ee_len > BIO_MAX_PAGES) 2798 err = ext4_ext_get_access(handle, inode, path + depth);
2565 len = BIO_MAX_PAGES; 2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2566 else 2810 else
2567 len = ee_len; 2811 ext4_ext_mark_initialized(ex);
2568 2812
2569 bio = bio_alloc(GFP_NOIO, len); 2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2570 if (!bio) 2814 ext4_ext_try_to_merge(inode, path, ex);
2571 return -ENOMEM;
2572 2815
2573 bio->bi_sector = ee_pblock; 2816 err = ext4_ext_dirty(handle, inode, path + depth);
2574 bio->bi_bdev = inode->i_sb->s_bdev; 2817 goto out;
2818 }
2575 2819
2576 done = 0; 2820 /* case a */
2577 offset = 0; 2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2578 while (done < len) { 2822 ex->ee_len = cpu_to_le16(split - ee_block);
2579 ret = bio_add_page(bio, ZERO_PAGE(0), 2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2580 blocksize, offset); 2824 ext4_ext_mark_uninitialized(ex);
2581 if (ret != blocksize) {
2582 /*
2583 * We can't add any more pages because of
2584 * hardware limitations. Start a new bio.
2585 */
2586 break;
2587 }
2588 done++;
2589 offset += blocksize;
2590 if (offset >= PAGE_CACHE_SIZE)
2591 offset = 0;
2592 }
2593 2825
2594 init_completion(&event); 2826 /*
2595 bio->bi_private = &event; 2827 * path may lead to new leaf, not to original leaf any more
2596 bio->bi_end_io = bi_complete; 2828 * after ext4_ext_insert_extent() returns,
2597 submit_bio(WRITE, bio); 2829 */
2598 wait_for_completion(&event); 2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2599 2833
2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2834 ex2 = &newex;
2601 bio_put(bio); 2835 ex2->ee_block = cpu_to_le32(split);
2602 return -EIO; 2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2603 } 2837 ext4_ext_store_pblock(ex2, newblock);
2604 bio_put(bio); 2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2605 ee_len -= done; 2839 ext4_ext_mark_uninitialized(ex2);
2606 ee_pblock += done << (blkbits - 9); 2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
 2847 ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
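/*
 * Usage sketch (hypothetical values): given an uninitialized extent
 * covering logical blocks [100, 199], the call
 *
 *	ext4_split_extent_at(handle, inode, path, 150,
 *			EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNINIT2, 0);
 *
 * yields an initialized extent [100, 149] and an uninitialized one
 * [150, 199]; if inserting the second extent hits ENOSPC, the whole
 * original extent is zeroed out and marked initialized instead.
 */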
2863
2864/*
 2865 * ext4_split_extent() splits an extent and marks the extent covered
 2866 * by @map as split_flag indicates
2867 *
 2868 * It may result in splitting the extent into multiple extents (up to three)
2869 * There are three possibilities:
2870 * a> There is no split required
2871 * b> Splits in two extents: Split is happening at either end of the extent
 2872 * c> Splits in three extents: Someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2607 } 2906 }
2608 return 0; 2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2609} 2929}
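/*
 * Worked example (hypothetical values): extent [100, 199] with
 * map->m_lblk = 120 and map->m_len = 30.  Since 150 < 200, the first
 * ext4_split_extent_at() splits at 150, giving [100, 149] and
 * [150, 199].  The path is re-looked-up at block 120, and since
 * 120 > 100 the second call splits at 120, leaving three extents:
 * [100, 119], [120, 149] and [150, 199].
 */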
2610 2930
2611#define EXT4_EXT_ZERO_LEN 7 2931#define EXT4_EXT_ZERO_LEN 7
2612/* 2932/*
2613 * This function is called by ext4_ext_map_blocks() if someone tries to write 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2934 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (upto three - one initialized and two 2935 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2936 * uninitialized).
2617 * There are three possibilities: 2937 * There are three possibilities:
2618 * a> There is no split required: Entire extent should be initialized 2938 * a> There is no split required: Entire extent should be initialized
@@ -2624,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2624 struct ext4_map_blocks *map, 2944 struct ext4_map_blocks *map,
2625 struct ext4_ext_path *path) 2945 struct ext4_ext_path *path)
2626{ 2946{
2627 struct ext4_extent *ex, newex, orig_ex; 2947 struct ext4_map_blocks split_map;
2628 struct ext4_extent *ex1 = NULL; 2948 struct ext4_extent zero_ex;
2629 struct ext4_extent *ex2 = NULL; 2949 struct ext4_extent *ex;
2630 struct ext4_extent *ex3 = NULL;
2631 struct ext4_extent_header *eh;
2632 ext4_lblk_t ee_block, eof_block; 2950 ext4_lblk_t ee_block, eof_block;
2633 unsigned int allocated, ee_len, depth; 2951 unsigned int allocated, ee_len, depth;
2634 ext4_fsblk_t newblock;
2635 int err = 0; 2952 int err = 0;
2636 int ret = 0; 2953 int split_flag = 0;
2637 int may_zeroout;
2638 2954
2639 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2640 "block %llu, max_blocks %u\n", inode->i_ino, 2956 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2646,279 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2646 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2647 2963
2648 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2649 eh = path[depth].p_hdr;
2650 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2651 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2652 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2653 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2654 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2655
2656 ex2 = ex;
2657 orig_ex.ee_block = ex->ee_block;
2658 orig_ex.ee_len = cpu_to_le16(ee_len);
2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2660 2969
2970 WARN_ON(map->m_lblk < ee_block);
2661 /* 2971 /*
2662 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
 2663 * zeroout only if extent is fully inside i_size or new_size. 2973 * zeroout only if extent is fully inside i_size or new_size.
2664 */ 2974 */
2665 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2666 2976
2667 err = ext4_ext_get_access(handle, inode, path + depth);
2668 if (err)
2669 goto out;
 2670 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */ 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */
2671 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2672 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2673 if (err) 2981 if (err)
2674 goto fix_extent_len;
2675 /* update the extent length and mark as initialized */
2676 ex->ee_block = orig_ex.ee_block;
2677 ex->ee_len = orig_ex.ee_len;
2678 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2679 ext4_ext_dirty(handle, inode, path + depth);
2680 /* zeroed the full extent */
2681 return allocated;
2682 }
2683
2684 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2685 if (map->m_lblk > ee_block) {
2686 ex1 = ex;
2687 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2688 ext4_ext_mark_uninitialized(ex1);
2689 ex2 = &newex;
2690 }
2691 /*
2692 * for sanity, update the length of the ex2 extent before
2693 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2694 * overlap of blocks.
2695 */
2696 if (!ex1 && allocated > map->m_len)
2697 ex2->ee_len = cpu_to_le16(map->m_len);
2698 /* ex3: to ee_block + ee_len : uninitialised */
2699 if (allocated > map->m_len) {
2700 unsigned int newdepth;
2701 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2702 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2703 /*
2704 * map->m_lblk == ee_block is handled by the zerouout
2705 * at the beginning.
2706 * Mark first half uninitialized.
2707 * Mark second half initialized and zero out the
2708 * initialized extent
2709 */
2710 ex->ee_block = orig_ex.ee_block;
2711 ex->ee_len = cpu_to_le16(ee_len - allocated);
2712 ext4_ext_mark_uninitialized(ex);
2713 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2714 ext4_ext_dirty(handle, inode, path + depth);
2715
2716 ex3 = &newex;
2717 ex3->ee_block = cpu_to_le32(map->m_lblk);
2718 ext4_ext_store_pblock(ex3, newblock);
2719 ex3->ee_len = cpu_to_le16(allocated);
2720 err = ext4_ext_insert_extent(handle, inode, path,
2721 ex3, 0);
2722 if (err == -ENOSPC) {
2723 err = ext4_ext_zeroout(inode, &orig_ex);
2724 if (err)
2725 goto fix_extent_len;
2726 ex->ee_block = orig_ex.ee_block;
2727 ex->ee_len = orig_ex.ee_len;
2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2729 ext4_ext_dirty(handle, inode, path + depth);
2730 /* blocks available from map->m_lblk */
2731 return allocated;
2732
2733 } else if (err)
2734 goto fix_extent_len;
2735
2736 /*
2737 * We need to zero out the second half because
2738 * an fallocate request can update file size and
2739 * converting the second half to initialized extent
2740 * implies that we can leak some junk data to user
2741 * space.
2742 */
2743 err = ext4_ext_zeroout(inode, ex3);
2744 if (err) {
2745 /*
2746 * We should actually mark the
2747 * second half as uninit and return error
2748 * Insert would have changed the extent
2749 */
2750 depth = ext_depth(inode);
2751 ext4_ext_drop_refs(path);
2752 path = ext4_ext_find_extent(inode, map->m_lblk,
2753 path);
2754 if (IS_ERR(path)) {
2755 err = PTR_ERR(path);
2756 return err;
2757 }
2758 /* get the second half extent details */
2759 ex = path[depth].p_ext;
2760 err = ext4_ext_get_access(handle, inode,
2761 path + depth);
2762 if (err)
2763 return err;
2764 ext4_ext_mark_uninitialized(ex);
2765 ext4_ext_dirty(handle, inode, path + depth);
2766 return err;
2767 }
2768
2769 /* zeroed the second half */
2770 return allocated;
2771 }
2772 ex3 = &newex;
2773 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2774 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2775 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2776 ext4_ext_mark_uninitialized(ex3);
2777 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2778 if (err == -ENOSPC && may_zeroout) {
2779 err = ext4_ext_zeroout(inode, &orig_ex);
2780 if (err)
2781 goto fix_extent_len;
2782 /* update the extent length and mark as initialized */
2783 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2786 ext4_ext_dirty(handle, inode, path + depth);
2787 /* zeroed the full extent */
2788 /* blocks available from map->m_lblk */
2789 return allocated;
2790
2791 } else if (err)
2792 goto fix_extent_len;
2793 /*
2794 * The depth, and hence eh & ex might change
2795 * as part of the insert above.
2796 */
2797 newdepth = ext_depth(inode);
2798 /*
2799 * update the extent length after successful insert of the
2800 * split extent
2801 */
2802 ee_len -= ext4_ext_get_actual_len(ex3);
2803 orig_ex.ee_len = cpu_to_le16(ee_len);
2804 may_zeroout = ee_block + ee_len <= eof_block;
2805
2806 depth = newdepth;
2807 ext4_ext_drop_refs(path);
2808 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2809 if (IS_ERR(path)) {
2810 err = PTR_ERR(path);
2811 goto out; 2982 goto out;
2812 }
2813 eh = path[depth].p_hdr;
2814 ex = path[depth].p_ext;
2815 if (ex2 != &newex)
2816 ex2 = ex;
2817 2983
2818 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2819 if (err) 2985 if (err)
2820 goto out; 2986 goto out;
2821 2987 ext4_ext_mark_initialized(ex);
2822 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2823 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2824 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2825 * to insert a extent in the middle zerout directly
2826 * otherwise give the extent a chance to merge to left
2827 */
2828 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2829 map->m_lblk != ee_block && may_zeroout) {
2830 err = ext4_ext_zeroout(inode, &orig_ex);
2831 if (err)
2832 goto fix_extent_len;
2833 /* update the extent length and mark as initialized */
2834 ex->ee_block = orig_ex.ee_block;
2835 ex->ee_len = orig_ex.ee_len;
2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2837 ext4_ext_dirty(handle, inode, path + depth);
2838 /* zero out the first half */
2839 /* blocks available from map->m_lblk */
2840 return allocated;
2841 }
2842 }
2843 /*
2844 * If there was a change of depth as part of the
2845 * insertion of ex3 above, we need to update the length
2846 * of the ex1 extent again here
2847 */
2848 if (ex1 && ex1 != ex) {
2849 ex1 = ex;
2850 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2851 ext4_ext_mark_uninitialized(ex1);
2852 ex2 = &newex;
2853 }
2854 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2855 ex2->ee_block = cpu_to_le32(map->m_lblk);
2856 ext4_ext_store_pblock(ex2, newblock);
2857 ex2->ee_len = cpu_to_le16(allocated);
2858 if (ex2 != ex)
2859 goto insert;
2860 /*
2861 * New (initialized) extent starts from the first block
2862 * in the current extent. i.e., ex2 == ex
2863 * We have to see if it can be merged with the extent
2864 * on the left.
2865 */
2866 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2867 /*
2868 * To merge left, pass "ex2 - 1" to try_to_merge(),
2869 * since it merges towards right _only_.
2870 */
2871 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2872 if (ret) {
2873 err = ext4_ext_correct_indexes(handle, inode, path);
2874 if (err)
2875 goto out;
2876 depth = ext_depth(inode);
2877 ex2--;
2878 }
2879 } 2991 }
2992
2880 /* 2993 /*
2881 * Try to Merge towards right. This might be required 2994 * four cases:
2882 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2883 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
 2998 * 4. split the extent into two extents without zeroout.
2884 */ 2999 */
2885 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2886 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2887 if (ret) { 3002
2888 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2889 if (err) 3013 if (err)
2890 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2891 } 3035 }
2892 } 3036 }
2893 /* Mark modified extent as dirty */ 3037
2894 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2895 goto out; 3039 &split_map, split_flag, 0);
2896insert: 3040 if (allocated < 0)
2897 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2898 if (err == -ENOSPC && may_zeroout) { 3042
2899 err = ext4_ext_zeroout(inode, &orig_ex);
2900 if (err)
2901 goto fix_extent_len;
2902 /* update the extent length and mark as initialized */
2903 ex->ee_block = orig_ex.ee_block;
2904 ex->ee_len = orig_ex.ee_len;
2905 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2906 ext4_ext_dirty(handle, inode, path + depth);
2907 /* zero out the first half */
2908 return allocated;
2909 } else if (err)
2910 goto fix_extent_len;
2911out: 3043out:
2912 ext4_ext_show_leaf(inode, path);
2913 return err ? err : allocated; 3044 return err ? err : allocated;
2914
2915fix_extent_len:
2916 ex->ee_block = orig_ex.ee_block;
2917 ex->ee_len = orig_ex.ee_len;
2918 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2919 ext4_ext_mark_uninitialized(ex);
2920 ext4_ext_dirty(handle, inode, path + depth);
2921 return err;
2922} 3045}
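/*
 * Worked example for case 2 above (hypothetical values):
 * uninitialized extent [100, 199], write with map->m_lblk = 102 and
 * map->m_len = 3.  Then allocated = 98 > 3, and 102 - 100 + 3 = 5 is
 * below EXT4_EXT_ZERO_LEN, so the head [100, 101] is zeroed out,
 * split_map grows to cover [100, 104], and the split leaves an
 * initialized extent [100, 104] plus an uninitialized [105, 199].
 */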
2923 3046
2924/* 3047/*
@@ -2926,15 +3049,15 @@ fix_extent_len:
2926 * ext4_get_blocks_dio_write() when DIO to write 3049 * ext4_get_blocks_dio_write() when DIO to write
2927 * to an uninitialized extent. 3050 * to an uninitialized extent.
2928 * 3051 *
2929 * Writing to an uninitized extent may result in splitting the uninitialized 3052 * Writing to an uninitialized extent may result in splitting the uninitialized
2930 * extent into multiple /intialized unintialized extents (up to three) 3053 * extent into multiple /initialized uninitialized extents (up to three)
2931 * There are three possibilities: 3054 * There are three possibilities:
2932 * a> There is no split required: Entire extent should be uninitialized 3055 * a> There is no split required: Entire extent should be uninitialized
2933 * b> Splits in two extents: Write is happening at either end of the extent 3056 * b> Splits in two extents: Write is happening at either end of the extent
 2934 * c> Splits in three extents: Someone is writing in the middle of the extent 3057 * c> Splits in three extents: Someone is writing in the middle of the extent
2935 * 3058 *
 2936 * One or more index blocks may be needed if the extent tree grows after 3059 * One or more index blocks may be needed if the extent tree grows after
 2937 * the unintialized extent split. To prevent ENOSPC from occurring when the 3060 * the uninitialized extent split. To prevent ENOSPC from occurring when the
 2938 * IO completes, we need to split the uninitialized extent before DIO submits 3061 * IO completes, we need to split the uninitialized extent before DIO submits
 2939 * the IO. The uninitialized extent handled at this time will be split 3062 * the IO. The uninitialized extent handled at this time will be split
 2940 * into (at most) three uninitialized extents. After the IO completes, the part 3063 * into (at most) three uninitialized extents. After the IO completes, the part
@@ -2949,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2949 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2950 int flags) 3073 int flags)
2951{ 3074{
2952 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2953 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2954 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2955 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2956 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2957 unsigned int allocated, ee_len, depth;
2958 ext4_fsblk_t newblock;
2959 int err = 0;
2960 int may_zeroout;
2961 3080
2962 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2963 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2967,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2967 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2968 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2969 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2970
2971 depth = ext_depth(inode);
2972 ex = path[depth].p_ext;
2973 ee_block = le32_to_cpu(ex->ee_block);
2974 ee_len = ext4_ext_get_actual_len(ex);
2975 allocated = ee_len - (map->m_lblk - ee_block);
2976 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2977
2978 ex2 = ex;
2979 orig_ex.ee_block = ex->ee_block;
2980 orig_ex.ee_len = cpu_to_le16(ee_len);
2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2982
2983 /* 3089 /*
2984 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
 2985 * zeroout only if extent is fully inside i_size or new_size. 3091 * zeroout only if extent is fully inside i_size or new_size.
2986 */ 3092 */
2987 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2988 3094 ex = path[depth].p_ext;
2989 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2990 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2991 * block where the write begins, and the write completely
2992 * covers the extent, then we don't need to split it.
2993 */
2994 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2995 return allocated;
2996
2997 err = ext4_ext_get_access(handle, inode, path + depth);
2998 if (err)
2999 goto out;
3000 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
3001 if (map->m_lblk > ee_block) {
3002 ex1 = ex;
3003 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3004 ext4_ext_mark_uninitialized(ex1);
3005 ex2 = &newex;
3006 }
3007 /*
3008 * for sanity, update the length of the ex2 extent before
3009 * we insert ex3, if ex1 is NULL. This is to avoid temporary
3010 * overlap of blocks.
3011 */
3012 if (!ex1 && allocated > map->m_len)
3013 ex2->ee_len = cpu_to_le16(map->m_len);
3014 /* ex3: to ee_block + ee_len : uninitialised */
3015 if (allocated > map->m_len) {
3016 unsigned int newdepth;
3017 ex3 = &newex;
3018 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
3019 ext4_ext_store_pblock(ex3, newblock + map->m_len);
3020 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
3021 ext4_ext_mark_uninitialized(ex3);
3022 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
3023 if (err == -ENOSPC && may_zeroout) {
3024 err = ext4_ext_zeroout(inode, &orig_ex);
3025 if (err)
3026 goto fix_extent_len;
3027 /* update the extent length and mark as initialized */
3028 ex->ee_block = orig_ex.ee_block;
3029 ex->ee_len = orig_ex.ee_len;
3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3031 ext4_ext_dirty(handle, inode, path + depth);
3032 /* zeroed the full extent */
3033 /* blocks available from map->m_lblk */
3034 return allocated;
3035
3036 } else if (err)
3037 goto fix_extent_len;
3038 /*
3039 * The depth, and hence eh & ex might change
3040 * as part of the insert above.
3041 */
3042 newdepth = ext_depth(inode);
3043 /*
3044 * update the extent length after successful insert of the
3045 * split extent
3046 */
3047 ee_len -= ext4_ext_get_actual_len(ex3);
3048 orig_ex.ee_len = cpu_to_le16(ee_len);
3049 may_zeroout = ee_block + ee_len <= eof_block;
3050
3051 depth = newdepth;
3052 ext4_ext_drop_refs(path);
3053 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3054 if (IS_ERR(path)) {
3055 err = PTR_ERR(path);
3056 goto out;
3057 }
3058 ex = path[depth].p_ext;
3059 if (ex2 != &newex)
3060 ex2 = ex;
3061
3062 err = ext4_ext_get_access(handle, inode, path + depth);
3063 if (err)
3064 goto out;
3065 3097
3066 allocated = map->m_len; 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3067 } 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
3068 /*
3069 * If there was a change of depth as part of the
3070 * insertion of ex3 above, we need to update the length
3071 * of the ex1 extent again here
3072 */
3073 if (ex1 && ex1 != ex) {
3074 ex1 = ex;
3075 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3076 ext4_ext_mark_uninitialized(ex1);
3077 ex2 = &newex;
3078 }
3079 /*
3080 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3081 * using direct I/O, uninitialised still.
3082 */
3083 ex2->ee_block = cpu_to_le32(map->m_lblk);
3084 ext4_ext_store_pblock(ex2, newblock);
3085 ex2->ee_len = cpu_to_le16(allocated);
3086 ext4_ext_mark_uninitialized(ex2);
3087 if (ex2 != ex)
3088 goto insert;
3089 /* Mark modified extent as dirty */
3090 err = ext4_ext_dirty(handle, inode, path + depth);
3091 ext_debug("out here\n");
3092 goto out;
3093insert:
3094 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3095 if (err == -ENOSPC && may_zeroout) {
3096 err = ext4_ext_zeroout(inode, &orig_ex);
3097 if (err)
3098 goto fix_extent_len;
3099 /* update the extent length and mark as initialized */
3100 ex->ee_block = orig_ex.ee_block;
3101 ex->ee_len = orig_ex.ee_len;
3102 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3103 ext4_ext_dirty(handle, inode, path + depth);
3104 /* zero out the first half */
3105 return allocated;
3106 } else if (err)
3107 goto fix_extent_len;
3108out:
3109 ext4_ext_show_leaf(inode, path);
3110 return err ? err : allocated;
3111 3100
3112fix_extent_len: 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
3113 ex->ee_block = orig_ex.ee_block; 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3114 ex->ee_len = orig_ex.ee_len;
3115 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3116 ext4_ext_mark_uninitialized(ex);
3117 ext4_ext_dirty(handle, inode, path + depth);
3118 return err;
3119} 3103}
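/*
 * Note (sketch): the manual three-way split removed by this hunk now
 * reduces to a single ext4_split_extent() call of the form
 *
 *	split_flag = EXT4_EXT_MARK_UNINIT2 |
 *		     (zeroout_safe ? EXT4_EXT_MAY_ZEROOUT : 0);
 *	ext4_split_extent(handle, inode, path, map, split_flag,
 *			  flags | EXT4_GET_BLOCKS_PRE_IO);
 *
 * where "zeroout_safe" is shorthand (not a real variable) for
 * ee_block + ee_len <= eof_block.
 */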
3104
3120static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3121 struct inode *inode, 3106 struct inode *inode,
3122 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3125,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3125 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3126 int depth; 3111 int depth;
3127 int err = 0; 3112 int err = 0;
3128 int ret = 0;
3129 3113
3130 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3131 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3132 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3133 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3134 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3135 if (err) 3124 if (err)
3136 goto out; 3125 goto out;
3137 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3138 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3139 3128
3140 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3141 * We have to see if it can be merged with the extent 3130 * borders are not changed
3142 * on the left.
3143 */
3144 if (ex > EXT_FIRST_EXTENT(eh)) {
3145 /*
3146 * To merge left, pass "ex - 1" to try_to_merge(),
3147 * since it merges towards right _only_.
3148 */
3149 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3150 if (ret) {
3151 err = ext4_ext_correct_indexes(handle, inode, path);
3152 if (err)
3153 goto out;
3154 depth = ext_depth(inode);
3155 ex--;
3156 }
3157 }
3158 /*
3159 * Try to Merge towards right.
3160 */ 3131 */
3161 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3162 if (ret) { 3133
3163 err = ext4_ext_correct_indexes(handle, inode, path);
3164 if (err)
3165 goto out;
3166 depth = ext_depth(inode);
3167 }
3168 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3169 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3170out: 3136out:
@@ -3180,6 +3146,56 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3180 unmap_underlying_metadata(bdev, block + i); 3146 unmap_underlying_metadata(bdev, block + i);
3181} 3147}
3182 3148
3149/*
3150 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3151 */
3152static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3153 ext4_lblk_t lblk,
3154 struct ext4_ext_path *path,
3155 unsigned int len)
3156{
3157 int i, depth;
3158 struct ext4_extent_header *eh;
3159 struct ext4_extent *last_ex;
3160
3161 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3162 return 0;
3163
3164 depth = ext_depth(inode);
3165 eh = path[depth].p_hdr;
3166
3167 if (unlikely(!eh->eh_entries)) {
3168 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3169 "EOFBLOCKS_FL set");
3170 return -EIO;
3171 }
3172 last_ex = EXT_LAST_EXTENT(eh);
3173 /*
3174 * We should clear the EOFBLOCKS_FL flag if we are writing the
3175 * last block in the last extent in the file. We test this by
3176 * first checking to see if the caller to
3177 * ext4_ext_get_blocks() was interested in the last block (or
3178 * a block beyond the last block) in the current extent. If
3179 * this turns out to be false, we can bail out from this
3180 * function immediately.
3181 */
3182 if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3183 ext4_ext_get_actual_len(last_ex))
3184 return 0;
3185 /*
3186 * If the caller does appear to be planning to write at or
3187 * beyond the end of the current extent, we then test to see
3188 * if the current extent is the last extent in the file, by
3189 * checking to make sure it was reached via the rightmost node
3190 * at each level of the tree.
3191 */
3192 for (i = depth-1; i >= 0; i--)
3193 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3194 return 0;
3195 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3196 return ext4_mark_inode_dirty(handle, inode);
3197}
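/*
 * Sketch of the rightmost-path test above, for a hypothetical tree of
 * depth 2: EOFBLOCKS_FL is cleared only when
 *
 *	path[0].p_idx == EXT_LAST_INDEX(path[0].p_hdr) &&
 *	path[1].p_idx == EXT_LAST_INDEX(path[1].p_hdr)
 *
 * i.e. the leaf holding last_ex was reached through the last index at
 * every interior level, so last_ex really is the file's last extent.
 */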
3198
3183static int 3199static int
3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3200ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3185 struct ext4_map_blocks *map, 3201 struct ext4_map_blocks *map,
@@ -3202,12 +3218,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3202 path, flags); 3218 path, flags);
3203 /* 3219 /*
3204 * Flag the inode(non aio case) or end_io struct (aio case) 3220 * Flag the inode(non aio case) or end_io struct (aio case)
 3205 * that this IO needs to convertion to written when IO is 3221 * that this IO needs conversion to written when IO is
3206 * completed 3222 * completed
3207 */ 3223 */
3208 if (io) 3224 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3209 io->flag = EXT4_IO_UNWRITTEN; 3225 io->flag = EXT4_IO_END_UNWRITTEN;
3210 else 3226 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3227 } else
3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3228 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3212 if (ext4_should_dioread_nolock(inode)) 3229 if (ext4_should_dioread_nolock(inode))
3213 map->m_flags |= EXT4_MAP_UNINIT; 3230 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3217,8 +3234,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3217 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3234 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3218 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3235 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3219 path); 3236 path);
3220 if (ret >= 0) 3237 if (ret >= 0) {
3221 ext4_update_inode_fsync_trans(handle, inode, 1); 3238 ext4_update_inode_fsync_trans(handle, inode, 1);
3239 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3240 path, map->m_len);
3241 } else
3242 err = ret;
3222 goto out2; 3243 goto out2;
3223 } 3244 }
3224 /* buffered IO case */ 3245 /* buffered IO case */
@@ -3244,8 +3265,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3244 3265
3245 /* buffered write, writepage time, convert*/ 3266 /* buffered write, writepage time, convert*/
3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3267 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3247 if (ret >= 0) 3268 if (ret >= 0) {
3248 ext4_update_inode_fsync_trans(handle, inode, 1); 3269 ext4_update_inode_fsync_trans(handle, inode, 1);
3270 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3271 map->m_len);
3272 if (err < 0)
3273 goto out2;
3274 }
3275
3249out: 3276out:
3250 if (ret <= 0) { 3277 if (ret <= 0) {
3251 err = ret; 3278 err = ret;
@@ -3292,6 +3319,7 @@ out2:
3292 } 3319 }
3293 return err ? err : allocated; 3320 return err ? err : allocated;
3294} 3321}
3322
3295/* 3323/*
3296 * Block allocation/map/preallocation routine for extents based files 3324 * Block allocation/map/preallocation routine for extents based files
3297 * 3325 *
@@ -3314,21 +3342,24 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3314 struct ext4_map_blocks *map, int flags) 3342 struct ext4_map_blocks *map, int flags)
3315{ 3343{
3316 struct ext4_ext_path *path = NULL; 3344 struct ext4_ext_path *path = NULL;
3317 struct ext4_extent_header *eh; 3345 struct ext4_extent newex, *ex;
3318 struct ext4_extent newex, *ex, *last_ex; 3346 ext4_fsblk_t newblock = 0;
3319 ext4_fsblk_t newblock; 3347 int err = 0, depth, ret;
3320 int i, err = 0, depth, ret, cache_type;
3321 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3322 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3324 3354
3325 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3326 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3327 3358
3328 /* check in cache */ 3359 /* check in cache */
3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3330 if (cache_type) { 3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3331 if (cache_type == EXT4_EXT_CACHE_GAP) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3364 /*
3334 * block isn't allocated yet and 3365 * block isn't allocated yet and
@@ -3337,17 +3368,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3337 goto out2; 3368 goto out2;
3338 } 3369 }
3339 /* we should allocate requested block */ 3370 /* we should allocate requested block */
3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3371 } else {
3341 /* block is already allocated */ 3372 /* block is already allocated */
3342 newblock = map->m_lblk 3373 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3374 - le32_to_cpu(newex.ee_block)
3344 + ext_pblock(&newex); 3375 + ext4_ext_pblock(&newex);
3345 /* number of remaining blocks in the extent */ 3376 /* number of remaining blocks in the extent */
3346 allocated = ext4_ext_get_actual_len(&newex) - 3377 allocated = ext4_ext_get_actual_len(&newex) -
3347 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3378 (map->m_lblk - le32_to_cpu(newex.ee_block));
3348 goto out; 3379 goto out;
3349 } else {
3350 BUG();
3351 } 3380 }
3352 } 3381 }
3353 3382
@@ -3374,12 +3403,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3374 err = -EIO; 3403 err = -EIO;
3375 goto out2; 3404 goto out2;
3376 } 3405 }
3377 eh = path[depth].p_hdr;
3378 3406
3379 ex = path[depth].p_ext; 3407 ex = path[depth].p_ext;
3380 if (ex) { 3408 if (ex) {
3381 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3409 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3382 ext4_fsblk_t ee_start = ext_pblock(ex); 3410 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3383 unsigned short ee_len; 3411 unsigned short ee_len;
3384 3412
3385 /* 3413 /*
@@ -3395,17 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3395 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3396 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3397 3425
3398 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3399 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3400 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3401 ee_len, ee_start, 3429 * in the cache
3402 EXT4_EXT_CACHE_EXTENT); 3430 */
3403 goto out; 3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3404 } 3440 }
3405 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3406 inode, map, path, flags, allocated, 3442 /*
3407 newblock); 3443 * Punch out the map length, but only to the
3408 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
 3450 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3409 } 3504 }
3410 } 3505 }
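/*
 * Caller sketch (hypothetical helper, not part of this patch; it
 * assumes ext4_ext_map_blocks() returns the number of blocks it
 * punched out): a punch-hole loop must iterate, because each call
 * only punches to the end of one extent and never more than
 * EXT_UNINIT_MAX_LEN blocks.
 */
static int sketch_punch_range(handle_t *handle, struct inode *inode,
			      ext4_lblk_t lblk, ext4_lblk_t len)
{
	struct ext4_map_blocks map;
	int ret;

	while (len > 0) {
		map.m_lblk = lblk;
		map.m_len = len;
		ret = ext4_ext_map_blocks(handle, inode, &map,
					  EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
		if (ret < 0)
			return ret;
		if (ret == 0)
			ret = 1;	/* already a hole; skip one block */
		lblk += ret;
		len -= ret;
	}
	return 0;
}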
3411 3506
@@ -3467,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3467 else 3562 else
3468 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3469 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3470 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3471 if (!newblock) 3568 if (!newblock)
3472 goto out2; 3569 goto out2;
@@ -3481,15 +3578,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3481 ext4_ext_mark_uninitialized(&newex); 3578 ext4_ext_mark_uninitialized(&newex);
3482 /* 3579 /*
3483 * io_end structure was created for every IO write to an 3580 * io_end structure was created for every IO write to an
3484 * uninitialized extent. To avoid unecessary conversion, 3581 * uninitialized extent. To avoid unnecessary conversion,
3485 * here we flag the IO that really needs the conversion. 3582 * here we flag the IO that really needs the conversion.
3486 * For the non-async direct IO case, flag the inode state 3583 * For the non-async direct IO case, flag the inode state
3487 * that we need to perform convertion when IO is done. 3584 * that we need to perform conversion when IO is done.
3488 */ 3585 */
3489 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3586 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3490 if (io) 3587 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3491 io->flag = EXT4_IO_UNWRITTEN; 3588 io->flag = EXT4_IO_END_UNWRITTEN;
3492 else 3589 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3590 } else
3493 ext4_set_inode_state(inode, 3591 ext4_set_inode_state(inode,
3494 EXT4_STATE_DIO_UNWRITTEN); 3592 EXT4_STATE_DIO_UNWRITTEN);
3495 } 3593 }
@@ -3497,44 +3595,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3497 map->m_flags |= EXT4_MAP_UNINIT; 3595 map->m_flags |= EXT4_MAP_UNINIT;
3498 } 3596 }
3499 3597
3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3598 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3501 if (unlikely(!eh->eh_entries)) { 3599 if (err)
3502 EXT4_ERROR_INODE(inode, 3600 goto out2;
3503 "eh->eh_entries == 0 and " 3601
3504 "EOFBLOCKS_FL set");
3505 err = -EIO;
3506 goto out2;
3507 }
3508 last_ex = EXT_LAST_EXTENT(eh);
3509 /*
3510 * If the current leaf block was reached by looking at
3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3524 }
3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3602 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3526 if (err) { 3603 if (err) {
3527 /* free data blocks we just allocated */ 3604 /* free data blocks we just allocated */
3528 /* not a good idea to call discard here directly, 3605 /* not a good idea to call discard here directly,
3529 * but otherwise we'd need to call it every free() */ 3606 * but otherwise we'd need to call it every free() */
3530 ext4_discard_preallocations(inode); 3607 ext4_discard_preallocations(inode);
3531 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3608 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3532 ext4_ext_get_actual_len(&newex), 0); 3609 ext4_ext_get_actual_len(&newex), 0);
3533 goto out2; 3610 goto out2;
3534 } 3611 }
3535 3612
3536 /* previous routine could use block we allocated */ 3613 /* previous routine could use block we allocated */
3537 newblock = ext_pblock(&newex); 3614 newblock = ext4_ext_pblock(&newex);
3538 allocated = ext4_ext_get_actual_len(&newex); 3615 allocated = ext4_ext_get_actual_len(&newex);
3539 if (allocated > map->m_len) 3616 if (allocated > map->m_len)
3540 allocated = map->m_len; 3617 allocated = map->m_len;
@@ -3552,8 +3629,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3552 * when it is _not_ an uninitialized extent. 3629 * when it is _not_ an uninitialized extent.
3553 */ 3630 */
3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3631 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3632 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3556 EXT4_EXT_CACHE_EXTENT);
3557 ext4_update_inode_fsync_trans(handle, inode, 1); 3633 ext4_update_inode_fsync_trans(handle, inode, 1);
3558 } else 3634 } else
3559 ext4_update_inode_fsync_trans(handle, inode, 0); 3635 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3569,7 +3645,13 @@ out2:
3569 ext4_ext_drop_refs(path); 3645 ext4_ext_drop_refs(path);
3570 kfree(path); 3646 kfree(path);
3571 } 3647 }
3572 return err ? err : allocated; 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3649 newblock, map->m_len, err ? err : allocated);
3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3573} 3655}
3574 3656
3575void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3581,6 +3663,12 @@ void ext4_ext_truncate(struct inode *inode)
3581 int err = 0; 3663 int err = 0;
3582 3664
3583 /* 3665 /*
3666 * finish any pending end_io work so we won't run the risk of
3667 * converting any truncated blocks to initialized later
3668 */
3669 ext4_flush_completed_IO(inode);
3670
3671 /*
3584 * probably first extent we're gonna free will be last in block 3672 * probably first extent we're gonna free will be last in block
3585 */ 3673 */
3586 err = ext4_writepage_trans_blocks(inode); 3674 err = ext4_writepage_trans_blocks(inode);
@@ -3611,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3611 3699
3612 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3613 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3614 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
3615 3703
3616 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3617 * transaction synchronous. 3705 * transaction synchronous.
@@ -3619,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3619 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3620 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3621 3709
3622out_stop:
3623 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3624 /* 3713 /*
3625 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3626 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3667,14 +3756,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
3667} 3756}
3668 3757
3669/* 3758/*
3670 * preallocate space for a file. This implements ext4's fallocate inode 3759 * preallocate space for a file. This implements ext4's fallocate file
3671 * operation, which gets called from the sys_fallocate system call. 3760 * operation, which gets called from the sys_fallocate system call.
3672 * For block-mapped files, posix_fallocate should fall back to the method 3761 * For block-mapped files, posix_fallocate should fall back to the method
3673 * of writing zeroes to the required new blocks (the same behavior which is 3762 * of writing zeroes to the required new blocks (the same behavior which is
3674 * expected for file systems which do not support fallocate() system call). 3763 * expected for file systems which do not support fallocate() system call).
3675 */ 3764 */
3676long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3765long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3677{ 3766{
3767 struct inode *inode = file->f_path.dentry->d_inode;
3678 handle_t *handle; 3768 handle_t *handle;
3679 loff_t new_size; 3769 loff_t new_size;
3680 unsigned int max_blocks; 3770 unsigned int max_blocks;
@@ -3691,10 +3781,14 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3692 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3693 3783
3694 /* preallocation to directories is currently not supported */ 3784 /* Return error if mode is not supported */
3695 if (S_ISDIR(inode->i_mode)) 3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3696 return -ENODEV; 3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3697 3790
3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3698 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3699 /* 3793 /*
3700 * We can't just convert len to max_blocks because 3794 * We can't just convert len to max_blocks because
@@ -3710,6 +3804,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3710 ret = inode_newsize_ok(inode, (len + offset)); 3804 ret = inode_newsize_ok(inode, (len + offset));
3711 if (ret) { 3805 if (ret) {
3712 mutex_unlock(&inode->i_mutex); 3806 mutex_unlock(&inode->i_mutex);
3807 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3713 return ret; 3808 return ret;
3714 } 3809 }
3715retry: 3810retry:
@@ -3722,14 +3817,15 @@ retry:
3722 break; 3817 break;
3723 } 3818 }
3724 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3725 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3726 if (ret <= 0) { 3822 if (ret <= 0) {
3727#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3728 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
3729 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3825 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3730 "returned error inode#%lu, block=%u, " 3826 "returned error inode#%lu, block=%u, "
3731 "max_blocks=%u", __func__, 3827 "max_blocks=%u", __func__,
3732 inode->i_ino, block, max_blocks); 3828 inode->i_ino, map.m_lblk, max_blocks);
3733#endif 3829#endif
3734 ext4_mark_inode_dirty(handle, inode); 3830 ext4_mark_inode_dirty(handle, inode);
3735 ret2 = ext4_journal_stop(handle); 3831 ret2 = ext4_journal_stop(handle);
@@ -3754,6 +3850,8 @@ retry:
3754 goto retry; 3850 goto retry;
3755 } 3851 }
3756 mutex_unlock(&inode->i_mutex); 3852 mutex_unlock(&inode->i_mutex);
3853 trace_ext4_fallocate_exit(inode, offset, max_blocks,
3854 ret > 0 ? ret2 : ret);
3757 return ret > 0 ? ret2 : ret; 3855 return ret > 0 ? ret2 : ret;
3758} 3856}
3759 3857
@@ -3812,45 +3910,190 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3812 } 3910 }
3813 return ret > 0 ? ret2 : ret; 3911 return ret > 0 ? ret2 : ret;
3814} 3912}
3913
3815/* 3914/*
3816 * Callback function called for each extent to gather FIEMAP information. 3915 * Callback function called for each extent to gather FIEMAP information.
3817 */ 3916 */
3818static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3917static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
3819 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3918 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3820 void *data) 3919 void *data)
3821{ 3920{
3822 struct fiemap_extent_info *fieinfo = data;
3823 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3824 __u64 logical; 3921 __u64 logical;
3825 __u64 physical; 3922 __u64 physical;
3826 __u64 length; 3923 __u64 length;
3827 __u32 flags = 0; 3924 __u32 flags = 0;
3828 int error; 3925 int ret = 0;
3926 struct fiemap_extent_info *fieinfo = data;
3927 unsigned char blksize_bits;
3829 3928
3830 logical = (__u64)newex->ec_block << blksize_bits; 3929 blksize_bits = inode->i_sb->s_blocksize_bits;
3930 logical = (__u64)newex->ec_block << blksize_bits;
3831 3931
3832 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3932 if (newex->ec_start == 0) {
3833 pgoff_t offset; 3933 /*
3834 struct page *page; 3934 * If no extent in the extent tree contains block @newex->ec_start,
3935 * the block may lie in 1) a hole or 2) a delayed extent.
3936 *
3937 * Holes or delayed extents are processed as follows.
3938 * 1. Look up dirty pages within the specified range in the pagecache.
3939 * If no page is found, there is no delayed extent;
3940 * return EXT_CONTINUE.
3941 * 2. Find the 1st mapped buffer.
3942 * 3. Check whether the mapped buffer is both in the request range
3943 * and a delayed buffer. If not, there is no delayed extent;
3944 * then return.
3945 * 4. A delayed extent was found; the extent will be collected.
3946 */
3947 ext4_lblk_t end = 0;
3948 pgoff_t last_offset;
3949 pgoff_t offset;
3950 pgoff_t index;
3951 pgoff_t start_index = 0;
3952 struct page **pages = NULL;
3835 struct buffer_head *bh = NULL; 3953 struct buffer_head *bh = NULL;
3954 struct buffer_head *head = NULL;
3955 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
3956
3957 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
3958 if (pages == NULL)
3959 return -ENOMEM;
3836 3960
3837 offset = logical >> PAGE_SHIFT; 3961 offset = logical >> PAGE_SHIFT;
3838 page = find_get_page(inode->i_mapping, offset); 3962repeat:
3839 if (!page || !page_has_buffers(page)) 3963 last_offset = offset;
3840 return EXT_CONTINUE; 3964 head = NULL;
3965 ret = find_get_pages_tag(inode->i_mapping, &offset,
3966 PAGECACHE_TAG_DIRTY, nr_pages, pages);
3967
3968 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3969 /* First time, try to find a mapped buffer. */
3970 if (ret == 0) {
3971out:
3972 for (index = 0; index < ret; index++)
3973 page_cache_release(pages[index]);
3974 /* just a hole. */
3975 kfree(pages);
3976 return EXT_CONTINUE;
3977 }
3978 index = 0;
3841 3979
3842 bh = page_buffers(page); 3980next_page:
3981 /* Try to find the 1st mapped buffer. */
3982 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3983 blksize_bits;
3984 if (!page_has_buffers(pages[index]))
3985 goto out;
3986 head = page_buffers(pages[index]);
3987 if (!head)
3988 goto out;
3843 3989
3844 if (!bh) 3990 index++;
3845 return EXT_CONTINUE; 3991 bh = head;
3992 do {
3993 if (end >= newex->ec_block +
3994 newex->ec_len)
3995 /* The buffer is out of
3996 * the request range.
3997 */
3998 goto out;
3999
4000 if (buffer_mapped(bh) &&
4001 end >= newex->ec_block) {
4002 start_index = index - 1;
4003 /* get the 1st mapped buffer. */
4004 goto found_mapped_buffer;
4005 }
4006
4007 bh = bh->b_this_page;
4008 end++;
4009 } while (bh != head);
3846 4010
3847 if (buffer_delay(bh)) { 4011 /* No mapped buffer in the range was found in this page;
3848 flags |= FIEMAP_EXTENT_DELALLOC; 4012 * we need to look up the next page.
3849 page_cache_release(page); 4013 */
4014 if (index >= ret) {
4015 /* There is no page left, but we need to limit
4016 * newex->ec_len.
4017 */
4018 newex->ec_len = end - newex->ec_block;
4019 goto out;
4020 }
4021 goto next_page;
3850 } else { 4022 } else {
3851 page_cache_release(page); 4023 /* Find contiguous delayed buffers. */
3852 return EXT_CONTINUE; 4024 if (ret > 0 && pages[0]->index == last_offset)
4025 head = page_buffers(pages[0]);
4026 bh = head;
4027 index = 1;
4028 start_index = 0;
4029 }
4030
4031found_mapped_buffer:
4032 if (bh != NULL && buffer_delay(bh)) {
4033 /* 1st or contiguous delayed buffer found. */
4034 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4035 /*
4036 * 1st delayed buffer found, record
4037 * the start of extent.
4038 */
4039 flags |= FIEMAP_EXTENT_DELALLOC;
4040 newex->ec_block = end;
4041 logical = (__u64)end << blksize_bits;
4042 }
4043 /* Find contiguous delayed buffers. */
4044 do {
4045 if (!buffer_delay(bh))
4046 goto found_delayed_extent;
4047 bh = bh->b_this_page;
4048 end++;
4049 } while (bh != head);
4050
4051 for (; index < ret; index++) {
4052 if (!page_has_buffers(pages[index])) {
4053 bh = NULL;
4054 break;
4055 }
4056 head = page_buffers(pages[index]);
4057 if (!head) {
4058 bh = NULL;
4059 break;
4060 }
4061
4062 if (pages[index]->index !=
4063 pages[start_index]->index + index
4064 - start_index) {
4065 /* Blocks are not contiguous. */
4066 bh = NULL;
4067 break;
4068 }
4069 bh = head;
4070 do {
4071 if (!buffer_delay(bh))
4072 /* Delayed-extent ends. */
4073 goto found_delayed_extent;
4074 bh = bh->b_this_page;
4075 end++;
4076 } while (bh != head);
4077 }
4078 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4079 /* a hole found. */
4080 goto out;
4081
4082found_delayed_extent:
4083 newex->ec_len = min(end - newex->ec_block,
4084 (ext4_lblk_t)EXT_INIT_MAX_LEN);
4085 if (ret == nr_pages && bh != NULL &&
4086 newex->ec_len < EXT_INIT_MAX_LEN &&
4087 buffer_delay(bh)) {
4088 /* Have not collected an extent and continue. */
4089 for (index = 0; index < ret; index++)
4090 page_cache_release(pages[index]);
4091 goto repeat;
3853 } 4092 }
4093
4094 for (index = 0; index < ret; index++)
4095 page_cache_release(pages[index]);
4096 kfree(pages);
3854 } 4097 }
3855 4098
3856 physical = (__u64)newex->ec_start << blksize_bits; 4099 physical = (__u64)newex->ec_start << blksize_bits;
@@ -3859,32 +4102,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3859 if (ex && ext4_ext_is_uninitialized(ex)) 4102 if (ex && ext4_ext_is_uninitialized(ex))
3860 flags |= FIEMAP_EXTENT_UNWRITTEN; 4103 flags |= FIEMAP_EXTENT_UNWRITTEN;
3861 4104
3862 /* 4105 if (next == EXT_MAX_BLOCKS)
3863 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3864 *
3865 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3866 * this also indicates no more allocated blocks.
3867 *
3868 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3869 */
3870 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3871 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3872 loff_t size = i_size_read(inode);
3873 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3874
3875 flags |= FIEMAP_EXTENT_LAST; 4106 flags |= FIEMAP_EXTENT_LAST;
3876 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3877 logical+length > size)
3878 length = (size - logical + bs - 1) & ~(bs-1);
3879 }
3880 4107
3881 error = fiemap_fill_next_extent(fieinfo, logical, physical, 4108 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
3882 length, flags); 4109 length, flags);
3883 if (error < 0) 4110 if (ret < 0)
3884 return error; 4111 return ret;
3885 if (error == 1) 4112 if (ret == 1)
3886 return EXT_BREAK; 4113 return EXT_BREAK;
3887
3888 return EXT_CONTINUE; 4114 return EXT_CONTINUE;
3889} 4115}
3890 4116
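The delalloc branch above boils down to: find the first mapped buffer in range and, if it is delayed, extend over the contiguous run of delayed buffers. A self-contained toy model of that scan, with an int array standing in for buffer_head state (no kernel API; a simplification of the real page-by-page walk):

#include <stdio.h>

/* Toy model, not kernel code: MAPPED ~ buffer_mapped(), DELAY ~
 * buffer_delay(); mirrors steps 2-4 of the comment above. */
#define B_MAPPED 1
#define B_DELAY  2

static int find_delayed_extent(const int *blk, int nblocks, int *lblk, int *len)
{
	int i = 0;

	while (i < nblocks && !(blk[i] & B_MAPPED))	/* step 2: 1st mapped */
		i++;
	if (i == nblocks || !(blk[i] & B_DELAY))	/* step 3: just a hole */
		return 0;
	*lblk = i;					/* step 4: collect it */
	for (*len = 0; i + *len < nblocks && (blk[i + *len] & B_DELAY); (*len)++)
		;
	return 1;
}

int main(void)
{
	int blocks[] = { 0, B_MAPPED | B_DELAY, B_MAPPED | B_DELAY, B_MAPPED, 0 };
	int lblk, len;

	if (find_delayed_extent(blocks, 5, &lblk, &len))
		printf("delayed extent at block %d, len %d\n", lblk, len); /* 1, 2 */
	return 0;
}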
@@ -3926,6 +4152,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
3926 return (error < 0 ? error : 0); 4152 return (error < 0 ? error : 0);
3927} 4153}
3928 4154
4155/*
4156 * ext4_ext_punch_hole
4157 *
4158 * Punches a hole of "length" bytes in a file starting
4159 * at byte "offset"
4160 *
4161 * @inode: The inode of the file to punch a hole in
4162 * @offset: The starting byte offset of the hole
4163 * @length: The length of the hole
4164 *
4165 * Returns 0 on success, or a negative error code on failure
4166 */
4167int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4168{
4169 struct inode *inode = file->f_path.dentry->d_inode;
4170 struct super_block *sb = inode->i_sb;
4171 struct ext4_ext_cache cache_ex;
4172 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4173 struct address_space *mapping = inode->i_mapping;
4174 struct ext4_map_blocks map;
4175 handle_t *handle;
4176 loff_t first_block_offset, last_block_offset, block_len;
4177 loff_t first_page, last_page, first_page_offset, last_page_offset;
4178 int ret, credits, blocks_released, err = 0;
4179
4180 first_block = (offset + sb->s_blocksize - 1) >>
4181 EXT4_BLOCK_SIZE_BITS(sb);
4182 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4183
4184 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4185 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4186
4187 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4188 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4189
4190 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4191 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4192
4193 /*
4194 * Write out all dirty pages to avoid race conditions
4195 * Then release them.
4196 */
4197 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4198 err = filemap_write_and_wait_range(mapping,
4199 first_page_offset == 0 ? 0 : first_page_offset-1,
4200 last_page_offset);
4201
4202 if (err)
4203 return err;
4204 }
4205
4206 /* Now release the pages */
4207 if (last_page_offset > first_page_offset) {
4208 truncate_inode_pages_range(mapping, first_page_offset,
4209 last_page_offset-1);
4210 }
4211
4212 /* finish any pending end_io work */
4213 ext4_flush_completed_IO(inode);
4214
4215 credits = ext4_writepage_trans_blocks(inode);
4216 handle = ext4_journal_start(inode, credits);
4217 if (IS_ERR(handle))
4218 return PTR_ERR(handle);
4219
4220 err = ext4_orphan_add(handle, inode);
4221 if (err)
4222 goto out;
4223
4224 /*
4225 * Now we need to zero out the non-block-aligned data.
4226 * If the hole is contained within a single block, just
4227 * zero out the middle of that block.
4228 */
4229 if (first_block > last_block)
4230 ext4_block_zero_page_range(handle, mapping, offset, length);
4231 else {
4232 /* zero out the head of the hole before the first block */
4233 block_len = first_block_offset - offset;
4234 if (block_len > 0)
4235 ext4_block_zero_page_range(handle, mapping,
4236 offset, block_len);
4237
4238 /* zero out the tail of the hole after the last block */
4239 block_len = offset + length - last_block_offset;
4240 if (block_len > 0) {
4241 ext4_block_zero_page_range(handle, mapping,
4242 last_block_offset, block_len);
4243 }
4244 }
4245
4246 /* If there are no blocks to remove, return now */
4247 if (first_block >= last_block)
4248 goto out;
4249
4250 down_write(&EXT4_I(inode)->i_data_sem);
4251 ext4_ext_invalidate_cache(inode);
4252 ext4_discard_preallocations(inode);
4253
4254 /*
4255 * Loop over all the blocks and identify blocks
4256 * that need to be punched out
4257 */
4258 iblock = first_block;
4259 blocks_released = 0;
4260 while (iblock < last_block) {
4261 max_blocks = last_block - iblock;
4262 num_blocks = 1;
4263 memset(&map, 0, sizeof(map));
4264 map.m_lblk = iblock;
4265 map.m_len = max_blocks;
4266 ret = ext4_ext_map_blocks(handle, inode, &map,
4267 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4268
4269 if (ret > 0) {
4270 blocks_released += ret;
4271 num_blocks = ret;
4272 } else if (ret == 0) {
4273 /*
4274 * If map blocks could not find the block,
4275 * then it is in a hole. If the hole was
4276 * not already cached, then map blocks should
4277 * put it in the cache, so we can read the hole
4278 * back out of the cache.
4279 */
4280 memset(&cache_ex, 0, sizeof(cache_ex));
4281 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4282 !cache_ex.ec_start) {
4283
4284 /* The hole is cached */
4285 num_blocks = cache_ex.ec_block +
4286 cache_ex.ec_len - iblock;
4287
4288 } else {
4289 /* The block could not be identified */
4290 err = -EIO;
4291 break;
4292 }
4293 } else {
4294 /* Map blocks error */
4295 err = ret;
4296 break;
4297 }
4298
4299 if (num_blocks == 0) {
4300 /* This condition should never happen */
4301 ext_debug("Block lookup failed");
4302 err = -EIO;
4303 break;
4304 }
4305
4306 iblock += num_blocks;
4307 }
4308
4309 if (blocks_released > 0) {
4310 ext4_ext_invalidate_cache(inode);
4311 ext4_discard_preallocations(inode);
4312 }
4313
4314 if (IS_SYNC(inode))
4315 ext4_handle_sync(handle);
4316
4317 up_write(&EXT4_I(inode)->i_data_sem);
4318
4319out:
4320 ext4_orphan_del(handle, inode);
4321 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4322 ext4_mark_inode_dirty(handle, inode);
4323 ext4_journal_stop(handle);
4324 return err;
4325}
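The first_block/last_block rounding in ext4_ext_punch_hole() means only whole blocks inside the hole are removed; partial edges are zeroed in place. A worked sketch of the arithmetic, assuming 4 KiB blocks and arbitrary example values:

#include <stdio.h>

int main(void)
{
	/* Illustrative only: 4 KiB blocks, punch offset=1000, length=8192. */
	unsigned long long blocksize = 4096, offset = 1000, length = 8192;
	unsigned int bits = 12;	/* log2(blocksize) */

	/* first block fully inside the hole: round the start up */
	unsigned long long first_block = (offset + blocksize - 1) >> bits;
	/* first block past the hole: round the end down */
	unsigned long long last_block = (offset + length) >> bits;

	printf("zero head [%llu, %llu), remove blocks [%llu, %llu), zero tail [%llu, %llu)\n",
	       offset, first_block << bits,	/* bytes 1000..4096 */
	       first_block, last_block,		/* block 1 only */
	       last_block << bits, offset + length); /* bytes 8192..9192 */
	return 0;
}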
3929int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4326int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3930 __u64 start, __u64 len) 4327 __u64 start, __u64 len)
3931{ 4328{
@@ -3948,8 +4345,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3948 4345
3949 start_blk = start >> inode->i_sb->s_blocksize_bits; 4346 start_blk = start >> inode->i_sb->s_blocksize_bits;
3950 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 4347 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3951 if (last_blk >= EXT_MAX_BLOCK) 4348 if (last_blk >= EXT_MAX_BLOCKS)
3952 last_blk = EXT_MAX_BLOCK-1; 4349 last_blk = EXT_MAX_BLOCKS-1;
3953 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4350 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3954 4351
3955 /* 4352 /*
@@ -3962,4 +4359,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3962 4359
3963 return error; 4360 return error;
3964} 4361}
3965
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
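The test above flags i/o only when an endpoint falls inside a filesystem block and the write targets already-allocated file range. A standalone restatement of the mask arithmetic (4 KiB block size assumed):

#include <stdio.h>

/* Standalone restatement of the test above: nonzero when either end
 * of [pos, pos + count) falls inside a filesystem block. */
static int is_unaligned(long long pos, long long count, long blocksize)
{
	long blockmask = blocksize - 1;

	return ((pos & blockmask) || ((pos + count) & blockmask)) ? 1 : 0;
}

int main(void)
{
	printf("%d\n", is_unaligned(4096, 8192, 4096)); /* 0: both ends aligned */
	printf("%d\n", is_unaligned(512, 4096, 4096));  /* 1: starts mid-block */
	printf("%d\n", is_unaligned(0, 6000, 4096));    /* 1: ends mid-block */
	return 0;
}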
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
120 }
121
122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
81 } 134 }
82 135
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
@@ -104,6 +162,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
104{ 162{
105 struct super_block *sb = inode->i_sb; 163 struct super_block *sb = inode->i_sb;
106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 164 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
165 struct ext4_inode_info *ei = EXT4_I(inode);
107 struct vfsmount *mnt = filp->f_path.mnt; 166 struct vfsmount *mnt = filp->f_path.mnt;
108 struct path path; 167 struct path path;
109 char buf[64], *cp; 168 char buf[64], *cp;
@@ -127,11 +186,74 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
127 ext4_mark_super_dirty(sb); 186 ext4_mark_super_dirty(sb);
128 } 187 }
129 } 188 }
189 /*
190 * Set up the jbd2_inode if we are opening the inode for
191 * writing and the journal is present
192 */
193 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
194 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
195
196 spin_lock(&inode->i_lock);
197 if (!ei->jinode) {
198 if (!jinode) {
199 spin_unlock(&inode->i_lock);
200 return -ENOMEM;
201 }
202 ei->jinode = jinode;
203 jbd2_journal_init_jbd_inode(ei->jinode, inode);
204 jinode = NULL;
205 }
206 spin_unlock(&inode->i_lock);
207 if (unlikely(jinode != NULL))
208 jbd2_free_inode(jinode);
209 }
130 return dquot_file_open(inode, filp); 210 return dquot_file_open(inode, filp);
131} 211}
132 212
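The jinode setup above is the usual allocate-then-install idiom: allocate outside the spinlock (allocation may sleep), install under the lock only if the field is still unset, and free the losing copy. A generic userspace sketch of the same pattern; a pthread mutex stands in for inode->i_lock and all names are illustrative:

#include <pthread.h>
#include <stdlib.h>

/* struct obj stands in for struct jbd2_inode; not kernel code. */
struct obj { int dummy; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *cached;	/* stands in for ei->jinode */

static struct obj *get_obj(void)
{
	/* allocate before taking the lock: allocation may block */
	struct obj *o = malloc(sizeof(*o));

	pthread_mutex_lock(&lock);
	if (!cached) {
		if (!o) {
			pthread_mutex_unlock(&lock);
			return NULL;	/* the kernel path returns -ENOMEM */
		}
		cached = o;	/* we won the race: install ours */
		o = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(o);	/* no-op if installed; frees the loser's copy */
	return cached;
}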
213/*
214 * ext4_llseek() copied from generic_file_llseek() to handle both
215 * block-mapped and extent-mapped maxbytes values. This should
216 * otherwise be identical to generic_file_llseek().
217 */
218loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
219{
220 struct inode *inode = file->f_mapping->host;
221 loff_t maxbytes;
222
223 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
224 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
225 else
226 maxbytes = inode->i_sb->s_maxbytes;
227 mutex_lock(&inode->i_mutex);
228 switch (origin) {
229 case SEEK_END:
230 offset += inode->i_size;
231 break;
232 case SEEK_CUR:
233 if (offset == 0) {
234 mutex_unlock(&inode->i_mutex);
235 return file->f_pos;
236 }
237 offset += file->f_pos;
238 break;
239 }
240
241 if (offset < 0 || offset > maxbytes) {
242 mutex_unlock(&inode->i_mutex);
243 return -EINVAL;
244 }
245
246 if (offset != file->f_pos) {
247 file->f_pos = offset;
248 file->f_version = 0;
249 }
250 mutex_unlock(&inode->i_mutex);
251
252 return offset;
253}
254
133const struct file_operations ext4_file_operations = { 255const struct file_operations ext4_file_operations = {
134 .llseek = generic_file_llseek, 256 .llseek = ext4_llseek,
135 .read = do_sync_read, 257 .read = do_sync_read,
136 .write = do_sync_write, 258 .write = do_sync_write,
137 .aio_read = generic_file_aio_read, 259 .aio_read = generic_file_aio_read,
@@ -146,10 +268,10 @@ const struct file_operations ext4_file_operations = {
146 .fsync = ext4_sync_file, 268 .fsync = ext4_sync_file,
147 .splice_read = generic_file_splice_read, 269 .splice_read = generic_file_splice_read,
148 .splice_write = generic_file_splice_write, 270 .splice_write = generic_file_splice_write,
271 .fallocate = ext4_fallocate,
149}; 272};
150 273
151const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
152 .truncate = ext4_truncate,
153 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
154 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
155#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
@@ -159,7 +281,6 @@ const struct inode_operations ext4_file_inode_operations = {
159 .removexattr = generic_removexattr, 281 .removexattr = generic_removexattr,
160#endif 282#endif
161 .check_acl = ext4_check_acl, 283 .check_acl = ext4_check_acl,
162 .fallocate = ext4_fallocate,
163 .fiemap = ext4_fiemap, 284 .fiemap = ext4_fiemap,
164}; 285};
165 286
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
69 * written is queued on a workqueue but may not get scheduled
70 * immediately. When fsync is called, we need to ensure the
71 * conversion is complete before fsync returns.
72 * The inode keeps track of a list of pending/completed IO that
73 * might need the conversion. This function walks through
74 * the list and converts the related unwritten extents for completed IO
75 * to written.
76 * Returns 0 on success, or a negative error code on failure.
77 */
78extern int ext4_flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)) {
92 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list);
94 /*
95 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written.
97 *
98 * When ext4_sync_file() is called, run_queue() may already be
99 * about to flush the work corresponding to this io structure.
100 * It would be a problem if it found that the io structure
101 * related to the work to be scheduled had been freed.
102 *
103 * Thus we need to keep the io structure valid here even after
104 * the conversion has finished. The io structure has a flag to
105 * avoid double conversion from both fsync and the background
106 * workqueue.
107 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0)
112 ret2 = ret;
113 else
114 list_del_init(&io->list);
115 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0;
118}
119
37/* 120/*
38 * If we're not journaling and this is a just-created file, we have to 121 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since 122 * sync our parent directory (if it was freshly created) since
@@ -42,9 +125,11 @@
42 * the parent directory's parent as well, and so on recursively, if 125 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created. 126 * they are also freshly created.
44 */ 127 */
45static void ext4_sync_parent(struct inode *inode) 128static int ext4_sync_parent(struct inode *inode)
46{ 129{
130 struct writeback_control wbc;
47 struct dentry *dentry = NULL; 131 struct dentry *dentry = NULL;
132 int ret = 0;
48 133
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { 134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); 135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -53,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) 138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break; 139 break;
55 inode = dentry->d_parent->d_inode; 140 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping); 141 ret = sync_mapping_buffers(inode->i_mapping);
142 if (ret)
143 break;
144 memset(&wbc, 0, sizeof(wbc));
145 wbc.sync_mode = WB_SYNC_ALL;
146 wbc.nr_to_write = 0; /* only write out the inode */
147 ret = sync_inode(inode, &wbc);
148 if (ret)
149 break;
57 } 150 }
151 return ret;
58} 152}
59 153
60/* 154/*
@@ -78,23 +172,24 @@ int ext4_sync_file(struct file *file, int datasync)
78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
79 int ret; 173 int ret;
80 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
81 176
82 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
83 178
84 trace_ext4_sync_file(file, datasync); 179 trace_ext4_sync_file_enter(file, datasync);
85 180
86 if (inode->i_sb->s_flags & MS_RDONLY) 181 if (inode->i_sb->s_flags & MS_RDONLY)
87 return 0; 182 return 0;
88 183
89 ret = flush_completed_IO(inode); 184 ret = ext4_flush_completed_IO(inode);
90 if (ret < 0) 185 if (ret < 0)
91 return ret; 186 goto out;
92 187
93 if (!journal) { 188 if (!journal) {
94 ret = generic_file_fsync(file, datasync); 189 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry)) 190 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode); 191 ret = ext4_sync_parent(inode);
97 return ret; 192 goto out;
98 } 193 }
99 194
100 /* 195 /*
@@ -111,27 +206,20 @@ int ext4_sync_file(struct file *file, int datasync)
111 * (they were dirtied by commit). But that's OK - the blocks are 206 * (they were dirtied by commit). But that's OK - the blocks are
112 * safe in-journal, which is all fsync() needs to ensure. 207 * safe in-journal, which is all fsync() needs to ensure.
113 */ 208 */
114 if (ext4_should_journal_data(inode)) 209 if (ext4_should_journal_data(inode)) {
115 return ext4_force_commit(inode->i_sb); 210 ret = ext4_force_commit(inode->i_sb);
211 goto out;
212 }
116 213
117 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
118 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
119 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
120 * When the journal is on a different device than the 217 needs_barrier = true;
121 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
122 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
123 * will take care of issuing the barrier. In 220 if (needs_barrier)
124 * data=journal, all of the data blocks are written to 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
125 * the journal device.) 222 out:
126 */ 223 trace_ext4_sync_file_exit(inode, ret);
127 if (ext4_should_writeback_data(inode) &&
128 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
136 return ret; 224 return ret;
137} 225}
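Since the rewritten fsync path distinguishes a datasync (i_datasync_tid) from a full sync (i_sync_tid), userspace can request the cheaper commit when only data needs to be durable. A small usage sketch (file name arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "record\n", 7) != 7)
		perror("write");
	/* fdatasync: waits on i_datasync_tid; skips pure timestamp updates */
	if (fdatasync(fd))
		perror("fdatasync");
	/* fsync: waits on i_sync_tid, covering inode metadata as well */
	if (fsync(fd))
		perror("fsync");
	close(fd);
	return 0;
}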
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
50 * need to use it within a single byte (to ensure we get endianness right). 50 * need to use it within a single byte (to ensure we get endianness right).
51 * We can use memset for the rest of the bitmap as there are no other users. 51 * We can use memset for the rest of the bitmap as there are no other users.
52 */ 52 */
53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) 53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
54{ 54{
55 int i; 55 int i;
56 56
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
65} 65}
66 66
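ext4_mark_bitmap_end() pads the bitmap tail as the comment above describes: the partial byte is set bit by bit, the rest with memset. A sketch of that strategy, assuming end_bit falls on a byte boundary (true for a blocksize*8 bitmap); the kernel uses ext4_set_bit() on the partial byte to keep the on-disk little-endian bit order, and the plain shifts here are for illustration only:

#include <string.h>

/* Illustrative model of ext4_mark_bitmap_end(): mark [start_bit, end_bit)
 * as in-use. Bit order here is native, not ext4's little-endian order. */
static void mark_bitmap_end_sketch(int start_bit, int end_bit, unsigned char *bitmap)
{
	int i = start_bit;

	if (start_bit >= end_bit)
		return;
	/* finish the partial byte one bit at a time */
	for (; i < end_bit && (i & 7); i++)
		bitmap[i >> 3] |= 1u << (i & 7);
	/* whole bytes: memset, as the comment above notes */
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}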
67/* Initializes an uninitialized inode bitmap */ 67/* Initializes an uninitialized inode bitmap */
68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, 68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
69 ext4_group_t block_group, 69 struct buffer_head *bh,
70 struct ext4_group_desc *gdp) 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp)
71{ 72{
72 struct ext4_sb_info *sbi = EXT4_SB(sb); 73 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 74
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
85 } 86 }
86 87
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
89 bh->b_data); 90 bh->b_data);
90 91
91 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 108 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 109 if (!desc)
109 return NULL; 110 return NULL;
111
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 112 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 113 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 114 if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
128
126 ext4_lock_group(sb, block_group); 129 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 130 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 131 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 136 return bh;
134 } 137 }
135 ext4_unlock_group(sb, block_group); 138 ext4_unlock_group(sb, block_group);
139
136 if (buffer_uptodate(bh)) { 140 if (buffer_uptodate(bh)) {
137 /* 141 /*
138 * if not uninit if bh is uptodate, 142 * if not uninit if bh is uptodate,
@@ -148,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
148 * We do it here so the bitmap uptodate bit 152 * We do it here so the bitmap uptodate bit
149 * get set with buffer lock held. 153 * get set with buffer lock held.
150 */ 154 */
155 trace_ext4_load_inode_bitmap(sb, block_group);
151 set_bitmap_uptodate(bh); 156 set_bitmap_uptodate(bh);
152 if (bh_submit_read(bh) < 0) { 157 if (bh_submit_read(bh) < 0) {
153 put_bh(bh); 158 put_bh(bh);
@@ -411,8 +416,8 @@ struct orlov_stats {
411 * for a particular block group or flex_bg. If flex_size is 1, then g 416 * for a particular block group or flex_bg. If flex_size is 1, then g
412 * is a block group number; otherwise it is flex_bg number. 417 * is a block group number; otherwise it is flex_bg number.
413 */ 418 */
414void get_orlov_stats(struct super_block *sb, ext4_group_t g, 419static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
415 int flex_size, struct orlov_stats *stats) 420 int flex_size, struct orlov_stats *stats)
416{ 421{
417 struct ext4_group_desc *desc; 422 struct ext4_group_desc *desc;
418 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; 423 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -645,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
645 *group = parent_group + flex_size; 650 *group = parent_group + flex_size;
646 if (*group > ngroups) 651 if (*group > ngroups)
647 *group = 0; 652 *group = 0;
648 return find_group_orlov(sb, parent, group, mode, 0); 653 return find_group_orlov(sb, parent, group, mode, NULL);
649 } 654 }
650 655
651 /* 656 /*
@@ -712,8 +717,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 717{
713 int free = 0, retval = 0, count; 718 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 719 struct ext4_sb_info *sbi = EXT4_SB(sb);
720 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 721 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 722
723 /*
724 * We have to be sure that new inode allocation does not race with
725 * inode table initialization, because otherwise we may end up
726 * allocating and writing a new inode right before sb_issue_zeroout
727 * takes place, overwriting our new inode with zeroes. So we
728 * take alloc_sem to prevent it.
729 */
730 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 731 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 732 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 733 /* not a free inode */
@@ -724,6 +738,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 738 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 739 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 740 ext4_unlock_group(sb, group);
741 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 742 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 743 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 744 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +787,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 787 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 788err_ret:
774 ext4_unlock_group(sb, group); 789 ext4_unlock_group(sb, group);
790 up_read(&grp->alloc_sem);
775 return retval; 791 return retval;
776} 792}
777 793
@@ -1012,7 +1028,7 @@ got:
1012 inode->i_generation = sbi->s_next_generation++; 1028 inode->i_generation = sbi->s_next_generation++;
1013 spin_unlock(&sbi->s_next_gen_lock); 1029 spin_unlock(&sbi->s_next_gen_lock);
1014 1030
1015 ei->i_state_flags = 0; 1031 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
1016 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1032 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1017 1033
1018 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
@@ -1027,7 +1043,7 @@ got:
1027 if (err) 1043 if (err)
1028 goto fail_free_drop; 1044 goto fail_free_drop;
1029 1045
1030 err = ext4_init_security(handle, inode, dir); 1046 err = ext4_init_security(handle, inode, dir, qstr);
1031 if (err) 1047 if (err)
1032 goto fail_free_drop; 1048 goto fail_free_drop;
1033 1049
@@ -1039,6 +1055,11 @@ got:
1039 } 1055 }
1040 } 1056 }
1041 1057
1058 if (ext4_handle_valid(handle)) {
1059 ei->i_sync_tid = handle->h_transaction->t_tid;
1060 ei->i_datasync_tid = handle->h_transaction->t_tid;
1061 }
1062
1042 err = ext4_mark_inode_dirty(handle, inode); 1063 err = ext4_mark_inode_dirty(handle, inode);
1043 if (err) { 1064 if (err) {
1044 ext4_std_error(sb, err); 1065 ext4_std_error(sb, err);
@@ -1205,3 +1226,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1226 }
1206 return count; 1227 return count;
1207} 1228}
1229
1230/*
1231 * Zeroes a not-yet-zeroed inode table - just writes zeroes through the whole
1232 * inode table. Must be called without any spinlock held. The only place
1233 * this is called from on an active filesystem is the ext4lazyinit
1234 * thread, so we do not need any special locks; however, we have to prevent
1235 * inode allocation from the current group, so we take the alloc_sem lock to
1236 * block ext4_claim_inode until we are finished.
1237 */
1238extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1239 int barrier)
1240{
1241 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1242 struct ext4_sb_info *sbi = EXT4_SB(sb);
1243 struct ext4_group_desc *gdp = NULL;
1244 struct buffer_head *group_desc_bh;
1245 handle_t *handle;
1246 ext4_fsblk_t blk;
1247 int num, ret = 0, used_blks = 0;
1248
1249 /* This should not happen, but just to be sure check this */
1250 if (sb->s_flags & MS_RDONLY) {
1251 ret = 1;
1252 goto out;
1253 }
1254
1255 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1256 if (!gdp)
1257 goto out;
1258
1259 /*
1260 * We do not need to lock this, because we are the only one
1261 * handling this flag.
1262 */
1263 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1264 goto out;
1265
1266 handle = ext4_journal_start_sb(sb, 1);
1267 if (IS_ERR(handle)) {
1268 ret = PTR_ERR(handle);
1269 goto out;
1270 }
1271
1272 down_write(&grp->alloc_sem);
1273 /*
1274 * If the inode bitmap was already initialized, there may be some
1275 * used inodes, so we need to skip the blocks with used inodes in
1276 * the inode table.
1277 */
1278 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1279 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1280 ext4_itable_unused_count(sb, gdp)),
1281 sbi->s_inodes_per_block);
1282
1283 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1284 ext4_error(sb, "Something is wrong with group %u\n"
1285 "Used itable blocks: %d"
1286 "itable unused count: %u\n",
1287 group, used_blks,
1288 ext4_itable_unused_count(sb, gdp));
1289 ret = 1;
1290 goto out;
1291 }
1292
1293 blk = ext4_inode_table(sb, gdp) + used_blks;
1294 num = sbi->s_itb_per_group - used_blks;
1295
1296 BUFFER_TRACE(group_desc_bh, "get_write_access");
1297 ret = ext4_journal_get_write_access(handle,
1298 group_desc_bh);
1299 if (ret)
1300 goto err_out;
1301
1302 /*
1303 * Skip zeroout if the inode table is full. But we set the ZEROED
1304 * flag anyway, because obviously, when it is full it does not need
1305 * further zeroing.
1306 */
1307 if (unlikely(num == 0))
1308 goto skip_zeroout;
1309
1310 ext4_debug("going to zero out inode table in group %d\n",
1311 group);
1312 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1313 if (ret < 0)
1314 goto err_out;
1315 if (barrier)
1316 blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
1317
1318skip_zeroout:
1319 ext4_lock_group(sb, group);
1320 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1321 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1322 ext4_unlock_group(sb, group);
1323
1324 BUFFER_TRACE(group_desc_bh,
1325 "call ext4_handle_dirty_metadata");
1326 ret = ext4_handle_dirty_metadata(handle, NULL,
1327 group_desc_bh);
1328
1329err_out:
1330 up_write(&grp->alloc_sem);
1331 ext4_journal_stop(handle);
1332out:
1333 return ret;
1334}
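To make the used_blks computation in ext4_init_inode_table() concrete, a worked instance with common mke2fs defaults (8192 inodes per group, 16 inodes per 4 KiB block; all values illustrative):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Illustrative values: 8192 inodes/group, 512 still unused,
	 * 16 inodes per 4 KiB block -> 512 itable blocks per group. */
	unsigned int inodes_per_group = 8192, itable_unused = 512;
	unsigned int inodes_per_block = 16;
	unsigned int used_blks =
		DIV_ROUND_UP(inodes_per_group - itable_unused, inodes_per_block);

	/* zeroing starts used_blks into the table and covers the rest */
	printf("skip %u blocks, zero %u blocks\n",
	       used_blks, 512 - used_blks);	/* skip 480, zero 32 */
	return 0;
}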
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..e3126c051006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,13 +55,27 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create);
75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 79
64/* 80/*
65 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
@@ -157,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
157 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
158 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
159 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
160 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 176 ret = ext4_journal_restart(handle, nblocks);
161 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
162 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
163 179
@@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 188 handle_t *handle;
173 int err; 189 int err;
174 190
191 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 192 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 194 goto no_delete;
@@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
544} 561}
545 562
546/** 563/**
547 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
548 * of direct blocks that need to be allocated for the given branch. 565 * of direct blocks that need to be allocated for the given branch.
549 * 566 *
550 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
583 600
584/** 601/**
585 * ext4_alloc_blocks: allocate multiple blocks needed for a branch 602 * ext4_alloc_blocks: allocate multiple blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocating at
606 * @goal: preferred physical block of allocation
586 * @indirect_blks: the number of blocks needed to allocate for indirect 607 * @indirect_blks: the number of blocks needed to allocate for indirect
587 * blocks 608 * blocks
588 * 609 * @blks: number of desired blocks
589 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
590 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
591 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
592 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
593 */ 616 */
594static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -616,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
616 while (target > 0) { 639 while (target > 0) {
617 count = target; 640 count = target;
618 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
619 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
620 goal, &count, err); 643 0, &count, err);
621 if (*err) 644 if (*err)
622 goto failed_out; 645 goto failed_out;
623 646
@@ -697,15 +720,17 @@ allocated:
697 return ret; 720 return ret;
698failed_out: 721failed_out:
699 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
700 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
701 return ret; 724 return ret;
702} 725}
703 726
704/** 727/**
705 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
706 * @inode: owner 730 * @inode: owner
707 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
708 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
709 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
710 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
711 * 736 *
@@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 780 * parent to disk.
756 */ 781 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
758 branch[n].bh = bh; 788 branch[n].bh = bh;
759 lock_buffer(bh); 789 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
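The added sb_getblk() check closes a NULL dereference: sb_getblk() can fail under memory pressure, and the old code went straight to lock_buffer(bh). Isolated, the guard is:

        bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
        if (unlikely(!bh)) {
                err = -EIO;     /* surface the failure as an I/O error */
                goto failed;    /* unwind the blocks already allocated */
        }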
@@ -793,26 +823,27 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
793 return err; 823 return err;
794failed: 824failed:
795 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
796 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
797 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
798 /* 828 /*
799 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
800 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
801 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
802 */ 832 */
803 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
804 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
805 } 835 }
806 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
807 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
808 838
809 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
810 840
811 return err; 841 return err;
812} 842}
813 843
814/** 844/**
815 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
816 * @inode: owner 847 * @inode: owner
817 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
818 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -893,7 +924,7 @@ err_out:
893 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
894 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
895 } 926 }
896 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
897 blks, 0); 928 blks, 0);
898 929
899 return err; 930 return err;
@@ -942,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
942 int count = 0; 973 int count = 0;
943 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
944 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
945 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
946 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
947 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1027,6 +1059,8 @@ cleanup:
1027 partial--; 1059 partial--;
1028 } 1060 }
1029out: 1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1030 return err; 1064 return err;
1031} 1065}
1032 1066
@@ -1068,7 +1102,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1068 * Calculate the number of metadata blocks that need to be reserved 1102 * Calculate the number of metadata blocks that need to be reserved
1069 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1070 */ 1104 */
1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1072{ 1106{
1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1074 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1207,8 +1241,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1241 break;
1208 idx++; 1242 idx++;
1209 num++; 1243 num++;
1210 if (num >= max_pages) 1244 if (num >= max_pages) {
1245 done = 1;
1211 break; 1246 break;
1247 }
1212 } 1248 }
1213 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1214 } 1250 }
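Without the added done = 1, breaking out of the inner for loop at max_pages left the enclosing while loop running, so ext4_num_dirty_pages() could keep pulling pagevecs and overshoot its budget. A condensed sketch of the corrected loop (per-page contiguity checks elided):

        pagevec_init(&pvec, 0);
        while (!done) {
                nr_pages = pagevec_lookup_tag(&pvec, mapping, &idx,
                                              PAGECACHE_TAG_DIRTY,
                                              (pgoff_t)PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        /* ... skip non-contiguous or unsuitable pages ... */
                        if (++num >= max_pages) {
                                done = 1;       /* stop the outer loop too */
                                break;
                        }
                }
                pagevec_release(&pvec);
        }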
@@ -1305,7 +1341,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1305 * avoid double accounting 1341 * avoid double accounting
1306 */ 1342 */
1307 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1308 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1309 /* 1345 /*
1310 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1311 * could have changed the inode type in between 1347 * could have changed the inode type in between
@@ -1335,7 +1371,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1335 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1336 } 1372 }
1337 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1339 1375
1340 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
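The two hunks above retire the int field EXT4_I(inode)->i_delalloc_reserved_flag in favour of one bit, EXT4_STATE_DELALLOC_RESERVED, in the inode's dynamic-state mask. The helpers are assumed to be thin wrappers over atomic bitops, roughly as below (the real versions in fs/ext4/ext4.h also cope with 32/64-bit packing); an atomic bit avoids a racy read-modify-write on a plain int and keeps all transient inode state in one word.

        static inline void ext4_set_inode_state(struct inode *inode, int bit)
        {
                set_bit(bit, &EXT4_I(inode)->i_state_flags);
        }

        static inline void ext4_clear_inode_state(struct inode *inode, int bit)
        {
                clear_bit(bit, &EXT4_I(inode)->i_state_flags);
        }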
@@ -1538,10 +1574,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1575 return 0;
1540 /* 1576 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1582 * ever write the buffer.
1547 */ 1583 */
@@ -1863,7 +1899,7 @@ static int ext4_journalled_write_end(struct file *file,
1863/* 1899/*
1864 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1865 */ 1901 */
1866static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1902static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1867{ 1903{
1868 int retries = 0; 1904 int retries = 0;
1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1894,7 +1930,7 @@ repeat:
1894 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1895 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1896 */ 1932 */
1897 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1898 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1899 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1900 yield(); 1936 yield();
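As with ext4_new_meta_blocks() earlier, ext4_claim_free_blocks() now takes a trailing flags argument, and the 0 passed here keeps the old reservation semantics. Assumed prototype:

        extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
                                          s64 nblocks, unsigned int flags);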
@@ -1995,16 +2031,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2031 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2033 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2034static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map)
1999{ 2036{
2000 long pages_skipped;
2001 struct pagevec pvec; 2037 struct pagevec pvec;
2002 unsigned long index, end; 2038 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit;
2006 2048
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2051 /*
2009 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,124 +2063,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2063 if (nr_pages == 0)
2021 break; 2064 break;
2022 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0;
2023 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2024 2068
2025 index = page->index; 2069 index = page->index;
2026 if (index > end) 2070 if (index > end)
2027 break; 2071 break;
2072
2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK;
2075 else
2076 len = PAGE_CACHE_SIZE;
2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk);
2082 }
2028 index++; 2083 index++;
2029 2084
2030 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2032 2087
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2088 /*
2042 * In error case, we have to continue because 2089 * If the page does not have buffers (for
2043 * remaining pages are still locked 2090 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on.
2045 */ 2093 */
2046 if (ret == 0) 2094 if (!page_has_buffers(page)) {
2047 ret = err; 2095 if (__block_write_begin(page, 0, len,
2048 } 2096 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2097 skip_page:
2050 } 2098 unlock_page(page);
2051 return ret; 2099 continue;
2052} 2100 }
2053 2101 commit_write = 1;
2054/* 2102 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2056 *
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2103
2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0;
2105 do { 2106 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2107 if (!bh)
2107 break; 2108 goto skip_page;
2108 2109 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2110 (cur_logical <= (map->m_lblk +
2110 2111 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2115 }
2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh);
2122 }
2124 2123
2125 } else if (buffer_mapped(bh)) 2124 /* skip page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2126 skip_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2127 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2128 block_start += bh->b_size;
2130 cur_logical++; 2129 cur_logical++;
2131 pblock++; 2130 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2131 } while (bh != page_bufs);
2132
2133 if (skip_page)
2134 goto skip_page;
2135
2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len);
2139
2140 clear_page_dirty_for_io(page);
2141 /*
2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this
2144 * restriction.
2145 */
2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc);
2151 else
2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc);
2154
2155 if (!err)
2156 mpd->pages_written++;
2157 /*
2158 * In error case, we have to continue because
2159 * remaining pages are still locked
2160 */
2161 if (ret == 0)
2162 ret = err;
2133 } 2163 }
2134 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2135 } 2165 }
2166 ext4_io_submit(&io_submit);
2167 return ret;
2136} 2168}
2137 2169
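The rewritten mpage_da_submit_io() above absorbs both the deleted mpage_put_bnr_to_bhs() and the per-page ->writepage() call: in a single pass it attaches buffers where they are missing, stamps the freshly allocated block numbers into delayed buffers, and submits the I/O itself. A condensed, illustrative flow for one page (the helper name and error handling are ours, not the kernel's):

        static int da_submit_one_page(struct mpage_da_data *mpd,
                                      struct ext4_map_blocks *map,
                                      struct page *page, unsigned int len,
                                      struct ext4_io_submit *io)
        {
                struct inode *inode = mpd->inode;

                /* No buffers yet? Attach them without allocating blocks. */
                if (!page_has_buffers(page) &&
                    __block_write_begin(page, 0, len, noalloc_get_block_write)) {
                        unlock_page(page);      /* stays dirty, retried later */
                        return 0;
                }
                /* ... walk page_buffers(page): clear BH_Delay and install
                 *     map->m_pblk-based block numbers, as in the hunk above ... */
                block_commit_write(page, 0, len);       /* dirty + uptodate */
                clear_page_dirty_for_io(page);
                if (ext4_should_journal_data(inode) && PageChecked(page))
                        return __ext4_journalled_writepage(page, len);
                if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
                        return ext4_bio_write_page(io, page, len, mpd->wbc);
                return block_write_full_page(page, noalloc_get_block_write,
                                             mpd->wbc);
        }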
2138 2170static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt)
2141{ 2171{
2142 int nr_pages, i; 2172 int nr_pages, i;
2143 pgoff_t index, end; 2173 pgoff_t index, end;
@@ -2145,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2145 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2146 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2147 2177
2148 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2178 index = mpd->first_page;
2149 end = (logical + blk_cnt - 1) >> 2179 end = mpd->next_page - 1;
2150 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2151 while (index <= end) { 2180 while (index <= end) {
2152 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2153 if (nr_pages == 0) 2182 if (nr_pages == 0)
@@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2216}
2188 2217
2189/* 2218/*
2190 * mpage_da_map_blocks - go through given space 2219 * mpage_da_map_and_submit - go through given space, map them
2220 * if necessary, and then submit them for I/O
2191 * 2221 *
2192 * @mpd - bh describing space 2222 * @mpd - bh describing space
2193 * 2223 *
2194 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2195 * 2225 *
2196 */ 2226 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2228{
2199 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2230 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2205 2235
2206 /* 2236 /*
2207 * We consider only non-mapped and non-allocated blocks 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage.
2208 */ 2239 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2240 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2244 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2245
2220 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2247 BUG_ON(!handle);
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2232 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2233 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2234 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2235 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2236 * 2262 *
2237 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
@@ -2252,17 +2278,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2278
2253 err = blks; 2279 err = blks;
2254 /* 2280 /*
2255 * If get block returns with error we simply 2281 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2282 * appear to be free blocks, we will just let
2257 * writepages will find the dirty page again 2283 * mpage_da_submit_io() unlock all of the pages.
2258 */ 2284 */
2259 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2260 return 0; 2286 goto submit_io;
2261 2287
2262 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2290 mpd->retval = err;
2265 return 0; 2291 goto submit_io;
2266 } 2292 }
2267 2293
2268 /* 2294 /*
@@ -2285,12 +2311,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2285 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2286 } 2312 }
2287 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2314 ext4_da_block_invalidatepages(mpd);
2289 mpd->b_size >> mpd->inode->i_blkbits); 2315
2290 return err; 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1;
2318 return;
2291 } 2319 }
2292 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2293 2321
2322 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2325 int i;
@@ -2299,18 +2328,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2329 }
2301 2330
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2333 if (err)
2313 return err; 2334 /* This only happens if the journal is aborted */
2335 return;
2314 } 2336 }
2315 2337
2316 /* 2338 /*
@@ -2321,10 +2343,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err)
2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino);
2325 } 2351 }
2326 2352
2327 return 0; 2353submit_io:
2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1;
2328} 2356}
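The rename from mpage_da_map_blocks() to mpage_da_map_and_submit() reflects a new contract: instead of returning 0/err and leaving the caller to invoke mpage_da_submit_io(), the function always ends by either submitting or (on hard allocation failure) invalidating the accumulated extent, and sets mpd->io_done itself. In outline (nothing_to_map() and map_the_extent() are our shorthand for the inline logic above, not real kernel helpers):

        static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
        {
                struct ext4_map_blocks map, *mapp = NULL;

                if (nothing_to_map(mpd))
                        goto submit_io;         /* already mapped, or empty */
                if (map_the_extent(mpd, &map) < 0) {
                        ext4_da_block_invalidatepages(mpd);
                        mpd->io_done = 1;       /* range has been dealt with */
                        return;
                }
                mapp = &map;
        submit_io:
                mpage_da_submit_io(mpd, mapp);  /* writes and unlocks pages */
                mpd->io_done = 1;
        }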
2329 2357
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2429,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2403 */ 2431 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2432 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2433 return;
2408} 2434}
2409 2435
@@ -2413,104 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2413} 2439}
2414 2440
2415/* 2441/*
2416 * __mpage_da_writepage - finds extent of pages and blocks
2417 *
2418 * @page: page to consider
2419 * @wbc: not used, we just follow rules
2420 * @data: context
2421 *
2422 * The function finds extents of pages and scan them for all blocks.
2423 */
2424static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data)
2426{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head;
2430 sector_t logical;
2431
2432 /*
2433 * Can we merge this page to current extent?
2434 */
2435 if (mpd->next_page != page->index) {
2436 /*
2437 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage()
2439 */
2440 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0)
2442 mpage_da_submit_io(mpd);
2443 /*
2444 * skip rest of the page in the page_vec
2445 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL;
2450 }
2451
2452 /*
2453 * Start next extent of pages ...
2454 */
2455 mpd->first_page = page->index;
2456
2457 /*
2458 * ... and blocks
2459 */
2460 mpd->b_size = 0;
2461 mpd->b_state = 0;
2462 mpd->b_blocknr = 0;
2463 }
2464
2465 mpd->next_page = page->index + 1;
2466 logical = (sector_t) page->index <<
2467 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2468
2469 if (!page_has_buffers(page)) {
2470 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2471 (1 << BH_Dirty) | (1 << BH_Uptodate));
2472 if (mpd->io_done)
2473 return MPAGE_DA_EXTENT_TAIL;
2474 } else {
2475 /*
2476 * Page with regular buffer heads, just add all dirty ones
2477 */
2478 head = page_buffers(page);
2479 bh = head;
2480 do {
2481 BUG_ON(buffer_locked(bh));
2482 /*
2483 * We need to try to allocate
2484 * unmapped blocks in the same page.
2485 * Otherwise we won't make progress
2486 * with the page in ext4_writepage
2487 */
2488 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2489 mpage_add_bh_to_extent(mpd, logical,
2490 bh->b_size,
2491 bh->b_state);
2492 if (mpd->io_done)
2493 return MPAGE_DA_EXTENT_TAIL;
2494 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2495 /*
2496 * mapped dirty buffer. We need to update
2497 * the b_state because we look at
2498 * b_state in mpage_da_map_blocks. We don't
2499 * update b_size because if we find an
2500 * unmapped buffer_head later we need to
2501 * use the b_state flag of that buffer_head.
2502 */
2503 if (mpd->b_size == 0)
2504 mpd->b_state = bh->b_state & BH_FLAGS;
2505 }
2506 logical++;
2507 } while ((bh = bh->b_this_page) != head);
2508 }
2509
2510 return 0;
2511}
2512
2513/*
2514 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2515 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2516 * reserve space for a single block. 2444 * reserve space for a single block.
@@ -2550,8 +2478,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2552 /* 2480 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2482 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2484 if (ret)
@@ -2583,7 +2510,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2510/*
2584 * This function is used as a standard get_block_t callback function 2511 * This function is used as a standard get_block_t callback function
2585 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2588 * 2515 *
2589 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2550,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2550 int ret = 0;
2624 int err; 2551 int err;
2625 2552
2553 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
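Moving ClearPageChecked() into __ext4_journalled_writepage() means every caller, both ext4_writepage() below and the reworked delalloc submit path above, gets the flag transition for free. The caller-side contrast, taken from this diff:

        /* before: each caller cleared the flag itself */
        if (PageChecked(page) && ext4_should_journal_data(inode)) {
                ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
        }

        /* after: the helper owns the flag transition */
        if (PageChecked(page) && ext4_should_journal_data(inode))
                return __ext4_journalled_writepage(page, len);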
@@ -2661,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2661 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2662 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2663 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2664 * we are writing back data modified via mmap(), noone guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2665 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2666 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2667 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
@@ -2700,84 +2628,57 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2628static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2629 struct writeback_control *wbc)
2702{ 2630{
2703 int ret = 0; 2631 int ret = 0, commit_write = 0;
2704 loff_t size; 2632 loff_t size;
2705 unsigned int len; 2633 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2634 struct buffer_head *page_bufs = NULL;
2707 struct inode *inode = page->mapping->host; 2635 struct inode *inode = page->mapping->host;
2708 2636
2709 trace_ext4_writepage(inode, page); 2637 trace_ext4_writepage(page);
2710 size = i_size_read(inode); 2638 size = i_size_read(inode);
2711 if (page->index == size >> PAGE_CACHE_SHIFT) 2639 if (page->index == size >> PAGE_CACHE_SHIFT)
2712 len = size & ~PAGE_CACHE_MASK; 2640 len = size & ~PAGE_CACHE_MASK;
2713 else 2641 else
2714 len = PAGE_CACHE_SIZE; 2642 len = PAGE_CACHE_SIZE;
2715 2643
2716 if (page_has_buffers(page)) { 2644 /*
2717 page_bufs = page_buffers(page); 2645 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2646 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2647 * fails, redirty the page and move on.
2720 /* 2648 */
2721 * We don't want to do block allocation 2649 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2650 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2651 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2652 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2653 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2654 unlock_page(page);
2730 return 0; 2655 return 0;
2731 } 2656 }
2732 } else { 2657 commit_write = 1;
2658 }
2659 page_bufs = page_buffers(page);
2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2661 ext4_bh_delay_or_unwritten)) {
2733 /* 2662 /*
2734 * The test for page_has_buffers() is subtle: 2663 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2664 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2665 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2666 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2667 */
2746 ret = block_prepare_write(page, 0, len, 2668 goto redirty_page;
2747 noalloc_get_block_write); 2669 }
2748 if (!ret) { 2670 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redirty the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2671 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2672 block_commit_write(page, 0, len);
2769 }
2770 2673
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2674 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2675 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2676 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2677 * doesn't seem much point in redirtying the page here.
2775 */ 2678 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2679 return __ext4_journalled_writepage(page, len);
2778 }
2779 2680
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2681 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2682 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2683 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2684 wbc, ext4_end_io_buffer_write);
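One subtlety in the rewritten ext4_writepage(): the redirty_page label sits inside the if (!page_has_buffers(page)) block, and the later delay/unwritten check jumps back into it. Jumping into a block is legal C (no variable-length-array scope is crossed) and keeps the redirty/unlock/return sequence in one place:

        if (!page_has_buffers(page)) {
                if (__block_write_begin(page, 0, len, noalloc_get_block_write)) {
        redirty_page:           /* also reached by the goto below */
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
                commit_write = 1;
        }
        page_bufs = page_buffers(page);
        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                              ext4_bh_delay_or_unwritten))
                goto redirty_page;      /* never allocate from writepage */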
@@ -2790,7 +2691,7 @@ static int ext4_writepage(struct page *page,
2790 2691
2791/* 2692/*
2792 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2793 * calulate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2794 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2795 * ext4_da_writepages() will loop calling this before 2696 * ext4_da_writepages() will loop calling this before
2796 * the block allocation. 2697 * the block allocation.
@@ -2815,37 +2716,42 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2815 2716
2816/* 2717/*
2817 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2818 * address space and call the callback function (which usually writes 2719 * address space and accumulate pages that need writing, and call
2819 * the pages). 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2820 * 2721 * and then write them.
2821 * This is a forked version of write_cache_pages(). Differences:
2822 * Range cyclic is ignored.
2823 * no_nrwrite_index_update is always presumed true
2824 */ 2722 */
2825static int write_cache_pages_da(struct address_space *mapping, 2723static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2725 struct mpage_da_data *mpd,
2726 pgoff_t *done_index)
2828{ 2727{
2829 int ret = 0; 2728 struct buffer_head *bh, *head;
2830 int done = 0; 2729 struct inode *inode = mapping->host;
2831 struct pagevec pvec; 2730 struct pagevec pvec;
2832 int nr_pages; 2731 unsigned int nr_pages;
2833 pgoff_t index; 2732 sector_t logical;
2834 pgoff_t end; /* Inclusive */ 2733 pgoff_t index, end;
2835 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2836 2735 int i, tag, ret = 0;
2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc;
2739 mpd->inode = inode;
2837 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2743
2841 while (!done && (index <= end)) { 2744 if (wbc->sync_mode == WB_SYNC_ALL)
2842 int i; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else
2747 tag = PAGECACHE_TAG_DIRTY;
2843 2748
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2749 *done_index = index;
2845 PAGECACHE_TAG_DIRTY, 2750 while (index <= end) {
2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2753 if (nr_pages == 0)
2848 break; 2754 return 0;
2849 2755
2850 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2851 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
@@ -2857,58 +2763,98 @@ static int write_cache_pages_da(struct address_space *mapping,
2857 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2858 * because we have a reference on the page. 2764 * because we have a reference on the page.
2859 */ 2765 */
2860 if (page->index > end) { 2766 if (page->index > end)
2861 done = 1; 2767 goto out;
2862 break; 2768
2769 *done_index = page->index + 1;
2770
2771 /*
2772 * If we can't merge this page, and we have
 2773 * accumulated a contiguous region, write it
2774 */
2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail;
2863 } 2779 }
2864 2780
2865 lock_page(page); 2781 lock_page(page);
2866 2782
2867 /* 2783 /*
2868 * Page truncated or invalidated. We can freely skip it 2784 * If the page is no longer dirty, or its
2869 * then, even for data integrity operations: the page 2785 * mapping no longer corresponds to the inode we
2870 * has disappeared concurrently, so there could be no 2786 * are writing (which means it has been
2871 * real expectation of this data integrity operation 2787 * truncated or invalidated), or the page is
2872 * even if there is now a new, dirty page at the same 2788 * already under writeback and we are not
2873 * pagecache address. 2789 * doing a data integrity writeback, skip the page
2874 */ 2790 */
2875 if (unlikely(page->mapping != mapping)) { 2791 if (!PageDirty(page) ||
2876continue_unlock: 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) {
2877 unlock_page(page); 2795 unlock_page(page);
2878 continue; 2796 continue;
2879 } 2797 }
2880 2798
2881 if (!PageDirty(page)) { 2799 wait_on_page_writeback(page);
2882 /* someone wrote it for us */
2883 goto continue_unlock;
2884 }
2885
2886 if (PageWriteback(page)) {
2887 if (wbc->sync_mode != WB_SYNC_NONE)
2888 wait_on_page_writeback(page);
2889 else
2890 goto continue_unlock;
2891 }
2892
2893 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2894 if (!clear_page_dirty_for_io(page))
2895 goto continue_unlock;
2896 2801
2897 ret = __mpage_da_writepage(page, wbc, mpd); 2802 if (mpd->next_page != page->index)
2898 if (unlikely(ret)) { 2803 mpd->first_page = page->index;
2899 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2804 mpd->next_page = page->index + 1;
2900 unlock_page(page); 2805 logical = (sector_t) page->index <<
2901 ret = 0; 2806 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2902 } else { 2807
2903 done = 1; 2808 if (!page_has_buffers(page)) {
2904 break; 2809 mpage_add_bh_to_extent(mpd, logical,
2905 } 2810 PAGE_CACHE_SIZE,
2811 (1 << BH_Dirty) | (1 << BH_Uptodate));
2812 if (mpd->io_done)
2813 goto ret_extent_tail;
2814 } else {
2815 /*
2816 * Page with regular buffer heads,
2817 * just add all dirty ones
2818 */
2819 head = page_buffers(page);
2820 bh = head;
2821 do {
2822 BUG_ON(buffer_locked(bh));
2823 /*
2824 * We need to try to allocate
2825 * unmapped blocks in the same page.
2826 * Otherwise we won't make progress
2827 * with the page in ext4_writepage
2828 */
2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2830 mpage_add_bh_to_extent(mpd, logical,
2831 bh->b_size,
2832 bh->b_state);
2833 if (mpd->io_done)
2834 goto ret_extent_tail;
2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2836 /*
2837 * mapped dirty buffer. We need
2838 * to update the b_state
2839 * because we look at b_state
 2840 * in mpage_da_map_and_submit(). We
2841 * don't update b_size because
2842 * if we find an unmapped
2843 * buffer_head later we need to
2844 * use the b_state flag of that
2845 * buffer_head.
2846 */
2847 if (mpd->b_size == 0)
2848 mpd->b_state = bh->b_state & BH_FLAGS;
2849 }
2850 logical++;
2851 } while ((bh = bh->b_this_page) != head);
2906 } 2852 }
2907 2853
2908 if (nr_to_write > 0) { 2854 if (nr_to_write > 0) {
2909 nr_to_write--; 2855 nr_to_write--;
2910 if (nr_to_write == 0 && 2856 if (nr_to_write == 0 &&
2911 wbc->sync_mode == WB_SYNC_NONE) { 2857 wbc->sync_mode == WB_SYNC_NONE)
2912 /* 2858 /*
2913 * We stop writing back only if we are 2859 * We stop writing back only if we are
2914 * not doing integrity sync. In case of 2860 * not doing integrity sync. In case of
@@ -2919,14 +2865,18 @@ continue_unlock:
2919 * pages, but have not synced all of the 2865 * pages, but have not synced all of the
2920 * old dirty pages. 2866 * old dirty pages.
2921 */ 2867 */
2922 done = 1; 2868 goto out;
2923 break;
2924 }
2925 } 2869 }
2926 } 2870 }
2927 pagevec_release(&pvec); 2871 pagevec_release(&pvec);
2928 cond_resched(); 2872 cond_resched();
2929 } 2873 }
2874 return 0;
2875ret_extent_tail:
2876 ret = MPAGE_DA_EXTENT_TAIL;
2877out:
2878 pagevec_release(&pvec);
2879 cond_resched();
2930 return ret; 2880 return ret;
2931} 2881}
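write_cache_pages_da() now follows the tagged-writeback protocol: for WB_SYNC_ALL the pages in the range are first stamped with PAGECACHE_TAG_TOWRITE (via the tag_pages_for_writeback() call added in ext4_da_writepages() below) and the lookup walks that tag instead of TAG_DIRTY, so pages dirtied after the sync started cannot make an integrity writeback livelock. The protocol in miniature (index, end, pvec as declared in the function above):

        int tag;

        if (wbc->sync_mode == WB_SYNC_ALL) {
                tag_pages_for_writeback(mapping, index, end);   /* snapshot */
                tag = PAGECACHE_TAG_TOWRITE;
        } else {
                tag = PAGECACHE_TAG_DIRTY;
        }
        nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);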
2932 2882
@@ -2940,13 +2890,14 @@ static int ext4_da_writepages(struct address_space *mapping,
2940 struct mpage_da_data mpd; 2890 struct mpage_da_data mpd;
2941 struct inode *inode = mapping->host; 2891 struct inode *inode = mapping->host;
2942 int pages_written = 0; 2892 int pages_written = 0;
2943 long pages_skipped;
2944 unsigned int max_pages; 2893 unsigned int max_pages;
2945 int range_cyclic, cycled = 1, io_done = 0; 2894 int range_cyclic, cycled = 1, io_done = 0;
2946 int needed_blocks, ret = 0; 2895 int needed_blocks, ret = 0;
2947 long desired_nr_to_write, nr_to_writebump = 0; 2896 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2897 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2899 pgoff_t done_index = 0;
2900 pgoff_t end;
2950 2901
2951 trace_ext4_da_writepages(inode, wbc); 2902 trace_ext4_da_writepages(inode, wbc);
2952 2903
@@ -2982,8 +2933,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2933 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2934 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2935 wbc->range_cyclic = 0;
2985 } else 2936 end = -1;
2937 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2938 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2939 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2940 }
2987 2941
2988 /* 2942 /*
2989 * This works around two forms of stupidity. The first is in 2943 * This works around two forms of stupidity. The first is in
@@ -3002,9 +2956,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 2956 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 2957 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 2959 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 2960 if (wbc->nr_to_write == LONG_MAX)
3007 else 2961 desired_nr_to_write = wbc->nr_to_write;
2962 else
2963 desired_nr_to_write = wbc->nr_to_write * 8;
2964 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 2966 max_pages);
3010 if (desired_nr_to_write > max_pages) 2967 if (desired_nr_to_write > max_pages)
@@ -3015,12 +2972,10 @@ static int ext4_da_writepages(struct address_space *mapping,
3015 wbc->nr_to_write = desired_nr_to_write; 2972 wbc->nr_to_write = desired_nr_to_write;
3016 } 2973 }
3017 2974
3018 mpd.wbc = wbc;
3019 mpd.inode = mapping->host;
3020
3021 pages_skipped = wbc->pages_skipped;
3022
3023retry: 2975retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL)
2977 tag_pages_for_writeback(mapping, index, end);
2978
3024 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
3025 2980
3026 /* 2981 /*
@@ -3043,32 +2998,18 @@ retry:
3043 } 2998 }
3044 2999
3045 /* 3000 /*
3046 * Now call __mpage_da_writepage to find the next 3001 * Now call write_cache_pages_da() to find the next
3047 * contiguous region of logical blocks that need 3002 * contiguous region of logical blocks that need
3048 * blocks to be allocated by ext4. We don't actually 3003 * blocks to be allocated by ext4 and submit them.
3049 * submit the blocks for I/O here, even though
3050 * write_cache_pages thinks it will, and will set the
3051 * pages as clean for write before calling
3052 * __mpage_da_writepage().
3053 */ 3004 */
3054 mpd.b_size = 0; 3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3055 mpd.b_state = 0;
3056 mpd.b_blocknr = 0;
3057 mpd.first_page = 0;
3058 mpd.next_page = 0;
3059 mpd.io_done = 0;
3060 mpd.pages_written = 0;
3061 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd);
3063 /* 3006 /*
3064 * If we have a contiguous extent of pages and we 3007 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3008 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3009 * them for I/O.
3067 */ 3010 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3012 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3013 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3014 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3015 trace_ext4_da_write_pages(inode, &mpd);
@@ -3082,7 +3023,6 @@ retry:
3082 * and try again 3023 * and try again
3083 */ 3024 */
3084 jbd2_journal_force_commit_nested(sbi->s_journal); 3025 jbd2_journal_force_commit_nested(sbi->s_journal);
3085 wbc->pages_skipped = pages_skipped;
3086 ret = 0; 3026 ret = 0;
3087 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3088 /* 3028 /*
@@ -3090,7 +3030,6 @@ retry:
3090 * rest of the pages 3030 * rest of the pages
3091 */ 3031 */
3092 pages_written += mpd.pages_written; 3032 pages_written += mpd.pages_written;
3093 wbc->pages_skipped = pages_skipped;
3094 ret = 0; 3033 ret = 0;
3095 io_done = 1; 3034 io_done = 1;
3096 } else if (wbc->nr_to_write) 3035 } else if (wbc->nr_to_write)
@@ -3108,21 +3047,15 @@ retry:
3108 wbc->range_end = mapping->writeback_index - 1; 3047 wbc->range_end = mapping->writeback_index - 1;
3109 goto retry; 3048 goto retry;
3110 } 3049 }
3111 if (pages_skipped != wbc->pages_skipped)
3112 ext4_msg(inode->i_sb, KERN_CRIT,
3113 "This should not happen leaving %s "
3114 "with nr_to_write = %ld ret = %d",
3115 __func__, wbc->nr_to_write, ret);
3116 3050
3117 /* Update index */ 3051 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3052 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3054 /*
3122 * set the writeback_index so that range_cyclic 3055 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3056 * mode will write it back later
3124 */ 3057 */
3125 mapping->writeback_index = index; 3058 mapping->writeback_index = done_index;
3126 3059
3127out_writepages: 3060out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3061 wbc->nr_to_write -= nr_to_writebump;
@@ -3367,10 +3300,10 @@ int ext4_alloc_da_blocks(struct inode *inode)
3367 * doing I/O at all. 3300 * doing I/O at all.
3368 * 3301 *
3369 * We could call write_cache_pages(), and then redirty all of 3302 * We could call write_cache_pages(), and then redirty all of
3370 * the pages by calling redirty_page_for_writeback() but that 3303 * the pages by calling redirty_page_for_writepage() but that
3371 * would be ugly in the extreme. So instead we would need to 3304 * would be ugly in the extreme. So instead we would need to
3372 * replicate parts of the code in the above functions, 3305 * replicate parts of the code in the above functions,
3373 * simplifying them becuase we wouldn't actually intend to 3306 * simplifying them because we wouldn't actually intend to
3374 * write out the pages, but rather only collect contiguous 3307 * write out the pages, but rather only collect contiguous
3375 * logical block extents, call the multi-block allocator, and 3308 * logical block extents, call the multi-block allocator, and
3376 * then update the buffer heads with the block allocations. 3309 * then update the buffer heads with the block allocations.
@@ -3447,6 +3380,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3447 3380
3448static int ext4_readpage(struct file *file, struct page *page) 3381static int ext4_readpage(struct file *file, struct page *page)
3449{ 3382{
3383 trace_ext4_readpage(page);
3450 return mpage_readpage(page, ext4_get_block); 3384 return mpage_readpage(page, ext4_get_block);
3451} 3385}
3452 3386
@@ -3457,15 +3391,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3392}
3459 3393
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3394static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3395{
3471 struct buffer_head *head, *bh; 3396 struct buffer_head *head, *bh;
@@ -3490,6 +3415,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3490{ 3415{
3491 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3492 3417
3418 trace_ext4_invalidatepage(page, offset);
3419
3493 /* 3420 /*
3494 * free any io_end structure allocated for buffers to be discarded 3421 * free any io_end structure allocated for buffers to be discarded
3495 */ 3422 */
@@ -3511,6 +3438,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3511{ 3438{
3512 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3513 3440
3441 trace_ext4_releasepage(page);
3442
3514 WARN_ON(PageChecked(page)); 3443 WARN_ON(PageChecked(page));
3515 if (!page_has_buffers(page)) 3444 if (!page_has_buffers(page))
3516 return 0; 3445 return 0;
@@ -3582,7 +3511,7 @@ retry:
3582 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3583 3512
3584 if (end > isize) 3513 if (end > isize)
3585 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3586 } 3515 }
3587 } 3516 }
3588 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
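Replacing vmtruncate() with ext4_truncate_failed_write() on the short-direct-write path keeps the cleanup of a failed extension inside ext4. We would expect the helper to look roughly like the sketch below, first dropping the pagecache beyond i_size and then the on-disk blocks; the authoritative version lives elsewhere in this file.

        static void ext4_truncate_failed_write(struct inode *inode)
        {
                truncate_inode_pages(inode->i_mapping, inode->i_size);
                ext4_truncate(inode);
        }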
@@ -3642,173 +3571,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3571 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3572}
3644 3573
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten"
3696 "extents to written extents, error is %d"
3697 " io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might need to do the conversion. This function walks through
3744 * the list and converts the related unwritten extents for completed IO
3745 * to written.
3746 * The function returns the number of pending IOs on success.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already be
3769 * about to flush the work corresponding to this io structure.
3770 * It will be upset if it finds that the io structure related
3771 * to the work to be scheduled has been freed.
3772 *
3773 * Thus we need to keep the io structure valid here after the
3774 * conversion has finished. The io structure has a flag to
3775 * avoid double conversion from both fsync and the background
3776 * workqueue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
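
The comment above describes a classic shape: the spinlock cannot be held across ext4_end_io_nolock(), so the walk drops the lock around the call, retakes it, and re-reads the list head. The same shape, recast as a self-contained userspace toy with a mutex standing in for the spinlock (illustration only, not ext4 code):

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void convert(struct node *n)     /* stand-in for ext4_end_io_nolock() */
{
    printf("converting io %d\n", n->id);
}

static void drain(void)
{
    pthread_mutex_lock(&lock);
    while (head) {
        struct node *n = head;

        pthread_mutex_unlock(&lock);    /* the callback may sleep */
        convert(n);
        pthread_mutex_lock(&lock);      /* re-read head after relocking */
        if (head == n)
            head = n->next;             /* dequeue only once converted */
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    struct node b = { NULL, 2 }, a = { &b, 1 };

    head = &a;
    drain();
    return 0;
}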
3789
3790static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3574static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3575 ssize_t size, void *private, int ret,
3814 bool is_async) 3576 bool is_async)
@@ -3828,7 +3590,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3590 size);
3829 3591
3830 /* if not aio dio with unwritten extents, just free io and return */ 3592 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3594 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3595 iocb->private = NULL;
3834out: 3596out:
@@ -3845,14 +3607,14 @@ out:
3845 } 3607 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3609
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list*/ 3610 /* Add the io_end to per-inode completed aio dio list*/
3852 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615
3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3618 iocb->private = NULL;
3857} 3619}
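
The hunk above moves queue_work() to after the list_add_tail(): once queued, the work can run immediately on another CPU, and ext4_end_io_nolock() bails out when the io_end is not yet on the per-inode list, so the insertion has to be published first. The ordering requirement as a runnable userspace toy (a thread stands in for the workqueue; illustration only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *published;                  /* stand-in for the completed_io list */

static void *worker(void *arg)
{
    pthread_mutex_lock(&lock);
    if (published)
        printf("worker found io_end %d\n", *published);
    else
        printf("worker ran before publication -- the race\n");
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    int io_end = 42;

    pthread_mutex_lock(&lock);          /* publish first ... */
    published = &io_end;
    pthread_mutex_unlock(&lock);

    pthread_create(&t, NULL, worker, NULL);     /* ... then schedule */
    pthread_join(t, NULL);
    return 0;
}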
3858 3620
@@ -3873,7 +3635,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3635 goto out;
3874 } 3636 }
3875 3637
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3638 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3639 inode = io_end->inode;
3878 3640
3879 /* Add the io_end to per-inode completed io list*/ 3641 /* Add the io_end to per-inode completed io list*/
@@ -3901,8 +3663,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3901retry: 3663retry:
3902 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3903 if (!io_end) { 3665 if (!io_end) {
3904 if (printk_ratelimit()) 3666 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3905 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3906 schedule(); 3667 schedule();
3907 goto retry; 3668 goto retry;
3908 } 3669 }
@@ -3926,13 +3687,13 @@ retry:
3926 * preallocated extents, and those writes extend the file, no need to 3687 * preallocated extents, and those writes extend the file, no need to
3927 * fall back to buffered IO. 3688 * fall back to buffered IO.
3928 * 3689 *
3929 * For holes, we fallocate those blocks, mark them as unintialized 3690 * For holes, we fallocate those blocks, mark them as uninitialized
3930 * If those blocks were preallocated, we make sure they are split, but 3691 * If those blocks were preallocated, we make sure they are split, but
3931 * still keep the range to write as unintialized. 3692 * still keep the range to write as uninitialized.
3932 * 3693 *
3933 * The unwritten extents will be converted to written when DIO is completed. 3694 * The unwritten extents will be converted to written when DIO is completed.
3934 * For async direct IO, since the IO may still be pending when we return, we 3695 * For async direct IO, since the IO may still be pending when we return, we
3935 * set up an end_io call back function, which will do the convertion 3696 * set up an end_io call back function, which will do the conversion
3936 * when async direct IO completed. 3697 * when async direct IO completed.
3937 * 3698 *
3938 * If the O_DIRECT write will extend the file then add this inode to the 3699 * If the O_DIRECT write will extend the file then add this inode to the
@@ -3955,7 +3716,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3955 * We could direct write to holes and fallocate. 3716 * We could direct write to holes and fallocate.
3956 * 3717 *
3957 * Allocated blocks to fill the hole are marked as uninitialized 3718 * Allocated blocks to fill the hole are marked as uninitialized
3958 * to prevent paralel buffered read to expose the stale data 3719 * to prevent parallel buffered read to expose the stale data
3959 * before DIO complete the data IO. 3720 * before DIO complete the data IO.
3960 * 3721 *
3961 * As to previously fallocated extents, ext4 get_block 3722 * As to previously fallocated extents, ext4 get_block
@@ -4016,7 +3777,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
4016 int err; 3777 int err;
4017 /* 3778 /*
4018 * for non AIO case, since the IO is already 3779 * for non AIO case, since the IO is already
4019 * completed, we could do the convertion right here 3780 * completed, we could do the conversion right here
4020 */ 3781 */
4021 err = ext4_convert_unwritten_extents(inode, 3782 err = ext4_convert_unwritten_extents(inode,
4022 offset, ret); 3783 offset, ret);
@@ -4037,11 +3798,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
4037{ 3798{
4038 struct file *file = iocb->ki_filp; 3799 struct file *file = iocb->ki_filp;
4039 struct inode *inode = file->f_mapping->host; 3800 struct inode *inode = file->f_mapping->host;
3801 ssize_t ret;
4040 3802
3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
4041 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4042 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
4043 3806 else
4044 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3808 trace_ext4_direct_IO_exit(inode, offset,
3809 iov_length(iov, nr_segs), rw, ret);
3810 return ret;
4045} 3811}
4046 3812
4047/* 3813/*
@@ -4067,7 +3833,6 @@ static const struct address_space_operations ext4_ordered_aops = {
4067 .readpage = ext4_readpage, 3833 .readpage = ext4_readpage,
4068 .readpages = ext4_readpages, 3834 .readpages = ext4_readpages,
4069 .writepage = ext4_writepage, 3835 .writepage = ext4_writepage,
4070 .sync_page = block_sync_page,
4071 .write_begin = ext4_write_begin, 3836 .write_begin = ext4_write_begin,
4072 .write_end = ext4_ordered_write_end, 3837 .write_end = ext4_ordered_write_end,
4073 .bmap = ext4_bmap, 3838 .bmap = ext4_bmap,
@@ -4083,7 +3848,6 @@ static const struct address_space_operations ext4_writeback_aops = {
4083 .readpage = ext4_readpage, 3848 .readpage = ext4_readpage,
4084 .readpages = ext4_readpages, 3849 .readpages = ext4_readpages,
4085 .writepage = ext4_writepage, 3850 .writepage = ext4_writepage,
4086 .sync_page = block_sync_page,
4087 .write_begin = ext4_write_begin, 3851 .write_begin = ext4_write_begin,
4088 .write_end = ext4_writeback_write_end, 3852 .write_end = ext4_writeback_write_end,
4089 .bmap = ext4_bmap, 3853 .bmap = ext4_bmap,
@@ -4099,7 +3863,6 @@ static const struct address_space_operations ext4_journalled_aops = {
4099 .readpage = ext4_readpage, 3863 .readpage = ext4_readpage,
4100 .readpages = ext4_readpages, 3864 .readpages = ext4_readpages,
4101 .writepage = ext4_writepage, 3865 .writepage = ext4_writepage,
4102 .sync_page = block_sync_page,
4103 .write_begin = ext4_write_begin, 3866 .write_begin = ext4_write_begin,
4104 .write_end = ext4_journalled_write_end, 3867 .write_end = ext4_journalled_write_end,
4105 .set_page_dirty = ext4_journalled_set_page_dirty, 3868 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -4115,7 +3878,6 @@ static const struct address_space_operations ext4_da_aops = {
4115 .readpages = ext4_readpages, 3878 .readpages = ext4_readpages,
4116 .writepage = ext4_writepage, 3879 .writepage = ext4_writepage,
4117 .writepages = ext4_da_writepages, 3880 .writepages = ext4_da_writepages,
4118 .sync_page = block_sync_page,
4119 .write_begin = ext4_da_write_begin, 3881 .write_begin = ext4_da_write_begin,
4120 .write_end = ext4_da_write_end, 3882 .write_end = ext4_da_write_end,
4121 .bmap = ext4_bmap, 3883 .bmap = ext4_bmap,
@@ -4152,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
4152int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
4153 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
4154{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
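
The wrapper's length computation deserves a worked example: with 4096-byte blocks and from = 10000, from & (blocksize - 1) is 1808, so 4096 - 1808 = 2288 bytes are zeroed, i.e. everything from 'from' to the end of its block (for 4 KiB pages and blocks the page-mask step above yields the same offset). A standalone check, values illustrative:

#include <stdio.h>

int main(void)
{
    unsigned long long from = 10000;
    unsigned blocksize = 4096;
    unsigned offset = from & (blocksize - 1);               /* 1808 */
    unsigned length = blocksize - (offset & (blocksize - 1));

    printf("zero %u bytes at offset %llu\n", length, from); /* 2288 */
    return 0;
}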
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block it will be shortened to the end of the block
3933 * that corresponds to 'from'.
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
4155 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4156 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
4157 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
4158 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
4159 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
4160 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -4167,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
4167 return -EINVAL; 3950 return -EINVAL;
4168 3951
4169 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
4170 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
4171 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4172 3963
4173 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4226,7 +4017,7 @@ int ext4_block_truncate_page(handle_t *handle,
4226 if (ext4_should_journal_data(inode)) { 4017 if (ext4_should_journal_data(inode)) {
4227 err = ext4_handle_dirty_metadata(handle, inode, bh); 4018 err = ext4_handle_dirty_metadata(handle, inode, bh);
4228 } else { 4019 } else {
4229 if (ext4_should_order_data(inode)) 4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4230 err = ext4_jbd2_file_inode(handle, inode); 4021 err = ext4_jbd2_file_inode(handle, inode);
4231 mark_buffer_dirty(bh); 4022 mark_buffer_dirty(bh);
4232 } 4023 }
@@ -4262,7 +4053,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4262 * 4053 *
4263 * When we do truncate() we may have to clean the ends of several 4054 * When we do truncate() we may have to clean the ends of several
4264 * indirect blocks but leave the blocks themselves alive. Block is 4055 * indirect blocks but leave the blocks themselves alive. Block is
4265 * partially truncated if some data below the new i_size is refered 4056 * partially truncated if some data below the new i_size is referred
4266 * from it (and it is on the path to the first completely truncated 4057 * from it (and it is on the path to the first completely truncated
4267 * data block, indeed). We have to free the top of that path along 4058 * data block, indeed). We have to free the top of that path along
4268 * with everything to the right of the path. Since no allocation 4059 * with everything to the right of the path. Since no allocation
@@ -4341,6 +4132,9 @@ no_top:
4341 * 4132 *
4342 * We release `count' blocks on disk, but (last - first) may be greater 4133 * We release `count' blocks on disk, but (last - first) may be greater
4343 * than `count' because there can be holes in there. 4134 * than `count' because there can be holes in there.
4135 *
4136 * Return 0 on success, 1 on invalid block range
4137 * and < 0 on fatal error.
4344 */ 4138 */
4345static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4139static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4346 struct buffer_head *bh, 4140 struct buffer_head *bh,
@@ -4350,6 +4144,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4350{ 4144{
4351 __le32 *p; 4145 __le32 *p;
4352 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4147 int err;
4353 4148
4354 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4355 flags |= EXT4_FREE_BLOCKS_METADATA; 4150 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4365,22 +4160,33 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4365 if (try_to_extend_transaction(handle, inode)) { 4160 if (try_to_extend_transaction(handle, inode)) {
4366 if (bh) { 4161 if (bh) {
4367 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4368 ext4_handle_dirty_metadata(handle, inode, bh); 4163 err = ext4_handle_dirty_metadata(handle, inode, bh);
4164 if (unlikely(err))
4165 goto out_err;
4369 } 4166 }
4370 ext4_mark_inode_dirty(handle, inode); 4167 err = ext4_mark_inode_dirty(handle, inode);
4371 ext4_truncate_restart_trans(handle, inode, 4168 if (unlikely(err))
4372 blocks_for_truncate(inode)); 4169 goto out_err;
4170 err = ext4_truncate_restart_trans(handle, inode,
4171 blocks_for_truncate(inode));
4172 if (unlikely(err))
4173 goto out_err;
4373 if (bh) { 4174 if (bh) {
4374 BUFFER_TRACE(bh, "retaking write access"); 4175 BUFFER_TRACE(bh, "retaking write access");
4375 ext4_journal_get_write_access(handle, bh); 4176 err = ext4_journal_get_write_access(handle, bh);
4177 if (unlikely(err))
4178 goto out_err;
4376 } 4179 }
4377 } 4180 }
4378 4181
4379 for (p = first; p < last; p++) 4182 for (p = first; p < last; p++)
4380 *p = 0; 4183 *p = 0;
4381 4184
4382 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4383 return 0; 4186 return 0;
4187out_err:
4188 ext4_std_error(inode->i_sb, err);
4189 return err;
4384} 4190}
4385 4191
4386/** 4192/**
@@ -4391,7 +4197,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4391 * @first: array of block numbers 4197 * @first: array of block numbers
4392 * @last: points immediately past the end of array 4198 * @last: points immediately past the end of array
4393 * 4199 *
4394 * We are freeing all blocks refered from that array (numbers are stored as 4200 * We are freeing all blocks referred from that array (numbers are stored as
4395 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4396 * 4202 *
4397 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4414,7 +4220,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4414 ext4_fsblk_t nr; /* Current block # */ 4220 ext4_fsblk_t nr; /* Current block # */
4415 __le32 *p; /* Pointer into inode/ind 4221 __le32 *p; /* Pointer into inode/ind
4416 for current block */ 4222 for current block */
4417 int err; 4223 int err = 0;
4418 4224
4419 if (this_bh) { /* For indirect block */ 4225 if (this_bh) { /* For indirect block */
4420 BUFFER_TRACE(this_bh, "get_write_access"); 4226 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4436,9 +4242,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4436 } else if (nr == block_to_free + count) { 4242 } else if (nr == block_to_free + count) {
4437 count++; 4243 count++;
4438 } else { 4244 } else {
4439 if (ext4_clear_blocks(handle, inode, this_bh, 4245 err = ext4_clear_blocks(handle, inode, this_bh,
4440 block_to_free, count, 4246 block_to_free, count,
4441 block_to_free_p, p)) 4247 block_to_free_p, p);
4248 if (err)
4442 break; 4249 break;
4443 block_to_free = nr; 4250 block_to_free = nr;
4444 block_to_free_p = p; 4251 block_to_free_p = p;
@@ -4447,9 +4254,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4447 } 4254 }
4448 } 4255 }
4449 4256
4450 if (count > 0) 4257 if (!err && count > 0)
4451 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4452 count, block_to_free_p, p); 4259 count, block_to_free_p, p);
4260 if (err < 0)
4261 /* fatal error */
4262 return;
4453 4263
4454 if (this_bh) { 4264 if (this_bh) {
4455 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4479,7 +4289,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4479 * @last: pointer immediately past the end of array 4289 * @last: pointer immediately past the end of array
4480 * @depth: depth of the branches to free 4290 * @depth: depth of the branches to free
4481 * 4291 *
4482 * We are freeing all blocks refered from these branches (numbers are 4292 * We are freeing all blocks referred from these branches (numbers are
4483 * stored as little-endian 32-bit) and updating @inode->i_blocks 4293 * stored as little-endian 32-bit) and updating @inode->i_blocks
4484 * appropriately. 4294 * appropriately.
4485 */ 4295 */
@@ -4530,6 +4340,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4530 (__le32 *) bh->b_data, 4340 (__le32 *) bh->b_data,
4531 (__le32 *) bh->b_data + addr_per_block, 4341 (__le32 *) bh->b_data + addr_per_block,
4532 depth); 4342 depth);
4343 brelse(bh);
4533 4344
4534 /* 4345 /*
4535 * Everything below this pointer has been 4346 * Everything below this pointer has been
@@ -4566,7 +4377,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4566 * transaction where the data blocks are 4377 * transaction where the data blocks are
4567 * actually freed. 4378 * actually freed.
4568 */ 4379 */
4569 ext4_free_blocks(handle, inode, 0, nr, 1, 4380 ext4_free_blocks(handle, inode, NULL, nr, 1,
4570 EXT4_FREE_BLOCKS_METADATA| 4381 EXT4_FREE_BLOCKS_METADATA|
4571 EXT4_FREE_BLOCKS_FORGET); 4382 EXT4_FREE_BLOCKS_FORGET);
4572 4383
@@ -4596,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4596 4407
4597int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4598{ 4409{
4599 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4600 return 0;
4601 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4602 return 1; 4411 return 1;
4603 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4608,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4608} 4417}
4609 4418
4610/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
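
This is the backend of hole punching through fallocate(2); the wiring from the syscall down to ext4_punch_hole() is outside this hunk, so take the call chain as read. From userspace the operation looks roughly like this (path illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/tmp/example", O_RDWR);

    if (fd < 0)
        return 1;
    /* punch a 1 MiB hole at offset 4096; non-extent files hit the
     * ENOTSUPP branch above */
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  4096, 1 << 20) != 0)
        return 1;
    close(fd);
    return 0;
}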
4443
4444/*
4611 * ext4_truncate() 4445 * ext4_truncate()
4612 * 4446 *
4613 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4646,10 +4480,12 @@ void ext4_truncate(struct inode *inode)
4646 Indirect chain[4]; 4480 Indirect chain[4];
4647 Indirect *partial; 4481 Indirect *partial;
4648 __le32 nr = 0; 4482 __le32 nr = 0;
4649 int n; 4483 int n = 0;
4650 ext4_lblk_t last_block; 4484 ext4_lblk_t last_block, max_block;
4651 unsigned blocksize = inode->i_sb->s_blocksize; 4485 unsigned blocksize = inode->i_sb->s_blocksize;
4652 4486
4487 trace_ext4_truncate_enter(inode);
4488
4653 if (!ext4_can_truncate(inode)) 4489 if (!ext4_can_truncate(inode))
4654 return; 4490 return;
4655 4491
@@ -4660,6 +4496,7 @@ void ext4_truncate(struct inode *inode)
4660 4496
4661 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4662 ext4_ext_truncate(inode); 4498 ext4_ext_truncate(inode);
4499 trace_ext4_truncate_exit(inode);
4663 return; 4500 return;
4664 } 4501 }
4665 4502
@@ -4669,14 +4506,18 @@ void ext4_truncate(struct inode *inode)
4669 4506
4670 last_block = (inode->i_size + blocksize-1) 4507 last_block = (inode->i_size + blocksize-1)
4671 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4672 4511
4673 if (inode->i_size & (blocksize - 1)) 4512 if (inode->i_size & (blocksize - 1))
4674 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4675 goto out_stop; 4514 goto out_stop;
4676 4515
4677 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4516 if (last_block != max_block) {
4678 if (n == 0) 4517 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4679 goto out_stop; /* error */ 4518 if (n == 0)
4519 goto out_stop; /* error */
4520 }
4680 4521
4681 /* 4522 /*
4682 * OK. This truncate is going to happen. We add the inode to the 4523 * OK. This truncate is going to happen. We add the inode to the
@@ -4707,7 +4548,13 @@ void ext4_truncate(struct inode *inode)
4707 */ 4548 */
4708 ei->i_disksize = inode->i_size; 4549 ei->i_disksize = inode->i_size;
4709 4550
4710 if (n == 1) { /* direct blocks */ 4551 if (last_block == max_block) {
4552 /*
4553 * It is unnecessary to free any data blocks if last_block is
4554 * equal to the indirect block limit.
4555 */
4556 goto out_unlock;
4557 } else if (n == 1) { /* direct blocks */
4711 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4712 i_data + EXT4_NDIR_BLOCKS); 4559 i_data + EXT4_NDIR_BLOCKS);
4713 goto do_indirects; 4560 goto do_indirects;
@@ -4767,6 +4614,7 @@ do_indirects:
4767 ; 4614 ;
4768 } 4615 }
4769 4616
4617out_unlock:
4770 up_write(&ei->i_data_sem); 4618 up_write(&ei->i_data_sem);
4771 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4772 ext4_mark_inode_dirty(handle, inode); 4620 ext4_mark_inode_dirty(handle, inode);
@@ -4789,6 +4637,7 @@ out_stop:
4789 ext4_orphan_del(handle, inode); 4637 ext4_orphan_del(handle, inode);
4790 4638
4791 ext4_journal_stop(handle); 4639 ext4_journal_stop(handle);
4640 trace_ext4_truncate_exit(inode);
4792} 4641}
4793 4642
4794/* 4643/*
@@ -4818,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4818 /* 4667 /*
4819 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4820 */ 4669 */
4821 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4822 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4823 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4824 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -4920,6 +4769,7 @@ make_io:
4920 * has in-inode xattrs, or we don't have this inode in memory. 4769 * has in-inode xattrs, or we don't have this inode in memory.
4921 * Read the block from disk. 4770 * Read the block from disk.
4922 */ 4771 */
4772 trace_ext4_load_inode(inode);
4923 get_bh(bh); 4773 get_bh(bh);
4924 bh->b_end_io = end_buffer_read_sync; 4774 bh->b_end_io = end_buffer_read_sync;
4925 submit_bh(READ_META, bh); 4775 submit_bh(READ_META, bh);
@@ -5025,7 +4875,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5025 return inode; 4875 return inode;
5026 4876
5027 ei = EXT4_I(inode); 4877 ei = EXT4_I(inode);
5028 iloc.bh = 0; 4878 iloc.bh = NULL;
5029 4879
5030 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4880 ret = __ext4_get_inode_loc(inode, &iloc, 0);
5031 if (ret < 0) 4881 if (ret < 0)
@@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5040 } 4890 }
5041 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
5042 4892
5043 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
5044 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
5045 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
5046 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5299 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5300 goto out_brelse; 5150 goto out_brelse;
5301 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5302 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5303 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5304 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5305 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5314{
5465 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5468 5319
5469 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
@@ -5510,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5510 5361
5511 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5512 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5513 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5514 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5515 handle_t *handle; 5365 handle_t *handle;
5516 5366
5517 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5519,8 +5369,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5369 error = PTR_ERR(handle);
5520 goto err_out; 5370 goto err_out;
5521 } 5371 }
5522 5372 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5373 error = ext4_orphan_add(handle, inode);
5374 orphan = 1;
5375 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5376 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5377 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5378 if (!error)
@@ -5538,18 +5390,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5390 goto err_out;
5539 } 5391 }
5540 ext4_orphan_del(handle, inode); 5392 ext4_orphan_del(handle, inode);
5393 orphan = 0;
5541 ext4_journal_stop(handle); 5394 ext4_journal_stop(handle);
5542 goto err_out; 5395 goto err_out;
5543 } 5396 }
5544 } 5397 }
5545 /* ext4_truncate will clear the flag */
5546 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5547 ext4_truncate(inode);
5548 } 5398 }
5549 5399
5550 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5551 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5552 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5553 5407
5554 if (!rc) { 5408 if (!rc) {
5555 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5414 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5415 * all, we need to clean up the in-core orphan list manually.
5562 */ 5416 */
5563 if (inode->i_nlink) 5417 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5418 ext4_orphan_del(NULL, inode);
5565 5419
5566 if (!rc && (ia_valid & ATTR_MODE)) 5420 if (!rc && (ia_valid & ATTR_MODE))
@@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5447 * blocks for this file.
5594 */ 5448 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5450
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5452 return 0;
@@ -5608,13 +5460,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5608 /* if nrblocks are contiguous */ 5460 /* if nrblocks are contiguous */
5609 if (chunk) { 5461 if (chunk) {
5610 /* 5462 /*
5611 * With N contiguous data blocks, it need at most 5463 * With N contiguous data blocks, we need at most
5612 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5613 * 2 dindirect blocks 5465 * 2 dindirect blocks, and 1 tindirect block
5614 * 1 tindirect block
5615 */ 5466 */
5616 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5467 return DIV_ROUND_UP(nrblocks,
5617 return indirects + 3; 5468 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5618 } 5469 }
5619 /* 5470 /*
5620 * if nrblocks are not contiguous, worst case, each block touches 5471 * if nrblocks are not contiguous, worst case, each block touches
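
The contiguous case is easy to sanity-check numerically: with 4 KiB blocks an indirect block holds 4096/4 = 1024 addresses (the EXT4_ADDR_PER_BLOCK value, assumed here), so 2048 contiguous blocks cost DIV_ROUND_UP(2048, 1024) + 4 = 6 credits under the new formula. A quick standalone check:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    int addr_per_block = 4096 / 4;      /* one __le32 per mapped block */
    int nrblocks = 2048;
    int credits = DIV_ROUND_UP(nrblocks, addr_per_block) + 4;

    printf("%d metadata credits for %d contiguous blocks\n",
           credits, nrblocks);          /* prints 6 */
    return 0;
}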
@@ -5643,7 +5494,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5494 *
5644 * Also account for superblock, inode, quota and xattr blocks 5495 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5496 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5497static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5498{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5499 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5500 int gdpblocks;
@@ -5688,7 +5539,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5688} 5539}
5689 5540
5690/* 5541/*
5691 * Calulate the total number of credits to reserve to fit 5542 * Calculate the total number of credits to reserve to fit
5692 * the modification of a single page into a single transaction, 5543 * the modification of a single page into a single transaction,
5693 * which may include multiple chunks of block allocations. 5544 * which may include multiple chunks of block allocations.
5694 * 5545 *
@@ -5831,6 +5682,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5682 int err, ret;
5832 5683
5833 might_sleep(); 5684 might_sleep();
5685 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5686 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5687 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5688 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
@@ -5881,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5881 * so would cause a commit on atime updates, which we don't bother doing. 5733 * so would cause a commit on atime updates, which we don't bother doing.
5882 * We handle synchronous inodes at the highest possible level. 5734 * We handle synchronous inodes at the highest possible level.
5883 */ 5735 */
5884void ext4_dirty_inode(struct inode *inode) 5736void ext4_dirty_inode(struct inode *inode, int flags)
5885{ 5737{
5886 handle_t *handle; 5738 handle_t *handle;
5887 5739
@@ -6009,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6009 goto out_unlock; 5861 goto out_unlock;
6010 } 5862 }
6011 ret = 0; 5863 ret = 0;
6012 if (PageMappedToDisk(page)) 5864
6013 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
6014 5871
6015 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
6016 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
6017 else 5874 else
6018 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
6019 5876
6020 lock_page(page);
6021 /* 5877 /*
6022 * return if we have all the buffers mapped. This avoids 5878 * return if we have all the buffers mapped. This avoids
6023 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -6027,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6027 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
6028 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
6029 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
6030 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
6031 goto out_unlock; 5887 return VM_FAULT_LOCKED;
6032 } 5888 }
6033 } 5889 }
6034 unlock_page(page); 5890 unlock_page(page);
@@ -6048,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6048 if (ret < 0) 5904 if (ret < 0)
6049 goto out_unlock; 5905 goto out_unlock;
6050 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
6051out_unlock: 5917out_unlock:
6052 if (ret) 5918 if (ret)
6053 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1bd..808c554e773f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
@@ -331,6 +331,36 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct request_queue *q = bdev_get_queue(sb->s_bdev);
338 struct fstrim_range range;
339 int ret = 0;
340
341 if (!capable(CAP_SYS_ADMIN))
342 return -EPERM;
343
344 if (!blk_queue_discard(q))
345 return -EOPNOTSUPP;
346
347 if (copy_from_user(&range, (struct fstrim_range *)arg,
348 sizeof(range)))
349 return -EFAULT;
350
351 range.minlen = max((unsigned int)range.minlen,
352 q->limits.discard_granularity);
353 ret = ext4_trim_fs(sb, &range);
354 if (ret < 0)
355 return ret;
356
357 if (copy_to_user((struct fstrim_range *)arg, &range,
358 sizeof(range)))
359 return -EFAULT;
360
361 return 0;
362 }
363
334 default: 364 default:
335 return -ENOTTY; 365 return -ENOTTY;
336 } 366 }
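
The new FITRIM case is the kernel half of the interface fstrim(8)-style tools use; a minimal userspace invocation looks like this (mount point illustrative, error handling abbreviated):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    struct fstrim_range range = {
        .start = 0,
        .len = UINT64_MAX,      /* whole filesystem */
        .minlen = 0,            /* raised to the discard granularity above */
    };
    int fd = open("/mnt", O_RDONLY);

    if (fd < 0 || ioctl(fd, FITRIM, &range) != 0)
        return 1;
    /* on return the kernel has updated range.len to the bytes trimmed */
    printf("trimmed %llu bytes\n", (unsigned long long)range.len);
    close(fd);
    return 0;
}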
@@ -397,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
397 return err; 427 return err;
398 } 428 }
399 case EXT4_IOC_MOVE_EXT: 429 case EXT4_IOC_MOVE_EXT:
430 case FITRIM:
400 break; 431 break;
401 default: 432 default:
402 return -ENOIOCTLCMD; 433 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..6ed859d56850 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
92 * between CPUs. It is possible to get scheduled at this point. 92 * between CPUs. It is possible to get scheduled at this point.
93 * 93 *
94 * The locality group prealloc space is used looking at whether we have 94 * The locality group prealloc space is used looking at whether we have
95 * enough free space (pa_free) withing the prealloc space. 95 * enough free space (pa_free) within the prealloc space.
96 * 96 *
97 * If we can't allocate blocks via inode prealloc or/and locality group 97 * If we can't allocate blocks via inode prealloc or/and locality group
98 * prealloc then we look at the buddy cache. The buddy cache is represented 98 * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -338,6 +338,19 @@
338static struct kmem_cache *ext4_pspace_cachep; 338static struct kmem_cache *ext4_pspace_cachep;
339static struct kmem_cache *ext4_ac_cachep; 339static struct kmem_cache *ext4_ac_cachep;
340static struct kmem_cache *ext4_free_ext_cachep; 340static struct kmem_cache *ext4_free_ext_cachep;
341
342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES 8
346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
341static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
342 ext4_group_t group); 355 ext4_group_t group);
343static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -419,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
419 } 432 }
420 433
421 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
422 *max = 1 << (e4b->bd_blkbits + 3); 435 if (order == 0) {
423 if (order == 0) 436 *max = 1 << (e4b->bd_blkbits + 3);
424 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 }
425 439
426 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
427 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -603,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
603 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
604 618
605 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
606 buddy = mb_find_buddy(e4b, 0, &max);
607 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
608 ext4_group_t groupnr; 621 ext4_group_t groupnr;
609 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
@@ -622,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622#define mb_check_buddy(e4b) 635#define mb_check_buddy(e4b)
623#endif 636#endif
624 637
625/* FIXME!! need more doc */ 638/*
639 * Divide the blocks starting from @first with length @len into
640 * smaller chunks with power-of-2 block counts.
641 * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
642 * then increase bb_counters[] for each corresponding chunk size.
643 */
626static void ext4_mb_mark_free_simple(struct super_block *sb, 644static void ext4_mb_mark_free_simple(struct super_block *sb,
627 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
628 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
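
The split the new comment describes bounds every chunk by two things: the alignment of the starting block (ffs) and the remaining length (fls). A free run of 13 blocks starting at block 5 therefore becomes chunks of 1, 2, 8 and 2. A standalone model of just the split; the real function additionally clears buddy bitmap bits and bumps bb_counters[] under the group lock:

#include <stdio.h>
#include <strings.h>            /* ffs() */

static int fls_(unsigned v)     /* highest set bit, 1-based */
{
    int r = 0;

    while (v) {
        r++;
        v >>= 1;
    }
    return r;
}

int main(void)
{
    unsigned first = 5, len = 13;

    while (len > 0) {
        int max = ffs(first) - 1;       /* alignment limit */
        int min = fls_(len) - 1;        /* size limit */
        unsigned chunk;

        if (max >= 0 && max < min)
            min = max;
        chunk = 1u << min;
        printf("chunk of %u at %u\n", chunk, first);
        first += chunk;
        len -= chunk;
    }
    return 0;
}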
@@ -769,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
769 struct inode *inode; 787 struct inode *inode;
770 char *data; 788 char *data;
771 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
772 791
773 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
774 793
@@ -801,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
801 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
802 break; 821 break;
803 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
804 err = -EIO; 835 err = -EIO;
805 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
806 if (desc == NULL) 837 if (desc == NULL)
@@ -853,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
853 } 884 }
854 885
855 /* wait for I/O completion */ 886 /* wait for I/O completion */
856 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
857 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
858 890
859 err = -EIO; 891 err = -EIO;
860 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
861 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
862 goto out; 894 goto out;
863 895
864 err = 0; 896 err = 0;
865 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
866 /* init the page */
867 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
868 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
869 int group; 899 int group;
870 struct ext4_group_info *grinfo;
871 900
872 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
873 if (group >= ngroups) 902 if (group >= ngroups)
874 break; 903 break;
875 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
876 /* 909 /*
877 * data carry information regarding this 910 * data carry information regarding this
878 * particular group in the format specified 911 * particular group in the format specified
@@ -901,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
901 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
902 */ 935 */
903 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
904 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
905 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
906 incore = NULL; 941 incore = NULL;
@@ -930,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
930 965
931out: 966out:
932 if (bh) { 967 if (bh) {
933 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
934 brelse(bh[i]); 969 brelse(bh[i]);
935 if (bh != &bhs) 970 if (bh != &bhs)
936 kfree(bh); 971 kfree(bh);
@@ -939,6 +974,67 @@ out:
939} 974}
940 975
941/* 976/*
977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
978 * on the same buddy page doesn't happen while holding the buddy page lock.
979 * The locked buddy and bitmap pages are returned on the e4b struct. If buddy
980 * and bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
981 */
982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
983 ext4_group_t group, struct ext4_buddy *e4b)
984{
985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
986 int block, pnum, poff;
987 int blocks_per_page;
988 struct page *page;
989
990 e4b->bd_buddy_page = NULL;
991 e4b->bd_bitmap_page = NULL;
992
993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
994 /*
995 * the buddy cache inode stores the block bitmap
996 * and buddy information in consecutive blocks.
997 * So for each group we need two blocks.
998 */
999 block = group * 2;
1000 pnum = block / blocks_per_page;
1001 poff = block % blocks_per_page;
1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1003 if (!page)
1004 return -EIO;
1005 BUG_ON(page->mapping != inode->i_mapping);
1006 e4b->bd_bitmap_page = page;
1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1008
1009 if (blocks_per_page >= 2) {
1010 /* buddy and bitmap are on the same page */
1011 return 0;
1012 }
1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1023}
1024
1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1026{
1027 if (e4b->bd_bitmap_page) {
1028 unlock_page(e4b->bd_bitmap_page);
1029 page_cache_release(e4b->bd_bitmap_page);
1030 }
1031 if (e4b->bd_buddy_page) {
1032 unlock_page(e4b->bd_buddy_page);
1033 page_cache_release(e4b->bd_buddy_page);
1034 }
1035}
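
The indexing above follows from the "two blocks per group" layout: group g's block bitmap is logical block 2g of the buddy-cache inode and its buddy is block 2g+1, so whether the two share a page depends only on blocks_per_page. A worked example (4 KiB page size assumed):

#include <stdio.h>

static void locate(unsigned group, unsigned blocks_per_page)
{
    unsigned block = group * 2;         /* the group's bitmap block */

    printf("group %u: bitmap page %u off %u, buddy page %u off %u\n",
           group,
           block / blocks_per_page, block % blocks_per_page,
           (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
}

int main(void)
{
    locate(3, 4);   /* 1 KiB blocks: both halves land on page 1 */
    locate(3, 1);   /* 4 KiB blocks: bitmap on page 6, buddy on page 7 */
    return 0;
}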
1036
1037/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1038 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when 1039 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine! 1040 * calling this routine!
@@ -947,93 +1043,60 @@ static noinline_for_stack
947int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
948{ 1044{
949 1045
950 int ret = 0;
951 void *bitmap;
952 int blocks_per_page;
953 int block, pnum, poff;
954 int num_grp_locked = 0;
955 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
956 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
957 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
958 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
959 1050
960 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
961 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
962 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
963 /* 1053 /*
964 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
965 * page which map to the group from which we are already 1055 * page which map to the group from which we are already
966 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
967 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
968 * would have taken the alloc_sem lock. 1058 * would have pinned buddy page to page cache.
969 */ 1059 */
970 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
971 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
972 /* 1062 /*
973 * somebody initialized the group 1063 * somebody initialized the group
974 * return without doing anything 1064 * return without doing anything
975 */ 1065 */
976 ret = 0;
977 goto err; 1066 goto err;
978 } 1067 }
979 /* 1068
980 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
981 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
982 * So for each group we need two blocks. 1071 if (ret)
983 */ 1072 goto err;
984 block = group * 2; 1073 if (!PageUptodate(page)) {
985 pnum = block / blocks_per_page;
986 poff = block % blocks_per_page;
987 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
988 if (page) {
989 BUG_ON(page->mapping != inode->i_mapping);
990 ret = ext4_mb_init_cache(page, NULL);
991 if (ret) {
992 unlock_page(page);
993 goto err;
994 }
995 unlock_page(page);
996 }
997 if (page == NULL || !PageUptodate(page)) {
998 ret = -EIO; 1074 ret = -EIO;
999 goto err; 1075 goto err;
1000 } 1076 }
1001 mark_page_accessed(page); 1077 mark_page_accessed(page);
1002 bitmap_page = page;
1003 bitmap = page_address(page) + (poff * sb->s_blocksize);
1004 1078
1005 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1006 block++;
1007 pnum = block / blocks_per_page;
1008 poff = block % blocks_per_page;
1009 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1010 if (page == bitmap_page) {
1011 /* 1080 /*
1012 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1013 * the same page we don't need to force 1082 * the same page we don't need to force
1014 * init the buddy 1083 * init the buddy
1015 */ 1084 */
1016 unlock_page(page); 1085 ret = 0;
1017 } else if (page) { 1086 goto err;
1018 BUG_ON(page->mapping != inode->i_mapping);
1019 ret = ext4_mb_init_cache(page, bitmap);
1020 if (ret) {
1021 unlock_page(page);
1022 goto err;
1023 }
1024 unlock_page(page);
1025 } 1087 }
1026 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1027 ret = -EIO; 1094 ret = -EIO;
1028 goto err; 1095 goto err;
1029 } 1096 }
1030 mark_page_accessed(page); 1097 mark_page_accessed(page);
1031err: 1098err:
1032 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1033 if (bitmap_page)
1034 page_cache_release(bitmap_page);
1035 if (page)
1036 page_cache_release(page);
1037 return ret; 1100 return ret;
1038} 1101}
1039 1102
@@ -1067,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1067 e4b->bd_group = group; 1130 e4b->bd_group = group;
1068 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1069 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1070 e4b->alloc_semp = &grp->alloc_sem;
1071
1072 /* Take the read lock on the group alloc
1073 * sem. This would make sure a parallel
1074 * ext4_mb_init_group happening on other
1075 * groups mapped by the page is blocked
1076 * till we are done with allocation
1077 */
1078repeat_load_buddy:
1079 down_read(e4b->alloc_semp);
1080 1133
1081 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1082 /* we need to check for group need init flag
1083 * with alloc_semp held so that we can be sure
1084 * that new blocks didn't get added to the group
1085 * when we are loading the buddy cache
1086 */
1087 up_read(e4b->alloc_semp);
1088 /* 1135 /*
1089 * we need full data about the group 1136 * we need full data about the group
1090 * to make a good selection 1137 * to make a good selection
@@ -1092,7 +1139,6 @@ repeat_load_buddy:
1092 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1093 if (ret) 1140 if (ret)
1094 return ret; 1141 return ret;
1095 goto repeat_load_buddy;
1096 } 1142 }
1097 1143
1098 /* 1144 /*
@@ -1176,15 +1222,14 @@ repeat_load_buddy:
1176 return 0; 1222 return 0;
1177 1223
1178err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1179 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1180 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1181 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1182 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1183 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1184 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1185
1186 /* Done with the buddy cache */
1187 up_read(e4b->alloc_semp);
1188 return ret; 1233 return ret;
1189} 1234}
1190 1235
@@ -1194,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1194 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1195 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1196 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1197 /* Done with the buddy cache */
1198 if (e4b->alloc_semp)
1199 up_read(e4b->alloc_semp);
1200} 1242}
1201 1243
1202 1244
@@ -1509,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1509 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1510 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1511 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1512 /* on allocation we use ac to track the held semaphore */
1513 ac->alloc_semp = e4b->alloc_semp;
1514 e4b->alloc_semp = NULL;
1515 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1516 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1517 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -1915,84 +1954,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1915 return 0; 1954 return 0;
1916} 1955}
1917 1956
1918/*
1919 * lock the group_info alloc_sem of all the groups
1920 * belonging to the same buddy cache page. This
1921 * makes sure other parallel operations on the buddy
1922 * cache don't happen while holding the buddy cache
1923 * lock
1924 */
1925int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1926{
1927 int i;
1928 int block, pnum;
1929 int blocks_per_page;
1930 int groups_per_page;
1931 ext4_group_t ngroups = ext4_get_groups_count(sb);
1932 ext4_group_t first_group;
1933 struct ext4_group_info *grp;
1934
1935 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1936 /*
1937 * the buddy cache inode stores the block bitmap
1938 * and buddy information in consecutive blocks.
1939 * So for each group we need two blocks.
1940 */
1941 block = group * 2;
1942 pnum = block / blocks_per_page;
1943 first_group = pnum * blocks_per_page / 2;
1944
1945 groups_per_page = blocks_per_page >> 1;
1946 if (groups_per_page == 0)
1947 groups_per_page = 1;
1948 /* read all groups the page covers into the cache */
1949 for (i = 0; i < groups_per_page; i++) {
1950
1951 if ((first_group + i) >= ngroups)
1952 break;
1953 grp = ext4_get_group_info(sb, first_group + i);
1954 /* take all groups' write allocation
1955 * semaphores. This makes sure there is
1956 * no block allocation going on in any
1957 * of those groups
1958 */
1959 down_write_nested(&grp->alloc_sem, i);
1960 }
1961 return i;
1962}
1963
1964void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1965 ext4_group_t group, int locked_group)
1966{
1967 int i;
1968 int block, pnum;
1969 int blocks_per_page;
1970 ext4_group_t first_group;
1971 struct ext4_group_info *grp;
1972
1973 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1974 /*
1975 * the buddy cache inode stores the block bitmap
1976 * and buddy information in consecutive blocks.
1977 * So for each group we need two blocks.
1978 */
1979 block = group * 2;
1980 pnum = block / blocks_per_page;
1981 first_group = pnum * blocks_per_page / 2;
1982 /* release locks on all the groups */
1983 for (i = 0; i < locked_group; i++) {
1984
1985 grp = ext4_get_group_info(sb, first_group + i);
1986 /* release all groups' write allocation
1987 * semaphores. This makes sure there is
1988 * no block allocation going on in any
1989 * of those groups
1990 */
1991 up_write(&grp->alloc_sem);
1992 }
1993
1994}
1995
1996static noinline_for_stack int 1957static noinline_for_stack int
1997ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1958ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1998{ 1959{
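The two helpers deleted above encoded the buddy cache's geometry: every group stores its block bitmap and buddy data as two consecutive blocks in the buddy-cache inode, so one page cache page covers blocks_per_page / 2 groups, and all of them had to be write-locked together. A standalone sketch of that mapping (illustrative only; assumes a 4 KB page):

#include <stdio.h>

/* For a given group, compute the first group sharing its buddy-cache
 * page and how many groups that page covers, mirroring the arithmetic
 * in the removed ext4_mb_get_buddy_cache_lock(). */
static void buddy_page_span(unsigned group, unsigned blocksize,
                            unsigned *first, unsigned *count)
{
        unsigned blocks_per_page = 4096 / blocksize;
        unsigned block = group * 2;             /* two blocks per group */
        unsigned pnum = block / blocks_per_page;
        unsigned per_page = blocks_per_page / 2;

        *first = pnum * blocks_per_page / 2;
        *count = per_page ? per_page : 1;
}

int main(void)
{
        unsigned first, count;

        buddy_page_span(5, 1024, &first, &count);       /* 1 KB blocks */
        printf("groups %u..%u share one buddy-cache page\n",
               first, first + count - 1);
        return 0;
}

With the replacement scheme, locking the one or two pages backing a group's bitmap and buddy blocks serializes initialization against allocation without touching every sibling group's semaphore.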
@@ -2233,15 +2194,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2233 .release = seq_release, 2194 .release = seq_release,
2234}; 2195};
2235 2196
2197static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2198{
2199 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2200 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2201
2202 BUG_ON(!cachep);
2203 return cachep;
2204}
2236 2205
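get_groupinfo_cache() added above selects one of NR_GRPINFO_CACHES slab caches keyed purely by block size. The index arithmetic is trivial but worth spelling out; a sketch, assuming EXT4_MIN_BLOCK_LOG_SIZE == 10 (the 1 KB minimum) as defined in ext4.h:

#include <stdio.h>

/* Map an ext4 block size to its groupinfo slab-cache index. */
static int grpinfo_cache_index(unsigned blocksize)
{
        int bits = 0;

        while ((1u << bits) < blocksize)
                bits++;                 /* order_base_2(blocksize) */
        return bits - 10;               /* 1024 -> 0, 2048 -> 1, 4096 -> 2 */
}

int main(void)
{
        printf("4096-byte blocks use cache index %d\n",
               grpinfo_cache_index(4096));
        return 0;
}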
2237/* Create and initialize ext4_group_info data for the given group. */ 2206/* Create and initialize ext4_group_info data for the given group. */
2238int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2207int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2239 struct ext4_group_desc *desc) 2208 struct ext4_group_desc *desc)
2240{ 2209{
2241 int i, len; 2210 int i;
2242 int metalen = 0; 2211 int metalen = 0;
2243 struct ext4_sb_info *sbi = EXT4_SB(sb); 2212 struct ext4_sb_info *sbi = EXT4_SB(sb);
2244 struct ext4_group_info **meta_group_info; 2213 struct ext4_group_info **meta_group_info;
2214 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2245 2215
2246 /* 2216 /*
2247 * First check if this group is the first of a reserved block. 2217 * First check if this group is the first of a reserved block.
@@ -2261,22 +2231,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2261 meta_group_info; 2231 meta_group_info;
2262 } 2232 }
2263 2233
2264 /*
2265 * calculate the needed size. if you change bb_counters' size,
2266 * don't forget about ext4_mb_generate_buddy()
2267 */
2268 len = offsetof(typeof(**meta_group_info),
2269 bb_counters[sb->s_blocksize_bits + 2]);
2270
2271 meta_group_info = 2234 meta_group_info =
2272 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2235 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2273 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2236 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2274 2237
2275 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2238 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2276 if (meta_group_info[i] == NULL) { 2239 if (meta_group_info[i] == NULL) {
2277 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2240 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2278 goto exit_group_info; 2241 goto exit_group_info;
2279 } 2242 }
2243 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2280 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2244 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2281 &(meta_group_info[i]->bb_state)); 2245 &(meta_group_info[i]->bb_state));
2282 2246
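This hunk trades a kzalloc() of a variable-length ext4_group_info for an allocation from the per-blocksize slab, with memset() preserving the zeroed-memory behaviour. The removed sizing trick, offsetof() over a flexible array member, is a compact way to size a struct plus N trailing counters; a self-contained sketch with a stand-in struct:

#include <stddef.h>
#include <stdio.h>

struct groupinfo_sketch {
        unsigned long bb_state;
        unsigned short bb_counters[];   /* one counter per buddy order */
};

int main(void)
{
        int blocksize_bits = 12;        /* 4 KB blocks */
        /* orders 0 .. blocksize_bits + 1, as in the removed kzalloc() */
        size_t len = offsetof(struct groupinfo_sketch,
                              bb_counters[blocksize_bits + 2]);

        printf("allocation size: %zu bytes\n", len);
        return 0;
}

The same size now lives in ext4_groupinfo_create_slab(), so every group of a given block size draws from one fixed-size cache instead of individually sized kmalloc blocks.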
@@ -2331,6 +2295,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2331 int num_meta_group_infos_max; 2295 int num_meta_group_infos_max;
2332 int array_size; 2296 int array_size;
2333 struct ext4_group_desc *desc; 2297 struct ext4_group_desc *desc;
2298 struct kmem_cache *cachep;
2334 2299
2335 /* This is the number of blocks used by GDT */ 2300 /* This is the number of blocks used by GDT */
2336 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2301 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2363,7 +2328,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2363 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2328 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2364 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2329 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2365 * So a two level scheme suffices for now. */ 2330 * So a two level scheme suffices for now. */
2366 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2331 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2367 if (sbi->s_group_info == NULL) { 2332 if (sbi->s_group_info == NULL) {
2368 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2333 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2369 return -ENOMEM; 2334 return -ENOMEM;
@@ -2373,6 +2338,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2338 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2374 goto err_freesgi; 2339 goto err_freesgi;
2375 } 2340 }
2341 sbi->s_buddy_cache->i_ino = get_next_ino();
2376 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2342 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2377 for (i = 0; i < ngroups; i++) { 2343 for (i = 0; i < ngroups; i++) {
2378 desc = ext4_get_group_desc(sb, i, NULL); 2344 desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2354,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
2388 return 0; 2354 return 0;
2389 2355
2390err_freebuddy: 2356err_freebuddy:
2357 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2391 while (i-- > 0) 2358 while (i-- > 0)
2392 kfree(ext4_get_group_info(sb, i)); 2359 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2393 i = num_meta_group_infos; 2360 i = num_meta_group_infos;
2394 while (i-- > 0) 2361 while (i-- > 0)
2395 kfree(sbi->s_group_info[i]); 2362 kfree(sbi->s_group_info[i]);
@@ -2399,6 +2366,55 @@ err_freesgi:
2399 return -ENOMEM; 2366 return -ENOMEM;
2400} 2367}
2401 2368
2369static void ext4_groupinfo_destroy_slabs(void)
2370{
2371 int i;
2372
2373 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2374 if (ext4_groupinfo_caches[i])
2375 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2376 ext4_groupinfo_caches[i] = NULL;
2377 }
2378}
2379
2380static int ext4_groupinfo_create_slab(size_t size)
2381{
2382 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2383 int slab_size;
2384 int blocksize_bits = order_base_2(size);
2385 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2386 struct kmem_cache *cachep;
2387
2388 if (cache_index >= NR_GRPINFO_CACHES)
2389 return -EINVAL;
2390
2391 if (unlikely(cache_index < 0))
2392 cache_index = 0;
2393
2394 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2395 if (ext4_groupinfo_caches[cache_index]) {
2396 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2397 return 0; /* Already created */
2398 }
2399
2400 slab_size = offsetof(struct ext4_group_info,
2401 bb_counters[blocksize_bits + 2]);
2402
2403 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2404 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2405 NULL);
2406
2407 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2408 if (!cachep) {
2409 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2410 return -ENOMEM;
2411 }
2412
2413 ext4_groupinfo_caches[cache_index] = cachep;
2414
2415 return 0;
2416}
2417
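ext4_groupinfo_create_slab() above uses a check-under-mutex pattern so concurrent mounts needing the same block size create each cache at most once. The shape of the idiom, sketched with a pthreads stand-in (malloc() standing in for kmem_cache_create()):

#include <pthread.h>
#include <stdlib.h>

static void *caches[8];
static pthread_mutex_t create_lock = PTHREAD_MUTEX_INITIALIZER;

/* Create caches[idx] exactly once, even if several callers race. */
static int create_once(int idx, size_t size)
{
        int ret = 0;

        pthread_mutex_lock(&create_lock);
        if (!caches[idx]) {                     /* not created yet */
                caches[idx] = malloc(size);
                if (!caches[idx])
                        ret = -1;
        }
        pthread_mutex_unlock(&create_lock);
        return ret;
}

int main(void)
{
        return create_once(2, 64);
}

In the sketch the mutex makes the existence check and the creation atomic with respect to other callers, which is the property the kernel function needs across simultaneous mounts.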
2402int ext4_mb_init(struct super_block *sb, int needs_recovery) 2418int ext4_mb_init(struct super_block *sb, int needs_recovery)
2403{ 2419{
2404 struct ext4_sb_info *sbi = EXT4_SB(sb); 2420 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2411,16 +2427,21 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2411 2427
2412 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2428 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2413 if (sbi->s_mb_offsets == NULL) { 2429 if (sbi->s_mb_offsets == NULL) {
2414 return -ENOMEM; 2430 ret = -ENOMEM;
2431 goto out;
2415 } 2432 }
2416 2433
2417 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2434 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2418 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2435 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2419 if (sbi->s_mb_maxs == NULL) { 2436 if (sbi->s_mb_maxs == NULL) {
2420 kfree(sbi->s_mb_offsets); 2437 ret = -ENOMEM;
2421 return -ENOMEM; 2438 goto out;
2422 } 2439 }
2423 2440
2441 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2442 if (ret < 0)
2443 goto out;
2444
2424 /* order 0 is regular bitmap */ 2445 /* order 0 is regular bitmap */
2425 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2446 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2426 sbi->s_mb_offsets[0] = 0; 2447 sbi->s_mb_offsets[0] = 0;
@@ -2439,9 +2460,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2439 /* init file for buddy data */ 2460 /* init file for buddy data */
2440 ret = ext4_mb_init_backend(sb); 2461 ret = ext4_mb_init_backend(sb);
2441 if (ret != 0) { 2462 if (ret != 0) {
2442 kfree(sbi->s_mb_offsets); 2463 goto out;
2443 kfree(sbi->s_mb_maxs);
2444 return ret;
2445 } 2464 }
2446 2465
2447 spin_lock_init(&sbi->s_md_lock); 2466 spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2475,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2456 2475
2457 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2476 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2458 if (sbi->s_locality_groups == NULL) { 2477 if (sbi->s_locality_groups == NULL) {
2459 kfree(sbi->s_mb_offsets); 2478 ret = -ENOMEM;
2460 kfree(sbi->s_mb_maxs); 2479 goto out;
2461 return -ENOMEM;
2462 } 2480 }
2463 for_each_possible_cpu(i) { 2481 for_each_possible_cpu(i) {
2464 struct ext4_locality_group *lg; 2482 struct ext4_locality_group *lg;
@@ -2475,7 +2493,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2475 2493
2476 if (sbi->s_journal) 2494 if (sbi->s_journal)
2477 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2495 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2478 return 0; 2496out:
2497 if (ret) {
2498 kfree(sbi->s_mb_offsets);
2499 kfree(sbi->s_mb_maxs);
2500 }
2501 return ret;
2479} 2502}
2480 2503
2481/* needs to be called with the ext4 group lock held */ 2504
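With the out: label in place, every failure in ext4_mb_init() funnels through one unwind site that frees s_mb_offsets and s_mb_maxs, rather than each branch duplicating the kfree() pairs. The idiom in isolation (a sketch; on success the buffers are deliberately kept, as in the kernel function):

#include <stdlib.h>

static int init_sketch(void)
{
        void *offsets = NULL, *maxs = NULL;
        int ret = 0;

        offsets = malloc(64);
        if (!offsets) {
                ret = -1;
                goto out;
        }
        maxs = malloc(64);
        if (!maxs) {
                ret = -1;
                goto out;
        }
        /* ... later fallible steps just set ret and jump to out ... */
out:
        if (ret) {              /* single unwind site for every failure */
                free(offsets);
                free(maxs);
        }
        return ret;
}

int main(void)
{
        return init_sketch();
}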
@@ -2503,6 +2526,7 @@ int ext4_mb_release(struct super_block *sb)
2503 int num_meta_group_infos; 2526 int num_meta_group_infos;
2504 struct ext4_group_info *grinfo; 2527 struct ext4_group_info *grinfo;
2505 struct ext4_sb_info *sbi = EXT4_SB(sb); 2528 struct ext4_sb_info *sbi = EXT4_SB(sb);
2529 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2506 2530
2507 if (sbi->s_group_info) { 2531 if (sbi->s_group_info) {
2508 for (i = 0; i < ngroups; i++) { 2532 for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2537,7 @@ int ext4_mb_release(struct super_block *sb)
2513 ext4_lock_group(sb, i); 2537 ext4_lock_group(sb, i);
2514 ext4_mb_cleanup_pa(grinfo); 2538 ext4_mb_cleanup_pa(grinfo);
2515 ext4_unlock_group(sb, i); 2539 ext4_unlock_group(sb, i);
2516 kfree(grinfo); 2540 kmem_cache_free(cachep, grinfo);
2517 } 2541 }
2518 num_meta_group_infos = (ngroups + 2542 num_meta_group_infos = (ngroups +
2519 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2543 EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,20 +2581,15 @@ int ext4_mb_release(struct super_block *sb)
2557 return 0; 2581 return 0;
2558} 2582}
2559 2583
2560static inline void ext4_issue_discard(struct super_block *sb, 2584static inline int ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count) 2585 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{ 2586{
2563 int ret;
2564 ext4_fsblk_t discard_block; 2587 ext4_fsblk_t discard_block;
2565 2588
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2589 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2590 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2591 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2592 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574} 2593}
2575 2594
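ext4_issue_discard() is reduced to translating a group-relative block into a filesystem-wide one and propagating sb_issue_discard()'s result, so policy for unsupported devices now lives with the callers (sb_issue_discard() itself gained GFP and flag arguments in this kernel). The translation, sketched:

#include <stdio.h>

/* Group-relative to filesystem-wide block number; mirrors
 * block + ext4_group_first_block_no(sb, group). */
static unsigned long long fs_block(unsigned long long first_data_block,
                                   unsigned group, unsigned blocks_per_group,
                                   unsigned offset)
{
        return first_data_block +
               (unsigned long long)group * blocks_per_group + offset;
}

int main(void)
{
        /* 1 KB blocks: first data block is 1, 8192 blocks per group */
        printf("group 3, offset 10 -> block %llu\n",
               fs_block(1, 3, 8192, 10));
        return 0;
}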
2576/* 2595/*
@@ -2594,7 +2613,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2594 2613
2595 if (test_opt(sb, DISCARD)) 2614 if (test_opt(sb, DISCARD))
2596 ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2597 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2598 2617
2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2600 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -2658,28 +2677,22 @@ static void ext4_remove_debugfs_entry(void)
2658 2677
2659#endif 2678#endif
2660 2679
2661int __init init_ext4_mballoc(void) 2680int __init ext4_init_mballoc(void)
2662{ 2681{
2663 ext4_pspace_cachep = 2682 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2664 kmem_cache_create("ext4_prealloc_space", 2683 SLAB_RECLAIM_ACCOUNT);
2665 sizeof(struct ext4_prealloc_space),
2666 0, SLAB_RECLAIM_ACCOUNT, NULL);
2667 if (ext4_pspace_cachep == NULL) 2684 if (ext4_pspace_cachep == NULL)
2668 return -ENOMEM; 2685 return -ENOMEM;
2669 2686
2670 ext4_ac_cachep = 2687 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2671 kmem_cache_create("ext4_alloc_context", 2688 SLAB_RECLAIM_ACCOUNT);
2672 sizeof(struct ext4_allocation_context),
2673 0, SLAB_RECLAIM_ACCOUNT, NULL);
2674 if (ext4_ac_cachep == NULL) { 2689 if (ext4_ac_cachep == NULL) {
2675 kmem_cache_destroy(ext4_pspace_cachep); 2690 kmem_cache_destroy(ext4_pspace_cachep);
2676 return -ENOMEM; 2691 return -ENOMEM;
2677 } 2692 }
2678 2693
2679 ext4_free_ext_cachep = 2694 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2680 kmem_cache_create("ext4_free_block_extents", 2695 SLAB_RECLAIM_ACCOUNT);
2681 sizeof(struct ext4_free_data),
2682 0, SLAB_RECLAIM_ACCOUNT, NULL);
2683 if (ext4_free_ext_cachep == NULL) { 2696 if (ext4_free_ext_cachep == NULL) {
2684 kmem_cache_destroy(ext4_pspace_cachep); 2697 kmem_cache_destroy(ext4_pspace_cachep);
2685 kmem_cache_destroy(ext4_ac_cachep); 2698 kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,7 +2702,7 @@ int __init init_ext4_mballoc(void)
2689 return 0; 2702 return 0;
2690} 2703}
2691 2704
2692void exit_ext4_mballoc(void) 2705void ext4_exit_mballoc(void)
2693{ 2706{
2694 /* 2707 /*
2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2708 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
@@ -2699,6 +2712,7 @@ void exit_ext4_mballoc(void)
2699 kmem_cache_destroy(ext4_pspace_cachep); 2712 kmem_cache_destroy(ext4_pspace_cachep);
2700 kmem_cache_destroy(ext4_ac_cachep); 2713 kmem_cache_destroy(ext4_ac_cachep);
2701 kmem_cache_destroy(ext4_free_ext_cachep); 2714 kmem_cache_destroy(ext4_free_ext_cachep);
2715 ext4_groupinfo_destroy_slabs();
2702 ext4_remove_debugfs_entry(); 2716 ext4_remove_debugfs_entry();
2703} 2717}
2704 2718
@@ -3135,7 +3149,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3135 cur_distance = abs(goal_block - cpa->pa_pstart); 3149 cur_distance = abs(goal_block - cpa->pa_pstart);
3136 new_distance = abs(goal_block - pa->pa_pstart); 3150 new_distance = abs(goal_block - pa->pa_pstart);
3137 3151
3138 if (cur_distance < new_distance) 3152 if (cur_distance <= new_distance)
3139 return cpa; 3153 return cpa;
3140 3154
3141 /* drop the previous reference */ 3155 /* drop the previous reference */
@@ -3535,8 +3549,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3535 */ 3549 */
3536static noinline_for_stack int 3550static noinline_for_stack int
3537ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3551ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3538 struct ext4_prealloc_space *pa, 3552 struct ext4_prealloc_space *pa)
3539 struct ext4_allocation_context *ac)
3540{ 3553{
3541 struct super_block *sb = e4b->bd_sb; 3554 struct super_block *sb = e4b->bd_sb;
3542 struct ext4_sb_info *sbi = EXT4_SB(sb); 3555 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3567,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3554 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3567 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3555 end = bit + pa->pa_len; 3568 end = bit + pa->pa_len;
3556 3569
3557 if (ac) {
3558 ac->ac_sb = sb;
3559 ac->ac_inode = pa->pa_inode;
3560 }
3561
3562 while (bit < end) { 3570 while (bit < end) {
3563 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3571 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3564 if (bit >= end) 3572 if (bit >= end)
@@ -3569,15 +3577,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3569 (unsigned) next - bit, (unsigned) group); 3577 (unsigned) next - bit, (unsigned) group);
3570 free += next - bit; 3578 free += next - bit;
3571 3579
3572 if (ac) { 3580 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3573 ac->ac_b_ex.fe_group = group; 3581 trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
3574 ac->ac_b_ex.fe_start = bit;
3575 ac->ac_b_ex.fe_len = next - bit;
3576 ac->ac_b_ex.fe_logical = 0;
3577 trace_ext4_mballoc_discard(ac);
3578 }
3579
3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3581 next - bit); 3582 next - bit);
3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3583 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3583 bit = next + 1; 3584 bit = next + 1;
@@ -3601,29 +3602,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3601 3602
3602static noinline_for_stack int 3603static noinline_for_stack int
3603ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3604ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3604 struct ext4_prealloc_space *pa, 3605 struct ext4_prealloc_space *pa)
3605 struct ext4_allocation_context *ac)
3606{ 3606{
3607 struct super_block *sb = e4b->bd_sb; 3607 struct super_block *sb = e4b->bd_sb;
3608 ext4_group_t group; 3608 ext4_group_t group;
3609 ext4_grpblk_t bit; 3609 ext4_grpblk_t bit;
3610 3610
3611 trace_ext4_mb_release_group_pa(sb, ac, pa); 3611 trace_ext4_mb_release_group_pa(pa);
3612 BUG_ON(pa->pa_deleted == 0); 3612 BUG_ON(pa->pa_deleted == 0);
3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3617 3617 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3618 if (ac) {
3619 ac->ac_sb = sb;
3620 ac->ac_inode = NULL;
3621 ac->ac_b_ex.fe_group = group;
3622 ac->ac_b_ex.fe_start = bit;
3623 ac->ac_b_ex.fe_len = pa->pa_len;
3624 ac->ac_b_ex.fe_logical = 0;
3625 trace_ext4_mballoc_discard(ac);
3626 }
3627 3618
3628 return 0; 3619 return 0;
3629} 3620}
@@ -3644,7 +3635,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3644 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3635 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3645 struct buffer_head *bitmap_bh = NULL; 3636 struct buffer_head *bitmap_bh = NULL;
3646 struct ext4_prealloc_space *pa, *tmp; 3637 struct ext4_prealloc_space *pa, *tmp;
3647 struct ext4_allocation_context *ac;
3648 struct list_head list; 3638 struct list_head list;
3649 struct ext4_buddy e4b; 3639 struct ext4_buddy e4b;
3650 int err; 3640 int err;
@@ -3673,9 +3663,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3673 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3663 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3674 3664
3675 INIT_LIST_HEAD(&list); 3665 INIT_LIST_HEAD(&list);
3676 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3677 if (ac)
3678 ac->ac_sb = sb;
3679repeat: 3666repeat:
3680 ext4_lock_group(sb, group); 3667 ext4_lock_group(sb, group);
3681 list_for_each_entry_safe(pa, tmp, 3668 list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3717,9 @@ repeat:
3730 spin_unlock(pa->pa_obj_lock); 3717 spin_unlock(pa->pa_obj_lock);
3731 3718
3732 if (pa->pa_type == MB_GROUP_PA) 3719 if (pa->pa_type == MB_GROUP_PA)
3733 ext4_mb_release_group_pa(&e4b, pa, ac); 3720 ext4_mb_release_group_pa(&e4b, pa);
3734 else 3721 else
3735 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3722 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3736 3723
3737 list_del(&pa->u.pa_tmp_list); 3724 list_del(&pa->u.pa_tmp_list);
3738 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3725 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3727,6 @@ repeat:
3740 3727
3741out: 3728out:
3742 ext4_unlock_group(sb, group); 3729 ext4_unlock_group(sb, group);
3743 if (ac)
3744 kmem_cache_free(ext4_ac_cachep, ac);
3745 ext4_mb_unload_buddy(&e4b); 3730 ext4_mb_unload_buddy(&e4b);
3746 put_bh(bitmap_bh); 3731 put_bh(bitmap_bh);
3747 return free; 3732 return free;
@@ -3762,7 +3747,6 @@ void ext4_discard_preallocations(struct inode *inode)
3762 struct super_block *sb = inode->i_sb; 3747 struct super_block *sb = inode->i_sb;
3763 struct buffer_head *bitmap_bh = NULL; 3748 struct buffer_head *bitmap_bh = NULL;
3764 struct ext4_prealloc_space *pa, *tmp; 3749 struct ext4_prealloc_space *pa, *tmp;
3765 struct ext4_allocation_context *ac;
3766 ext4_group_t group = 0; 3750 ext4_group_t group = 0;
3767 struct list_head list; 3751 struct list_head list;
3768 struct ext4_buddy e4b; 3752 struct ext4_buddy e4b;
@@ -3778,11 +3762,6 @@ void ext4_discard_preallocations(struct inode *inode)
3778 3762
3779 INIT_LIST_HEAD(&list); 3763 INIT_LIST_HEAD(&list);
3780 3764
3781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3782 if (ac) {
3783 ac->ac_sb = sb;
3784 ac->ac_inode = inode;
3785 }
3786repeat: 3765repeat:
3787 /* first, collect all pa's in the inode */ 3766 /* first, collect all pa's in the inode */
3788 spin_lock(&ei->i_prealloc_lock); 3767 spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3831,7 @@ repeat:
3852 3831
3853 ext4_lock_group(sb, group); 3832 ext4_lock_group(sb, group);
3854 list_del(&pa->pa_group_list); 3833 list_del(&pa->pa_group_list);
3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3834 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3856 ext4_unlock_group(sb, group); 3835 ext4_unlock_group(sb, group);
3857 3836
3858 ext4_mb_unload_buddy(&e4b); 3837 ext4_mb_unload_buddy(&e4b);
@@ -3861,30 +3840,16 @@ repeat:
3861 list_del(&pa->u.pa_tmp_list); 3840 list_del(&pa->u.pa_tmp_list);
3862 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3841 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3863 } 3842 }
3864 if (ac)
3865 kmem_cache_free(ext4_ac_cachep, ac);
3866} 3843}
3867 3844
3868/*
3869 * finds all preallocated spaces and return blocks being freed to them
3870 * if preallocated space becomes full (no block is used from the space)
3871 * then the function frees space in buddy
3872 * XXX: at the moment, truncate (which is the only way to free blocks)
3873 * discards all preallocations
3874 */
3875static void ext4_mb_return_to_preallocation(struct inode *inode,
3876 struct ext4_buddy *e4b,
3877 sector_t block, int count)
3878{
3879 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
3880}
3881#ifdef CONFIG_EXT4_DEBUG 3845#ifdef CONFIG_EXT4_DEBUG
3882static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3846static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3883{ 3847{
3884 struct super_block *sb = ac->ac_sb; 3848 struct super_block *sb = ac->ac_sb;
3885 ext4_group_t ngroups, i; 3849 ext4_group_t ngroups, i;
3886 3850
3887 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 3851 if (!mb_enable_debug ||
3852 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3888 return; 3853 return;
3889 3854
3890 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3855 printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4060,14 +4025,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 struct ext4_buddy e4b; 4025 struct ext4_buddy e4b;
4061 struct list_head discard_list; 4026 struct list_head discard_list;
4062 struct ext4_prealloc_space *pa, *tmp; 4027 struct ext4_prealloc_space *pa, *tmp;
4063 struct ext4_allocation_context *ac;
4064 4028
4065 mb_debug(1, "discard locality group preallocation\n"); 4029 mb_debug(1, "discard locality group preallocation\n");
4066 4030
4067 INIT_LIST_HEAD(&discard_list); 4031 INIT_LIST_HEAD(&discard_list);
4068 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4069 if (ac)
4070 ac->ac_sb = sb;
4071 4032
4072 spin_lock(&lg->lg_prealloc_lock); 4033 spin_lock(&lg->lg_prealloc_lock);
4073 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4034 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4080,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4119 } 4080 }
4120 ext4_lock_group(sb, group); 4081 ext4_lock_group(sb, group);
4121 list_del(&pa->pa_group_list); 4082 list_del(&pa->pa_group_list);
4122 ext4_mb_release_group_pa(&e4b, pa, ac); 4083 ext4_mb_release_group_pa(&e4b, pa);
4123 ext4_unlock_group(sb, group); 4084 ext4_unlock_group(sb, group);
4124 4085
4125 ext4_mb_unload_buddy(&e4b); 4086 ext4_mb_unload_buddy(&e4b);
4126 list_del(&pa->u.pa_tmp_list); 4087 list_del(&pa->u.pa_tmp_list);
4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4088 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4128 } 4089 }
4129 if (ac)
4130 kmem_cache_free(ext4_ac_cachep, ac);
4131} 4090}
4132 4091
4133/* 4092/*
@@ -4203,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4203 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4204 } 4163 }
4205 } 4164 }
4206 if (ac->alloc_semp)
4207 up_read(ac->alloc_semp);
4208 if (pa) { 4165 if (pa) {
4209 /* 4166 /*
4210 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4211 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4212 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4213 * doesn't grow big. We need to release 4170 * doesn't grow big.
4214 * alloc_semp before calling ext4_mb_add_n_trim()
4215 */ 4171 */
4216 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4217 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4273,14 +4229,16 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4273 * EDQUOT check, as blocks and quotas have been already 4229 * EDQUOT check, as blocks and quotas have been already
4274 * reserved when data being copied into pagecache. 4230 * reserved when data being copied into pagecache.
4275 */ 4231 */
4276 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4232 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4277 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4233 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4278 else { 4234 else {
4279 /* Without delayed allocation we need to verify 4235 /* Without delayed allocation we need to verify
4280 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4281 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4282 */ 4238 */
4283 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4284 /* let others to free the space */ 4242 /* let others to free the space */
4285 yield(); 4243 yield();
4286 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4290,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4290 return 0; 4248 return 0;
4291 } 4249 }
4292 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4293 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4294 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4295 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4296 } 4260 }
4297 inquota = ar->len; 4261 inquota = ar->len;
4298 if (ar->len == 0) { 4262 if (ar->len == 0) {
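This hunk splits quota charging: allocations flagged EXT4_MB_USE_ROOT_BLOCKS charge unconditionally via dquot_alloc_block_nofail(), while ordinary requests shrink one block at a time until the quota admits them, echoing the earlier loop that halves ar->len while the free-block reservation fails. The shrink-until-granted shape, generically (try_reserve() and the cap are stand-ins):

/* Sketch: shrink a request until a fallible reservation admits it.
 * try_reserve() returns 0 on success, like the quota calls above. */
static unsigned reserve_shrinking(unsigned want,
                                  int (*try_reserve)(unsigned))
{
        while (want && try_reserve(want) != 0)
                want--;         /* ar->len-- here; ar->len >>= 1 earlier */
        return want;            /* 0 means not even one unit was granted */
}

static int admit_up_to_8(unsigned want)
{
        return want > 8 ? -1 : 0;       /* stand-in quota limit */
}

int main(void)
{
        return reserve_shrinking(100, admit_up_to_8) == 8 ? 0 : 1;
}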
@@ -4370,7 +4334,8 @@ out:
4370 if (inquota && ar->len < inquota) 4334 if (inquota && ar->len < inquota)
4371 dquot_free_block(ar->inode, inquota - ar->len); 4335 dquot_free_block(ar->inode, inquota - ar->len);
4372 if (!ar->len) { 4336 if (!ar->len) {
4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4337 if (!ext4_test_inode_state(ar->inode,
4338 EXT4_STATE_DELALLOC_RESERVED))
4374 /* release all the reserved blocks if non delalloc */ 4339 /* release all the reserved blocks if non delalloc */
4375 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4340 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4376 reserv_blks); 4341 reserv_blks);
@@ -4483,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 * @inode: inode 4448 * @inode: inode
4484 * @block: start physical block to free 4449 * @block: start physical block to free
4485 * @count: number of blocks to free 4450 * @count: number of blocks to free
4486 * @metadata: Are these metadata blocks 4451 * @flags: flags used by ext4_free_blocks
4487 */ 4452 */
4488void ext4_free_blocks(handle_t *handle, struct inode *inode, 4453void ext4_free_blocks(handle_t *handle, struct inode *inode,
4489 struct buffer_head *bh, ext4_fsblk_t block, 4454 struct buffer_head *bh, ext4_fsblk_t block,
@@ -4491,7 +4456,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4491{ 4456{
4492 struct buffer_head *bitmap_bh = NULL; 4457 struct buffer_head *bitmap_bh = NULL;
4493 struct super_block *sb = inode->i_sb; 4458 struct super_block *sb = inode->i_sb;
4494 struct ext4_allocation_context *ac = NULL;
4495 struct ext4_group_desc *gdp; 4459 struct ext4_group_desc *gdp;
4496 unsigned long freed = 0; 4460 unsigned long freed = 0;
4497 unsigned int overflow; 4461 unsigned int overflow;
@@ -4531,6 +4495,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4531 if (!bh) 4495 if (!bh)
4532 tbh = sb_find_get_block(inode->i_sb, 4496 tbh = sb_find_get_block(inode->i_sb,
4533 block + i); 4497 block + i);
4498 if (unlikely(!tbh))
4499 continue;
4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4500 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4535 inode, tbh, block + i); 4501 inode, tbh, block + i);
4536 } 4502 }
@@ -4546,12 +4512,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4546 if (!ext4_should_writeback_data(inode)) 4512 if (!ext4_should_writeback_data(inode))
4547 flags |= EXT4_FREE_BLOCKS_METADATA; 4513 flags |= EXT4_FREE_BLOCKS_METADATA;
4548 4514
4549 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4550 if (ac) {
4551 ac->ac_inode = inode;
4552 ac->ac_sb = sb;
4553 }
4554
4555do_more: 4515do_more:
4556 overflow = 0; 4516 overflow = 0;
4557 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4517 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4569,7 @@ do_more:
4609 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4569 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4610 } 4570 }
4611#endif 4571#endif
4612 if (ac) { 4572 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4613 ac->ac_b_ex.fe_group = block_group;
4614 ac->ac_b_ex.fe_start = bit;
4615 ac->ac_b_ex.fe_len = count;
4616 trace_ext4_mballoc_free(ac);
4617 }
4618 4573
4619 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4574 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4620 if (err) 4575 if (err)
@@ -4626,7 +4581,11 @@ do_more:
4626 * blocks being freed are metadata. these blocks shouldn't 4581 * blocks being freed are metadata. these blocks shouldn't
4627 * be used until this transaction is committed 4582 * be used until this transaction is committed
4628 */ 4583 */
4629 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4584 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4585 if (!new_entry) {
4586 err = -ENOMEM;
4587 goto error_return;
4588 }
4630 new_entry->start_blk = bit; 4589 new_entry->start_blk = bit;
4631 new_entry->group = block_group; 4590 new_entry->group = block_group;
4632 new_entry->count = count; 4591 new_entry->count = count;
@@ -4643,9 +4602,6 @@ do_more:
4643 ext4_lock_group(sb, block_group); 4602 ext4_lock_group(sb, block_group);
4644 mb_clear_bits(bitmap_bh->b_data, bit, count); 4603 mb_clear_bits(bitmap_bh->b_data, bit, count);
4645 mb_free_blocks(inode, &e4b, bit, count); 4604 mb_free_blocks(inode, &e4b, bit, count);
4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4649 } 4605 }
4650 4606
4651 ret = ext4_free_blks_count(sb, gdp) + count; 4607 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4641,316 @@ error_return:
4685 dquot_free_block(inode, freed); 4641 dquot_free_block(inode, freed);
4686 brelse(bitmap_bh); 4642 brelse(bitmap_bh);
4687 ext4_std_error(sb, err); 4643 ext4_std_error(sb, err);
4688 if (ac)
4689 kmem_cache_free(ext4_ac_cachep, ac);
4690 return; 4644 return;
4691} 4645}
4646
4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
4651 * @block: start physical block to add to the block group
4652 * @count: number of blocks to add
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
4731 * need to update group_info->bb_free and bitmap
4732 * with group lock held. generate_buddy looks at
4733 * them with the group lock held
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
4767
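ext4_add_groupblocks() above relies on ext4_get_group_no_and_offset() to split an absolute block number into a (group, offset) pair before touching the bitmap. The decomposition, sketched with assumed geometry:

#include <stdio.h>

/* Sketch of ext4_get_group_no_and_offset()'s arithmetic. */
static void group_and_offset(unsigned long long block,
                             unsigned long long first_data_block,
                             unsigned blocks_per_group,
                             unsigned *group, unsigned *offset)
{
        unsigned long long rel = block - first_data_block;

        *group = rel / blocks_per_group;
        *offset = rel % blocks_per_group;
}

int main(void)
{
        unsigned group, offset;

        /* 4 KB blocks: first data block 0, 32768 blocks per group */
        group_and_offset(100000ULL, 0, 32768, &group, &offset);
        printf("block 100000 -> group %u, offset %u\n", group, offset);
        return 0;
}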
4768/**
4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4770 * @sb: super block for the file system
4771 * @start: starting block of the free extent in the alloc. group
4772 * @count: number of blocks to TRIM
4773 * @group: alloc. group we are working with
4774 * @e4b: ext4 buddy for the group
4775 *
4776 * Trim "count" blocks starting at "start" in the "group". To ensure that no
4777 * one will allocate those blocks, mark them as used in the buddy bitmap. This
4778 * must be called under the group lock.
4779 */
4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4781 ext4_group_t group, struct ext4_buddy *e4b)
4782{
4783 struct ext4_free_extent ex;
4784
4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4786
4787 ex.fe_start = start;
4788 ex.fe_group = group;
4789 ex.fe_len = count;
4790
4791 /*
4792 * Mark blocks used, so no one can reuse them while
4793 * being trimmed.
4794 */
4795 mb_mark_used(e4b, &ex);
4796 ext4_unlock_group(sb, group);
4797 ext4_issue_discard(sb, group, start, count);
4798 ext4_lock_group(sb, group);
4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4800}
4801
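ext4_trim_extent() above shows the choreography that keeps TRIM safe without sleeping under the group spinlock: mark the run allocated in the buddy, drop the lock, issue the potentially slow discard, retake the lock, and free the run again. The caller holds the lock at entry and exit; a generic sketch of the reserve/unlock/slow-op/relock/release pattern, with pthreads standing in for the group lock:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static int reserved;

static void trim_pattern(void)
{
        pthread_mutex_lock(&group_lock);
        reserved = 1;                   /* mb_mark_used(): blocks unavailable */
        pthread_mutex_unlock(&group_lock);

        usleep(1000);                   /* ext4_issue_discard(): may sleep */

        pthread_mutex_lock(&group_lock);
        reserved = 0;                   /* mb_free_blocks(): available again */
        pthread_mutex_unlock(&group_lock);
}

int main(void)
{
        trim_pattern();
        return 0;
}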
4802/**
4803 * ext4_trim_all_free -- function to trim all free space in alloc. group
4804 * @sb: super block for file system
4805 * @group: allocation group to trim
4806 * @start: first group block to examine
4807 * @max: last group block to examine
4808 * @minblocks: minimum extent block count
4809 *
4810 * ext4_trim_all_free walks through the group's block bitmap searching for
4811 * free extents. When a free extent is found, it is marked as used in the
4812 * group's buddy bitmap and ext4_trim_extent is called to TRIM it; the blocks
4813 * are then freed again in the buddy bitmap. This repeats until the whole
4814 * group has been scanned.
4819 */
4820static ext4_grpblk_t
4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4824{
4825 void *bitmap;
4826 ext4_grpblk_t next, count = 0;
4827 struct ext4_buddy e4b;
4828 int ret;
4829
4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4837
4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4841
4842 while (start < max) {
4843 start = mb_find_next_zero_bit(bitmap, max, start);
4844 if (start >= max)
4845 break;
4846 next = mb_find_next_bit(bitmap, max, start);
4847
4848 if ((next - start) >= minblocks) {
4849 ext4_trim_extent(sb, start,
4850 next - start, group, &e4b);
4851 count += next - start;
4852 }
4853 start = next + 1;
4854
4855 if (fatal_signal_pending(current)) {
4856 count = -ERESTARTSYS;
4857 break;
4858 }
4859
4860 if (need_resched()) {
4861 ext4_unlock_group(sb, group);
4862 cond_resched();
4863 ext4_lock_group(sb, group);
4864 }
4865
4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4867 break;
4868 }
4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4871
4872 ext4_debug("trimmed %d blocks in the group %d\n",
4873 count, group);
4874
4875 return count;
4876}
4877
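The scan loop above alternates mb_find_next_zero_bit() and mb_find_next_bit() to enumerate maximal free runs, trims only runs of at least minblocks, and stops early once the group's remaining free count cannot contain another qualifying run. The same enumeration over a plain byte-per-bit array, as a sketch:

#include <stdio.h>

/* Enumerate maximal runs of zeros in bitmap[0..max). */
static void scan_free_runs(const unsigned char *bitmap, int max,
                           int minblocks)
{
        int start = 0, next;

        while (start < max) {
                while (start < max && bitmap[start])
                        start++;                /* find next zero bit */
                if (start >= max)
                        break;
                next = start;
                while (next < max && !bitmap[next])
                        next++;                 /* find next set bit */
                if (next - start >= minblocks)
                        printf("trim run [%d, %d)\n", start, next);
                start = next + 1;
        }
}

int main(void)
{
        unsigned char bm[] = { 1, 0, 0, 0, 1, 1, 0, 0, 1 };

        scan_free_runs(bm, 9, 2);       /* prints [1, 4) and [6, 8) */
        return 0;
}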
4878/**
4879 * ext4_trim_fs() -- trim ioctl handle function
4880 * @sb: superblock for filesystem
4881 * @range: fstrim_range structure
4882 *
4883 * start: first byte to trim
4884 * len: number of bytes to trim from start
4885 * minlen: minimum extent length in bytes
4886 * ext4_trim_fs goes through all the allocation groups containing bytes from
4887 * start to start+len. For each such group the ext4_trim_all_free function
4888 * is invoked to trim all free space.
4889 */
4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4891{
4892 struct ext4_group_info *grp;
4893 ext4_group_t first_group, last_group;
4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4896 uint64_t start, len, minlen, trimmed = 0;
4897 ext4_fsblk_t first_data_blk =
4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4899 int ret = 0;
4900
4901 start = range->start >> sb->s_blocksize_bits;
4902 len = range->len >> sb->s_blocksize_bits;
4903 minlen = range->minlen >> sb->s_blocksize_bits;
4904
4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4906 return -EINVAL;
4907 if (start < first_data_blk) {
4908 len -= first_data_blk - start;
4909 start = first_data_blk;
4910 }
4911
4912 /* Determine first and last group to examine based on start and len */
4913 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4914 &first_group, &first_block);
4915 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4916 &last_group, &last_block);
4917 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4918 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4919
4920 if (first_group > last_group)
4921 return -EINVAL;
4922
4923 for (group = first_group; group <= last_group; group++) {
4924 grp = ext4_get_group_info(sb, group);
4925 /* We only do this if the grp has never been initialized */
4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4927 ret = ext4_mb_init_group(sb, group);
4928 if (ret)
4929 break;
4930 }
4931
4932 /*
4933 * For all the groups except the last one, last block will
4934 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4935 * change it for the last group in which case start +
4936 * len < EXT4_BLOCKS_PER_GROUP(sb).
4937 */
4938 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4939 last_block = first_block + len;
4940 len -= last_block - first_block;
4941
4942 if (grp->bb_free >= minlen) {
4943 cnt = ext4_trim_all_free(sb, group, first_block,
4944 last_block, minlen);
4945 if (cnt < 0) {
4946 ret = cnt;
4947 break;
4948 }
4949 }
4950 trimmed += cnt;
4951 first_block = 0;
4952 }
4953 range->len = trimmed * sb->s_blocksize;
4954
4955 return ret;
4956}
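ext4_trim_fs() is the filesystem backend for the FITRIM ioctl: user space supplies an fstrim_range in bytes, the kernel converts to blocks, walks the affected groups, and writes the trimmed total back into range->len. A minimal caller (this is the stock FITRIM interface from linux/fs.h, not something this patch invents):

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

        if (fd < 0)
                return 1;
        memset(&range, 0, sizeof(range));
        range.len = ULLONG_MAX;         /* whole filesystem */
        range.minlen = 0;               /* trim every free extent */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n",
                       (unsigned long long)range.len);
        return 0;
}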
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f0..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -169,7 +169,7 @@ struct ext4_allocation_context {
169 /* original request */ 169 /* original request */
170 struct ext4_free_extent ac_o_ex; 170 struct ext4_free_extent ac_o_ex;
171 171
172 /* goal request (after normalization) */ 172 /* goal request (normalized ac_o_ex) */
173 struct ext4_free_extent ac_g_ex; 173 struct ext4_free_extent ac_g_ex;
174 174
175 /* the best found extent */ 175 /* the best found extent */
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle,
263 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
264 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
265 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
266 ext4_free_blocks(handle, inode, 0, 266 ext4_free_blocks(handle, inode, NULL,
267 le32_to_cpu(tmp_idata[i]), 1, 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA | 268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET); 269 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle,
271 } 271 }
272 put_bh(bh); 272 put_bh(bh);
273 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA | 275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET); 276 EXT4_FREE_BLOCKS_FORGET);
277 return 0; 277 return 0;
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle,
302 } 302 }
303 put_bh(bh); 303 put_bh(bh);
304 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 305 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA | 306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET); 307 EXT4_FREE_BLOCKS_FORGET);
308 return 0; 308 return 0;
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
315 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
316 if (i_data[0]) { 316 if (i_data[0]) {
317 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
318 ext4_free_blocks(handle, inode, 0, 318 ext4_free_blocks(handle, inode, NULL,
319 le32_to_cpu(i_data[0]), 1, 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA | 320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET); 321 EXT4_FREE_BLOCKS_FORGET);
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
412 struct buffer_head *bh; 412 struct buffer_head *bh;
413 struct ext4_extent_header *eh; 413 struct ext4_extent_header *eh;
414 414
415 block = idx_pblock(ix); 415 block = ext4_idx_pblock(ix);
416 bh = sb_bread(inode->i_sb, block); 416 bh = sb_bread(inode->i_sb, block);
417 if (!bh) 417 if (!bh)
418 return -EIO; 418 return -EIO;
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
428 } 428 }
429 put_bh(bh); 429 put_bh(bh);
430 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
431 ext4_free_blocks(handle, inode, 0, block, 1, 431 ext4_free_blocks(handle, inode, NULL, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
433 return retval; 433 return retval;
434} 434}
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, 0, goal); 499 S_IFREG, NULL, goal);
500 if (IS_ERR(tmp_inode)) { 500 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 501 retval = -ENOMEM;
502 ext4_journal_stop(handle); 502 ext4_journal_stop(handle);
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
517 * start with one credit accounted for 517 * start with one credit accounted for
518 * superblock modification. 518 * superblock modification.
519 * 519 *
520 * For the tmp_inode we already have commited the 520 * For the tmp_inode we already have committed the
521 * transaction that created the inode. Later, as and 521 * transaction that created the inode. Later, as and
522 * when we add extents we extend the journal 522 * when we add extents we extend the journal
523 */ 523 */
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
74 __ext4_warning(sb, function, line, msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
128 if (retval && (failed_writes % 60) == 0) {
129 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++;
131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
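
The last statement of the loop above is a clamp: the next check interval is EXT4_MMP_CHECK_MULT times the observed write latency, bounded by the MMP minimum and maximum. A standalone sketch of that arithmetic (the constants mirror ext4's defaults but are assumptions here, not read from this tree's headers):

/* Sketch of kmmpd's adaptive check-interval clamp. The constants are
 * assumed (2x multiplier, 5s floor, 300s ceiling), illustrative only. */
#include <stdio.h>

#define MMP_CHECK_MULT          2
#define MMP_MIN_CHECK_INTERVAL  5
#define MMP_MAX_CHECK_INTERVAL  300

static unsigned long clamp_check_interval(unsigned long write_secs)
{
	unsigned long v = MMP_CHECK_MULT * write_secs;

	if (v > MMP_MAX_CHECK_INTERVAL)
		v = MMP_MAX_CHECK_INTERVAL;
	if (v < MMP_MIN_CHECK_INTERVAL)
		v = MMP_MIN_CHECK_INTERVAL;
	return v;
}

int main(void)
{
	/* A fast 1s write keeps the floor; a 400s stall hits the ceiling. */
	printf("%lu\n", clamp_check_interval(1));   /* -> 5 */
	printf("%lu\n", clamp_check_interval(400)); /* -> 300 */
	return 0;
}
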
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
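
mmp_new_seq() is rejection sampling: draw 32 random bits and retry while the value exceeds the bound, which keeps the accepted values uniform. With EXT4_MMP_SEQ_MAX around 0xE24D4D4F (an assumption from the ext4.h of this era), roughly 88% of draws are accepted, so the loop almost never runs twice. A userspace sketch using getrandom(2) in place of get_random_bytes():

/* Rejection sampling for a uniform value in [0, SEQ_MAX]; a userspace
 * sketch of mmp_new_seq(). SEQ_MAX is an assumed value. */
#include <stdint.h>
#include <stdio.h>
#include <sys/random.h>

#define SEQ_MAX 0xE24D4D4Fu

static uint32_t new_seq(void)
{
	uint32_t v;

	do {
		if (getrandom(&v, sizeof(v), 0) != sizeof(v))
			return 1; /* fall back to a fixed in-range value */
	} while (v > SEQ_MAX); /* out of range: redraw, stays uniform */
	return v;
}

int main(void)
{
	printf("seq = %u\n", new_seq());
	return 0;
}
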
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
259 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
260 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * Write a new random sequence number.
297 */
298 mmp->mmp_seq = cpu_to_le32(seq = mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
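Condensed, ext4_multi_mount_protect() is a two-phase handshake: if the block is not marked clean, wait one check interval and fail if the sequence moved; then write a fresh random sequence, wait again, and fail if anyone overwrote it. A standalone sketch of that handshake (the fsck marker and wait-time scaling are omitted; read_seq()/write_seq() are stand-ins for the buffer-head I/O, and the on-disk state is faked with a variable):

/* Sketch of the MMP mount handshake; SEQ_CLEAN is an assumed value. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define SEQ_CLEAN 0xFF4D4D50u

static uint32_t disk_seq = SEQ_CLEAN;        /* the "MMP block" */
static uint32_t read_seq(void)        { return disk_seq; }
static void     write_seq(uint32_t s) { disk_seq = s; }

static int mmp_handshake(unsigned int check_interval)
{
	uint32_t seq = read_seq();

	if (seq != SEQ_CLEAN) {
		/* Not marked clean: wait one interval, watch for a bump. */
		sleep(check_interval);
		if (read_seq() != seq)
			return -1; /* another node is alive on this device */
	}
	/* Claim the device, then confirm nobody overwrote our claim. */
	seq = (uint32_t)random();
	write_seq(seq);
	sleep(check_interval);
	if (read_seq() != seq)
		return -1; /* lost the race */
	return 0; /* safe to mount; kmmpd keeps bumping seq from here */
}

int main(void)
{
	printf("handshake: %d\n", mmp_handshake(1));
	return 0;
}
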
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..f57455a1b1b2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
86 /* leaf block */ 86 /* leaf block */
87 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
89 return 0; 89 return 0;
90 } 90 }
91 91
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
96 96
97 /* index block */ 97 /* index block */
98 path[ppos].p_idx++; 98 path[ppos].p_idx++;
99 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 99 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
100 if (path[ppos+1].p_bh) 100 if (path[ppos+1].p_bh)
101 brelse(path[ppos+1].p_bh); 101 brelse(path[ppos+1].p_bh);
102 path[ppos+1].p_bh = 102 path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
111 path[cur_ppos].p_idx = 111 path[cur_ppos].p_idx =
112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
113 path[cur_ppos].p_block = 113 path[cur_ppos].p_block =
114 idx_pblock(path[cur_ppos].p_idx); 114 ext4_idx_pblock(path[cur_ppos].p_idx);
115 if (path[cur_ppos+1].p_bh) 115 if (path[cur_ppos+1].p_bh)
116 brelse(path[cur_ppos+1].p_bh); 116 brelse(path[cur_ppos+1].p_bh);
117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
133 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block = 135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext); 136 ext4_ext_pblock(path[leaf_ppos].p_ext);
137 return 0; 137 return 0;
138 } 138 }
139 } 139 }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
249 */ 249 */
250 o_end->ee_block = end_ext->ee_block; 250 o_end->ee_block = end_ext->ee_block;
251 o_end->ee_len = end_ext->ee_len; 251 o_end->ee_len = end_ext->ee_len;
252 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 252 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
253 } 253 }
254 254
255 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
276 */ 276 */
277 o_end->ee_block = end_ext->ee_block; 277 o_end->ee_block = end_ext->ee_block;
278 o_end->ee_len = end_ext->ee_len; 278 o_end->ee_len = end_ext->ee_len;
279 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 279 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
280 280
281 /* 281 /*
282 * Set 0 to the extent block if new_ext was 282 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
361 /* Insert new entry */ 361 /* Insert new entry */
362 if (new_ext->ee_len) { 362 if (new_ext->ee_len) {
363 o_start[i] = *new_ext; 363 o_start[i] = *new_ext;
364 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); 364 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
365 } 365 }
366 366
367 /* Insert end entry */ 367 /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
488 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
489 489
490 new_ext.ee_block = cpu_to_le32(*from); 490 new_ext.ee_block = cpu_to_le32(*from);
491 ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); 491 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
492 new_ext.ee_len = dext->ee_len; 492 new_ext.ee_len = dext->ee_len;
493 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 493 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
553 copy_extent_status(oext, &end_ext); 553 copy_extent_status(oext, &end_ext);
554 end_ext_alen = ext4_ext_get_actual_len(&end_ext); 554 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
555 ext4_ext_store_pblock(&end_ext, 555 ext4_ext_store_pblock(&end_ext,
556 (ext_pblock(o_end) + oext_alen - end_ext_alen)); 556 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
557 end_ext.ee_block = 557 end_ext.ee_block =
558 cpu_to_le32(le32_to_cpu(o_end->ee_block) + 558 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
559 oext_alen - end_ext_alen); 559 oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 /* When tmp_dext is too large, pick up the target range. */ 604 /* When tmp_dext is too large, pick up the target range. */
605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
606 606
607 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); 607 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
608 tmp_dext->ee_block = 608 tmp_dext->ee_block =
609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
613 tmp_dext->ee_len = cpu_to_le16(max_count); 613 tmp_dext->ee_len = cpu_to_le16(max_count);
614 614
615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
616 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); 616 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
617 617
618 /* Adjust extent length if donor extent is larger than orig */ 618 /* Adjust extent length if donor extent is larger than orig */
619 if (ext4_ext_get_actual_len(tmp_dext) > 619 if (ext4_ext_get_actual_len(tmp_dext) >
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
@@ -1003,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode,
1003 return -EINVAL; 1002 return -EINVAL;
1004 } 1003 }
1005 1004
1006 if ((orig_start > EXT_MAX_BLOCK) || 1005 if ((orig_start >= EXT_MAX_BLOCKS) ||
1007 (donor_start > EXT_MAX_BLOCK) || 1006 (donor_start >= EXT_MAX_BLOCKS) ||
1008 (*len > EXT_MAX_BLOCK) || 1007 (*len > EXT_MAX_BLOCKS) ||
1009 (orig_start + *len > EXT_MAX_BLOCK)) { 1008 (orig_start + *len >= EXT_MAX_BLOCKS)) {
1010 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 1009 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
1011 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, 1010 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
1012 orig_inode->i_ino, donor_inode->i_ino); 1011 orig_inode->i_ino, donor_inode->i_ino);
1013 return -EINVAL; 1012 return -EINVAL;
1014 } 1013 }
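
Besides the rename, the hunk above tightens > to >= because EXT_MAX_BLOCKS is a block count, so the last addressable logical block is EXT_MAX_BLOCKS - 1. A standalone sketch of the corrected range check (the constant is assumed from this era's ext4_extents.h):

/* Off-by-one in range checks: EXT_MAX_BLOCKS is a *count*, so valid
 * logical blocks are 0 .. EXT_MAX_BLOCKS - 1 and the tests use >=. */
#include <stdint.h>
#include <stdio.h>

#define EXT_MAX_BLOCKS 0xffffffffu

static int range_ok(uint32_t start, uint64_t len)
{
	if (start >= EXT_MAX_BLOCKS)                 /* start addressable? */
		return 0;
	if (len > EXT_MAX_BLOCKS)                    /* no more than exist */
		return 0;
	if ((uint64_t)start + len >= EXT_MAX_BLOCKS) /* end must fit too */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", range_ok(0, 16));                 /* 1: fine */
	printf("%d\n", range_ok(EXT_MAX_BLOCKS - 1, 1)); /* 0: past the end */
	return 0;
}
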
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,6 +40,7 @@
40#include "xattr.h" 40#include "xattr.h"
41#include "acl.h" 41#include "acl.h"
42 42
43#include <trace/events/ext4.h>
43/* 44/*
44 * define how far ahead to read directories while searching them. 45 * define how far ahead to read directories while searching them.
45 */ 46 */
@@ -581,9 +582,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
581 dir->i_sb->s_blocksize - 582 dir->i_sb->s_blocksize -
582 EXT4_DIR_REC_LEN(0)); 583 EXT4_DIR_REC_LEN(0));
583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 584 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
584 if (!ext4_check_dir_entry(dir, de, bh, 585 if (ext4_check_dir_entry(dir, NULL, de, bh,
585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 586 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
586 +((char *)de - bh->b_data))) { 587 + ((char *)de - bh->b_data))) {
587 /* On error, skip the f_pos to the next block. */ 588 /* On error, skip the f_pos to the next block. */
588 dir_file->f_pos = (dir_file->f_pos | 589 dir_file->f_pos = (dir_file->f_pos |
589 (dir->i_sb->s_blocksize - 1)) + 1; 590 (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +821,7 @@ static inline int search_dirblock(struct buffer_head *bh,
820 if ((char *) de + namelen <= dlimit && 821 if ((char *) de + namelen <= dlimit &&
821 ext4_match (namelen, name, de)) { 822 ext4_match (namelen, name, de)) {
822 /* found a match - just to be sure, do a full check */ 823 /* found a match - just to be sure, do a full check */
823 if (!ext4_check_dir_entry(dir, de, bh, offset)) 824 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
824 return -1; 825 return -1;
825 *res_dir = de; 826 *res_dir = de;
826 return 1; 827 return 1;
@@ -856,6 +857,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
856 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 857 struct buffer_head *bh_use[NAMEI_RA_SIZE];
857 struct buffer_head *bh, *ret = NULL; 858 struct buffer_head *bh, *ret = NULL;
858 ext4_lblk_t start, block, b; 859 ext4_lblk_t start, block, b;
860 const u8 *name = d_name->name;
859 int ra_max = 0; /* Number of bh's in the readahead 861 int ra_max = 0; /* Number of bh's in the readahead
860 buffer, bh_use[] */ 862 buffer, bh_use[] */
861 int ra_ptr = 0; /* Current index into readahead 863 int ra_ptr = 0; /* Current index into readahead
@@ -870,6 +872,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
870 namelen = d_name->len; 872 namelen = d_name->len;
871 if (namelen > EXT4_NAME_LEN) 873 if (namelen > EXT4_NAME_LEN)
872 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == '\0')) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
873 if (is_dx(dir)) { 885 if (is_dx(dir)) {
874 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 886 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
875 /* 887 /*
@@ -960,55 +972,35 @@ cleanup_and_exit:
960static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 972static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
961 struct ext4_dir_entry_2 **res_dir, int *err) 973 struct ext4_dir_entry_2 **res_dir, int *err)
962{ 974{
963 struct super_block * sb; 975 struct super_block * sb = dir->i_sb;
964 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
965 u32 hash;
966 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
967 struct ext4_dir_entry_2 *de, *top;
968 struct buffer_head *bh; 978 struct buffer_head *bh;
969 ext4_lblk_t block; 979 ext4_lblk_t block;
970 int retval; 980 int retval;
971 int namelen = d_name->len;
972 const u8 *name = d_name->name;
973 981
974 sb = dir->i_sb; 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
975 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
976 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
977 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
978 return NULL;
979 } else {
980 frame = frames;
981 frame->bh = NULL; /* for dx_release() */
982 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
983 dx_set_block(frame->at, 0); /* dx_root block is 0 */
984 }
985 hash = hinfo.hash;
986 do { 984 do {
987 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
988 if (!(bh = ext4_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
989 goto errout; 987 goto errout;
990 de = (struct ext4_dir_entry_2 *) bh->b_data;
991 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
992 EXT4_DIR_REC_LEN(0));
993 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
995 + ((char *) de - bh->b_data);
996
997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
998 brelse(bh);
999 *err = ERR_BAD_DX_DIR;
1000 goto errout;
1001 }
1002 988
1003 if (ext4_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, d_name,
1004 *res_dir = de; 990 block << EXT4_BLOCK_SIZE_BITS(sb),
1005 dx_release(frames); 991 res_dir);
1006 return bh; 992 if (retval == 1) { /* Success! */
1007 } 993 dx_release(frames);
994 return bh;
1008 } 995 }
1009 brelse(bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1010 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1011 retval = ext4_htree_next_block(dir, hash, frame, 1003 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1012 frames, NULL); 1004 frames, NULL);
1013 if (retval < 0) { 1005 if (retval < 0) {
1014 ext4_warning(sb, 1006 ext4_warning(sb,
@@ -1045,7 +1037,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1045 return ERR_PTR(-EIO); 1037 return ERR_PTR(-EIO);
1046 } 1038 }
1047 inode = ext4_iget(dir->i_sb, ino); 1039 inode = ext4_iget(dir->i_sb, ino);
1048 if (unlikely(IS_ERR(inode))) { 1040 if (IS_ERR(inode)) {
1049 if (PTR_ERR(inode) == -ESTALE) { 1041 if (PTR_ERR(inode) == -ESTALE) {
1050 EXT4_ERROR_INODE(dir, 1042 EXT4_ERROR_INODE(dir,
1051 "deleted inode referenced: %u", 1043 "deleted inode referenced: %u",
@@ -1278,7 +1270,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1278 de = (struct ext4_dir_entry_2 *)bh->b_data; 1270 de = (struct ext4_dir_entry_2 *)bh->b_data;
1279 top = bh->b_data + blocksize - reclen; 1271 top = bh->b_data + blocksize - reclen;
1280 while ((char *) de <= top) { 1272 while ((char *) de <= top) {
1281 if (!ext4_check_dir_entry(dir, de, bh, offset)) 1273 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1282 return -EIO; 1274 return -EIO;
1283 if (ext4_match(namelen, name, de)) 1275 if (ext4_match(namelen, name, de))
1284 return -EEXIST; 1276 return -EEXIST;
@@ -1421,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1421 frame->at = entries; 1413 frame->at = entries;
1422 frame->bh = bh; 1414 frame->bh = bh;
1423 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1424 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1425 dx_release (frames); 1421 if (!de) {
1426 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
1425 * with corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1427 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1428 1432
1429 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1430 brelse(bh); 1434 brelse(bh);
@@ -1611,7 +1615,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1611 if (err) 1615 if (err)
1612 goto journal_error; 1616 goto journal_error;
1613 } 1617 }
1614 ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1618 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1619 if (err) {
1620 ext4_std_error(inode->i_sb, err);
1621 goto cleanup;
1622 }
1615 } 1623 }
1616 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1624 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1617 if (!de) 1625 if (!de)
@@ -1639,17 +1647,21 @@ static int ext4_delete_entry(handle_t *handle,
1639{ 1647{
1640 struct ext4_dir_entry_2 *de, *pde; 1648 struct ext4_dir_entry_2 *de, *pde;
1641 unsigned int blocksize = dir->i_sb->s_blocksize; 1649 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1650 int i, err;
1643 1651
1644 i = 0; 1652 i = 0;
1645 pde = NULL; 1653 pde = NULL;
1646 de = (struct ext4_dir_entry_2 *) bh->b_data; 1654 de = (struct ext4_dir_entry_2 *) bh->b_data;
1647 while (i < bh->b_size) { 1655 while (i < bh->b_size) {
1648 if (!ext4_check_dir_entry(dir, de, bh, i)) 1656 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
1649 return -EIO; 1657 return -EIO;
1650 if (de == de_del) { 1658 if (de == de_del) {
1651 BUFFER_TRACE(bh, "get_write_access"); 1659 BUFFER_TRACE(bh, "get_write_access");
1652 ext4_journal_get_write_access(handle, bh); 1660 err = ext4_journal_get_write_access(handle, bh);
1661 if (unlikely(err)) {
1662 ext4_std_error(dir->i_sb, err);
1663 return err;
1664 }
1653 if (pde) 1665 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1666 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len, 1667 ext4_rec_len_from_disk(pde->rec_len,
@@ -1661,7 +1673,11 @@ static int ext4_delete_entry(handle_t *handle,
1661 de->inode = 0; 1673 de->inode = 0;
1662 dir->i_version++; 1674 dir->i_version++;
1663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1675 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1664 ext4_handle_dirty_metadata(handle, dir, bh); 1676 err = ext4_handle_dirty_metadata(handle, dir, bh);
1677 if (unlikely(err)) {
1678 ext4_std_error(dir->i_sb, err);
1679 return err;
1680 }
1665 return 0; 1681 return 0;
1666 } 1682 }
1667 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 1683 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1798,7 +1814,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1798{ 1814{
1799 handle_t *handle; 1815 handle_t *handle;
1800 struct inode *inode; 1816 struct inode *inode;
1801 struct buffer_head *dir_block; 1817 struct buffer_head *dir_block = NULL;
1802 struct ext4_dir_entry_2 *de; 1818 struct ext4_dir_entry_2 *de;
1803 unsigned int blocksize = dir->i_sb->s_blocksize; 1819 unsigned int blocksize = dir->i_sb->s_blocksize;
1804 int err, retries = 0; 1820 int err, retries = 0;
@@ -1831,7 +1847,9 @@ retry:
1831 if (!dir_block) 1847 if (!dir_block)
1832 goto out_clear_inode; 1848 goto out_clear_inode;
1833 BUFFER_TRACE(dir_block, "get_write_access"); 1849 BUFFER_TRACE(dir_block, "get_write_access");
1834 ext4_journal_get_write_access(handle, dir_block); 1850 err = ext4_journal_get_write_access(handle, dir_block);
1851 if (err)
1852 goto out_clear_inode;
1835 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1853 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1836 de->inode = cpu_to_le32(inode->i_ino); 1854 de->inode = cpu_to_le32(inode->i_ino);
1837 de->name_len = 1; 1855 de->name_len = 1;
@@ -1848,10 +1866,12 @@ retry:
1848 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1866 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1849 inode->i_nlink = 2; 1867 inode->i_nlink = 2;
1850 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1868 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1851 ext4_handle_dirty_metadata(handle, dir, dir_block); 1869 err = ext4_handle_dirty_metadata(handle, dir, dir_block);
1852 brelse(dir_block); 1870 if (err)
1853 ext4_mark_inode_dirty(handle, inode); 1871 goto out_clear_inode;
1854 err = ext4_add_entry(handle, dentry, inode); 1872 err = ext4_mark_inode_dirty(handle, inode);
1873 if (!err)
1874 err = ext4_add_entry(handle, dentry, inode);
1855 if (err) { 1875 if (err) {
1856out_clear_inode: 1876out_clear_inode:
1857 clear_nlink(inode); 1877 clear_nlink(inode);
@@ -1862,10 +1882,13 @@ out_clear_inode:
1862 } 1882 }
1863 ext4_inc_count(handle, dir); 1883 ext4_inc_count(handle, dir);
1864 ext4_update_dx_flag(dir); 1884 ext4_update_dx_flag(dir);
1865 ext4_mark_inode_dirty(handle, dir); 1885 err = ext4_mark_inode_dirty(handle, dir);
1886 if (err)
1887 goto out_clear_inode;
1866 d_instantiate(dentry, inode); 1888 d_instantiate(dentry, inode);
1867 unlock_new_inode(inode); 1889 unlock_new_inode(inode);
1868out_stop: 1890out_stop:
1891 brelse(dir_block);
1869 ext4_journal_stop(handle); 1892 ext4_journal_stop(handle);
1870 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1893 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1871 goto retry; 1894 goto retry;
@@ -1928,7 +1951,7 @@ static int empty_dir(struct inode *inode)
1928 } 1951 }
1929 de = (struct ext4_dir_entry_2 *) bh->b_data; 1952 de = (struct ext4_dir_entry_2 *) bh->b_data;
1930 } 1953 }
1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1954 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1932 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1955 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1933 sb->s_blocksize); 1956 sb->s_blocksize);
1934 offset = (offset | (sb->s_blocksize - 1)) + 1; 1957 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2173,6 +2196,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2173 struct ext4_dir_entry_2 *de; 2196 struct ext4_dir_entry_2 *de;
2174 handle_t *handle; 2197 handle_t *handle;
2175 2198
2199 trace_ext4_unlink_enter(dir, dentry);
2176 /* Initialize quotas before so that eventual writes go 2200 /* Initialize quotas before so that eventual writes go
2177 * in separate transaction */ 2201 * in separate transaction */
2178 dquot_initialize(dir); 2202 dquot_initialize(dir);
@@ -2218,6 +2242,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2218end_unlink: 2242end_unlink:
2219 ext4_journal_stop(handle); 2243 ext4_journal_stop(handle);
2220 brelse(bh); 2244 brelse(bh);
2245 trace_ext4_unlink_exit(dentry, retval);
2221 return retval; 2246 return retval;
2222} 2247}
2223 2248
@@ -2227,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2227 handle_t *handle; 2252 handle_t *handle;
2228 struct inode *inode; 2253 struct inode *inode;
2229 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2230 2256
2231 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2232 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2234,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2234 2260
2235 dquot_initialize(dir); 2261 dquot_initialize(dir);
2236 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
2265 * For non-fast symlinks, we just allocate inode and put it on
2266 * orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
2237retry: 2281retry:
2238 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2239 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2240 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2241 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2242 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2243 2285
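
The two credit branches above can be made concrete with sample numbers; with quotas disabled, for instance, the slow-symlink transaction needs only the four metadata blocks named in the comment. A sketch of the arithmetic (all macro values are illustrative placeholders; the real ones depend on blocksize, quota and extents configuration):

/* Worked example of the symlink journal-credit split. Macro values
 * are placeholders, not the real configuration-dependent ones. */
#include <stdio.h>

#define DATA_TRANS_BLOCKS      27 /* assumed EXT4_DATA_TRANS_BLOCKS(sb) */
#define INDEX_EXTRA_TRANS      12 /* assumed EXT4_INDEX_EXTRA_TRANS_BLOCKS */
#define MAXQUOTAS_INIT_BLOCKS   0 /* assumed: quotas disabled */
#define N_BLOCKS               15 /* EXT4_N_BLOCKS: 12 direct + 3 indirect */

int main(void)
{
	int l = 200; /* symlink target length in bytes */
	int credits;

	if (l > N_BLOCKS * 4) /* > 60 bytes: slow symlink, data written later */
		credits = 4 + MAXQUOTAS_INIT_BLOCKS;
	else                  /* fast symlink: target stored in the inode */
		credits = DATA_TRANS_BLOCKS + INDEX_EXTRA_TRANS + 3 +
			  MAXQUOTAS_INIT_BLOCKS;
	printf("journal credits: %d\n", credits); /* -> 4 for l = 200 */
	return 0;
}
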
@@ -2250,21 +2292,44 @@ retry:
2250 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2251 goto out_stop; 2293 goto out_stop;
2252 2294
2253 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2254 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2255 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2256 /* 2298 /*
2257 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2258 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2259 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
2302 * and thus we deadlock. So we have to stop the transaction now
2303 * and restart it once the symlink contents have been written.
2304 *
2305 * To keep the fs consistent in case of a crash, we have to put the
2306 * inode on the orphan list in the meantime.
2260 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2261 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
2317 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2262 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2263 clear_nlink(inode); 2331 clear_nlink(inode);
2264 unlock_new_inode(inode); 2332 goto err_drop_inode;
2265 ext4_mark_inode_dirty(handle, inode);
2266 iput(inode);
2267 goto out_stop;
2268 } 2333 }
2269 } else { 2334 } else {
2270 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2280,6 +2345,10 @@ out_stop:
2280 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2281 goto retry; 2346 goto retry;
2282 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2283} 2352}
2284 2353
2285static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
@@ -2294,13 +2363,6 @@ static int ext4_link(struct dentry *old_dentry,
2294 2363
2295 dquot_initialize(dir); 2364 dquot_initialize(dir);
2296 2365
2297 /*
2298 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2299 * otherwise has the potential to corrupt the orphan inode list.
2300 */
2301 if (inode->i_nlink == 0)
2302 return -ENOENT;
2303
2304retry: 2366retry:
2305 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2367 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2306 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2368 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
@@ -2312,7 +2374,7 @@ retry:
2312 2374
2313 inode->i_ctime = ext4_current_time(inode); 2375 inode->i_ctime = ext4_current_time(inode);
2314 ext4_inc_count(handle, inode); 2376 ext4_inc_count(handle, inode);
2315 atomic_inc(&inode->i_count); 2377 ihold(inode);
2316 2378
2317 err = ext4_add_entry(handle, dentry, inode); 2379 err = ext4_add_entry(handle, dentry, inode);
2318 if (!err) { 2380 if (!err) {
@@ -2399,6 +2461,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2399 if (!new_inode && new_dir != old_dir && 2461 if (!new_inode && new_dir != old_dir &&
2400 EXT4_DIR_LINK_MAX(new_dir)) 2462 EXT4_DIR_LINK_MAX(new_dir))
2401 goto end_rename; 2463 goto end_rename;
2464 BUFFER_TRACE(dir_bh, "get_write_access");
2465 retval = ext4_journal_get_write_access(handle, dir_bh);
2466 if (retval)
2467 goto end_rename;
2402 } 2468 }
2403 if (!new_bh) { 2469 if (!new_bh) {
2404 retval = ext4_add_entry(handle, new_dentry, old_inode); 2470 retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2406,7 +2472,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2406 goto end_rename; 2472 goto end_rename;
2407 } else { 2473 } else {
2408 BUFFER_TRACE(new_bh, "get write access"); 2474 BUFFER_TRACE(new_bh, "get write access");
2409 ext4_journal_get_write_access(handle, new_bh); 2475 retval = ext4_journal_get_write_access(handle, new_bh);
2476 if (retval)
2477 goto end_rename;
2410 new_de->inode = cpu_to_le32(old_inode->i_ino); 2478 new_de->inode = cpu_to_le32(old_inode->i_ino);
2411 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2479 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2412 EXT4_FEATURE_INCOMPAT_FILETYPE)) 2480 EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2416,7 +2484,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2416 ext4_current_time(new_dir); 2484 ext4_current_time(new_dir);
2417 ext4_mark_inode_dirty(handle, new_dir); 2485 ext4_mark_inode_dirty(handle, new_dir);
2418 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2486 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2419 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2487 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2488 if (unlikely(retval)) {
2489 ext4_std_error(new_dir->i_sb, retval);
2490 goto end_rename;
2491 }
2420 brelse(new_bh); 2492 brelse(new_bh);
2421 new_bh = NULL; 2493 new_bh = NULL;
2422 } 2494 }
@@ -2463,12 +2535,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2463 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 2535 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2464 ext4_update_dx_flag(old_dir); 2536 ext4_update_dx_flag(old_dir);
2465 if (dir_bh) { 2537 if (dir_bh) {
2466 BUFFER_TRACE(dir_bh, "get_write_access");
2467 ext4_journal_get_write_access(handle, dir_bh);
2468 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2538 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2469 cpu_to_le32(new_dir->i_ino); 2539 cpu_to_le32(new_dir->i_ino);
2470 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2540 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2471 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2541 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2542 if (retval) {
2543 ext4_std_error(old_dir->i_sb, retval);
2544 goto end_rename;
2545 }
2472 ext4_dec_count(handle, old_dir); 2546 ext4_dec_count(handle, old_dir);
2473 if (new_inode) { 2547 if (new_inode) {
2474 /* checked empty_dir above, can't have another parent, 2548 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..7bb8f76d470a
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,417 @@
1/*
2 * linux/fs/ext4/page-io.c
3 *
4 * This contains the new page_io functions for ext4
5 *
6 * Written by Theodore Ts'o, 2010.
7 */
8
9#include <linux/module.h>
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/jbd2.h>
13#include <linux/highuid.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/string.h>
17#include <linux/buffer_head.h>
18#include <linux/writeback.h>
19#include <linux/pagevec.h>
20#include <linux/mpage.h>
21#include <linux/namei.h>
22#include <linux/uio.h>
23#include <linux/bio.h>
24#include <linux/workqueue.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27
28#include "ext4_jbd2.h"
29#include "xattr.h"
30#include "acl.h"
31#include "ext4_extents.h"
32
33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34
35int __init ext4_init_pageio(void)
36{
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL)
39 return -ENOMEM;
40 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
41 if (io_end_cachep == NULL) {
42 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM;
44 }
45 return 0;
46}
47
48void ext4_exit_pageio(void)
49{
50 kmem_cache_destroy(io_end_cachep);
51 kmem_cache_destroy(io_page_cachep);
52}
53
54void ext4_ioend_wait(struct inode *inode)
55{
56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
57
58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
59}
60
61static void put_io_page(struct ext4_io_page *io_page)
62{
63 if (atomic_dec_and_test(&io_page->p_count)) {
64 end_page_writeback(io_page->p_page);
65 put_page(io_page->p_page);
66 kmem_cache_free(io_page_cachep, io_page);
67 }
68}
69
70void ext4_free_io_end(ext4_io_end_t *io)
71{
72 int i;
73 wait_queue_head_t *wq;
74
75 BUG_ON(!io);
76 if (io->page)
77 put_page(io->page);
78 for (i = 0; i < io->num_io_pages; i++)
79 put_io_page(io->pages[i]);
80 io->num_io_pages = 0;
81 wq = ext4_ioend_wq(io->inode);
82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
83 waitqueue_active(wq))
84 wake_up_all(wq);
85 kmem_cache_free(io_end_cachep, io);
86}
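
ext4_ioend_wait() and ext4_free_io_end() form a drain pattern: an atomic count of in-flight io_end structures plus a waitqueue kicked when the count reaches zero. A minimal pthread rendering of the same idiom (names are mine, not ext4 API):

/* The i_ioend_count drain idiom in userspace terms: a counter plus a
 * condition variable signalled when it reaches zero. */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  zero = PTHREAD_COND_INITIALIZER;
static int ioend_count;

static void ioend_get(void)
{
	pthread_mutex_lock(&lock);
	ioend_count++;                         /* atomic_inc(&i_ioend_count) */
	pthread_mutex_unlock(&lock);
}

static void ioend_put(void)
{
	pthread_mutex_lock(&lock);
	if (--ioend_count == 0)                /* atomic_dec_and_test(...) */
		pthread_cond_broadcast(&zero); /* wake_up_all(wq) */
	pthread_mutex_unlock(&lock);
}

static void ioend_wait(void)               /* ext4_ioend_wait(inode) */
{
	pthread_mutex_lock(&lock);
	while (ioend_count != 0)
		pthread_cond_wait(&zero, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	ioend_get();
	ioend_put();  /* count back to zero, waiters released */
	ioend_wait(); /* returns immediately */
	return 0;
}
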
87
88/*
89 * check a range of space and convert unwritten extents to written.
90 */
91int ext4_end_io_nolock(ext4_io_end_t *io)
92{
93 struct inode *inode = io->inode;
94 loff_t offset = io->offset;
95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
97 int ret = 0;
98
99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
100 "list->prev 0x%p\n",
101 io, inode->i_ino, io->list.next, io->list.prev);
102
103 if (list_empty(&io->list))
104 return ret;
105
106 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
107 return ret;
108
109 ret = ext4_convert_unwritten_extents(inode, offset, size);
110 if (ret < 0) {
111 printk(KERN_EMERG "%s: failed to convert unwritten "
112 "extents to written extents, error is %d "
113 "io is still on inode %lu aio dio list\n",
114 __func__, ret, inode->i_ino);
115 return ret;
116 }
117
118 if (io->iocb)
119 aio_complete(io->iocb, io->result, 0);
120 /* clear the DIO AIO unwritten flag */
121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
131 return ret;
132}
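
ext4_end_io_nolock() is the tail end of extent-based AIO/DIO: preallocated extents stay flagged unwritten so stale block contents are never exposed, and only after the data I/O completes are they converted to written. The invariant is visible from userspace, assuming Linux fallocate(2) on an ext4 mount:

/* Userspace view of unwritten extents: preallocated blocks read back
 * as zeros until real data is written and the extent is converted,
 * which is what ext4_end_io_nolock() finishes for AIO/DIO writes. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[16];
	int fd = open("/tmp/unwritten-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || fallocate(fd, 0, 0, 4096) != 0)
		return 1; /* needs a supporting fs such as ext4 */

	/* The extent exists but is unwritten: reads see zeros, never
	 * whatever the underlying blocks previously contained. */
	if (pread(fd, buf, sizeof(buf), 0) != sizeof(buf))
		return 1;
	printf("preallocated byte 0 = %d\n", buf[0]); /* prints 0 */

	pwrite(fd, "data", 4, 0); /* conversion happens on writeback */
	close(fd);
	return 0;
}
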
133
134/*
135 * Work on completed AIO DIO requests, converting unwritten extents to written extents
136 */
137static void ext4_end_io_work(struct work_struct *work)
138{
139 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
140 struct inode *inode = io->inode;
141 struct ext4_inode_info *ei = EXT4_I(inode);
142 unsigned long flags;
143 int ret;
144
145 mutex_lock(&inode->i_mutex);
146 ret = ext4_end_io_nolock(io);
147 if (ret < 0) {
148 mutex_unlock(&inode->i_mutex);
149 return;
150 }
151
152 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
153 if (!list_empty(&io->list))
154 list_del_init(&io->list);
155 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
156 mutex_unlock(&inode->i_mutex);
157 ext4_free_io_end(io);
158}
159
160ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
161{
162 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
163 if (io) {
164 atomic_inc(&EXT4_I(inode)->i_ioend_count);
165 io->inode = inode;
166 INIT_WORK(&io->work, ext4_end_io_work);
167 INIT_LIST_HEAD(&io->list);
168 }
169 return io;
170}
171
172/*
173 * Print a buffer I/O error in the same format as fs/buffer.c. This
174 * provides compatibility with dmesg scrapers that look for a specific
175 * buffer I/O error message. We really need a unified error reporting
176 * structure to userspace ala Digital Unix's uerf system, but it's
177 * probably not going to happen in my lifetime, due to LKML politics...
178 */
179static void buffer_io_error(struct buffer_head *bh)
180{
181 char b[BDEVNAME_SIZE];
182 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
183 bdevname(bh->b_bdev, b),
184 (unsigned long long)bh->b_blocknr);
185}
186
187static void ext4_end_bio(struct bio *bio, int error)
188{
189 ext4_io_end_t *io_end = bio->bi_private;
190 struct workqueue_struct *wq;
191 struct inode *inode;
192 unsigned long flags;
193 int i;
194 sector_t bi_sector = bio->bi_sector;
195
196 BUG_ON(!io_end);
197 bio->bi_private = NULL;
198 bio->bi_end_io = NULL;
199 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
200 error = 0;
201 bio_put(bio);
202
203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head;
206 loff_t offset;
207 loff_t io_end_offset;
208
209 if (error) {
210 SetPageError(page);
211 set_bit(AS_EIO, &page->mapping->flags);
212 head = page_buffers(page);
213 BUG_ON(!head);
214
215 io_end_offset = io_end->offset + io_end->size;
216
217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
218 bh = head;
219 do {
220 if ((offset >= io_end->offset) &&
221 (offset+bh->b_size <= io_end_offset))
222 buffer_io_error(bh);
223
224 offset += bh->b_size;
225 bh = bh->b_this_page;
226 } while (bh != head);
227 }
228
229 put_io_page(io_end->pages[i]);
230 }
231 io_end->num_io_pages = 0;
232 inode = io_end->inode;
233
234 if (error) {
235 io_end->flag |= EXT4_IO_END_ERROR;
236 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
237 "(offset %llu size %ld starting block %llu)",
238 inode->i_ino,
239 (unsigned long long) io_end->offset,
240 (long) io_end->size,
241 (unsigned long long)
242 bi_sector >> (inode->i_blkbits - 9));
243 }
244
245 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
246 ext4_free_io_end(io_end);
247 return;
248 }
249
250 /* Add the io_end to the per-inode completed io list */
251 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
252 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
253 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
254
255 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
256 /* queue the work to convert unwritten extents to written */
257 queue_work(wq, &io_end->work);
258}
259
260void ext4_io_submit(struct ext4_io_submit *io)
261{
262 struct bio *bio = io->io_bio;
263
264 if (bio) {
265 bio_get(io->io_bio);
266 submit_bio(io->io_op, io->io_bio);
267 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
268 bio_put(io->io_bio);
269 }
270 io->io_bio = NULL;
271 io->io_op = 0;
272 io->io_end = NULL;
273}
274
275static int io_submit_init(struct ext4_io_submit *io,
276 struct inode *inode,
277 struct writeback_control *wbc,
278 struct buffer_head *bh)
279{
280 ext4_io_end_t *io_end;
281 struct page *page = bh->b_page;
282 int nvecs = bio_get_nr_vecs(bh->b_bdev);
283 struct bio *bio;
284
285 io_end = ext4_init_io_end(inode, GFP_NOFS);
286 if (!io_end)
287 return -ENOMEM;
288 do {
289 bio = bio_alloc(GFP_NOIO, nvecs);
290 nvecs >>= 1;
291 } while (bio == NULL);
292
293 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
294 bio->bi_bdev = bh->b_bdev;
295 bio->bi_private = io->io_end = io_end;
296 bio->bi_end_io = ext4_end_bio;
297
298 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
299
300 io->io_bio = bio;
301 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
302 io->io_next_block = bh->b_blocknr;
303 return 0;
304}
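
io_submit_init() asks bio_alloc() for bio_get_nr_vecs() slots and halves the request each time allocation fails, so a smaller bio can still be built under memory pressure; the kernel loop relies on the GFP_NOIO mempool eventually succeeding. The same degrade-and-retry shape in standalone C (illustrative, not kernel API; unlike the kernel, this version bails out at zero):

/* Degrade-and-retry allocation: ask for the ideal number of slots,
 * halve on failure. */
#include <stdio.h>
#include <stdlib.h>

struct vecbuf {
	int nvecs;
	/* payload would follow */
};

static struct vecbuf *alloc_vecbuf(int nvecs)
{
	struct vecbuf *b;

	do {
		b = malloc(sizeof(*b) + (size_t)nvecs * 64);
		if (b)
			b->nvecs = nvecs;
		nvecs >>= 1; /* ask for half as much next time */
	} while (b == NULL && nvecs > 0);
	return b;
}

int main(void)
{
	struct vecbuf *b = alloc_vecbuf(256);

	if (b)
		printf("got %d vecs\n", b->nvecs);
	free(b);
	return 0;
}
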
305
306static int io_submit_add_bh(struct ext4_io_submit *io,
307 struct ext4_io_page *io_page,
308 struct inode *inode,
309 struct writeback_control *wbc,
310 struct buffer_head *bh)
311{
312 ext4_io_end_t *io_end;
313 int ret;
314
315 if (buffer_new(bh)) {
316 clear_buffer_new(bh);
317 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
318 }
319
320 if (!buffer_mapped(bh) || buffer_delay(bh)) {
321 if (!buffer_mapped(bh))
322 clear_buffer_dirty(bh);
323 if (io->io_bio)
324 ext4_io_submit(io);
325 return 0;
326 }
327
328 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
329submit_and_retry:
330 ext4_io_submit(io);
331 }
332 if (io->io_bio == NULL) {
333 ret = io_submit_init(io, inode, wbc, bh);
334 if (ret)
335 return ret;
336 }
337 io_end = io->io_end;
338 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
339 (io_end->pages[io_end->num_io_pages-1] != io_page))
340 goto submit_and_retry;
341 if (buffer_uninit(bh))
342 io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
343 io->io_end->size += bh->b_size;
344 io->io_next_block++;
345 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
346 if (ret != bh->b_size)
347 goto submit_and_retry;
348 if ((io_end->num_io_pages == 0) ||
349 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
350 io_end->pages[io_end->num_io_pages++] = io_page;
351 atomic_inc(&io_page->p_count);
352 }
353 return 0;
354}
355
356int ext4_bio_write_page(struct ext4_io_submit *io,
357 struct page *page,
358 int len,
359 struct writeback_control *wbc)
360{
361 struct inode *inode = page->mapping->host;
362 unsigned block_start, block_end, blocksize;
363 struct ext4_io_page *io_page;
364 struct buffer_head *bh, *head;
365 int ret = 0;
366
367 blocksize = 1 << inode->i_blkbits;
368
369 BUG_ON(!PageLocked(page));
370 BUG_ON(PageWriteback(page));
371
372 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
373 if (!io_page) {
374 set_page_dirty(page);
375 unlock_page(page);
376 return -ENOMEM;
377 }
378 io_page->p_page = page;
379 atomic_set(&io_page->p_count, 1);
380 get_page(page);
381 set_page_writeback(page);
382 ClearPageError(page);
383
384 for (bh = head = page_buffers(page), block_start = 0;
385 bh != head || !block_start;
386 block_start = block_end, bh = bh->b_this_page) {
387
388 block_end = block_start + blocksize;
389 if (block_start >= len) {
390 clear_buffer_dirty(bh);
391 set_buffer_uptodate(bh);
392 continue;
393 }
394 clear_buffer_dirty(bh);
395 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
396 if (ret) {
397 /*
398 * We only get here on ENOMEM. Not much else
399 * we can do but mark the page as dirty, and
400 * better luck next time.
401 */
402 set_page_dirty(page);
403 break;
404 }
405 }
406 unlock_page(page);
407 /*
408 * If the page was truncated before we could do the writeback,
409 * or we had a memory allocation error while trying to write
410 * the first buffer head, we won't have submitted any pages for
411 * I/O. In that case we need to make sure we've cleared the
412 * PageWriteback bit from the page to prevent the system from
413 * wedging later on.
414 */
415 put_io_page(io_page);
416 return ret;
417}
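The io_page reference count is what makes the closing comment work: the submitter takes the initial reference, each bio covering part of the page takes another, and whoever drops the last one ends writeback. So a page with nothing submitted (truncated, or ENOMEM on the first buffer) gets its writeback bit cleared by the final put_io_page() right here. A compact standalone sketch of that lifecycle (the harness is mine; only the counting mirrors the code above):

/* Sketch of the ext4_io_page lifecycle: submitter holds one reference,
 * each in-flight bio holds one more, the last put ends writeback. */
#include <stdio.h>

struct io_page {
	int p_count;
	int writeback; /* stands in for PageWriteback */
};

static void put_io_page(struct io_page *p)
{
	if (--p->p_count == 0) {
		p->writeback = 0; /* end_page_writeback() */
		printf("writeback ended\n");
	}
}

int main(void)
{
	struct io_page page = { .p_count = 1, .writeback = 1 };
	int bios_submitted = 0; /* e.g. page truncated: no buffers queued */
	int i;

	for (i = 0; i < bios_submitted; i++)
		page.p_count++;     /* io_submit_add_bh() takes a reference */

	put_io_page(&page);         /* submitter's put: last ref here */
	for (i = 0; i < bios_submitted; i++)
		put_io_page(&page); /* bio completion path */
	return 0;
}
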
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..80bbc9c60c24 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,29 +220,25 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
227 231
228 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
229 for (i = 0, bit = gdblocks + 1, block = start + bit; 233 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
230 i < reserved_gdb; i++, block++, bit++) { 234 block, sbi->s_itb_per_group);
231 struct buffer_head *gdb; 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
232 236 GFP_NOFS);
233 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); 237 if (err)
234 238 goto exit_bh;
235 if ((err = extend_or_restart_transaction(handle, 1, bh))) 239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
236 goto exit_bh;
237
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239 err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_handle_dirty_metadata(handle, NULL, gdb);
243 ext4_set_bit(bit, bh->b_data); 240 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb); 241
245 }
246 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
247 input->block_bitmap - start); 243 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data); 244 ext4_set_bit(input->block_bitmap - start, bh->b_data);
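
The rewritten loops above replace per-block bclean() plus journalled metadata writes with one sb_issue_zeroout() call per region, pushing a single zeroing request down to the block layer instead of journalling pages of zeroes; only the bitmap bits are still set one at a time. Schematically (the names below are stand-ins, not kernel API):

/* Contrast of the two zeroing strategies. zero_range() stands in for
 * sb_issue_zeroout(); write_block() for the old per-block path. */
#include <string.h>

#define BLKSZ 4096

static unsigned char disk[1024 * BLKSZ]; /* toy "device" */

static void write_block(unsigned long blk) /* old: one write per block */
{
	memset(disk + blk * BLKSZ, 0, BLKSZ);
}

static void zero_range(unsigned long blk, unsigned long n) /* new path */
{
	memset(disk + blk * BLKSZ, 0, n * BLKSZ);
}

int main(void)
{
	unsigned long i;

	for (i = 0; i < 128; i++) /* old: 128 journalled block writes */
		write_block(i);
	zero_range(0, 128);       /* new: a single range operation */
	return 0;
}
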
@@ -251,29 +247,26 @@ static int setup_new_group_blocks(struct super_block *sb,
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 247 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252 248
253 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start; 250 block = input->inode_table;
255 i < sbi->s_itb_per_group; i++, bit++, block++) { 251 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
256 struct buffer_head *it; 252 block, sbi->s_itb_per_group);
257 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
258 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); 254 if (err)
259 255 goto exit_bh;
260 if ((err = extend_or_restart_transaction(handle, 1, bh))) 256 for (i = 0, bit = input->inode_table - start;
261 goto exit_bh; 257 i < sbi->s_itb_per_group; i++, bit++)
262
263 if (IS_ERR(it = bclean(handle, sb, block))) {
264 err = PTR_ERR(it);
265 goto exit_bh;
266 }
267 ext4_handle_dirty_metadata(handle, NULL, it);
268 brelse(it);
269 ext4_set_bit(bit, bh->b_data); 258 ext4_set_bit(bit, bh->b_data);
270 }
271 259
272 if ((err = extend_or_restart_transaction(handle, 2, bh))) 260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
273 goto exit_bh; 261 goto exit_bh;
274 262
275 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
276 ext4_handle_dirty_metadata(handle, NULL, bh); 264 bh->b_data);
265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
277 brelse(bh); 270 brelse(bh);
278 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
279 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -283,9 +276,11 @@ static int setup_new_group_blocks(struct super_block *sb,
283 goto exit_journal; 276 goto exit_journal;
284 } 277 }
285 278
286 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
287 bh->b_data); 280 bh->b_data);
288 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
289exit_bh: 284exit_bh:
290 brelse(bh); 285 brelse(bh);
291 286
@@ -437,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
437 goto exit_dind; 432 goto exit_dind;
438 } 433 }
439 434
440 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
441 goto exit_dind; 437 goto exit_dind;
442 438
443 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
444 goto exit_sbh; 441 goto exit_sbh;
445 442
446 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
447 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
448 446
449 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
450 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
451 goto exit_dindj; 450 goto exit_dindj;
452 451
453 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -469,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
469 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
470 */ 469 */
471 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
472 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
473 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
474 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
475 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
476 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
477 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
478 485
479 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
480 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -485,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
485 kfree(o_group_desc); 492 kfree(o_group_desc);
486 493
487 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
488 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
489 498
490 return 0; 499 return err;
491 500
492exit_inode: 501exit_inode:
493 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_handle_release_buffer(handle, iloc.bh); */
494 brelse(iloc.bh); 503 brelse(iloc.bh);
495exit_dindj: 504exit_dindj:
496 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_handle_release_buffer(handle, dind); */
497exit_primary:
498 /* ext4_journal_release_buffer(handle, *primary); */
499exit_sbh: 506exit_sbh:
500 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
501exit_dind: 508exit_dind:
502 brelse(dind); 509 brelse(dind);
503exit_bh: 510exit_bh:
@@ -579,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
579 /* 586 /*
580 int j; 587 int j;
581 for (j = 0; j < i; j++) 588 for (j = 0; j < i; j++)
582 ext4_journal_release_buffer(handle, primary[j]); 589 ext4_handle_release_buffer(handle, primary[j]);
583 */ 590 */
584 goto exit_bh; 591 goto exit_bh;
585 } 592 }
@@ -680,7 +687,9 @@ static void update_backups(struct super_block *sb,
680 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
681 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
682 unlock_buffer(bh); 689 unlock_buffer(bh);
683 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
684 brelse(bh); 693 brelse(bh);
685 } 694 }
686 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -898,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
898 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
899 sbi->s_groups_count++; 908 sbi->s_groups_count++;
900 909
901 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
902 915
903 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
904 * active. */ 917 * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..9ea71aa864b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -39,8 +38,12 @@
39#include <linux/ctype.h> 38#include <linux/ctype.h>
40#include <linux/log2.h> 39#include <linux/log2.h>
41#include <linux/crc16.h> 40#include <linux/crc16.h>
41#include <linux/cleancache.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43 43
44#include <linux/kthread.h>
45#include <linux/freezer.h>
46
44#include "ext4.h" 47#include "ext4.h"
45#include "ext4_jbd2.h" 48#include "ext4_jbd2.h"
46#include "xattr.h" 49#include "xattr.h"
@@ -50,8 +53,11 @@
50#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 54#include <trace/events/ext4.h>
52 55
53struct proc_dir_entry *ext4_proc_root; 56static struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 57static struct kset *ext4_kset;
58static struct ext4_lazy_init *ext4_li_info;
59static struct mutex ext4_li_mtx;
60static struct ext4_features *ext4_feat;
55 61
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 62static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 63 unsigned long journal_devnum);
@@ -68,14 +74,34 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 74static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 75static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 76static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 78 const char *dev_name, void *data);
79static inline int ext2_feature_set_ok(struct super_block *sb);
80static inline int ext3_feature_set_ok(struct super_block *sb);
81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
82static void ext4_destroy_lazyinit_thread(void);
83static void ext4_unregister_li_request(struct super_block *sb);
84static void ext4_clear_request_list(void);
85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88 .owner = THIS_MODULE,
89 .name = "ext2",
90 .mount = ext4_mount,
91 .kill_sb = kill_block_super,
92 .fs_flags = FS_REQUIRES_DEV,
93};
94#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
95#else
96#define IS_EXT2_SB(sb) (0)
97#endif
98
73 99
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 100#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 101static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE, 102 .owner = THIS_MODULE,
77 .name = "ext3", 103 .name = "ext3",
78 .get_sb = ext4_get_sb, 104 .mount = ext4_mount,
79 .kill_sb = kill_block_super, 105 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV, 106 .fs_flags = FS_REQUIRES_DEV,
81}; 107};
@@ -233,27 +259,44 @@ static void ext4_put_nojournal(handle_t *handle)
233 * journal_end calls result in the superblock being marked dirty, so 259 * journal_end calls result in the superblock being marked dirty, so
234 * that sync() will call the filesystem's write_super callback if 260 * that sync() will call the filesystem's write_super callback if
235 * appropriate. 261 * appropriate.
262 *
263 * To avoid j_barrier being held while userspace has the filesystem
264 * frozen via freeze(), ext4 prevents new handles from being started
265 * through s_frozen, which is maintained in an upper (VFS) layer.
236 */ 266 */
237handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 267handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
238{ 268{
239 journal_t *journal; 269 journal_t *journal;
270 handle_t *handle;
240 271
241 if (sb->s_flags & MS_RDONLY) 272 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 273 return ERR_PTR(-EROFS);
243 274
244 vfs_check_frozen(sb, SB_FREEZE_TRANS);
245 /* Special case here: if the journal has aborted behind our
246 * backs (eg. EIO in the commit thread), then we still need to
247 * take the FS itself readonly cleanly. */
248 journal = EXT4_SB(sb)->s_journal; 275 journal = EXT4_SB(sb)->s_journal;
249 if (journal) { 276 handle = ext4_journal_current_handle();
250 if (is_journal_aborted(journal)) { 277
251 ext4_abort(sb, "Detected aborted journal"); 278 /*
252 return ERR_PTR(-EROFS); 279 * If a handle has been started, it should be allowed to
253 } 280 * finish, otherwise deadlock could happen between freeze
254 return jbd2_journal_start(journal, nblocks); 281 * and others (e.g. truncate) due to the restart of the
282 * journal handle if the filesystem is frozen and active
283 * handles are not stopped.
284 */
285 if (!handle)
286 vfs_check_frozen(sb, SB_FREEZE_TRANS);
287
288 if (!journal)
289 return ext4_get_nojournal();
290 /*
291 * Special case here: if the journal has aborted behind our
292 * backs (eg. EIO in the commit thread), then we still need to
293 * take the FS itself readonly cleanly.
294 */
295 if (is_journal_aborted(journal)) {
296 ext4_abort(sb, "Detected aborted journal");
297 return ERR_PTR(-EROFS);
255 } 298 }
256 return ext4_get_nojournal(); 299 return jbd2_journal_start(journal, nblocks);
257} 300}
258 301
259/* 302/*
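
The reordering in ext4_journal_start_sb() fixes a freeze deadlock: the vfs_check_frozen() wait is now skipped when the task already owns a handle, because blocking a nested start while freeze() waits for the journal to drain could deadlock against operations such as truncate. A sketch of the caller pattern being protected, with the credit count and error handling purely illustrative:

	handle_t *handle = ext4_journal_start_sb(sb, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* metadata updates; a nested ext4_journal_start_sb() issued
	 * from here must not block on s_frozen */
	err = ext4_journal_stop(handle);
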
@@ -381,13 +424,14 @@ static void ext4_handle_error(struct super_block *sb)
381void __ext4_error(struct super_block *sb, const char *function, 424void __ext4_error(struct super_block *sb, const char *function,
382 unsigned int line, const char *fmt, ...) 425 unsigned int line, const char *fmt, ...)
383{ 426{
427 struct va_format vaf;
384 va_list args; 428 va_list args;
385 429
386 va_start(args, fmt); 430 va_start(args, fmt);
387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 431 vaf.fmt = fmt;
388 sb->s_id, function, line, current->comm); 432 vaf.va = &args;
389 vprintk(fmt, args); 433 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
390 printk("\n"); 434 sb->s_id, function, line, current->comm, &vaf);
391 va_end(args); 435 va_end(args);
392 436
393 ext4_handle_error(sb); 437 ext4_handle_error(sb);
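
The struct va_format/%pV conversion collapses what used to be two or three printk() calls into a single one, so messages from concurrent CPUs can no longer interleave mid-line. A minimal sketch of the same pattern, with my_warn() and its prefix as placeholder names:

	static void my_warn(const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* %pV expands the embedded format string and va_list */
		printk(KERN_WARNING "mydev: %pV\n", &vaf);
		va_end(args);
	}
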
@@ -398,28 +442,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
398 const char *fmt, ...) 442 const char *fmt, ...)
399{ 443{
400 va_list args; 444 va_list args;
445 struct va_format vaf;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 446 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
402 447
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 448 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block); 449 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line); 450 save_error_info(inode->i_sb, function, line);
406 va_start(args, fmt); 451 va_start(args, fmt);
452 vaf.fmt = fmt;
453 vaf.va = &args;
407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 454 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
408 inode->i_sb->s_id, function, line, inode->i_ino); 455 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block) 456 if (block)
410 printk("block %llu: ", block); 457 printk(KERN_CONT "block %llu: ", block);
411 printk("comm %s: ", current->comm); 458 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
412 vprintk(fmt, args);
413 printk("\n");
414 va_end(args); 459 va_end(args);
415 460
416 ext4_handle_error(inode->i_sb); 461 ext4_handle_error(inode->i_sb);
417} 462}
418 463
419void ext4_error_file(struct file *file, const char *function, 464void ext4_error_file(struct file *file, const char *function,
420 unsigned int line, const char *fmt, ...) 465 unsigned int line, ext4_fsblk_t block,
466 const char *fmt, ...)
421{ 467{
422 va_list args; 468 va_list args;
469 struct va_format vaf;
423 struct ext4_super_block *es; 470 struct ext4_super_block *es;
424 struct inode *inode = file->f_dentry->d_inode; 471 struct inode *inode = file->f_dentry->d_inode;
425 char pathname[80], *path; 472 char pathname[80], *path;
@@ -427,17 +474,18 @@ void ext4_error_file(struct file *file, const char *function,
427 es = EXT4_SB(inode->i_sb)->s_es; 474 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 475 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line); 476 save_error_info(inode->i_sb, function, line);
430 va_start(args, fmt);
431 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 477 path = d_path(&(file->f_path), pathname, sizeof(pathname));
432 if (!path) 478 if (IS_ERR(path))
433 path = "(unknown)"; 479 path = "(unknown)";
434 printk(KERN_CRIT 480 printk(KERN_CRIT
435 "EXT4-fs error (device %s): %s:%d: inode #%lu " 481 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
436 "(comm %s path %s): ", 482 inode->i_sb->s_id, function, line, inode->i_ino);
437 inode->i_sb->s_id, function, line, inode->i_ino, 483 if (block)
438 current->comm, path); 484 printk(KERN_CONT "block %llu: ", block);
439 vprintk(fmt, args); 485 va_start(args, fmt);
440 printk("\n"); 486 vaf.fmt = fmt;
487 vaf.va = &args;
488 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
441 va_end(args); 489 va_end(args);
442 490
443 ext4_handle_error(inode->i_sb); 491 ext4_handle_error(inode->i_sb);
@@ -536,28 +584,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
536 panic("EXT4-fs panic from previous error\n"); 584 panic("EXT4-fs panic from previous error\n");
537} 585}
538 586
539void ext4_msg (struct super_block * sb, const char *prefix, 587void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
540 const char *fmt, ...)
541{ 588{
589 struct va_format vaf;
542 va_list args; 590 va_list args;
543 591
544 va_start(args, fmt); 592 va_start(args, fmt);
545 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 593 vaf.fmt = fmt;
546 vprintk(fmt, args); 594 vaf.va = &args;
547 printk("\n"); 595 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
548 va_end(args); 596 va_end(args);
549} 597}
550 598
551void __ext4_warning(struct super_block *sb, const char *function, 599void __ext4_warning(struct super_block *sb, const char *function,
552 unsigned int line, const char *fmt, ...) 600 unsigned int line, const char *fmt, ...)
553{ 601{
602 struct va_format vaf;
554 va_list args; 603 va_list args;
555 604
556 va_start(args, fmt); 605 va_start(args, fmt);
557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 606 vaf.fmt = fmt;
558 sb->s_id, function, line); 607 vaf.va = &args;
559 vprintk(fmt, args); 608 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
560 printk("\n"); 609 sb->s_id, function, line, &vaf);
561 va_end(args); 610 va_end(args);
562} 611}
563 612
@@ -568,21 +617,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
568__releases(bitlock) 617__releases(bitlock)
569__acquires(bitlock) 618__acquires(bitlock)
570{ 619{
620 struct va_format vaf;
571 va_list args; 621 va_list args;
572 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 622 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
573 623
574 es->s_last_error_ino = cpu_to_le32(ino); 624 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block); 625 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line); 626 __save_error_info(sb, function, line);
627
577 va_start(args, fmt); 628 va_start(args, fmt);
578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 629
630 vaf.fmt = fmt;
631 vaf.va = &args;
632 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
579 sb->s_id, function, line, grp); 633 sb->s_id, function, line, grp);
580 if (ino) 634 if (ino)
581 printk("inode %lu: ", ino); 635 printk(KERN_CONT "inode %lu: ", ino);
582 if (block) 636 if (block)
583 printk("block %llu:", (unsigned long long) block); 637 printk(KERN_CONT "block %llu:", (unsigned long long) block);
584 vprintk(fmt, args); 638 printk(KERN_CONT "%pV\n", &vaf);
585 printk("\n");
586 va_end(args); 639 va_end(args);
587 640
588 if (test_opt(sb, ERRORS_CONT)) { 641 if (test_opt(sb, ERRORS_CONT)) {
@@ -598,7 +651,7 @@ __acquires(bitlock)
598 * filesystem will have already been marked read/only and the 651 * filesystem will have already been marked read/only and the
599 * journal has been aborted. We return 1 as a hint to callers 652 * journal has been aborted. We return 1 as a hint to callers
600 * who might want to use the return value from 653 * who might want to use the return value from
601 * ext4_grp_locked_error() to distinguish beween the 654 * ext4_grp_locked_error() to distinguish between the
602 * ERRORS_CONT and ERRORS_RO case, and perhaps return more 655 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
603 * aggressively from the ext4 function in question, with a 656 * aggressively from the ext4 function in question, with a
604 * more appropriate error code. 657 * more appropriate error code.
@@ -640,7 +693,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
640 struct block_device *bdev; 693 struct block_device *bdev;
641 char b[BDEVNAME_SIZE]; 694 char b[BDEVNAME_SIZE];
642 695
643 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 696 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
644 if (IS_ERR(bdev)) 697 if (IS_ERR(bdev))
645 goto fail; 698 goto fail;
646 return bdev; 699 return bdev;
@@ -656,8 +709,7 @@ fail:
656 */ 709 */
657static int ext4_blkdev_put(struct block_device *bdev) 710static int ext4_blkdev_put(struct block_device *bdev)
658{ 711{
659 bd_release(bdev); 712 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
660 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
661} 713}
662 714
663static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 715static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -702,13 +754,13 @@ static void ext4_put_super(struct super_block *sb)
702 struct ext4_super_block *es = sbi->s_es; 754 struct ext4_super_block *es = sbi->s_es;
703 int i, err; 755 int i, err;
704 756
757 ext4_unregister_li_request(sb);
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 758 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706 759
707 flush_workqueue(sbi->dio_unwritten_wq); 760 flush_workqueue(sbi->dio_unwritten_wq);
708 destroy_workqueue(sbi->dio_unwritten_wq); 761 destroy_workqueue(sbi->dio_unwritten_wq);
709 762
710 lock_super(sb); 763 lock_super(sb);
711 lock_kernel();
712 if (sb->s_dirt) 764 if (sb->s_dirt)
713 ext4_commit_super(sb, 1); 765 ext4_commit_super(sb, 1);
714 766
@@ -719,6 +771,7 @@ static void ext4_put_super(struct super_block *sb)
719 ext4_abort(sb, "Couldn't clean up the journal"); 771 ext4_abort(sb, "Couldn't clean up the journal");
720 } 772 }
721 773
774 del_timer(&sbi->s_err_report);
722 ext4_release_system_zone(sb); 775 ext4_release_system_zone(sb);
723 ext4_mb_release(sb); 776 ext4_mb_release(sb);
724 ext4_ext_release(sb); 777 ext4_ext_release(sb);
@@ -770,12 +823,13 @@ static void ext4_put_super(struct super_block *sb)
770 invalidate_bdev(sbi->journal_bdev); 823 invalidate_bdev(sbi->journal_bdev);
771 ext4_blkdev_remove(sbi); 824 ext4_blkdev_remove(sbi);
772 } 825 }
826 if (sbi->s_mmp_tsk)
827 kthread_stop(sbi->s_mmp_tsk);
773 sb->s_fs_info = NULL; 828 sb->s_fs_info = NULL;
774 /* 829 /*
775 * Now that we are completely done shutting down the 830 * Now that we are completely done shutting down the
776 * superblock, we need to actually destroy the kobject. 831 * superblock, we need to actually destroy the kobject.
777 */ 832 */
778 unlock_kernel();
779 unlock_super(sb); 833 unlock_super(sb);
780 kobject_put(&sbi->s_kobj); 834 kobject_put(&sbi->s_kobj);
781 wait_for_completion(&sbi->s_kobj_unregister); 835 wait_for_completion(&sbi->s_kobj_unregister);
@@ -801,32 +855,44 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
801 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 855 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
802 INIT_LIST_HEAD(&ei->i_prealloc_list); 856 INIT_LIST_HEAD(&ei->i_prealloc_list);
803 spin_lock_init(&ei->i_prealloc_lock); 857 spin_lock_init(&ei->i_prealloc_lock);
804 /*
805 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
806 * therefore it can be null here. Don't check it, just initialize
807 * jinode.
808 */
809 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
810 ei->i_reserved_data_blocks = 0; 858 ei->i_reserved_data_blocks = 0;
811 ei->i_reserved_meta_blocks = 0; 859 ei->i_reserved_meta_blocks = 0;
812 ei->i_allocated_meta_blocks = 0; 860 ei->i_allocated_meta_blocks = 0;
813 ei->i_da_metadata_calc_len = 0; 861 ei->i_da_metadata_calc_len = 0;
814 ei->i_delalloc_reserved_flag = 0;
815 spin_lock_init(&(ei->i_block_reservation_lock)); 862 spin_lock_init(&(ei->i_block_reservation_lock));
816#ifdef CONFIG_QUOTA 863#ifdef CONFIG_QUOTA
817 ei->i_reserved_quota = 0; 864 ei->i_reserved_quota = 0;
818#endif 865#endif
866 ei->jinode = NULL;
819 INIT_LIST_HEAD(&ei->i_completed_io_list); 867 INIT_LIST_HEAD(&ei->i_completed_io_list);
820 spin_lock_init(&ei->i_completed_io_lock); 868 spin_lock_init(&ei->i_completed_io_lock);
821 ei->cur_aio_dio = NULL; 869 ei->cur_aio_dio = NULL;
822 ei->i_sync_tid = 0; 870 ei->i_sync_tid = 0;
823 ei->i_datasync_tid = 0; 871 ei->i_datasync_tid = 0;
872 atomic_set(&ei->i_ioend_count, 0);
873 atomic_set(&ei->i_aiodio_unwritten, 0);
824 874
825 return &ei->vfs_inode; 875 return &ei->vfs_inode;
826} 876}
827 877
878static int ext4_drop_inode(struct inode *inode)
879{
880 int drop = generic_drop_inode(inode);
881
882 trace_ext4_drop_inode(inode, drop);
883 return drop;
884}
885
886static void ext4_i_callback(struct rcu_head *head)
887{
888 struct inode *inode = container_of(head, struct inode, i_rcu);
889 INIT_LIST_HEAD(&inode->i_dentry);
890 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
891}
892
828static void ext4_destroy_inode(struct inode *inode) 893static void ext4_destroy_inode(struct inode *inode)
829{ 894{
895 ext4_ioend_wait(inode);
830 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 896 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
831 ext4_msg(inode->i_sb, KERN_ERR, 897 ext4_msg(inode->i_sb, KERN_ERR,
832 "Inode %lu (%p): orphan list check failed!", 898 "Inode %lu (%p): orphan list check failed!",
@@ -836,7 +902,7 @@ static void ext4_destroy_inode(struct inode *inode)
836 true); 902 true);
837 dump_stack(); 903 dump_stack();
838 } 904 }
839 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 905 call_rcu(&inode->i_rcu, ext4_i_callback);
840} 906}
841 907
842static void init_once(void *foo) 908static void init_once(void *foo)
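
Freeing the inode via call_rcu() instead of an immediate kmem_cache_free() defers the release past an RCU grace period, which the VFS's lock-free RCU-walk path lookup depends on. The same pattern for a hypothetical filesystem (the myfs names are placeholders):

	static void myfs_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);
		kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
	}

	static void myfs_destroy_inode(struct inode *inode)
	{
		/* the object is reused only after current RCU readers finish */
		call_rcu(&inode->i_rcu, myfs_i_callback);
	}
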
@@ -874,9 +940,12 @@ void ext4_clear_inode(struct inode *inode)
874 end_writeback(inode); 940 end_writeback(inode);
875 dquot_drop(inode); 941 dquot_drop(inode);
876 ext4_discard_preallocations(inode); 942 ext4_discard_preallocations(inode);
877 if (EXT4_JOURNAL(inode)) 943 if (EXT4_I(inode)->jinode) {
878 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 944 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
879 &EXT4_I(inode)->jinode); 945 EXT4_I(inode)->jinode);
946 jbd2_free_inode(EXT4_I(inode)->jinode);
947 EXT4_I(inode)->jinode = NULL;
948 }
880} 949}
881 950
882static inline void ext4_show_quota_options(struct seq_file *seq, 951static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -965,13 +1034,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
965 if (test_opt(sb, OLDALLOC)) 1034 if (test_opt(sb, OLDALLOC))
966 seq_puts(seq, ",oldalloc"); 1035 seq_puts(seq, ",oldalloc");
967#ifdef CONFIG_EXT4_FS_XATTR 1036#ifdef CONFIG_EXT4_FS_XATTR
968 if (test_opt(sb, XATTR_USER) && 1037 if (test_opt(sb, XATTR_USER))
969 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
970 seq_puts(seq, ",user_xattr"); 1038 seq_puts(seq, ",user_xattr");
971 if (!test_opt(sb, XATTR_USER) && 1039 if (!test_opt(sb, XATTR_USER))
972 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
973 seq_puts(seq, ",nouser_xattr"); 1040 seq_puts(seq, ",nouser_xattr");
974 }
975#endif 1041#endif
976#ifdef CONFIG_EXT4_FS_POSIX_ACL 1042#ifdef CONFIG_EXT4_FS_POSIX_ACL
977 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 1043 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1009,6 +1075,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1009 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1075 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1010 seq_puts(seq, ",nodelalloc"); 1076 seq_puts(seq, ",nodelalloc");
1011 1077
1078 if (!test_opt(sb, MBLK_IO_SUBMIT))
1079 seq_puts(seq, ",nomblk_io_submit");
1012 if (sbi->s_stripe) 1080 if (sbi->s_stripe)
1013 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1081 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1014 /* 1082 /*
@@ -1045,6 +1113,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) 1113 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity"); 1114 seq_puts(seq, ",block_validity");
1047 1115
1116 if (!test_opt(sb, INIT_INODE_TABLE))
1117 seq_puts(seq, ",noinit_inode_table");
1118 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1119 seq_printf(seq, ",init_inode_table=%u",
1120 (unsigned) sbi->s_li_wait_mult);
1121
1048 ext4_show_quota_options(seq, sb); 1122 ext4_show_quota_options(seq, sb);
1049 1123
1050 return 0; 1124 return 0;
@@ -1123,7 +1197,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1123static int ext4_mark_dquot_dirty(struct dquot *dquot); 1197static int ext4_mark_dquot_dirty(struct dquot *dquot);
1124static int ext4_write_info(struct super_block *sb, int type); 1198static int ext4_write_info(struct super_block *sb, int type);
1125static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1199static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1126 char *path); 1200 struct path *path);
1127static int ext4_quota_off(struct super_block *sb, int type); 1201static int ext4_quota_off(struct super_block *sb, int type);
1128static int ext4_quota_on_mount(struct super_block *sb, int type); 1202static int ext4_quota_on_mount(struct super_block *sb, int type);
1129static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1203static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1132,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1132 const char *data, size_t len, loff_t off); 1206 const char *data, size_t len, loff_t off);
1133 1207
1134static const struct dquot_operations ext4_quota_operations = { 1208static const struct dquot_operations ext4_quota_operations = {
1135#ifdef CONFIG_QUOTA
1136 .get_reserved_space = ext4_get_reserved_space, 1209 .get_reserved_space = ext4_get_reserved_space,
1137#endif
1138 .write_dquot = ext4_write_dquot, 1210 .write_dquot = ext4_write_dquot,
1139 .acquire_dquot = ext4_acquire_dquot, 1211 .acquire_dquot = ext4_acquire_dquot,
1140 .release_dquot = ext4_release_dquot, 1212 .release_dquot = ext4_release_dquot,
@@ -1160,6 +1232,7 @@ static const struct super_operations ext4_sops = {
1160 .destroy_inode = ext4_destroy_inode, 1232 .destroy_inode = ext4_destroy_inode,
1161 .write_inode = ext4_write_inode, 1233 .write_inode = ext4_write_inode,
1162 .dirty_inode = ext4_dirty_inode, 1234 .dirty_inode = ext4_dirty_inode,
1235 .drop_inode = ext4_drop_inode,
1163 .evict_inode = ext4_evict_inode, 1236 .evict_inode = ext4_evict_inode,
1164 .put_super = ext4_put_super, 1237 .put_super = ext4_put_super,
1165 .sync_fs = ext4_sync_fs, 1238 .sync_fs = ext4_sync_fs,
@@ -1180,6 +1253,7 @@ static const struct super_operations ext4_nojournal_sops = {
1180 .destroy_inode = ext4_destroy_inode, 1253 .destroy_inode = ext4_destroy_inode,
1181 .write_inode = ext4_write_inode, 1254 .write_inode = ext4_write_inode,
1182 .dirty_inode = ext4_dirty_inode, 1255 .dirty_inode = ext4_dirty_inode,
1256 .drop_inode = ext4_drop_inode,
1183 .evict_inode = ext4_evict_inode, 1257 .evict_inode = ext4_evict_inode,
1184 .write_super = ext4_write_super, 1258 .write_super = ext4_write_super,
1185 .put_super = ext4_put_super, 1259 .put_super = ext4_put_super,
@@ -1214,11 +1288,12 @@ enum {
1214 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1288 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1215 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1289 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1216 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1290 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1217 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1291 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1218 Opt_block_validity, Opt_noblock_validity, 1292 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1219 Opt_inode_readahead_blks, Opt_journal_ioprio, 1293 Opt_inode_readahead_blks, Opt_journal_ioprio,
1220 Opt_dioread_nolock, Opt_dioread_lock, 1294 Opt_dioread_nolock, Opt_dioread_lock,
1221 Opt_discard, Opt_nodiscard, 1295 Opt_discard, Opt_nodiscard,
1296 Opt_init_inode_table, Opt_noinit_inode_table,
1222}; 1297};
1223 1298
1224static const match_table_t tokens = { 1299static const match_table_t tokens = {
@@ -1278,6 +1353,8 @@ static const match_table_t tokens = {
1278 {Opt_resize, "resize"}, 1353 {Opt_resize, "resize"},
1279 {Opt_delalloc, "delalloc"}, 1354 {Opt_delalloc, "delalloc"},
1280 {Opt_nodelalloc, "nodelalloc"}, 1355 {Opt_nodelalloc, "nodelalloc"},
1356 {Opt_mblk_io_submit, "mblk_io_submit"},
1357 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1281 {Opt_block_validity, "block_validity"}, 1358 {Opt_block_validity, "block_validity"},
1282 {Opt_noblock_validity, "noblock_validity"}, 1359 {Opt_noblock_validity, "noblock_validity"},
1283 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1360 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1289,6 +1366,9 @@ static const match_table_t tokens = {
1289 {Opt_dioread_lock, "dioread_lock"}, 1366 {Opt_dioread_lock, "dioread_lock"},
1290 {Opt_discard, "discard"}, 1367 {Opt_discard, "discard"},
1291 {Opt_nodiscard, "nodiscard"}, 1368 {Opt_nodiscard, "nodiscard"},
1369 {Opt_init_inode_table, "init_itable=%u"},
1370 {Opt_init_inode_table, "init_itable"},
1371 {Opt_noinit_inode_table, "noinit_itable"},
1292 {Opt_err, NULL}, 1372 {Opt_err, NULL},
1293}; 1373};
1294 1374
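
With the token table extended as above, the lazy inode-table initialization can be tuned at mount time; the device path and multiplier below are illustrative:

	mount -o init_itable=10 /dev/sdb1 /mnt    # wait 10x the zeroing time between groups
	mount -o noinit_itable /dev/sdb1 /mnt     # do not start the zeroing thread

Both "init_itable=%u" and bare "init_itable" map to Opt_init_inode_table; the parser falls back to EXT4_DEF_LI_WAIT_MULT when no argument is given (see the case further down).
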
@@ -1353,7 +1433,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1353 sbi->s_qf_names[qtype] = NULL; 1433 sbi->s_qf_names[qtype] = NULL;
1354 return 0; 1434 return 0;
1355 } 1435 }
1356 set_opt(sbi->s_mount_opt, QUOTA); 1436 set_opt(sb, QUOTA);
1357 return 1; 1437 return 1;
1358} 1438}
1359 1439
@@ -1403,26 +1483,26 @@ static int parse_options(char *options, struct super_block *sb,
1403 * Initialize args struct so we know whether arg was 1483 * Initialize args struct so we know whether arg was
1404 * found; some options take optional arguments. 1484 * found; some options take optional arguments.
1405 */ 1485 */
1406 args[0].to = args[0].from = 0; 1486 args[0].to = args[0].from = NULL;
1407 token = match_token(p, tokens, args); 1487 token = match_token(p, tokens, args);
1408 switch (token) { 1488 switch (token) {
1409 case Opt_bsd_df: 1489 case Opt_bsd_df:
1410 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1490 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1411 clear_opt(sbi->s_mount_opt, MINIX_DF); 1491 clear_opt(sb, MINIX_DF);
1412 break; 1492 break;
1413 case Opt_minix_df: 1493 case Opt_minix_df:
1414 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1494 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1415 set_opt(sbi->s_mount_opt, MINIX_DF); 1495 set_opt(sb, MINIX_DF);
1416 1496
1417 break; 1497 break;
1418 case Opt_grpid: 1498 case Opt_grpid:
1419 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1499 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1420 set_opt(sbi->s_mount_opt, GRPID); 1500 set_opt(sb, GRPID);
1421 1501
1422 break; 1502 break;
1423 case Opt_nogrpid: 1503 case Opt_nogrpid:
1424 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1504 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1425 clear_opt(sbi->s_mount_opt, GRPID); 1505 clear_opt(sb, GRPID);
1426 1506
1427 break; 1507 break;
1428 case Opt_resuid: 1508 case Opt_resuid:
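
Every set_opt()/clear_opt() call in parse_options() now takes the superblock rather than the raw s_mount_opt word. The matching macro change lives in ext4.h and is not part of this hunk; a plausible shape consistent with the call sites here would be:

	#define set_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
	#define clear_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)

Routing through the sb also lets the helpers reach the new s_mount_opt2 word (visible in the mo2=%04x debug output later in this patch) without touching every call site again.
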
@@ -1440,38 +1520,38 @@ static int parse_options(char *options, struct super_block *sb,
1440 /* *sb_block = match_int(&args[0]); */ 1520 /* *sb_block = match_int(&args[0]); */
1441 break; 1521 break;
1442 case Opt_err_panic: 1522 case Opt_err_panic:
1443 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1523 clear_opt(sb, ERRORS_CONT);
1444 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1524 clear_opt(sb, ERRORS_RO);
1445 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1525 set_opt(sb, ERRORS_PANIC);
1446 break; 1526 break;
1447 case Opt_err_ro: 1527 case Opt_err_ro:
1448 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1528 clear_opt(sb, ERRORS_CONT);
1449 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1529 clear_opt(sb, ERRORS_PANIC);
1450 set_opt(sbi->s_mount_opt, ERRORS_RO); 1530 set_opt(sb, ERRORS_RO);
1451 break; 1531 break;
1452 case Opt_err_cont: 1532 case Opt_err_cont:
1453 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1533 clear_opt(sb, ERRORS_RO);
1454 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1534 clear_opt(sb, ERRORS_PANIC);
1455 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1535 set_opt(sb, ERRORS_CONT);
1456 break; 1536 break;
1457 case Opt_nouid32: 1537 case Opt_nouid32:
1458 set_opt(sbi->s_mount_opt, NO_UID32); 1538 set_opt(sb, NO_UID32);
1459 break; 1539 break;
1460 case Opt_debug: 1540 case Opt_debug:
1461 set_opt(sbi->s_mount_opt, DEBUG); 1541 set_opt(sb, DEBUG);
1462 break; 1542 break;
1463 case Opt_oldalloc: 1543 case Opt_oldalloc:
1464 set_opt(sbi->s_mount_opt, OLDALLOC); 1544 set_opt(sb, OLDALLOC);
1465 break; 1545 break;
1466 case Opt_orlov: 1546 case Opt_orlov:
1467 clear_opt(sbi->s_mount_opt, OLDALLOC); 1547 clear_opt(sb, OLDALLOC);
1468 break; 1548 break;
1469#ifdef CONFIG_EXT4_FS_XATTR 1549#ifdef CONFIG_EXT4_FS_XATTR
1470 case Opt_user_xattr: 1550 case Opt_user_xattr:
1471 set_opt(sbi->s_mount_opt, XATTR_USER); 1551 set_opt(sb, XATTR_USER);
1472 break; 1552 break;
1473 case Opt_nouser_xattr: 1553 case Opt_nouser_xattr:
1474 clear_opt(sbi->s_mount_opt, XATTR_USER); 1554 clear_opt(sb, XATTR_USER);
1475 break; 1555 break;
1476#else 1556#else
1477 case Opt_user_xattr: 1557 case Opt_user_xattr:
@@ -1481,10 +1561,10 @@ static int parse_options(char *options, struct super_block *sb,
1481#endif 1561#endif
1482#ifdef CONFIG_EXT4_FS_POSIX_ACL 1562#ifdef CONFIG_EXT4_FS_POSIX_ACL
1483 case Opt_acl: 1563 case Opt_acl:
1484 set_opt(sbi->s_mount_opt, POSIX_ACL); 1564 set_opt(sb, POSIX_ACL);
1485 break; 1565 break;
1486 case Opt_noacl: 1566 case Opt_noacl:
1487 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1567 clear_opt(sb, POSIX_ACL);
1488 break; 1568 break;
1489#else 1569#else
1490 case Opt_acl: 1570 case Opt_acl:
@@ -1503,7 +1583,7 @@ static int parse_options(char *options, struct super_block *sb,
1503 "Cannot specify journal on remount"); 1583 "Cannot specify journal on remount");
1504 return 0; 1584 return 0;
1505 } 1585 }
1506 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1586 set_opt(sb, UPDATE_JOURNAL);
1507 break; 1587 break;
1508 case Opt_journal_dev: 1588 case Opt_journal_dev:
1509 if (is_remount) { 1589 if (is_remount) {
@@ -1516,14 +1596,14 @@ static int parse_options(char *options, struct super_block *sb,
1516 *journal_devnum = option; 1596 *journal_devnum = option;
1517 break; 1597 break;
1518 case Opt_journal_checksum: 1598 case Opt_journal_checksum:
1519 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1599 set_opt(sb, JOURNAL_CHECKSUM);
1520 break; 1600 break;
1521 case Opt_journal_async_commit: 1601 case Opt_journal_async_commit:
1522 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1602 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1523 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1603 set_opt(sb, JOURNAL_CHECKSUM);
1524 break; 1604 break;
1525 case Opt_noload: 1605 case Opt_noload:
1526 set_opt(sbi->s_mount_opt, NOLOAD); 1606 set_opt(sb, NOLOAD);
1527 break; 1607 break;
1528 case Opt_commit: 1608 case Opt_commit:
1529 if (match_int(&args[0], &option)) 1609 if (match_int(&args[0], &option))
@@ -1566,15 +1646,15 @@ static int parse_options(char *options, struct super_block *sb,
1566 return 0; 1646 return 0;
1567 } 1647 }
1568 } else { 1648 } else {
1569 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1649 clear_opt(sb, DATA_FLAGS);
1570 sbi->s_mount_opt |= data_opt; 1650 sbi->s_mount_opt |= data_opt;
1571 } 1651 }
1572 break; 1652 break;
1573 case Opt_data_err_abort: 1653 case Opt_data_err_abort:
1574 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1654 set_opt(sb, DATA_ERR_ABORT);
1575 break; 1655 break;
1576 case Opt_data_err_ignore: 1656 case Opt_data_err_ignore:
1577 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1657 clear_opt(sb, DATA_ERR_ABORT);
1578 break; 1658 break;
1579#ifdef CONFIG_QUOTA 1659#ifdef CONFIG_QUOTA
1580 case Opt_usrjquota: 1660 case Opt_usrjquota:
@@ -1614,12 +1694,12 @@ set_qf_format:
1614 break; 1694 break;
1615 case Opt_quota: 1695 case Opt_quota:
1616 case Opt_usrquota: 1696 case Opt_usrquota:
1617 set_opt(sbi->s_mount_opt, QUOTA); 1697 set_opt(sb, QUOTA);
1618 set_opt(sbi->s_mount_opt, USRQUOTA); 1698 set_opt(sb, USRQUOTA);
1619 break; 1699 break;
1620 case Opt_grpquota: 1700 case Opt_grpquota:
1621 set_opt(sbi->s_mount_opt, QUOTA); 1701 set_opt(sb, QUOTA);
1622 set_opt(sbi->s_mount_opt, GRPQUOTA); 1702 set_opt(sb, GRPQUOTA);
1623 break; 1703 break;
1624 case Opt_noquota: 1704 case Opt_noquota:
1625 if (sb_any_quota_loaded(sb)) { 1705 if (sb_any_quota_loaded(sb)) {
@@ -1627,9 +1707,9 @@ set_qf_format:
1627 "options when quota turned on"); 1707 "options when quota turned on");
1628 return 0; 1708 return 0;
1629 } 1709 }
1630 clear_opt(sbi->s_mount_opt, QUOTA); 1710 clear_opt(sb, QUOTA);
1631 clear_opt(sbi->s_mount_opt, USRQUOTA); 1711 clear_opt(sb, USRQUOTA);
1632 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1712 clear_opt(sb, GRPQUOTA);
1633 break; 1713 break;
1634#else 1714#else
1635 case Opt_quota: 1715 case Opt_quota:
@@ -1655,7 +1735,7 @@ set_qf_format:
1655 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1735 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1656 break; 1736 break;
1657 case Opt_nobarrier: 1737 case Opt_nobarrier:
1658 clear_opt(sbi->s_mount_opt, BARRIER); 1738 clear_opt(sb, BARRIER);
1659 break; 1739 break;
1660 case Opt_barrier: 1740 case Opt_barrier:
1661 if (args[0].from) { 1741 if (args[0].from) {
@@ -1664,9 +1744,9 @@ set_qf_format:
1664 } else 1744 } else
1665 option = 1; /* No argument, default to 1 */ 1745 option = 1; /* No argument, default to 1 */
1666 if (option) 1746 if (option)
1667 set_opt(sbi->s_mount_opt, BARRIER); 1747 set_opt(sb, BARRIER);
1668 else 1748 else
1669 clear_opt(sbi->s_mount_opt, BARRIER); 1749 clear_opt(sb, BARRIER);
1670 break; 1750 break;
1671 case Opt_ignore: 1751 case Opt_ignore:
1672 break; 1752 break;
@@ -1690,11 +1770,17 @@ set_qf_format:
1690 "Ignoring deprecated bh option"); 1770 "Ignoring deprecated bh option");
1691 break; 1771 break;
1692 case Opt_i_version: 1772 case Opt_i_version:
1693 set_opt(sbi->s_mount_opt, I_VERSION); 1773 set_opt(sb, I_VERSION);
1694 sb->s_flags |= MS_I_VERSION; 1774 sb->s_flags |= MS_I_VERSION;
1695 break; 1775 break;
1696 case Opt_nodelalloc: 1776 case Opt_nodelalloc:
1697 clear_opt(sbi->s_mount_opt, DELALLOC); 1777 clear_opt(sb, DELALLOC);
1778 break;
1779 case Opt_mblk_io_submit:
1780 set_opt(sb, MBLK_IO_SUBMIT);
1781 break;
1782 case Opt_nomblk_io_submit:
1783 clear_opt(sb, MBLK_IO_SUBMIT);
1698 break; 1784 break;
1699 case Opt_stripe: 1785 case Opt_stripe:
1700 if (match_int(&args[0], &option)) 1786 if (match_int(&args[0], &option))
@@ -1704,20 +1790,20 @@ set_qf_format:
1704 sbi->s_stripe = option; 1790 sbi->s_stripe = option;
1705 break; 1791 break;
1706 case Opt_delalloc: 1792 case Opt_delalloc:
1707 set_opt(sbi->s_mount_opt, DELALLOC); 1793 set_opt(sb, DELALLOC);
1708 break; 1794 break;
1709 case Opt_block_validity: 1795 case Opt_block_validity:
1710 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1796 set_opt(sb, BLOCK_VALIDITY);
1711 break; 1797 break;
1712 case Opt_noblock_validity: 1798 case Opt_noblock_validity:
1713 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1799 clear_opt(sb, BLOCK_VALIDITY);
1714 break; 1800 break;
1715 case Opt_inode_readahead_blks: 1801 case Opt_inode_readahead_blks:
1716 if (match_int(&args[0], &option)) 1802 if (match_int(&args[0], &option))
1717 return 0; 1803 return 0;
1718 if (option < 0 || option > (1 << 30)) 1804 if (option < 0 || option > (1 << 30))
1719 return 0; 1805 return 0;
1720 if (!is_power_of_2(option)) { 1806 if (option && !is_power_of_2(option)) {
1721 ext4_msg(sb, KERN_ERR, 1807 ext4_msg(sb, KERN_ERR,
1722 "EXT4-fs: inode_readahead_blks" 1808 "EXT4-fs: inode_readahead_blks"
1723 " must be a power of 2"); 1809 " must be a power of 2");
@@ -1734,7 +1820,7 @@ set_qf_format:
1734 option); 1820 option);
1735 break; 1821 break;
1736 case Opt_noauto_da_alloc: 1822 case Opt_noauto_da_alloc:
1737 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1823 set_opt(sb, NO_AUTO_DA_ALLOC);
1738 break; 1824 break;
1739 case Opt_auto_da_alloc: 1825 case Opt_auto_da_alloc:
1740 if (args[0].from) { 1826 if (args[0].from) {
@@ -1743,21 +1829,35 @@ set_qf_format:
1743 } else 1829 } else
1744 option = 1; /* No argument, default to 1 */ 1830 option = 1; /* No argument, default to 1 */
1745 if (option) 1831 if (option)
1746 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1832 clear_opt(sb, NO_AUTO_DA_ALLOC);
1747 else 1833 else
1748 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1834 set_opt(sb, NO_AUTO_DA_ALLOC);
1749 break; 1835 break;
1750 case Opt_discard: 1836 case Opt_discard:
1751 set_opt(sbi->s_mount_opt, DISCARD); 1837 set_opt(sb, DISCARD);
1752 break; 1838 break;
1753 case Opt_nodiscard: 1839 case Opt_nodiscard:
1754 clear_opt(sbi->s_mount_opt, DISCARD); 1840 clear_opt(sb, DISCARD);
1755 break; 1841 break;
1756 case Opt_dioread_nolock: 1842 case Opt_dioread_nolock:
1757 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1843 set_opt(sb, DIOREAD_NOLOCK);
1758 break; 1844 break;
1759 case Opt_dioread_lock: 1845 case Opt_dioread_lock:
1760 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1846 clear_opt(sb, DIOREAD_NOLOCK);
1847 break;
1848 case Opt_init_inode_table:
1849 set_opt(sb, INIT_INODE_TABLE);
1850 if (args[0].from) {
1851 if (match_int(&args[0], &option))
1852 return 0;
1853 } else
1854 option = EXT4_DEF_LI_WAIT_MULT;
1855 if (option < 0)
1856 return 0;
1857 sbi->s_li_wait_mult = option;
1858 break;
1859 case Opt_noinit_inode_table:
1860 clear_opt(sb, INIT_INODE_TABLE);
1761 break; 1861 break;
1762 default: 1862 default:
1763 ext4_msg(sb, KERN_ERR, 1863 ext4_msg(sb, KERN_ERR,
@@ -1769,10 +1869,10 @@ set_qf_format:
1769#ifdef CONFIG_QUOTA 1869#ifdef CONFIG_QUOTA
1770 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1870 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1771 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1871 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1772 clear_opt(sbi->s_mount_opt, USRQUOTA); 1872 clear_opt(sb, USRQUOTA);
1773 1873
1774 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1874 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1775 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1875 clear_opt(sb, GRPQUOTA);
1776 1876
1777 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1877 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1778 ext4_msg(sb, KERN_ERR, "old and new quota " 1878 ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1817,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1817 ext4_msg(sb, KERN_WARNING, 1917 ext4_msg(sb, KERN_WARNING,
1818 "warning: mounting fs with errors, " 1918 "warning: mounting fs with errors, "
1819 "running e2fsck is recommended"); 1919 "running e2fsck is recommended");
1820 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1920 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1821 le16_to_cpu(es->s_mnt_count) >= 1921 le16_to_cpu(es->s_mnt_count) >=
1822 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1922 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1823 ext4_msg(sb, KERN_WARNING, 1923 ext4_msg(sb, KERN_WARNING,
@@ -1842,13 +1942,14 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1842 ext4_commit_super(sb, 1); 1942 ext4_commit_super(sb, 1);
1843 if (test_opt(sb, DEBUG)) 1943 if (test_opt(sb, DEBUG))
1844 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1944 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1845 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1945 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1846 sb->s_blocksize, 1946 sb->s_blocksize,
1847 sbi->s_groups_count, 1947 sbi->s_groups_count,
1848 EXT4_BLOCKS_PER_GROUP(sb), 1948 EXT4_BLOCKS_PER_GROUP(sb),
1849 EXT4_INODES_PER_GROUP(sb), 1949 EXT4_INODES_PER_GROUP(sb),
1850 sbi->s_mount_opt); 1950 sbi->s_mount_opt, sbi->s_mount_opt2);
1851 1951
1952 cleancache_init_fs(sb);
1852 return res; 1953 return res;
1853} 1954}
1854 1955
@@ -1877,14 +1978,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1877 size = flex_group_count * sizeof(struct flex_groups); 1978 size = flex_group_count * sizeof(struct flex_groups);
1878 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1979 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1879 if (sbi->s_flex_groups == NULL) { 1980 if (sbi->s_flex_groups == NULL) {
1880 sbi->s_flex_groups = vmalloc(size); 1981 sbi->s_flex_groups = vzalloc(size);
1881 if (sbi->s_flex_groups) 1982 if (sbi->s_flex_groups == NULL) {
1882 memset(sbi->s_flex_groups, 0, size); 1983 ext4_msg(sb, KERN_ERR,
1883 } 1984 "not enough memory for %u flex groups",
1884 if (sbi->s_flex_groups == NULL) { 1985 flex_group_count);
1885 ext4_msg(sb, KERN_ERR, "not enough memory for " 1986 goto failed;
1886 "%u flex groups", flex_group_count); 1987 }
1887 goto failed;
1888 } 1988 }
1889 1989
1890 for (i = 0; i < sbi->s_groups_count; i++) { 1990 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -1942,7 +2042,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1942} 2042}
1943 2043
1944/* Called at mount-time, super-block is locked */ 2044/* Called at mount-time, super-block is locked */
1945static int ext4_check_descriptors(struct super_block *sb) 2045static int ext4_check_descriptors(struct super_block *sb,
2046 ext4_group_t *first_not_zeroed)
1946{ 2047{
1947 struct ext4_sb_info *sbi = EXT4_SB(sb); 2048 struct ext4_sb_info *sbi = EXT4_SB(sb);
1948 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); 2049 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1951,7 +2052,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1951 ext4_fsblk_t inode_bitmap; 2052 ext4_fsblk_t inode_bitmap;
1952 ext4_fsblk_t inode_table; 2053 ext4_fsblk_t inode_table;
1953 int flexbg_flag = 0; 2054 int flexbg_flag = 0;
1954 ext4_group_t i; 2055 ext4_group_t i, grp = sbi->s_groups_count;
1955 2056
1956 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2057 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1957 flexbg_flag = 1; 2058 flexbg_flag = 1;
@@ -1967,6 +2068,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1967 last_block = first_block + 2068 last_block = first_block +
1968 (EXT4_BLOCKS_PER_GROUP(sb) - 1); 2069 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1969 2070
2071 if ((grp == sbi->s_groups_count) &&
2072 !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2073 grp = i;
2074
1970 block_bitmap = ext4_block_bitmap(sb, gdp); 2075 block_bitmap = ext4_block_bitmap(sb, gdp);
1971 if (block_bitmap < first_block || block_bitmap > last_block) { 2076 if (block_bitmap < first_block || block_bitmap > last_block) {
1972 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2077 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2004,6 +2109,8 @@ static int ext4_check_descriptors(struct super_block *sb)
2004 if (!flexbg_flag) 2109 if (!flexbg_flag)
2005 first_block += EXT4_BLOCKS_PER_GROUP(sb); 2110 first_block += EXT4_BLOCKS_PER_GROUP(sb);
2006 } 2111 }
2112 if (NULL != first_not_zeroed)
2113 *first_not_zeroed = grp;
2007 2114
2008 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2115 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2009 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2116 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2046,6 +2153,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2046 return; 2153 return;
2047 } 2154 }
2048 2155
2156 /* Check if the feature set would not allow a r/w mount */
2157 if (!ext4_feature_set_ok(sb, 0)) {
2158 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2159 "unknown ROCOMPAT features");
2160 return;
2161 }
2162
2049 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2163 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2050 if (es->s_last_orphan) 2164 if (es->s_last_orphan)
2051 jbd_debug(1, "Errors on filesystem, " 2165 jbd_debug(1, "Errors on filesystem, "
@@ -2129,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2129 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, 2243 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
2130 * so that won't be a limiting factor. 2244 * so that won't be a limiting factor.
2131 * 2245 *
2246 * However, there is another limiting factor: we store extents as a
2247 * starting block plus a length, so the length of the extent covering
2248 * the maximum file size must also fit into the on-disk containers.
2249 * Since a length is always one unit bigger than the largest offset it
2250 * covers (offsets count from 0), we must lower s_maxbytes by one fs block.
2251 *
2132 * Note, this does *not* consider any metadata overhead for vfs i_blocks. 2252 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2133 */ 2253 */
2134static loff_t ext4_max_size(int blkbits, int has_huge_files) 2254static loff_t ext4_max_size(int blkbits, int has_huge_files)
@@ -2150,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
2150 upper_limit <<= blkbits; 2270 upper_limit <<= blkbits;
2151 } 2271 }
2152 2272
2153 /* 32-bit extent-start container, ee_block */ 2273 /*
2154 res = 1LL << 32; 2274 * 32-bit extent-start container, ee_block. We lower the maxbytes
2275 * by one fs block, so ee_len can cover the extent of maximum file
2276 * size
2277 */
2278 res = (1LL << 32) - 1;
2155 res <<= blkbits; 2279 res <<= blkbits;
2156 res -= 1;
2157 2280
2158 /* Sanity check against vm- & vfs- imposed limits */ 2281 /* Sanity check against vm- & vfs- imposed limits */
2159 if (res > upper_limit) 2282 if (res > upper_limit)
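
Worked example for the common 4 KiB block size (blkbits = 12): the old code computed res = (1 << 32 << 12) - 1 = 2^44 - 1, i.e. 16 TiB minus one byte; the new code computes res = (2^32 - 1) << 12 = 2^44 - 2^12, i.e. 16 TiB minus one 4 KiB block, so the length of an extent covering the largest possible file still fits its on-disk container.
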
@@ -2329,6 +2452,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2329 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2452 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2330} 2453}
2331 2454
2455static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2456 struct ext4_sb_info *sbi, char *buf)
2457{
2458 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2459}
2460
2461static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2462 struct ext4_sb_info *sbi, char *buf)
2463{
2464 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2465}
2466
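
Both counters appear as read-only files in the filesystem's sysfs directory, so the effectiveness of the single-extent cache can be sampled from userspace (device name illustrative):

	cat /sys/fs/ext4/sda1/extent_cache_hits
	cat /sys/fs/ext4/sda1/extent_cache_misses
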
2332static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2467static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2333 struct ext4_sb_info *sbi, 2468 struct ext4_sb_info *sbi,
2334 const char *buf, size_t count) 2469 const char *buf, size_t count)
@@ -2338,7 +2473,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2338 if (parse_strtoul(buf, 0x40000000, &t)) 2473 if (parse_strtoul(buf, 0x40000000, &t))
2339 return -EINVAL; 2474 return -EINVAL;
2340 2475
2341 if (!is_power_of_2(t)) 2476 if (t && !is_power_of_2(t))
2342 return -EINVAL; 2477 return -EINVAL;
2343 2478
2344 sbi->s_inode_readahead_blks = t; 2479 sbi->s_inode_readahead_blks = t;
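
Together with the matching "option && !is_power_of_2(option)" change in parse_options() above, this accepts a value of 0, so inode readahead can now be disabled outright, e.g. by writing 0 to /sys/fs/ext4/<disk>/inode_readahead_blks.
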
@@ -2376,6 +2511,7 @@ static struct ext4_attr ext4_attr_##_name = { \
2376#define EXT4_ATTR(name, mode, show, store) \ 2511#define EXT4_ATTR(name, mode, show, store) \
2377static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2512static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2378 2513
2514#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2379#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2515#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2380#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2516#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2381#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2517#define EXT4_RW_ATTR_SBI_UI(name, elname) \
@@ -2385,6 +2521,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2385EXT4_RO_ATTR(delayed_allocation_blocks); 2521EXT4_RO_ATTR(delayed_allocation_blocks);
2386EXT4_RO_ATTR(session_write_kbytes); 2522EXT4_RO_ATTR(session_write_kbytes);
2387EXT4_RO_ATTR(lifetime_write_kbytes); 2523EXT4_RO_ATTR(lifetime_write_kbytes);
2524EXT4_RO_ATTR(extent_cache_hits);
2525EXT4_RO_ATTR(extent_cache_misses);
2388EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2526EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2389 inode_readahead_blks_store, s_inode_readahead_blks); 2527 inode_readahead_blks_store, s_inode_readahead_blks);
2390EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2528EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2400,6 +2538,8 @@ static struct attribute *ext4_attrs[] = {
2400 ATTR_LIST(delayed_allocation_blocks), 2538 ATTR_LIST(delayed_allocation_blocks),
2401 ATTR_LIST(session_write_kbytes), 2539 ATTR_LIST(session_write_kbytes),
2402 ATTR_LIST(lifetime_write_kbytes), 2540 ATTR_LIST(lifetime_write_kbytes),
2541 ATTR_LIST(extent_cache_hits),
2542 ATTR_LIST(extent_cache_misses),
2403 ATTR_LIST(inode_readahead_blks), 2543 ATTR_LIST(inode_readahead_blks),
2404 ATTR_LIST(inode_goal), 2544 ATTR_LIST(inode_goal),
2405 ATTR_LIST(mb_stats), 2545 ATTR_LIST(mb_stats),
@@ -2412,6 +2552,16 @@ static struct attribute *ext4_attrs[] = {
2412 NULL, 2552 NULL,
2413}; 2553};
2414 2554
2555/* Features this copy of ext4 supports */
2556EXT4_INFO_ATTR(lazy_itable_init);
2557EXT4_INFO_ATTR(batched_discard);
2558
2559static struct attribute *ext4_feat_attrs[] = {
2560 ATTR_LIST(lazy_itable_init),
2561 ATTR_LIST(batched_discard),
2562 NULL,
2563};
2564
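
The feature attributes are deliberately content-free: each one materializes as an empty file in a features directory under /sys/fs/ext4/ (the kobject registration appears later in this patch), so userspace tools such as mke2fs can probe for lazy_itable_init or batched_discard support simply by testing whether the file exists.
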
2415static ssize_t ext4_attr_show(struct kobject *kobj, 2565static ssize_t ext4_attr_show(struct kobject *kobj,
2416 struct attribute *attr, char *buf) 2566 struct attribute *attr, char *buf)
2417{ 2567{
@@ -2440,7 +2590,6 @@ static void ext4_sb_release(struct kobject *kobj)
2440 complete(&sbi->s_kobj_unregister); 2590 complete(&sbi->s_kobj_unregister);
2441} 2591}
2442 2592
2443
2444static const struct sysfs_ops ext4_attr_ops = { 2593static const struct sysfs_ops ext4_attr_ops = {
2445 .show = ext4_attr_show, 2594 .show = ext4_attr_show,
2446 .store = ext4_attr_store, 2595 .store = ext4_attr_store,
@@ -2452,6 +2601,17 @@ static struct kobj_type ext4_ktype = {
2452 .release = ext4_sb_release, 2601 .release = ext4_sb_release,
2453}; 2602};
2454 2603
2604static void ext4_feat_release(struct kobject *kobj)
2605{
2606 complete(&ext4_feat->f_kobj_unregister);
2607}
2608
2609static struct kobj_type ext4_feat_ktype = {
2610 .default_attrs = ext4_feat_attrs,
2611 .sysfs_ops = &ext4_attr_ops,
2612 .release = ext4_feat_release,
2613};
2614
2455/* 2615/*
2456 * Check whether this filesystem can be mounted based on 2616 * Check whether this filesystem can be mounted based on
2457 * the features present and the RDONLY/RDWR mount requested. 2617 * the features present and the RDONLY/RDWR mount requested.
@@ -2542,6 +2702,343 @@ static void print_daily_error_info(unsigned long arg)
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2702 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543} 2703}
2544 2704
2705/* Find next suitable group and run ext4_init_inode_table */
2706static int ext4_run_li_request(struct ext4_li_request *elr)
2707{
2708 struct ext4_group_desc *gdp = NULL;
2709 ext4_group_t group, ngroups;
2710 struct super_block *sb;
2711 unsigned long timeout = 0;
2712 int ret = 0;
2713
2714 sb = elr->lr_super;
2715 ngroups = EXT4_SB(sb)->s_groups_count;
2716
2717 for (group = elr->lr_next_group; group < ngroups; group++) {
2718 gdp = ext4_get_group_desc(sb, group, NULL);
2719 if (!gdp) {
2720 ret = 1;
2721 break;
2722 }
2723
2724 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2725 break;
2726 }
2727
2728 if (group == ngroups)
2729 ret = 1;
2730
2731 if (!ret) {
2732 timeout = jiffies;
2733 ret = ext4_init_inode_table(sb, group,
2734 elr->lr_timeout ? 0 : 1);
2735 if (elr->lr_timeout == 0) {
2736 timeout = (jiffies - timeout) *
2737 elr->lr_sbi->s_li_wait_mult;
2738 elr->lr_timeout = timeout;
2739 }
2740 elr->lr_next_sched = jiffies + elr->lr_timeout;
2741 elr->lr_next_group = group + 1;
2742 }
2743
2744 return ret;
2745}
2746
2747/*
2748 * Remove lr_request from the list_request and free the
2749 * request structure. Should be called with li_list_mtx held
2750 */
2751static void ext4_remove_li_request(struct ext4_li_request *elr)
2752{
2753 struct ext4_sb_info *sbi;
2754
2755 if (!elr)
2756 return;
2757
2758 sbi = elr->lr_sbi;
2759
2760 list_del(&elr->lr_request);
2761 sbi->s_li_request = NULL;
2762 kfree(elr);
2763}
2764
2765static void ext4_unregister_li_request(struct super_block *sb)
2766{
2767 mutex_lock(&ext4_li_mtx);
2768 if (!ext4_li_info) {
2769 mutex_unlock(&ext4_li_mtx);
2770 return;
2771 }
2772
2773 mutex_lock(&ext4_li_info->li_list_mtx);
2774 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2775 mutex_unlock(&ext4_li_info->li_list_mtx);
2776 mutex_unlock(&ext4_li_mtx);
2777}
2778
2779static struct task_struct *ext4_lazyinit_task;
2780
2781/*
2782 * This is the function where the ext4lazyinit thread lives. It walks
2783 * through the request list searching for the next scheduled filesystem.
2784 * When such a fs is found, it runs the lazy initialization request
2785 * (ext4_run_li_request) and keeps track of the time spent in this
2786 * function. Based on that time we compute the next schedule time of
2787 * the request. When the walk through the list is complete, we compute
2788 * the next wakeup time and put the thread to sleep.
2789 */
2790static int ext4_lazyinit_thread(void *arg)
2791{
2792 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2793 struct list_head *pos, *n;
2794 struct ext4_li_request *elr;
2795 unsigned long next_wakeup, cur;
2796
2797 BUG_ON(NULL == eli);
2798
2799cont_thread:
2800 while (true) {
2801 next_wakeup = MAX_JIFFY_OFFSET;
2802
2803 mutex_lock(&eli->li_list_mtx);
2804 if (list_empty(&eli->li_request_list)) {
2805 mutex_unlock(&eli->li_list_mtx);
2806 goto exit_thread;
2807 }
2808
2809 list_for_each_safe(pos, n, &eli->li_request_list) {
2810 elr = list_entry(pos, struct ext4_li_request,
2811 lr_request);
2812
2813 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2814 if (ext4_run_li_request(elr) != 0) {
2815 /* error, remove the lazy_init job */
2816 ext4_remove_li_request(elr);
2817 continue;
2818 }
2819 }
2820
2821 if (time_before(elr->lr_next_sched, next_wakeup))
2822 next_wakeup = elr->lr_next_sched;
2823 }
2824 mutex_unlock(&eli->li_list_mtx);
2825
2826 if (freezing(current))
2827 refrigerator();
2828
2829 cur = jiffies;
2830 if ((time_after_eq(cur, next_wakeup)) ||
2831 (MAX_JIFFY_OFFSET == next_wakeup)) {
2832 cond_resched();
2833 continue;
2834 }
2835
2836 schedule_timeout_interruptible(next_wakeup - cur);
2837
2838 if (kthread_should_stop()) {
2839 ext4_clear_request_list();
2840 goto exit_thread;
2841 }
2842 }
2843
2844exit_thread:
2845 /*
2846 * It looks like the request list is empty, but we need
2847 * to check it under the li_list_mtx lock, to prevent any
2848 * additions to it, and of course we must hold ext4_li_mtx
2849 * to atomically free the list and ext4_li_info, because at
2850 * this point another ext4 filesystem could be registering
2851 * a new one.
2852 */
2853 mutex_lock(&ext4_li_mtx);
2854 mutex_lock(&eli->li_list_mtx);
2855 if (!list_empty(&eli->li_request_list)) {
2856 mutex_unlock(&eli->li_list_mtx);
2857 mutex_unlock(&ext4_li_mtx);
2858 goto cont_thread;
2859 }
2860 mutex_unlock(&eli->li_list_mtx);
2861 kfree(ext4_li_info);
2862 ext4_li_info = NULL;
2863 mutex_unlock(&ext4_li_mtx);
2864
2865 return 0;
2866}
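
The loop above is a classic min-next-deadline scheduler: run every request that is due, remember the earliest lr_next_sched among the survivors, and sleep exactly until then. A compressed user-space sketch of the same pattern (plain counters model jiffies; this is an illustration, not the kernel code):

    #include <stdio.h>

    #define NREQ 3
    #define MAX_OFFSET (~0UL)   /* like MAX_JIFFY_OFFSET */

    int main(void)
    {
        unsigned long now = 0;
        unsigned long next_sched[NREQ] = { 5, 12, 9 };
        int alive = NREQ;

        while (alive) {
            unsigned long next_wakeup = MAX_OFFSET;
            for (int i = 0; i < NREQ; i++) {
                if (next_sched[i] == 0)
                    continue;              /* request already removed */
                if (now >= next_sched[i]) {
                    printf("t=%lu run request %d\n", now, i);
                    next_sched[i] = 0;     /* done: drop from the list */
                    alive--;
                    continue;
                }
                if (next_sched[i] < next_wakeup)
                    next_wakeup = next_sched[i];
            }
            if (next_wakeup != MAX_OFFSET)
                now = next_wakeup;         /* schedule_timeout(next - cur) */
        }
        return 0;
    }
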
2867
2868static void ext4_clear_request_list(void)
2869{
2870 struct list_head *pos, *n;
2871 struct ext4_li_request *elr;
2872
2873 mutex_lock(&ext4_li_info->li_list_mtx);
2874 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2875 elr = list_entry(pos, struct ext4_li_request,
2876 lr_request);
2877 ext4_remove_li_request(elr);
2878 }
2879 mutex_unlock(&ext4_li_info->li_list_mtx);
2880}
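
ext4_clear_request_list relies on list_for_each_safe because each node is freed mid-walk. A stand-alone C sketch of why the "safe" variant matters, with a hand-rolled singly linked list standing in for list_head:

    #include <stdlib.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };

    static void clear_all(struct node **head)
    {
        struct node *pos = *head, *n;

        while (pos) {
            n = pos->next;    /* save before freeing, like list_for_each_safe */
            printf("removing %d\n", pos->id);
            free(pos);
            pos = n;
        }
        *head = NULL;
    }

    int main(void)
    {
        struct node *head = NULL;
        for (int i = 0; i < 3; i++) {
            struct node *e = malloc(sizeof(*e));
            e->id = i; e->next = head; head = e;
        }
        clear_all(&head);
        return 0;
    }
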
2881
2882static int ext4_run_lazyinit_thread(void)
2883{
2884 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2885 ext4_li_info, "ext4lazyinit");
2886 if (IS_ERR(ext4_lazyinit_task)) {
2887 int err = PTR_ERR(ext4_lazyinit_task);
2888 ext4_clear_request_list();
2889 kfree(ext4_li_info);
2890 ext4_li_info = NULL;
2891 printk(KERN_CRIT "EXT4: error %d creating inode table "
2892 "initialization thread\n",
2893 err);
2894 return err;
2895 }
2896 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2897 return 0;
2898}
2899
2900/*
2901 * Check whether it makes sense to run the itable init thread or not.
2902 * If there is at least one uninitialized inode table, return the
2903 * corresponding group number; otherwise the loop goes through all
2904 * groups and returns the total number of groups.
2905 */
2906static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2907{
2908 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2909 struct ext4_group_desc *gdp = NULL;
2910
2911 for (group = 0; group < ngroups; group++) {
2912 gdp = ext4_get_group_desc(sb, group, NULL);
2913 if (!gdp)
2914 continue;
2915
2916 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2917 break;
2918 }
2919
2920 return group;
2921}
2922
2923static int ext4_li_info_new(void)
2924{
2925 struct ext4_lazy_init *eli = NULL;
2926
2927 eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2928 if (!eli)
2929 return -ENOMEM;
2930
2931 INIT_LIST_HEAD(&eli->li_request_list);
2932 mutex_init(&eli->li_list_mtx);
2933
2934 eli->li_state |= EXT4_LAZYINIT_QUIT;
2935
2936 ext4_li_info = eli;
2937
2938 return 0;
2939}
2940
2941static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2942 ext4_group_t start)
2943{
2944 struct ext4_sb_info *sbi = EXT4_SB(sb);
2945 struct ext4_li_request *elr;
2946 unsigned long rnd;
2947
2948 elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2949 if (!elr)
2950 return NULL;
2951
2952 elr->lr_super = sb;
2953 elr->lr_sbi = sbi;
2954 elr->lr_next_group = start;
2955
2956 /*
2957 * Randomize the first schedule time of the request to
2958 * spread the inode table initialization requests out
2959 * more evenly.
2960 */
2961 get_random_bytes(&rnd, sizeof(rnd));
2962 elr->lr_next_sched = jiffies + (unsigned long)rnd %
2963 (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2964
2965 return elr;
2966}
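
The randomized first wakeup keeps many filesystems mounted at once from waking their lazyinit requests in lockstep. A hedged user-space sketch of the same computation (HZ and the delay window here are made-up stand-ins for the kernel constants):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define HZ 100
    #define MAX_START_DELAY 5   /* seconds, like EXT4_DEF_LI_MAX_START_DELAY */

    int main(void)
    {
        srand((unsigned)time(NULL));
        unsigned long jiffies = 1000;           /* pretend current time */
        unsigned long rnd = (unsigned long)rand();
        /* first wakeup lands somewhere in the next MAX_START_DELAY seconds */
        unsigned long first = jiffies + rnd % (MAX_START_DELAY * HZ);

        printf("first wakeup at jiffy %lu (now %lu)\n", first, jiffies);
        return 0;
    }
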
2967
2968static int ext4_register_li_request(struct super_block *sb,
2969 ext4_group_t first_not_zeroed)
2970{
2971 struct ext4_sb_info *sbi = EXT4_SB(sb);
2972 struct ext4_li_request *elr;
2973 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2974 int ret = 0;
2975
2976 if (sbi->s_li_request != NULL) {
2977 /*
2978 * Reset timeout so it can be computed again, because
2979 * s_li_wait_mult might have changed.
2980 */
2981 sbi->s_li_request->lr_timeout = 0;
2982 return 0;
2983 }
2984
2985 if (first_not_zeroed == ngroups ||
2986 (sb->s_flags & MS_RDONLY) ||
2987 !test_opt(sb, INIT_INODE_TABLE))
2988 return 0;
2989
2990 elr = ext4_li_request_new(sb, first_not_zeroed);
2991 if (!elr)
2992 return -ENOMEM;
2993
2994 mutex_lock(&ext4_li_mtx);
2995
2996 if (NULL == ext4_li_info) {
2997 ret = ext4_li_info_new();
2998 if (ret)
2999 goto out;
3000 }
3001
3002 mutex_lock(&ext4_li_info->li_list_mtx);
3003 list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3004 mutex_unlock(&ext4_li_info->li_list_mtx);
3005
3006 sbi->s_li_request = elr;
3007 /*
3008 * Set elr to NULL here since it has been inserted into
3009 * the request list; its removal and freeing are handled
3010 * by ext4_clear_request_list from now on.
3011 */
3012 elr = NULL;
3013
3014 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3015 ret = ext4_run_lazyinit_thread();
3016 if (ret)
3017 goto out;
3018 }
3019out:
3020 mutex_unlock(&ext4_li_mtx);
3021 if (ret)
3022 kfree(elr);
3023 return ret;
3024}
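
ext4_register_li_request allocates the request outside the lock, creates the shared ext4_li_info once under ext4_li_mtx, and clears the local pointer after the list takes ownership so the shared error path cannot double-free. A small C sketch of that create-if-absent-and-hand-off shape (a pthread mutex models ext4_li_mtx; names are illustrative):

    #include <pthread.h>
    #include <stdlib.h>
    #include <stdio.h>

    static pthread_mutex_t reg_mtx = PTHREAD_MUTEX_INITIALIZER;
    static struct state { int nreq; } *global_state;

    static int register_request(void)
    {
        int *req = malloc(sizeof(*req));    /* allocated outside the lock */
        int ret = 0;

        if (!req)
            return -1;

        pthread_mutex_lock(&reg_mtx);
        if (!global_state) {                /* create the singleton once */
            global_state = calloc(1, sizeof(*global_state));
            if (!global_state) { ret = -1; goto out; }
        }
        global_state->nreq++;               /* list_add(...) in the real code */
        req = NULL;                         /* ownership transferred */
    out:
        pthread_mutex_unlock(&reg_mtx);
        free(req);                          /* frees only on the error path */
        return ret;
    }

    int main(void) { printf("%d\n", register_request()); return 0; }
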
3025
3026/*
3027 * We do not need to lock anything since this is called on
3028 * module unload.
3029 */
3030static void ext4_destroy_lazyinit_thread(void)
3031{
3032 /*
3033 * If thread exited earlier
3034 * there's nothing to be done.
3035 */
3036 if (!ext4_li_info || !ext4_lazyinit_task)
3037 return;
3038
3039 kthread_stop(ext4_lazyinit_task);
3040}
3041
2545static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3042static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2546 __releases(kernel_lock) 3043 __releases(kernel_lock)
2547 __acquires(kernel_lock) 3044 __acquires(kernel_lock)
@@ -2567,6 +3064,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2567 __u64 blocks_count; 3064 __u64 blocks_count;
2568 int err; 3065 int err;
2569 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3066 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3067 ext4_group_t first_not_zeroed;
2570 3068
2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 3069 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2572 if (!sbi) 3070 if (!sbi)
@@ -2588,8 +3086,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2588 sbi->s_sectors_written_start = 3086 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3087 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2590 3088
2591 unlock_kernel();
2592
2593 /* Cleanup superblock name */ 3089 /* Cleanup superblock name */
2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3090 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2595 *cp = '!'; 3091 *cp = '!';
@@ -2629,40 +3125,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2629 3125
2630 /* Set defaults before we parse the mount options */ 3126 /* Set defaults before we parse the mount options */
2631 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3127 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3128 set_opt(sb, INIT_INODE_TABLE);
2632 if (def_mount_opts & EXT4_DEFM_DEBUG) 3129 if (def_mount_opts & EXT4_DEFM_DEBUG)
2633 set_opt(sbi->s_mount_opt, DEBUG); 3130 set_opt(sb, DEBUG);
2634 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3131 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2635 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3132 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2636 "2.6.38"); 3133 "2.6.38");
2637 set_opt(sbi->s_mount_opt, GRPID); 3134 set_opt(sb, GRPID);
2638 } 3135 }
2639 if (def_mount_opts & EXT4_DEFM_UID16) 3136 if (def_mount_opts & EXT4_DEFM_UID16)
2640 set_opt(sbi->s_mount_opt, NO_UID32); 3137 set_opt(sb, NO_UID32);
3138 /* xattr user namespace & acls are now defaulted on */
2641#ifdef CONFIG_EXT4_FS_XATTR 3139#ifdef CONFIG_EXT4_FS_XATTR
2642 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3140 set_opt(sb, XATTR_USER);
2643 set_opt(sbi->s_mount_opt, XATTR_USER);
2644#endif 3141#endif
2645#ifdef CONFIG_EXT4_FS_POSIX_ACL 3142#ifdef CONFIG_EXT4_FS_POSIX_ACL
2646 if (def_mount_opts & EXT4_DEFM_ACL) 3143 set_opt(sb, POSIX_ACL);
2647 set_opt(sbi->s_mount_opt, POSIX_ACL);
2648#endif 3144#endif
3145 set_opt(sb, MBLK_IO_SUBMIT);
2649 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3146 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2650 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3147 set_opt(sb, JOURNAL_DATA);
2651 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3148 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2652 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3149 set_opt(sb, ORDERED_DATA);
2653 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3150 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2654 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3151 set_opt(sb, WRITEBACK_DATA);
2655 3152
2656 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3153 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2657 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3154 set_opt(sb, ERRORS_PANIC);
2658 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3155 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
2659 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3156 set_opt(sb, ERRORS_CONT);
2660 else 3157 else
2661 set_opt(sbi->s_mount_opt, ERRORS_RO); 3158 set_opt(sb, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3159 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3160 set_opt(sb, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD) 3161 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD); 3162 set_opt(sb, DISCARD);
2666 3163
2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3164 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3165 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2671,7 +3168,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3168 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2672 3169
2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3170 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER); 3171 set_opt(sb, BARRIER);
2675 3172
2676 /* 3173 /*
2677 * enable delayed allocation by default 3174 * enable delayed allocation by default
@@ -2679,7 +3176,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 */ 3176 */
2680 if (!IS_EXT3_SB(sb) && 3177 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3178 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2682 set_opt(sbi->s_mount_opt, DELALLOC); 3179 set_opt(sb, DELALLOC);
3180
3181 /*
3182 * Set the default s_li_wait_mult for lazyinit, in case there is
3183 * no mount option specified.
3184 */
3185 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
2683 3186
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3187 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) { 3188 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -2702,6 +3205,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2702 "feature flags set on rev 0 fs, " 3205 "feature flags set on rev 0 fs, "
2703 "running e2fsck is recommended"); 3206 "running e2fsck is recommended");
2704 3207
3208 if (IS_EXT2_SB(sb)) {
3209 if (ext2_feature_set_ok(sb))
3210 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3211 "using the ext4 subsystem");
3212 else {
3213 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3214 "to feature incompatibilities");
3215 goto failed_mount;
3216 }
3217 }
3218
3219 if (IS_EXT3_SB(sb)) {
3220 if (ext3_feature_set_ok(sb))
3221 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3222 "using the ext4 subsystem");
3223 else {
3224 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3225 "to feature incompatibilities");
3226 goto failed_mount;
3227 }
3228 }
3229
2705 /* 3230 /*
2706 * Check feature flags regardless of the revision level, since we 3231 * Check feature flags regardless of the revision level, since we
2707 * previously didn't change the revision level when setting the flags, 3232 * previously didn't change the revision level when setting the flags,
@@ -2831,15 +3356,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2831 * Test whether we have more sectors than will fit in sector_t, 3356 * Test whether we have more sectors than will fit in sector_t,
2832 * and whether the max offset is addressable by the page cache. 3357 * and whether the max offset is addressable by the page cache.
2833 */ 3358 */
2834 if ((ext4_blocks_count(es) > 3359 err = generic_check_addressable(sb->s_blocksize_bits,
2835 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || 3360 ext4_blocks_count(es));
2836 (ext4_blocks_count(es) > 3361 if (err) {
2837 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2838 ext4_msg(sb, KERN_ERR, "filesystem" 3362 ext4_msg(sb, KERN_ERR, "filesystem"
2839 " too large to mount safely on this system"); 3363 " too large to mount safely on this system");
2840 if (sizeof(sector_t) < 8) 3364 if (sizeof(sector_t) < 8)
2841 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3365 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2842 ret = -EFBIG; 3366 ret = err;
2843 goto failed_mount; 3367 goto failed_mount;
2844 } 3368 }
2845 3369
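
The open-coded size test above is replaced by generic_check_addressable, which folds together two limits: the last block must be reachable as a 512-byte sector through sector_t, and as an index through the page cache. A user-space sketch of that arithmetic under an assumed 32-bit worst case (this models the check, it is not the kernel implementation):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint32_t sector32_t;   /* sector_t without CONFIG_LBDAF */
    typedef uint32_t pgoff32_t;    /* 32-bit page cache index */
    #define PAGE_SHIFT 12

    static int check_addressable(unsigned blocksize_bits, uint64_t num_blocks)
    {
        uint64_t last_block = num_blocks - 1;
        uint64_t last_page  = last_block >> (PAGE_SHIFT - blocksize_bits);

        if (num_blocks == 0)
            return 0;
        if (last_block > ((uint64_t)(sector32_t)~0u >> (blocksize_bits - 9)))
            return -1;             /* -EFBIG: sector_t would overflow */
        if (last_page > (pgoff32_t)~0u)
            return -1;             /* -EFBIG: page index would overflow */
        return 0;
    }

    int main(void)
    {
        /* 4 KiB blocks: 2^31 blocks = 8 TiB, too big for 32-bit sector_t */
        printf("%d\n", check_addressable(12, 1ULL << 31));
        printf("%d\n", check_addressable(12, 1ULL << 20));  /* 4 GiB: fine */
        return 0;
    }
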
@@ -2908,7 +3432,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2908 goto failed_mount2; 3432 goto failed_mount2;
2909 } 3433 }
2910 } 3434 }
2911 if (!ext4_check_descriptors(sb)) { 3435 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
2912 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3436 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2913 goto failed_mount2; 3437 goto failed_mount2;
2914 } 3438 }
@@ -2924,6 +3448,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2924 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3448 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2925 spin_lock_init(&sbi->s_next_gen_lock); 3449 spin_lock_init(&sbi->s_next_gen_lock);
2926 3450
3451 init_timer(&sbi->s_err_report);
3452 sbi->s_err_report.function = print_daily_error_info;
3453 sbi->s_err_report.data = (unsigned long) sb;
3454
3455 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3456 ext4_count_free_blocks(sb));
3457 if (!err) {
3458 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3459 ext4_count_free_inodes(sb));
3460 }
3461 if (!err) {
3462 err = percpu_counter_init(&sbi->s_dirs_counter,
3463 ext4_count_dirs(sb));
3464 }
3465 if (!err) {
3466 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3467 }
3468 if (err) {
3469 ext4_msg(sb, KERN_ERR, "insufficient memory");
3470 goto failed_mount3;
3471 }
3472
2927 sbi->s_stripe = ext4_get_stripe_size(sbi); 3473 sbi->s_stripe = ext4_get_stripe_size(sbi);
2928 sbi->s_max_writeback_mb_bump = 128; 3474 sbi->s_max_writeback_mb_bump = 128;
2929 3475
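
The counter setup moved here uses the chained if (!err) idiom: each init runs only if all previous ones succeeded, and one test at the end routes every failure to a single cleanup label. A tiny sketch of the same control flow with stand-in init functions:

    #include <stdio.h>

    static int init_a(void) { return 0; }
    static int init_b(void) { return 0; }
    static int init_c(void) { return -1; }  /* pretend this one fails */

    int main(void)
    {
        int err;

        err = init_a();
        if (!err)
            err = init_b();
        if (!err)
            err = init_c();
        if (err) {
            fprintf(stderr, "insufficient memory\n");
            goto fail;                      /* like goto failed_mount3 */
        }
        return 0;
    fail:
        /* unwind whatever init_a()/init_b() set up */
        return 1;
    }
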
@@ -2941,6 +3487,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2941 sb->s_qcop = &ext4_qctl_operations; 3487 sb->s_qcop = &ext4_qctl_operations;
2942 sb->dq_op = &ext4_quota_operations; 3488 sb->dq_op = &ext4_quota_operations;
2943#endif 3489#endif
3490 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3491
2944 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3492 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2945 mutex_init(&sbi->s_orphan_lock); 3493 mutex_init(&sbi->s_orphan_lock);
2946 mutex_init(&sbi->s_resize_lock); 3494 mutex_init(&sbi->s_resize_lock);
@@ -2951,6 +3499,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2951 EXT4_HAS_INCOMPAT_FEATURE(sb, 3499 EXT4_HAS_INCOMPAT_FEATURE(sb,
2952 EXT4_FEATURE_INCOMPAT_RECOVER)); 3500 EXT4_FEATURE_INCOMPAT_RECOVER));
2953 3501
3502 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3503 !(sb->s_flags & MS_RDONLY))
3504 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3505 goto failed_mount3;
3506
2954 /* 3507 /*
2955 * The first inode we look at is the journal inode. Don't try 3508 * The first inode we look at is the journal inode. Don't try
2956 * root first: it may be modified in the journal! 3509 * root first: it may be modified in the journal!
@@ -2965,8 +3518,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2965 "suppressed and not mounted read-only"); 3518 "suppressed and not mounted read-only");
2966 goto failed_mount_wq; 3519 goto failed_mount_wq;
2967 } else { 3520 } else {
2968 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3521 clear_opt(sb, DATA_FLAGS);
2969 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2970 sbi->s_journal = NULL; 3522 sbi->s_journal = NULL;
2971 needs_recovery = 0; 3523 needs_recovery = 0;
2972 goto no_journal; 3524 goto no_journal;
@@ -3004,9 +3556,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3004 */ 3556 */
3005 if (jbd2_journal_check_available_features 3557 if (jbd2_journal_check_available_features
3006 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3558 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3007 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3559 set_opt(sb, ORDERED_DATA);
3008 else 3560 else
3009 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3561 set_opt(sb, JOURNAL_DATA);
3010 break; 3562 break;
3011 3563
3012 case EXT4_MOUNT_ORDERED_DATA: 3564 case EXT4_MOUNT_ORDERED_DATA:
@@ -3022,23 +3574,25 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3022 } 3574 }
3023 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3575 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3024 3576
3025no_journal: 3577 /*
3026 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3578 * The journal may have updated the bg summary counts, so we
3027 ext4_count_free_blocks(sb)); 3579 * need to update the global counters.
3028 if (!err) 3580 */
3029 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3581 percpu_counter_set(&sbi->s_freeblocks_counter,
3030 ext4_count_free_inodes(sb)); 3582 ext4_count_free_blocks(sb));
3031 if (!err) 3583 percpu_counter_set(&sbi->s_freeinodes_counter,
3032 err = percpu_counter_init(&sbi->s_dirs_counter, 3584 ext4_count_free_inodes(sb));
3033 ext4_count_dirs(sb)); 3585 percpu_counter_set(&sbi->s_dirs_counter,
3034 if (!err) 3586 ext4_count_dirs(sb));
3035 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3587 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3036 if (err) {
3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
3038 goto failed_mount_wq;
3039 }
3040 3588
3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3589no_journal:
3590 /*
3591 * The maximum number of concurrent works can be high and
3592 * concurrency isn't really necessary. Limit it to 1.
3593 */
3594 EXT4_SB(sb)->dio_unwritten_wq =
3595 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3042 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3596 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3597 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3044 goto failed_mount_wq; 3598 goto failed_mount_wq;
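
The switch to alloc_workqueue with max_active limited to 1 means conversion jobs are processed one at a time, in order. A pthread analogue of that single-worker queue, purely illustrative:

    #include <pthread.h>
    #include <stdio.h>

    #define NJOBS 4
    static int queue[NJOBS], head, tail;
    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

    static void *worker(void *arg)
    {
        for (int done = 0; done < NJOBS; done++) {
            pthread_mutex_lock(&mtx);
            while (head == tail)
                pthread_cond_wait(&cond, &mtx);
            int job = queue[head++ % NJOBS];
            pthread_mutex_unlock(&mtx);
            /* one job at a time: the max_active = 1 guarantee */
            printf("convert unwritten extent for job %d\n", job);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        for (int i = 0; i < NJOBS; i++) {
            pthread_mutex_lock(&mtx);
            queue[tail++ % NJOBS] = i;      /* queue_work() analogue */
            pthread_cond_signal(&cond);
            pthread_mutex_unlock(&mtx);
        }
        pthread_join(t, NULL);
        return 0;
    }
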
@@ -3053,17 +3607,16 @@ no_journal:
3053 if (IS_ERR(root)) { 3607 if (IS_ERR(root)) {
3054 ext4_msg(sb, KERN_ERR, "get root inode failed"); 3608 ext4_msg(sb, KERN_ERR, "get root inode failed");
3055 ret = PTR_ERR(root); 3609 ret = PTR_ERR(root);
3610 root = NULL;
3056 goto failed_mount4; 3611 goto failed_mount4;
3057 } 3612 }
3058 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3613 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3059 iput(root);
3060 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3614 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3061 goto failed_mount4; 3615 goto failed_mount4;
3062 } 3616 }
3063 sb->s_root = d_alloc_root(root); 3617 sb->s_root = d_alloc_root(root);
3064 if (!sb->s_root) { 3618 if (!sb->s_root) {
3065 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3619 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3066 iput(root);
3067 ret = -ENOMEM; 3620 ret = -ENOMEM;
3068 goto failed_mount4; 3621 goto failed_mount4;
3069 } 3622 }
@@ -3099,18 +3652,18 @@ no_journal:
3099 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3652 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3100 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3653 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3101 "requested data journaling mode"); 3654 "requested data journaling mode");
3102 clear_opt(sbi->s_mount_opt, DELALLOC); 3655 clear_opt(sb, DELALLOC);
3103 } 3656 }
3104 if (test_opt(sb, DIOREAD_NOLOCK)) { 3657 if (test_opt(sb, DIOREAD_NOLOCK)) {
3105 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3658 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3106 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3659 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3107 "option - requested data journaling mode"); 3660 "option - requested data journaling mode");
3108 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3661 clear_opt(sb, DIOREAD_NOLOCK);
3109 } 3662 }
3110 if (sb->s_blocksize < PAGE_SIZE) { 3663 if (sb->s_blocksize < PAGE_SIZE) {
3111 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3664 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3112 "option - block size is too small"); 3665 "option - block size is too small");
3113 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3666 clear_opt(sb, DIOREAD_NOLOCK);
3114 } 3667 }
3115 } 3668 }
3116 3669
@@ -3129,6 +3682,10 @@ no_journal:
3129 goto failed_mount4; 3682 goto failed_mount4;
3130 } 3683 }
3131 3684
3685 err = ext4_register_li_request(sb, first_not_zeroed);
3686 if (err)
3687 goto failed_mount4;
3688
3132 sbi->s_kobj.kset = ext4_kset; 3689 sbi->s_kobj.kset = ext4_kset;
3133 init_completion(&sbi->s_kobj_unregister); 3690 init_completion(&sbi->s_kobj_unregister);
3134 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3691 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3160,13 +3717,9 @@ no_journal:
3160 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 3717 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3161 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 3718 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3162 3719
3163 init_timer(&sbi->s_err_report);
3164 sbi->s_err_report.function = print_daily_error_info;
3165 sbi->s_err_report.data = (unsigned long) sb;
3166 if (es->s_error_count) 3720 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3721 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3168 3722
3169 lock_kernel();
3170 kfree(orig_data); 3723 kfree(orig_data);
3171 return 0; 3724 return 0;
3172 3725
@@ -3176,6 +3729,8 @@ cantfind_ext4:
3176 goto failed_mount; 3729 goto failed_mount;
3177 3730
3178failed_mount4: 3731failed_mount4:
3732 iput(root);
3733 sb->s_root = NULL;
3179 ext4_msg(sb, KERN_ERR, "mount failed"); 3734 ext4_msg(sb, KERN_ERR, "mount failed");
3180 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3735 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3181failed_mount_wq: 3736failed_mount_wq:
@@ -3184,17 +3739,20 @@ failed_mount_wq:
3184 jbd2_journal_destroy(sbi->s_journal); 3739 jbd2_journal_destroy(sbi->s_journal);
3185 sbi->s_journal = NULL; 3740 sbi->s_journal = NULL;
3186 } 3741 }
3187 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3188 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3189 percpu_counter_destroy(&sbi->s_dirs_counter);
3190 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3191failed_mount3: 3742failed_mount3:
3743 del_timer(&sbi->s_err_report);
3192 if (sbi->s_flex_groups) { 3744 if (sbi->s_flex_groups) {
3193 if (is_vmalloc_addr(sbi->s_flex_groups)) 3745 if (is_vmalloc_addr(sbi->s_flex_groups))
3194 vfree(sbi->s_flex_groups); 3746 vfree(sbi->s_flex_groups);
3195 else 3747 else
3196 kfree(sbi->s_flex_groups); 3748 kfree(sbi->s_flex_groups);
3197 } 3749 }
3750 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3751 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3752 percpu_counter_destroy(&sbi->s_dirs_counter);
3753 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3754 if (sbi->s_mmp_tsk)
3755 kthread_stop(sbi->s_mmp_tsk);
3198failed_mount2: 3756failed_mount2:
3199 for (i = 0; i < db_count; i++) 3757 for (i = 0; i < db_count; i++)
3200 brelse(sbi->s_group_desc[i]); 3758 brelse(sbi->s_group_desc[i]);
@@ -3213,7 +3771,6 @@ out_fail:
3213 sb->s_fs_info = NULL; 3771 sb->s_fs_info = NULL;
3214 kfree(sbi->s_blockgroup_lock); 3772 kfree(sbi->s_blockgroup_lock);
3215 kfree(sbi); 3773 kfree(sbi);
3216 lock_kernel();
3217out_free_orig: 3774out_free_orig:
3218 kfree(orig_data); 3775 kfree(orig_data);
3219 return ret; 3776 return ret;
@@ -3306,13 +3863,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3306 if (bdev == NULL) 3863 if (bdev == NULL)
3307 return NULL; 3864 return NULL;
3308 3865
3309 if (bd_claim(bdev, sb)) {
3310 ext4_msg(sb, KERN_ERR,
3311 "failed to claim external journal device");
3312 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3313 return NULL;
3314 }
3315
3316 blocksize = sb->s_blocksize; 3866 blocksize = sb->s_blocksize;
3317 hblock = bdev_logical_block_size(bdev); 3867 hblock = bdev_logical_block_size(bdev);
3318 if (blocksize < hblock) { 3868 if (blocksize < hblock) {
@@ -3470,7 +4020,7 @@ static int ext4_load_journal(struct super_block *sb,
3470 EXT4_SB(sb)->s_journal = journal; 4020 EXT4_SB(sb)->s_journal = journal;
3471 ext4_clear_journal_err(sb, es); 4021 ext4_clear_journal_err(sb, es);
3472 4022
3473 if (journal_devnum && 4023 if (!really_read_only && journal_devnum &&
3474 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 4024 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3475 es->s_journal_dev = cpu_to_le32(journal_devnum); 4025 es->s_journal_dev = cpu_to_le32(journal_devnum);
3476 4026
@@ -3524,9 +4074,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3524 es->s_kbytes_written = 4074 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4075 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 4076 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3527 &EXT4_SB(sb)->s_freeblocks_counter)); 4077 &EXT4_SB(sb)->s_freeblocks_counter));
3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 4078 es->s_free_inodes_count =
3529 &EXT4_SB(sb)->s_freeinodes_counter)); 4079 cpu_to_le32(percpu_counter_sum_positive(
4080 &EXT4_SB(sb)->s_freeinodes_counter));
3530 sb->s_dirt = 0; 4081 sb->s_dirt = 0;
3531 BUFFER_TRACE(sbh, "marking dirty"); 4082 BUFFER_TRACE(sbh, "marking dirty");
3532 mark_buffer_dirty(sbh); 4083 mark_buffer_dirty(sbh);
@@ -3658,6 +4209,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3658/* 4209/*
3659 * LVM calls this function before a (read-only) snapshot is created. This 4210 * LVM calls this function before a (read-only) snapshot is created. This
3660 * gives us a chance to flush the journal completely and mark the fs clean. 4211 * gives us a chance to flush the journal completely and mark the fs clean.
4212 *
4213 * Note that this function alone cannot bring a filesystem into a clean
4214 * state, because ext4 relies on @sb->s_frozen, which lives in an upper
4215 * layer, to prevent new handles from being started. It thus needs help
4216 * from that upper layer.
3661 */ 4217 */
3662static int ext4_freeze(struct super_block *sb) 4218static int ext4_freeze(struct super_block *sb)
3663{ 4219{
@@ -3706,6 +4262,22 @@ static int ext4_unfreeze(struct super_block *sb)
3706 return 0; 4262 return 0;
3707} 4263}
3708 4264
4265/*
4266 * Structure to save mount options for ext4_remount's benefit
4267 */
4268struct ext4_mount_options {
4269 unsigned long s_mount_opt;
4270 unsigned long s_mount_opt2;
4271 uid_t s_resuid;
4272 gid_t s_resgid;
4273 unsigned long s_commit_interval;
4274 u32 s_min_batch_time, s_max_batch_time;
4275#ifdef CONFIG_QUOTA
4276 int s_jquota_fmt;
4277 char *s_qf_names[MAXQUOTAS];
4278#endif
4279};
4280
3709static int ext4_remount(struct super_block *sb, int *flags, char *data) 4281static int ext4_remount(struct super_block *sb, int *flags, char *data)
3710{ 4282{
3711 struct ext4_super_block *es; 4283 struct ext4_super_block *es;
@@ -3716,18 +4288,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3716 int enable_quota = 0; 4288 int enable_quota = 0;
3717 ext4_group_t g; 4289 ext4_group_t g;
3718 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4290 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3719 int err; 4291 int err = 0;
3720#ifdef CONFIG_QUOTA 4292#ifdef CONFIG_QUOTA
3721 int i; 4293 int i;
3722#endif 4294#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL); 4295 char *orig_data = kstrdup(data, GFP_KERNEL);
3724 4296
3725 lock_kernel();
3726
3727 /* Store the original options */ 4297 /* Store the original options */
3728 lock_super(sb); 4298 lock_super(sb);
3729 old_sb_flags = sb->s_flags; 4299 old_sb_flags = sb->s_flags;
3730 old_opts.s_mount_opt = sbi->s_mount_opt; 4300 old_opts.s_mount_opt = sbi->s_mount_opt;
4301 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
3731 old_opts.s_resuid = sbi->s_resuid; 4302 old_opts.s_resuid = sbi->s_resuid;
3732 old_opts.s_resgid = sbi->s_resgid; 4303 old_opts.s_resgid = sbi->s_resgid;
3733 old_opts.s_commit_interval = sbi->s_commit_interval; 4304 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -3843,9 +4414,29 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3843 goto restore_opts; 4414 goto restore_opts;
3844 if (!ext4_setup_super(sb, es, 0)) 4415 if (!ext4_setup_super(sb, es, 0))
3845 sb->s_flags &= ~MS_RDONLY; 4416 sb->s_flags &= ~MS_RDONLY;
4417 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4418 EXT4_FEATURE_INCOMPAT_MMP))
4419 if (ext4_multi_mount_protect(sb,
4420 le64_to_cpu(es->s_mmp_block))) {
4421 err = -EROFS;
4422 goto restore_opts;
4423 }
3846 enable_quota = 1; 4424 enable_quota = 1;
3847 } 4425 }
3848 } 4426 }
4427
4428 /*
4429 * Reinitialize lazy itable initialization thread based on
4430 * current settings
4431 */
4432 if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4433 ext4_unregister_li_request(sb);
4434 else {
4435 ext4_group_t first_not_zeroed;
4436 first_not_zeroed = ext4_has_uninit_itable(sb);
4437 ext4_register_li_request(sb, first_not_zeroed);
4438 }
4439
3849 ext4_setup_system_zone(sb); 4440 ext4_setup_system_zone(sb);
3850 if (sbi->s_journal == NULL) 4441 if (sbi->s_journal == NULL)
3851 ext4_commit_super(sb, 1); 4442 ext4_commit_super(sb, 1);
@@ -3858,7 +4449,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3858 kfree(old_opts.s_qf_names[i]); 4449 kfree(old_opts.s_qf_names[i]);
3859#endif 4450#endif
3860 unlock_super(sb); 4451 unlock_super(sb);
3861 unlock_kernel();
3862 if (enable_quota) 4452 if (enable_quota)
3863 dquot_resume(sb, -1); 4453 dquot_resume(sb, -1);
3864 4454
@@ -3869,6 +4459,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3869restore_opts: 4459restore_opts:
3870 sb->s_flags = old_sb_flags; 4460 sb->s_flags = old_sb_flags;
3871 sbi->s_mount_opt = old_opts.s_mount_opt; 4461 sbi->s_mount_opt = old_opts.s_mount_opt;
4462 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
3872 sbi->s_resuid = old_opts.s_resuid; 4463 sbi->s_resuid = old_opts.s_resuid;
3873 sbi->s_resgid = old_opts.s_resgid; 4464 sbi->s_resgid = old_opts.s_resgid;
3874 sbi->s_commit_interval = old_opts.s_commit_interval; 4465 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -3884,7 +4475,6 @@ restore_opts:
3884 } 4475 }
3885#endif 4476#endif
3886 unlock_super(sb); 4477 unlock_super(sb);
3887 unlock_kernel();
3888 kfree(orig_data); 4478 kfree(orig_data);
3889 return err; 4479 return err;
3890} 4480}
@@ -3895,6 +4485,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3895 struct ext4_sb_info *sbi = EXT4_SB(sb); 4485 struct ext4_sb_info *sbi = EXT4_SB(sb);
3896 struct ext4_super_block *es = sbi->s_es; 4486 struct ext4_super_block *es = sbi->s_es;
3897 u64 fsid; 4487 u64 fsid;
4488 s64 bfree;
3898 4489
3899 if (test_opt(sb, MINIX_DF)) { 4490 if (test_opt(sb, MINIX_DF)) {
3900 sbi->s_overhead_last = 0; 4491 sbi->s_overhead_last = 0;
@@ -3938,8 +4529,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3938 buf->f_type = EXT4_SUPER_MAGIC; 4529 buf->f_type = EXT4_SUPER_MAGIC;
3939 buf->f_bsize = sb->s_blocksize; 4530 buf->f_bsize = sb->s_blocksize;
3940 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4531 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3941 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4532 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3942 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4533 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4534 /* prevent underflow in case little free space is available */
4535 buf->f_bfree = max_t(s64, bfree, 0);
3943 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4536 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3944 if (buf->f_bfree < ext4_r_blocks_count(es)) 4537 if (buf->f_bfree < ext4_r_blocks_count(es))
3945 buf->f_bavail = 0; 4538 buf->f_bavail = 0;
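
The clamp matters because the free and dirty counters are sampled separately, so their difference can transiently go negative. A trivial sketch of the guard:

    #include <stdio.h>
    #include <stdint.h>

    static int64_t max_s64(int64_t a, int64_t b) { return a > b ? a : b; }

    int main(void)
    {
        int64_t freeblocks = 100;    /* sampled first */
        int64_t dirtyblocks = 130;   /* may have grown meanwhile */
        int64_t bfree = max_s64(freeblocks - dirtyblocks, 0);

        printf("f_bfree = %lld\n", (long long)bfree);  /* 0, not -30 */
        return 0;
    }
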
@@ -4066,27 +4659,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4066 * Standard function to be called on quota_on 4659 * Standard function to be called on quota_on
4067 */ 4660 */
4068static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4661static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4069 char *name) 4662 struct path *path)
4070{ 4663{
4071 int err; 4664 int err;
4072 struct path path;
4073 4665
4074 if (!test_opt(sb, QUOTA)) 4666 if (!test_opt(sb, QUOTA))
4075 return -EINVAL; 4667 return -EINVAL;
4076 4668
4077 err = kern_path(name, LOOKUP_FOLLOW, &path);
4078 if (err)
4079 return err;
4080
4081 /* Quotafile not on the same filesystem? */ 4669 /* Quotafile not on the same filesystem? */
4082 if (path.mnt->mnt_sb != sb) { 4670 if (path->mnt->mnt_sb != sb)
4083 path_put(&path);
4084 return -EXDEV; 4671 return -EXDEV;
4085 }
4086 /* Journaling quota? */ 4672 /* Journaling quota? */
4087 if (EXT4_SB(sb)->s_qf_names[type]) { 4673 if (EXT4_SB(sb)->s_qf_names[type]) {
4088 /* Quotafile not in fs root? */ 4674 /* Quotafile not in fs root? */
4089 if (path.dentry->d_parent != sb->s_root) 4675 if (path->dentry->d_parent != sb->s_root)
4090 ext4_msg(sb, KERN_WARNING, 4676 ext4_msg(sb, KERN_WARNING,
4091 "Quota file not on filesystem root. " 4677 "Quota file not on filesystem root. "
4092 "Journaled quota will not work"); 4678 "Journaled quota will not work");
@@ -4097,7 +4683,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4097 * all updates to the file when we bypass pagecache... 4683 * all updates to the file when we bypass pagecache...
4098 */ 4684 */
4099 if (EXT4_SB(sb)->s_journal && 4685 if (EXT4_SB(sb)->s_journal &&
4100 ext4_should_journal_data(path.dentry->d_inode)) { 4686 ext4_should_journal_data(path->dentry->d_inode)) {
4101 /* 4687 /*
4102 * We don't need to lock updates but journal_flush() could 4688 * We don't need to lock updates but journal_flush() could
4103 * otherwise be livelocked... 4689 * otherwise be livelocked...
@@ -4105,32 +4691,42 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4105 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4691 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4106 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4692 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4107 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4693 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4108 if (err) { 4694 if (err)
4109 path_put(&path);
4110 return err; 4695 return err;
4111 }
4112 } 4696 }
4113 4697
4114 err = dquot_quota_on_path(sb, type, format_id, &path); 4698 return dquot_quota_on(sb, type, format_id, path);
4115 path_put(&path);
4116 return err;
4117} 4699}
4118 4700
4119static int ext4_quota_off(struct super_block *sb, int type) 4701static int ext4_quota_off(struct super_block *sb, int type)
4120{ 4702{
4121 /* Force all delayed allocation blocks to be allocated */ 4703 struct inode *inode = sb_dqopt(sb)->files[type];
4122 if (test_opt(sb, DELALLOC)) { 4704 handle_t *handle;
4123 down_read(&sb->s_umount); 4705
4706 /* Force all delayed allocation blocks to be allocated.
4707 * Caller already holds s_umount sem */
4708 if (test_opt(sb, DELALLOC))
4124 sync_filesystem(sb); 4709 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127 4710
4711 if (!inode)
4712 goto out;
4713
4714 /* Update modification times of quota files when userspace can
4715 * start looking at them */
4716 handle = ext4_journal_start(inode, 1);
4717 if (IS_ERR(handle))
4718 goto out;
4719 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4720 ext4_mark_inode_dirty(handle, inode);
4721 ext4_journal_stop(handle);
4722
4723out:
4128 return dquot_quota_off(sb, type); 4724 return dquot_quota_off(sb, type);
4129} 4725}
4130 4726
4131/* Read data from quotafile - avoid pagecache and such because we cannot afford 4727/* Read data from quotafile - avoid pagecache and such because we cannot afford
4132 * acquiring the locks... As quota files are never truncated and quota code 4728 * acquiring the locks... As quota files are never truncated and quota code
4133 * itself serializes the operations (and noone else should touch the files) 4729 * itself serializes the operations (and no one else should touch the files)
4134 * we don't have to be afraid of races */ 4730 * we don't have to be afraid of races */
4135static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 4731static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
4136 size_t len, loff_t off) 4732 size_t len, loff_t off)
@@ -4220,30 +4816,21 @@ out:
4220 if (inode->i_size < off + len) { 4816 if (inode->i_size < off + len) {
4221 i_size_write(inode, off + len); 4817 i_size_write(inode, off + len);
4222 EXT4_I(inode)->i_disksize = inode->i_size; 4818 EXT4_I(inode)->i_disksize = inode->i_size;
4819 ext4_mark_inode_dirty(handle, inode);
4223 } 4820 }
4224 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4225 ext4_mark_inode_dirty(handle, inode);
4226 mutex_unlock(&inode->i_mutex); 4821 mutex_unlock(&inode->i_mutex);
4227 return len; 4822 return len;
4228} 4823}
4229 4824
4230#endif 4825#endif
4231 4826
4232static int ext4_get_sb(struct file_system_type *fs_type, int flags, 4827static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4233 const char *dev_name, void *data, struct vfsmount *mnt) 4828 const char *dev_name, void *data)
4234{ 4829{
4235 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4830 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4236} 4831}
4237 4832
4238#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4833#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4239static struct file_system_type ext2_fs_type = {
4240 .owner = THIS_MODULE,
4241 .name = "ext2",
4242 .get_sb = ext4_get_sb,
4243 .kill_sb = kill_block_super,
4244 .fs_flags = FS_REQUIRES_DEV,
4245};
4246
4247static inline void register_as_ext2(void) 4834static inline void register_as_ext2(void)
4248{ 4835{
4249 int err = register_filesystem(&ext2_fs_type); 4836 int err = register_filesystem(&ext2_fs_type);
@@ -4256,10 +4843,22 @@ static inline void unregister_as_ext2(void)
4256{ 4843{
4257 unregister_filesystem(&ext2_fs_type); 4844 unregister_filesystem(&ext2_fs_type);
4258} 4845}
4846
4847static inline int ext2_feature_set_ok(struct super_block *sb)
4848{
4849 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4850 return 0;
4851 if (sb->s_flags & MS_RDONLY)
4852 return 1;
4853 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4854 return 0;
4855 return 1;
4856}
4259MODULE_ALIAS("ext2"); 4857MODULE_ALIAS("ext2");
4260#else 4858#else
4261static inline void register_as_ext2(void) { } 4859static inline void register_as_ext2(void) { }
4262static inline void unregister_as_ext2(void) { } 4860static inline void unregister_as_ext2(void) { }
4861static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4263#endif 4862#endif
4264 4863
4265#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4864#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4275,79 +4874,155 @@ static inline void unregister_as_ext3(void)
4275{ 4874{
4276 unregister_filesystem(&ext3_fs_type); 4875 unregister_filesystem(&ext3_fs_type);
4277} 4876}
4877
4878static inline int ext3_feature_set_ok(struct super_block *sb)
4879{
4880 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4881 return 0;
4882 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4883 return 0;
4884 if (sb->s_flags & MS_RDONLY)
4885 return 1;
4886 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4887 return 0;
4888 return 1;
4889}
4278MODULE_ALIAS("ext3"); 4890MODULE_ALIAS("ext3");
4279#else 4891#else
4280static inline void register_as_ext3(void) { } 4892static inline void register_as_ext3(void) { }
4281static inline void unregister_as_ext3(void) { } 4893static inline void unregister_as_ext3(void) { }
4894static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4282#endif 4895#endif
4283 4896
4284static struct file_system_type ext4_fs_type = { 4897static struct file_system_type ext4_fs_type = {
4285 .owner = THIS_MODULE, 4898 .owner = THIS_MODULE,
4286 .name = "ext4", 4899 .name = "ext4",
4287 .get_sb = ext4_get_sb, 4900 .mount = ext4_mount,
4288 .kill_sb = kill_block_super, 4901 .kill_sb = kill_block_super,
4289 .fs_flags = FS_REQUIRES_DEV, 4902 .fs_flags = FS_REQUIRES_DEV,
4290}; 4903};
4291 4904
4292static int __init init_ext4_fs(void) 4905static int __init ext4_init_feat_adverts(void)
4293{ 4906{
4294 int err; 4907 struct ext4_features *ef;
4908 int ret = -ENOMEM;
4909
4910 ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4911 if (!ef)
4912 goto out;
4913
4914 ef->f_kobj.kset = ext4_kset;
4915 init_completion(&ef->f_kobj_unregister);
4916 ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4917 "features");
4918 if (ret) {
4919 kfree(ef);
4920 goto out;
4921 }
4922
4923 ext4_feat = ef;
4924 ret = 0;
4925out:
4926 return ret;
4927}
4928
4929static void ext4_exit_feat_adverts(void)
4930{
4931 kobject_put(&ext4_feat->f_kobj);
4932 wait_for_completion(&ext4_feat->f_kobj_unregister);
4933 kfree(ext4_feat);
4934}
4935
4936/* Shared across all ext4 file systems */
4937wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4938struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4939
4940static int __init ext4_init_fs(void)
4941{
4942 int i, err;
4295 4943
4296 ext4_check_flag_values(); 4944 ext4_check_flag_values();
4297 err = init_ext4_system_zone(); 4945
4946 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4947 mutex_init(&ext4__aio_mutex[i]);
4948 init_waitqueue_head(&ext4__ioend_wq[i]);
4949 }
4950
4951 err = ext4_init_pageio();
4298 if (err) 4952 if (err)
4299 return err; 4953 return err;
4954 err = ext4_init_system_zone();
4955 if (err)
4956 goto out7;
4300 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4957 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4301 if (!ext4_kset) 4958 if (!ext4_kset)
4302 goto out4; 4959 goto out6;
4303 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4960 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4304 err = init_ext4_mballoc(); 4961 if (!ext4_proc_root)
4962 goto out5;
4963
4964 err = ext4_init_feat_adverts();
4965 if (err)
4966 goto out4;
4967
4968 err = ext4_init_mballoc();
4305 if (err) 4969 if (err)
4306 goto out3; 4970 goto out3;
4307 4971
4308 err = init_ext4_xattr(); 4972 err = ext4_init_xattr();
4309 if (err) 4973 if (err)
4310 goto out2; 4974 goto out2;
4311 err = init_inodecache(); 4975 err = init_inodecache();
4312 if (err) 4976 if (err)
4313 goto out1; 4977 goto out1;
4314 register_as_ext2();
4315 register_as_ext3(); 4978 register_as_ext3();
4979 register_as_ext2();
4316 err = register_filesystem(&ext4_fs_type); 4980 err = register_filesystem(&ext4_fs_type);
4317 if (err) 4981 if (err)
4318 goto out; 4982 goto out;
4983
4984 ext4_li_info = NULL;
4985 mutex_init(&ext4_li_mtx);
4319 return 0; 4986 return 0;
4320out: 4987out:
4321 unregister_as_ext2(); 4988 unregister_as_ext2();
4322 unregister_as_ext3(); 4989 unregister_as_ext3();
4323 destroy_inodecache(); 4990 destroy_inodecache();
4324out1: 4991out1:
4325 exit_ext4_xattr(); 4992 ext4_exit_xattr();
4326out2: 4993out2:
4327 exit_ext4_mballoc(); 4994 ext4_exit_mballoc();
4328out3: 4995out3:
4996 ext4_exit_feat_adverts();
4997out4:
4329 remove_proc_entry("fs/ext4", NULL); 4998 remove_proc_entry("fs/ext4", NULL);
4999out5:
4330 kset_unregister(ext4_kset); 5000 kset_unregister(ext4_kset);
4331out4: 5001out6:
4332 exit_ext4_system_zone(); 5002 ext4_exit_system_zone();
5003out7:
5004 ext4_exit_pageio();
4333 return err; 5005 return err;
4334} 5006}
4335 5007
4336static void __exit exit_ext4_fs(void) 5008static void __exit ext4_exit_fs(void)
4337{ 5009{
5010 ext4_destroy_lazyinit_thread();
4338 unregister_as_ext2(); 5011 unregister_as_ext2();
4339 unregister_as_ext3(); 5012 unregister_as_ext3();
4340 unregister_filesystem(&ext4_fs_type); 5013 unregister_filesystem(&ext4_fs_type);
4341 destroy_inodecache(); 5014 destroy_inodecache();
4342 exit_ext4_xattr(); 5015 ext4_exit_xattr();
4343 exit_ext4_mballoc(); 5016 ext4_exit_mballoc();
5017 ext4_exit_feat_adverts();
4344 remove_proc_entry("fs/ext4", NULL); 5018 remove_proc_entry("fs/ext4", NULL);
4345 kset_unregister(ext4_kset); 5019 kset_unregister(ext4_kset);
4346 exit_ext4_system_zone(); 5020 ext4_exit_system_zone();
5021 ext4_exit_pageio();
4347} 5022}
4348 5023
4349MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 5024MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
4350MODULE_DESCRIPTION("Fourth Extended Filesystem"); 5025MODULE_DESCRIPTION("Fourth Extended Filesystem");
4351MODULE_LICENSE("GPL"); 5026MODULE_LICENSE("GPL");
4352module_init(init_ext4_fs) 5027module_init(ext4_init_fs)
4353module_exit(exit_ext4_fs) 5028module_exit(ext4_exit_fs)
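
ext4_init_fs now unwinds through a ladder of labels (out7 down to out): a failure at step N jumps to the label that tears down steps N-1 and below, in reverse order of setup. A compact sketch of the idiom with stand-in steps:

    #include <stdio.h>

    static int step(int n) { return n == 3 ? -1 : 0; }  /* step 3 fails */
    static void undo(int n) { printf("undo step %d\n", n); }

    int main(void)
    {
        int err;

        if ((err = step(1))) goto out0;
        if ((err = step(2))) goto out1;
        if ((err = step(3))) goto out2;
        if ((err = step(4))) goto out3;
        return 0;
    out3: undo(3);
    out2: undo(2);
    out1: undo(1);
    out0:
        return err;
    }
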
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
448 448
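
The rewritten ext4_xattr_list lists in-inode attributes first, advances the caller's buffer by that length, lists block attributes into the remainder, and returns the sum (or the first error). A self-contained sketch of that two-stage listing, with illustrative helpers in place of the real ibody/block routines:

    #include <stdio.h>
    #include <string.h>

    static int list_ibody(char *buf, size_t size)
    {
        static const char dat[] = "user.a\0";   /* name plus its NUL: 7 bytes */
        if (buf) {
            if (size < sizeof(dat) - 1)
                return -34;                     /* -ERANGE */
            memcpy(buf, dat, sizeof(dat) - 1);
        }
        return (int)(sizeof(dat) - 1);
    }

    static int list_block(char *buf, size_t size)
    {
        static const char dat[] = "user.b\0";
        if (buf) {
            if (size < sizeof(dat) - 1)
                return -34;
            memcpy(buf, dat, sizeof(dat) - 1);
        }
        return (int)(sizeof(dat) - 1);
    }

    static int xattr_list(char *buffer, size_t buffer_size)
    {
        int ret, ret2;

        ret = ret2 = list_ibody(buffer, buffer_size);
        if (ret < 0)
            return ret;
        if (buffer) {            /* advance past the in-inode names */
            buffer += ret;
            buffer_size -= ret;
        }
        ret = list_block(buffer, buffer_size);
        if (ret < 0)
            return ret;
        return ret + ret2;       /* combined length of both listings */
    }

    int main(void)
    {
        char buf[64];
        printf("total = %d\n", xattr_list(buf, sizeof(buf)));
        return 0;
    }
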
449/* 449/*
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
735 int offset = (char *)s->here - bs->bh->b_data; 735 int offset = (char *)s->here - bs->bh->b_data;
736 736
737 unlock_buffer(bs->bh); 737 unlock_buffer(bs->bh);
738 jbd2_journal_release_buffer(handle, bs->bh); 738 ext4_handle_release_buffer(handle, bs->bh);
739 if (ce) { 739 if (ce) {
740 mb_cache_entry_release(ce); 740 mb_cache_entry_release(ce);
741 ce = NULL; 741 ce = NULL;
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827
@@ -833,7 +833,7 @@ inserted:
833 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
834 if (!new_bh) { 834 if (!new_bh) {
835getblk_failed: 835getblk_failed:
836 ext4_free_blocks(handle, inode, 0, block, 1, 836 ext4_free_blocks(handle, inode, NULL, block, 1,
837 EXT4_FREE_BLOCKS_METADATA); 837 EXT4_FREE_BLOCKS_METADATA);
838 error = -EIO; 838 error = -EIO;
839 goto cleanup; 839 goto cleanup;
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1588#undef BLOCK_HASH_SHIFT 1588#undef BLOCK_HASH_SHIFT
1589 1589
1590int __init 1590int __init
1591init_ext4_xattr(void) 1591ext4_init_xattr(void)
1592{ 1592{
1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1594 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
1597} 1597}
1598 1598
1599void 1599void
1600exit_ext4_xattr(void) 1600ext4_exit_xattr(void)
1601{ 1601{
1602 if (ext4_xattr_cache) 1602 if (ext4_xattr_cache)
1603 mb_cache_destroy(ext4_xattr_cache); 1603 mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
84 struct ext4_inode *raw_inode, handle_t *handle); 84 struct ext4_inode *raw_inode, handle_t *handle);
85 85
86extern int init_ext4_xattr(void); 86extern int __init ext4_init_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void ext4_exit_xattr(void);
88 88
89extern const struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
121{ 121{
122} 122}
123 123
124static inline int 124static __init inline int
125init_ext4_xattr(void) 125ext4_init_xattr(void)
126{ 126{
127 return 0; 127 return 0;
128} 128}
129 129
130static inline void 130static inline void
131exit_ext4_xattr(void) 131ext4_exit_xattr(void)
132{ 132{
133} 133}
134 134
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;