aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-12 12:28:03 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-12 12:28:03 -0500
commit9bfccec24e31f4f83445cfe0c1b0a5ef97900628 (patch)
treecea50a0797abbd27a5a4a47853d1e09b97cd8c83
parent2756d373a3f45a3a9ebf4ac389f9e0e02bd35a93 (diff)
parent50db71abc529c48b21f4c3034d3cff27cfb25795 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "Lots of bug fixes, including Zheng and Jan's extent status shrinker fixes, which should improve CPU utilization and reduce potential soft lockups under heavy memory pressure, and Eric Whitney's bigalloc fixes" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (26 commits) ext4: ext4_da_convert_inline_data_to_extent drop locked page after error ext4: fix suboptimal seek_{data,hole} extents traversial ext4: ext4_inline_data_fiemap should respect callers argument ext4: prevent fsreentrance deadlock for inline_data ext4: forbid journal_async_commit in data=ordered mode jbd2: remove unnecessary NULL check before iput() ext4: Remove an unnecessary check for NULL before iput() ext4: remove unneeded code in ext4_unlink ext4: don't count external journal blocks as overhead ext4: remove never taken branch from ext4_ext_shift_path_extents() ext4: create nojournal_checksum mount option ext4: update comments regarding ext4_delete_inode() ext4: cleanup GFP flags inside resize path ext4: introduce aging to extent status tree ext4: cleanup flag definitions for extent status tree ext4: limit number of scanned extents in status tree shrinker ext4: move handling of list of shrinkable inodes into extent status code ext4: change LRU to round-robin in extent status tree shrinker ext4: cache extent hole in extent status tree for ext4_da_map_blocks() ext4: fix block reservation for bigalloc filesystems ...
-rw-r--r--fs/ext4/ext4.h41
-rw-r--r--fs/ext4/extents.c223
-rw-r--r--fs/ext4/extents_status.c321
-rw-r--r--fs/ext4/extents_status.h82
-rw-r--r--fs/ext4/file.c220
-rw-r--r--fs/ext4/inline.c35
-rw-r--r--fs/ext4/inode.c37
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c15
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/resize.c6
-rw-r--r--fs/ext4/super.c51
-rw-r--r--fs/jbd2/journal.c3
-rw-r--r--include/trace/events/ext4.h17
16 files changed, 533 insertions, 531 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index db3f772e57ae..a75fba67bb1f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -158,17 +158,8 @@ struct ext4_allocation_request {
158#define EXT4_MAP_MAPPED (1 << BH_Mapped) 158#define EXT4_MAP_MAPPED (1 << BH_Mapped)
159#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) 159#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
160#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) 160#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
161/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
162 * ext4_map_blocks wants to know whether or not the underlying cluster has
163 * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
164 * the requested mapping was from previously mapped (or delayed allocated)
165 * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
166 * should never appear on buffer_head's state flags.
167 */
168#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
169#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ 161#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
170 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ 162 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
171 EXT4_MAP_FROM_CLUSTER)
172 163
173struct ext4_map_blocks { 164struct ext4_map_blocks {
174 ext4_fsblk_t m_pblk; 165 ext4_fsblk_t m_pblk;
@@ -565,10 +556,8 @@ enum {
565#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 556#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
566 /* Do not take i_data_sem locking in ext4_map_blocks */ 557 /* Do not take i_data_sem locking in ext4_map_blocks */
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 558#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
570 /* Convert written extents to unwritten */ 559 /* Convert written extents to unwritten */
571#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 560#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
572 561
573/* 562/*
574 * The bit position of these flags must not overlap with any of the 563 * The bit position of these flags must not overlap with any of the
@@ -889,10 +878,12 @@ struct ext4_inode_info {
889 /* extents status tree */ 878 /* extents status tree */
890 struct ext4_es_tree i_es_tree; 879 struct ext4_es_tree i_es_tree;
891 rwlock_t i_es_lock; 880 rwlock_t i_es_lock;
892 struct list_head i_es_lru; 881 struct list_head i_es_list;
893 unsigned int i_es_all_nr; /* protected by i_es_lock */ 882 unsigned int i_es_all_nr; /* protected by i_es_lock */
894 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 883 unsigned int i_es_shk_nr; /* protected by i_es_lock */
895 unsigned long i_touch_when; /* jiffies of last accessing */ 884 ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
885 extents to shrink. Protected by
886 i_es_lock */
896 887
897 /* ialloc */ 888 /* ialloc */
898 ext4_group_t i_last_alloc_group; 889 ext4_group_t i_last_alloc_group;
@@ -1337,10 +1328,11 @@ struct ext4_sb_info {
1337 1328
1338 /* Reclaim extents from extent status tree */ 1329 /* Reclaim extents from extent status tree */
1339 struct shrinker s_es_shrinker; 1330 struct shrinker s_es_shrinker;
1340 struct list_head s_es_lru; 1331 struct list_head s_es_list; /* List of inodes with reclaimable extents */
1332 long s_es_nr_inode;
1341 struct ext4_es_stats s_es_stats; 1333 struct ext4_es_stats s_es_stats;
1342 struct mb_cache *s_mb_cache; 1334 struct mb_cache *s_mb_cache;
1343 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lock ____cacheline_aligned_in_smp;
1344 1336
1345 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
1346 struct ratelimit_state s_err_ratelimit_state; 1338 struct ratelimit_state s_err_ratelimit_state;
@@ -2196,7 +2188,6 @@ extern int ext4_calculate_overhead(struct super_block *sb);
2196extern void ext4_superblock_csum_set(struct super_block *sb); 2188extern void ext4_superblock_csum_set(struct super_block *sb);
2197extern void *ext4_kvmalloc(size_t size, gfp_t flags); 2189extern void *ext4_kvmalloc(size_t size, gfp_t flags);
2198extern void *ext4_kvzalloc(size_t size, gfp_t flags); 2190extern void *ext4_kvzalloc(size_t size, gfp_t flags);
2199extern void ext4_kvfree(void *ptr);
2200extern int ext4_alloc_flex_bg_array(struct super_block *sb, 2191extern int ext4_alloc_flex_bg_array(struct super_block *sb,
2201 ext4_group_t ngroup); 2192 ext4_group_t ngroup);
2202extern const char *ext4_decode_error(struct super_block *sb, int errno, 2193extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2647,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
2647 int *retval); 2638 int *retval);
2648extern int ext4_inline_data_fiemap(struct inode *inode, 2639extern int ext4_inline_data_fiemap(struct inode *inode,
2649 struct fiemap_extent_info *fieinfo, 2640 struct fiemap_extent_info *fieinfo,
2650 int *has_inline); 2641 int *has_inline, __u64 start, __u64 len);
2651extern int ext4_try_to_evict_inline_data(handle_t *handle, 2642extern int ext4_try_to_evict_inline_data(handle_t *handle,
2652 struct inode *inode, 2643 struct inode *inode,
2653 int needed); 2644 int needed);
@@ -2795,16 +2786,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
2795extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 2786extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2796 2787
2797/* 2788/*
2798 * Note that these flags will never ever appear in a buffer_head's state flag.
2799 * See EXT4_MAP_... to see where this is used.
2800 */
2801enum ext4_state_bits {
2802 BH_AllocFromCluster /* allocated blocks were part of already
2803 * allocated cluster. */
2804 = BH_JBDPrivateStart
2805};
2806
2807/*
2808 * Add new method to test whether block and inode bitmaps are properly 2789 * Add new method to test whether block and inode bitmaps are properly
2809 * initialized. With uninit_bg reading the block from disk is not enough 2790 * initialized. With uninit_bg reading the block from disk is not enough
2810 * to mark the bitmap uptodate. We need to also zero-out the bitmap 2791 * to mark the bitmap uptodate. We need to also zero-out the bitmap
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0b16fb4c06d3..e5d3eadf47b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2306 ext4_lblk_t block) 2306 ext4_lblk_t block)
2307{ 2307{
2308 int depth = ext_depth(inode); 2308 int depth = ext_depth(inode);
2309 unsigned long len = 0; 2309 ext4_lblk_t len;
2310 ext4_lblk_t lblock = 0; 2310 ext4_lblk_t lblock;
2311 struct ext4_extent *ex; 2311 struct ext4_extent *ex;
2312 struct extent_status es;
2312 2313
2313 ex = path[depth].p_ext; 2314 ex = path[depth].p_ext;
2314 if (ex == NULL) { 2315 if (ex == NULL) {
2315 /* 2316 /* there is no extent yet, so gap is [0;-] */
2316 * there is no extent yet, so gap is [0;-] and we 2317 lblock = 0;
2317 * don't cache it 2318 len = EXT_MAX_BLOCKS;
2318 */
2319 ext_debug("cache gap(whole file):"); 2319 ext_debug("cache gap(whole file):");
2320 } else if (block < le32_to_cpu(ex->ee_block)) { 2320 } else if (block < le32_to_cpu(ex->ee_block)) {
2321 lblock = block; 2321 lblock = block;
@@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2324 block, 2324 block,
2325 le32_to_cpu(ex->ee_block), 2325 le32_to_cpu(ex->ee_block),
2326 ext4_ext_get_actual_len(ex)); 2326 ext4_ext_get_actual_len(ex));
2327 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2328 ext4_es_insert_extent(inode, lblock, len, ~0,
2329 EXTENT_STATUS_HOLE);
2330 } else if (block >= le32_to_cpu(ex->ee_block) 2327 } else if (block >= le32_to_cpu(ex->ee_block)
2331 + ext4_ext_get_actual_len(ex)) { 2328 + ext4_ext_get_actual_len(ex)) {
2332 ext4_lblk_t next; 2329 ext4_lblk_t next;
@@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2340 block); 2337 block);
2341 BUG_ON(next == lblock); 2338 BUG_ON(next == lblock);
2342 len = next - lblock; 2339 len = next - lblock;
2343 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2344 ext4_es_insert_extent(inode, lblock, len, ~0,
2345 EXTENT_STATUS_HOLE);
2346 } else { 2340 } else {
2347 BUG(); 2341 BUG();
2348 } 2342 }
2349 2343
2350 ext_debug(" -> %u:%lu\n", lblock, len); 2344 ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
2345 if (es.es_len) {
2346 /* There's delayed extent containing lblock? */
2347 if (es.es_lblk <= lblock)
2348 return;
2349 len = min(es.es_lblk - lblock, len);
2350 }
2351 ext_debug(" -> %u:%u\n", lblock, len);
2352 ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
2351} 2353}
2352 2354
2353/* 2355/*
@@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2481 ext4_lblk_t from, ext4_lblk_t to) 2483 ext4_lblk_t from, ext4_lblk_t to)
2482{ 2484{
2483 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2485 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2484 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2486 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2485 ext4_fsblk_t pblk; 2487 ext4_fsblk_t pblk;
2486 int flags = get_default_free_blocks_flags(inode); 2488 int flags = get_default_free_blocks_flags(inode);
2487 2489
@@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2490 * at the beginning of the extent. Instead, we make a note 2492 * at the beginning of the extent. Instead, we make a note
2491 * that we tried freeing the cluster, and check to see if we 2493 * that we tried freeing the cluster, and check to see if we
2492 * need to free it on a subsequent call to ext4_remove_blocks, 2494 * need to free it on a subsequent call to ext4_remove_blocks,
2493 * or at the end of the ext4_truncate() operation. 2495 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2494 */ 2496 */
2495 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2497 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2496 2498
@@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2501 * partial cluster here. 2503 * partial cluster here.
2502 */ 2504 */
2503 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2505 pblk = ext4_ext_pblock(ex) + ee_len - 1;
2504 if ((*partial_cluster > 0) && 2506 if (*partial_cluster > 0 &&
2505 (EXT4_B2C(sbi, pblk) != *partial_cluster)) { 2507 *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2506 ext4_free_blocks(handle, inode, NULL, 2508 ext4_free_blocks(handle, inode, NULL,
2507 EXT4_C2B(sbi, *partial_cluster), 2509 EXT4_C2B(sbi, *partial_cluster),
2508 sbi->s_cluster_ratio, flags); 2510 sbi->s_cluster_ratio, flags);
@@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2528 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2530 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2529 /* tail removal */ 2531 /* tail removal */
2530 ext4_lblk_t num; 2532 ext4_lblk_t num;
2531 unsigned int unaligned; 2533 long long first_cluster;
2532 2534
2533 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2535 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2534 pblk = ext4_ext_pblock(ex) + ee_len - num; 2536 pblk = ext4_ext_pblock(ex) + ee_len - num;
@@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2538 * used by any other extent (partial_cluster is negative). 2540 * used by any other extent (partial_cluster is negative).
2539 */ 2541 */
2540 if (*partial_cluster < 0 && 2542 if (*partial_cluster < 0 &&
2541 -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) 2543 *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
2542 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2544 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2543 2545
2544 ext_debug("free last %u blocks starting %llu partial %lld\n", 2546 ext_debug("free last %u blocks starting %llu partial %lld\n",
@@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2549 * beginning of a cluster, and we removed the entire 2551 * beginning of a cluster, and we removed the entire
2550 * extent and the cluster is not used by any other extent, 2552 * extent and the cluster is not used by any other extent,
2551 * save the partial cluster here, since we might need to 2553 * save the partial cluster here, since we might need to
2552 * delete if we determine that the truncate operation has 2554 * delete if we determine that the truncate or punch hole
2553 * removed all of the blocks in the cluster. 2555 * operation has removed all of the blocks in the cluster.
2556 * If that cluster is used by another extent, preserve its
2557 * negative value so it isn't freed later on.
2554 * 2558 *
2555 * On the other hand, if we did not manage to free the whole 2559 * If the whole extent wasn't freed, we've reached the
2556 * extent, we have to mark the cluster as used (store negative 2560 * start of the truncated/punched region and have finished
2557 * cluster number in partial_cluster). 2561 * removing blocks. If there's a partial cluster here it's
2562 * shared with the remainder of the extent and is no longer
2563 * a candidate for removal.
2558 */ 2564 */
2559 unaligned = EXT4_PBLK_COFF(sbi, pblk); 2565 if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
2560 if (unaligned && (ee_len == num) && 2566 first_cluster = (long long) EXT4_B2C(sbi, pblk);
2561 (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) 2567 if (first_cluster != -*partial_cluster)
2562 *partial_cluster = EXT4_B2C(sbi, pblk); 2568 *partial_cluster = first_cluster;
2563 else if (unaligned) 2569 } else {
2564 *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2565 else if (*partial_cluster > 0)
2566 *partial_cluster = 0; 2570 *partial_cluster = 0;
2571 }
2567 } else 2572 } else
2568 ext4_error(sbi->s_sb, "strange request: removal(2) " 2573 ext4_error(sbi->s_sb, "strange request: removal(2) "
2569 "%u-%u from %u:%u\n", 2574 "%u-%u from %u:%u\n",
@@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2574 2579
2575/* 2580/*
2576 * ext4_ext_rm_leaf() Removes the extents associated with the 2581 * ext4_ext_rm_leaf() Removes the extents associated with the
2577 * blocks appearing between "start" and "end", and splits the extents 2582 * blocks appearing between "start" and "end". Both "start"
2578 * if "start" and "end" appear in the same extent 2583 * and "end" must appear in the same extent or EIO is returned.
2579 * 2584 *
2580 * @handle: The journal handle 2585 * @handle: The journal handle
2581 * @inode: The files inode 2586 * @inode: The files inode
2582 * @path: The path to the leaf 2587 * @path: The path to the leaf
2583 * @partial_cluster: The cluster which we'll have to free if all extents 2588 * @partial_cluster: The cluster which we'll have to free if all extents
2584 * has been released from it. It gets negative in case 2589 * has been released from it. However, if this value is
2585 * that the cluster is still used. 2590 * negative, it's a cluster just to the right of the
2591 * punched region and it must not be freed.
2586 * @start: The first block to remove 2592 * @start: The first block to remove
2587 * @end: The last block to remove 2593 * @end: The last block to remove
2588 */ 2594 */
@@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2621 ex_ee_block = le32_to_cpu(ex->ee_block); 2627 ex_ee_block = le32_to_cpu(ex->ee_block);
2622 ex_ee_len = ext4_ext_get_actual_len(ex); 2628 ex_ee_len = ext4_ext_get_actual_len(ex);
2623 2629
2624 /*
2625 * If we're starting with an extent other than the last one in the
2626 * node, we need to see if it shares a cluster with the extent to
2627 * the right (towards the end of the file). If its leftmost cluster
2628 * is this extent's rightmost cluster and it is not cluster aligned,
2629 * we'll mark it as a partial that is not to be deallocated.
2630 */
2631
2632 if (ex != EXT_LAST_EXTENT(eh)) {
2633 ext4_fsblk_t current_pblk, right_pblk;
2634 long long current_cluster, right_cluster;
2635
2636 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2637 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2638 right_pblk = ext4_ext_pblock(ex + 1);
2639 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2640 if (current_cluster == right_cluster &&
2641 EXT4_PBLK_COFF(sbi, right_pblk))
2642 *partial_cluster = -right_cluster;
2643 }
2644
2645 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2630 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2646 2631
2647 while (ex >= EXT_FIRST_EXTENT(eh) && 2632 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2666 if (end < ex_ee_block) { 2651 if (end < ex_ee_block) {
2667 /* 2652 /*
2668 * We're going to skip this extent and move to another, 2653 * We're going to skip this extent and move to another,
2669 * so if this extent is not cluster aligned we have 2654 * so note that its first cluster is in use to avoid
2670 * to mark the current cluster as used to avoid 2655 * freeing it when removing blocks. Eventually, the
2671 * accidentally freeing it later on 2656 * right edge of the truncated/punched region will
2657 * be just to the left.
2672 */ 2658 */
2673 pblk = ext4_ext_pblock(ex); 2659 if (sbi->s_cluster_ratio > 1) {
2674 if (EXT4_PBLK_COFF(sbi, pblk)) 2660 pblk = ext4_ext_pblock(ex);
2675 *partial_cluster = 2661 *partial_cluster =
2676 -((long long)EXT4_B2C(sbi, pblk)); 2662 -(long long) EXT4_B2C(sbi, pblk);
2663 }
2677 ex--; 2664 ex--;
2678 ex_ee_block = le32_to_cpu(ex->ee_block); 2665 ex_ee_block = le32_to_cpu(ex->ee_block);
2679 ex_ee_len = ext4_ext_get_actual_len(ex); 2666 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2749 sizeof(struct ext4_extent)); 2736 sizeof(struct ext4_extent));
2750 } 2737 }
2751 le16_add_cpu(&eh->eh_entries, -1); 2738 le16_add_cpu(&eh->eh_entries, -1);
2752 } else if (*partial_cluster > 0) 2739 }
2753 *partial_cluster = 0;
2754 2740
2755 err = ext4_ext_dirty(handle, inode, path + depth); 2741 err = ext4_ext_dirty(handle, inode, path + depth);
2756 if (err) 2742 if (err)
@@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2769 /* 2755 /*
2770 * If there's a partial cluster and at least one extent remains in 2756 * If there's a partial cluster and at least one extent remains in
2771 * the leaf, free the partial cluster if it isn't shared with the 2757 * the leaf, free the partial cluster if it isn't shared with the
2772 * current extent. If there's a partial cluster and no extents 2758 * current extent. If it is shared with the current extent
2773 * remain in the leaf, it can't be freed here. It can only be 2759 * we zero partial_cluster because we've reached the start of the
2774 * freed when it's possible to determine if it's not shared with 2760 * truncated/punched region and we're done removing blocks.
2775 * any other extent - when the next leaf is processed or when space
2776 * removal is complete.
2777 */ 2761 */
2778 if (*partial_cluster > 0 && eh->eh_entries && 2762 if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
2779 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2763 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2780 *partial_cluster)) { 2764 if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2781 int flags = get_default_free_blocks_flags(inode); 2765 ext4_free_blocks(handle, inode, NULL,
2782 2766 EXT4_C2B(sbi, *partial_cluster),
2783 ext4_free_blocks(handle, inode, NULL, 2767 sbi->s_cluster_ratio,
2784 EXT4_C2B(sbi, *partial_cluster), 2768 get_default_free_blocks_flags(inode));
2785 sbi->s_cluster_ratio, flags); 2769 }
2786 *partial_cluster = 0; 2770 *partial_cluster = 0;
2787 } 2771 }
2788 2772
@@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2819int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2803int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2820 ext4_lblk_t end) 2804 ext4_lblk_t end)
2821{ 2805{
2822 struct super_block *sb = inode->i_sb; 2806 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2823 int depth = ext_depth(inode); 2807 int depth = ext_depth(inode);
2824 struct ext4_ext_path *path = NULL; 2808 struct ext4_ext_path *path = NULL;
2825 long long partial_cluster = 0; 2809 long long partial_cluster = 0;
@@ -2845,9 +2829,10 @@ again:
2845 */ 2829 */
2846 if (end < EXT_MAX_BLOCKS - 1) { 2830 if (end < EXT_MAX_BLOCKS - 1) {
2847 struct ext4_extent *ex; 2831 struct ext4_extent *ex;
2848 ext4_lblk_t ee_block; 2832 ext4_lblk_t ee_block, ex_end, lblk;
2833 ext4_fsblk_t pblk;
2849 2834
2850 /* find extent for this block */ 2835 /* find extent for or closest extent to this block */
2851 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2836 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2852 if (IS_ERR(path)) { 2837 if (IS_ERR(path)) {
2853 ext4_journal_stop(handle); 2838 ext4_journal_stop(handle);
@@ -2867,6 +2852,7 @@ again:
2867 } 2852 }
2868 2853
2869 ee_block = le32_to_cpu(ex->ee_block); 2854 ee_block = le32_to_cpu(ex->ee_block);
2855 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2870 2856
2871 /* 2857 /*
2872 * See if the last block is inside the extent, if so split 2858 * See if the last block is inside the extent, if so split
@@ -2874,8 +2860,19 @@ again:
2874 * tail of the first part of the split extent in 2860 * tail of the first part of the split extent in
2875 * ext4_ext_rm_leaf(). 2861 * ext4_ext_rm_leaf().
2876 */ 2862 */
2877 if (end >= ee_block && 2863 if (end >= ee_block && end < ex_end) {
2878 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2864
2865 /*
2866 * If we're going to split the extent, note that
2867 * the cluster containing the block after 'end' is
2868 * in use to avoid freeing it when removing blocks.
2869 */
2870 if (sbi->s_cluster_ratio > 1) {
2871 pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
2872 partial_cluster =
2873 -(long long) EXT4_B2C(sbi, pblk);
2874 }
2875
2879 /* 2876 /*
2880 * Split the extent in two so that 'end' is the last 2877 * Split the extent in two so that 'end' is the last
2881 * block in the first new extent. Also we should not 2878 * block in the first new extent. Also we should not
@@ -2886,6 +2883,24 @@ again:
2886 end + 1, 1); 2883 end + 1, 1);
2887 if (err < 0) 2884 if (err < 0)
2888 goto out; 2885 goto out;
2886
2887 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
2888 /*
2889 * If there's an extent to the right its first cluster
2890 * contains the immediate right boundary of the
2891 * truncated/punched region. Set partial_cluster to
2892 * its negative value so it won't be freed if shared
2893 * with the current extent. The end < ee_block case
2894 * is handled in ext4_ext_rm_leaf().
2895 */
2896 lblk = ex_end + 1;
2897 err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2898 &ex);
2899 if (err)
2900 goto out;
2901 if (pblk)
2902 partial_cluster =
2903 -(long long) EXT4_B2C(sbi, pblk);
2889 } 2904 }
2890 } 2905 }
2891 /* 2906 /*
@@ -2996,16 +3011,18 @@ again:
2996 trace_ext4_ext_remove_space_done(inode, start, end, depth, 3011 trace_ext4_ext_remove_space_done(inode, start, end, depth,
2997 partial_cluster, path->p_hdr->eh_entries); 3012 partial_cluster, path->p_hdr->eh_entries);
2998 3013
2999 /* If we still have something in the partial cluster and we have removed 3014 /*
3015 * If we still have something in the partial cluster and we have removed
3000 * even the first extent, then we should free the blocks in the partial 3016 * even the first extent, then we should free the blocks in the partial
3001 * cluster as well. */ 3017 * cluster as well. (This code will only run when there are no leaves
3002 if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { 3018 * to the immediate left of the truncated/punched region.)
3003 int flags = get_default_free_blocks_flags(inode); 3019 */
3004 3020 if (partial_cluster > 0 && err == 0) {
3021 /* don't zero partial_cluster since it's not used afterwards */
3005 ext4_free_blocks(handle, inode, NULL, 3022 ext4_free_blocks(handle, inode, NULL,
3006 EXT4_C2B(EXT4_SB(sb), partial_cluster), 3023 EXT4_C2B(sbi, partial_cluster),
3007 EXT4_SB(sb)->s_cluster_ratio, flags); 3024 sbi->s_cluster_ratio,
3008 partial_cluster = 0; 3025 get_default_free_blocks_flags(inode));
3009 } 3026 }
3010 3027
3011 /* TODO: flexible tree reduction should be here */ 3028 /* TODO: flexible tree reduction should be here */
@@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4267 ext4_io_end_t *io = ext4_inode_aio(inode); 4284 ext4_io_end_t *io = ext4_inode_aio(inode);
4268 ext4_lblk_t cluster_offset; 4285 ext4_lblk_t cluster_offset;
4269 int set_unwritten = 0; 4286 int set_unwritten = 0;
4287 bool map_from_cluster = false;
4270 4288
4271 ext_debug("blocks %u/%u requested for inode %lu\n", 4289 ext_debug("blocks %u/%u requested for inode %lu\n",
4272 map->m_lblk, map->m_len, inode->i_ino); 4290 map->m_lblk, map->m_len, inode->i_ino);
@@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4343 } 4361 }
4344 } 4362 }
4345 4363
4346 if ((sbi->s_cluster_ratio > 1) &&
4347 ext4_find_delalloc_cluster(inode, map->m_lblk))
4348 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4349
4350 /* 4364 /*
4351 * requested block isn't allocated yet; 4365 * requested block isn't allocated yet;
4352 * we couldn't try to create block if create flag is zero 4366 * we couldn't try to create block if create flag is zero
@@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4356 * put just found gap into cache to speed up 4370 * put just found gap into cache to speed up
4357 * subsequent requests 4371 * subsequent requests
4358 */ 4372 */
4359 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) 4373 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4360 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4361 goto out2; 4374 goto out2;
4362 } 4375 }
4363 4376
4364 /* 4377 /*
4365 * Okay, we need to do block allocation. 4378 * Okay, we need to do block allocation.
4366 */ 4379 */
4367 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4368 newex.ee_block = cpu_to_le32(map->m_lblk); 4380 newex.ee_block = cpu_to_le32(map->m_lblk);
4369 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4381 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4370 4382
@@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4376 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4388 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4377 ar.len = allocated = map->m_len; 4389 ar.len = allocated = map->m_len;
4378 newblock = map->m_pblk; 4390 newblock = map->m_pblk;
4379 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4391 map_from_cluster = true;
4380 goto got_allocated_blocks; 4392 goto got_allocated_blocks;
4381 } 4393 }
4382 4394
@@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4397 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { 4409 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4398 ar.len = allocated = map->m_len; 4410 ar.len = allocated = map->m_len;
4399 newblock = map->m_pblk; 4411 newblock = map->m_pblk;
4400 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4412 map_from_cluster = true;
4401 goto got_allocated_blocks; 4413 goto got_allocated_blocks;
4402 } 4414 }
4403 4415
@@ -4523,7 +4535,7 @@ got_allocated_blocks:
4523 */ 4535 */
4524 reserved_clusters = get_reserved_cluster_alloc(inode, 4536 reserved_clusters = get_reserved_cluster_alloc(inode,
4525 map->m_lblk, allocated); 4537 map->m_lblk, allocated);
4526 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { 4538 if (map_from_cluster) {
4527 if (reserved_clusters) { 4539 if (reserved_clusters) {
4528 /* 4540 /*
4529 * We have clusters reserved for this range. 4541 * We have clusters reserved for this range.
@@ -4620,7 +4632,6 @@ out2:
4620 4632
4621 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4633 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4622 err ? err : allocated); 4634 err ? err : allocated);
4623 ext4_es_lru_add(inode);
4624 return err ? err : allocated; 4635 return err ? err : allocated;
4625} 4636}
4626 4637
@@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5140 if (ext4_has_inline_data(inode)) { 5151 if (ext4_has_inline_data(inode)) {
5141 int has_inline = 1; 5152 int has_inline = 1;
5142 5153
5143 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 5154 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5155 start, len);
5144 5156
5145 if (has_inline) 5157 if (has_inline)
5146 return error; 5158 return error;
@@ -5154,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5154 5166
5155 /* fallback to generic here if not in extents fmt */ 5167 /* fallback to generic here if not in extents fmt */
5156 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5157 return generic_block_fiemap(inode, fieinfo, start, len, 5169 return __generic_block_fiemap(inode, fieinfo, start, len,
5158 ext4_get_block); 5170 ext4_get_block);
5159 5171
5160 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5161 return -EBADR; 5173 return -EBADR;
@@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5179 error = ext4_fill_fiemap_extents(inode, start_blk, 5191 error = ext4_fill_fiemap_extents(inode, start_blk,
5180 len_blks, fieinfo); 5192 len_blks, fieinfo);
5181 } 5193 }
5182 ext4_es_lru_add(inode);
5183 return error; 5194 return error;
5184} 5195}
5185 5196
@@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5239 return -EIO; 5250 return -EIO;
5240 5251
5241 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 5252 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5242 if (!ex_last)
5243 return -EIO;
5244 5253
5245 err = ext4_access_path(handle, inode, path + depth); 5254 err = ext4_access_path(handle, inode, path + depth);
5246 if (err) 5255 if (err)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 94e7855ae71b..e04d45733976 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep;
147static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 147static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
149 ext4_lblk_t end); 149 ext4_lblk_t end);
150static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
151 int nr_to_scan); 151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 152 struct ext4_inode_info *locked_ei);
153 struct ext4_inode_info *locked_ei);
154 153
155int __init ext4_init_es(void) 154int __init ext4_init_es(void)
156{ 155{
@@ -298,6 +297,36 @@ out:
298 trace_ext4_es_find_delayed_extent_range_exit(inode, es); 297 trace_ext4_es_find_delayed_extent_range_exit(inode, es);
299} 298}
300 299
300static void ext4_es_list_add(struct inode *inode)
301{
302 struct ext4_inode_info *ei = EXT4_I(inode);
303 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
304
305 if (!list_empty(&ei->i_es_list))
306 return;
307
308 spin_lock(&sbi->s_es_lock);
309 if (list_empty(&ei->i_es_list)) {
310 list_add_tail(&ei->i_es_list, &sbi->s_es_list);
311 sbi->s_es_nr_inode++;
312 }
313 spin_unlock(&sbi->s_es_lock);
314}
315
316static void ext4_es_list_del(struct inode *inode)
317{
318 struct ext4_inode_info *ei = EXT4_I(inode);
319 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
320
321 spin_lock(&sbi->s_es_lock);
322 if (!list_empty(&ei->i_es_list)) {
323 list_del_init(&ei->i_es_list);
324 sbi->s_es_nr_inode--;
325 WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
326 }
327 spin_unlock(&sbi->s_es_lock);
328}
329
301static struct extent_status * 330static struct extent_status *
302ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, 331ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
303 ext4_fsblk_t pblk) 332 ext4_fsblk_t pblk)
@@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
314 * We don't count delayed extent because we never try to reclaim them 343 * We don't count delayed extent because we never try to reclaim them
315 */ 344 */
316 if (!ext4_es_is_delayed(es)) { 345 if (!ext4_es_is_delayed(es)) {
317 EXT4_I(inode)->i_es_lru_nr++; 346 if (!EXT4_I(inode)->i_es_shk_nr++)
347 ext4_es_list_add(inode);
318 percpu_counter_inc(&EXT4_SB(inode->i_sb)-> 348 percpu_counter_inc(&EXT4_SB(inode->i_sb)->
319 s_es_stats.es_stats_lru_cnt); 349 s_es_stats.es_stats_shk_cnt);
320 } 350 }
321 351
322 EXT4_I(inode)->i_es_all_nr++; 352 EXT4_I(inode)->i_es_all_nr++;
@@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
330 EXT4_I(inode)->i_es_all_nr--; 360 EXT4_I(inode)->i_es_all_nr--;
331 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 361 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
332 362
333 /* Decrease the lru counter when this es is not delayed */ 363 /* Decrease the shrink counter when this es is not delayed */
334 if (!ext4_es_is_delayed(es)) { 364 if (!ext4_es_is_delayed(es)) {
335 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 365 BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
336 EXT4_I(inode)->i_es_lru_nr--; 366 if (!--EXT4_I(inode)->i_es_shk_nr)
367 ext4_es_list_del(inode);
337 percpu_counter_dec(&EXT4_SB(inode->i_sb)-> 368 percpu_counter_dec(&EXT4_SB(inode->i_sb)->
338 s_es_stats.es_stats_lru_cnt); 369 s_es_stats.es_stats_shk_cnt);
339 } 370 }
340 371
341 kmem_cache_free(ext4_es_cachep, es); 372 kmem_cache_free(ext4_es_cachep, es);
@@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
351static int ext4_es_can_be_merged(struct extent_status *es1, 382static int ext4_es_can_be_merged(struct extent_status *es1,
352 struct extent_status *es2) 383 struct extent_status *es2)
353{ 384{
354 if (ext4_es_status(es1) != ext4_es_status(es2)) 385 if (ext4_es_type(es1) != ext4_es_type(es2))
355 return 0; 386 return 0;
356 387
357 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { 388 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
@@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
394 es1 = rb_entry(node, struct extent_status, rb_node); 425 es1 = rb_entry(node, struct extent_status, rb_node);
395 if (ext4_es_can_be_merged(es1, es)) { 426 if (ext4_es_can_be_merged(es1, es)) {
396 es1->es_len += es->es_len; 427 es1->es_len += es->es_len;
428 if (ext4_es_is_referenced(es))
429 ext4_es_set_referenced(es1);
397 rb_erase(&es->rb_node, &tree->root); 430 rb_erase(&es->rb_node, &tree->root);
398 ext4_es_free_extent(inode, es); 431 ext4_es_free_extent(inode, es);
399 es = es1; 432 es = es1;
@@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
416 es1 = rb_entry(node, struct extent_status, rb_node); 449 es1 = rb_entry(node, struct extent_status, rb_node);
417 if (ext4_es_can_be_merged(es, es1)) { 450 if (ext4_es_can_be_merged(es, es1)) {
418 es->es_len += es1->es_len; 451 es->es_len += es1->es_len;
452 if (ext4_es_is_referenced(es1))
453 ext4_es_set_referenced(es);
419 rb_erase(node, &tree->root); 454 rb_erase(node, &tree->root);
420 ext4_es_free_extent(inode, es1); 455 ext4_es_free_extent(inode, es1);
421 } 456 }
@@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
683 goto error; 718 goto error;
684retry: 719retry:
685 err = __es_insert_extent(inode, &newes); 720 err = __es_insert_extent(inode, &newes);
686 if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, 721 if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
687 EXT4_I(inode))) 722 128, EXT4_I(inode)))
688 goto retry; 723 goto retry;
689 if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) 724 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
690 err = 0; 725 err = 0;
@@ -782,6 +817,8 @@ out:
782 es->es_lblk = es1->es_lblk; 817 es->es_lblk = es1->es_lblk;
783 es->es_len = es1->es_len; 818 es->es_len = es1->es_len;
784 es->es_pblk = es1->es_pblk; 819 es->es_pblk = es1->es_pblk;
820 if (!ext4_es_is_referenced(es))
821 ext4_es_set_referenced(es);
785 stats->es_stats_cache_hits++; 822 stats->es_stats_cache_hits++;
786 } else { 823 } else {
787 stats->es_stats_cache_misses++; 824 stats->es_stats_cache_misses++;
@@ -841,8 +878,8 @@ retry:
841 es->es_lblk = orig_es.es_lblk; 878 es->es_lblk = orig_es.es_lblk;
842 es->es_len = orig_es.es_len; 879 es->es_len = orig_es.es_len;
843 if ((err == -ENOMEM) && 880 if ((err == -ENOMEM) &&
844 __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, 881 __es_shrink(EXT4_SB(inode->i_sb),
845 EXT4_I(inode))) 882 128, EXT4_I(inode)))
846 goto retry; 883 goto retry;
847 goto out; 884 goto out;
848 } 885 }
@@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
914 end = lblk + len - 1; 951 end = lblk + len - 1;
915 BUG_ON(end < lblk); 952 BUG_ON(end < lblk);
916 953
954 /*
955 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
956 * so that we are sure __es_shrink() is done with the inode before it
957 * is reclaimed.
958 */
917 write_lock(&EXT4_I(inode)->i_es_lock); 959 write_lock(&EXT4_I(inode)->i_es_lock);
918 err = __es_remove_extent(inode, lblk, end); 960 err = __es_remove_extent(inode, lblk, end);
919 write_unlock(&EXT4_I(inode)->i_es_lock); 961 write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
921 return err; 963 return err;
922} 964}
923 965
924static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, 966static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
925 struct list_head *b) 967 struct ext4_inode_info *locked_ei)
926{
927 struct ext4_inode_info *eia, *eib;
928 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
929 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
930
931 if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
932 !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
933 return 1;
934 if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
935 ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
936 return -1;
937 if (eia->i_touch_when == eib->i_touch_when)
938 return 0;
939 if (time_after(eia->i_touch_when, eib->i_touch_when))
940 return 1;
941 else
942 return -1;
943}
944
945static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
946 struct ext4_inode_info *locked_ei)
947{ 968{
948 struct ext4_inode_info *ei; 969 struct ext4_inode_info *ei;
949 struct ext4_es_stats *es_stats; 970 struct ext4_es_stats *es_stats;
950 struct list_head *cur, *tmp;
951 LIST_HEAD(skipped);
952 ktime_t start_time; 971 ktime_t start_time;
953 u64 scan_time; 972 u64 scan_time;
973 int nr_to_walk;
954 int nr_shrunk = 0; 974 int nr_shrunk = 0;
955 int retried = 0, skip_precached = 1, nr_skipped = 0; 975 int retried = 0, nr_skipped = 0;
956 976
957 es_stats = &sbi->s_es_stats; 977 es_stats = &sbi->s_es_stats;
958 start_time = ktime_get(); 978 start_time = ktime_get();
959 spin_lock(&sbi->s_es_lru_lock);
960 979
961retry: 980retry:
962 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 981 spin_lock(&sbi->s_es_lock);
963 int shrunk; 982 nr_to_walk = sbi->s_es_nr_inode;
964 983 while (nr_to_walk-- > 0) {
965 /* 984 if (list_empty(&sbi->s_es_list)) {
966 * If we have already reclaimed all extents from extent 985 spin_unlock(&sbi->s_es_lock);
967 * status tree, just stop the loop immediately. 986 goto out;
968 */ 987 }
969 if (percpu_counter_read_positive( 988 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
970 &es_stats->es_stats_lru_cnt) == 0) 989 i_es_list);
971 break; 990 /* Move the inode to the tail */
972 991 list_move_tail(&ei->i_es_list, &sbi->s_es_list);
973 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
974 992
975 /* 993 /*
976 * Skip the inode that is newer than the last_sorted 994 * Normally we try hard to avoid shrinking precached inodes,
977 * time. Normally we try hard to avoid shrinking 995 * but we will as a last resort.
978 * precached inodes, but we will as a last resort.
979 */ 996 */
980 if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || 997 if (!retried && ext4_test_inode_state(&ei->vfs_inode,
981 (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 998 EXT4_STATE_EXT_PRECACHED)) {
982 EXT4_STATE_EXT_PRECACHED))) {
983 nr_skipped++; 999 nr_skipped++;
984 list_move_tail(cur, &skipped);
985 continue; 1000 continue;
986 } 1001 }
987 1002
988 if (ei->i_es_lru_nr == 0 || ei == locked_ei || 1003 if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
989 !write_trylock(&ei->i_es_lock)) 1004 nr_skipped++;
990 continue; 1005 continue;
1006 }
1007 /*
1008 * Now we hold i_es_lock which protects us from inode reclaim
1009 * freeing inode under us
1010 */
1011 spin_unlock(&sbi->s_es_lock);
991 1012
992 shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); 1013 nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
993 if (ei->i_es_lru_nr == 0)
994 list_del_init(&ei->i_es_lru);
995 write_unlock(&ei->i_es_lock); 1014 write_unlock(&ei->i_es_lock);
996 1015
997 nr_shrunk += shrunk; 1016 if (nr_to_scan <= 0)
998 nr_to_scan -= shrunk; 1017 goto out;
999 if (nr_to_scan == 0) 1018 spin_lock(&sbi->s_es_lock);
1000 break;
1001 } 1019 }
1002 1020 spin_unlock(&sbi->s_es_lock);
1003 /* Move the newer inodes into the tail of the LRU list. */
1004 list_splice_tail(&skipped, &sbi->s_es_lru);
1005 INIT_LIST_HEAD(&skipped);
1006 1021
1007 /* 1022 /*
1008 * If we skipped any inodes, and we weren't able to make any 1023 * If we skipped any inodes, and we weren't able to make any
1009 * forward progress, sort the list and try again. 1024 * forward progress, try again to scan precached inodes.
1010 */ 1025 */
1011 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1026 if ((nr_shrunk == 0) && nr_skipped && !retried) {
1012 retried++; 1027 retried++;
1013 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
1014 es_stats->es_stats_last_sorted = jiffies;
1015 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
1016 i_es_lru);
1017 /*
1018 * If there are no non-precached inodes left on the
1019 * list, start releasing precached extents.
1020 */
1021 if (ext4_test_inode_state(&ei->vfs_inode,
1022 EXT4_STATE_EXT_PRECACHED))
1023 skip_precached = 0;
1024 goto retry; 1028 goto retry;
1025 } 1029 }
1026 1030
1027 spin_unlock(&sbi->s_es_lru_lock);
1028
1029 if (locked_ei && nr_shrunk == 0) 1031 if (locked_ei && nr_shrunk == 0)
1030 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1032 nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1031 1033
1034out:
1032 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1035 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1033 if (likely(es_stats->es_stats_scan_time)) 1036 if (likely(es_stats->es_stats_scan_time))
1034 es_stats->es_stats_scan_time = (scan_time + 1037 es_stats->es_stats_scan_time = (scan_time +
@@ -1043,7 +1046,7 @@ retry:
1043 else 1046 else
1044 es_stats->es_stats_shrunk = nr_shrunk; 1047 es_stats->es_stats_shrunk = nr_shrunk;
1045 1048
1046 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, 1049 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1047 nr_skipped, retried); 1050 nr_skipped, retried);
1048 return nr_shrunk; 1051 return nr_shrunk;
1049} 1052}
@@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
1055 struct ext4_sb_info *sbi; 1058 struct ext4_sb_info *sbi;
1056 1059
1057 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); 1060 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1058 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1061 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1059 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); 1062 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1060 return nr; 1063 return nr;
1061} 1064}
@@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
1068 int nr_to_scan = sc->nr_to_scan; 1071 int nr_to_scan = sc->nr_to_scan;
1069 int ret, nr_shrunk; 1072 int ret, nr_shrunk;
1070 1073
1071 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1074 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1072 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1075 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1073 1076
1074 if (!nr_to_scan) 1077 if (!nr_to_scan)
1075 return ret; 1078 return ret;
1076 1079
1077 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); 1080 nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1078 1081
1079 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); 1082 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1080 return nr_shrunk; 1083 return nr_shrunk;
@@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1102 return 0; 1105 return 0;
1103 1106
1104 /* here we just find an inode that has the max nr. of objects */ 1107 /* here we just find an inode that has the max nr. of objects */
1105 spin_lock(&sbi->s_es_lru_lock); 1108 spin_lock(&sbi->s_es_lock);
1106 list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { 1109 list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1107 inode_cnt++; 1110 inode_cnt++;
1108 if (max && max->i_es_all_nr < ei->i_es_all_nr) 1111 if (max && max->i_es_all_nr < ei->i_es_all_nr)
1109 max = ei; 1112 max = ei;
1110 else if (!max) 1113 else if (!max)
1111 max = ei; 1114 max = ei;
1112 } 1115 }
1113 spin_unlock(&sbi->s_es_lru_lock); 1116 spin_unlock(&sbi->s_es_lock);
1114 1117
1115 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1118 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1116 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1119 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1117 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); 1120 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1118 seq_printf(seq, " %lu/%lu cache hits/misses\n", 1121 seq_printf(seq, " %lu/%lu cache hits/misses\n",
1119 es_stats->es_stats_cache_hits, 1122 es_stats->es_stats_cache_hits,
1120 es_stats->es_stats_cache_misses); 1123 es_stats->es_stats_cache_misses);
1121 if (es_stats->es_stats_last_sorted != 0)
1122 seq_printf(seq, " %u ms last sorted interval\n",
1123 jiffies_to_msecs(jiffies -
1124 es_stats->es_stats_last_sorted));
1125 if (inode_cnt) 1124 if (inode_cnt)
1126 seq_printf(seq, " %d inodes on lru list\n", inode_cnt); 1125 seq_printf(seq, " %d inodes on list\n", inode_cnt);
1127 1126
1128 seq_printf(seq, "average:\n %llu us scan time\n", 1127 seq_printf(seq, "average:\n %llu us scan time\n",
1129 div_u64(es_stats->es_stats_scan_time, 1000)); 1128 div_u64(es_stats->es_stats_scan_time, 1000));
@@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1132 seq_printf(seq, 1131 seq_printf(seq,
1133 "maximum:\n %lu inode (%u objects, %u reclaimable)\n" 1132 "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1134 " %llu us max scan time\n", 1133 " %llu us max scan time\n",
1135 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, 1134 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1136 div_u64(es_stats->es_stats_max_scan_time, 1000)); 1135 div_u64(es_stats->es_stats_max_scan_time, 1000));
1137 1136
1138 return 0; 1137 return 0;
@@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1181{ 1180{
1182 int err; 1181 int err;
1183 1182
1184 INIT_LIST_HEAD(&sbi->s_es_lru); 1183 /* Make sure we have enough bits for physical block number */
1185 spin_lock_init(&sbi->s_es_lru_lock); 1184 BUILD_BUG_ON(ES_SHIFT < 48);
1186 sbi->s_es_stats.es_stats_last_sorted = 0; 1185 INIT_LIST_HEAD(&sbi->s_es_list);
1186 sbi->s_es_nr_inode = 0;
1187 spin_lock_init(&sbi->s_es_lock);
1187 sbi->s_es_stats.es_stats_shrunk = 0; 1188 sbi->s_es_stats.es_stats_shrunk = 0;
1188 sbi->s_es_stats.es_stats_cache_hits = 0; 1189 sbi->s_es_stats.es_stats_cache_hits = 0;
1189 sbi->s_es_stats.es_stats_cache_misses = 0; 1190 sbi->s_es_stats.es_stats_cache_misses = 0;
@@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1192 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); 1193 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1193 if (err) 1194 if (err)
1194 return err; 1195 return err;
1195 err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); 1196 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1196 if (err) 1197 if (err)
1197 goto err1; 1198 goto err1;
1198 1199
@@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1210 return 0; 1211 return 0;
1211 1212
1212err2: 1213err2:
1213 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1214 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1214err1: 1215err1:
1215 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1216 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1216 return err; 1217 return err;
@@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1221 if (sbi->s_proc) 1222 if (sbi->s_proc)
1222 remove_proc_entry("es_shrinker_info", sbi->s_proc); 1223 remove_proc_entry("es_shrinker_info", sbi->s_proc);
1223 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1225 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1225 unregister_shrinker(&sbi->s_es_shrinker); 1226 unregister_shrinker(&sbi->s_es_shrinker);
1226} 1227}
1227 1228
1228void ext4_es_lru_add(struct inode *inode) 1229/*
1230 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1231 * most *nr_to_scan extents, update *nr_to_scan accordingly.
1232 *
1233 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1234 * Increment *nr_shrunk by the number of reclaimed extents. Also update
1235 * ei->i_es_shrink_lblk to where we should continue scanning.
1236 */
1237static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1238 int *nr_to_scan, int *nr_shrunk)
1229{ 1239{
1230 struct ext4_inode_info *ei = EXT4_I(inode); 1240 struct inode *inode = &ei->vfs_inode;
1231 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1241 struct ext4_es_tree *tree = &ei->i_es_tree;
1232 1242 struct extent_status *es;
1233 ei->i_touch_when = jiffies; 1243 struct rb_node *node;
1234
1235 if (!list_empty(&ei->i_es_lru))
1236 return;
1237 1244
1238 spin_lock(&sbi->s_es_lru_lock); 1245 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1239 if (list_empty(&ei->i_es_lru)) 1246 if (!es)
1240 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 1247 goto out_wrap;
1241 spin_unlock(&sbi->s_es_lru_lock); 1248 node = &es->rb_node;
1242} 1249 while (*nr_to_scan > 0) {
1250 if (es->es_lblk > end) {
1251 ei->i_es_shrink_lblk = end + 1;
1252 return 0;
1253 }
1243 1254
1244void ext4_es_lru_del(struct inode *inode) 1255 (*nr_to_scan)--;
1245{ 1256 node = rb_next(&es->rb_node);
1246 struct ext4_inode_info *ei = EXT4_I(inode); 1257 /*
1247 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1258 * We can't reclaim delayed extent from status tree because
1259 * fiemap, bigalloc, and seek_data/hole need to use it.
1260 */
1261 if (ext4_es_is_delayed(es))
1262 goto next;
1263 if (ext4_es_is_referenced(es)) {
1264 ext4_es_clear_referenced(es);
1265 goto next;
1266 }
1248 1267
1249 spin_lock(&sbi->s_es_lru_lock); 1268 rb_erase(&es->rb_node, &tree->root);
1250 if (!list_empty(&ei->i_es_lru)) 1269 ext4_es_free_extent(inode, es);
1251 list_del_init(&ei->i_es_lru); 1270 (*nr_shrunk)++;
1252 spin_unlock(&sbi->s_es_lru_lock); 1271next:
1272 if (!node)
1273 goto out_wrap;
1274 es = rb_entry(node, struct extent_status, rb_node);
1275 }
1276 ei->i_es_shrink_lblk = es->es_lblk;
1277 return 1;
1278out_wrap:
1279 ei->i_es_shrink_lblk = 0;
1280 return 0;
1253} 1281}
1254 1282
1255static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 1283static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1256 int nr_to_scan)
1257{ 1284{
1258 struct inode *inode = &ei->vfs_inode; 1285 struct inode *inode = &ei->vfs_inode;
1259 struct ext4_es_tree *tree = &ei->i_es_tree; 1286 int nr_shrunk = 0;
1260 struct rb_node *node; 1287 ext4_lblk_t start = ei->i_es_shrink_lblk;
1261 struct extent_status *es;
1262 unsigned long nr_shrunk = 0;
1263 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 1288 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1264 DEFAULT_RATELIMIT_BURST); 1289 DEFAULT_RATELIMIT_BURST);
1265 1290
1266 if (ei->i_es_lru_nr == 0) 1291 if (ei->i_es_shk_nr == 0)
1267 return 0; 1292 return 0;
1268 1293
1269 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && 1294 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1270 __ratelimit(&_rs)) 1295 __ratelimit(&_rs))
1271 ext4_warning(inode->i_sb, "forced shrink of precached extents"); 1296 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1272 1297
1273 node = rb_first(&tree->root); 1298 if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1274 while (node != NULL) { 1299 start != 0)
1275 es = rb_entry(node, struct extent_status, rb_node); 1300 es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1276 node = rb_next(&es->rb_node); 1301
1277 /* 1302 ei->i_es_tree.cache_es = NULL;
1278 * We can't reclaim delayed extent from status tree because
1279 * fiemap, bigalloc, and seek_data/hole need to use it.
1280 */
1281 if (!ext4_es_is_delayed(es)) {
1282 rb_erase(&es->rb_node, &tree->root);
1283 ext4_es_free_extent(inode, es);
1284 nr_shrunk++;
1285 if (--nr_to_scan == 0)
1286 break;
1287 }
1288 }
1289 tree->cache_es = NULL;
1290 return nr_shrunk; 1303 return nr_shrunk;
1291} 1304}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index efd5f970b501..691b52613ce4 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,25 +29,28 @@
29/* 29/*
30 * These flags live in the high bits of extent_status.es_pblk 30 * These flags live in the high bits of extent_status.es_pblk
31 */ 31 */
32#define ES_SHIFT 60 32enum {
33 33 ES_WRITTEN_B,
34#define EXTENT_STATUS_WRITTEN (1 << 3) 34 ES_UNWRITTEN_B,
35#define EXTENT_STATUS_UNWRITTEN (1 << 2) 35 ES_DELAYED_B,
36#define EXTENT_STATUS_DELAYED (1 << 1) 36 ES_HOLE_B,
37#define EXTENT_STATUS_HOLE (1 << 0) 37 ES_REFERENCED_B,
38 ES_FLAGS
39};
38 40
39#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ 41#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
40 EXTENT_STATUS_UNWRITTEN | \ 42#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
41 EXTENT_STATUS_DELAYED | \
42 EXTENT_STATUS_HOLE)
43 43
44#define ES_WRITTEN (1ULL << 63) 44#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
45#define ES_UNWRITTEN (1ULL << 62) 45#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
46#define ES_DELAYED (1ULL << 61) 46#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
47#define ES_HOLE (1ULL << 60) 47#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
48#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
48 49
49#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ 50#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
50 ES_DELAYED | ES_HOLE) 51 EXTENT_STATUS_UNWRITTEN | \
52 EXTENT_STATUS_DELAYED | \
53 EXTENT_STATUS_HOLE) << ES_SHIFT)
51 54
52struct ext4_sb_info; 55struct ext4_sb_info;
53struct ext4_extent; 56struct ext4_extent;
@@ -65,14 +68,13 @@ struct ext4_es_tree {
65}; 68};
66 69
67struct ext4_es_stats { 70struct ext4_es_stats {
68 unsigned long es_stats_last_sorted;
69 unsigned long es_stats_shrunk; 71 unsigned long es_stats_shrunk;
70 unsigned long es_stats_cache_hits; 72 unsigned long es_stats_cache_hits;
71 unsigned long es_stats_cache_misses; 73 unsigned long es_stats_cache_misses;
72 u64 es_stats_scan_time; 74 u64 es_stats_scan_time;
73 u64 es_stats_max_scan_time; 75 u64 es_stats_max_scan_time;
74 struct percpu_counter es_stats_all_cnt; 76 struct percpu_counter es_stats_all_cnt;
75 struct percpu_counter es_stats_lru_cnt; 77 struct percpu_counter es_stats_shk_cnt;
76}; 78};
77 79
78extern int __init ext4_init_es(void); 80extern int __init ext4_init_es(void);
@@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
93extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 95extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
94 struct extent_status *es); 96 struct extent_status *es);
95 97
98static inline unsigned int ext4_es_status(struct extent_status *es)
99{
100 return es->es_pblk >> ES_SHIFT;
101}
102
103static inline unsigned int ext4_es_type(struct extent_status *es)
104{
105 return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
106}
107
96static inline int ext4_es_is_written(struct extent_status *es) 108static inline int ext4_es_is_written(struct extent_status *es)
97{ 109{
98 return (es->es_pblk & ES_WRITTEN) != 0; 110 return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
99} 111}
100 112
101static inline int ext4_es_is_unwritten(struct extent_status *es) 113static inline int ext4_es_is_unwritten(struct extent_status *es)
102{ 114{
103 return (es->es_pblk & ES_UNWRITTEN) != 0; 115 return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
104} 116}
105 117
106static inline int ext4_es_is_delayed(struct extent_status *es) 118static inline int ext4_es_is_delayed(struct extent_status *es)
107{ 119{
108 return (es->es_pblk & ES_DELAYED) != 0; 120 return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
109} 121}
110 122
111static inline int ext4_es_is_hole(struct extent_status *es) 123static inline int ext4_es_is_hole(struct extent_status *es)
112{ 124{
113 return (es->es_pblk & ES_HOLE) != 0; 125 return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
114} 126}
115 127
116static inline unsigned int ext4_es_status(struct extent_status *es) 128static inline void ext4_es_set_referenced(struct extent_status *es)
117{ 129{
118 return es->es_pblk >> ES_SHIFT; 130 es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
131}
132
133static inline void ext4_es_clear_referenced(struct extent_status *es)
134{
135 es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
136}
137
138static inline int ext4_es_is_referenced(struct extent_status *es)
139{
140 return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
119} 141}
120 142
121static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) 143static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
@@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
135static inline void ext4_es_store_status(struct extent_status *es, 157static inline void ext4_es_store_status(struct extent_status *es,
136 unsigned int status) 158 unsigned int status)
137{ 159{
138 es->es_pblk = (((ext4_fsblk_t) 160 es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
139 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 161 (es->es_pblk & ~ES_MASK);
140 (es->es_pblk & ~ES_MASK));
141} 162}
142 163
143static inline void ext4_es_store_pblock_status(struct extent_status *es, 164static inline void ext4_es_store_pblock_status(struct extent_status *es,
144 ext4_fsblk_t pb, 165 ext4_fsblk_t pb,
145 unsigned int status) 166 unsigned int status)
146{ 167{
147 es->es_pblk = (((ext4_fsblk_t) 168 es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
148 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 169 (pb & ~ES_MASK);
149 (pb & ~ES_MASK));
150} 170}
151 171
152extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); 172extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
153extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 173extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
154extern void ext4_es_lru_add(struct inode *inode);
155extern void ext4_es_lru_del(struct inode *inode);
156 174
157#endif /* _EXT4_EXTENTS_STATUS_H */ 175#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..513c12cf444c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
273 * we determine this extent as a data or a hole according to whether the 273 * we determine this extent as a data or a hole according to whether the
274 * page cache has data or not. 274 * page cache has data or not.
275 */ 275 */
276static int ext4_find_unwritten_pgoff(struct inode *inode, 276static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
277 int whence, 277 loff_t endoff, loff_t *offset)
278 struct ext4_map_blocks *map,
279 loff_t *offset)
280{ 278{
281 struct pagevec pvec; 279 struct pagevec pvec;
282 unsigned int blkbits;
283 pgoff_t index; 280 pgoff_t index;
284 pgoff_t end; 281 pgoff_t end;
285 loff_t endoff;
286 loff_t startoff; 282 loff_t startoff;
287 loff_t lastoff; 283 loff_t lastoff;
288 int found = 0; 284 int found = 0;
289 285
290 blkbits = inode->i_sb->s_blocksize_bits;
291 startoff = *offset; 286 startoff = *offset;
292 lastoff = startoff; 287 lastoff = startoff;
293 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; 288
294 289
295 index = startoff >> PAGE_CACHE_SHIFT; 290 index = startoff >> PAGE_CACHE_SHIFT;
296 end = endoff >> PAGE_CACHE_SHIFT; 291 end = endoff >> PAGE_CACHE_SHIFT;
@@ -408,147 +403,144 @@ out:
408static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 403static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
409{ 404{
410 struct inode *inode = file->f_mapping->host; 405 struct inode *inode = file->f_mapping->host;
411 struct ext4_map_blocks map; 406 struct fiemap_extent_info fie;
412 struct extent_status es; 407 struct fiemap_extent ext[2];
413 ext4_lblk_t start, last, end; 408 loff_t next;
414 loff_t dataoff, isize; 409 int i, ret = 0;
415 int blkbits;
416 int ret = 0;
417 410
418 mutex_lock(&inode->i_mutex); 411 mutex_lock(&inode->i_mutex);
419 412 if (offset >= inode->i_size) {
420 isize = i_size_read(inode);
421 if (offset >= isize) {
422 mutex_unlock(&inode->i_mutex); 413 mutex_unlock(&inode->i_mutex);
423 return -ENXIO; 414 return -ENXIO;
424 } 415 }
425 416 fie.fi_flags = 0;
426 blkbits = inode->i_sb->s_blocksize_bits; 417 fie.fi_extents_max = 2;
427 start = offset >> blkbits; 418 fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
428 last = start; 419 while (1) {
429 end = isize >> blkbits; 420 mm_segment_t old_fs = get_fs();
430 dataoff = offset; 421
431 422 fie.fi_extents_mapped = 0;
432 do { 423 memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
433 map.m_lblk = last; 424
434 map.m_len = end - last + 1; 425 set_fs(get_ds());
435 ret = ext4_map_blocks(NULL, inode, &map, 0); 426 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
436 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 427 set_fs(old_fs);
437 if (last != start) 428 if (ret)
438 dataoff = (loff_t)last << blkbits;
439 break; 429 break;
440 }
441 430
442 /* 431 /* No extents found, EOF */
443 * If there is a delay extent at this offset, 432 if (!fie.fi_extents_mapped) {
444 * it will be as a data. 433 ret = -ENXIO;
445 */
446 ext4_es_find_delayed_extent_range(inode, last, last, &es);
447 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
448 if (last != start)
449 dataoff = (loff_t)last << blkbits;
450 break; 434 break;
451 } 435 }
436 for (i = 0; i < fie.fi_extents_mapped; i++) {
437 next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
452 438
453 /* 439 if (offset < (loff_t)ext[i].fe_logical)
454 * If there is a unwritten extent at this offset, 440 offset = (loff_t)ext[i].fe_logical;
455 * it will be as a data or a hole according to page 441 /*
456 * cache that has data or not. 442 * If extent is not unwritten, then it contains valid
457 */ 443 * data, mapped or delayed.
458 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 444 */
459 int unwritten; 445 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
460 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 446 goto out;
461 &map, &dataoff);
462 if (unwritten)
463 break;
464 }
465 447
466 last++; 448 /*
467 dataoff = (loff_t)last << blkbits; 449 * If there is a unwritten extent at this offset,
468 } while (last <= end); 450 * it will be as a data or a hole according to page
451 * cache that has data or not.
452 */
453 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
454 next, &offset))
455 goto out;
469 456
457 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
458 ret = -ENXIO;
459 goto out;
460 }
461 offset = next;
462 }
463 }
464 if (offset > inode->i_size)
465 offset = inode->i_size;
466out:
470 mutex_unlock(&inode->i_mutex); 467 mutex_unlock(&inode->i_mutex);
468 if (ret)
469 return ret;
471 470
472 if (dataoff > isize) 471 return vfs_setpos(file, offset, maxsize);
473 return -ENXIO;
474
475 return vfs_setpos(file, dataoff, maxsize);
476} 472}
477 473
478/* 474/*
479 * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 475 * ext4_seek_hole() retrieves the offset for SEEK_HOLE
480 */ 476 */
481static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 477static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
482{ 478{
483 struct inode *inode = file->f_mapping->host; 479 struct inode *inode = file->f_mapping->host;
484 struct ext4_map_blocks map; 480 struct fiemap_extent_info fie;
485 struct extent_status es; 481 struct fiemap_extent ext[2];
486 ext4_lblk_t start, last, end; 482 loff_t next;
487 loff_t holeoff, isize; 483 int i, ret = 0;
488 int blkbits;
489 int ret = 0;
490 484
491 mutex_lock(&inode->i_mutex); 485 mutex_lock(&inode->i_mutex);
492 486 if (offset >= inode->i_size) {
493 isize = i_size_read(inode);
494 if (offset >= isize) {
495 mutex_unlock(&inode->i_mutex); 487 mutex_unlock(&inode->i_mutex);
496 return -ENXIO; 488 return -ENXIO;
497 } 489 }
498 490
499 blkbits = inode->i_sb->s_blocksize_bits; 491 fie.fi_flags = 0;
500 start = offset >> blkbits; 492 fie.fi_extents_max = 2;
501 last = start; 493 fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
502 end = isize >> blkbits; 494 while (1) {
503 holeoff = offset; 495 mm_segment_t old_fs = get_fs();
504 496
505 do { 497 fie.fi_extents_mapped = 0;
506 map.m_lblk = last; 498 memset(ext, 0, sizeof(*ext));
507 map.m_len = end - last + 1;
508 ret = ext4_map_blocks(NULL, inode, &map, 0);
509 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
510 last += ret;
511 holeoff = (loff_t)last << blkbits;
512 continue;
513 }
514 499
515 /* 500 set_fs(get_ds());
516 * If there is a delay extent at this offset, 501 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
517 * we will skip this extent. 502 set_fs(old_fs);
518 */ 503 if (ret)
519 ext4_es_find_delayed_extent_range(inode, last, last, &es); 504 break;
520 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
521 last = es.es_lblk + es.es_len;
522 holeoff = (loff_t)last << blkbits;
523 continue;
524 }
525 505
526 /* 506 /* No extents found */
527 * If there is a unwritten extent at this offset, 507 if (!fie.fi_extents_mapped)
528 * it will be as a data or a hole according to page 508 break;
529 * cache that has data or not. 509
530 */ 510 for (i = 0; i < fie.fi_extents_mapped; i++) {
531 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 511 next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
532 int unwritten; 512 /*
533 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 513 * If extent is not unwritten, then it contains valid
534 &map, &holeoff); 514 * data, mapped or delayed.
535 if (!unwritten) { 515 */
536 last += ret; 516 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
537 holeoff = (loff_t)last << blkbits; 517 if (offset < (loff_t)ext[i].fe_logical)
518 goto out;
519 offset = next;
538 continue; 520 continue;
539 } 521 }
540 } 522 /*
541 523 * If there is a unwritten extent at this offset,
542 /* find a hole */ 524 * it will be as a data or a hole according to page
543 break; 525 * cache that has data or not.
544 } while (last <= end); 526 */
527 if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
528 next, &offset))
529 goto out;
545 530
531 offset = next;
532 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
533 goto out;
534 }
535 }
536 if (offset > inode->i_size)
537 offset = inode->i_size;
538out:
546 mutex_unlock(&inode->i_mutex); 539 mutex_unlock(&inode->i_mutex);
540 if (ret)
541 return ret;
547 542
548 if (holeoff > isize) 543 return vfs_setpos(file, offset, maxsize);
549 holeoff = isize;
550
551 return vfs_setpos(file, holeoff, maxsize);
552} 544}
553 545
554/* 546/*
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3ea62695abce..4b143febf21f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
811 ret = __block_write_begin(page, 0, inline_size, 811 ret = __block_write_begin(page, 0, inline_size,
812 ext4_da_get_block_prep); 812 ext4_da_get_block_prep);
813 if (ret) { 813 if (ret) {
814 up_read(&EXT4_I(inode)->xattr_sem);
815 unlock_page(page);
816 page_cache_release(page);
814 ext4_truncate_failed_write(inode); 817 ext4_truncate_failed_write(inode);
815 goto out; 818 return ret;
816 } 819 }
817 820
818 SetPageDirty(page); 821 SetPageDirty(page);
@@ -870,6 +873,12 @@ retry_journal:
870 goto out_journal; 873 goto out_journal;
871 } 874 }
872 875
876 /*
877 * We cannot recurse into the filesystem as the transaction
878 * is already started.
879 */
880 flags |= AOP_FLAG_NOFS;
881
873 if (ret == -ENOSPC) { 882 if (ret == -ENOSPC) {
874 ret = ext4_da_convert_inline_data_to_extent(mapping, 883 ret = ext4_da_convert_inline_data_to_extent(mapping,
875 inode, 884 inode,
@@ -882,11 +891,6 @@ retry_journal:
882 goto out; 891 goto out;
883 } 892 }
884 893
885 /*
886 * We cannot recurse into the filesystem as the transaction
887 * is already started.
888 */
889 flags |= AOP_FLAG_NOFS;
890 894
891 page = grab_cache_page_write_begin(mapping, 0, flags); 895 page = grab_cache_page_write_begin(mapping, 0, flags);
892 if (!page) { 896 if (!page) {
@@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
1807 1811
1808int ext4_inline_data_fiemap(struct inode *inode, 1812int ext4_inline_data_fiemap(struct inode *inode,
1809 struct fiemap_extent_info *fieinfo, 1813 struct fiemap_extent_info *fieinfo,
1810 int *has_inline) 1814 int *has_inline, __u64 start, __u64 len)
1811{ 1815{
1812 __u64 physical = 0; 1816 __u64 physical = 0;
1813 __u64 length; 1817 __u64 inline_len;
1814 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; 1818 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
1819 FIEMAP_EXTENT_LAST;
1815 int error = 0; 1820 int error = 0;
1816 struct ext4_iloc iloc; 1821 struct ext4_iloc iloc;
1817 1822
@@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode,
1820 *has_inline = 0; 1825 *has_inline = 0;
1821 goto out; 1826 goto out;
1822 } 1827 }
1828 inline_len = min_t(size_t, ext4_get_inline_size(inode),
1829 i_size_read(inode));
1830 if (start >= inline_len)
1831 goto out;
1832 if (start + len < inline_len)
1833 inline_len = start + len;
1834 inline_len -= start;
1823 1835
1824 error = ext4_get_inode_loc(inode, &iloc); 1836 error = ext4_get_inode_loc(inode, &iloc);
1825 if (error) 1837 if (error)
@@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode,
1828 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; 1840 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1829 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; 1841 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1830 physical += offsetof(struct ext4_inode, i_block); 1842 physical += offsetof(struct ext4_inode, i_block);
1831 length = i_size_read(inode);
1832 1843
1833 if (physical) 1844 if (physical)
1834 error = fiemap_fill_next_extent(fieinfo, 0, physical, 1845 error = fiemap_fill_next_extent(fieinfo, start, physical,
1835 length, flags); 1846 inline_len, flags);
1836 brelse(iloc.bh); 1847 brelse(iloc.bh);
1837out: 1848out:
1838 up_read(&EXT4_I(inode)->xattr_sem); 1849 up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3356ab5395f4..5653fa42930b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
416 } 416 }
417 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 417 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
418 up_read((&EXT4_I(inode)->i_data_sem)); 418 up_read((&EXT4_I(inode)->i_data_sem));
419 /*
420 * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
421 * because it shouldn't be marked in es_map->m_flags.
422 */
423 map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
424 419
425 /* 420 /*
426 * We don't check m_len because extent will be collpased in status 421 * We don't check m_len because extent will be collpased in status
@@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
491 486
492 /* Lookup extent status tree firstly */ 487 /* Lookup extent status tree firstly */
493 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 488 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
494 ext4_es_lru_add(inode);
495 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 489 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
496 map->m_pblk = ext4_es_pblock(&es) + 490 map->m_pblk = ext4_es_pblock(&es) +
497 map->m_lblk - es.es_lblk; 491 map->m_lblk - es.es_lblk;
@@ -1393,7 +1387,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1393 1387
1394 /* Lookup extent status tree firstly */ 1388 /* Lookup extent status tree firstly */
1395 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1389 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1396 ext4_es_lru_add(inode);
1397 if (ext4_es_is_hole(&es)) { 1390 if (ext4_es_is_hole(&es)) {
1398 retval = 0; 1391 retval = 0;
1399 down_read(&EXT4_I(inode)->i_data_sem); 1392 down_read(&EXT4_I(inode)->i_data_sem);
@@ -1434,24 +1427,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1434 * file system block. 1427 * file system block.
1435 */ 1428 */
1436 down_read(&EXT4_I(inode)->i_data_sem); 1429 down_read(&EXT4_I(inode)->i_data_sem);
1437 if (ext4_has_inline_data(inode)) { 1430 if (ext4_has_inline_data(inode))
1438 /*
1439 * We will soon create blocks for this page, and let
1440 * us pretend as if the blocks aren't allocated yet.
1441 * In case of clusters, we have to handle the work
1442 * of mapping from cluster so that the reserved space
1443 * is calculated properly.
1444 */
1445 if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
1446 ext4_find_delalloc_cluster(inode, map->m_lblk))
1447 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
1448 retval = 0; 1431 retval = 0;
1449 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1432 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1450 retval = ext4_ext_map_blocks(NULL, inode, map, 1433 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1451 EXT4_GET_BLOCKS_NO_PUT_HOLE);
1452 else 1434 else
1453 retval = ext4_ind_map_blocks(NULL, inode, map, 1435 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1454 EXT4_GET_BLOCKS_NO_PUT_HOLE);
1455 1436
1456add_delayed: 1437add_delayed:
1457 if (retval == 0) { 1438 if (retval == 0) {
@@ -1465,7 +1446,8 @@ add_delayed:
1465 * then we don't need to reserve it again. However we still need 1446 * then we don't need to reserve it again. However we still need
1466 * to reserve metadata for every block we're going to write. 1447 * to reserve metadata for every block we're going to write.
1467 */ 1448 */
1468 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { 1449 if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
1450 !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
1469 ret = ext4_da_reserve_space(inode, iblock); 1451 ret = ext4_da_reserve_space(inode, iblock);
1470 if (ret) { 1452 if (ret) {
1471 /* not enough space to reserve */ 1453 /* not enough space to reserve */
@@ -1481,11 +1463,6 @@ add_delayed:
1481 goto out_unlock; 1463 goto out_unlock;
1482 } 1464 }
1483 1465
1484 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1485 * and it should not appear on the bh->b_state.
1486 */
1487 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1488
1489 map_bh(bh, inode->i_sb, invalid_block); 1466 map_bh(bh, inode->i_sb, invalid_block);
1490 set_buffer_new(bh); 1467 set_buffer_new(bh);
1491 set_buffer_delay(bh); 1468 set_buffer_delay(bh);
@@ -3643,7 +3620,7 @@ out_stop:
3643 * If this was a simple ftruncate() and the file will remain alive, 3620 * If this was a simple ftruncate() and the file will remain alive,
3644 * then we need to clear up the orphan record which we created above. 3621 * then we need to clear up the orphan record which we created above.
3645 * However, if this was a real unlink then we were called by 3622 * However, if this was a real unlink then we were called by
3646 * ext4_delete_inode(), and we allow that function to clean up the 3623 * ext4_evict_inode(), and we allow that function to clean up the
3647 * orphan info for us. 3624 * orphan info for us.
3648 */ 3625 */
3649 if (inode->i_nlink) 3626 if (inode->i_nlink)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfda18a15592..f58a0d106726 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
78 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 78 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
79 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); 79 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
80 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); 80 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
81 ext4_es_lru_del(inode1);
82 ext4_es_lru_del(inode2);
83 81
84 isize = i_size_read(inode1); 82 isize = i_size_read(inode1);
85 i_size_write(inode1, i_size_read(inode2)); 83 i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbfe15c2533c..8d1e60214ef0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2358 if (sbi->s_group_info) { 2358 if (sbi->s_group_info) {
2359 memcpy(new_groupinfo, sbi->s_group_info, 2359 memcpy(new_groupinfo, sbi->s_group_info,
2360 sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 2360 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2361 ext4_kvfree(sbi->s_group_info); 2361 kvfree(sbi->s_group_info);
2362 } 2362 }
2363 sbi->s_group_info = new_groupinfo; 2363 sbi->s_group_info = new_groupinfo;
2364 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 2364 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
@@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2385 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2385 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2386 metalen = sizeof(*meta_group_info) << 2386 metalen = sizeof(*meta_group_info) <<
2387 EXT4_DESC_PER_BLOCK_BITS(sb); 2387 EXT4_DESC_PER_BLOCK_BITS(sb);
2388 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2388 meta_group_info = kmalloc(metalen, GFP_NOFS);
2389 if (meta_group_info == NULL) { 2389 if (meta_group_info == NULL) {
2390 ext4_msg(sb, KERN_ERR, "can't allocate mem " 2390 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2391 "for a buddy group"); 2391 "for a buddy group");
@@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2399 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2399 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2400 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2400 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2401 2401
2402 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); 2402 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2403 if (meta_group_info[i] == NULL) { 2403 if (meta_group_info[i] == NULL) {
2404 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2404 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2405 goto exit_group_info; 2405 goto exit_group_info;
@@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2428 { 2428 {
2429 struct buffer_head *bh; 2429 struct buffer_head *bh;
2430 meta_group_info[i]->bb_bitmap = 2430 meta_group_info[i]->bb_bitmap =
2431 kmalloc(sb->s_blocksize, GFP_KERNEL); 2431 kmalloc(sb->s_blocksize, GFP_NOFS);
2432 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2432 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2433 bh = ext4_read_block_bitmap(sb, group); 2433 bh = ext4_read_block_bitmap(sb, group);
2434 BUG_ON(bh == NULL); 2434 BUG_ON(bh == NULL);
@@ -2495,7 +2495,7 @@ err_freebuddy:
2495 kfree(sbi->s_group_info[i]); 2495 kfree(sbi->s_group_info[i]);
2496 iput(sbi->s_buddy_cache); 2496 iput(sbi->s_buddy_cache);
2497err_freesgi: 2497err_freesgi:
2498 ext4_kvfree(sbi->s_group_info); 2498 kvfree(sbi->s_group_info);
2499 return -ENOMEM; 2499 return -ENOMEM;
2500} 2500}
2501 2501
@@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb)
2708 EXT4_DESC_PER_BLOCK_BITS(sb); 2708 EXT4_DESC_PER_BLOCK_BITS(sb);
2709 for (i = 0; i < num_meta_group_infos; i++) 2709 for (i = 0; i < num_meta_group_infos; i++)
2710 kfree(sbi->s_group_info[i]); 2710 kfree(sbi->s_group_info[i]);
2711 ext4_kvfree(sbi->s_group_info); 2711 kvfree(sbi->s_group_info);
2712 } 2712 }
2713 kfree(sbi->s_mb_offsets); 2713 kfree(sbi->s_mb_offsets);
2714 kfree(sbi->s_mb_maxs); 2714 kfree(sbi->s_mb_maxs);
2715 if (sbi->s_buddy_cache) 2715 iput(sbi->s_buddy_cache);
2716 iput(sbi->s_buddy_cache);
2717 if (sbi->s_mb_stats) { 2716 if (sbi->s_mb_stats) {
2718 ext4_msg(sb, KERN_INFO, 2717 ext4_msg(sb, KERN_INFO,
2719 "mballoc: %u blocks %u reqs (%u success)", 2718 "mballoc: %u blocks %u reqs (%u success)",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a432634f2e6a..3cb267aee802 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -592,7 +592,7 @@ err_out:
592 592
593 /* 593 /*
594 * set the i_blocks count to zero 594 * set the i_blocks count to zero
595 * so that the ext4_delete_inode does the 595 * so that the ext4_evict_inode() does the
596 * right job 596 * right job
597 * 597 *
598 * We don't need to take the i_lock because 598 * We don't need to take the i_lock because
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 9f2311bc9c4f..503ea15dc5db 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -273,6 +273,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
273 int replaced_count = 0; 273 int replaced_count = 0;
274 int from = data_offset_in_page << orig_inode->i_blkbits; 274 int from = data_offset_in_page << orig_inode->i_blkbits;
275 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 275 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
276 struct super_block *sb = orig_inode->i_sb;
276 277
277 /* 278 /*
278 * It needs twice the amount of ordinary journal buffers because 279 * It needs twice the amount of ordinary journal buffers because
@@ -405,10 +406,13 @@ unlock_pages:
405 page_cache_release(pagep[1]); 406 page_cache_release(pagep[1]);
406stop_journal: 407stop_journal:
407 ext4_journal_stop(handle); 408 ext4_journal_stop(handle);
409 if (*err == -ENOSPC &&
410 ext4_should_retry_alloc(sb, &retries))
411 goto again;
408 /* Buffer was busy because probably is pinned to journal transaction, 412 /* Buffer was busy because probably is pinned to journal transaction,
409 * force transaction commit may help to free it. */ 413 * force transaction commit may help to free it. */
410 if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, 414 if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
411 &retries)) 415 jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
412 goto again; 416 goto again;
413 return replaced_count; 417 return replaced_count;
414 418
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 426211882f72..2291923dae4e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2814,7 +2814,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2814 ext4_orphan_add(handle, inode); 2814 ext4_orphan_add(handle, inode);
2815 inode->i_ctime = ext4_current_time(inode); 2815 inode->i_ctime = ext4_current_time(inode);
2816 ext4_mark_inode_dirty(handle, inode); 2816 ext4_mark_inode_dirty(handle, inode);
2817 retval = 0;
2818 2817
2819end_unlink: 2818end_unlink:
2820 brelse(bh); 2819 brelse(bh);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca4588388fc3..bf76f405a5f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
856 n_group_desc[gdb_num] = gdb_bh; 856 n_group_desc[gdb_num] = gdb_bh;
857 EXT4_SB(sb)->s_group_desc = n_group_desc; 857 EXT4_SB(sb)->s_group_desc = n_group_desc;
858 EXT4_SB(sb)->s_gdb_count++; 858 EXT4_SB(sb)->s_gdb_count++;
859 ext4_kvfree(o_group_desc); 859 kvfree(o_group_desc);
860 860
861 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 861 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
862 err = ext4_handle_dirty_super(handle, sb); 862 err = ext4_handle_dirty_super(handle, sb);
@@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
866 return err; 866 return err;
867 867
868exit_inode: 868exit_inode:
869 ext4_kvfree(n_group_desc); 869 kvfree(n_group_desc);
870 brelse(iloc.bh); 870 brelse(iloc.bh);
871exit_dind: 871exit_dind:
872 brelse(dind); 872 brelse(dind);
@@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
909 n_group_desc[gdb_num] = gdb_bh; 909 n_group_desc[gdb_num] = gdb_bh;
910 EXT4_SB(sb)->s_group_desc = n_group_desc; 910 EXT4_SB(sb)->s_group_desc = n_group_desc;
911 EXT4_SB(sb)->s_gdb_count++; 911 EXT4_SB(sb)->s_gdb_count++;
912 ext4_kvfree(o_group_desc); 912 kvfree(o_group_desc);
913 BUFFER_TRACE(gdb_bh, "get_write_access"); 913 BUFFER_TRACE(gdb_bh, "get_write_access");
914 err = ext4_journal_get_write_access(handle, gdb_bh); 914 err = ext4_journal_get_write_access(handle, gdb_bh);
915 if (unlikely(err)) 915 if (unlikely(err))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 63e802b8ec68..43c92b1685cb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
176 return ret; 176 return ret;
177} 177}
178 178
179void ext4_kvfree(void *ptr)
180{
181 if (is_vmalloc_addr(ptr))
182 vfree(ptr);
183 else
184 kfree(ptr);
185
186}
187
188ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 179ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
189 struct ext4_group_desc *bg) 180 struct ext4_group_desc *bg)
190{ 181{
@@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb)
811 802
812 for (i = 0; i < sbi->s_gdb_count; i++) 803 for (i = 0; i < sbi->s_gdb_count; i++)
813 brelse(sbi->s_group_desc[i]); 804 brelse(sbi->s_group_desc[i]);
814 ext4_kvfree(sbi->s_group_desc); 805 kvfree(sbi->s_group_desc);
815 ext4_kvfree(sbi->s_flex_groups); 806 kvfree(sbi->s_flex_groups);
816 percpu_counter_destroy(&sbi->s_freeclusters_counter); 807 percpu_counter_destroy(&sbi->s_freeclusters_counter);
817 percpu_counter_destroy(&sbi->s_freeinodes_counter); 808 percpu_counter_destroy(&sbi->s_freeinodes_counter);
818 percpu_counter_destroy(&sbi->s_dirs_counter); 809 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
880 spin_lock_init(&ei->i_prealloc_lock); 871 spin_lock_init(&ei->i_prealloc_lock);
881 ext4_es_init_tree(&ei->i_es_tree); 872 ext4_es_init_tree(&ei->i_es_tree);
882 rwlock_init(&ei->i_es_lock); 873 rwlock_init(&ei->i_es_lock);
883 INIT_LIST_HEAD(&ei->i_es_lru); 874 INIT_LIST_HEAD(&ei->i_es_list);
884 ei->i_es_all_nr = 0; 875 ei->i_es_all_nr = 0;
885 ei->i_es_lru_nr = 0; 876 ei->i_es_shk_nr = 0;
886 ei->i_touch_when = 0; 877 ei->i_es_shrink_lblk = 0;
887 ei->i_reserved_data_blocks = 0; 878 ei->i_reserved_data_blocks = 0;
888 ei->i_reserved_meta_blocks = 0; 879 ei->i_reserved_meta_blocks = 0;
889 ei->i_allocated_meta_blocks = 0; 880 ei->i_allocated_meta_blocks = 0;
@@ -973,7 +964,6 @@ void ext4_clear_inode(struct inode *inode)
973 dquot_drop(inode); 964 dquot_drop(inode);
974 ext4_discard_preallocations(inode); 965 ext4_discard_preallocations(inode);
975 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 966 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
976 ext4_es_lru_del(inode);
977 if (EXT4_I(inode)->jinode) { 967 if (EXT4_I(inode)->jinode) {
978 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 968 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
979 EXT4_I(inode)->jinode); 969 EXT4_I(inode)->jinode);
@@ -1153,7 +1143,7 @@ enum {
1153 Opt_inode_readahead_blks, Opt_journal_ioprio, 1143 Opt_inode_readahead_blks, Opt_journal_ioprio,
1154 Opt_dioread_nolock, Opt_dioread_lock, 1144 Opt_dioread_nolock, Opt_dioread_lock,
1155 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1145 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1156 Opt_max_dir_size_kb, 1146 Opt_max_dir_size_kb, Opt_nojournal_checksum,
1157}; 1147};
1158 1148
1159static const match_table_t tokens = { 1149static const match_table_t tokens = {
@@ -1187,6 +1177,7 @@ static const match_table_t tokens = {
1187 {Opt_journal_dev, "journal_dev=%u"}, 1177 {Opt_journal_dev, "journal_dev=%u"},
1188 {Opt_journal_path, "journal_path=%s"}, 1178 {Opt_journal_path, "journal_path=%s"},
1189 {Opt_journal_checksum, "journal_checksum"}, 1179 {Opt_journal_checksum, "journal_checksum"},
1180 {Opt_nojournal_checksum, "nojournal_checksum"},
1190 {Opt_journal_async_commit, "journal_async_commit"}, 1181 {Opt_journal_async_commit, "journal_async_commit"},
1191 {Opt_abort, "abort"}, 1182 {Opt_abort, "abort"},
1192 {Opt_data_journal, "data=journal"}, 1183 {Opt_data_journal, "data=journal"},
@@ -1368,6 +1359,8 @@ static const struct mount_opts {
1368 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1359 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1369 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1360 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1370 MOPT_EXT4_ONLY | MOPT_CLEAR}, 1361 MOPT_EXT4_ONLY | MOPT_CLEAR},
1362 {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1363 MOPT_EXT4_ONLY | MOPT_CLEAR},
1371 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1364 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1372 MOPT_EXT4_ONLY | MOPT_SET}, 1365 MOPT_EXT4_ONLY | MOPT_SET},
1373 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1366 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1709,6 +1702,12 @@ static int parse_options(char *options, struct super_block *sb,
1709 return 0; 1702 return 0;
1710 } 1703 }
1711 } 1704 }
1705 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
1706 test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
1707 ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
1708 "in data=ordered mode");
1709 return 0;
1710 }
1712 return 1; 1711 return 1;
1713} 1712}
1714 1713
@@ -1946,7 +1945,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1946 memcpy(new_groups, sbi->s_flex_groups, 1945 memcpy(new_groups, sbi->s_flex_groups,
1947 (sbi->s_flex_groups_allocated * 1946 (sbi->s_flex_groups_allocated *
1948 sizeof(struct flex_groups))); 1947 sizeof(struct flex_groups)));
1949 ext4_kvfree(sbi->s_flex_groups); 1948 kvfree(sbi->s_flex_groups);
1950 } 1949 }
1951 sbi->s_flex_groups = new_groups; 1950 sbi->s_flex_groups = new_groups;
1952 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); 1951 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
@@ -3317,7 +3316,7 @@ int ext4_calculate_overhead(struct super_block *sb)
3317 struct ext4_super_block *es = sbi->s_es; 3316 struct ext4_super_block *es = sbi->s_es;
3318 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 3317 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3319 ext4_fsblk_t overhead = 0; 3318 ext4_fsblk_t overhead = 0;
3320 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3319 char *buf = (char *) get_zeroed_page(GFP_NOFS);
3321 3320
3322 if (!buf) 3321 if (!buf)
3323 return -ENOMEM; 3322 return -ENOMEM;
@@ -3345,8 +3344,8 @@ int ext4_calculate_overhead(struct super_block *sb)
3345 memset(buf, 0, PAGE_SIZE); 3344 memset(buf, 0, PAGE_SIZE);
3346 cond_resched(); 3345 cond_resched();
3347 } 3346 }
3348 /* Add the journal blocks as well */ 3347 /* Add the internal journal blocks as well */
3349 if (sbi->s_journal) 3348 if (sbi->s_journal && !sbi->journal_bdev)
3350 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); 3349 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3351 3350
3352 sbi->s_overhead = overhead; 3351 sbi->s_overhead = overhead;
@@ -4232,7 +4231,7 @@ failed_mount7:
4232failed_mount6: 4231failed_mount6:
4233 ext4_mb_release(sb); 4232 ext4_mb_release(sb);
4234 if (sbi->s_flex_groups) 4233 if (sbi->s_flex_groups)
4235 ext4_kvfree(sbi->s_flex_groups); 4234 kvfree(sbi->s_flex_groups);
4236 percpu_counter_destroy(&sbi->s_freeclusters_counter); 4235 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4237 percpu_counter_destroy(&sbi->s_freeinodes_counter); 4236 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4238 percpu_counter_destroy(&sbi->s_dirs_counter); 4237 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -4261,7 +4260,7 @@ failed_mount3:
4261failed_mount2: 4260failed_mount2:
4262 for (i = 0; i < db_count; i++) 4261 for (i = 0; i < db_count; i++)
4263 brelse(sbi->s_group_desc[i]); 4262 brelse(sbi->s_group_desc[i]);
4264 ext4_kvfree(sbi->s_group_desc); 4263 kvfree(sbi->s_group_desc);
4265failed_mount: 4264failed_mount:
4266 if (sbi->s_chksum_driver) 4265 if (sbi->s_chksum_driver)
4267 crypto_free_shash(sbi->s_chksum_driver); 4266 crypto_free_shash(sbi->s_chksum_driver);
@@ -4862,6 +4861,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4862 goto restore_opts; 4861 goto restore_opts;
4863 } 4862 }
4864 4863
4864 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
4865 test_opt(sb, JOURNAL_CHECKSUM)) {
4866 ext4_msg(sb, KERN_ERR, "changing journal_checksum "
4867 "during remount not supported");
4868 err = -EINVAL;
4869 goto restore_opts;
4870 }
4871
4865 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4872 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4866 if (test_opt2(sb, EXPLICIT_DELALLOC)) { 4873 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4867 ext4_msg(sb, KERN_ERR, "can't mount with " 4874 ext4_msg(sb, KERN_ERR, "can't mount with "
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1df94fabe4eb..b96bd8076b70 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1714,8 +1714,7 @@ int jbd2_journal_destroy(journal_t *journal)
1714 1714
1715 if (journal->j_proc_entry) 1715 if (journal->j_proc_entry)
1716 jbd2_stats_proc_exit(journal); 1716 jbd2_stats_proc_exit(journal);
1717 if (journal->j_inode) 1717 iput(journal->j_inode);
1718 iput(journal->j_inode);
1719 if (journal->j_revoke) 1718 if (journal->j_revoke)
1720 jbd2_journal_destroy_revoke(journal); 1719 jbd2_journal_destroy_revoke(journal);
1721 if (journal->j_chksum_driver) 1720 if (journal->j_chksum_driver)
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index ff4bd1b35246..6cfb841fea7c 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -43,15 +43,13 @@ struct extent_status;
43 { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ 43 { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \
44 { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ 44 { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \
45 { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ 45 { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \
46 { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }, \ 46 { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" })
47 { EXT4_GET_BLOCKS_NO_PUT_HOLE, "NO_PUT_HOLE" })
48 47
49#define show_mflags(flags) __print_flags(flags, "", \ 48#define show_mflags(flags) __print_flags(flags, "", \
50 { EXT4_MAP_NEW, "N" }, \ 49 { EXT4_MAP_NEW, "N" }, \
51 { EXT4_MAP_MAPPED, "M" }, \ 50 { EXT4_MAP_MAPPED, "M" }, \
52 { EXT4_MAP_UNWRITTEN, "U" }, \ 51 { EXT4_MAP_UNWRITTEN, "U" }, \
53 { EXT4_MAP_BOUNDARY, "B" }, \ 52 { EXT4_MAP_BOUNDARY, "B" })
54 { EXT4_MAP_FROM_CLUSTER, "C" })
55 53
56#define show_free_flags(flags) __print_flags(flags, "|", \ 54#define show_free_flags(flags) __print_flags(flags, "|", \
57 { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ 55 { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \
@@ -2452,15 +2450,14 @@ TRACE_EVENT(ext4_collapse_range,
2452 2450
2453TRACE_EVENT(ext4_es_shrink, 2451TRACE_EVENT(ext4_es_shrink,
2454 TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, 2452 TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
2455 int skip_precached, int nr_skipped, int retried), 2453 int nr_skipped, int retried),
2456 2454
2457 TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), 2455 TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),
2458 2456
2459 TP_STRUCT__entry( 2457 TP_STRUCT__entry(
2460 __field( dev_t, dev ) 2458 __field( dev_t, dev )
2461 __field( int, nr_shrunk ) 2459 __field( int, nr_shrunk )
2462 __field( unsigned long long, scan_time ) 2460 __field( unsigned long long, scan_time )
2463 __field( int, skip_precached )
2464 __field( int, nr_skipped ) 2461 __field( int, nr_skipped )
2465 __field( int, retried ) 2462 __field( int, retried )
2466 ), 2463 ),
@@ -2469,16 +2466,14 @@ TRACE_EVENT(ext4_es_shrink,
2469 __entry->dev = sb->s_dev; 2466 __entry->dev = sb->s_dev;
2470 __entry->nr_shrunk = nr_shrunk; 2467 __entry->nr_shrunk = nr_shrunk;
2471 __entry->scan_time = div_u64(scan_time, 1000); 2468 __entry->scan_time = div_u64(scan_time, 1000);
2472 __entry->skip_precached = skip_precached;
2473 __entry->nr_skipped = nr_skipped; 2469 __entry->nr_skipped = nr_skipped;
2474 __entry->retried = retried; 2470 __entry->retried = retried;
2475 ), 2471 ),
2476 2472
2477 TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " 2473 TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
2478 "nr_skipped %d retried %d", 2474 "nr_skipped %d retried %d",
2479 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, 2475 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
2480 __entry->scan_time, __entry->skip_precached, 2476 __entry->scan_time, __entry->nr_skipped, __entry->retried)
2481 __entry->nr_skipped, __entry->retried)
2482); 2477);
2483 2478
2484#endif /* _TRACE_EXT4_H */ 2479#endif /* _TRACE_EXT4_H */