aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/ext4/ext4.h17
-rw-r--r--fs/ext4/extents.c73
-rw-r--r--fs/ext4/extents_status.c72
-rw-r--r--fs/ext4/ioctl.c3
-rw-r--r--include/uapi/linux/fiemap.h1
5 files changed, 137 insertions, 29 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c74b1948feb0..635135e6148e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -561,15 +561,16 @@ enum {
561#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 561#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
562 562
563/* 563/*
564 * The bit position of this flag must not overlap with any of the 564 * The bit position of these flags must not overlap with any of the
565 * EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(), 565 * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
566 * read_extent_tree_block(), ext4_split_extent_at(), 566 * read_extent_tree_block(), ext4_split_extent_at(),
567 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to 567 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
568 * indicate that the we shouldn't be caching the extents when reading 568 * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
569 * from the extent tree while a truncate or punch hole operation 569 * caching the extents when reading from the extent tree while a
570 * is in progress. 570 * truncate or punch hole operation is in progress.
571 */ 571 */
572#define EXT4_EX_NOCACHE 0x0400 572#define EXT4_EX_NOCACHE 0x0400
573#define EXT4_EX_FORCE_CACHE 0x0800
573 574
574/* 575/*
575 * Flags used by ext4_free_blocks 576 * Flags used by ext4_free_blocks
@@ -601,6 +602,7 @@ enum {
601#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 602#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
602#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 603#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
603#define EXT4_IOC_SWAP_BOOT _IO('f', 17) 604#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
605#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
604 606
605#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 607#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
606/* 608/*
@@ -1386,6 +1388,7 @@ enum {
1386 nolocking */ 1388 nolocking */
1387 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 1389 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
1388 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ 1390 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
1391 EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
1389}; 1392};
1390 1393
1391#define EXT4_INODE_BIT_FNS(name, field, offset) \ 1394#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2705,7 +2708,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
2705extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 2708extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2706extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2709extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2707 __u64 start, __u64 len); 2710 __u64 start, __u64 len);
2708 2711extern int ext4_ext_precache(struct inode *inode);
2709 2712
2710/* move_extent.c */ 2713/* move_extent.c */
2711extern void ext4_double_down_write_data_sem(struct inode *first, 2714extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 08c1ac976479..01838875fcaf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
482 if (err < 0) 482 if (err < 0)
483 goto errout; 483 goto errout;
484 } 484 }
485 if (buffer_verified(bh)) 485 if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
486 return bh; 486 return bh;
487 err = __ext4_ext_check(function, line, inode, 487 err = __ext4_ext_check(function, line, inode,
488 ext_block_hdr(bh), depth, pblk); 488 ext_block_hdr(bh), depth, pblk);
@@ -526,6 +526,71 @@ errout:
526 __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ 526 __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
527 (depth), (flags)) 527 (depth), (flags))
528 528
529/*
530 * This function is called to cache a file's extent information in the
531 * extent status tree
532 */
533int ext4_ext_precache(struct inode *inode)
534{
535 struct ext4_inode_info *ei = EXT4_I(inode);
536 struct ext4_ext_path *path = NULL;
537 struct buffer_head *bh;
538 int i = 0, depth, ret = 0;
539
540 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
541 return 0; /* not an extent-mapped inode */
542
543 down_read(&ei->i_data_sem);
544 depth = ext_depth(inode);
545
546 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
547 GFP_NOFS);
548 if (path == NULL) {
549 up_read(&ei->i_data_sem);
550 return -ENOMEM;
551 }
552
553 /* Don't cache anything if there are no external extent blocks */
554 if (depth == 0)
555 goto out;
556 path[0].p_hdr = ext_inode_hdr(inode);
557 ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
558 if (ret)
559 goto out;
560 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
561 while (i >= 0) {
562 /*
563 * If this is a leaf block or we've reached the end of
564 * the index block, go up
565 */
566 if ((i == depth) ||
567 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
568 brelse(path[i].p_bh);
569 path[i].p_bh = NULL;
570 i--;
571 continue;
572 }
573 bh = read_extent_tree_block(inode,
574 ext4_idx_pblock(path[i].p_idx++),
575 depth - i - 1,
576 EXT4_EX_FORCE_CACHE);
577 if (IS_ERR(bh)) {
578 ret = PTR_ERR(bh);
579 break;
580 }
581 i++;
582 path[i].p_bh = bh;
583 path[i].p_hdr = ext_block_hdr(bh);
584 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
585 }
586 ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
587out:
588 up_read(&ei->i_data_sem);
589 ext4_ext_drop_refs(path);
590 kfree(path);
591 return ret;
592}
593
529#ifdef EXT_DEBUG 594#ifdef EXT_DEBUG
530static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 595static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
531{ 596{
@@ -4766,6 +4831,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4766 return error; 4831 return error;
4767 } 4832 }
4768 4833
4834 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4835 error = ext4_ext_precache(inode);
4836 if (error)
4837 return error;
4838 }
4839
4769 /* fallback to generic here if not in extents fmt */ 4840 /* fallback to generic here if not in extents fmt */
4770 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4841 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4771 return generic_block_fiemap(inode, fieinfo, start, len, 4842 return generic_block_fiemap(inode, fieinfo, start, len,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 1dc5df016e25..0e88a367b535 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
710 write_lock(&EXT4_I(inode)->i_es_lock); 710 write_lock(&EXT4_I(inode)->i_es_lock);
711 711
712 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); 712 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
713 if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end))) 713 if (!es || es->es_lblk > end)
714 goto out; 714 __es_insert_extent(inode, &newes);
715
716 __es_insert_extent(inode, &newes);
717out:
718 write_unlock(&EXT4_I(inode)->i_es_lock); 715 write_unlock(&EXT4_I(inode)->i_es_lock);
719} 716}
720 717
@@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
930 eia = list_entry(a, struct ext4_inode_info, i_es_lru); 927 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
931 eib = list_entry(b, struct ext4_inode_info, i_es_lru); 928 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
932 929
930 if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
931 !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
932 return 1;
933 if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
934 ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
935 return -1;
933 if (eia->i_touch_when == eib->i_touch_when) 936 if (eia->i_touch_when == eib->i_touch_when)
934 return 0; 937 return 0;
935 if (time_after(eia->i_touch_when, eib->i_touch_when)) 938 if (time_after(eia->i_touch_when, eib->i_touch_when))
@@ -943,21 +946,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
943{ 946{
944 struct ext4_inode_info *ei; 947 struct ext4_inode_info *ei;
945 struct list_head *cur, *tmp; 948 struct list_head *cur, *tmp;
946 LIST_HEAD(skiped); 949 LIST_HEAD(skipped);
947 int ret, nr_shrunk = 0; 950 int ret, nr_shrunk = 0;
951 int retried = 0, skip_precached = 1, nr_skipped = 0;
948 952
949 spin_lock(&sbi->s_es_lru_lock); 953 spin_lock(&sbi->s_es_lru_lock);
950 954
951 /* 955retry:
952 * If the inode that is at the head of LRU list is newer than
953 * last_sorted time, that means that we need to sort this list.
954 */
955 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
956 if (sbi->s_es_last_sorted < ei->i_touch_when) {
957 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
958 sbi->s_es_last_sorted = jiffies;
959 }
960
961 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 956 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
962 /* 957 /*
963 * If we have already reclaimed all extents from extent 958 * If we have already reclaimed all extents from extent
@@ -968,9 +963,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
968 963
969 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 964 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
970 965
971 /* Skip the inode that is newer than the last_sorted time */ 966 /*
972 if (sbi->s_es_last_sorted < ei->i_touch_when) { 967 * Skip the inode that is newer than the last_sorted
973 list_move_tail(cur, &skiped); 968 * time. Normally we try hard to avoid shrinking
969 * precached inodes, but we will as a last resort.
970 */
971 if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
972 (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
973 EXT4_STATE_EXT_PRECACHED))) {
974 nr_skipped++;
975 list_move_tail(cur, &skipped);
974 continue; 976 continue;
975 } 977 }
976 978
@@ -990,11 +992,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
990 } 992 }
991 993
992 /* Move the newer inodes into the tail of the LRU list. */ 994 /* Move the newer inodes into the tail of the LRU list. */
993 list_splice_tail(&skiped, &sbi->s_es_lru); 995 list_splice_tail(&skipped, &sbi->s_es_lru);
996 INIT_LIST_HEAD(&skipped);
997
998 /*
999 * If we skipped any inodes, and we weren't able to make any
1000 * forward progress, sort the list and try again.
1001 */
1002 if ((nr_shrunk == 0) && nr_skipped && !retried) {
1003 retried++;
1004 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
1005 sbi->s_es_last_sorted = jiffies;
1006 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
1007 i_es_lru);
1008 /*
1009 * If there are no non-precached inodes left on the
1010 * list, start releasing precached extents.
1011 */
1012 if (ext4_test_inode_state(&ei->vfs_inode,
1013 EXT4_STATE_EXT_PRECACHED))
1014 skip_precached = 0;
1015 goto retry;
1016 }
1017
994 spin_unlock(&sbi->s_es_lru_lock); 1018 spin_unlock(&sbi->s_es_lru_lock);
995 1019
996 if (locked_ei && nr_shrunk == 0) 1020 if (locked_ei && nr_shrunk == 0)
997 nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); 1021 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
998 1022
999 return nr_shrunk; 1023 return nr_shrunk;
1000} 1024}
@@ -1069,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
1069 struct rb_node *node; 1093 struct rb_node *node;
1070 struct extent_status *es; 1094 struct extent_status *es;
1071 int nr_shrunk = 0; 1095 int nr_shrunk = 0;
1096 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1097 DEFAULT_RATELIMIT_BURST);
1072 1098
1073 if (ei->i_es_lru_nr == 0) 1099 if (ei->i_es_lru_nr == 0)
1074 return 0; 1100 return 0;
1075 1101
1102 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1103 __ratelimit(&_rs))
1104 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1105
1076 node = rb_first(&tree->root); 1106 node = rb_first(&tree->root);
1077 while (node != NULL) { 1107 while (node != NULL) {
1078 es = rb_entry(node, struct extent_status, rb_node); 1108 es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c0427e2f6648..5498f75a1648 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -624,6 +624,8 @@ resizefs_out:
624 624
625 return 0; 625 return 0;
626 } 626 }
627 case EXT4_IOC_PRECACHE_EXTENTS:
628 return ext4_ext_precache(inode);
627 629
628 default: 630 default:
629 return -ENOTTY; 631 return -ENOTTY;
@@ -688,6 +690,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
688 case EXT4_IOC_MOVE_EXT: 690 case EXT4_IOC_MOVE_EXT:
689 case FITRIM: 691 case FITRIM:
690 case EXT4_IOC_RESIZE_FS: 692 case EXT4_IOC_RESIZE_FS:
693 case EXT4_IOC_PRECACHE_EXTENTS:
691 break; 694 break;
692 default: 695 default:
693 return -ENOIOCTLCMD; 696 return -ENOIOCTLCMD;
diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h
index d830747f5c0b..0c51d617dae9 100644
--- a/include/uapi/linux/fiemap.h
+++ b/include/uapi/linux/fiemap.h
@@ -40,6 +40,7 @@ struct fiemap {
40 40
41#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ 41#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
42#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ 42#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
43#define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */
43 44
44#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) 45#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
45 46