author:    Theodore Ts'o <tytso@mit.edu>  2013-08-16 22:05:14 -0400
committer: Theodore Ts'o <tytso@mit.edu>  2013-08-16 22:05:14 -0400
commit:    7869a4a6c5caa7b2e5c41ccaf46eb3371f88eea7
tree:      1c55037a6b090b843b7f8669686dfdbbfd9ceb70 /fs/ext4
parent:    107a7bd31ac003e42c0f966aa8e5b26947de6024
ext4: add support for extent pre-caching
Add a new fiemap flag which forces all of the extents in an inode to be cached in the extent_status tree.  This is critically important when using AIO to a preallocated file, since if we need to read in blocks from the extent tree, the io_submit(2) system call becomes synchronous, and the AIO is no longer "A", which is bad.

In addition, for most files which have an external leaf tree block, the cost of caching the information in the extent status tree will be less than caching the entire 4k block in the buffer cache.  So it is generally a win to keep the extent information cached.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
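[Editor's sketch, not part of the patch: userspace is expected to trigger precaching before issuing io_submit(2) against a preallocated file. The ioctl value _IO('f', 18) is taken from the ext4.h hunk below; the local fallback #define is only for building against kernel headers that predate this patch.]

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* Fallback for kernel headers that predate this patch; the value
 * matches the definition added to fs/ext4/ext4.h below. */
#ifndef EXT4_IOC_PRECACHE_EXTENTS
#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
#endif

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Pull the whole extent tree into the extent_status cache so
	 * that later io_submit(2) calls need not read it synchronously. */
	if (ioctl(fd, EXT4_IOC_PRECACHE_EXTENTS) < 0)
		perror("EXT4_IOC_PRECACHE_EXTENTS");
	close(fd);
	return 0;
}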
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h            17
-rw-r--r--  fs/ext4/extents.c         73
-rw-r--r--  fs/ext4/extents_status.c  72
-rw-r--r--  fs/ext4/ioctl.c            3
4 files changed, 136 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c74b1948feb0..635135e6148e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -561,15 +561,16 @@ enum {
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE	0x0200

 /*
- * The bit position of this flag must not overlap with any of the
- * EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(),
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
  * read_extent_tree_block(), ext4_split_extent_at(),
- * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to
- * indicate that the we shouldn't be caching the extents when reading
- * from the extent tree while a truncate or punch hole operation
- * is in progress.
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
  */
 #define EXT4_EX_NOCACHE		0x0400
+#define EXT4_EX_FORCE_CACHE	0x0800

 /*
  * Flags used by ext4_free_blocks
@@ -601,6 +602,7 @@ enum {
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
 #define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)

 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -1386,6 +1388,7 @@ enum {
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
+	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 };

 #define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2705,7 +2708,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);

 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 08c1ac976479..01838875fcaf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
 		if (err < 0)
 			goto errout;
 	}
-	if (buffer_verified(bh))
+	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 		return bh;
 	err = __ext4_ext_check(function, line, inode,
 			       ext_block_hdr(bh), depth, pblk);
@@ -526,6 +526,71 @@ errout:
 	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
 				 (depth), (flags))

+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_ext_path *path = NULL;
+	struct buffer_head *bh;
+	int i = 0, depth, ret = 0;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return 0;	/* not an extent-mapped inode */
+
+	down_read(&ei->i_data_sem);
+	depth = ext_depth(inode);
+
+	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+		       GFP_NOFS);
+	if (path == NULL) {
+		up_read(&ei->i_data_sem);
+		return -ENOMEM;
+	}
+
+	/* Don't cache anything if there are no external extent blocks */
+	if (depth == 0)
+		goto out;
+	path[0].p_hdr = ext_inode_hdr(inode);
+	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+	if (ret)
+		goto out;
+	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+	while (i >= 0) {
+		/*
+		 * If this is a leaf block or we've reached the end of
+		 * the index block, go up
+		 */
+		if ((i == depth) ||
+		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+		bh = read_extent_tree_block(inode,
+					    ext4_idx_pblock(path[i].p_idx++),
+					    depth - i - 1,
+					    EXT4_EX_FORCE_CACHE);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
+			break;
+		}
+		i++;
+		path[i].p_bh = bh;
+		path[i].p_hdr = ext_block_hdr(bh);
+		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+	}
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+	up_read(&ei->i_data_sem);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return ret;
+}
+
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 {
@@ -4766,6 +4831,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		return error;
 	}

+	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+		error = ext4_ext_precache(inode);
+		if (error)
+			return error;
+	}
+
 	/* fallback to generic here if not in extents fmt */
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return generic_block_fiemap(inode, fieinfo, start, len,
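[Editor's sketch, not part of the patch: the FIEMAP_FLAG_CACHE path above is reachable from userspace through the standard FS_IOC_FIEMAP ioctl. FIEMAP_FLAG_CACHE itself comes from linux/fiemap.h; its uapi definition belongs to this patch series but not to this diff, so the fallback value below (matching mainline) is an assumption when building against older headers.]

#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

/* Fallback for headers that predate this series; 0x00000004 matches
 * the mainline linux/fiemap.h value (assumption if headers differ). */
#ifndef FIEMAP_FLAG_CACHE
#define FIEMAP_FLAG_CACHE	0x00000004
#endif

/* Ask ext4 to precache a file's extents via fiemap.  With
 * fm_extent_count == 0 no extent records are copied back to userspace,
 * but the FIEMAP_FLAG_CACHE handling in ext4_fiemap() still runs. */
static int precache_with_fiemap(int fd)
{
	struct fiemap fm;

	memset(&fm, 0, sizeof(fm));
	fm.fm_flags = FIEMAP_FLAG_CACHE;
	fm.fm_start = 0;
	fm.fm_length = ~0ULL;		/* whole file */
	fm.fm_extent_count = 0;
	return ioctl(fd, FS_IOC_FIEMAP, &fm);
}

[Both this path and EXT4_IOC_PRECACHE_EXTENTS end up in ext4_ext_precache(); the dedicated ioctl is simply the more direct way to request the side effect.]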
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 1dc5df016e25..0e88a367b535 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
 	write_lock(&EXT4_I(inode)->i_es_lock);

 	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
-	if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end)))
-		goto out;
-
-	__es_insert_extent(inode, &newes);
-out:
+	if (!es || es->es_lblk > end)
+		__es_insert_extent(inode, &newes);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 }

@@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
 	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
 	eib = list_entry(b, struct ext4_inode_info, i_es_lru);

+	if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+	    !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+		return 1;
+	if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+	    ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+		return -1;
 	if (eia->i_touch_when == eib->i_touch_when)
 		return 0;
 	if (time_after(eia->i_touch_when, eib->i_touch_when))
@@ -943,21 +946,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 {
 	struct ext4_inode_info *ei;
 	struct list_head *cur, *tmp;
-	LIST_HEAD(skiped);
+	LIST_HEAD(skipped);
 	int ret, nr_shrunk = 0;
+	int retried = 0, skip_precached = 1, nr_skipped = 0;

 	spin_lock(&sbi->s_es_lru_lock);

-	/*
-	 * If the inode that is at the head of LRU list is newer than
-	 * last_sorted time, that means that we need to sort this list.
-	 */
-	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
-	if (sbi->s_es_last_sorted < ei->i_touch_when) {
-		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
-		sbi->s_es_last_sorted = jiffies;
-	}
-
+retry:
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
 		/*
 		 * If we have already reclaimed all extents from extent
@@ -968,9 +963,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,

 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);

-		/* Skip the inode that is newer than the last_sorted time */
-		if (sbi->s_es_last_sorted < ei->i_touch_when) {
-			list_move_tail(cur, &skiped);
+		/*
+		 * Skip the inode that is newer than the last_sorted
+		 * time. Normally we try hard to avoid shrinking
+		 * precached inodes, but we will as a last resort.
+		 */
+		if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+						EXT4_STATE_EXT_PRECACHED))) {
+			nr_skipped++;
+			list_move_tail(cur, &skipped);
 			continue;
 		}

@@ -990,11 +992,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 	}

 	/* Move the newer inodes into the tail of the LRU list. */
-	list_splice_tail(&skiped, &sbi->s_es_lru);
+	list_splice_tail(&skipped, &sbi->s_es_lru);
+	INIT_LIST_HEAD(&skipped);
+
+	/*
+	 * If we skipped any inodes, and we weren't able to make any
+	 * forward progress, sort the list and try again.
+	 */
+	if ((nr_shrunk == 0) && nr_skipped && !retried) {
+		retried++;
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+				      i_es_lru);
+		/*
+		 * If there are no non-precached inodes left on the
+		 * list, start releasing precached extents.
+		 */
+		if (ext4_test_inode_state(&ei->vfs_inode,
+					  EXT4_STATE_EXT_PRECACHED))
+			skip_precached = 0;
+		goto retry;
+	}
+
 	spin_unlock(&sbi->s_es_lru_lock);

 	if (locked_ei && nr_shrunk == 0)
-		nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);

 	return nr_shrunk;
 }
@@ -1069,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 	struct rb_node *node;
 	struct extent_status *es;
 	int nr_shrunk = 0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);

 	if (ei->i_es_lru_nr == 0)
 		return 0;

+	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+	    __ratelimit(&_rs))
+		ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
 	node = rb_first(&tree->root);
 	while (node != NULL) {
 		es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c0427e2f6648..5498f75a1648 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -624,6 +624,8 @@ resizefs_out:

 		return 0;
 	}
+	case EXT4_IOC_PRECACHE_EXTENTS:
+		return ext4_ext_precache(inode);

 	default:
 		return -ENOTTY;
@@ -688,6 +690,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_MOVE_EXT:
 	case FITRIM:
 	case EXT4_IOC_RESIZE_FS:
+	case EXT4_IOC_PRECACHE_EXTENTS:
 		break;
 	default:
 		return -ENOIOCTLCMD;