-rw-r--r--   fs/ext4/ext4.h              | 17
-rw-r--r--   fs/ext4/extents.c           | 73
-rw-r--r--   fs/ext4/extents_status.c    | 72
-rw-r--r--   fs/ext4/ioctl.c             |  3
-rw-r--r--   include/uapi/linux/fiemap.h |  1
5 files changed, 137 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c74b1948feb0..635135e6148e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -561,15 +561,16 @@ enum {
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
 
 /*
- * The bit position of this flag must not overlap with any of the
- * EXT4_GET_BLOCKS_*.  It is used by ext4_ext_find_extent(),
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*.  They are used by ext4_ext_find_extent(),
  * read_extent_tree_block(), ext4_split_extent_at(),
- * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to
- * indicate that the we shouldn't be caching the extents when reading
- * from the extent tree while a truncate or punch hole operation
- * is in progress.
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
  */
 #define EXT4_EX_NOCACHE				0x0400
+#define EXT4_EX_FORCE_CACHE			0x0800
 
 /*
  * Flags used by ext4_free_blocks
@@ -601,6 +602,7 @@ enum {
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
 #define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -1386,6 +1388,7 @@ enum {
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
+	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2705,7 +2708,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
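Aside: as the updated comment notes, the EXT4_EX_* bits share a flags word with the EXT4_GET_BLOCKS_* bits and therefore must not overlap them. A tiny, userspace-compilable sketch of that constraint, using only the three values visible in this hunk (the assertions are purely illustrative and are not part of the patch):

/* Values copied from the patched ext4.h; the assertions only demonstrate
 * the "no overlapping bit positions" rule stated in the comment.
 * 0x0200 is simply the last EXT4_GET_BLOCKS_* value shown in this hunk. */
#define EXT4_GET_BLOCKS_NO_PUT_HOLE	0x0200
#define EXT4_EX_NOCACHE			0x0400
#define EXT4_EX_FORCE_CACHE		0x0800

_Static_assert((EXT4_EX_NOCACHE & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0,
	       "EXT4_EX_NOCACHE must not reuse an EXT4_GET_BLOCKS_* bit");
_Static_assert((EXT4_EX_FORCE_CACHE &
		(EXT4_GET_BLOCKS_NO_PUT_HOLE | EXT4_EX_NOCACHE)) == 0,
	       "EXT4_EX_FORCE_CACHE must occupy its own bit");

int main(void) { return 0; }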
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 08c1ac976479..01838875fcaf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
 		if (err < 0)
 			goto errout;
 	}
-	if (buffer_verified(bh))
+	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 		return bh;
 	err = __ext4_ext_check(function, line, inode,
 			       ext_block_hdr(bh), depth, pblk);
@@ -526,6 +526,71 @@ errout:
 	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),	\
 				 (depth), (flags))
 
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_ext_path *path = NULL;
+	struct buffer_head *bh;
+	int i = 0, depth, ret = 0;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return 0;	/* not an extent-mapped inode */
+
+	down_read(&ei->i_data_sem);
+	depth = ext_depth(inode);
+
+	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+		       GFP_NOFS);
+	if (path == NULL) {
+		up_read(&ei->i_data_sem);
+		return -ENOMEM;
+	}
+
+	/* Don't cache anything if there are no external extent blocks */
+	if (depth == 0)
+		goto out;
+	path[0].p_hdr = ext_inode_hdr(inode);
+	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+	if (ret)
+		goto out;
+	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+	while (i >= 0) {
+		/*
+		 * If this is a leaf block or we've reached the end of
+		 * the index block, go up
+		 */
+		if ((i == depth) ||
+		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+		bh = read_extent_tree_block(inode,
+					    ext4_idx_pblock(path[i].p_idx++),
+					    depth - i - 1,
+					    EXT4_EX_FORCE_CACHE);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
+			break;
+		}
+		i++;
+		path[i].p_bh = bh;
+		path[i].p_hdr = ext_block_hdr(bh);
+		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+	}
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+	up_read(&ei->i_data_sem);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return ret;
+}
+
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 {
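Aside: ext4_ext_precache() walks the on-disk extent index iteratively rather than recursively. path[i] records the current position at each level, the walk descends through read_extent_tree_block() with EXT4_EX_FORCE_CACHE (so leaf blocks are re-validated and their extents pushed into the extent status tree), and pops a level whenever it hits a leaf or exhausts an index block. A self-contained userspace sketch of the same traversal pattern over a toy tree -- the names and types below are invented for illustration; only the control flow mirrors the kernel function:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for an extent-tree block: leaves carry no children. */
struct node {
	const char *name;
	struct node **child;	/* NULL for a leaf block */
	int nr_child;
};

struct path_ent {
	struct node *node;
	int next;		/* next child to visit, like p_idx */
};

/* Visit every block without recursion, the way ext4_ext_precache()
 * uses path[0..depth]. */
static void precache_walk(struct node *root, int depth)
{
	struct path_ent *path = calloc(depth + 1, sizeof(*path));
	int i = 0;

	if (!path)
		return;
	path[0].node = root;
	while (i >= 0) {
		/* Leaf reached or index block exhausted: go up a level. */
		if (path[i].node->child == NULL ||
		    path[i].next >= path[i].node->nr_child) {
			printf("done with %s\n", path[i].node->name);
			i--;
			continue;
		}
		/* Descend into the next child, advancing the index. */
		struct node *next = path[i].node->child[path[i].next++];
		i++;
		path[i].node = next;
		path[i].next = 0;
	}
	free(path);
}

int main(void)
{
	struct node l1 = { "leaf1", NULL, 0 }, l2 = { "leaf2", NULL, 0 };
	struct node *kids[] = { &l1, &l2 };
	struct node root = { "root", kids, 2 };

	precache_walk(&root, 1);	/* depth 1: root index + leaves */
	return 0;
}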
@@ -4766,6 +4831,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			return error;
 	}
 
+	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+		error = ext4_ext_precache(inode);
+		if (error)
+			return error;
+	}
+
 	/* fallback to generic here if not in extents fmt */
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return generic_block_fiemap(inode, fieinfo, start, len,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 1dc5df016e25..0e88a367b535 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
 	write_lock(&EXT4_I(inode)->i_es_lock);
 
 	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
-	if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end)))
-		goto out;
-
-	__es_insert_extent(inode, &newes);
-out:
+	if (!es || es->es_lblk > end)
+		__es_insert_extent(inode, &newes);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 }
 
@@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
 	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
 	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
 
+	if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+	    !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+		return 1;
+	if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+	    ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+		return -1;
 	if (eia->i_touch_when == eib->i_touch_when)
 		return 0;
 	if (time_after(eia->i_touch_when, eib->i_touch_when))
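Aside: the new comparisons make precached inodes sort as "newer" than everything else, so after list_sort() they land at the tail of the LRU and are considered for reclaim last. A small userspace analogue of that ordering rule with qsort() -- the struct and field names are invented for illustration, and plain comparison stands in for the kernel's wrap-safe time_after():

#include <stdio.h>
#include <stdlib.h>

struct toy_inode {
	const char *name;
	unsigned long touch_when;	/* analogue of i_touch_when */
	int precached;			/* analogue of EXT4_STATE_EXT_PRECACHED */
};

/* Mirror ext4_inode_touch_time_cmp(): precached entries sort after
 * non-precached ones; ties are broken by last-touch time, oldest first. */
static int touch_time_cmp(const void *pa, const void *pb)
{
	const struct toy_inode *a = pa, *b = pb;

	if (a->precached && !b->precached)
		return 1;
	if (!a->precached && b->precached)
		return -1;
	if (a->touch_when == b->touch_when)
		return 0;
	return a->touch_when > b->touch_when ? 1 : -1;
}

int main(void)
{
	struct toy_inode lru[] = {
		{ "precached-old",  10, 1 },
		{ "plain-new",     300, 0 },
		{ "plain-old",      20, 0 },
		{ "precached-new", 400, 1 },
	};
	size_t i, n = sizeof(lru) / sizeof(lru[0]);

	qsort(lru, n, sizeof(lru[0]), touch_time_cmp);
	/* prints: plain-old, plain-new, precached-old, precached-new */
	for (i = 0; i < n; i++)
		printf("%s\n", lru[i].name);
	return 0;
}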
@@ -943,21 +946,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 {
 	struct ext4_inode_info *ei;
 	struct list_head *cur, *tmp;
-	LIST_HEAD(skiped);
+	LIST_HEAD(skipped);
 	int ret, nr_shrunk = 0;
+	int retried = 0, skip_precached = 1, nr_skipped = 0;
 
 	spin_lock(&sbi->s_es_lru_lock);
 
-	/*
-	 * If the inode that is at the head of LRU list is newer than
-	 * last_sorted time, that means that we need to sort this list.
-	 */
-	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
-	if (sbi->s_es_last_sorted < ei->i_touch_when) {
-		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
-		sbi->s_es_last_sorted = jiffies;
-	}
-
+retry:
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
 		/*
 		 * If we have already reclaimed all extents from extent
@@ -968,9 +963,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		/* Skip the inode that is newer than the last_sorted time */
-		if (sbi->s_es_last_sorted < ei->i_touch_when) {
-			list_move_tail(cur, &skiped);
+		/*
+		 * Skip the inode that is newer than the last_sorted
+		 * time.  Normally we try hard to avoid shrinking
+		 * precached inodes, but we will as a last resort.
+		 */
+		if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+						EXT4_STATE_EXT_PRECACHED))) {
+			nr_skipped++;
+			list_move_tail(cur, &skipped);
 			continue;
 		}
 
@@ -990,11 +992,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 	}
 
 	/* Move the newer inodes into the tail of the LRU list. */
-	list_splice_tail(&skiped, &sbi->s_es_lru);
+	list_splice_tail(&skipped, &sbi->s_es_lru);
+	INIT_LIST_HEAD(&skipped);
+
+	/*
+	 * If we skipped any inodes, and we weren't able to make any
+	 * forward progress, sort the list and try again.
+	 */
+	if ((nr_shrunk == 0) && nr_skipped && !retried) {
+		retried++;
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+				      i_es_lru);
+		/*
+		 * If there are no non-precached inodes left on the
+		 * list, start releasing precached extents.
+		 */
+		if (ext4_test_inode_state(&ei->vfs_inode,
+					  EXT4_STATE_EXT_PRECACHED))
+			skip_precached = 0;
+		goto retry;
+	}
+
 	spin_unlock(&sbi->s_es_lru_lock);
 
 	if (locked_ei && nr_shrunk == 0)
-		nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
 
 	return nr_shrunk;
 }
@@ -1069,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 	struct rb_node *node;
 	struct extent_status *es;
 	int nr_shrunk = 0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
 
 	if (ei->i_es_lru_nr == 0)
 		return 0;
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+	    __ratelimit(&_rs))
+		ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
 	node = rb_first(&tree->root);
 	while (node != NULL) {
 		es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c0427e2f6648..5498f75a1648 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -624,6 +624,8 @@ resizefs_out:
 
 		return 0;
 	}
+	case EXT4_IOC_PRECACHE_EXTENTS:
+		return ext4_ext_precache(inode);
 
 	default:
 		return -ENOTTY;
@@ -688,6 +690,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_MOVE_EXT:
 	case FITRIM:
 	case EXT4_IOC_RESIZE_FS:
+	case EXT4_IOC_PRECACHE_EXTENTS:
 		break;
 	default:
 		return -ENOIOCTLCMD;
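Aside: with the ioctl wired up here, userspace can ask for a file's extent tree to be pulled into the extent status cache ahead of time. A minimal sketch of a caller -- the ioctl number is defined locally (it matches the value added to ext4.h above) in case the installed headers predate this patch, and error handling is kept to the basics:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* From the patched ext4.h; defined here for older userspace headers. */
#ifndef EXT4_IOC_PRECACHE_EXTENTS
#define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
#endif

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Takes no argument; returns 0 on success, -1 with errno set
	 * (e.g. ENOTTY on a filesystem without this ioctl). */
	if (ioctl(fd, EXT4_IOC_PRECACHE_EXTENTS) < 0)
		perror("EXT4_IOC_PRECACHE_EXTENTS");
	close(fd);
	return 0;
}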
diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h
index d830747f5c0b..0c51d617dae9 100644
--- a/include/uapi/linux/fiemap.h
+++ b/include/uapi/linux/fiemap.h
@@ -40,6 +40,7 @@ struct fiemap {
 
 #define FIEMAP_FLAG_SYNC	0x00000001 /* sync file data before map */
 #define FIEMAP_FLAG_XATTR	0x00000002 /* map extended attribute tree */
+#define FIEMAP_FLAG_CACHE	0x00000004 /* request caching of the extents */
 
 #define FIEMAP_FLAGS_COMPAT	(FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
 