aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
authorZheng Liu <wenqing.lz@taobao.com>2013-07-01 08:12:37 -0400
committerTheodore Ts'o <tytso@mit.edu>2013-07-01 08:12:37 -0400
commitd3922a777f9b4c4df898d326fa940f239af4f9b6 (patch)
tree76f63123d146bb7bbc4a3478e301bff68ab08ed7 /fs/ext4
parent2c00ef3ee309142041c7395f42aa1d49fc9f44b9 (diff)
ext4: improve extent cache shrink mechanism to avoid burning CPU time
Currently we maintain a proper in-order LRU list in ext4 to reclaim entries from the extent status tree when we are under heavy memory pressure. To keep this order, a spin lock is used to protect the list, but this lock burns a lot of CPU time. The following steps trigger the problem. % cd /dev/shm % dd if=/dev/zero of=ext4-img bs=1M count=2k % mkfs.ext4 ext4-img % mount -t ext4 -o loop ext4-img /mnt % cd /mnt % for ((i=0;i<160;i++)); do truncate -s 64g $i; done % for ((i=0;i<160;i++)); do cp $i /dev/null &; done % perf record -a -g % perf report This commit tries to fix this problem. A new member called i_touch_when is added to ext4_inode_info to record the last access time of an inode. With it, we no longer need to keep the LRU list in proper order at all times, which avoids burning CPU time on the lock. When we try to reclaim entries from the extent status tree, we use list_sort() to get a properly ordered list, and then traverse this list to discard entries. In ext4_sb_info, s_es_last_sorted records the last time this list was sorted. When we traverse the list, we skip any inode that is newer than this time and move it to the tail of the LRU list. When the head of the list is newer than s_es_last_sorted, we sort the LRU list again. In this commit, we break out of the loop if s_extent_cache_cnt == 0, because that means all extents in the extent status tree have been reclaimed. Also in this commit, the prototype of ext4_es_{un}register_shrinker() is changed to save a local variable in these functions. Reported-by: Dave Hansen <dave.hansen@intel.com> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/extents_status.c75
-rw-r--r--fs/ext4/extents_status.h5
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/super.c7
5 files changed, 68 insertions, 25 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f85f1fb49df8..f5f3b6c58240 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -864,6 +864,7 @@ struct ext4_inode_info {
864 rwlock_t i_es_lock; 864 rwlock_t i_es_lock;
865 struct list_head i_es_lru; 865 struct list_head i_es_lru;
866 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 866 unsigned int i_es_lru_nr; /* protected by i_es_lock */
867 unsigned long i_touch_when; /* jiffies of last accessing */
867 868
868 /* ialloc */ 869 /* ialloc */
869 ext4_group_t i_last_alloc_group; 870 ext4_group_t i_last_alloc_group;
@@ -1303,6 +1304,7 @@ struct ext4_sb_info {
1303 /* Reclaim extents from extent status tree */ 1304 /* Reclaim extents from extent status tree */
1304 struct shrinker s_es_shrinker; 1305 struct shrinker s_es_shrinker;
1305 struct list_head s_es_lru; 1306 struct list_head s_es_lru;
1307 unsigned long s_es_last_sorted;
1306 struct percpu_counter s_extent_cache_cnt; 1308 struct percpu_counter s_extent_cache_cnt;
1307 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1309 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1308}; 1310};
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
10 * Ext4 extents status tree core functions. 10 * Ext4 extents status tree core functions.
11 */ 11 */
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/list_sort.h>
13#include "ext4.h" 14#include "ext4.h"
14#include "extents_status.h" 15#include "extents_status.h"
15#include "ext4_extents.h" 16#include "ext4_extents.h"
@@ -291,7 +292,6 @@ out:
291 292
292 read_unlock(&EXT4_I(inode)->i_es_lock); 293 read_unlock(&EXT4_I(inode)->i_es_lock);
293 294
294 ext4_es_lru_add(inode);
295 trace_ext4_es_find_delayed_extent_range_exit(inode, es); 295 trace_ext4_es_find_delayed_extent_range_exit(inode, es);
296} 296}
297 297
@@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
672error: 672error:
673 write_unlock(&EXT4_I(inode)->i_es_lock); 673 write_unlock(&EXT4_I(inode)->i_es_lock);
674 674
675 ext4_es_lru_add(inode);
676 ext4_es_print_tree(inode); 675 ext4_es_print_tree(inode);
677 676
678 return err; 677 return err;
@@ -734,7 +733,6 @@ out:
734 733
735 read_unlock(&EXT4_I(inode)->i_es_lock); 734 read_unlock(&EXT4_I(inode)->i_es_lock);
736 735
737 ext4_es_lru_add(inode);
738 trace_ext4_es_lookup_extent_exit(inode, es, found); 736 trace_ext4_es_lookup_extent_exit(inode, es, found);
739 return found; 737 return found;
740} 738}
@@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
878 EXTENT_STATUS_WRITTEN); 876 EXTENT_STATUS_WRITTEN);
879} 877}
880 878
879static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
880 struct list_head *b)
881{
882 struct ext4_inode_info *eia, *eib;
883 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
884 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
885
886 if (eia->i_touch_when == eib->i_touch_when)
887 return 0;
888 if (time_after(eia->i_touch_when, eib->i_touch_when))
889 return 1;
890 else
891 return -1;
892}
893
881static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) 894static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
882{ 895{
883 struct ext4_sb_info *sbi = container_of(shrink, 896 struct ext4_sb_info *sbi = container_of(shrink,
884 struct ext4_sb_info, s_es_shrinker); 897 struct ext4_sb_info, s_es_shrinker);
885 struct ext4_inode_info *ei; 898 struct ext4_inode_info *ei;
886 struct list_head *cur, *tmp, scanned; 899 struct list_head *cur, *tmp;
900 LIST_HEAD(skiped);
887 int nr_to_scan = sc->nr_to_scan; 901 int nr_to_scan = sc->nr_to_scan;
888 int ret, nr_shrunk = 0; 902 int ret, nr_shrunk = 0;
889 903
@@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
893 if (!nr_to_scan) 907 if (!nr_to_scan)
894 return ret; 908 return ret;
895 909
896 INIT_LIST_HEAD(&scanned);
897
898 spin_lock(&sbi->s_es_lru_lock); 910 spin_lock(&sbi->s_es_lru_lock);
911
912 /*
913 * If the inode that is at the head of LRU list is newer than
914 * last_sorted time, that means that we need to sort this list.
915 */
916 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
917 if (sbi->s_es_last_sorted < ei->i_touch_when) {
918 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
919 sbi->s_es_last_sorted = jiffies;
920 }
921
899 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 922 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
900 list_move_tail(cur, &scanned); 923 /*
924 * If we have already reclaimed all extents from extent
925 * status tree, just stop the loop immediately.
926 */
927 if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
928 break;
901 929
902 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 930 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
903 931
904 read_lock(&ei->i_es_lock); 932 /* Skip the inode that is newer than the last_sorted time */
905 if (ei->i_es_lru_nr == 0) { 933 if (sbi->s_es_last_sorted < ei->i_touch_when) {
906 read_unlock(&ei->i_es_lock); 934 list_move_tail(cur, &skiped);
907 continue; 935 continue;
908 } 936 }
909 read_unlock(&ei->i_es_lock); 937
938 if (ei->i_es_lru_nr == 0)
939 continue;
910 940
911 write_lock(&ei->i_es_lock); 941 write_lock(&ei->i_es_lock);
912 ret = __es_try_to_reclaim_extents(ei, nr_to_scan); 942 ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
943 if (ei->i_es_lru_nr == 0)
944 list_del_init(&ei->i_es_lru);
913 write_unlock(&ei->i_es_lock); 945 write_unlock(&ei->i_es_lock);
914 946
915 nr_shrunk += ret; 947 nr_shrunk += ret;
@@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
917 if (nr_to_scan == 0) 949 if (nr_to_scan == 0)
918 break; 950 break;
919 } 951 }
920 list_splice_tail(&scanned, &sbi->s_es_lru); 952
953 /* Move the newer inodes into the tail of the LRU list. */
954 list_splice_tail(&skiped, &sbi->s_es_lru);
921 spin_unlock(&sbi->s_es_lru_lock); 955 spin_unlock(&sbi->s_es_lru_lock);
922 956
923 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 957 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
925 return ret; 959 return ret;
926} 960}
927 961
928void ext4_es_register_shrinker(struct super_block *sb) 962void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
929{ 963{
930 struct ext4_sb_info *sbi;
931
932 sbi = EXT4_SB(sb);
933 INIT_LIST_HEAD(&sbi->s_es_lru); 964 INIT_LIST_HEAD(&sbi->s_es_lru);
934 spin_lock_init(&sbi->s_es_lru_lock); 965 spin_lock_init(&sbi->s_es_lru_lock);
966 sbi->s_es_last_sorted = 0;
935 sbi->s_es_shrinker.shrink = ext4_es_shrink; 967 sbi->s_es_shrinker.shrink = ext4_es_shrink;
936 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 968 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
937 register_shrinker(&sbi->s_es_shrinker); 969 register_shrinker(&sbi->s_es_shrinker);
938} 970}
939 971
940void ext4_es_unregister_shrinker(struct super_block *sb) 972void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
941{ 973{
942 unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); 974 unregister_shrinker(&sbi->s_es_shrinker);
943} 975}
944 976
945void ext4_es_lru_add(struct inode *inode) 977void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
947 struct ext4_inode_info *ei = EXT4_I(inode); 979 struct ext4_inode_info *ei = EXT4_I(inode);
948 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 980 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
949 981
982 ei->i_touch_when = jiffies;
983
984 if (!list_empty(&ei->i_es_lru))
985 return;
986
950 spin_lock(&sbi->s_es_lru_lock); 987 spin_lock(&sbi->s_es_lru_lock);
951 if (list_empty(&ei->i_es_lru)) 988 if (list_empty(&ei->i_es_lru))
952 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 989 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
953 else
954 list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
955 spin_unlock(&sbi->s_es_lru_lock); 990 spin_unlock(&sbi->s_es_lru_lock);
956} 991}
957 992
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
39 EXTENT_STATUS_DELAYED | \ 39 EXTENT_STATUS_DELAYED | \
40 EXTENT_STATUS_HOLE) 40 EXTENT_STATUS_HOLE)
41 41
42struct ext4_sb_info;
42struct ext4_extent; 43struct ext4_extent;
43 44
44struct extent_status { 45struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
119 es->es_pblk = block; 120 es->es_pblk = block;
120} 121}
121 122
122extern void ext4_es_register_shrinker(struct super_block *sb); 123extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
123extern void ext4_es_unregister_shrinker(struct super_block *sb); 124extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
124extern void ext4_es_lru_add(struct inode *inode); 125extern void ext4_es_lru_add(struct inode *inode);
125extern void ext4_es_lru_del(struct inode *inode); 126extern void ext4_es_lru_del(struct inode *inode);
126 127
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0db830d541ec..f9ba51f68777 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -514,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
514 "logical block %lu\n", inode->i_ino, flags, map->m_len, 514 "logical block %lu\n", inode->i_ino, flags, map->m_len,
515 (unsigned long) map->m_lblk); 515 (unsigned long) map->m_lblk);
516 516
517 ext4_es_lru_add(inode);
518
517 /* Lookup extent status tree firstly */ 519 /* Lookup extent status tree firstly */
518 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 520 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
519 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 521 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1526,6 +1528,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1526 "logical block %lu\n", inode->i_ino, map->m_len, 1528 "logical block %lu\n", inode->i_ino, map->m_len,
1527 (unsigned long) map->m_lblk); 1529 (unsigned long) map->m_lblk);
1528 1530
1531 ext4_es_lru_add(inode);
1532
1529 /* Lookup extent status tree firstly */ 1533 /* Lookup extent status tree firstly */
1530 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1534 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1531 1535
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 54701fca4515..cc8201180b30 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -773,7 +773,7 @@ static void ext4_put_super(struct super_block *sb)
773 ext4_abort(sb, "Couldn't clean up the journal"); 773 ext4_abort(sb, "Couldn't clean up the journal");
774 } 774 }
775 775
776 ext4_es_unregister_shrinker(sb); 776 ext4_es_unregister_shrinker(sbi);
777 del_timer(&sbi->s_err_report); 777 del_timer(&sbi->s_err_report);
778 ext4_release_system_zone(sb); 778 ext4_release_system_zone(sb);
779 ext4_mb_release(sb); 779 ext4_mb_release(sb);
@@ -862,6 +862,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
862 rwlock_init(&ei->i_es_lock); 862 rwlock_init(&ei->i_es_lock);
863 INIT_LIST_HEAD(&ei->i_es_lru); 863 INIT_LIST_HEAD(&ei->i_es_lru);
864 ei->i_es_lru_nr = 0; 864 ei->i_es_lru_nr = 0;
865 ei->i_touch_when = 0;
865 ei->i_reserved_data_blocks = 0; 866 ei->i_reserved_data_blocks = 0;
866 ei->i_reserved_meta_blocks = 0; 867 ei->i_reserved_meta_blocks = 0;
867 ei->i_allocated_meta_blocks = 0; 868 ei->i_allocated_meta_blocks = 0;
@@ -3799,7 +3800,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3799 sbi->s_err_report.data = (unsigned long) sb; 3800 sbi->s_err_report.data = (unsigned long) sb;
3800 3801
3801 /* Register extent status tree shrinker */ 3802 /* Register extent status tree shrinker */
3802 ext4_es_register_shrinker(sb); 3803 ext4_es_register_shrinker(sbi);
3803 3804
3804 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3805 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3805 ext4_count_free_clusters(sb)); 3806 ext4_count_free_clusters(sb));
@@ -4127,7 +4128,7 @@ failed_mount_wq:
4127 sbi->s_journal = NULL; 4128 sbi->s_journal = NULL;
4128 } 4129 }
4129failed_mount3: 4130failed_mount3:
4130 ext4_es_unregister_shrinker(sb); 4131 ext4_es_unregister_shrinker(sbi);
4131 del_timer(&sbi->s_err_report); 4132 del_timer(&sbi->s_err_report);
4132 if (sbi->s_flex_groups) 4133 if (sbi->s_flex_groups)
4133 ext4_kvfree(sbi->s_flex_groups); 4134 ext4_kvfree(sbi->s_flex_groups);