aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorZheng Liu <wenqing.lz@taobao.com>2013-02-18 00:32:55 -0500
committerTheodore Ts'o <tytso@mit.edu>2013-02-18 00:32:55 -0500
commit74cd15cd02708c7188581f279f33a98b2ae8d322 (patch)
tree5d5c2380ffc7ddf1cd529127b89bf572c1798ffd /fs
parentbdedbb7b8d5b960e1ff0d116f5d4935febe73183 (diff)
ext4: reclaim extents from extent status tree
Although extent status is loaded on demand, we also need to reclaim extents from the tree when we are under heavy memory pressure, because in some cases a fragmented extent tree causes the status tree to consume too much memory. Here we maintain an LRU list in the super_block. When the extent status of an inode is accessed or changed, this inode is moved to the tail of the list. The inode is dropped from this list when it is cleared. In the inode, a counter is added to count the number of cached objects in the extent status tree. Only written/unwritten/hole extents are counted, because delayed extents are never reclaimed: fiemap, bigalloc and seek_data/hole need them. The counter is increased when a new extent is allocated, and decreased when an extent is freed. In this commit we use the normal shrinker framework to reclaim memory from the status tree. ext4_es_reclaim_extents_count() traverses the LRU list to count the number of reclaimable extents. ext4_es_shrink() tries to reclaim written/unwritten/hole extents from the extent status tree. An inode that has been shrunk is moved to the tail of the LRU list. Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Jan kara <jack@suse.cz>
Diffstat (limited to 'fs')
-rw-r--r--fs/ext4/ext4.h7
-rw-r--r--fs/ext4/extents_status.c156
-rw-r--r--fs/ext4/extents_status.h5
-rw-r--r--fs/ext4/super.c7
4 files changed, 175 insertions, 0 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c565c941f7a..6e16c1867959 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -888,6 +888,8 @@ struct ext4_inode_info {
888 /* extents status tree */ 888 /* extents status tree */
889 struct ext4_es_tree i_es_tree; 889 struct ext4_es_tree i_es_tree;
890 rwlock_t i_es_lock; 890 rwlock_t i_es_lock;
891 struct list_head i_es_lru;
892 unsigned int i_es_lru_nr; /* protected by i_es_lock */
891 893
892 /* ialloc */ 894 /* ialloc */
893 ext4_group_t i_last_alloc_group; 895 ext4_group_t i_last_alloc_group;
@@ -1303,6 +1305,11 @@ struct ext4_sb_info {
1303 1305
1304 /* Precomputed FS UUID checksum for seeding other checksums */ 1306 /* Precomputed FS UUID checksum for seeding other checksums */
1305 __u32 s_csum_seed; 1307 __u32 s_csum_seed;
1308
1309 /* Reclaim extents from extent status tree */
1310 struct shrinker s_es_shrinker;
1311 struct list_head s_es_lru;
1312 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1306}; 1313};
1307 1314
1308static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1315static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index cce152c3c8dc..9f1380e05474 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -145,6 +145,9 @@ static struct kmem_cache *ext4_es_cachep;
145static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 145static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
146static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 146static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
147 ext4_lblk_t end); 147 ext4_lblk_t end);
148static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
149 int nr_to_scan);
150static int ext4_es_reclaim_extents_count(struct super_block *sb);
148 151
149int __init ext4_init_es(void) 152int __init ext4_init_es(void)
150{ 153{
@@ -280,6 +283,7 @@ out:
280 283
281 read_unlock(&EXT4_I(inode)->i_es_lock); 284 read_unlock(&EXT4_I(inode)->i_es_lock);
282 285
286 ext4_es_lru_add(inode);
283 trace_ext4_es_find_delayed_extent_exit(inode, es); 287 trace_ext4_es_find_delayed_extent_exit(inode, es);
284} 288}
285 289
@@ -294,11 +298,24 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
294 es->es_lblk = lblk; 298 es->es_lblk = lblk;
295 es->es_len = len; 299 es->es_len = len;
296 es->es_pblk = pblk; 300 es->es_pblk = pblk;
301
302 /*
303 * We don't count delayed extent because we never try to reclaim them
304 */
305 if (!ext4_es_is_delayed(es))
306 EXT4_I(inode)->i_es_lru_nr++;
307
297 return es; 308 return es;
298} 309}
299 310
300static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) 311static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
301{ 312{
313 /* Decrease the lru counter when this es is not delayed */
314 if (!ext4_es_is_delayed(es)) {
315 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
316 EXT4_I(inode)->i_es_lru_nr--;
317 }
318
302 kmem_cache_free(ext4_es_cachep, es); 319 kmem_cache_free(ext4_es_cachep, es);
303} 320}
304 321
@@ -456,6 +473,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
456error: 473error:
457 write_unlock(&EXT4_I(inode)->i_es_lock); 474 write_unlock(&EXT4_I(inode)->i_es_lock);
458 475
476 ext4_es_lru_add(inode);
459 ext4_es_print_tree(inode); 477 ext4_es_print_tree(inode);
460 478
461 return err; 479 return err;
@@ -517,6 +535,7 @@ out:
517 535
518 read_unlock(&EXT4_I(inode)->i_es_lock); 536 read_unlock(&EXT4_I(inode)->i_es_lock);
519 537
538 ext4_es_lru_add(inode);
520 trace_ext4_es_lookup_extent_exit(inode, es, found); 539 trace_ext4_es_lookup_extent_exit(inode, es, found);
521 return found; 540 return found;
522} 541}
@@ -639,3 +658,140 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
639 ext4_es_print_tree(inode); 658 ext4_es_print_tree(inode);
640 return err; 659 return err;
641} 660}
661
662static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
663{
664 struct ext4_sb_info *sbi = container_of(shrink,
665 struct ext4_sb_info, s_es_shrinker);
666 struct ext4_inode_info *ei;
667 struct list_head *cur, *tmp, scanned;
668 int nr_to_scan = sc->nr_to_scan;
669 int ret, nr_shrunk = 0;
670
671 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan);
672
673 if (!nr_to_scan)
674 return ext4_es_reclaim_extents_count(sbi->s_sb);
675
676 INIT_LIST_HEAD(&scanned);
677
678 spin_lock(&sbi->s_es_lru_lock);
679 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
680 list_move_tail(cur, &scanned);
681
682 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
683
684 read_lock(&ei->i_es_lock);
685 if (ei->i_es_lru_nr == 0) {
686 read_unlock(&ei->i_es_lock);
687 continue;
688 }
689 read_unlock(&ei->i_es_lock);
690
691 write_lock(&ei->i_es_lock);
692 ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
693 write_unlock(&ei->i_es_lock);
694
695 nr_shrunk += ret;
696 nr_to_scan -= ret;
697 if (nr_to_scan == 0)
698 break;
699 }
700 list_splice_tail(&scanned, &sbi->s_es_lru);
701 spin_unlock(&sbi->s_es_lru_lock);
702 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk);
703
704 return ext4_es_reclaim_extents_count(sbi->s_sb);
705}
706
707void ext4_es_register_shrinker(struct super_block *sb)
708{
709 struct ext4_sb_info *sbi;
710
711 sbi = EXT4_SB(sb);
712 INIT_LIST_HEAD(&sbi->s_es_lru);
713 spin_lock_init(&sbi->s_es_lru_lock);
714 sbi->s_es_shrinker.shrink = ext4_es_shrink;
715 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
716 register_shrinker(&sbi->s_es_shrinker);
717}
718
719void ext4_es_unregister_shrinker(struct super_block *sb)
720{
721 unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
722}
723
724void ext4_es_lru_add(struct inode *inode)
725{
726 struct ext4_inode_info *ei = EXT4_I(inode);
727 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
728
729 spin_lock(&sbi->s_es_lru_lock);
730 if (list_empty(&ei->i_es_lru))
731 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
732 else
733 list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
734 spin_unlock(&sbi->s_es_lru_lock);
735}
736
737void ext4_es_lru_del(struct inode *inode)
738{
739 struct ext4_inode_info *ei = EXT4_I(inode);
740 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
741
742 spin_lock(&sbi->s_es_lru_lock);
743 if (!list_empty(&ei->i_es_lru))
744 list_del_init(&ei->i_es_lru);
745 spin_unlock(&sbi->s_es_lru_lock);
746}
747
748static int ext4_es_reclaim_extents_count(struct super_block *sb)
749{
750 struct ext4_sb_info *sbi = EXT4_SB(sb);
751 struct ext4_inode_info *ei;
752 struct list_head *cur;
753 int nr_cached = 0;
754
755 spin_lock(&sbi->s_es_lru_lock);
756 list_for_each(cur, &sbi->s_es_lru) {
757 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
758 read_lock(&ei->i_es_lock);
759 nr_cached += ei->i_es_lru_nr;
760 read_unlock(&ei->i_es_lock);
761 }
762 spin_unlock(&sbi->s_es_lru_lock);
763 trace_ext4_es_reclaim_extents_count(sb, nr_cached);
764 return nr_cached;
765}
766
767static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
768 int nr_to_scan)
769{
770 struct inode *inode = &ei->vfs_inode;
771 struct ext4_es_tree *tree = &ei->i_es_tree;
772 struct rb_node *node;
773 struct extent_status *es;
774 int nr_shrunk = 0;
775
776 if (ei->i_es_lru_nr == 0)
777 return 0;
778
779 node = rb_first(&tree->root);
780 while (node != NULL) {
781 es = rb_entry(node, struct extent_status, rb_node);
782 node = rb_next(&es->rb_node);
783 /*
784 * We can't reclaim delayed extent from status tree because
785 * fiemap, bigallic, and seek_data/hole need to use it.
786 */
787 if (!ext4_es_is_delayed(es)) {
788 rb_erase(&es->rb_node, &tree->root);
789 ext4_es_free_extent(inode, es);
790 nr_shrunk++;
791 if (--nr_to_scan == 0)
792 break;
793 }
794 }
795 tree->cache_es = NULL;
796 return nr_shrunk;
797}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8ffc90c784fa..cf83e77b16cb 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -106,4 +106,9 @@ static inline void ext4_es_store_status(struct extent_status *es,
106 es->es_pblk = block; 106 es->es_pblk = block;
107} 107}
108 108
109extern void ext4_es_register_shrinker(struct super_block *sb);
110extern void ext4_es_unregister_shrinker(struct super_block *sb);
111extern void ext4_es_lru_add(struct inode *inode);
112extern void ext4_es_lru_del(struct inode *inode);
113
109#endif /* _EXT4_EXTENTS_STATUS_H */ 114#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d80bfe5ac11c..373d46cd5d3f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -755,6 +755,7 @@ static void ext4_put_super(struct super_block *sb)
755 ext4_abort(sb, "Couldn't clean up the journal"); 755 ext4_abort(sb, "Couldn't clean up the journal");
756 } 756 }
757 757
758 ext4_es_unregister_shrinker(sb);
758 del_timer(&sbi->s_err_report); 759 del_timer(&sbi->s_err_report);
759 ext4_release_system_zone(sb); 760 ext4_release_system_zone(sb);
760 ext4_mb_release(sb); 761 ext4_mb_release(sb);
@@ -840,6 +841,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
840 spin_lock_init(&ei->i_prealloc_lock); 841 spin_lock_init(&ei->i_prealloc_lock);
841 ext4_es_init_tree(&ei->i_es_tree); 842 ext4_es_init_tree(&ei->i_es_tree);
842 rwlock_init(&ei->i_es_lock); 843 rwlock_init(&ei->i_es_lock);
844 INIT_LIST_HEAD(&ei->i_es_lru);
845 ei->i_es_lru_nr = 0;
843 ei->i_reserved_data_blocks = 0; 846 ei->i_reserved_data_blocks = 0;
844 ei->i_reserved_meta_blocks = 0; 847 ei->i_reserved_meta_blocks = 0;
845 ei->i_allocated_meta_blocks = 0; 848 ei->i_allocated_meta_blocks = 0;
@@ -928,6 +931,7 @@ void ext4_clear_inode(struct inode *inode)
928 dquot_drop(inode); 931 dquot_drop(inode);
929 ext4_discard_preallocations(inode); 932 ext4_discard_preallocations(inode);
930 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 933 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
934 ext4_es_lru_del(inode);
931 if (EXT4_I(inode)->jinode) { 935 if (EXT4_I(inode)->jinode) {
932 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 936 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
933 EXT4_I(inode)->jinode); 937 EXT4_I(inode)->jinode);
@@ -3693,6 +3697,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3693 sbi->s_max_writeback_mb_bump = 128; 3697 sbi->s_max_writeback_mb_bump = 128;
3694 sbi->s_extent_max_zeroout_kb = 32; 3698 sbi->s_extent_max_zeroout_kb = 32;
3695 3699
3700 /* Register extent status tree shrinker */
3701 ext4_es_register_shrinker(sb);
3702
3696 /* 3703 /*
3697 * set up enough so that it can read an inode 3704 * set up enough so that it can read an inode
3698 */ 3705 */