path: root/fs/inode.c
author	Dave Chinner <dchinner@redhat.com>	2011-07-08 00:14:42 -0400
committer	Al Viro <viro@zeniv.linux.org.uk>	2011-07-20 20:47:10 -0400
commit	b0d40c92adafde7c2d81203ce7c1c69275f41140 (patch)
tree	f75a19dcd1a37aff23dc43323b58f014b1297c6b	/fs/inode.c
parent	12ad3ab66103e6582ca69c0c9de18b13487eaaef (diff)
superblock: introduce per-sb cache shrinker infrastructure
With context based shrinkers, we can implement a per-superblock shrinker that shrinks the caches attached to the superblock. We currently have global shrinkers for the inode and dentry caches that split up into per-superblock operations via a coarse proportioning method that does not batch very well. The global shrinkers also have a dependency - dentries pin inodes - so we have to be very careful about how we register the global shrinkers so that the implicit call order is always correct.

With a per-sb shrinker callout, we can encode this dependency directly into the per-sb shrinker, hence avoiding the need for strict ordering of shrinker registrations. We also have no need for any proportioning code, as the shrinker subsystem already provides this functionality across all shrinkers.

Allowing the shrinker to operate on a single superblock at a time means fewer superblock list traversals and less locking, and reclaim should batch more effectively. This should result in less CPU overhead for reclaim and potentially faster reclaim of items from each filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
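The per-sb shrinker callout itself is added outside this file (the diff below covers only fs/inode.c), so as a rough, hypothetical sketch - not the code added by this patch - the example below shows the shape such a callback could take: it trims the dcache before the icache to encode the "dentries pin inodes" dependency. prune_icache_sb() is the helper introduced in this diff and prune_dcache_sb() is its existing dcache counterpart; the s_shrink and s_nr_dentry_unused fields, the example_* names, and the simple proportioning are assumptions for illustration.

static int example_prune_super(struct shrinker *shrink, struct shrink_control *sc)
{
	/* Hypothetical per-sb callback; s_shrink is an assumed field. */
	struct super_block *sb = container_of(shrink, struct super_block, s_shrink);
	int total_objects, dentries, inodes;

	/* Same deadlock-avoidance rule as the old global shrinker. */
	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
		return -1;

	/* +1 guards against divide-by-zero when both LRUs are empty. */
	total_objects = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;

	if (sc->nr_to_scan) {
		/* Split the scan count between the two per-sb LRUs. */
		dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total_objects;
		inodes = sc->nr_to_scan - dentries;

		/* Dentries pin inodes, so prune the dcache first. */
		prune_dcache_sb(sb, dentries);
		prune_icache_sb(sb, inodes);
	}

	return (total_objects / 100) * sysctl_vfs_cache_pressure;
}

Because the shrinker core already proportions work across all registered shrinkers, each superblock only has to split its own scan count between its dentry and inode LRUs.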
Diffstat (limited to 'fs/inode.c')
-rw-r--r--	fs/inode.c	117
1 file changed, 9 insertions(+), 108 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 0450e25aeda0..1fdbb64a952f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -73,7 +73,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
  *
  * We don't actually need it to protect anything in the umount path,
  * but only need to cycle through it to make sure any inode that
- * prune_icache took off the LRU list has been fully torn down by the
+ * prune_icache_sb took off the LRU list has been fully torn down by the
  * time we are past evict_inodes.
  */
 static DECLARE_RWSEM(iprune_sem);
@@ -544,7 +544,7 @@ void evict_inodes(struct super_block *sb)
 	dispose_list(&dispose);
 
 	/*
-	 * Cycle through iprune_sem to make sure any inode that prune_icache
+	 * Cycle through iprune_sem to make sure any inode that prune_icache_sb
 	 * moved off the list before we took the lock has been fully torn
 	 * down.
 	 */
@@ -612,9 +612,10 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside sb->s_inode_lru_lock by
- * dispose_list().
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed. If the inode has metadata buffers attached to
@@ -628,14 +629,15 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
+void prune_icache_sb(struct super_block *sb, int nr_to_scan)
 {
 	LIST_HEAD(freeable);
 	int nr_scanned;
 	unsigned long reap = 0;
 
+	down_read(&iprune_sem);
 	spin_lock(&sb->s_inode_lru_lock);
-	for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
+	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
 		struct inode *inode;
 
 		if (list_empty(&sb->s_inode_lru))
@@ -707,111 +709,11 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&sb->s_inode_lru_lock);
-	*nr_to_scan = nr_scanned;
 
 	dispose_list(&freeable);
-}
-
-static void prune_icache(int count)
-{
-	struct super_block *sb, *p = NULL;
-	int w_count;
-	int unused = inodes_stat.nr_unused;
-	int prune_ratio;
-	int pruned;
-
-	if (unused == 0 || count == 0)
-		return;
-	down_read(&iprune_sem);
-	if (count >= unused)
-		prune_ratio = 1;
-	else
-		prune_ratio = unused / count;
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (list_empty(&sb->s_instances))
-			continue;
-		if (sb->s_nr_inodes_unused == 0)
-			continue;
-		sb->s_count++;
-		/* Now, we reclaim unused dentrins with fairness.
-		 * We reclaim them same percentage from each superblock.
-		 * We calculate number of dentries to scan on this sb
-		 * as follows, but the implementation is arranged to avoid
-		 * overflows:
-		 * number of dentries to scan on this sb =
-		 * count * (number of dentries on this sb /
-		 * number of dentries in the machine)
-		 */
-		spin_unlock(&sb_lock);
-		if (prune_ratio != 1)
-			w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
-		else
-			w_count = sb->s_nr_inodes_unused;
-		pruned = w_count;
-		/*
-		 * We need to be sure this filesystem isn't being unmounted,
-		 * otherwise we could race with generic_shutdown_super(), and
-		 * end up holding a reference to an inode while the filesystem
-		 * is unmounted. So we try to get s_umount, and make sure
-		 * s_root isn't NULL.
-		 */
-		if (down_read_trylock(&sb->s_umount)) {
-			if ((sb->s_root != NULL) &&
-			    (!list_empty(&sb->s_dentry_lru))) {
-				shrink_icache_sb(sb, &w_count);
-				pruned -= w_count;
-			}
-			up_read(&sb->s_umount);
-		}
-		spin_lock(&sb_lock);
-		if (p)
-			__put_super(p);
-		count -= pruned;
-		p = sb;
-		/* more work left to do? */
-		if (count <= 0)
-			break;
-	}
-	if (p)
-		__put_super(p);
-	spin_unlock(&sb_lock);
 	up_read(&iprune_sem);
 }
 
-/*
- * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
- * "unused" means that no dentries are referring to the inodes: the files are
- * not open and the dcache references to those inodes have already been
- * reclaimed.
- *
- * This function is passed the number of inodes to scan, and it returns the
- * total number of remaining possibly-reclaimable inodes.
- */
-static int shrink_icache_memory(struct shrinker *shrink,
-				struct shrink_control *sc)
-{
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (nr) {
-		/*
-		 * Nasty deadlock avoidance. We may hold various FS locks,
-		 * and we don't want to recurse into the FS that called us
-		 * in clear_inode() and friends..
-		 */
-		if (!(gfp_mask & __GFP_FS))
-			return -1;
-		prune_icache(nr);
-	}
-	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
-}
-
-static struct shrinker icache_shrinker = {
-	.shrink = shrink_icache_memory,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
@@ -1691,7 +1593,6 @@ void __init inode_init(void)
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 					 SLAB_MEM_SPREAD),
 					 init_once);
-	register_shrinker(&icache_shrinker);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
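Note that the register_shrinker(&icache_shrinker) call removed from inode_init() above is not replaced within this file. A hypothetical sketch of how per-superblock registration could be wired up instead - again with the s_shrink field and example_* names as assumptions rather than code from this patch:

static void example_register_sb_shrinker(struct super_block *sb)
{
	/* Hypothetical: wire the per-sb shrinker up at superblock setup. */
	sb->s_shrink.shrink = example_prune_super;	/* sketch above */
	sb->s_shrink.seeks = DEFAULT_SEEKS;
	register_shrinker(&sb->s_shrink);
}

static void example_unregister_sb_shrinker(struct super_block *sb)
{
	/* Hypothetical: tear it down before the superblock goes away. */
	unregister_shrinker(&sb->s_shrink);
}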