author     Dave Chinner <dchinner@redhat.com>        2011-07-08 00:14:42 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>         2011-07-20 20:47:10 -0400
commit     b0d40c92adafde7c2d81203ce7c1c69275f41140
tree       f75a19dcd1a37aff23dc43323b58f014b1297c6b  /fs/inode.c
parent     12ad3ab66103e6582ca69c0c9de18b13487eaaef
superblock: introduce per-sb cache shrinker infrastructure
With context based shrinkers, we can implement a per-superblock
shrinker that shrinks the caches attached to the superblock. We
currently have global shrinkers for the inode and dentry caches that
split up into per-superblock operations via a coarse proportioning
method that does not batch very well. The global shrinkers also
have a dependency - dentries pin inodes - so we have to be very
careful about how we register the global shrinkers so that the
implicit call order is always correct.
With a per-sb shrinker callout, we can encode this dependency
directly into the per-sb shrinker, hence avoiding the need for
strictly ordering shrinker registrations. We also have no need for
any proportioning code, as the shrinker subsystem already provides
this functionality across all shrinkers. Allowing the shrinker to
operate on a single superblock at a time means that we do fewer
superblock list traversals and less locking, and reclaim should batch
more effectively. This should result in less CPU overhead for reclaim
and potentially faster reclaim of items from each filesystem.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
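
The ordering described above (dentries pin inodes, so the dcache must be
pruned before the icache) is exactly what the per-sb callout has to encode.
A minimal sketch of the shape such a callback could take follows; it is
illustrative only, since the fs/super.c side of this change is outside the
fs/inode.c diffstat shown here. The names prune_super and s_shrink, the use
of prune_dcache_sb() from the companion dcache change, and the proportional
split are assumptions, not the exact patch.

/*
 * Illustrative sketch only -- not the code added by this commit.  It shows
 * the shape of a per-superblock shrinker callback that encodes the
 * dcache-before-icache ordering, with cross-superblock proportioning left
 * to the shrinker core.  The s_shrink member and prune_super() name are
 * assumed here.
 */
static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
{
	struct super_block *sb = container_of(shrink, struct super_block,
					      s_shrink);
	int total_objects;

	/* Usual deadlock avoidance: never recurse into a FS from reclaim. */
	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
		return -1;

	total_objects = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused;

	if (sc->nr_to_scan) {
		/*
		 * Split the scan count between the dcache and the icache in
		 * proportion to their LRU sizes; the "+ 1" avoids a divide
		 * by zero when both caches are empty.
		 */
		int dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
				(total_objects + 1);
		int inodes = sc->nr_to_scan - dentries;

		/*
		 * Dentries pin inodes, so prune the dcache first; any inodes
		 * it releases become reclaimable by prune_icache_sb() below.
		 */
		prune_dcache_sb(sb, dentries);
		prune_icache_sb(sb, inodes);
	}

	/* same return convention the old shrink_icache_memory() used */
	return (total_objects / 100) * sysctl_vfs_cache_pressure;
}

With this shape, the proportioning that the removed prune_icache() did by
walking every superblock disappears: each callback only looks at its own
superblock's LRU counts, and the shrinker core decides how much work each
registered shrinker receives.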
Diffstat (limited to 'fs/inode.c')
-rw-r--r--  fs/inode.c | 117
1 files changed, 9 insertions, 108 deletions
diff --git a/fs/inode.c b/fs/inode.c
index 0450e25aeda0..1fdbb64a952f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -73,7 +73,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
  *
  * We don't actually need it to protect anything in the umount path,
  * but only need to cycle through it to make sure any inode that
- * prune_icache took off the LRU list has been fully torn down by the
+ * prune_icache_sb took off the LRU list has been fully torn down by the
  * time we are past evict_inodes.
  */
 static DECLARE_RWSEM(iprune_sem);
@@ -544,7 +544,7 @@ void evict_inodes(struct super_block *sb)
 	dispose_list(&dispose);
 
 	/*
-	 * Cycle through iprune_sem to make sure any inode that prune_icache
+	 * Cycle through iprune_sem to make sure any inode that prune_icache_sb
 	 * moved off the list before we took the lock has been fully torn
 	 * down.
 	 */
@@ -612,9 +612,10 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside sb->s_inode_lru_lock by
- * dispose_list().
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed. If the inode has metadata buffers attached to
@@ -628,14 +629,15 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
+void prune_icache_sb(struct super_block *sb, int nr_to_scan)
 {
 	LIST_HEAD(freeable);
 	int nr_scanned;
 	unsigned long reap = 0;
 
+	down_read(&iprune_sem);
 	spin_lock(&sb->s_inode_lru_lock);
-	for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
+	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
 		struct inode *inode;
 
 		if (list_empty(&sb->s_inode_lru))
@@ -707,111 +709,11 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&sb->s_inode_lru_lock);
-	*nr_to_scan = nr_scanned;
 
 	dispose_list(&freeable);
-}
-
-static void prune_icache(int count)
-{
-	struct super_block *sb, *p = NULL;
-	int w_count;
-	int unused = inodes_stat.nr_unused;
-	int prune_ratio;
-	int pruned;
-
-	if (unused == 0 || count == 0)
-		return;
-	down_read(&iprune_sem);
-	if (count >= unused)
-		prune_ratio = 1;
-	else
-		prune_ratio = unused / count;
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (list_empty(&sb->s_instances))
-			continue;
-		if (sb->s_nr_inodes_unused == 0)
-			continue;
-		sb->s_count++;
-		/* Now, we reclaim unused dentrins with fairness.
-		 * We reclaim them same percentage from each superblock.
-		 * We calculate number of dentries to scan on this sb
-		 * as follows, but the implementation is arranged to avoid
-		 * overflows:
-		 * number of dentries to scan on this sb =
-		 * count * (number of dentries on this sb /
-		 * number of dentries in the machine)
-		 */
-		spin_unlock(&sb_lock);
-		if (prune_ratio != 1)
-			w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
-		else
-			w_count = sb->s_nr_inodes_unused;
-		pruned = w_count;
-		/*
-		 * We need to be sure this filesystem isn't being unmounted,
-		 * otherwise we could race with generic_shutdown_super(), and
-		 * end up holding a reference to an inode while the filesystem
-		 * is unmounted. So we try to get s_umount, and make sure
-		 * s_root isn't NULL.
-		 */
-		if (down_read_trylock(&sb->s_umount)) {
-			if ((sb->s_root != NULL) &&
-			    (!list_empty(&sb->s_dentry_lru))) {
-				shrink_icache_sb(sb, &w_count);
-				pruned -= w_count;
-			}
-			up_read(&sb->s_umount);
-		}
-		spin_lock(&sb_lock);
-		if (p)
-			__put_super(p);
-		count -= pruned;
-		p = sb;
-		/* more work left to do? */
-		if (count <= 0)
-			break;
-	}
-	if (p)
-		__put_super(p);
-	spin_unlock(&sb_lock);
 	up_read(&iprune_sem);
 }
 
-/*
- * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
- * "unused" means that no dentries are referring to the inodes: the files are
- * not open and the dcache references to those inodes have already been
- * reclaimed.
- *
- * This function is passed the number of inodes to scan, and it returns the
- * total number of remaining possibly-reclaimable inodes.
- */
-static int shrink_icache_memory(struct shrinker *shrink,
-				struct shrink_control *sc)
-{
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (nr) {
-		/*
-		 * Nasty deadlock avoidance. We may hold various FS locks,
-		 * and we don't want to recurse into the FS that called us
-		 * in clear_inode() and friends..
-		 */
-		if (!(gfp_mask & __GFP_FS))
-			return -1;
-		prune_icache(nr);
-	}
-	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
-}
-
-static struct shrinker icache_shrinker = {
-	.shrink = shrink_icache_memory,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
@@ -1691,7 +1593,6 @@ void __init inode_init(void)
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 					 SLAB_MEM_SPREAD),
 					 init_once);
-	register_shrinker(&icache_shrinker);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)