author     Vladimir Davydov <vdavydov@parallels.com>    2015-02-12 17:58:47 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2015-02-12 21:54:08 -0500
commit     503c358cf1925853195ee39ec437e51138bbb7df (patch)
tree       14aebe291975ec4353f21068990ebfec503ed63f
parent     10c1045f28e86ac90589a188f0be2d7a4347efdf (diff)
list_lru: introduce list_lru_shrink_{count,walk}
Kmem accounting of memcg is unusable now, because it lacks slab shrinker support. That means when we hit the limit we will get ENOMEM without any chance to recover. What we should do then is to call shrink_slab, which would reclaim old inode/dentry caches from this cgroup. This is what this patch set is intended to do.

Basically, it does two things. First, it introduces the notion of a per-memcg slab shrinker. A shrinker that wants to reclaim objects per cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be passed the memory cgroup to scan from in shrink_control->memcg. For such shrinkers, shrink_slab iterates over the whole cgroup subtree under the target cgroup and calls the shrinker for each kmem-active memory cgroup.

Secondly, this patch set makes the list_lru structure per-memcg. This is done transparently to list_lru users - all they have to do is tell list_lru_init that they want a memcg-aware list_lru. The list_lru will then automatically distribute objects among per-memcg lists based on which cgroup each object is accounted to. This way, to make FS shrinkers (icache, dcache) memcg-aware, we only need to make them use a memcg-aware list_lru, and that is what this patch set does.

As before, this patch set only enables per-memcg kmem reclaim when the pressure comes from memory.limit, not from memory.kmem.limit. Handling memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and it is still unclear whether we will have this knob in the unified hierarchy.

This patch (of 9):

NUMA-aware slab shrinkers use the list_lru structure to distribute objects coming from different NUMA nodes to different lists. Whenever such a shrinker needs to count or scan objects from a particular node, it issues calls like this:

	count = list_lru_count_node(lru, sc->nid);
	freed = list_lru_walk_node(lru, sc->nid, isolate_func, isolate_arg,
				   &sc->nr_to_scan);

where sc is an instance of the shrink_control structure passed to it from vmscan.

To simplify this, let's add special list_lru functions to be used by shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which consolidate the nid and nr_to_scan arguments in the shrink_control structure.

This will also allow us to avoid patching shrinkers that use list_lru when we make shrink_slab() per-memcg - all we will have to do is extend the shrink_control structure to include the target memcg and make list_lru_shrink_{count,walk} handle this appropriately.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
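For orientation, the helpers end up being used like this from a shrinker's count/scan callbacks. The sketch below is not part of the patch; example_lru, example_isolate and the example_* callbacks are hypothetical placeholders for a real list_lru and isolate callback, and the actual disposal of isolated objects is only indicated by a comment:

	static unsigned long example_shrink_count(struct shrinker *shrink,
						  struct shrink_control *sc)
	{
		/* the target nid (and, once per-memcg, the memcg) is taken from sc */
		return list_lru_shrink_count(&example_lru, sc);
	}

	static unsigned long example_shrink_scan(struct shrinker *shrink,
						 struct shrink_control *sc)
	{
		LIST_HEAD(dispose);
		unsigned long freed;

		/* sc carries both sc->nid and sc->nr_to_scan for the walk */
		freed = list_lru_shrink_walk(&example_lru, sc,
					     example_isolate, &dispose);
		/* free the objects collected on the dispose list here */
		return freed;
	}

The conversions in the diff below (dcache, gfs2 quota, icache, super, XFS buffers and quota, workingset) all follow this pattern.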
-rw-r--r--  fs/dcache.c               14
-rw-r--r--  fs/gfs2/quota.c            6
-rw-r--r--  fs/inode.c                 7
-rw-r--r--  fs/internal.h              7
-rw-r--r--  fs/super.c                24
-rw-r--r--  fs/xfs/xfs_buf.c           7
-rw-r--r--  fs/xfs/xfs_qm.c            7
-rw-r--r--  include/linux/list_lru.h  16
-rw-r--r--  mm/workingset.c            6

9 files changed, 51 insertions, 43 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..56c5da89f58a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -930,24 +930,22 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
 	long freed;
 
-	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
 	shrink_dentry_list(&dispose);
 	return freed;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e193cb36996..c15d6b216d0b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -171,8 +171,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 	if (!(sc->gfp_mask & __GFP_FS))
 		return SHRINK_STOP;
 
-	freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
-				   &dispose, &sc->nr_to_scan);
+	freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+				     gfs2_qd_isolate, &dispose);
 
 	gfs2_qd_dispose(&dispose);
 
@@ -182,7 +182,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
 					  struct shrink_control *sc)
 {
-	return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
+	return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
 }
 
 struct shrinker gfs2_qd_shrinker = {
diff --git a/fs/inode.c b/fs/inode.c
index 3a53b1da3fb8..524a32c2b0c6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -751,14 +751,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
  * to trim from the LRU. Inodes to be freed are moved to a temporary list and
  * then are freed outside inode_lock by dispose_list().
  */
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(freeable);
 	long freed;
 
-	freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
-				   &freeable, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+				     inode_lru_isolate, &freeable);
 	dispose_list(&freeable);
 	return freed;
 }
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..d92c346a793d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;
+struct shrink_control;
 
 /*
  * block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
-extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
-			    int nid);
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
 extern void inode_add_lru(struct inode *inode);
 
 /*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
-extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-			    int nid);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 
 /*
  * read_write.c
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..4554ac257647 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -77,8 +77,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	if (sb->s_op->nr_cached_objects)
 		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
 
-	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
-	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects = dentries + inodes + fs_objects + 1;
 	if (!total_objects)
 		total_objects = 1;
@@ -86,20 +86,20 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	/* proportion the scan between the caches */
 	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
 	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 
 	/*
 	 * prune the dcache first as the icache is pinned by it, then
 	 * prune the icache, followed by the filesystem specific caches
 	 */
-	freed = prune_dcache_sb(sb, dentries, sc->nid);
-	freed += prune_icache_sb(sb, inodes, sc->nid);
+	sc->nr_to_scan = dentries;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes;
+	freed += prune_icache_sb(sb, sc);
 
-	if (fs_objects) {
-		fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
-				       total_objects);
+	if (fs_objects)
 		freed += sb->s_op->free_cached_objects(sb, fs_objects,
 						       sc->nid);
-	}
 
 	drop_super(sb);
 	return freed;
@@ -118,17 +118,15 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	 * scalability bottleneck. The counts could get updated
 	 * between super_cache_count and super_cache_scan anyway.
 	 * Call to super_cache_count with shrinker_rwsem held
-	 * ensures the safety of call to list_lru_count_node() and
+	 * ensures the safety of call to list_lru_shrink_count() and
 	 * s_op->nr_cached_objects().
 	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
 		total_objects = sb->s_op->nr_cached_objects(sb,
 						 sc->nid);
 
-	total_objects += list_lru_count_node(&sb->s_dentry_lru,
-						 sc->nid);
-	total_objects += list_lru_count_node(&sb->s_inode_lru,
-						 sc->nid);
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
 
 	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..15c9d224c721 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1583,10 +1583,9 @@ xfs_buftarg_shrink_scan(
 					struct xfs_buftarg, bt_shrinker);
 	LIST_HEAD(dispose);
 	unsigned long freed;
-	unsigned long nr_to_scan = sc->nr_to_scan;
 
-	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+				     xfs_buftarg_isolate, &dispose);
 
 	while (!list_empty(&dispose)) {
 		struct xfs_buf *bp;
@@ -1605,7 +1604,7 @@ xfs_buftarg_shrink_count(
 {
 	struct xfs_buftarg *btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	return list_lru_count_node(&btp->bt_lru, sc->nid);
+	return list_lru_shrink_count(&btp->bt_lru, sc);
 }
 
 void
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e8186279541..4f4b1274e144 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -523,7 +523,6 @@ xfs_qm_shrink_scan(
 	struct xfs_qm_isolate isol;
 	unsigned long freed;
 	int error;
-	unsigned long nr_to_scan = sc->nr_to_scan;
 
 	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
 		return 0;
@@ -531,8 +530,8 @@ xfs_qm_shrink_scan(
 	INIT_LIST_HEAD(&isol.buffers);
 	INIT_LIST_HEAD(&isol.dispose);
 
-	freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
-					&nr_to_scan);
+	freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+				     xfs_qm_dquot_isolate, &isol);
 
 	error = xfs_buf_delwri_submit(&isol.buffers);
 	if (error)
@@ -557,7 +556,7 @@ xfs_qm_shrink_count(
 	struct xfs_quotainfo *qi = container_of(shrink,
 					struct xfs_quotainfo, qi_shrinker);
 
-	return list_lru_count_node(&qi->qi_lru, sc->nid);
+	return list_lru_shrink_count(&qi->qi_lru, sc);
 }
 
 /*
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index f3434533fbf8..f500a2e39b13 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -9,6 +9,7 @@
 
 #include <linux/list.h>
 #include <linux/nodemask.h>
+#include <linux/shrinker.h>
 
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
@@ -81,6 +82,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
  * Callers that want such a guarantee need to provide an outer lock.
  */
 unsigned long list_lru_count_node(struct list_lru *lru, int nid);
+
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+						  struct shrink_control *sc)
+{
+	return list_lru_count_node(lru, sc->nid);
+}
+
 static inline unsigned long list_lru_count(struct list_lru *lru)
 {
 	long count = 0;
@@ -120,6 +128,14 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 unsigned long *nr_to_walk);
 
 static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+		     list_lru_walk_cb isolate, void *cb_arg)
+{
+	return list_lru_walk_node(lru, sc->nid, isolate, cb_arg,
+				  &sc->nr_to_scan);
+}
+
+static inline unsigned long
 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	      void *cb_arg, unsigned long nr_to_walk)
 {
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..d4fa7fb10a52 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
 	local_irq_disable();
-	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+	shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
 	local_irq_enable();
 
 	pages = node_present_pages(sc->nid);
@@ -376,8 +376,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
 	local_irq_disable();
-	ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
-				 shadow_lru_isolate, NULL, &sc->nr_to_scan);
+	ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+				   shadow_lru_isolate, NULL);
 	local_irq_enable();
 	return ret;
 }