author    Kirill Tkhai <ktkhai@virtuozzo.com>    2018-08-17 18:47:37 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2018-08-17 19:20:30 -0400
commit    0a4465d340282f92719f4e3a56545a848e638d15 (patch)
tree      cf12e7b70e65823552cb91a4bdf01ac694b76d43
parent    b05706f1001fe662bafe198814c5999fd996dce0 (diff)
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many CPUs, memory cgroups and containers. Say we
have 200 containers, and every container has 10 mounts and 10 cgroups.
Container tasks don't touch other containers' mounts. If there is
intensive page writing and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab before it is able to get to
shrink_page_list(). Iterating over all the memcg slabs is very
expensive: the task has to visit 200 * 10 = 2000 shrinkers (one per
mount) for every memcg, and since there are 200 * 10 = 2000 memcgs, the
total number of calls is 2000 * 2000 = 4000000. So the shrinker makes 4
million do_shrink_slab() calls just to try to isolate SWAP_CLUSTER_MAX
pages in one of the actively writing memcgs via shrink_page_list(). I've
observed a node spending almost 100% of its time in the kernel, making
useless iterations over already shrunk slabs.

This patch adds a bitmap of memcg-aware shrinkers to each memcg. The
size of the bitmap depends on bitmap_nr_ids, and during the memcg's
lifetime it is kept large enough to fit bitmap_nr_ids shrinkers. Every
bit in the map corresponds to one shrinker id. The next patches will
maintain a set bit only for memcgs that really have charged objects,
which will allow shrink_slab() to improve its performance significantly.
See the last patch for the numbers.

[ktkhai@virtuozzo.com: v9]
  Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
  Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
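To make the sizing concrete, the following standalone userspace sketch (not part of the patch, and not kernel code) mirrors the DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long) arithmetic used by memcg_expand_shrinker_maps() below; the shrinker ids in it are made up purely for illustration.

/*
 * Standalone illustration only -- not kernel code. It mirrors the map
 * sizing used by memcg_expand_shrinker_maps(): one bit per shrinker id,
 * rounded up to whole unsigned longs.
 */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG		(CHAR_BIT * sizeof(unsigned long))
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Bytes needed so that bit 'id' fits in the map. */
static size_t shrinker_map_size(int id)
{
	return DIV_ROUND_UP(id + 1, BITS_PER_LONG) * sizeof(unsigned long);
}

int main(void)
{
	/* Hypothetical shrinker ids, just to show the growth points. */
	int ids[] = { 0, 63, 64, 200, 2000 };
	unsigned int i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("id %4d -> map of %zu bytes (word %d, bit %d)\n",
		       ids[i], shrinker_map_size(ids[i]),
		       (int)(ids[i] / BITS_PER_LONG),
		       (int)(ids[i] % BITS_PER_LONG));
	return 0;
}

On a 64-bit machine this shows the map growing by one unsigned long each time an id crosses a 64-bit boundary, which is why registering a shrinker whose id reaches beyond shrinker_nr_max triggers memcg_expand_shrinker_maps() in prealloc_memcg_shrinker() below.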
-rw-r--r--  include/linux/memcontrol.h    14
-rw-r--r--  mm/memcontrol.c              124
-rw-r--r--  mm/vmscan.c                    8
3 files changed, 145 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f3c026df7443..2cccbb9e1b3e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -112,6 +112,15 @@ struct lruvec_stat {
 };
 
 /*
+ * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
+ * which have elements charged to this memcg.
+ */
+struct memcg_shrinker_map {
+	struct rcu_head rcu;
+	unsigned long map[0];
+};
+
+/*
  * per-zone information in memory controller.
  */
 struct mem_cgroup_per_node {
@@ -124,6 +133,9 @@ struct mem_cgroup_per_node {
 
 	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
 
+#ifdef CONFIG_MEMCG_KMEM
+	struct memcg_shrinker_map __rcu	*shrinker_map;
+#endif
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
@@ -1262,6 +1274,8 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
+extern int memcg_expand_shrinker_maps(int new_id);
+
 #else
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 313355dddf66..827c9e87ca08 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -320,6 +320,119 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 struct workqueue_struct *memcg_kmem_cache_wq;
 
+static int memcg_shrinker_map_size;
+static DEFINE_MUTEX(memcg_shrinker_map_mutex);
+
+static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
+{
+	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+}
+
+static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
+					 int size, int old_size)
+{
+	struct memcg_shrinker_map *new, *old;
+	int nid;
+
+	lockdep_assert_held(&memcg_shrinker_map_mutex);
+
+	for_each_node(nid) {
+		old = rcu_dereference_protected(
+			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+		/* Not yet online memcg */
+		if (!old)
+			return 0;
+
+		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+
+		/* Set all old bits, clear all new bits */
+		memset(new->map, (int)0xff, old_size);
+		memset((void *)new->map + old_size, 0, size - old_size);
+
+		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
+	}
+
+	return 0;
+}
+
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_per_node *pn;
+	struct memcg_shrinker_map *map;
+	int nid;
+
+	if (mem_cgroup_is_root(memcg))
+		return;
+
+	for_each_node(nid) {
+		pn = mem_cgroup_nodeinfo(memcg, nid);
+		map = rcu_dereference_protected(pn->shrinker_map, true);
+		if (map)
+			kvfree(map);
+		rcu_assign_pointer(pn->shrinker_map, NULL);
+	}
+}
+
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+	struct memcg_shrinker_map *map;
+	int nid, size, ret = 0;
+
+	if (mem_cgroup_is_root(memcg))
+		return 0;
+
+	mutex_lock(&memcg_shrinker_map_mutex);
+	size = memcg_shrinker_map_size;
+	for_each_node(nid) {
+		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+		if (!map) {
+			memcg_free_shrinker_maps(memcg);
+			ret = -ENOMEM;
+			break;
+		}
+		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+	}
+	mutex_unlock(&memcg_shrinker_map_mutex);
+
+	return ret;
+}
+
+int memcg_expand_shrinker_maps(int new_id)
+{
+	int size, old_size, ret = 0;
+	struct mem_cgroup *memcg;
+
+	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
+	old_size = memcg_shrinker_map_size;
+	if (size <= old_size)
+		return 0;
+
+	mutex_lock(&memcg_shrinker_map_mutex);
+	if (!root_mem_cgroup)
+		goto unlock;
+
+	for_each_mem_cgroup(memcg) {
+		if (mem_cgroup_is_root(memcg))
+			continue;
+		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
+		if (ret)
+			goto unlock;
+	}
+unlock:
+	if (!ret)
+		memcg_shrinker_map_size = size;
+	mutex_unlock(&memcg_shrinker_map_mutex);
+	return ret;
+}
+#else /* CONFIG_MEMCG_KMEM */
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
 #endif /* CONFIG_MEMCG_KMEM */
 
 /**
@@ -4356,6 +4469,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
+	/*
+	 * A memcg must be visible for memcg_expand_shrinker_maps()
+	 * by the time the maps are allocated. So, we allocate maps
+	 * here, when for_each_mem_cgroup() can't skip it.
+	 */
+	if (memcg_alloc_shrinker_maps(memcg)) {
+		mem_cgroup_id_remove(memcg);
+		return -ENOMEM;
+	}
+
 	/* Online state pins memcg ID, memcg ID pins CSS */
 	atomic_set(&memcg->id.ref, 1);
 	css_get(css);
@@ -4408,6 +4531,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
 	mem_cgroup_remove_from_trees(memcg);
+	memcg_free_shrinker_maps(memcg);
 	memcg_free_kmem(memcg);
 	mem_cgroup_free(memcg);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5cb4f779ea4a..db0970ba340d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -183,8 +183,14 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 	if (id < 0)
 		goto unlock;
 
-	if (id >= shrinker_nr_max)
+	if (id >= shrinker_nr_max) {
+		if (memcg_expand_shrinker_maps(id)) {
+			idr_remove(&shrinker_idr, id);
+			goto unlock;
+		}
+
 		shrinker_nr_max = id + 1;
+	}
 	shrinker->id = id;
 	ret = 0;
 unlock:
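
For reference, the map-growing step can be modeled in plain userspace C. This is a rough sketch only: it uses malloc/free instead of kvmalloc and call_rcu, and a bare unsigned long buffer instead of struct memcg_shrinker_map. It follows the same convention as memcg_expand_one_shrinker_map() above: the old part of the new map is filled with set bits and the new tail is zeroed.

/*
 * Userspace model of the expand step only -- no RCU and no per-node
 * loop. Mimics memcg_expand_one_shrinker_map(): the old portion of the
 * new map is filled with 0xff and the new tail is zeroed, so a new
 * shrinker id never starts out with a spuriously set bit.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long *expand_map(unsigned long *old, size_t old_size,
				 size_t new_size)
{
	unsigned long *new = malloc(new_size);

	if (!new)
		return NULL;

	/* Set all old bits, clear all new bits -- as in the patch. */
	memset(new, 0xff, old_size);
	memset((char *)new + old_size, 0, new_size - old_size);

	free(old);		/* the kernel defers this via call_rcu() */
	return new;
}

int main(void)
{
	size_t old_size = sizeof(unsigned long);	/* one word */
	size_t new_size = 2 * sizeof(unsigned long);	/* two words */
	unsigned long *map = calloc(1, old_size);
	unsigned long *bigger;

	if (!map)
		return 1;

	bigger = expand_map(map, old_size, new_size);
	if (bigger) {
		map = bigger;
		printf("word0 = %#lx, word1 = %#lx\n", map[0], map[1]);
	}
	free(map);
	return 0;
}

Filling the old part with set bits, rather than copying the previous contents, is the conservative choice: after an expansion every previously registered shrinker id is still treated as possibly charged for this memcg, and it is the follow-up patches in the series that narrow the set bits down to shrinkers that really hold charged objects.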