about summary refs log tree commit diff stats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- mm/memcontrol.c 207
1 files changed, 191 insertions, 16 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3eafe6cf6ca4..db38b60e5f87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); 378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379} 379}
380 380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
381static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
382{ 387{
383 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
@@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
549#endif 554#endif
550 555
551#ifdef CONFIG_MEMCG_KMEM 556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
561 * but only a few are kmem-limited. For instance, if we have 200 memcgs
562 * and only the 200th is kmem-limited, indexing by css_id would require
563 * a 200-entry array just for that one group.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this.
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575static int memcg_limited_groups_array_size;
576/*
577 * MIN_SIZE is different from 1, because we would like to avoid going through
578 * the alloc/free process all the time. In a small machine, 4 kmem-limited
579 * cgroups is a reasonable guess. In the future, it could be a parameter or
580 * tunable, but that is not strictly necessary.
581 *
582 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
583 * this constant directly from cgroup, but it is understandable that this is
584 * better kept as an internal representation in cgroup.c. In any case, the
585 * css_id space is not getting any smaller, and we don't have to necessarily
586 * increase ours as well if it increases.
587 */
588#define MEMCG_CACHES_MIN_SIZE 4
589#define MEMCG_CACHES_MAX_SIZE 65535
590
552struct static_key memcg_kmem_enabled_key; 591struct static_key memcg_kmem_enabled_key;
553 592
554static void disarm_kmem_keys(struct mem_cgroup *memcg) 593static void disarm_kmem_keys(struct mem_cgroup *memcg)
555{ 594{
556 if (memcg_kmem_is_active(memcg)) 595 if (memcg_kmem_is_active(memcg)) {
557 static_key_slow_dec(&memcg_kmem_enabled_key); 596 static_key_slow_dec(&memcg_kmem_enabled_key);
597 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
598 }
558 /* 599 /*
559 * This check can't live in kmem destruction function, 600 * This check can't live in kmem destruction function,
560 * since the charges will outlive the cgroup 601 * since the charges will outlive the cgroup
@@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg)
2813 return memcg ? memcg->kmemcg_id : -1; 2854 return memcg ? memcg->kmemcg_id : -1;
2814} 2855}
2815 2856
2857/*
2858 * This ends up being protected by the set_limit mutex, during normal
2859 * operation, because that is its main call site.
2860 *
2861 * But when we set up a new memcg, we can call this as well if its parent
2862 * is kmem-limited. That path has to hold set_limit_mutex as well.
2863 */
2864int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2865{
2866 int num, ret;
2867
2868 num = ida_simple_get(&kmem_limited_groups,
2869 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2870 if (num < 0)
2871 return num;
2872 /*
2873 * After this point, kmem_accounted (that we test atomically in
2874 * the beginning of this conditional), is no longer 0. This
2875 * guarantees only one process will set the following boolean
2876 * to true. We don't need test_and_set because we're protected
2877 * by the set_limit_mutex anyway.
2878 */
2879 memcg_kmem_set_activated(memcg);
2880
2881 ret = memcg_update_all_caches(num+1);
2882 if (ret) {
2883 ida_simple_remove(&kmem_limited_groups, num);
2884 memcg_kmem_clear_activated(memcg);
2885 return ret;
2886 }
2887
2888 memcg->kmemcg_id = num;
2889 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
2890 mutex_init(&memcg->slab_caches_mutex);
2891 return 0;
2892}
2893
2894static size_t memcg_caches_array_size(int num_groups)
2895{
2896 ssize_t size;
2897 if (num_groups <= 0)
2898 return 0;
2899
2900 size = 2 * num_groups;
2901 if (size < MEMCG_CACHES_MIN_SIZE)
2902 size = MEMCG_CACHES_MIN_SIZE;
2903 else if (size > MEMCG_CACHES_MAX_SIZE)
2904 size = MEMCG_CACHES_MAX_SIZE;
2905
2906 return size;
2907}
2908
2909/*
2910 * We should update the current array size iff all cache updates succeed. This
2911 * can only be done from the slab side. The slab mutex needs to be held when
2912 * calling this.
2913 */
2914void memcg_update_array_size(int num)
2915{
2916 if (num > memcg_limited_groups_array_size)
2917 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2918}
2919
2920int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2921{
2922 struct memcg_cache_params *cur_params = s->memcg_params;
2923
2924 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
2925
2926 if (num_groups > memcg_limited_groups_array_size) {
2927 int i;
2928 ssize_t size = memcg_caches_array_size(num_groups);
2929
2930 size *= sizeof(void *);
2931 size += sizeof(struct memcg_cache_params);
2932
2933 s->memcg_params = kzalloc(size, GFP_KERNEL);
2934 if (!s->memcg_params) {
2935 s->memcg_params = cur_params;
2936 return -ENOMEM;
2937 }
2938
2939 s->memcg_params->is_root_cache = true;
2940
2941 /*
2942 * There is a chance it will be bigger than
2943 * memcg_limited_groups_array_size: if we failed an allocation
2944 * in a cache, all caches updated before it will
2945 * have a bigger array.
2946 *
2947 * But if that is the case, the data after
2948 * memcg_limited_groups_array_size is certainly unused.
2949 */
2950 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2951 if (!cur_params->memcg_caches[i])
2952 continue;
2953 s->memcg_params->memcg_caches[i] =
2954 cur_params->memcg_caches[i];
2955 }
2956
2957 /*
2958 * Ideally, we would wait until all caches succeed, and only
2959 * then free the old one. But this is not worth the extra
2960 * pointer per-cache we'd have to have for this.
2961 *
2962 * It is not a big deal if some caches are left with a size
2963 * bigger than the others. And all updates will reset this
2964 * anyway.
2965 */
2966 kfree(cur_params);
2967 }
2968 return 0;
2969}
2970
2816int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) 2971int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
2817{ 2972{
2818 size_t size = sizeof(struct memcg_cache_params); 2973 size_t size = sizeof(struct memcg_cache_params);
@@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
2820 if (!memcg_kmem_enabled()) 2975 if (!memcg_kmem_enabled())
2821 return 0; 2976 return 0;
2822 2977
2978 if (!memcg)
2979 size += memcg_limited_groups_array_size * sizeof(void *);
2980
2823 s->memcg_params = kzalloc(size, GFP_KERNEL); 2981 s->memcg_params = kzalloc(size, GFP_KERNEL);
2824 if (!s->memcg_params) 2982 if (!s->memcg_params)
2825 return -ENOMEM; 2983 return -ENOMEM;
@@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4326 ret = res_counter_set_limit(&memcg->kmem, val); 4484 ret = res_counter_set_limit(&memcg->kmem, val);
4327 VM_BUG_ON(ret); 4485 VM_BUG_ON(ret);
4328 4486
4329 /* 4487 ret = memcg_update_cache_sizes(memcg);
4330 * After this point, kmem_accounted (that we test atomically in 4488 if (ret) {
4331 * the beginning of this conditional), is no longer 0. This 4489 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4332 * guarantees only one process will set the following boolean 4490 goto out;
4333 * to true. We don't need test_and_set because we're protected 4491 }
4334 * by the set_limit_mutex anyway.
4335 */
4336 memcg_kmem_set_activated(memcg);
4337 must_inc_static_branch = true; 4492 must_inc_static_branch = true;
4338 /* 4493 /*
4339 * kmem charges can outlive the cgroup. In the case of slab 4494 * kmem charges can outlive the cgroup. In the case of slab
@@ -4372,11 +4527,13 @@ out:
4372 return ret; 4527 return ret;
4373} 4528}
4374 4529
4375static void memcg_propagate_kmem(struct mem_cgroup *memcg) 4530static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4376{ 4531{
4532 int ret = 0;
4377 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4533 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4378 if (!parent) 4534 if (!parent)
4379 return; 4535 goto out;
4536
4380 memcg->kmem_account_flags = parent->kmem_account_flags; 4537 memcg->kmem_account_flags = parent->kmem_account_flags;
4381#ifdef CONFIG_MEMCG_KMEM 4538#ifdef CONFIG_MEMCG_KMEM
4382 /* 4539 /*
@@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg)
4389 * It is a lot simpler just to do static_key_slow_inc() on every child 4546 * It is a lot simpler just to do static_key_slow_inc() on every child
4390 * that is accounted. 4547 * that is accounted.
4391 */ 4548 */
4392 if (memcg_kmem_is_active(memcg)) { 4549 if (!memcg_kmem_is_active(memcg))
4393 mem_cgroup_get(memcg); 4550 goto out;
4394 static_key_slow_inc(&memcg_kmem_enabled_key); 4551
4395 } 4552 /*
4553 * destroy(), called if we fail, will issue static_key_slow_dec() and
4554 * mem_cgroup_put() if kmem is enabled. We have to either do the matching
4555 * get/inc before anything can fail, or clear the KMEM_ACTIVE flag. I find
4556 * this more consistent, since it always leads to the same destroy path.
4557 */
4558 mem_cgroup_get(memcg);
4559 static_key_slow_inc(&memcg_kmem_enabled_key);
4560
4561 mutex_lock(&set_limit_mutex);
4562 ret = memcg_update_cache_sizes(memcg);
4563 mutex_unlock(&set_limit_mutex);
4396#endif 4564#endif
4565out:
4566 return ret;
4397} 4567}
4398 4568
4399/* 4569/*
@@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5075#ifdef CONFIG_MEMCG_KMEM 5245#ifdef CONFIG_MEMCG_KMEM
5076static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5246static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5077{ 5247{
5248 int ret;
5249
5078 memcg->kmemcg_id = -1; 5250 memcg->kmemcg_id = -1;
5079 memcg_propagate_kmem(memcg); 5251 ret = memcg_propagate_kmem(memcg);
5252 if (ret)
5253 return ret;
5080 5254
5081 return mem_cgroup_sockets_init(memcg, ss); 5255 return mem_cgroup_sockets_init(memcg, ss);
5082}; 5256};
@@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
5479 res_counter_init(&memcg->res, &parent->res); 5653 res_counter_init(&memcg->res, &parent->res);
5480 res_counter_init(&memcg->memsw, &parent->memsw); 5654 res_counter_init(&memcg->memsw, &parent->memsw);
5481 res_counter_init(&memcg->kmem, &parent->kmem); 5655 res_counter_init(&memcg->kmem, &parent->kmem);
5656
5482 /* 5657 /*
5483 * We increment refcnt of the parent to ensure that we can 5658 * We increment refcnt of the parent to ensure that we can
5484 * safely access it on res_counter_charge/uncharge. 5659 * safely access it on res_counter_charge/uncharge.