 include/linux/memcontrol.h |   2
 mm/memcontrol.c            | 207
 mm/slab_common.c           |  28
 3 files changed, 221 insertions(+), 16 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0b69a0470007..45085e14e023 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -447,6 +447,8 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s);
 void memcg_release_cache(struct kmem_cache *cachep);
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
+void memcg_update_array_size(int num_groups);
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
  * @gfp: the gfp allocation flags.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3eafe6cf6ca4..db38b60e5f87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 }
 
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
@@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * There are two main reasons for not using the css_id for this:
+ *  1) this works better in sparse environments, where we have a lot of memcgs,
+ *     but only a few kmem-limited. Or also, if we have, for instance, 200
+ *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ *     200 entry array for that.
+ *
+ *  2) In order not to violate the cgroup API, we would like to do all memory
+ *     allocation in ->create(). At that point, we haven't yet allocated the
+ *     css_id. Having a separate index prevents us from messing with the cgroup
+ *     core for this
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+static int memcg_limited_groups_array_size;
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * css_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE 65535
+
 struct static_key memcg_kmem_enabled_key;
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg))
+	if (memcg_kmem_is_active(memcg)) {
 		static_key_slow_dec(&memcg_kmem_enabled_key);
+		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
@@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+	int num, ret;
+
+	num = ida_simple_get(&kmem_limited_groups,
+				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (num < 0)
+		return num;
+	/*
+	 * After this point, kmem_accounted (that we test atomically in
+	 * the beginning of this conditional), is no longer 0. This
+	 * guarantees only one process will set the following boolean
+	 * to true. We don't need test_and_set because we're protected
+	 * by the set_limit_mutex anyway.
+	 */
+	memcg_kmem_set_activated(memcg);
+
+	ret = memcg_update_all_caches(num+1);
+	if (ret) {
+		ida_simple_remove(&kmem_limited_groups, num);
+		memcg_kmem_clear_activated(memcg);
+		return ret;
+	}
+
+	memcg->kmemcg_id = num;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+	mutex_init(&memcg->slab_caches_mutex);
+	return 0;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+	ssize_t size;
+	if (num_groups <= 0)
+		return 0;
+
+	size = 2 * num_groups;
+	if (size < MEMCG_CACHES_MIN_SIZE)
+		size = MEMCG_CACHES_MIN_SIZE;
+	else if (size > MEMCG_CACHES_MAX_SIZE)
+		size = MEMCG_CACHES_MAX_SIZE;
+
+	return size;
+}
+
+/*
+ * We should update the current array size iff all caches updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+	if (num > memcg_limited_groups_array_size)
+		memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+	struct memcg_cache_params *cur_params = s->memcg_params;
+
+	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+
+	if (num_groups > memcg_limited_groups_array_size) {
+		int i;
+		ssize_t size = memcg_caches_array_size(num_groups);
+
+		size *= sizeof(void *);
+		size += sizeof(struct memcg_cache_params);
+
+		s->memcg_params = kzalloc(size, GFP_KERNEL);
+		if (!s->memcg_params) {
+			s->memcg_params = cur_params;
+			return -ENOMEM;
+		}
+
+		s->memcg_params->is_root_cache = true;
+
+		/*
+		 * There is the chance it will be bigger than
+		 * memcg_limited_groups_array_size, if we failed an allocation
+		 * in a cache, in which case all caches updated before it, will
+		 * have a bigger array.
+		 *
+		 * But if that is the case, the data after
+		 * memcg_limited_groups_array_size is certainly unused
+		 */
+		for (i = 0; i < memcg_limited_groups_array_size; i++) {
+			if (!cur_params->memcg_caches[i])
+				continue;
+			s->memcg_params->memcg_caches[i] =
+					cur_params->memcg_caches[i];
+		}
+
+		/*
+		 * Ideally, we would wait until all caches succeed, and only
+		 * then free the old one. But this is not worth the extra
+		 * pointer per-cache we'd have to have for this.
+		 *
+		 * It is not a big deal if some caches are left with a size
+		 * bigger than the others. And all updates will reset this
+		 * anyway.
+		 */
+		kfree(cur_params);
+	}
+	return 0;
+}
+
 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 {
 	size_t size = sizeof(struct memcg_cache_params);
@@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 	if (!memcg_kmem_enabled())
 		return 0;
 
+	if (!memcg)
+		size += memcg_limited_groups_array_size * sizeof(void *);
+
 	s->memcg_params = kzalloc(size, GFP_KERNEL);
 	if (!s->memcg_params)
 		return -ENOMEM;
@@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 		ret = res_counter_set_limit(&memcg->kmem, val);
 		VM_BUG_ON(ret);
 
-		/*
-		 * After this point, kmem_accounted (that we test atomically in
-		 * the beginning of this conditional), is no longer 0. This
-		 * guarantees only one process will set the following boolean
-		 * to true. We don't need test_and_set because we're protected
-		 * by the set_limit_mutex anyway.
-		 */
-		memcg_kmem_set_activated(memcg);
+		ret = memcg_update_cache_sizes(memcg);
+		if (ret) {
+			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+			goto out;
+		}
 		must_inc_static_branch = true;
 		/*
 		 * kmem charges can outlive the cgroup. In the case of slab
@@ -4372,11 +4527,13 @@ out:
 	return ret;
 }
 
-static void memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
+	int ret = 0;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	if (!parent)
-		return;
+		goto out;
+
 	memcg->kmem_account_flags = parent->kmem_account_flags;
 #ifdef CONFIG_MEMCG_KMEM
 	/*
@@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg)
 	 * It is a lot simpler just to do static_key_slow_inc() on every child
 	 * that is accounted.
 	 */
-	if (memcg_kmem_is_active(memcg)) {
-		mem_cgroup_get(memcg);
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-	}
+	if (!memcg_kmem_is_active(memcg))
+		goto out;
+
+	/*
+	 * destroy(), called if we fail, will issue static_key_slow_inc() and
+	 * mem_cgroup_put() if kmem is enabled. We have to either call them
+	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+	 * this more consistent, since it always leads to the same destroy path
+	 */
+	mem_cgroup_get(memcg);
+	static_key_slow_inc(&memcg_kmem_enabled_key);
+
+	mutex_lock(&set_limit_mutex);
+	ret = memcg_update_cache_sizes(memcg);
+	mutex_unlock(&set_limit_mutex);
 #endif
+out:
+	return ret;
 }
 
 /*
@@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+	int ret;
+
 	memcg->kmemcg_id = -1;
-	memcg_propagate_kmem(memcg);
+	ret = memcg_propagate_kmem(memcg);
+	if (ret)
+		return ret;
 
 	return mem_cgroup_sockets_init(memcg, ss);
 };
@@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
+
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
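
The memcontrol.c side of the patch boils down to a growth policy for the per-memcg caches array: the array at least doubles whenever it must grow, is clamped between MEMCG_CACHES_MIN_SIZE and MEMCG_CACHES_MAX_SIZE, and a failed reallocation leaves the previous array in place. The user-space sketch below mirrors that policy with plain calloc()/memcpy(); struct cache_array, caches_array_size() and grow_cache_array() are illustrative stand-ins, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CACHES_MIN_SIZE 4
#define CACHES_MAX_SIZE 65535

/* Doubling-with-clamp policy, mirroring memcg_caches_array_size(). */
static size_t caches_array_size(int num_groups)
{
	size_t size;

	if (num_groups <= 0)
		return 0;

	size = 2 * (size_t)num_groups;
	if (size < CACHES_MIN_SIZE)
		size = CACHES_MIN_SIZE;
	else if (size > CACHES_MAX_SIZE)
		size = CACHES_MAX_SIZE;
	return size;
}

struct cache_array {
	size_t size;	/* slots currently allocated */
	void **entries;	/* one slot per kmem-limited group */
};

/* Grow the array so that num_groups entries fit; on allocation failure
 * the old array is kept, just as memcg_update_cache_size() falls back
 * to cur_params. */
static int grow_cache_array(struct cache_array *a, int num_groups)
{
	size_t new_size;
	void **new_entries;

	if ((size_t)num_groups <= a->size)
		return 0;	/* still fits, nothing to do */

	new_size = caches_array_size(num_groups);
	new_entries = calloc(new_size, sizeof(void *));
	if (!new_entries)
		return -1;	/* keep the old array */

	if (a->entries) {
		memcpy(new_entries, a->entries, a->size * sizeof(void *));
		free(a->entries);
	}
	a->entries = new_entries;
	a->size = new_size;
	return 0;
}

int main(void)
{
	struct cache_array a = { 0, NULL };

	grow_cache_array(&a, 1);	/* clamped up to 4 slots */
	printf("after 1 group:  %zu slots\n", a.size);
	grow_cache_array(&a, 6);	/* doubled to 12 slots */
	printf("after 6 groups: %zu slots\n", a.size);
	free(a.entries);
	return 0;
}
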
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3031badcc577..1c424b6511bf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -81,6 +81,34 @@ static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
 }
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_caches(int num_memcgs)
+{
+	struct kmem_cache *s;
+	int ret = 0;
+	mutex_lock(&slab_mutex);
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!is_root_cache(s))
+			continue;
+
+		ret = memcg_update_cache_size(s, num_memcgs);
+		/*
+		 * See comment in memcontrol.c, memcg_update_cache_size:
+		 * Instead of freeing the memory, we'll just leave the caches
+		 * up to this point in an updated state.
+		 */
+		if (ret)
+			goto out;
+	}
+
+	memcg_update_array_size(num_memcgs);
+out:
+	mutex_unlock(&slab_mutex);
+	return ret;
+}
+#endif
+
 /*
  * Figure out what the alignment of the objects will be given a set of
  * flags, a user specified alignment and the size of the objects.
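
Putting the two files together, the intended ordering is: memcg_update_cache_sizes() reserves an index from the IDA and marks the group activated, memcg_update_all_caches() grows every root cache under slab_mutex, and only if all of them succeed is the shared array size published; on failure the index and the activated bit are rolled back. The toy user-space sketch below shows that ordering and the rollback path; ida_get()/ida_put(), grow_one_cache(), update_all_caches() and activate_group() are invented stand-ins for the kernel primitives, and the failure flag exists only to demonstrate the error path.

#include <stdio.h>

#define MAX_IDS 16

static unsigned char id_used[MAX_IDS];	/* toy stand-in for the IDA */
static int published_array_size = 4;	/* analog of memcg_limited_groups_array_size */
static int fail_next_grow;		/* set to exercise the error path */

static int ida_get(void)
{
	int i;

	for (i = 0; i < MAX_IDS; i++) {
		if (!id_used[i]) {
			id_used[i] = 1;
			return i;
		}
	}
	return -1;
}

static void ida_put(int id)
{
	id_used[id] = 0;
}

/* Stand-in for memcg_update_cache_size() on one root cache. */
static int grow_one_cache(int num_groups)
{
	if (fail_next_grow || num_groups > MAX_IDS)
		return -1;
	return 0;
}

/* Stand-in for memcg_update_all_caches(): the shared size is published
 * only when every cache grew; caches updated before a failure simply
 * keep their larger, partly unused arrays.  The kernel additionally
 * doubles and clamps the published size (see the previous sketch). */
static int update_all_caches(int num_groups)
{
	int i, ret = 0;

	for (i = 0; i < 3; i++) {
		ret = grow_one_cache(num_groups);
		if (ret)
			goto out;
	}
	if (num_groups > published_array_size)
		published_array_size = num_groups;
out:
	return ret;
}

struct group {
	int active;
	int id;
};

/* Stand-in for memcg_update_cache_sizes(): reserve an id, mark the group
 * active, grow all caches, and undo both steps if the growth fails. */
static int activate_group(struct group *g)
{
	int id, ret;

	id = ida_get();
	if (id < 0)
		return id;

	g->active = 1;
	ret = update_all_caches(id + 1);
	if (ret) {
		ida_put(id);
		g->active = 0;
		return ret;
	}
	g->id = id;
	return 0;
}

int main(void)
{
	struct group g1 = { 0, -1 }, g2 = { 0, -1 };

	printf("activate g1: %d (published size %d)\n",
	       activate_group(&g1), published_array_size);

	fail_next_grow = 1;	/* force the rollback path for g2 */
	printf("activate g2: %d (published size %d)\n",
	       activate_group(&g2), published_array_size);
	return 0;
}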