author    Johannes Weiner <hannes@cmpxchg.org>  2016-01-20 18:02:53 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-01-20 20:09:18 -0500
commit    0b8f73e104285a4badf9d768d1c39b06d77d1f97 (patch)
tree      ad6b42d0f49d108caa296f7f7e76a6e00ec9d136 /mm
parent    0db1529817b7b16226421f01470c5ba982c5f302 (diff)
mm: memcontrol: clean up alloc, online, offline, free functions
The creation and teardown of struct mem_cgroup is fairly messy and
that has attracted mistakes and subtle bugs before.

The main cause for this is that there is no clear model about what
needs to happen when, and that attracts more chaos. So create one:

1. mem_cgroup_alloc() should allocate struct mem_cgroup and its
   auxiliary members and initialize work items, locks etc. so that the
   object it returns is fully initialized and in a neutral state.

2. mem_cgroup_css_alloc() will use mem_cgroup_alloc() to obtain a new
   memcg object and configure it and the system according to the role
   of the new memory-controlled cgroup in the hierarchy.

3. mem_cgroup_css_online() is no longer needed to synchronize with
   iterators, but it verifies css->id which isn't available earlier.

4. mem_cgroup_css_offline() implements stuff that needs to happen upon
   the user-visible destruction of a cgroup, which includes stopping
   all user interfacing as well as releasing certain structures when
   continued memory consumption would be unexpected at that point.

5. mem_cgroup_css_free() prepares the system and the memcg object for
   the object's disappearance, neutralizes its state, and then gives
   it back to mem_cgroup_free().

6. mem_cgroup_free() releases struct mem_cgroup and auxiliary memory.

[arnd@arndb.de: fix SLOB build regression]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
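The point of the model is that each setup step has exactly one mirror-image
teardown step: alloc/free own the raw object, css_alloc/css_free own its
hierarchy role, online/offline bracket the user-visible lifetime. The
following standalone C sketch illustrates that pairing with a toy struct;
all names and fields are invented for illustration and are not the kernel
code:

#include <stdio.h>
#include <stdlib.h>

struct toy_cgroup {
	int id;
	int *stats;	/* stands in for the per-cpu statistics */
	int online;
};

/* Steps 1/6: alloc/free own the object and its auxiliary memory. */
static struct toy_cgroup *toy_alloc(void)
{
	struct toy_cgroup *cg = calloc(1, sizeof(*cg));

	if (!cg)
		return NULL;
	cg->stats = calloc(16, sizeof(int));
	if (!cg->stats) {
		free(cg);
		return NULL;
	}
	return cg;	/* fully initialized, neutral state */
}

static void toy_free(struct toy_cgroup *cg)
{
	free(cg->stats);
	free(cg);
}

/* Steps 2/5: css_alloc/css_free configure the hierarchy role. */
static struct toy_cgroup *toy_css_alloc(int parent_id)
{
	struct toy_cgroup *cg = toy_alloc();

	if (!cg)
		return NULL;
	cg->id = parent_id + 1;	/* derive configuration from the parent */
	return cg;
}

static void toy_css_free(struct toy_cgroup *cg)
{
	cg->id = 0;	/* neutralize state, then hand back to free */
	toy_free(cg);
}

/* Steps 3/4: online/offline bracket the user-visible lifetime. */
static void toy_css_online(struct toy_cgroup *cg)  { cg->online = 1; }
static void toy_css_offline(struct toy_cgroup *cg) { cg->online = 0; }

int main(void)
{
	struct toy_cgroup *cg = toy_css_alloc(41);

	if (!cg)
		return 1;
	toy_css_online(cg);
	printf("id=%d online=%d\n", cg->id, cg->online);
	toy_css_offline(cg);
	toy_css_free(cg);
	return 0;
}

Because every constructor has a matching destructor at the same layer, an
error in css_alloc() can simply unwind through free without any special
cases, which is exactly what the patch below exploits in its fail: paths.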
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c  257
1 file changed, 84 insertions(+), 173 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6937f16f5ecb..f6bc78f4ed13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,13 +250,6 @@ enum res_type {
 /* Used for OOM nofiier */
 #define OOM_CONTROL		(0)
 
-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -899,17 +892,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		if (css == &root->css)
 			break;
 
-		if (css_tryget(css)) {
-			/*
-			 * Make sure the memcg is initialized:
-			 * mem_cgroup_css_online() orders the the
-			 * initialization against setting the flag.
-			 */
-			if (smp_load_acquire(&memcg->initialized))
-				break;
-
-			css_put(css);
-		}
+		if (css_tryget(css))
+			break;
 
 		memcg = NULL;
 	}
@@ -2690,14 +2674,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
 {
 	bool ret;
 
-	/*
-	 * The lock does not prevent addition or deletion of children, but
-	 * it prevents a new child from being initialized based on this
-	 * parent in css_online(), so it's enough to decide whether
-	 * hierarchically inherited attributes can still be changed or not.
-	 */
-	lockdep_assert_held(&memcg_create_mutex);
-
 	rcu_read_lock();
 	ret = css_next_child(NULL, &memcg->css);
 	rcu_read_unlock();
@@ -2760,10 +2736,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
 
-	mutex_lock(&memcg_create_mutex);
-
 	if (memcg->use_hierarchy == val)
-		goto out;
+		return 0;
 
 	/*
 	 * If parent's use_hierarchy is set, we can't make any modifications
@@ -2782,9 +2756,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	} else
 		retval = -EINVAL;
 
-out:
-	mutex_unlock(&memcg_create_mutex);
-
 	return retval;
 }
 
@@ -2872,37 +2843,14 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 #ifndef CONFIG_SLOB
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
-	int err = 0;
 	int memcg_id;
 
 	BUG_ON(memcg->kmemcg_id >= 0);
 	BUG_ON(memcg->kmem_state);
 
-	/*
-	 * For simplicity, we won't allow this to be disabled. It also can't
-	 * be changed if the cgroup has children already, or if tasks had
-	 * already joined.
-	 *
-	 * If tasks join before we set the limit, a person looking at
-	 * kmem.usage_in_bytes will have no way to determine when it took
-	 * place, which makes the value quite meaningless.
-	 *
-	 * After it first became limited, changes in the value of the limit are
-	 * of course permitted.
-	 */
-	mutex_lock(&memcg_create_mutex);
-	if (cgroup_is_populated(memcg->css.cgroup) ||
-	    (memcg->use_hierarchy && memcg_has_children(memcg)))
-		err = -EBUSY;
-	mutex_unlock(&memcg_create_mutex);
-	if (err)
-		goto out;
-
 	memcg_id = memcg_alloc_cache_id();
-	if (memcg_id < 0) {
-		err = memcg_id;
-		goto out;
-	}
+	if (memcg_id < 0)
+		return memcg_id;
 
 	static_branch_inc(&memcg_kmem_enabled_key);
 	/*
@@ -2913,17 +2861,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 	 */
 	memcg->kmemcg_id = memcg_id;
 	memcg->kmem_state = KMEM_ONLINE;
-out:
-	return err;
+
+	return 0;
 }
 
-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *parent,
+				struct mem_cgroup *memcg)
 {
 	int ret = 0;
-	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-
-	if (!parent)
-		return 0;
 
 	mutex_lock(&memcg_limit_mutex);
 	/*
@@ -2985,6 +2930,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 
 static void memcg_free_kmem(struct mem_cgroup *memcg)
 {
+	/* css_alloc() failed, offlining didn't happen */
+	if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+		memcg_offline_kmem(memcg);
+
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
 		memcg_destroy_kmem_caches(memcg);
 		static_branch_dec(&memcg_kmem_enabled_key);
@@ -2992,7 +2941,11 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 	}
 }
 #else
-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
+{
+	return 0;
+}
+static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
 	return 0;
 }
@@ -3007,11 +2960,16 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
 				   unsigned long limit)
 {
-	int ret;
+	int ret = 0;
 
 	mutex_lock(&memcg_limit_mutex);
 	/* Top-level cgroup doesn't propagate from root */
 	if (!memcg_kmem_online(memcg)) {
+		if (cgroup_is_populated(memcg->css.cgroup) ||
+		    (memcg->use_hierarchy && memcg_has_children(memcg)))
+			ret = -EBUSY;
+		if (ret)
+			goto out;
 		ret = memcg_online_kmem(memcg);
 		if (ret)
 			goto out;
@@ -4167,90 +4125,44 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	kfree(memcg->nodeinfo[node]);
 }
 
-static struct mem_cgroup *mem_cgroup_alloc(void)
-{
-	struct mem_cgroup *memcg;
-	size_t size;
-
-	size = sizeof(struct mem_cgroup);
-	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
-	memcg = kzalloc(size, GFP_KERNEL);
-	if (!memcg)
-		return NULL;
-
-	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!memcg->stat)
-		goto out_free;
-
-	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
-		goto out_free_stat;
-
-	return memcg;
-
-out_free_stat:
-	free_percpu(memcg->stat);
-out_free:
-	kfree(memcg);
-	return NULL;
-}
-
-/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
- */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 
-	cancel_work_sync(&memcg->high_work);
-
-	mem_cgroup_remove_from_trees(memcg);
-
+	memcg_wb_domain_exit(memcg);
 	for_each_node(node)
 		free_mem_cgroup_per_zone_info(memcg, node);
-
 	free_percpu(memcg->stat);
-	memcg_wb_domain_exit(memcg);
 	kfree(memcg);
 }
 
-static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
-	long error = -ENOMEM;
+	size_t size;
 	int node;
 
-	memcg = mem_cgroup_alloc();
+	size = sizeof(struct mem_cgroup);
+	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+	memcg = kzalloc(size, GFP_KERNEL);
 	if (!memcg)
-		return ERR_PTR(error);
+		return NULL;
+
+	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg->stat)
+		goto fail;
 
 	for_each_node(node)
 		if (alloc_mem_cgroup_per_zone_info(memcg, node))
-			goto free_out;
+			goto fail;
 
-	/* root ? */
-	if (parent_css == NULL) {
-		root_mem_cgroup = memcg;
-		page_counter_init(&memcg->memory, NULL);
-		memcg->high = PAGE_COUNTER_MAX;
-		memcg->soft_limit = PAGE_COUNTER_MAX;
-		page_counter_init(&memcg->memsw, NULL);
-		page_counter_init(&memcg->kmem, NULL);
-	}
+	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+		goto fail;
 
 	INIT_WORK(&memcg->high_work, high_work_func);
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
@@ -4263,48 +4175,37 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&memcg->cgwb_list);
 #endif
-	return &memcg->css;
-
-free_out:
-	__mem_cgroup_free(memcg);
-	return ERR_PTR(error);
+	return memcg;
+fail:
+	mem_cgroup_free(memcg);
+	return NULL;
 }
 
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
-	int ret;
-
-	if (css->id > MEM_CGROUP_ID_MAX)
-		return -ENOSPC;
-
-	if (!parent)
-		return 0;
-
-	mutex_lock(&memcg_create_mutex);
+	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+	struct mem_cgroup *memcg;
+	long error = -ENOMEM;
 
-	memcg->use_hierarchy = parent->use_hierarchy;
-	memcg->oom_kill_disable = parent->oom_kill_disable;
-	memcg->swappiness = mem_cgroup_swappiness(parent);
+	memcg = mem_cgroup_alloc();
+	if (!memcg)
+		return ERR_PTR(error);
 
-	if (parent->use_hierarchy) {
+	memcg->high = PAGE_COUNTER_MAX;
+	memcg->soft_limit = PAGE_COUNTER_MAX;
+	if (parent) {
+		memcg->swappiness = mem_cgroup_swappiness(parent);
+		memcg->oom_kill_disable = parent->oom_kill_disable;
+	}
+	if (parent && parent->use_hierarchy) {
+		memcg->use_hierarchy = true;
 		page_counter_init(&memcg->memory, &parent->memory);
-		memcg->high = PAGE_COUNTER_MAX;
-		memcg->soft_limit = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, &parent->memsw);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
-
-		/*
-		 * No need to take a reference to the parent because cgroup
-		 * core guarantees its existence.
-		 */
 	} else {
 		page_counter_init(&memcg->memory, NULL);
-		memcg->high = PAGE_COUNTER_MAX;
-		memcg->soft_limit = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
@@ -4316,21 +4217,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 		if (parent != root_mem_cgroup)
 			memory_cgrp_subsys.broken_hierarchy = true;
 	}
-	mutex_unlock(&memcg_create_mutex);
 
-	ret = memcg_propagate_kmem(memcg);
-	if (ret)
-		return ret;
+	/* The following stuff does not apply to the root */
+	if (!parent) {
+		root_mem_cgroup = memcg;
+		return &memcg->css;
+	}
+
+	error = memcg_propagate_kmem(parent, memcg);
+	if (error)
+		goto fail;
 
 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
 		static_branch_inc(&memcg_sockets_enabled_key);
 
-	/*
-	 * Make sure the memcg is initialized: mem_cgroup_iter()
-	 * orders reading memcg->initialized against its callers
-	 * reading the memcg members.
-	 */
-	smp_store_release(&memcg->initialized, 1);
+	return &memcg->css;
+fail:
+	mem_cgroup_free(memcg);
+	return NULL;
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+	if (css->id > MEM_CGROUP_ID_MAX)
+		return -ENOSPC;
 
 	return 0;
 }
@@ -4352,10 +4263,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	vmpressure_cleanup(&memcg->vmpressure);
-
 	memcg_offline_kmem(memcg);
-
 	wb_memcg_offline(memcg);
 }
 
@@ -4376,8 +4284,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
 		static_branch_dec(&memcg_sockets_enabled_key);
 
+	vmpressure_cleanup(&memcg->vmpressure);
+	cancel_work_sync(&memcg->high_work);
+	mem_cgroup_remove_from_trees(memcg);
 	memcg_free_kmem(memcg);
-	__mem_cgroup_free(memcg);
+	mem_cgroup_free(memcg);
 }
 
 /**