author	Glauber Costa <glommer@parallels.com>	2012-12-18 17:22:40 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-18 18:02:14 -0500
commit	d7f25f8a2f81252d1ac134470ba1d0a287cf8fcd (patch)
tree	ecde8b5d98762e15a6fa1984d098ddf86646942b
parent	55007d849759252ddd573aeb36143b947202d509 (diff)
memcg: infrastructure to match an allocation to the right cache
The page allocator is able to bind a page to a memcg when it is allocated. But
for the caches, we'd like to have as many objects as possible in a page
belonging to the same cache.

This is done in this patch by calling memcg_kmem_get_cache in the beginning of
every allocation function. This function is patched out by static branches
when the kernel memory controller is not being used.

It assumes that the task allocating, which determines the memcg in the page
allocator, belongs to the same cgroup throughout the whole process.
Misaccounting can happen if the task calls memcg_kmem_get_cache() while
belonging to a cgroup, and later on changes. This is considered acceptable,
and should only happen upon task migration.

Before the cache is created by the memcg core, there is also a possible
imbalance: the task belongs to a memcg, but the cache being allocated from is
the global cache, since the child cache is not yet guaranteed to be ready.
This case is also fine, since in this case the GFP_KMEMCG will not be passed
and the page allocator will not attempt any cgroup accounting.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/memcontrol.h	41
-rw-r--r--	init/Kconfig	1
-rw-r--r--	mm/memcontrol.c	217
3 files changed, 258 insertions(+), 1 deletion(-)
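The commit message says memcg_kmem_get_cache() is meant to run at the beginning of every slab allocation function; the actual SLAB/SLUB hook-up is not part of this patch. As a rough sketch of the intended call pattern only, where kmem_alloc_accounted() and slab_alloc_from() are hypothetical names standing in for the allocator's internal entry points:

	/*
	 * Illustrative only: a slab entry point first swaps the global cache
	 * for the current task's per-memcg copy, then allocates as usual.
	 * When kmem accounting is unused, the static branch inside
	 * memcg_kmem_get_cache() reduces the first call to "return cachep".
	 */
	static __always_inline void *
	kmem_alloc_accounted(struct kmem_cache *cachep, gfp_t flags)
	{
		cachep = memcg_kmem_get_cache(cachep, flags);
		return slab_alloc_from(cachep, flags);	/* hypothetical helper */
	}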
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 45085e14e023..bd9b5d73bc2b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -449,6 +449,10 @@ void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
+
+struct kmem_cache *
+__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
  * @gfp: the gfp allocation flags.
@@ -518,6 +522,37 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
 	__memcg_kmem_commit_charge(page, memcg, order);
 }
 
+/**
+ * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
+ * @cachep: the original global kmem cache
+ * @gfp: allocation flags.
+ *
+ * This function assumes that the task allocating, which determines the memcg
+ * in the page allocator, belongs to the same cgroup throughout the whole
+ * process. Misaccounting can happen if the task calls memcg_kmem_get_cache()
+ * while belonging to a cgroup, and later on changes. This is considered
+ * acceptable, and should only happen upon task migration.
+ *
+ * Before the cache is created by the memcg core, there is also a possible
+ * imbalance: the task belongs to a memcg, but the cache being allocated from
+ * is the global cache, since the child cache is not yet guaranteed to be
+ * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
+ * passed and the page allocator will not attempt any cgroup accounting.
+ */
+static __always_inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+	if (!memcg_kmem_enabled())
+		return cachep;
+	if (gfp & __GFP_NOFAIL)
+		return cachep;
+	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
+		return cachep;
+	if (unlikely(fatal_signal_pending(current)))
+		return cachep;
+
+	return __memcg_kmem_get_cache(cachep, gfp);
+}
 #else
 static inline bool
 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
@@ -553,6 +588,12 @@ static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
 					     struct kmem_cache *s)
 {
 }
+
+static inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+	return cachep;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
 
diff --git a/init/Kconfig b/init/Kconfig
index 19ccb33c99d9..7d30240e5bfe 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -883,7 +883,6 @@ config MEMCG_KMEM
 	bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
 	depends on MEMCG && EXPERIMENTAL
 	depends on SLUB || SLAB
-	default n
 	help
 	  The Kernel Memory extension for Memory Resource Controller can limit
 	  the amount of memory used by kernel objects in the system. Those are
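The mm/memcontrol.c changes below implement the lookup itself: each root cache carries an array of per-memcg children indexed by the memcg's cache id; on a miss, the allocation falls back to the root cache and creation of the child is deferred to a workqueue. A compilable user-space model of that lookup-or-defer pattern (toy_cache, get_cache_for() and the other names here are illustrative, not the kernel API):

	#include <stdio.h>

	#define MAX_MEMCG_CACHES 4

	struct toy_cache {
		const char *name;
		/* per-memcg children, indexed by memcg cache id; NULL until created */
		struct toy_cache *memcg_caches[MAX_MEMCG_CACHES];
	};

	/* Stand-in for scheduling asynchronous cache creation on a workqueue. */
	static void enqueue_cache_creation(struct toy_cache *root, int id)
	{
		printf("deferring creation of a copy of %s for memcg %d\n",
		       root->name, id);
	}

	/*
	 * Mirrors the shape of __memcg_kmem_get_cache(): return the per-memcg
	 * child if it already exists, otherwise defer its creation and fall
	 * back to the root cache so the current allocation goes unaccounted.
	 */
	static struct toy_cache *get_cache_for(struct toy_cache *root, int id)
	{
		struct toy_cache *child = root->memcg_caches[id];

		if (child == NULL) {
			enqueue_cache_creation(root, id);
			return root;
		}
		return child;
	}

	int main(void)
	{
		struct toy_cache dentry_cache = { .name = "dentry" };
		struct toy_cache child = { .name = "dentry(2:foo)" };

		/* First request from memcg 2: miss, use the root cache for now. */
		printf("using %s\n", get_cache_for(&dentry_cache, 2)->name);

		/* Pretend the worker finished creating the per-memcg copy. */
		dentry_cache.memcg_caches[2] = &child;
		printf("using %s\n", get_cache_for(&dentry_cache, 2)->name);
		return 0;
	}

In the kernel version the read side takes no lock, which is why the real code pairs a write barrier at update time with read_barrier_depends() at lookup time.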
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db38b60e5f87..efd26620a60b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -588,7 +588,14 @@ static int memcg_limited_groups_array_size;
 #define MEMCG_CACHES_MIN_SIZE 4
 #define MEMCG_CACHES_MAX_SIZE 65535
 
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * conditional to this static branch, we'll have to allow modules that do
+ * kmem_cache_alloc and the like to see this symbol as well.
+ */
 struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
@@ -2989,9 +2996,219 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 
 void memcg_release_cache(struct kmem_cache *s)
 {
+	struct kmem_cache *root;
+	struct mem_cgroup *memcg;
+	int id;
+
+	/*
+	 * This happens, for instance, when a root cache goes away before we
+	 * add any memcg.
+	 */
+	if (!s->memcg_params)
+		return;
+
+	if (s->memcg_params->is_root_cache)
+		goto out;
+
+	memcg = s->memcg_params->memcg;
+	id = memcg_cache_id(memcg);
+
+	root = s->memcg_params->root_cache;
+	root->memcg_params->memcg_caches[id] = NULL;
+	mem_cgroup_put(memcg);
+
+	mutex_lock(&memcg->slab_caches_mutex);
+	list_del(&s->memcg_params->list);
+	mutex_unlock(&memcg->slab_caches_mutex);
+
+out:
 	kfree(s->memcg_params);
 }
 
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
+{
+	char *name;
+	struct dentry *dentry;
+
+	rcu_read_lock();
+	dentry = rcu_dereference(memcg->css.cgroup->dentry);
+	rcu_read_unlock();
+
+	BUG_ON(dentry == NULL);
+
+	name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
+			 memcg_cache_id(memcg), dentry->d_name.name);
+
+	return name;
+}
+
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
+					 struct kmem_cache *s)
+{
+	char *name;
+	struct kmem_cache *new;
+
+	name = memcg_cache_name(memcg, s);
+	if (!name)
+		return NULL;
+
+	new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+				      (s->flags & ~SLAB_PANIC), s->ctor);
+
+	kfree(name);
+	return new;
+}
+
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allows them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because concurrent allocations to the same cache may each
+ * spawn a creation worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+						  struct kmem_cache *cachep)
+{
+	struct kmem_cache *new_cachep;
+	int idx;
+
+	BUG_ON(!memcg_can_account_kmem(memcg));
+
+	idx = memcg_cache_id(memcg);
+
+	mutex_lock(&memcg_cache_mutex);
+	new_cachep = cachep->memcg_params->memcg_caches[idx];
+	if (new_cachep)
+		goto out;
+
+	new_cachep = kmem_cache_dup(memcg, cachep);
+
+	if (new_cachep == NULL) {
+		new_cachep = cachep;
+		goto out;
+	}
+
+	mem_cgroup_get(memcg);
+	new_cachep->memcg_params->root_cache = cachep;
+
+	cachep->memcg_params->memcg_caches[idx] = new_cachep;
+	/*
+	 * The readers won't lock; make sure everybody sees the updated value,
+	 * so they won't enqueue the cache creation again for no reason.
+	 */
+	wmb();
+out:
+	mutex_unlock(&memcg_cache_mutex);
+	return new_cachep;
+}
+
+struct create_work {
+	struct mem_cgroup *memcg;
+	struct kmem_cache *cachep;
+	struct work_struct work;
+};
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+	struct create_work *cw;
+
+	cw = container_of(w, struct create_work, work);
+	memcg_create_kmem_cache(cw->memcg, cw->cachep);
+	/* Drop the reference gotten when we enqueued. */
+	css_put(&cw->memcg->css);
+	kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ * Called with rcu_read_lock.
+ */
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+				       struct kmem_cache *cachep)
+{
+	struct create_work *cw;
+
+	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+	if (cw == NULL)
+		return;
+
+	/* The corresponding put will be done in the workqueue. */
+	if (!css_tryget(&memcg->css)) {
+		kfree(cw);
+		return;
+	}
+
+	cw->memcg = memcg;
+	cw->cachep = cachep;
+
+	INIT_WORK(&cw->work, memcg_create_cache_work_func);
+	schedule_work(&cw->work);
+}
+
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet and we are the first user of it, we either
+ * create it immediately, if possible, or create it asynchronously in a
+ * workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+					  gfp_t gfp)
+{
+	struct mem_cgroup *memcg;
+	int idx;
+
+	VM_BUG_ON(!cachep->memcg_params);
+	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+	rcu_read_unlock();
+
+	if (!memcg_can_account_kmem(memcg))
+		return cachep;
+
+	idx = memcg_cache_id(memcg);
+
+	/*
+	 * barrier to make sure we're always seeing the up-to-date value. The
+	 * code updating memcg_caches will issue a write barrier to match this.
+	 */
+	read_barrier_depends();
+	if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
+		/*
+		 * If we are in a safe context (can wait, and not in interrupt
+		 * context), we could be predictable and return right away.
+		 * This would guarantee that the allocation being performed
+		 * already belongs in the new cache.
+		 *
+		 * However, there are some clashes that can arise from locking.
+		 * For instance, because we acquire the slab_mutex while doing
+		 * kmem_cache_dup, no further allocation could happen with the
+		 * slab_mutex held.
+		 *
+		 * Also, because cache creation issues get_online_cpus(), this
+		 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+		 * which ends up reversed during cpu hotplug. (cpuset allocates
+		 * a bunch of GFP_KERNEL memory during cpu-up). Due to all
+		 * that, it is better to defer everything.
+		 */
+		memcg_create_cache_enqueue(memcg, cachep);
+		return cachep;
+	}
+
+	return cachep->memcg_params->memcg_caches[idx];
+}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll