-rw-r--r--   include/linux/memcontrol.h |  41
-rw-r--r--   init/Kconfig                |   1
-rw-r--r--   mm/memcontrol.c             | 217
3 files changed, 258 insertions(+), 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 45085e14e023..bd9b5d73bc2b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -449,6 +449,10 @@ void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
+
+struct kmem_cache *
+__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
  * @gfp: the gfp allocation flags.
@@ -518,6 +522,37 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
 	__memcg_kmem_commit_charge(page, memcg, order);
 }
 
+/**
+ * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
+ * @cachep: the original global kmem cache
+ * @gfp: allocation flags.
+ *
+ * This function assumes that the task allocating, which determines the memcg
+ * in the page allocator, belongs to the same cgroup throughout the whole
+ * process. Misaccounting can happen if the task calls memcg_kmem_get_cache()
+ * while belonging to a cgroup, and later on changes. This is considered
+ * acceptable, and should only happen upon task migration.
+ *
+ * Before the cache is created by the memcg core, there is also a possible
+ * imbalance: the task belongs to a memcg, but the cache being allocated from
+ * is the global cache, since the child cache is not yet guaranteed to be
+ * ready. This case is also fine, since in this case the GFP_KMEMCG flag will
+ * not be passed and the page allocator will not attempt any cgroup accounting.
+ */
+static __always_inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+        if (!memcg_kmem_enabled())
+                return cachep;
+        if (gfp & __GFP_NOFAIL)
+                return cachep;
+        if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
+                return cachep;
+        if (unlikely(fatal_signal_pending(current)))
+                return cachep;
+
+        return __memcg_kmem_get_cache(cachep, gfp);
+}
 #else
 static inline bool
 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
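The memcg_kmem_get_cache() wrapper added above is the per-allocation fast path: it bails out to the original cache whenever kmem accounting is off or the calling context is unsafe, and only then drops into __memcg_kmem_get_cache(). As a hedged sketch of how a slab allocator is expected to use it (charged_cache_alloc() and slab_alloc_from() are hypothetical names for this illustration, not part of the patch; the real hook sits inside the SLAB/SLUB allocation paths):

#include <linux/slab.h>
#include <linux/memcontrol.h>

/* Hypothetical low-level entry point standing in for the allocator internals. */
void *slab_alloc_from(struct kmem_cache *s, gfp_t gfp);

/*
 * Sketch: every allocation first asks memcg_kmem_get_cache() which cache to
 * use. It returns either the caller's cache or the current task's per-memcg
 * clone of it, so the object is charged to the right group automatically.
 */
static void *charged_cache_alloc(struct kmem_cache *cachep, gfp_t gfp)
{
        struct kmem_cache *s = memcg_kmem_get_cache(cachep, gfp);

        return slab_alloc_from(s, gfp);
}

Because the wrapper may return the original cache at any time (accounting disabled, interrupt context, per-memcg clone not created yet), callers need no slow path of their own.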
@@ -553,6 +588,12 @@ static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
                                         struct kmem_cache *s)
 {
 }
+
+static inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+        return cachep;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
 
diff --git a/init/Kconfig b/init/Kconfig
index 19ccb33c99d9..7d30240e5bfe 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -883,7 +883,6 @@ config MEMCG_KMEM
 	bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
 	depends on MEMCG && EXPERIMENTAL
 	depends on SLUB || SLAB
-	default n
 	help
 	  The Kernel Memory extension for Memory Resource Controller can limit
 	  the amount of memory used by kernel objects in the system. Those are
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db38b60e5f87..efd26620a60b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -588,7 +588,14 @@ static int memcg_limited_groups_array_size;
 #define MEMCG_CACHES_MIN_SIZE 4
 #define MEMCG_CACHES_MAX_SIZE 65535
 
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * conditional on this static branch, we have to allow modules that do
+ * kmem_cache_alloc and the like to see this symbol as well.
+ */
 struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
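The comment above explains why memcg_kmem_enabled_key is now exported: memcg_kmem_enabled() is an inline, static-branch test that modules end up compiling into their own kmem_cache_alloc() call sites. As a hedged sketch of that gate in terms of the jump-label API of this kernel generation (the actual definition lives in memcontrol.h and may differ in detail):

#include <linux/jump_label.h>

extern struct static_key memcg_kmem_enabled_key;

/*
 * Sketch only: static_key_false() compiles to a single patched no-op until the
 * key is incremented (when some cgroup turns on kmem accounting), so the check
 * costs essentially nothing while the feature is unused.
 */
static inline bool memcg_kmem_enabled(void)
{
        return static_key_false(&memcg_kmem_enabled_key);
}

Exporting the key itself, rather than an out-of-line helper, keeps the test inlinable in modules while still letting the memcg core flip it at runtime.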
@@ -2989,9 +2996,219 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 
 void memcg_release_cache(struct kmem_cache *s)
 {
+        struct kmem_cache *root;
+        struct mem_cgroup *memcg;
+        int id;
+
+        /*
+         * This happens, for instance, when a root cache goes away before we
+         * add any memcg.
+         */
+        if (!s->memcg_params)
+                return;
+
+        if (s->memcg_params->is_root_cache)
+                goto out;
+
+        memcg = s->memcg_params->memcg;
+        id = memcg_cache_id(memcg);
+
+        root = s->memcg_params->root_cache;
+        root->memcg_params->memcg_caches[id] = NULL;
+        mem_cgroup_put(memcg);
+
+        mutex_lock(&memcg->slab_caches_mutex);
+        list_del(&s->memcg_params->list);
+        mutex_unlock(&memcg->slab_caches_mutex);
+
+out:
 	kfree(s->memcg_params);
 }
 
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
+{
+        char *name;
+        struct dentry *dentry;
+
+        rcu_read_lock();
+        dentry = rcu_dereference(memcg->css.cgroup->dentry);
+        rcu_read_unlock();
+
+        BUG_ON(dentry == NULL);
+
+        name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
+                         memcg_cache_id(memcg), dentry->d_name.name);
+
+        return name;
+}
+
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
+                                         struct kmem_cache *s)
+{
+        char *name;
+        struct kmem_cache *new;
+
+        name = memcg_cache_name(memcg, s);
+        if (!name)
+                return NULL;
+
+        new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+                                      (s->flags & ~SLAB_PANIC), s->ctor);
+
+        kfree(name);
+        return new;
+}
+
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allows them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same, not yet created,
+ * cache may span more than one worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+                                                  struct kmem_cache *cachep)
+{
+        struct kmem_cache *new_cachep;
+        int idx;
+
+        BUG_ON(!memcg_can_account_kmem(memcg));
+
+        idx = memcg_cache_id(memcg);
+
+        mutex_lock(&memcg_cache_mutex);
+        new_cachep = cachep->memcg_params->memcg_caches[idx];
+        if (new_cachep)
+                goto out;
+
+        new_cachep = kmem_cache_dup(memcg, cachep);
+
+        if (new_cachep == NULL) {
+                new_cachep = cachep;
+                goto out;
+        }
+
+        mem_cgroup_get(memcg);
+        new_cachep->memcg_params->root_cache = cachep;
+
+        cachep->memcg_params->memcg_caches[idx] = new_cachep;
+        /*
+         * The readers won't lock; make sure everybody sees the updated value,
+         * so they won't put stuff in the queue again for no reason.
+         */
+        wmb();
+out:
+        mutex_unlock(&memcg_cache_mutex);
+        return new_cachep;
+}
+
+struct create_work {
+        struct mem_cgroup *memcg;
+        struct kmem_cache *cachep;
+        struct work_struct work;
+};
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+        struct create_work *cw;
+
+        cw = container_of(w, struct create_work, work);
+        memcg_create_kmem_cache(cw->memcg, cw->cachep);
+        /* Drop the reference gotten when we enqueued. */
+        css_put(&cw->memcg->css);
+        kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ * Called with rcu_read_lock() held.
+ */
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+                                       struct kmem_cache *cachep)
+{
+        struct create_work *cw;
+
+        cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+        if (cw == NULL)
+                return;
+
+        /* The corresponding put will be done in the workqueue. */
+        if (!css_tryget(&memcg->css)) {
+                kfree(cw);
+                return;
+        }
+
+        cw->memcg = memcg;
+        cw->cachep = cachep;
+
+        INIT_WORK(&cw->work, memcg_create_cache_work_func);
+        schedule_work(&cw->work);
+}
+
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet and we are the first user of it,
+ * we either create it immediately, if possible, or create it asynchronously
+ * in a workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+                                          gfp_t gfp)
+{
+        struct mem_cgroup *memcg;
+        int idx;
+
+        VM_BUG_ON(!cachep->memcg_params);
+        VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+        rcu_read_lock();
+        memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+        rcu_read_unlock();
+
+        if (!memcg_can_account_kmem(memcg))
+                return cachep;
+
+        idx = memcg_cache_id(memcg);
+
+        /*
+         * Barrier to make sure we're always seeing the up-to-date value. The
+         * code updating memcg_caches will issue a write barrier to match this.
+         */
+        read_barrier_depends();
+        if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
+                /*
+                 * If we are in a safe context (can wait, and not in interrupt
+                 * context), we could be predictable and return right away.
+                 * This would guarantee that the allocation being performed
+                 * already belongs in the new cache.
+                 *
+                 * However, there are some clashes that can arise from locking.
+                 * For instance, because we acquire the slab_mutex while doing
+                 * kmem_cache_dup, this means no further allocation could happen
+                 * with the slab_mutex held.
+                 *
+                 * Also, because cache creation issues get_online_cpus(), this
+                 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+                 * that ends up reversed during cpu hotplug. (cpuset allocates
+                 * a bunch of GFP_KERNEL memory during cpu bring-up). Due to all
+                 * that, it is better to defer everything.
+                 */
+                memcg_create_cache_enqueue(memcg, cachep);
+                return cachep;
+        }
+
+        return cachep->memcg_params->memcg_caches[idx];
+}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
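The wmb() in memcg_create_kmem_cache() and the read_barrier_depends() in __memcg_kmem_get_cache() form a publish/consume pair: the creator fully initializes the new cache before storing its pointer into memcg_caches[idx], and lockless readers must never dereference a pointer whose pointee they cannot yet see. A minimal, hedged user-space rendering of the same pattern, using C11 atomics in place of the kernel primitives (struct cache, publish_cache() and lookup_cache() are made-up names for illustration):

#include <stdatomic.h>
#include <stddef.h>

struct cache { int object_size; };      /* stand-in for struct kmem_cache */
#define MAX_GROUPS 64
static _Atomic(struct cache *) caches[MAX_GROUPS];

/* Creator side: initialize first, then publish (kernel: plain stores + wmb()). */
static void publish_cache(int idx, struct cache *new_cache, int size)
{
        new_cache->object_size = size;  /* fully set up before publishing */
        atomic_store_explicit(&caches[idx], new_cache, memory_order_release);
}

/* Reader side: a data-dependent load (kernel: read_barrier_depends()). */
static struct cache *lookup_cache(int idx, struct cache *root)
{
        struct cache *c = atomic_load_explicit(&caches[idx],
                                               memory_order_consume);
        return c ? c : root;            /* NULL just means "use the root cache" */
}

A reader that observes NULL simply falls back to the root cache and, at most, queues the asynchronous creation, which mirrors the deferral behaviour implemented above.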