Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 978
1 file changed, 625 insertions(+), 353 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f1a356153c0..5b6b0039f725 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,16 +45,17 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
58#include <linux/file.h>
58#include "internal.h" 59#include "internal.h"
59#include <net/sock.h> 60#include <net/sock.h>
60#include <net/ip.h> 61#include <net/ip.h>
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
148 * matches memcg->dead_count of the hierarchy root group. 149 * matches memcg->dead_count of the hierarchy root group.
149 */ 150 */
150 struct mem_cgroup *last_visited; 151 struct mem_cgroup *last_visited;
151 unsigned long last_dead_count; 152 int last_dead_count;
152 153
153 /* scan generation, increased every round-trip */ 154 /* scan generation, increased every round-trip */
154 unsigned int generation; 155 unsigned int generation;
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
228}; 229};
229 230
231/*
232 * cgroup_event represents events which userspace want to receive.
233 */
234struct mem_cgroup_event {
235 /*
236 * memcg which the event belongs to.
237 */
238 struct mem_cgroup *memcg;
239 /*
240 * eventfd to signal userspace about the event.
241 */
242 struct eventfd_ctx *eventfd;
243 /*
244 * Each of these stored in a list by the cgroup.
245 */
246 struct list_head list;
247 /*
248 * register_event() callback will be used to add new userspace
249 * waiter for changes related to this event. Use eventfd_signal()
250 * on eventfd to send notification to userspace.
251 */
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254 /*
255 * unregister_event() callback will be called when userspace closes
256 * the eventfd or on cgroup removing. This callback must be set,
257 * if you want provide notification functionality.
258 */
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261 /*
262 * All fields below needed to unregister event when
263 * userspace closes eventfd.
264 */
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 273
@@ -331,27 +372,20 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 372 atomic_t numainfo_updating;
332#endif 373#endif
333 374
375 /* List of events which userspace want to receive */
376 struct list_head event_list;
377 spinlock_t event_list_lock;
378
334 struct mem_cgroup_per_node *nodeinfo[0]; 379 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 380 /* WARNING: nodeinfo must be the last member here */
336}; 381};
337 382
338static size_t memcg_size(void)
339{
340 return sizeof(struct mem_cgroup) +
341 nr_node_ids * sizeof(struct mem_cgroup_per_node *);
342}
343
344/* internal only representation about the status of kmem accounting. */ 383/* internal only representation about the status of kmem accounting. */
345enum { 384enum {
346 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
347 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
348 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
349}; 387};
350 388
351/* We account when limit is on, but only after call sites are patched */
352#define KMEM_ACCOUNTED_MASK \
353 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
354
355#ifdef CONFIG_MEMCG_KMEM 389#ifdef CONFIG_MEMCG_KMEM
356static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
357{ 391{
@@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
363 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
364} 398}
365 399
366static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
369}
370
371static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
372{
373 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
377{ 401{
378 /* 402 /*
@@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 515}
492 516
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 518{
500 return (memcg == root_mem_cgroup); 519 return (memcg == root_mem_cgroup);
@@ -1098,16 +1117,22 @@ skip_node:
1098 * skipped and we should continue the tree walk. 1117 * skipped and we should continue the tree walk.
1099 * last_visited css is safe to use because it is 1118 * last_visited css is safe to use because it is
1100 * protected by css_get and the tree walk is rcu safe. 1119 * protected by css_get and the tree walk is rcu safe.
1120 *
1121 * We do not take a reference on the root of the tree walk
1122 * because we might race with the root removal when it would
1123 * be the only node in the iterated hierarchy and mem_cgroup_iter
1124 * would end up in an endless loop because it expects that at
1125 * least one valid node will be returned. Root cannot disappear
1126 * because caller of the iterator should hold it already so
1127 * skipping css reference should be safe.
1101 */ 1128 */
1102 if (next_css) { 1129 if (next_css) {
1103 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1130 if ((next_css == &root->css) ||
1131 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1132 return mem_cgroup_from_css(next_css);
1104 1133
1105 if (css_tryget(&mem->css)) 1134 prev_css = next_css;
1106 return mem; 1135 goto skip_node;
1107 else {
1108 prev_css = next_css;
1109 goto skip_node;
1110 }
1111 } 1136 }
1112 1137
1113 return NULL; 1138 return NULL;
@@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1141 if (iter->last_dead_count == *sequence) { 1166 if (iter->last_dead_count == *sequence) {
1142 smp_rmb(); 1167 smp_rmb();
1143 position = iter->last_visited; 1168 position = iter->last_visited;
1144 if (position && !css_tryget(&position->css)) 1169
1170 /*
1171 * We cannot take a reference to root because we might race
1172 * with root removal and returning NULL would end up in
1173 * an endless loop on the iterator user level when root
1174 * would be returned all the time.
1175 */
1176 if (position && position != root &&
1177 !css_tryget(&position->css))
1145 position = NULL; 1178 position = NULL;
1146 } 1179 }
1147 return position; 1180 return position;
@@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1150static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1183static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1151 struct mem_cgroup *last_visited, 1184 struct mem_cgroup *last_visited,
1152 struct mem_cgroup *new_position, 1185 struct mem_cgroup *new_position,
1186 struct mem_cgroup *root,
1153 int sequence) 1187 int sequence)
1154{ 1188{
1155 if (last_visited) 1189 /* root reference counting symmetric to mem_cgroup_iter_load */
1190 if (last_visited && last_visited != root)
1156 css_put(&last_visited->css); 1191 css_put(&last_visited->css);
1157 /* 1192 /*
1158 * We store the sequence count from the time @last_visited was 1193 * We store the sequence count from the time @last_visited was
@@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1227 memcg = __mem_cgroup_iter_next(root, last_visited); 1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1228 1263
1229 if (reclaim) { 1264 if (reclaim) {
1230 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1265 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1266 seq);
1231 1267
1232 if (!memcg) 1268 if (!memcg)
1233 iter->generation++; 1269 iter->generation++;
@@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1647 */ 1683 */
1648void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1649{ 1685{
1650 struct cgroup *task_cgrp;
1651 struct cgroup *mem_cgrp;
1652 /* 1686 /*
1653 * Need a buffer in BSS, can't rely on allocations. The code relies 1687 * protects memcg_name and makes sure that parallel ooms do not
1654 * on the assumption that OOM is serialized for memory controller. 1688 * interleave
1655 * If this assumption is broken, revisit this code.
1656 */ 1689 */
1690 static DEFINE_MUTEX(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1657 static char memcg_name[PATH_MAX]; 1693 static char memcg_name[PATH_MAX];
1658 int ret; 1694 int ret;
1659 struct mem_cgroup *iter; 1695 struct mem_cgroup *iter;
@@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1662 if (!p) 1698 if (!p)
1663 return; 1699 return;
1664 1700
1701 mutex_lock(&oom_info_lock);
1665 rcu_read_lock(); 1702 rcu_read_lock();
1666 1703
1667 mem_cgrp = memcg->css.cgroup; 1704 mem_cgrp = memcg->css.cgroup;
@@ -1730,6 +1767,7 @@ done:
1730 1767
1731 pr_cont("\n"); 1768 pr_cont("\n");
1732 } 1769 }
1770 mutex_unlock(&oom_info_lock);
1733} 1771}
1734 1772
1735/* 1773/*
@@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1822 break; 1860 break;
1823 }; 1861 };
1824 points = oom_badness(task, memcg, NULL, totalpages); 1862 points = oom_badness(task, memcg, NULL, totalpages);
1825 if (points > chosen_points) { 1863 if (!points || points < chosen_points)
1826 if (chosen) 1864 continue;
1827 put_task_struct(chosen); 1865 /* Prefer thread group leaders for display purposes */
1828 chosen = task; 1866 if (points == chosen_points &&
1829 chosen_points = points; 1867 thread_group_leader(chosen))
1830 get_task_struct(chosen); 1868 continue;
1831 } 1869
1870 if (chosen)
1871 put_task_struct(chosen);
1872 chosen = task;
1873 chosen_points = points;
1874 get_task_struct(chosen);
1832 } 1875 }
1833 css_task_iter_end(&it); 1876 css_task_iter_end(&it);
1834 } 1877 }
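
To see the effect of the reworked selection loop: a candidate with zero badness or a lower score than the current choice is skipped, and on a tie an already-chosen thread-group leader is kept. Below is a hedged, self-contained toy model of just that rule, using made-up task names and scores in place of real oom_badness() results; it is an illustration, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a task; "points" plays the role of oom_badness(). */
struct task {
	const char *comm;
	unsigned long points;
	bool group_leader;
};

int main(void)
{
	struct task tasks[] = {
		{ "leader",   100, true  },	/* thread group leader */
		{ "worker-a", 100, false },	/* ties the leader, leader is kept */
		{ "worker-b",  90, false },	/* lower score, skipped */
		{ "idle",       0, false },	/* zero badness, skipped */
	};
	struct task *chosen = NULL;
	unsigned long chosen_points = 0;
	size_t i;

	for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		struct task *task = &tasks[i];

		if (!task->points || task->points < chosen_points)
			continue;
		/* Prefer thread group leaders for display purposes */
		if (chosen && task->points == chosen_points &&
		    chosen->group_leader)
			continue;
		chosen = task;
		chosen_points = task->points;
	}

	printf("victim: %s (%lu points)\n",
	       chosen ? chosen->comm : "none", chosen_points);
	return 0;
}
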
@@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2861 unsigned short id; 2904 unsigned short id;
2862 swp_entry_t ent; 2905 swp_entry_t ent;
2863 2906
2864 VM_BUG_ON(!PageLocked(page)); 2907 VM_BUG_ON_PAGE(!PageLocked(page), page);
2865 2908
2866 pc = lookup_page_cgroup(page); 2909 pc = lookup_page_cgroup(page);
2867 lock_page_cgroup(pc); 2910 lock_page_cgroup(pc);
@@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2895 bool anon; 2938 bool anon;
2896 2939
2897 lock_page_cgroup(pc); 2940 lock_page_cgroup(pc);
2898 VM_BUG_ON(PageCgroupUsed(pc)); 2941 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2899 /* 2942 /*
2900 * we don't need page_cgroup_lock about tail pages, becase they are not 2943 * we don't need page_cgroup_lock about tail pages, becase they are not
2901 * accessed by any other context at this point. 2944 * accessed by any other context at this point.
@@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2930 if (lrucare) { 2973 if (lrucare) {
2931 if (was_on_lru) { 2974 if (was_on_lru) {
2932 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2975 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2933 VM_BUG_ON(PageLRU(page)); 2976 VM_BUG_ON_PAGE(PageLRU(page), page);
2934 SetPageLRU(page); 2977 SetPageLRU(page);
2935 add_page_to_lru_list(page, lruvec, page_lru(page)); 2978 add_page_to_lru_list(page, lruvec, page_lru(page));
2936 } 2979 }
@@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2956static DEFINE_MUTEX(set_limit_mutex); 2999static DEFINE_MUTEX(set_limit_mutex);
2957 3000
2958#ifdef CONFIG_MEMCG_KMEM 3001#ifdef CONFIG_MEMCG_KMEM
3002static DEFINE_MUTEX(activate_kmem_mutex);
3003
2959static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3004static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2960{ 3005{
2961 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3006 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2962 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3007 memcg_kmem_is_active(memcg);
2963} 3008}
2964 3009
2965/* 3010/*
@@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2976} 3021}
2977 3022
2978#ifdef CONFIG_SLABINFO 3023#ifdef CONFIG_SLABINFO
2979static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3024static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2980 struct cftype *cft, struct seq_file *m)
2981{ 3025{
2982 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3026 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2983 struct memcg_cache_params *params; 3027 struct memcg_cache_params *params;
2984 3028
2985 if (!memcg_can_account_kmem(memcg)) 3029 if (!memcg_can_account_kmem(memcg))
@@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3059 css_put(&memcg->css); 3103 css_put(&memcg->css);
3060} 3104}
3061 3105
3062void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3063{
3064 if (!memcg)
3065 return;
3066
3067 mutex_lock(&memcg->slab_caches_mutex);
3068 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3069 mutex_unlock(&memcg->slab_caches_mutex);
3070}
3071
3072/* 3106/*
3073 * helper for acessing a memcg's index. It will be used as an index in the 3107 * helper for acessing a memcg's index. It will be used as an index in the
3074 * child cache array in kmem_cache, and also to derive its name. This function 3108 * child cache array in kmem_cache, and also to derive its name. This function
@@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
3079 return memcg ? memcg->kmemcg_id : -1; 3113 return memcg ? memcg->kmemcg_id : -1;
3080} 3114}
3081 3115
3082/*
3083 * This ends up being protected by the set_limit mutex, during normal
3084 * operation, because that is its main call site.
3085 *
3086 * But when we create a new cache, we can call this as well if its parent
3087 * is kmem-limited. That will have to hold set_limit_mutex as well.
3088 */
3089int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3090{
3091 int num, ret;
3092
3093 num = ida_simple_get(&kmem_limited_groups,
3094 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3095 if (num < 0)
3096 return num;
3097 /*
3098 * After this point, kmem_accounted (that we test atomically in
3099 * the beginning of this conditional), is no longer 0. This
3100 * guarantees only one process will set the following boolean
3101 * to true. We don't need test_and_set because we're protected
3102 * by the set_limit_mutex anyway.
3103 */
3104 memcg_kmem_set_activated(memcg);
3105
3106 ret = memcg_update_all_caches(num+1);
3107 if (ret) {
3108 ida_simple_remove(&kmem_limited_groups, num);
3109 memcg_kmem_clear_activated(memcg);
3110 return ret;
3111 }
3112
3113 memcg->kmemcg_id = num;
3114 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3115 mutex_init(&memcg->slab_caches_mutex);
3116 return 0;
3117}
3118
3119static size_t memcg_caches_array_size(int num_groups) 3116static size_t memcg_caches_array_size(int num_groups)
3120{ 3117{
3121 ssize_t size; 3118 ssize_t size;
@@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3152 3149
3153 if (num_groups > memcg_limited_groups_array_size) { 3150 if (num_groups > memcg_limited_groups_array_size) {
3154 int i; 3151 int i;
3152 struct memcg_cache_params *new_params;
3155 ssize_t size = memcg_caches_array_size(num_groups); 3153 ssize_t size = memcg_caches_array_size(num_groups);
3156 3154
3157 size *= sizeof(void *); 3155 size *= sizeof(void *);
3158 size += offsetof(struct memcg_cache_params, memcg_caches); 3156 size += offsetof(struct memcg_cache_params, memcg_caches);
3159 3157
3160 s->memcg_params = kzalloc(size, GFP_KERNEL); 3158 new_params = kzalloc(size, GFP_KERNEL);
3161 if (!s->memcg_params) { 3159 if (!new_params)
3162 s->memcg_params = cur_params;
3163 return -ENOMEM; 3160 return -ENOMEM;
3164 }
3165 3161
3166 s->memcg_params->is_root_cache = true; 3162 new_params->is_root_cache = true;
3167 3163
3168 /* 3164 /*
3169 * There is the chance it will be bigger than 3165 * There is the chance it will be bigger than
@@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3177 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3173 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3178 if (!cur_params->memcg_caches[i]) 3174 if (!cur_params->memcg_caches[i])
3179 continue; 3175 continue;
3180 s->memcg_params->memcg_caches[i] = 3176 new_params->memcg_caches[i] =
3181 cur_params->memcg_caches[i]; 3177 cur_params->memcg_caches[i];
3182 } 3178 }
3183 3179
@@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3190 * bigger than the others. And all updates will reset this 3186 * bigger than the others. And all updates will reset this
3191 * anyway. 3187 * anyway.
3192 */ 3188 */
3193 kfree(cur_params); 3189 rcu_assign_pointer(s->memcg_params, new_params);
3190 if (cur_params)
3191 kfree_rcu(cur_params, rcu_head);
3194 } 3192 }
3195 return 0; 3193 return 0;
3196} 3194}
3197 3195
3198int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3196int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3199 struct kmem_cache *root_cache) 3197 struct kmem_cache *root_cache)
3200{ 3198{
3201 size_t size; 3199 size_t size;
3202 3200
@@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3224 return 0; 3222 return 0;
3225} 3223}
3226 3224
3227void memcg_release_cache(struct kmem_cache *s) 3225void memcg_free_cache_params(struct kmem_cache *s)
3226{
3227 kfree(s->memcg_params);
3228}
3229
3230void memcg_register_cache(struct kmem_cache *s)
3228{ 3231{
3229 struct kmem_cache *root; 3232 struct kmem_cache *root;
3230 struct mem_cgroup *memcg; 3233 struct mem_cgroup *memcg;
3231 int id; 3234 int id;
3232 3235
3233 /* 3236 if (is_root_cache(s))
3234 * This happens, for instance, when a root cache goes away before we
3235 * add any memcg.
3236 */
3237 if (!s->memcg_params)
3238 return; 3237 return;
3239 3238
3240 if (s->memcg_params->is_root_cache) 3239 /*
3241 goto out; 3240 * Holding the slab_mutex assures nobody will touch the memcg_caches
3241 * array while we are modifying it.
3242 */
3243 lockdep_assert_held(&slab_mutex);
3242 3244
3245 root = s->memcg_params->root_cache;
3243 memcg = s->memcg_params->memcg; 3246 memcg = s->memcg_params->memcg;
3244 id = memcg_cache_id(memcg); 3247 id = memcg_cache_id(memcg);
3248
3249 css_get(&memcg->css);
3250
3251
3252 /*
3253 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3254 * barrier here to ensure nobody will see the kmem_cache partially
3255 * initialized.
3256 */
3257 smp_wmb();
3258
3259 /*
3260 * Initialize the pointer to this cache in its parent's memcg_params
3261 * before adding it to the memcg_slab_caches list, otherwise we can
3262 * fail to convert memcg_params_to_cache() while traversing the list.
3263 */
3264 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3265 root->memcg_params->memcg_caches[id] = s;
3266
3267 mutex_lock(&memcg->slab_caches_mutex);
3268 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3269 mutex_unlock(&memcg->slab_caches_mutex);
3270}
3271
3272void memcg_unregister_cache(struct kmem_cache *s)
3273{
3274 struct kmem_cache *root;
3275 struct mem_cgroup *memcg;
3276 int id;
3277
3278 if (is_root_cache(s))
3279 return;
3280
3281 /*
3282 * Holding the slab_mutex assures nobody will touch the memcg_caches
3283 * array while we are modifying it.
3284 */
3285 lockdep_assert_held(&slab_mutex);
3245 3286
3246 root = s->memcg_params->root_cache; 3287 root = s->memcg_params->root_cache;
3247 root->memcg_params->memcg_caches[id] = NULL; 3288 memcg = s->memcg_params->memcg;
3289 id = memcg_cache_id(memcg);
3248 3290
3249 mutex_lock(&memcg->slab_caches_mutex); 3291 mutex_lock(&memcg->slab_caches_mutex);
3250 list_del(&s->memcg_params->list); 3292 list_del(&s->memcg_params->list);
3251 mutex_unlock(&memcg->slab_caches_mutex); 3293 mutex_unlock(&memcg->slab_caches_mutex);
3252 3294
3295 /*
3296 * Clear the pointer to this cache in its parent's memcg_params only
3297 * after removing it from the memcg_slab_caches list, otherwise we can
3298 * fail to convert memcg_params_to_cache() while traversing the list.
3299 */
3300 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3301 root->memcg_params->memcg_caches[id] = NULL;
3302
3253 css_put(&memcg->css); 3303 css_put(&memcg->css);
3254out:
3255 kfree(s->memcg_params);
3256} 3304}
3257 3305
3258/* 3306/*
@@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
3311 * So if we aren't down to zero, we'll just schedule a worker and try 3359 * So if we aren't down to zero, we'll just schedule a worker and try
3312 * again 3360 * again
3313 */ 3361 */
3314 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3362 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3315 kmem_cache_shrink(cachep); 3363 kmem_cache_shrink(cachep);
3316 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3364 else
3317 return;
3318 } else
3319 kmem_cache_destroy(cachep); 3365 kmem_cache_destroy(cachep);
3320} 3366}
3321 3367
@@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3351 schedule_work(&cachep->memcg_params->destroy); 3397 schedule_work(&cachep->memcg_params->destroy);
3352} 3398}
3353 3399
3354/* 3400static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3355 * This lock protects updaters, not readers. We want readers to be as fast as 3401 struct kmem_cache *s)
3356 * they can, and they will either see NULL or a valid cache value. Our model
3357 * allow them to see NULL, in which case the root memcg will be selected.
3358 *
3359 * We need this lock because multiple allocations to the same cache from a non
3360 * will span more than one worker. Only one of them can create the cache.
3361 */
3362static DEFINE_MUTEX(memcg_cache_mutex);
3363
3364/*
3365 * Called with memcg_cache_mutex held
3366 */
3367static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3368 struct kmem_cache *s)
3369{ 3402{
3370 struct kmem_cache *new; 3403 struct kmem_cache *new = NULL;
3371 static char *tmp_name = NULL; 3404 static char *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3372 3406
3373 lockdep_assert_held(&memcg_cache_mutex); 3407 BUG_ON(!memcg_can_account_kmem(memcg));
3374 3408
3409 mutex_lock(&mutex);
3375 /* 3410 /*
3376 * kmem_cache_create_memcg duplicates the given name and 3411 * kmem_cache_create_memcg duplicates the given name and
3377 * cgroup_name for this name requires RCU context. 3412 * cgroup_name for this name requires RCU context.
@@ -3381,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3381 if (!tmp_name) { 3416 if (!tmp_name) {
3382 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3383 if (!tmp_name) 3418 if (!tmp_name)
3384 return NULL; 3419 goto out;
3385 } 3420 }
3386 3421
3387 rcu_read_lock(); 3422 rcu_read_lock();
@@ -3391,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3391 3426
3392 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3393 (s->flags & ~SLAB_PANIC), s->ctor, s); 3428 (s->flags & ~SLAB_PANIC), s->ctor, s);
3394
3395 if (new) 3429 if (new)
3396 new->allocflags |= __GFP_KMEMCG; 3430 new->allocflags |= __GFP_KMEMCG;
3397 3431 else
3398 return new; 3432 new = s;
3399}
3400
3401static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3402 struct kmem_cache *cachep)
3403{
3404 struct kmem_cache *new_cachep;
3405 int idx;
3406
3407 BUG_ON(!memcg_can_account_kmem(memcg));
3408
3409 idx = memcg_cache_id(memcg);
3410
3411 mutex_lock(&memcg_cache_mutex);
3412 new_cachep = cache_from_memcg_idx(cachep, idx);
3413 if (new_cachep) {
3414 css_put(&memcg->css);
3415 goto out;
3416 }
3417
3418 new_cachep = kmem_cache_dup(memcg, cachep);
3419 if (new_cachep == NULL) {
3420 new_cachep = cachep;
3421 css_put(&memcg->css);
3422 goto out;
3423 }
3424
3425 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3426
3427 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3428 /*
3429 * the readers won't lock, make sure everybody sees the updated value,
3430 * so they won't put stuff in the queue again for no reason
3431 */
3432 wmb();
3433out: 3433out:
3434 mutex_unlock(&memcg_cache_mutex); 3434 mutex_unlock(&mutex);
3435 return new_cachep; 3435 return new;
3436} 3436}
3437 3437
3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
@@ -3452,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3452 * 3452 *
3453 * Still, we don't want anyone else freeing memcg_caches under our 3453 * Still, we don't want anyone else freeing memcg_caches under our
3454 * noses, which can happen if a new memcg comes to life. As usual, 3454 * noses, which can happen if a new memcg comes to life. As usual,
3455 * we'll take the set_limit_mutex to protect ourselves against this. 3455 * we'll take the activate_kmem_mutex to protect ourselves against
3456 * this.
3456 */ 3457 */
3457 mutex_lock(&set_limit_mutex); 3458 mutex_lock(&activate_kmem_mutex);
3458 for_each_memcg_cache_index(i) { 3459 for_each_memcg_cache_index(i) {
3459 c = cache_from_memcg_idx(s, i); 3460 c = cache_from_memcg_idx(s, i);
3460 if (!c) 3461 if (!c)
@@ -3477,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3477 cancel_work_sync(&c->memcg_params->destroy); 3478 cancel_work_sync(&c->memcg_params->destroy);
3478 kmem_cache_destroy(c); 3479 kmem_cache_destroy(c);
3479 } 3480 }
3480 mutex_unlock(&set_limit_mutex); 3481 mutex_unlock(&activate_kmem_mutex);
3481} 3482}
3482 3483
3483struct create_work { 3484struct create_work {
@@ -3509,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3509 3510
3510 cw = container_of(w, struct create_work, work); 3511 cw = container_of(w, struct create_work, work);
3511 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3512 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3513 css_put(&cw->memcg->css);
3512 kfree(cw); 3514 kfree(cw);
3513} 3515}
3514 3516
@@ -3568,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3568 gfp_t gfp) 3570 gfp_t gfp)
3569{ 3571{
3570 struct mem_cgroup *memcg; 3572 struct mem_cgroup *memcg;
3571 int idx; 3573 struct kmem_cache *memcg_cachep;
3572 3574
3573 VM_BUG_ON(!cachep->memcg_params); 3575 VM_BUG_ON(!cachep->memcg_params);
3574 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3576 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3582,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3582 if (!memcg_can_account_kmem(memcg)) 3584 if (!memcg_can_account_kmem(memcg))
3583 goto out; 3585 goto out;
3584 3586
3585 idx = memcg_cache_id(memcg); 3587 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3586 3588 if (likely(memcg_cachep)) {
3587 /* 3589 cachep = memcg_cachep;
3588 * barrier to mare sure we're always seeing the up to date value. The
3589 * code updating memcg_caches will issue a write barrier to match this.
3590 */
3591 read_barrier_depends();
3592 if (likely(cache_from_memcg_idx(cachep, idx))) {
3593 cachep = cache_from_memcg_idx(cachep, idx);
3594 goto out; 3590 goto out;
3595 } 3591 }
3596 3592
@@ -3744,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3744 if (!memcg) 3740 if (!memcg)
3745 return; 3741 return;
3746 3742
3747 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3743 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3748 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3744 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3749} 3745}
3750#else 3746#else
@@ -3823,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page,
3823 bool anon = PageAnon(page); 3819 bool anon = PageAnon(page);
3824 3820
3825 VM_BUG_ON(from == to); 3821 VM_BUG_ON(from == to);
3826 VM_BUG_ON(PageLRU(page)); 3822 VM_BUG_ON_PAGE(PageLRU(page), page);
3827 /* 3823 /*
3828 * The page is isolated from LRU. So, collapse function 3824 * The page is isolated from LRU. So, collapse function
3829 * will not handle this page. But page splitting can happen. 3825 * will not handle this page. But page splitting can happen.
@@ -3916,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page,
3916 parent = root_mem_cgroup; 3912 parent = root_mem_cgroup;
3917 3913
3918 if (nr_pages > 1) { 3914 if (nr_pages > 1) {
3919 VM_BUG_ON(!PageTransHuge(page)); 3915 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3920 flags = compound_lock_irqsave(page); 3916 flags = compound_lock_irqsave(page);
3921 } 3917 }
3922 3918
@@ -3950,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3950 3946
3951 if (PageTransHuge(page)) { 3947 if (PageTransHuge(page)) {
3952 nr_pages <<= compound_order(page); 3948 nr_pages <<= compound_order(page);
3953 VM_BUG_ON(!PageTransHuge(page)); 3949 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3954 /* 3950 /*
3955 * Never OOM-kill a process for a huge page. The 3951 * Never OOM-kill a process for a huge page. The
3956 * fault handler will fall back to regular pages. 3952 * fault handler will fall back to regular pages.
@@ -3970,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page,
3970{ 3966{
3971 if (mem_cgroup_disabled()) 3967 if (mem_cgroup_disabled())
3972 return 0; 3968 return 0;
3973 VM_BUG_ON(page_mapped(page)); 3969 VM_BUG_ON_PAGE(page_mapped(page), page);
3974 VM_BUG_ON(page->mapping && !PageAnon(page)); 3970 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3975 VM_BUG_ON(!mm); 3971 VM_BUG_ON(!mm);
3976 return mem_cgroup_charge_common(page, mm, gfp_mask, 3972 return mem_cgroup_charge_common(page, mm, gfp_mask,
3977 MEM_CGROUP_CHARGE_TYPE_ANON); 3973 MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4175,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4175 4171
4176 if (PageTransHuge(page)) { 4172 if (PageTransHuge(page)) {
4177 nr_pages <<= compound_order(page); 4173 nr_pages <<= compound_order(page);
4178 VM_BUG_ON(!PageTransHuge(page)); 4174 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4179 } 4175 }
4180 /* 4176 /*
4181 * Check if our page_cgroup is valid 4177 * Check if our page_cgroup is valid
@@ -4267,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page)
4267 /* early check. */ 4263 /* early check. */
4268 if (page_mapped(page)) 4264 if (page_mapped(page))
4269 return; 4265 return;
4270 VM_BUG_ON(page->mapping && !PageAnon(page)); 4266 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4271 /* 4267 /*
4272 * If the page is in swap cache, uncharge should be deferred 4268 * If the page is in swap cache, uncharge should be deferred
4273 * to the swap path, which also properly accounts swap usage 4269 * to the swap path, which also properly accounts swap usage
@@ -4287,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page)
4287 4283
4288void mem_cgroup_uncharge_cache_page(struct page *page) 4284void mem_cgroup_uncharge_cache_page(struct page *page)
4289{ 4285{
4290 VM_BUG_ON(page_mapped(page)); 4286 VM_BUG_ON_PAGE(page_mapped(page), page);
4291 VM_BUG_ON(page->mapping); 4287 VM_BUG_ON_PAGE(page->mapping, page);
4292 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4288 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4293} 4289}
4294 4290
@@ -5112,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5112 return val << PAGE_SHIFT; 5108 return val << PAGE_SHIFT;
5113} 5109}
5114 5110
5115static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5111static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5116 struct cftype *cft, struct file *file, 5112 struct cftype *cft)
5117 char __user *buf, size_t nbytes, loff_t *ppos)
5118{ 5113{
5119 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5114 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5120 char str[64];
5121 u64 val; 5115 u64 val;
5122 int name, len; 5116 int name;
5123 enum res_type type; 5117 enum res_type type;
5124 5118
5125 type = MEMFILE_TYPE(cft->private); 5119 type = MEMFILE_TYPE(cft->private);
@@ -5145,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5145 BUG(); 5139 BUG();
5146 } 5140 }
5147 5141
5148 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5142 return val;
5149 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5150} 5143}
5151 5144
5152static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5153{
5154 int ret = -EINVAL;
5155#ifdef CONFIG_MEMCG_KMEM 5145#ifdef CONFIG_MEMCG_KMEM
5156 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5146/* should be called with activate_kmem_mutex held */
5147static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5148 unsigned long long limit)
5149{
5150 int err = 0;
5151 int memcg_id;
5152
5153 if (memcg_kmem_is_active(memcg))
5154 return 0;
5155
5156 /*
5157 * We are going to allocate memory for data shared by all memory
5158 * cgroups so let's stop accounting here.
5159 */
5160 memcg_stop_kmem_account();
5161
5157 /* 5162 /*
5158 * For simplicity, we won't allow this to be disabled. It also can't 5163 * For simplicity, we won't allow this to be disabled. It also can't
5159 * be changed if the cgroup has children already, or if tasks had 5164 * be changed if the cgroup has children already, or if tasks had
@@ -5167,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5167 * of course permitted. 5172 * of course permitted.
5168 */ 5173 */
5169 mutex_lock(&memcg_create_mutex); 5174 mutex_lock(&memcg_create_mutex);
5170 mutex_lock(&set_limit_mutex); 5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
5171 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 5176 err = -EBUSY;
5172 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 5177 mutex_unlock(&memcg_create_mutex);
5173 ret = -EBUSY; 5178 if (err)
5174 goto out; 5179 goto out;
5175 }
5176 ret = res_counter_set_limit(&memcg->kmem, val);
5177 VM_BUG_ON(ret);
5178 5180
5179 ret = memcg_update_cache_sizes(memcg); 5181 memcg_id = ida_simple_get(&kmem_limited_groups,
5180 if (ret) { 5182 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5181 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 5183 if (memcg_id < 0) {
5182 goto out; 5184 err = memcg_id;
5183 } 5185 goto out;
5184 static_key_slow_inc(&memcg_kmem_enabled_key); 5186 }
5185 /* 5187
5186 * setting the active bit after the inc will guarantee no one 5188 /*
5187 * starts accounting before all call sites are patched 5189 * Make sure we have enough space for this cgroup in each root cache's
5188 */ 5190 * memcg_params.
5189 memcg_kmem_set_active(memcg); 5191 */
5190 } else 5192 err = memcg_update_all_caches(memcg_id + 1);
5191 ret = res_counter_set_limit(&memcg->kmem, val); 5193 if (err)
5194 goto out_rmid;
5195
5196 memcg->kmemcg_id = memcg_id;
5197 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5198 mutex_init(&memcg->slab_caches_mutex);
5199
5200 /*
5201 * We couldn't have accounted to this cgroup, because it hasn't got the
5202 * active bit set yet, so this should succeed.
5203 */
5204 err = res_counter_set_limit(&memcg->kmem, limit);
5205 VM_BUG_ON(err);
5206
5207 static_key_slow_inc(&memcg_kmem_enabled_key);
5208 /*
5209 * Setting the active bit after enabling static branching will
5210 * guarantee no one starts accounting before all call sites are
5211 * patched.
5212 */
5213 memcg_kmem_set_active(memcg);
5192out: 5214out:
5193 mutex_unlock(&set_limit_mutex); 5215 memcg_resume_kmem_account();
5194 mutex_unlock(&memcg_create_mutex); 5216 return err;
5195#endif 5217
5218out_rmid:
5219 ida_simple_remove(&kmem_limited_groups, memcg_id);
5220 goto out;
5221}
5222
5223static int memcg_activate_kmem(struct mem_cgroup *memcg,
5224 unsigned long long limit)
5225{
5226 int ret;
5227
5228 mutex_lock(&activate_kmem_mutex);
5229 ret = __memcg_activate_kmem(memcg, limit);
5230 mutex_unlock(&activate_kmem_mutex);
5231 return ret;
5232}
5233
5234static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5235 unsigned long long val)
5236{
5237 int ret;
5238
5239 if (!memcg_kmem_is_active(memcg))
5240 ret = memcg_activate_kmem(memcg, val);
5241 else
5242 ret = res_counter_set_limit(&memcg->kmem, val);
5196 return ret; 5243 return ret;
5197} 5244}
5198 5245
5199#ifdef CONFIG_MEMCG_KMEM
5200static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5246static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5201{ 5247{
5202 int ret = 0; 5248 int ret = 0;
5203 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5249 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5204 if (!parent)
5205 goto out;
5206 5250
5207 memcg->kmem_account_flags = parent->kmem_account_flags; 5251 if (!parent)
5208 /* 5252 return 0;
5209 * When that happen, we need to disable the static branch only on those
5210 * memcgs that enabled it. To achieve this, we would be forced to
5211 * complicate the code by keeping track of which memcgs were the ones
5212 * that actually enabled limits, and which ones got it from its
5213 * parents.
5214 *
5215 * It is a lot simpler just to do static_key_slow_inc() on every child
5216 * that is accounted.
5217 */
5218 if (!memcg_kmem_is_active(memcg))
5219 goto out;
5220 5253
5254 mutex_lock(&activate_kmem_mutex);
5221 /* 5255 /*
5222 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5256 * If the parent cgroup is not kmem-active now, it cannot be activated
5223 * memcg is active already. If the later initialization fails then the 5257 * after this point, because it has at least one child already.
5224 * cgroup core triggers the cleanup so we do not have to do it here.
5225 */ 5258 */
5226 static_key_slow_inc(&memcg_kmem_enabled_key); 5259 if (memcg_kmem_is_active(parent))
5227 5260 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5228 mutex_lock(&set_limit_mutex); 5261 mutex_unlock(&activate_kmem_mutex);
5229 memcg_stop_kmem_account();
5230 ret = memcg_update_cache_sizes(memcg);
5231 memcg_resume_kmem_account();
5232 mutex_unlock(&set_limit_mutex);
5233out:
5234 return ret; 5262 return ret;
5235} 5263}
5264#else
5265static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5266 unsigned long long val)
5267{
5268 return -EINVAL;
5269}
5236#endif /* CONFIG_MEMCG_KMEM */ 5270#endif /* CONFIG_MEMCG_KMEM */
5237 5271
5238/* 5272/*
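
The activation sequence above is driven entirely by the first write to memory.kmem.limit_in_bytes: mem_cgroup_write() calls memcg_update_kmem_limit(), which takes activate_kmem_mutex and runs __memcg_activate_kmem() once. A minimal userspace sketch of triggering it follows; the cgroup v1 mount point and group name ("/sys/fs/cgroup/memory/example") are hypothetical and would need to match the actual setup.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical paths; adjust to the real mount point and group name. */
	const char *limit_file =
		"/sys/fs/cgroup/memory/example/memory.kmem.limit_in_bytes";
	const char *limit = "67108864\n";	/* 64M; the first write activates kmem accounting */
	int fd = open(limit_file, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) != (ssize_t)strlen(limit)) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
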
@@ -5266,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5266 else if (type == _MEMSWAP) 5300 else if (type == _MEMSWAP)
5267 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5301 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5268 else if (type == _KMEM) 5302 else if (type == _KMEM)
5269 ret = memcg_update_kmem_limit(css, val); 5303 ret = memcg_update_kmem_limit(memcg, val);
5270 else 5304 else
5271 return -EINVAL; 5305 return -EINVAL;
5272 break; 5306 break;
@@ -5383,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5383#endif 5417#endif
5384 5418
5385#ifdef CONFIG_NUMA 5419#ifdef CONFIG_NUMA
5386static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5420static int memcg_numa_stat_show(struct seq_file *m, void *v)
5387 struct cftype *cft, struct seq_file *m)
5388{ 5421{
5389 struct numa_stat { 5422 struct numa_stat {
5390 const char *name; 5423 const char *name;
@@ -5400,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5400 const struct numa_stat *stat; 5433 const struct numa_stat *stat;
5401 int nid; 5434 int nid;
5402 unsigned long nr; 5435 unsigned long nr;
5403 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5436 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5404 5437
5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5438 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5439 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5439,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5472 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5440} 5473}
5441 5474
5442static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5475static int memcg_stat_show(struct seq_file *m, void *v)
5443 struct seq_file *m)
5444{ 5476{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5477 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5446 struct mem_cgroup *mi; 5478 struct mem_cgroup *mi;
5447 unsigned int i; 5479 unsigned int i;
5448 5480
@@ -5651,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5651 mem_cgroup_oom_notify_cb(iter); 5683 mem_cgroup_oom_notify_cb(iter);
5652} 5684}
5653 5685
5654static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5686static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5655 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5687 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5656{ 5688{
5657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5658 struct mem_cgroup_thresholds *thresholds; 5689 struct mem_cgroup_thresholds *thresholds;
5659 struct mem_cgroup_threshold_ary *new; 5690 struct mem_cgroup_threshold_ary *new;
5660 enum res_type type = MEMFILE_TYPE(cft->private);
5661 u64 threshold, usage; 5691 u64 threshold, usage;
5662 int i, size, ret; 5692 int i, size, ret;
5663 5693
@@ -5734,13 +5764,23 @@ unlock:
5734 return ret; 5764 return ret;
5735} 5765}
5736 5766
5737static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5767static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5738 struct cftype *cft, struct eventfd_ctx *eventfd) 5768 struct eventfd_ctx *eventfd, const char *args)
5769{
5770 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5771}
5772
5773static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5777}
5778
5779static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, enum res_type type)
5739{ 5781{
5740 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5741 struct mem_cgroup_thresholds *thresholds; 5782 struct mem_cgroup_thresholds *thresholds;
5742 struct mem_cgroup_threshold_ary *new; 5783 struct mem_cgroup_threshold_ary *new;
5743 enum res_type type = MEMFILE_TYPE(cft->private);
5744 u64 usage; 5784 u64 usage;
5745 int i, j, size; 5785 int i, j, size;
5746 5786
@@ -5813,14 +5853,23 @@ unlock:
5813 mutex_unlock(&memcg->thresholds_lock); 5853 mutex_unlock(&memcg->thresholds_lock);
5814} 5854}
5815 5855
5816static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5856static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5817 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5857 struct eventfd_ctx *eventfd)
5858{
5859 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5860}
5861
5862static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5866}
5867
5868static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd, const char *args)
5818{ 5870{
5819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5820 struct mem_cgroup_eventfd_list *event; 5871 struct mem_cgroup_eventfd_list *event;
5821 enum res_type type = MEMFILE_TYPE(cft->private);
5822 5872
5823 BUG_ON(type != _OOM_TYPE);
5824 event = kmalloc(sizeof(*event), GFP_KERNEL); 5873 event = kmalloc(sizeof(*event), GFP_KERNEL);
5825 if (!event) 5874 if (!event)
5826 return -ENOMEM; 5875 return -ENOMEM;
@@ -5838,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5838 return 0; 5887 return 0;
5839} 5888}
5840 5889
5841static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5890static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5842 struct cftype *cft, struct eventfd_ctx *eventfd) 5891 struct eventfd_ctx *eventfd)
5843{ 5892{
5844 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5845 struct mem_cgroup_eventfd_list *ev, *tmp; 5893 struct mem_cgroup_eventfd_list *ev, *tmp;
5846 enum res_type type = MEMFILE_TYPE(cft->private);
5847
5848 BUG_ON(type != _OOM_TYPE);
5849 5894
5850 spin_lock(&memcg_oom_lock); 5895 spin_lock(&memcg_oom_lock);
5851 5896
@@ -5859,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859 spin_unlock(&memcg_oom_lock); 5904 spin_unlock(&memcg_oom_lock);
5860} 5905}
5861 5906
5862static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5907static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5863 struct cftype *cft, struct cgroup_map_cb *cb)
5864{ 5908{
5865 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5909 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5866 5910
5867 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5911 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5868 5912 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5869 if (atomic_read(&memcg->under_oom))
5870 cb->fill(cb, "under_oom", 1);
5871 else
5872 cb->fill(cb, "under_oom", 0);
5873 return 0; 5913 return 0;
5874} 5914}
5875 5915
@@ -5962,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5962} 6002}
5963#endif 6003#endif
5964 6004
6005/*
6006 * DO NOT USE IN NEW FILES.
6007 *
6008 * "cgroup.event_control" implementation.
6009 *
6010 * This is way over-engineered. It tries to support fully configurable
6011 * events for each user. Such level of flexibility is completely
6012 * unnecessary especially in the light of the planned unified hierarchy.
6013 *
6014 * Please deprecate this and replace with something simpler if at all
6015 * possible.
6016 */
6017
6018/*
6019 * Unregister event and free resources.
6020 *
6021 * Gets called from workqueue.
6022 */
6023static void memcg_event_remove(struct work_struct *work)
6024{
6025 struct mem_cgroup_event *event =
6026 container_of(work, struct mem_cgroup_event, remove);
6027 struct mem_cgroup *memcg = event->memcg;
6028
6029 remove_wait_queue(event->wqh, &event->wait);
6030
6031 event->unregister_event(memcg, event->eventfd);
6032
6033 /* Notify userspace the event is going away. */
6034 eventfd_signal(event->eventfd, 1);
6035
6036 eventfd_ctx_put(event->eventfd);
6037 kfree(event);
6038 css_put(&memcg->css);
6039}
6040
6041/*
6042 * Gets called on POLLHUP on eventfd when user closes it.
6043 *
6044 * Called with wqh->lock held and interrupts disabled.
6045 */
6046static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6047 int sync, void *key)
6048{
6049 struct mem_cgroup_event *event =
6050 container_of(wait, struct mem_cgroup_event, wait);
6051 struct mem_cgroup *memcg = event->memcg;
6052 unsigned long flags = (unsigned long)key;
6053
6054 if (flags & POLLHUP) {
6055 /*
6056 * If the event has been detached at cgroup removal, we
6057 * can simply return knowing the other side will cleanup
6058 * for us.
6059 *
6060 * We can't race against event freeing since the other
6061 * side will require wqh->lock via remove_wait_queue(),
6062 * which we hold.
6063 */
6064 spin_lock(&memcg->event_list_lock);
6065 if (!list_empty(&event->list)) {
6066 list_del_init(&event->list);
6067 /*
6068 * We are in atomic context, but cgroup_event_remove()
6069 * may sleep, so we have to call it in workqueue.
6070 */
6071 schedule_work(&event->remove);
6072 }
6073 spin_unlock(&memcg->event_list_lock);
6074 }
6075
6076 return 0;
6077}
6078
6079static void memcg_event_ptable_queue_proc(struct file *file,
6080 wait_queue_head_t *wqh, poll_table *pt)
6081{
6082 struct mem_cgroup_event *event =
6083 container_of(pt, struct mem_cgroup_event, pt);
6084
6085 event->wqh = wqh;
6086 add_wait_queue(wqh, &event->wait);
6087}
6088
6089/*
6090 * DO NOT USE IN NEW FILES.
6091 *
6092 * Parse input and register new cgroup event handler.
6093 *
6094 * Input must be in format '<event_fd> <control_fd> <args>'.
6095 * Interpretation of args is defined by control file implementation.
6096 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer)
6099{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event;
6102 struct cgroup_subsys_state *cfile_css;
6103 unsigned int efd, cfd;
6104 struct fd efile;
6105 struct fd cfile;
6106 const char *name;
6107 char *endp;
6108 int ret;
6109
6110 efd = simple_strtoul(buffer, &endp, 10);
6111 if (*endp != ' ')
6112 return -EINVAL;
6113 buffer = endp + 1;
6114
6115 cfd = simple_strtoul(buffer, &endp, 10);
6116 if ((*endp != ' ') && (*endp != '\0'))
6117 return -EINVAL;
6118 buffer = endp + 1;
6119
6120 event = kzalloc(sizeof(*event), GFP_KERNEL);
6121 if (!event)
6122 return -ENOMEM;
6123
6124 event->memcg = memcg;
6125 INIT_LIST_HEAD(&event->list);
6126 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6127 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6128 INIT_WORK(&event->remove, memcg_event_remove);
6129
6130 efile = fdget(efd);
6131 if (!efile.file) {
6132 ret = -EBADF;
6133 goto out_kfree;
6134 }
6135
6136 event->eventfd = eventfd_ctx_fileget(efile.file);
6137 if (IS_ERR(event->eventfd)) {
6138 ret = PTR_ERR(event->eventfd);
6139 goto out_put_efile;
6140 }
6141
6142 cfile = fdget(cfd);
6143 if (!cfile.file) {
6144 ret = -EBADF;
6145 goto out_put_eventfd;
6146 }
6147
6148 /* the process need read permission on control file */
6149 /* AV: shouldn't we check that it's been opened for read instead? */
6150 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6151 if (ret < 0)
6152 goto out_put_cfile;
6153
6154 /*
6155 * Determine the event callbacks and set them in @event. This used
6156 * to be done via struct cftype but cgroup core no longer knows
6157 * about these events. The following is crude but the whole thing
6158 * is for compatibility anyway.
6159 *
6160 * DO NOT ADD NEW FILES.
6161 */
6162 name = cfile.file->f_dentry->d_name.name;
6163
6164 if (!strcmp(name, "memory.usage_in_bytes")) {
6165 event->register_event = mem_cgroup_usage_register_event;
6166 event->unregister_event = mem_cgroup_usage_unregister_event;
6167 } else if (!strcmp(name, "memory.oom_control")) {
6168 event->register_event = mem_cgroup_oom_register_event;
6169 event->unregister_event = mem_cgroup_oom_unregister_event;
6170 } else if (!strcmp(name, "memory.pressure_level")) {
6171 event->register_event = vmpressure_register_event;
6172 event->unregister_event = vmpressure_unregister_event;
6173 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6174 event->register_event = memsw_cgroup_usage_register_event;
6175 event->unregister_event = memsw_cgroup_usage_unregister_event;
6176 } else {
6177 ret = -EINVAL;
6178 goto out_put_cfile;
6179 }
6180
6181 /*
6182 * Verify @cfile should belong to @css. Also, remaining events are
6183 * automatically removed on cgroup destruction but the removal is
6184 * asynchronous, so take an extra ref on @css.
6185 */
6186 rcu_read_lock();
6187
6188 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6190 &mem_cgroup_subsys);
6191 if (cfile_css == css && css_tryget(css))
6192 ret = 0;
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile;
6197
6198 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret)
6200 goto out_put_css;
6201
6202 efile.file->f_op->poll(efile.file, &event->pt);
6203
6204 spin_lock(&memcg->event_list_lock);
6205 list_add(&event->list, &memcg->event_list);
6206 spin_unlock(&memcg->event_list_lock);
6207
6208 fdput(cfile);
6209 fdput(efile);
6210
6211 return 0;
6212
6213out_put_css:
6214 css_put(css);
6215out_put_cfile:
6216 fdput(cfile);
6217out_put_eventfd:
6218 eventfd_ctx_put(event->eventfd);
6219out_put_efile:
6220 fdput(efile);
6221out_kfree:
6222 kfree(event);
6223
6224 return ret;
6225}
6226
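
Taken together with the callbacks selected above, the legacy flow from userspace is: create an eventfd, open the control file of interest, and write "<event_fd> <control_fd> <args>" to cgroup.event_control; the kernel then signals the eventfd via eventfd_signal(). Below is a hedged, self-contained sketch for OOM notifications, assuming a hypothetical v1 memory cgroup directory passed as argv[1]; for memory.usage_in_bytes the args field would instead carry a usage threshold in bytes.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[4096], line[64];
	uint64_t count;
	int efd, cfd, ctlfd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <memcg directory>\n", argv[0]);
		return 1;
	}

	efd = eventfd(0, 0);

	snprintf(path, sizeof(path), "%s/memory.oom_control", argv[1]);
	cfd = open(path, O_RDONLY);

	snprintf(path, sizeof(path), "%s/cgroup.event_control", argv[1]);
	ctlfd = open(path, O_WRONLY);

	if (efd < 0 || cfd < 0 || ctlfd < 0) {
		perror("setup");
		return 1;
	}

	/* "<event_fd> <control_fd> <args>"; oom_control takes no args. */
	snprintf(line, sizeof(line), "%d %d", efd, cfd);
	if (write(ctlfd, line, strlen(line)) < 0) {
		perror("register");
		return 1;
	}

	/* Blocks until the kernel calls eventfd_signal() on an OOM event. */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("memcg OOM notification received (count=%llu)\n",
		       (unsigned long long)count);
	return 0;
}
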
5965static struct cftype mem_cgroup_files[] = { 6227static struct cftype mem_cgroup_files[] = {
5966 { 6228 {
5967 .name = "usage_in_bytes", 6229 .name = "usage_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6230 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5969 .read = mem_cgroup_read, 6231 .read_u64 = mem_cgroup_read_u64,
5970 .register_event = mem_cgroup_usage_register_event,
5971 .unregister_event = mem_cgroup_usage_unregister_event,
5972 }, 6232 },
5973 { 6233 {
5974 .name = "max_usage_in_bytes", 6234 .name = "max_usage_in_bytes",
5975 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6235 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5976 .trigger = mem_cgroup_reset, 6236 .trigger = mem_cgroup_reset,
5977 .read = mem_cgroup_read, 6237 .read_u64 = mem_cgroup_read_u64,
5978 }, 6238 },
5979 { 6239 {
5980 .name = "limit_in_bytes", 6240 .name = "limit_in_bytes",
5981 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5982 .write_string = mem_cgroup_write, 6242 .write_string = mem_cgroup_write,
5983 .read = mem_cgroup_read, 6243 .read_u64 = mem_cgroup_read_u64,
5984 }, 6244 },
5985 { 6245 {
5986 .name = "soft_limit_in_bytes", 6246 .name = "soft_limit_in_bytes",
5987 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6247 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5988 .write_string = mem_cgroup_write, 6248 .write_string = mem_cgroup_write,
5989 .read = mem_cgroup_read, 6249 .read_u64 = mem_cgroup_read_u64,
5990 }, 6250 },
5991 { 6251 {
5992 .name = "failcnt", 6252 .name = "failcnt",
5993 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6253 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5994 .trigger = mem_cgroup_reset, 6254 .trigger = mem_cgroup_reset,
5995 .read = mem_cgroup_read, 6255 .read_u64 = mem_cgroup_read_u64,
5996 }, 6256 },
5997 { 6257 {
5998 .name = "stat", 6258 .name = "stat",
5999 .read_seq_string = memcg_stat_show, 6259 .seq_show = memcg_stat_show,
6000 }, 6260 },
6001 { 6261 {
6002 .name = "force_empty", 6262 .name = "force_empty",
@@ -6009,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = {
6009 .read_u64 = mem_cgroup_hierarchy_read, 6269 .read_u64 = mem_cgroup_hierarchy_read,
6010 }, 6270 },
6011 { 6271 {
6272 .name = "cgroup.event_control", /* XXX: for compat */
6273 .write_string = memcg_write_event_control,
6274 .flags = CFTYPE_NO_PREFIX,
6275 .mode = S_IWUGO,
6276 },
6277 {
6012 .name = "swappiness", 6278 .name = "swappiness",
6013 .read_u64 = mem_cgroup_swappiness_read, 6279 .read_u64 = mem_cgroup_swappiness_read,
6014 .write_u64 = mem_cgroup_swappiness_write, 6280 .write_u64 = mem_cgroup_swappiness_write,
@@ -6020,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = {
6020 }, 6286 },
6021 { 6287 {
6022 .name = "oom_control", 6288 .name = "oom_control",
6023 .read_map = mem_cgroup_oom_control_read, 6289 .seq_show = mem_cgroup_oom_control_read,
6024 .write_u64 = mem_cgroup_oom_control_write, 6290 .write_u64 = mem_cgroup_oom_control_write,
6025 .register_event = mem_cgroup_oom_register_event,
6026 .unregister_event = mem_cgroup_oom_unregister_event,
6027 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6291 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6028 }, 6292 },
6029 { 6293 {
6030 .name = "pressure_level", 6294 .name = "pressure_level",
6031 .register_event = vmpressure_register_event,
6032 .unregister_event = vmpressure_unregister_event,
6033 }, 6295 },
6034#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
6035 { 6297 {
6036 .name = "numa_stat", 6298 .name = "numa_stat",
6037 .read_seq_string = memcg_numa_stat_show, 6299 .seq_show = memcg_numa_stat_show,
6038 }, 6300 },
6039#endif 6301#endif
6040#ifdef CONFIG_MEMCG_KMEM 6302#ifdef CONFIG_MEMCG_KMEM
@@ -6042,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = {
6042 .name = "kmem.limit_in_bytes", 6304 .name = "kmem.limit_in_bytes",
6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6305 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6044 .write_string = mem_cgroup_write, 6306 .write_string = mem_cgroup_write,
6045 .read = mem_cgroup_read, 6307 .read_u64 = mem_cgroup_read_u64,
6046 }, 6308 },
6047 { 6309 {
6048 .name = "kmem.usage_in_bytes", 6310 .name = "kmem.usage_in_bytes",
6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6311 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6050 .read = mem_cgroup_read, 6312 .read_u64 = mem_cgroup_read_u64,
6051 }, 6313 },
6052 { 6314 {
6053 .name = "kmem.failcnt", 6315 .name = "kmem.failcnt",
6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6316 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6055 .trigger = mem_cgroup_reset, 6317 .trigger = mem_cgroup_reset,
6056 .read = mem_cgroup_read, 6318 .read_u64 = mem_cgroup_read_u64,
6057 }, 6319 },
6058 { 6320 {
6059 .name = "kmem.max_usage_in_bytes", 6321 .name = "kmem.max_usage_in_bytes",
6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6322 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6061 .trigger = mem_cgroup_reset, 6323 .trigger = mem_cgroup_reset,
6062 .read = mem_cgroup_read, 6324 .read_u64 = mem_cgroup_read_u64,
6063 }, 6325 },
6064#ifdef CONFIG_SLABINFO 6326#ifdef CONFIG_SLABINFO
6065 { 6327 {
6066 .name = "kmem.slabinfo", 6328 .name = "kmem.slabinfo",
6067 .read_seq_string = mem_cgroup_slabinfo_read, 6329 .seq_show = mem_cgroup_slabinfo_read,
6068 }, 6330 },
6069#endif 6331#endif
6070#endif 6332#endif
@@ -6076,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = {
6076 { 6338 {
6077 .name = "memsw.usage_in_bytes", 6339 .name = "memsw.usage_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6340 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6079 .read = mem_cgroup_read, 6341 .read_u64 = mem_cgroup_read_u64,
6080 .register_event = mem_cgroup_usage_register_event,
6081 .unregister_event = mem_cgroup_usage_unregister_event,
6082 }, 6342 },
6083 { 6343 {
6084 .name = "memsw.max_usage_in_bytes", 6344 .name = "memsw.max_usage_in_bytes",
6085 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6345 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6086 .trigger = mem_cgroup_reset, 6346 .trigger = mem_cgroup_reset,
6087 .read = mem_cgroup_read, 6347 .read_u64 = mem_cgroup_read_u64,
6088 }, 6348 },
6089 { 6349 {
6090 .name = "memsw.limit_in_bytes", 6350 .name = "memsw.limit_in_bytes",
6091 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6092 .write_string = mem_cgroup_write, 6352 .write_string = mem_cgroup_write,
6093 .read = mem_cgroup_read, 6353 .read_u64 = mem_cgroup_read_u64,
6094 }, 6354 },
6095 { 6355 {
6096 .name = "memsw.failcnt", 6356 .name = "memsw.failcnt",
6097 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6098 .trigger = mem_cgroup_reset, 6358 .trigger = mem_cgroup_reset,
6099 .read = mem_cgroup_read, 6359 .read_u64 = mem_cgroup_read_u64,
6100 }, 6360 },
6101 { }, /* terminate */ 6361 { }, /* terminate */
6102}; 6362};
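Both tables above drop the per-file .register_event/.unregister_event hooks (those now flow through cgroup.event_control) and switch their read methods from the untyped .read / .read_map / .read_seq_string callbacks to the typed .read_u64 and .seq_show ones. The sketch below shows the general shape a .read_u64 handler for these res_counter-backed files takes; the MEMFILE_TYPE()/MEMFILE_ATTR() helpers and mem_cgroup_from_css() are assumed from earlier in memcontrol.c, and the patch's actual mem_cgroup_read_u64() additionally special-cases RES_USAGE, so this is illustrative rather than the exact body:

/*
 * Illustrative only: shape of a read_u64 handler backing the
 * *_in_bytes / failcnt entries above. Simplified; the real handler
 * also computes hierarchical usage for RES_USAGE.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	int name = MEMFILE_ATTR(cft->private);

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		return res_counter_read_u64(&memcg->res, name);
	case _MEMSWAP:
		return res_counter_read_u64(&memcg->memsw, name);
	case _KMEM:
		return res_counter_read_u64(&memcg->kmem, name);
	default:
		BUG();
	}
}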
@@ -6139,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6139static struct mem_cgroup *mem_cgroup_alloc(void) 6399static struct mem_cgroup *mem_cgroup_alloc(void)
6140{ 6400{
6141 struct mem_cgroup *memcg; 6401 struct mem_cgroup *memcg;
6142 size_t size = memcg_size(); 6402 size_t size;
6143 6403
6144 /* Can be very big if nr_node_ids is very big */ 6404 size = sizeof(struct mem_cgroup);
6145 if (size < PAGE_SIZE) 6405 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6146 memcg = kzalloc(size, GFP_KERNEL);
6147 else
6148 memcg = vzalloc(size);
6149 6406
6407 memcg = kzalloc(size, GFP_KERNEL);
6150 if (!memcg) 6408 if (!memcg)
6151 return NULL; 6409 return NULL;
6152 6410
@@ -6157,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
6157 return memcg; 6415 return memcg;
6158 6416
6159out_free: 6417out_free:
6160 if (size < PAGE_SIZE) 6418 kfree(memcg);
6161 kfree(memcg);
6162 else
6163 vfree(memcg);
6164 return NULL; 6419 return NULL;
6165} 6420}
6166 6421
@@ -6178,7 +6433,6 @@ out_free:
6178static void __mem_cgroup_free(struct mem_cgroup *memcg) 6433static void __mem_cgroup_free(struct mem_cgroup *memcg)
6179{ 6434{
6180 int node; 6435 int node;
6181 size_t size = memcg_size();
6182 6436
6183 mem_cgroup_remove_from_trees(memcg); 6437 mem_cgroup_remove_from_trees(memcg);
6184 6438
@@ -6199,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
6199 * the cgroup_lock. 6453 * the cgroup_lock.
6200 */ 6454 */
6201 disarm_static_keys(memcg); 6455 disarm_static_keys(memcg);
6202 if (size < PAGE_SIZE) 6456 kfree(memcg);
6203 kfree(memcg);
6204 else
6205 vfree(memcg);
6206} 6457}
6207 6458
6208/* 6459/*
@@ -6268,6 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6268 mutex_init(&memcg->thresholds_lock); 6519 mutex_init(&memcg->thresholds_lock);
6269 spin_lock_init(&memcg->move_lock); 6520 spin_lock_init(&memcg->move_lock);
6270 vmpressure_init(&memcg->vmpressure); 6521 vmpressure_init(&memcg->vmpressure);
6522 INIT_LIST_HEAD(&memcg->event_list);
6523 spin_lock_init(&memcg->event_list_lock);
6271 6524
6272 return &memcg->css; 6525 return &memcg->css;
6273 6526
@@ -6281,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6281{ 6534{
6282 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6535 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6283 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6536 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6284 int error = 0;
6285 6537
6286 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6538 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6287 return -ENOSPC; 6539 return -ENOSPC;
@@ -6316,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6316 if (parent != root_mem_cgroup) 6568 if (parent != root_mem_cgroup)
6317 mem_cgroup_subsys.broken_hierarchy = true; 6569 mem_cgroup_subsys.broken_hierarchy = true;
6318 } 6570 }
6319
6320 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6321 mutex_unlock(&memcg_create_mutex); 6571 mutex_unlock(&memcg_create_mutex);
6322 return error; 6572
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys);
6323} 6574}
6324 6575
6325/* 6576/*
@@ -6343,11 +6594,32 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6343static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6594static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6344{ 6595{
6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597 struct mem_cgroup_event *event, *tmp;
6598 struct cgroup_subsys_state *iter;
6599
6600 /*
6601 * Unregister events and notify userspace.
6602	 * Notify userspace about cgroup removal only after rmdir of the cgroup
6603	 * directory, to avoid a race between userspace and the kernel.
6604 */
6605 spin_lock(&memcg->event_list_lock);
6606 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6607 list_del_init(&event->list);
6608 schedule_work(&event->remove);
6609 }
6610 spin_unlock(&memcg->event_list_lock);
6346 6611
6347 kmem_cgroup_css_offline(memcg); 6612 kmem_cgroup_css_offline(memcg);
6348 6613
6349 mem_cgroup_invalidate_reclaim_iterators(memcg); 6614 mem_cgroup_invalidate_reclaim_iterators(memcg);
6350 mem_cgroup_reparent_charges(memcg); 6615
6616 /*
6617 * This requires that offlining is serialized. Right now that is
6618 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
6619 */
6620 css_for_each_descendant_post(iter, css)
6621 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6622
6351 mem_cgroup_destroy_all_caches(memcg); 6623 mem_cgroup_destroy_all_caches(memcg);
6352 vmpressure_cleanup(&memcg->vmpressure); 6624 vmpressure_cleanup(&memcg->vmpressure);
6353} 6625}
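Each event detached in the offline path has its &event->remove work scheduled rather than being torn down under event_list_lock, since the unregister callbacks may sleep. A hedged sketch of the teardown that work item is expected to perform, following the struct mem_cgroup_event fields introduced by this patch (the handler itself is defined earlier in the patch, outside this excerpt, so the exact body here is an assumption):

/*
 * Assumed shape of the per-event remove work: detach from the
 * eventfd's wait queue, run the unregister callback, signal userspace
 * once so it can notice the teardown, then drop the references taken
 * at registration time.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace that the event is going away. */
	eventfd_signal(event->eventfd, 1);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	css_put(&memcg->css);
}

The descendant walk that replaces the single mem_cgroup_reparent_charges() call reparents charges bottom-up (post-order), which is why the comment above insists that offlining stay serialized.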
@@ -6615,7 +6887,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6615 enum mc_target_type ret = MC_TARGET_NONE; 6887 enum mc_target_type ret = MC_TARGET_NONE;
6616 6888
6617 page = pmd_page(pmd); 6889 page = pmd_page(pmd);
6618 VM_BUG_ON(!page || !PageHead(page)); 6890 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6619 if (!move_anon()) 6891 if (!move_anon())
6620 return ret; 6892 return ret;
6621 pc = lookup_page_cgroup(page); 6893 pc = lookup_page_cgroup(page);