Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 1242
1 file changed, 1176 insertions(+), 66 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bbfac5063ca8..f3009b4bae51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal
16 *
13 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
@@ -268,6 +272,10 @@ struct mem_cgroup {
268 }; 272 };
269 273
270 /* 274 /*
275 * the counter to account for kernel memory usage.
276 */
277 struct res_counter kmem;
278 /*
271 * Per cgroup active and inactive list, similar to the 279 * Per cgroup active and inactive list, similar to the
272 * per zone LRU lists. 280 * per zone LRU lists.
273 */ 281 */
@@ -282,6 +290,7 @@ struct mem_cgroup {
282 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
283 */ 291 */
284 bool use_hierarchy; 292 bool use_hierarchy;
293 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
285 294
286 bool oom_lock; 295 bool oom_lock;
287 atomic_t under_oom; 296 atomic_t under_oom;
@@ -332,8 +341,61 @@ struct mem_cgroup {
332#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
333 struct tcp_memcontrol tcp_mem; 342 struct tcp_memcontrol tcp_mem;
334#endif 343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345 /* analogous to slab_common's slab_caches list. per-memcg */
346 struct list_head memcg_slab_caches;
347 /* Not a spinlock, we can take a lot of time walking the list */
348 struct mutex slab_caches_mutex;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id;
351#endif
335}; 352};
336 353
354/* internal only representation about the status of kmem accounting. */
355enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
357 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
358 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
359};
360
361/* We account when limit is on, but only after call sites are patched */
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
398
337/* Stuffs for move charges at task migration. */ 399/* Stuffs for move charges at task migration. */
338/* 400/*
339 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
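For illustration only (this block is not part of the patch): the three KMEM_ACCOUNTED_* bits above form a small state machine. ACTIVATED is set once, under set_limit_mutex, by whichever writer first enables the limit; ACTIVE is set only after the static branch has been patched and is what the charge paths test; DEAD is set at cgroup destruction if kmem charges are still outstanding, so that the last uncharge can drop the final reference. A minimal user-space model of that lifecycle, using plain bit operations in place of the kernel's set_bit()/test_bit() helpers:

#include <stdbool.h>
#include <stdio.h>

enum { ACTIVE, ACTIVATED, DEAD };               /* mirrors KMEM_ACCOUNTED_* */

struct model_memcg { unsigned long flags; };

static void set_active(struct model_memcg *m)      { m->flags |= 1UL << ACTIVE; }
static bool is_active(const struct model_memcg *m) { return m->flags & (1UL << ACTIVE); }

static void mark_dead(struct model_memcg *m)
{
        if (is_active(m))
                m->flags |= 1UL << DEAD;        /* only active memcgs can leave dead charges */
}

static bool test_and_clear_dead(struct model_memcg *m)
{
        bool was_dead = m->flags & (1UL << DEAD);
        m->flags &= ~(1UL << DEAD);
        return was_dead;
}

int main(void)
{
        struct model_memcg m = { 0 };

        m.flags |= 1UL << ACTIVATED;            /* limit being set; static key about to be patched */
        set_active(&m);                         /* call sites patched, accounting really starts */
        mark_dead(&m);                          /* cgroup destroyed while kmem charges remain */
        if (test_and_clear_dead(&m))
                printf("last uncharge drops the final memcg reference\n");
        return 0;
}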
@@ -388,9 +450,13 @@ enum charge_type {
388}; 450};
389 451
390/* for encoding cft->private value on file */ 452/* for encoding cft->private value on file */
391 #define _MEM (0)
392 #define _MEMSWAP (1)
393 #define _OOM_TYPE (2)
453 enum res_type {
454 	_MEM,
455 	_MEMSWAP,
456 	_OOM_TYPE,
457 	_KMEM,
458 };
459
394#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
395#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
396#define MEMFILE_ATTR(val) ((val) & 0xffff) 462#define MEMFILE_ATTR(val) ((val) & 0xffff)
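For illustration only (not part of the patch): the cft->private encoding above packs the new res_type into the upper 16 bits and the RES_* attribute into the lower 16. A stand-alone round-trip sketch; the numeric value of RES_LIMIT below is a placeholder, not the kernel's res_counter constant:

#include <assert.h>
#include <stdio.h>

enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, _KMEM };
#define RES_LIMIT 2                              /* placeholder value for illustration */

#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

int main(void)
{
        int priv = MEMFILE_PRIVATE(_KMEM, RES_LIMIT);

        assert(MEMFILE_TYPE(priv) == _KMEM);     /* upper half: which counter */
        assert(MEMFILE_ATTR(priv) == RES_LIMIT); /* lower half: which attribute */
        printf("private=0x%x type=%d attr=%d\n",
               (unsigned)priv, MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
        return 0;
}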
@@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
487} 553}
488#endif 554#endif
489 555
556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
561 * but only a few kmem-limited. Or also, if we have, for instance, 200
562 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
563 * 200 entry array for that.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577/*
578 * MIN_SIZE is not 1, because we would like to avoid going through
579 * the alloc/free process all the time. On a small machine, 4 kmem-limited
580 * cgroups is a reasonable guess. In the future, it could be a parameter or
581 * tunable, but that is strictly not necessary.
582 *
583 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
584 * this constant directly from cgroup, but it is understandable that this is
585 * better kept as an internal representation in cgroup.c. In any case, the
586 * css_id space is not getting any smaller, and we don't have to necessarily
587 * increase ours as well if it increases.
588 */
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592/*
593 * A lot of the calls to the cache allocation functions are expected to be
594 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
595 * conditional to this static branch, we'll have to allow modules that does
596 * kmem_cache_alloc and the such to see this symbol as well
597 */
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in the kmem destruction function,
609 * since the charges will outlive the cgroup.
610 */
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
490static void drain_all_stock_async(struct mem_cgroup *memcg); 625static void drain_all_stock_async(struct mem_cgroup *memcg);
491 626
492static struct mem_cgroup_per_zone * 627static struct mem_cgroup_per_zone *
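For illustration only (not part of the patch): MEMCG_CACHES_MIN_SIZE and MEMCG_CACHES_MAX_SIZE bound the per-root-cache memcg_caches array, whose size is derived later in this patch by memcg_caches_array_size(): double the number of kmem-limited groups, then clamp to [4, 65535]. A stand-alone sketch of that sizing policy:

#include <stdio.h>

#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

/* Same policy as memcg_caches_array_size(): double, then clamp. */
static long caches_array_size(int num_groups)
{
        long size;

        if (num_groups <= 0)
                return 0;
        size = 2L * num_groups;
        if (size < MEMCG_CACHES_MIN_SIZE)
                size = MEMCG_CACHES_MIN_SIZE;
        else if (size > MEMCG_CACHES_MAX_SIZE)
                size = MEMCG_CACHES_MAX_SIZE;
        return size;
}

int main(void)
{
        int samples[] = { 1, 3, 200, 40000 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%d kmem-limited groups -> array of %ld slots\n",
                       samples[i], caches_array_size(samples[i]));
        return 0;
}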
@@ -1453,6 +1588,10 @@ done:
1453 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1454 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1456} 1595}
1457 1596
1458/* 1597/*
@@ -2060,20 +2199,28 @@ struct memcg_stock_pcp {
2060static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2061static DEFINE_MUTEX(percpu_charge_mutex); 2200static DEFINE_MUTEX(percpu_charge_mutex);
2062 2201
2063 /*
2064  * Try to consume stocked charge on this cpu. If success, one page is consumed
2065  * from local stock and true is returned. If the stock is 0 or charges from a
2066  * cgroup which is not current target, returns false. This stock will be
2067  * refilled.
2068  */
2069 static bool consume_stock(struct mem_cgroup *memcg)
2202 /**
2203  * consume_stock: Try to consume stocked charge on this cpu.
2204  * @memcg: memcg to consume from.
2205  * @nr_pages: how many pages to charge.
2206  *
2207  * The charges will only happen if @memcg matches the current cpu's memcg
2208  * stock, and at least @nr_pages are available in that stock. Failure to
2209  * service an allocation will refill the stock.
2210  *
2211  * Returns true if successful, false otherwise.
2212  */
2213 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2070{ 2214{
2071 struct memcg_stock_pcp *stock; 2215 struct memcg_stock_pcp *stock;
2072 bool ret = true; 2216 bool ret = true;
2073 2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2074 stock = &get_cpu_var(memcg_stock); 2221 stock = &get_cpu_var(memcg_stock);
2075 if (memcg == stock->cached && stock->nr_pages)
2076 stock->nr_pages--;
2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2223 stock->nr_pages -= nr_pages;
2077 else /* need to call res_counter_charge */ 2224 else /* need to call res_counter_charge */
2078 ret = false; 2225 ret = false;
2079 put_cpu_var(memcg_stock); 2226 put_cpu_var(memcg_stock);
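For illustration only (not part of the patch): a simplified single-CPU model of the reworked consume_stock(). Requests larger than CHARGE_BATCH bypass the per-cpu stock entirely, and a request is served only when the stock is cached for the same memcg and holds at least nr_pages; otherwise the caller falls back to res_counter_charge(). The value 32 below mirrors the kernel's CHARGE_BATCH and is an assumption of this sketch:

#include <stdbool.h>

#define CHARGE_BATCH 32U                 /* assumed to match the kernel's CHARGE_BATCH */

struct stock {
        int cached_memcg;                /* stand-in for the struct mem_cgroup pointer */
        unsigned int nr_pages;
};

static bool consume_stock(struct stock *s, int memcg, unsigned int nr_pages)
{
        if (nr_pages > CHARGE_BATCH)     /* too big for batching, go to the res_counter */
                return false;
        if (s->cached_memcg == memcg && s->nr_pages >= nr_pages) {
                s->nr_pages -= nr_pages;
                return true;
        }
        return false;                    /* caller falls back to res_counter_charge() */
}

int main(void)
{
        struct stock s = { .cached_memcg = 1, .nr_pages = 16 };
        bool ok  = consume_stock(&s, 1, 4);   /* served from stock: 12 pages remain */
        bool big = consume_stock(&s, 1, 64);  /* larger than CHARGE_BATCH: refused */

        return (ok && !big) ? 0 : 1;
}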
@@ -2250,7 +2397,8 @@ enum {
2250}; 2397};
2251 2398
2252static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2253 unsigned int nr_pages, bool oom_check)
2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2254{ 2402{
2255 unsigned long csize = nr_pages * PAGE_SIZE; 2403 unsigned long csize = nr_pages * PAGE_SIZE;
2256 struct mem_cgroup *mem_over_limit; 2404 struct mem_cgroup *mem_over_limit;
@@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2273 } else 2421 } else
2274 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2275 /* 2423 /*
2276 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2277 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2278 *
2279 * Never reclaim on behalf of optional batching, retry with a 2424 * Never reclaim on behalf of optional batching, retry with a
2280 * single page instead. 2425 * single page instead.
2281 */ 2426 */
2282 if (nr_pages == CHARGE_BATCH)
2427 if (nr_pages > min_pages)
2283 return CHARGE_RETRY; 2428 return CHARGE_RETRY;
2284 2429
2285 if (!(gfp_mask & __GFP_WAIT)) 2430 if (!(gfp_mask & __GFP_WAIT))
2286 return CHARGE_WOULDBLOCK; 2431 return CHARGE_WOULDBLOCK;
2287 2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2288 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2289 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2290 return CHARGE_RETRY; 2438 return CHARGE_RETRY;
@@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2297 * unlikely to succeed so close to the limit, and we fall back 2445 * unlikely to succeed so close to the limit, and we fall back
2298 * to regular pages anyway in case of failure. 2446 * to regular pages anyway in case of failure.
2299 */ 2447 */
2300 if (nr_pages == 1 && ret)
2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2301 return CHARGE_RETRY; 2449 return CHARGE_RETRY;
2302 2450
2303 /* 2451 /*
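For illustration only (not part of the patch): after a failed res_counter charge, mem_cgroup_do_charge() now walks a short decision ladder: retry with the precise size if the failure was only due to optional batching, bail out for callers that cannot wait, fail fast when __GFP_NORETRY was passed, and otherwise retry when reclaim made enough room or the request is no larger than 2^PAGE_ALLOC_COSTLY_ORDER pages. A condensed model of that ordering, with the gfp bits as placeholders and the final OOM handling omitted:

#include <stdbool.h>

enum charge_result { CHARGE_RETRY, CHARGE_WOULDBLOCK, CHARGE_NOMEM };

#define GFP_WAIT     (1 << 0)            /* placeholder for __GFP_WAIT */
#define GFP_NORETRY  (1 << 1)            /* placeholder for __GFP_NORETRY */
#define COSTLY_ORDER 3                   /* placeholder for PAGE_ALLOC_COSTLY_ORDER */

static enum charge_result after_counter_failure(unsigned gfp, unsigned nr_pages,
                                                unsigned min_pages, bool reclaimed_some,
                                                unsigned margin_pages)
{
        if (nr_pages > min_pages)        /* never reclaim on behalf of optional batching */
                return CHARGE_RETRY;     /* retry with the precise size instead */
        if (!(gfp & GFP_WAIT))
                return CHARGE_WOULDBLOCK;
        if (gfp & GFP_NORETRY)
                return CHARGE_NOMEM;     /* caller asked us not to try hard */
        if (margin_pages >= nr_pages)    /* reclaim freed enough room */
                return CHARGE_RETRY;
        if (nr_pages <= (1u << COSTLY_ORDER) && reclaimed_some)
                return CHARGE_RETRY;     /* small request and reclaim made progress */
        return CHARGE_NOMEM;             /* real code would consider OOM here */
}

int main(void)
{
        return after_counter_failure(GFP_WAIT, 1, 1, true, 0) == CHARGE_RETRY ? 0 : 1;
}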
@@ -2371,7 +2519,7 @@ again:
2371 memcg = *ptr; 2519 memcg = *ptr;
2372 if (mem_cgroup_is_root(memcg)) 2520 if (mem_cgroup_is_root(memcg))
2373 goto done; 2521 goto done;
2374 if (nr_pages == 1 && consume_stock(memcg))
2522 if (consume_stock(memcg, nr_pages))
2375 goto done; 2523 goto done;
2376 css_get(&memcg->css); 2524 css_get(&memcg->css);
2377 } else { 2525 } else {
@@ -2396,7 +2544,7 @@ again:
2396 rcu_read_unlock(); 2544 rcu_read_unlock();
2397 goto done; 2545 goto done;
2398 } 2546 }
2399 if (nr_pages == 1 && consume_stock(memcg)) {
2547 if (consume_stock(memcg, nr_pages)) {
2400 /* 2548 /*
2401 * It seems dagerous to access memcg without css_get(). 2549 * It seems dagerous to access memcg without css_get().
2402 * But considering how consume_stok works, it's not 2550 * But considering how consume_stok works, it's not
@@ -2431,7 +2579,8 @@ again:
2431 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2432 } 2580 }
2433 2581
2434 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2435 switch (ret) { 2584 switch (ret) {
2436 case CHARGE_OK: 2585 case CHARGE_OK:
2437 break; 2586 break;
@@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2624 memcg_check_events(memcg, page); 2773 memcg_check_events(memcg, page);
2625} 2774}
2626 2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
2779static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2780{
2781 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2782 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2783}
2784
2785/*
2786 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2787 * in the memcg_cache_params struct.
2788 */
2789static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2790{
2791 struct kmem_cache *cachep;
2792
2793 VM_BUG_ON(p->is_root_cache);
2794 cachep = p->root_cache;
2795 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2796}
2797
2798#ifdef CONFIG_SLABINFO
2799static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2800 struct seq_file *m)
2801{
2802 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2803 struct memcg_cache_params *params;
2804
2805 if (!memcg_can_account_kmem(memcg))
2806 return -EIO;
2807
2808 print_slabinfo_header(m);
2809
2810 mutex_lock(&memcg->slab_caches_mutex);
2811 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2812 cache_show(memcg_params_to_cache(params), m);
2813 mutex_unlock(&memcg->slab_caches_mutex);
2814
2815 return 0;
2816}
2817#endif
2818
2819static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2820{
2821 struct res_counter *fail_res;
2822 struct mem_cgroup *_memcg;
2823 int ret = 0;
2824 bool may_oom;
2825
2826 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2827 if (ret)
2828 return ret;
2829
2830 /*
2831 * Conditions under which we can wait for the oom_killer. Those are
2832 * the same conditions tested by the core page allocator
2833 */
2834 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2835
2836 _memcg = memcg;
2837 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2838 &_memcg, may_oom);
2839
2840 if (ret == -EINTR) {
2841 /*
2842 * __mem_cgroup_try_charge() chose to bypass to root due to
2843 * OOM kill or fatal signal. Since our only options are to
2844 * either fail the allocation or charge it to this cgroup, do
2845 * it as a temporary condition. But we can't fail. From a
2846 * kmem/slab perspective, the cache has already been selected,
2847 * by mem_cgroup_kmem_get_cache(), so it is too late to change
2848 * our minds.
2849 *
2850 * This condition will only trigger if the task entered
2851 * memcg_charge_kmem in a sane state, but was OOM-killed during
2852 * __mem_cgroup_try_charge() above. Tasks that were already
2853 * dying when the allocation triggers should have been already
2854 * directed to the root cgroup in memcontrol.h
2855 */
2856 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2857 if (do_swap_account)
2858 res_counter_charge_nofail(&memcg->memsw, size,
2859 &fail_res);
2860 ret = 0;
2861 } else if (ret)
2862 res_counter_uncharge(&memcg->kmem, size);
2863
2864 return ret;
2865}
2866
2867static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2868{
2869 res_counter_uncharge(&memcg->res, size);
2870 if (do_swap_account)
2871 res_counter_uncharge(&memcg->memsw, size);
2872
2873 /* Not down to 0 */
2874 if (res_counter_uncharge(&memcg->kmem, size))
2875 return;
2876
2877 if (memcg_kmem_test_and_clear_dead(memcg))
2878 mem_cgroup_put(memcg);
2879}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891/*
2892 * Helper for accessing a memcg's index. It will be used as an index in the
2893 * child cache array in kmem_cache, and also to derive its name. This function
2894 * will return -1 when this is not a kmem-limited memcg.
2895 */
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
2901/*
2902 * This ends up being protected by the set_limit mutex, during normal
2903 * operation, because that is its main call site.
2904 *
2905 * But when we create a new cache, we can call this as well if its parent
2906 * is kmem-limited. That will have to hold set_limit_mutex as well.
2907 */
2908int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2909{
2910 int num, ret;
2911
2912 num = ida_simple_get(&kmem_limited_groups,
2913 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2914 if (num < 0)
2915 return num;
2916 /*
2917 * After this point, kmem_accounted (which we test atomically at
2918 * the beginning of this conditional) is no longer 0. This
2919 * guarantees only one process will set the following boolean
2920 * to true. We don't need test_and_set because we're protected
2921 * by the set_limit_mutex anyway.
2922 */
2923 memcg_kmem_set_activated(memcg);
2924
2925 ret = memcg_update_all_caches(num+1);
2926 if (ret) {
2927 ida_simple_remove(&kmem_limited_groups, num);
2928 memcg_kmem_clear_activated(memcg);
2929 return ret;
2930 }
2931
2932 memcg->kmemcg_id = num;
2933 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
2934 mutex_init(&memcg->slab_caches_mutex);
2935 return 0;
2936}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
2952
2953/*
2954 * We should update the current array size iff all cache updates succeed. This
2955 * can only be done from the slab side. The slab mutex needs to be held when
2956 * calling this.
2957 */
2958void memcg_update_array_size(int num)
2959{
2960 if (num > memcg_limited_groups_array_size)
2961 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2962}
2963
2964int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2965{
2966 struct memcg_cache_params *cur_params = s->memcg_params;
2967
2968 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
2969
2970 if (num_groups > memcg_limited_groups_array_size) {
2971 int i;
2972 ssize_t size = memcg_caches_array_size(num_groups);
2973
2974 size *= sizeof(void *);
2975 size += sizeof(struct memcg_cache_params);
2976
2977 s->memcg_params = kzalloc(size, GFP_KERNEL);
2978 if (!s->memcg_params) {
2979 s->memcg_params = cur_params;
2980 return -ENOMEM;
2981 }
2982
2983 s->memcg_params->is_root_cache = true;
2984
2985 /*
2986 * There is the chance it will be bigger than
2987 * memcg_limited_groups_array_size, if we failed an allocation
2988 * in a cache, in which case all caches updated before it will
2989 * have a bigger array.
2990 *
2991 * But if that is the case, the data after
2992 * memcg_limited_groups_array_size is certainly unused
2993 */
2994 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2995 if (!cur_params->memcg_caches[i])
2996 continue;
2997 s->memcg_params->memcg_caches[i] =
2998 cur_params->memcg_caches[i];
2999 }
3000
3001 /*
3002 * Ideally, we would wait until all caches succeed, and only
3003 * then free the old one. But this is not worth the extra
3004 * pointer per-cache we'd have to have for this.
3005 *
3006 * It is not a big deal if some caches are left with a size
3007 * bigger than the others. And all updates will reset this
3008 * anyway.
3009 */
3010 kfree(cur_params);
3011 }
3012 return 0;
3013}
3014
3015int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3016 struct kmem_cache *root_cache)
3017{
3018 size_t size = sizeof(struct memcg_cache_params);
3019
3020 if (!memcg_kmem_enabled())
3021 return 0;
3022
3023 if (!memcg)
3024 size += memcg_limited_groups_array_size * sizeof(void *);
3025
3026 s->memcg_params = kzalloc(size, GFP_KERNEL);
3027 if (!s->memcg_params)
3028 return -ENOMEM;
3029
3030 if (memcg) {
3031 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache;
3033 }
3034 return 0;
3035}
3036
3037void memcg_release_cache(struct kmem_cache *s)
3038{
3039 struct kmem_cache *root;
3040 struct mem_cgroup *memcg;
3041 int id;
3042
3043 /*
3044 * This happens, for instance, when a root cache goes away before we
3045 * add any memcg.
3046 */
3047 if (!s->memcg_params)
3048 return;
3049
3050 if (s->memcg_params->is_root_cache)
3051 goto out;
3052
3053 memcg = s->memcg_params->memcg;
3054 id = memcg_cache_id(memcg);
3055
3056 root = s->memcg_params->root_cache;
3057 root->memcg_params->memcg_caches[id] = NULL;
3058 mem_cgroup_put(memcg);
3059
3060 mutex_lock(&memcg->slab_caches_mutex);
3061 list_del(&s->memcg_params->list);
3062 mutex_unlock(&memcg->slab_caches_mutex);
3063
3064out:
3065 kfree(s->memcg_params);
3066}
3067
3068/*
3069 * During the creation of a new cache, we need to disable our accounting mechanism
3070 * altogether. This is true even if we are not creating, but rather just
3071 * enqueuing new caches to be created.
3072 *
3073 * This is because that process will trigger allocations; some visible, like
3074 * explicit kmallocs to auxiliary data structures, name strings and internal
3075 * cache structures; some well concealed, like INIT_WORK() that can allocate
3076 * objects during debug.
3077 *
3078 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3079 * to it. This may not be a bounded recursion: since the first cache creation
3080 * failed to complete (waiting on the allocation), we'll just try to create the
3081 * cache again, failing at the same point.
3082 *
3083 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3084 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3085 * inside the following two functions.
3086 */
3087static inline void memcg_stop_kmem_account(void)
3088{
3089 VM_BUG_ON(!current->mm);
3090 current->memcg_kmem_skip_account++;
3091}
3092
3093static inline void memcg_resume_kmem_account(void)
3094{
3095 VM_BUG_ON(!current->mm);
3096 current->memcg_kmem_skip_account--;
3097}
3098
3099static void kmem_cache_destroy_work_func(struct work_struct *w)
3100{
3101 struct kmem_cache *cachep;
3102 struct memcg_cache_params *p;
3103
3104 p = container_of(w, struct memcg_cache_params, destroy);
3105
3106 cachep = memcg_params_to_cache(p);
3107
3108 /*
3109 * If we get down to 0 after shrink, we could delete right away.
3110 * However, memcg_release_pages() already puts us back in the workqueue
3111 * in that case. If we proceed deleting, we'll get a dangling
3112 * reference, and removing the object from the workqueue in that case
3113 * is unnecessary complication. We are not a fast path.
3114 *
3115 * Note that this case is fundamentally different from racing with
3116 * shrink_slab(): if mem_cgroup_destroy_cache() is called in
3117 * kmem_cache_shrink, not only would we be reinserting a dead cache
3118 * into the queue, but doing so from inside the worker racing to
3119 * destroy it.
3120 *
3121 * So if we aren't down to zero, we'll just schedule a worker and try
3122 * again
3123 */
3124 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3125 kmem_cache_shrink(cachep);
3126 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3127 return;
3128 } else
3129 kmem_cache_destroy(cachep);
3130}
3131
3132void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3133{
3134 if (!cachep->memcg_params->dead)
3135 return;
3136
3137 /*
3138 * There are many ways in which we can get here.
3139 *
3140 * We can get to a memory-pressure situation while the delayed work is
3141 * still pending to run. The vmscan shrinkers can then release all
3142 * cache memory and get us to destruction. If this is the case, we'll
3143 * be executed twice, which is a bug (the second time will execute over
3144 * bogus data). In this case, cancelling the work should be fine.
3145 *
3146 * But we can also get here from the worker itself, if
3147 * kmem_cache_shrink is enough to shake all the remaining objects and
3148 * get the page count to 0. In this case, we'll deadlock if we try to
3149 * cancel the work (the worker runs with an internal lock held, which
3150 * is the same lock we would hold for cancel_work_sync().)
3151 *
3152 * Since we can't possibly know who got us here, just refrain from
3153 * running if there is already work pending
3154 */
3155 if (work_pending(&cachep->memcg_params->destroy))
3156 return;
3157 /*
3158 * We have to defer the actual destroying to a workqueue, because
3159 * we might currently be in a context that cannot sleep.
3160 */
3161 schedule_work(&cachep->memcg_params->destroy);
3162}
3163
3164static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
3165{
3166 char *name;
3167 struct dentry *dentry;
3168
3169 rcu_read_lock();
3170 dentry = rcu_dereference(memcg->css.cgroup->dentry);
3171 rcu_read_unlock();
3172
3173 BUG_ON(dentry == NULL);
3174
3175 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3176 memcg_cache_id(memcg), dentry->d_name.name);
3177
3178 return name;
3179}
3180
3181static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3182 struct kmem_cache *s)
3183{
3184 char *name;
3185 struct kmem_cache *new;
3186
3187 name = memcg_cache_name(memcg, s);
3188 if (!name)
3189 return NULL;
3190
3191 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
3192 (s->flags & ~SLAB_PANIC), s->ctor, s);
3193
3194 if (new)
3195 new->allocflags |= __GFP_KMEMCG;
3196
3197 kfree(name);
3198 return new;
3199}
3200
3201/*
3202 * This lock protects updaters, not readers. We want readers to be as fast as
3203 * they can, and they will either see NULL or a valid cache value. Our model
3204 * allows them to see NULL, in which case the root memcg will be selected.
3205 *
3206 * We need this lock because multiple allocations to the same cache can
3207 * span more than one worker. Only one of them can create the cache.
3208 */
3209static DEFINE_MUTEX(memcg_cache_mutex);
3210static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3211 struct kmem_cache *cachep)
3212{
3213 struct kmem_cache *new_cachep;
3214 int idx;
3215
3216 BUG_ON(!memcg_can_account_kmem(memcg));
3217
3218 idx = memcg_cache_id(memcg);
3219
3220 mutex_lock(&memcg_cache_mutex);
3221 new_cachep = cachep->memcg_params->memcg_caches[idx];
3222 if (new_cachep)
3223 goto out;
3224
3225 new_cachep = kmem_cache_dup(memcg, cachep);
3226 if (new_cachep == NULL) {
3227 new_cachep = cachep;
3228 goto out;
3229 }
3230
3231 mem_cgroup_get(memcg);
3232 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3233
3234 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3235 /*
3236 * the readers won't lock, make sure everybody sees the updated value,
3237 * so they won't put stuff in the queue again for no reason
3238 */
3239 wmb();
3240out:
3241 mutex_unlock(&memcg_cache_mutex);
3242 return new_cachep;
3243}
3244
3245void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3246{
3247 struct kmem_cache *c;
3248 int i;
3249
3250 if (!s->memcg_params)
3251 return;
3252 if (!s->memcg_params->is_root_cache)
3253 return;
3254
3255 /*
3256 * If the cache is being destroyed, we trust that there is no one else
3257 * requesting objects from it. Even if there are, the sanity checks in
3258 * kmem_cache_destroy should catch this ill case.
3259 *
3260 * Still, we don't want anyone else freeing memcg_caches under our
3261 * noses, which can happen if a new memcg comes to life. As usual,
3262 * we'll take the set_limit_mutex to protect ourselves against this.
3263 */
3264 mutex_lock(&set_limit_mutex);
3265 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3266 c = s->memcg_params->memcg_caches[i];
3267 if (!c)
3268 continue;
3269
3270 /*
3271 * We will now manually delete the caches, so to avoid races
3272 * we need to cancel all pending destruction workers and
3273 * proceed with destruction ourselves.
3274 *
3275 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3276 * and that could spawn the workers again: it is likely that
3277 * the cache still has active pages at this very moment.
3278 * This would lead us back to mem_cgroup_destroy_cache.
3279 *
3280 * But that will not execute at all if the "dead" flag is not
3281 * set, so flip it down to guarantee we are in control.
3282 */
3283 c->memcg_params->dead = false;
3284 cancel_work_sync(&c->memcg_params->destroy);
3285 kmem_cache_destroy(c);
3286 }
3287 mutex_unlock(&set_limit_mutex);
3288}
3289
3290struct create_work {
3291 struct mem_cgroup *memcg;
3292 struct kmem_cache *cachep;
3293 struct work_struct work;
3294};
3295
3296static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3297{
3298 struct kmem_cache *cachep;
3299 struct memcg_cache_params *params;
3300
3301 if (!memcg_kmem_is_active(memcg))
3302 return;
3303
3304 mutex_lock(&memcg->slab_caches_mutex);
3305 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3306 cachep = memcg_params_to_cache(params);
3307 cachep->memcg_params->dead = true;
3308 INIT_WORK(&cachep->memcg_params->destroy,
3309 kmem_cache_destroy_work_func);
3310 schedule_work(&cachep->memcg_params->destroy);
3311 }
3312 mutex_unlock(&memcg->slab_caches_mutex);
3313}
3314
3315static void memcg_create_cache_work_func(struct work_struct *w)
3316{
3317 struct create_work *cw;
3318
3319 cw = container_of(w, struct create_work, work);
3320 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3321 /* Drop the reference gotten when we enqueued. */
3322 css_put(&cw->memcg->css);
3323 kfree(cw);
3324}
3325
3326/*
3327 * Enqueue the creation of a per-memcg kmem_cache.
3328 * Called with rcu_read_lock.
3329 */
3330static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3331 struct kmem_cache *cachep)
3332{
3333 struct create_work *cw;
3334
3335 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3336 if (cw == NULL)
3337 return;
3338
3339 /* The corresponding put will be done in the workqueue. */
3340 if (!css_tryget(&memcg->css)) {
3341 kfree(cw);
3342 return;
3343 }
3344
3345 cw->memcg = memcg;
3346 cw->cachep = cachep;
3347
3348 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3349 schedule_work(&cw->work);
3350}
3351
3352static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3353 struct kmem_cache *cachep)
3354{
3355 /*
3356 * We need to stop accounting when we kmalloc, because if the
3357 * corresponding kmalloc cache is not yet created, the first allocation
3358 * in __memcg_create_cache_enqueue will recurse.
3359 *
3360 * However, it is better to enclose the whole function. Depending on
3361 * the debugging options enabled, INIT_WORK(), for instance, can
3362 * trigger an allocation. This too, will make us recurse. Because at
3363 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3364 * the safest choice is to do it like this, wrapping the whole function.
3365 */
3366 memcg_stop_kmem_account();
3367 __memcg_create_cache_enqueue(memcg, cachep);
3368 memcg_resume_kmem_account();
3369}
3370/*
3371 * Return the kmem_cache we're supposed to use for a slab allocation.
3372 * We try to use the current memcg's version of the cache.
3373 *
3374 * If the cache does not exist yet, if we are the first user of it,
3375 * we either create it immediately, if possible, or create it asynchronously
3376 * in a workqueue.
3377 * In the latter case, we will let the current allocation go through with
3378 * the original cache.
3379 *
3380 * Can't be called in interrupt context or from kernel threads.
3381 * This function needs to be called with rcu_read_lock() held.
3382 */
3383struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3384 gfp_t gfp)
3385{
3386 struct mem_cgroup *memcg;
3387 int idx;
3388
3389 VM_BUG_ON(!cachep->memcg_params);
3390 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3391
3392 if (!current->mm || current->memcg_kmem_skip_account)
3393 return cachep;
3394
3395 rcu_read_lock();
3396 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3397 rcu_read_unlock();
3398
3399 if (!memcg_can_account_kmem(memcg))
3400 return cachep;
3401
3402 idx = memcg_cache_id(memcg);
3403
3404 /*
3405 * barrier to make sure we're always seeing the up-to-date value. The
3406 * code updating memcg_caches will issue a write barrier to match this.
3407 */
3408 read_barrier_depends();
3409 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
3410 /*
3411 * If we are in a safe context (can wait, and not in interrupt
3412 * context), we could be predictable and return right away.
3413 * This would guarantee that the allocation being performed
3414 * already belongs in the new cache.
3415 *
3416 * However, there are some clashes that can arise from locking.
3417 * For instance, because we acquire the slab_mutex while doing
3418 * kmem_cache_dup, this means no further allocation could happen
3419 * with the slab_mutex held.
3420 *
3421 * Also, because cache creation issues get_online_cpus(), this
3422 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3423 * that ends up reversed during cpu hotplug. (cpuset allocates
3424 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3425 * better to defer everything.
3426 */
3427 memcg_create_cache_enqueue(memcg, cachep);
3428 return cachep;
3429 }
3430
3431 return cachep->memcg_params->memcg_caches[idx];
3432}
3433EXPORT_SYMBOL(__memcg_kmem_get_cache);
3434
3435/*
3436 * We need to verify if the allocation against current->mm->owner's memcg is
3437 * possible for the given order. But the page is not allocated yet, so we'll
3438 * need a further commit step to do the final arrangements.
3439 *
3440 * It is possible for the task to switch cgroups in the meantime, so at
3441 * commit time, we can't rely on task conversion any longer. We'll then use
3442 * the handle argument to return to the caller which cgroup we should commit
3443 * against. We could also return the memcg directly and avoid the pointer
3444 * passing, but a boolean return value gives better semantics considering
3445 * the compiled-out case as well.
3446 *
3447 * Returning true means the allocation is possible.
3448 */
3449bool
3450__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3451{
3452 struct mem_cgroup *memcg;
3453 int ret;
3454
3455 *_memcg = NULL;
3456 memcg = try_get_mem_cgroup_from_mm(current->mm);
3457
3458 /*
3459 * Very rare case described in mem_cgroup_from_task. Unfortunately there
3460 * isn't much we can do without complicating this too much, and it would
3461 * be gfp-dependent anyway. Just let it go.
3462 */
3463 if (unlikely(!memcg))
3464 return true;
3465
3466 if (!memcg_can_account_kmem(memcg)) {
3467 css_put(&memcg->css);
3468 return true;
3469 }
3470
3471 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3472 if (!ret)
3473 *_memcg = memcg;
3474
3475 css_put(&memcg->css);
3476 return (ret == 0);
3477}
3478
3479void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3480 int order)
3481{
3482 struct page_cgroup *pc;
3483
3484 VM_BUG_ON(mem_cgroup_is_root(memcg));
3485
3486 /* The page allocation failed. Revert */
3487 if (!page) {
3488 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3489 return;
3490 }
3491
3492 pc = lookup_page_cgroup(page);
3493 lock_page_cgroup(pc);
3494 pc->mem_cgroup = memcg;
3495 SetPageCgroupUsed(pc);
3496 unlock_page_cgroup(pc);
3497}
3498
3499void __memcg_kmem_uncharge_pages(struct page *page, int order)
3500{
3501 struct mem_cgroup *memcg = NULL;
3502 struct page_cgroup *pc;
3503
3504
3505 pc = lookup_page_cgroup(page);
3506 /*
3507 * Fast unlocked return. Theoretically might have changed, have to
3508 * check again after locking.
3509 */
3510 if (!PageCgroupUsed(pc))
3511 return;
3512
3513 lock_page_cgroup(pc);
3514 if (PageCgroupUsed(pc)) {
3515 memcg = pc->mem_cgroup;
3516 ClearPageCgroupUsed(pc);
3517 }
3518 unlock_page_cgroup(pc);
3519
3520 /*
3521 * We trust that only if there is a memcg associated with the page, it
3522 * is a valid allocation
3523 */
3524 if (!memcg)
3525 return;
3526
3527 VM_BUG_ON(mem_cgroup_is_root(memcg));
3528 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3529}
3530#else
3531static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3532{
3533}
3534#endif /* CONFIG_MEMCG_KMEM */
3535
2627#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2628 3537
2629#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3538#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
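For illustration only (not part of the patch): the page-level kmem hooks added in this hunk amount to a three-step protocol: reserve the charge against the current task's memcg before the page exists (__memcg_kmem_newpage_charge), bind or revert it once the allocation result is known (__memcg_kmem_commit_charge), and release it through the page_cgroup at free time (__memcg_kmem_uncharge_pages). A runnable user-space model of that protocol; the structures and function names below are stand-ins, not the kernel API:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel objects; only the accounting protocol is modeled. */
struct model_memcg { long long kmem_usage, kmem_limit; };
struct model_page  { struct model_memcg *memcg; };      /* plays the page_cgroup role */

static bool newpage_charge(struct model_memcg *cur, struct model_memcg **out, int order)
{
        long long bytes = 4096LL << order;

        *out = NULL;
        if (cur->kmem_usage + bytes > cur->kmem_limit)
                return false;                            /* over the kmem limit: fail the allocation */
        cur->kmem_usage += bytes;
        *out = cur;
        return true;
}

static void commit_charge(struct model_page *page, struct model_memcg *memcg, int order)
{
        if (!page) {                                     /* allocation failed: revert the charge */
                memcg->kmem_usage -= 4096LL << order;
                return;
        }
        page->memcg = memcg;                             /* remember who to uncharge at free time */
}

static void uncharge_pages(struct model_page *page, int order)
{
        if (page->memcg)
                page->memcg->kmem_usage -= 4096LL << order;
}

int main(void)
{
        struct model_memcg memcg = { 0, 1 << 20 };       /* 1MB kmem limit */
        struct model_memcg *charged;
        struct model_page page = { NULL };

        if (newpage_charge(&memcg, &charged, 0)) {       /* step 1: reserve before allocating */
                commit_charge(&page, charged, 0);        /* step 2: bind the charge to the page */
                uncharge_pages(&page, 0);                /* step 3: uncharge at free time */
        }
        printf("kmem usage after free: %lld\n", memcg.kmem_usage);
        return 0;
}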
@@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
3486} 4395}
3487#endif 4396#endif
3488 4397
3489static DEFINE_MUTEX(set_limit_mutex);
3490
3491static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4398static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3492 unsigned long long val) 4399 unsigned long long val)
3493{ 4400{
@@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3772static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4679static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 4680{
3774 int node, zid; 4681 int node, zid;
4682 u64 usage;
3775 4683
3776 do { 4684 do {
3777 /* This is for making all *used* pages to be on LRU. */ 4685 /* This is for making all *used* pages to be on LRU. */
@@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3792 cond_resched(); 4700 cond_resched();
3793 4701
3794 /* 4702 /*
4703 * Kernel memory may not necessarily be trackable to a specific
4704 * process, so it is not migrated, and therefore we can't
4705 * expect its usage to drop to 0 here.
4706 * Having res filled up with kmem only is enough.
4707 *
3795 * This is a safety check because mem_cgroup_force_empty_list 4708 * This is a safety check because mem_cgroup_force_empty_list
3796 * could have raced with mem_cgroup_replace_page_cache callers 4709 * could have raced with mem_cgroup_replace_page_cache callers
3797 * so the lru seemed empty but the page could have been added 4710 * so the lru seemed empty but the page could have been added
3798 * right after the check. RES_USAGE should be safe as we always 4711 * right after the check. RES_USAGE should be safe as we always
3799 * charge before adding to the LRU. 4712 * charge before adding to the LRU.
3800 */ 4713 */
3801 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); 4714 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4715 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4716 } while (usage > 0);
3802} 4717}
3803 4718
3804/* 4719/*
@@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3942 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4857 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3943 char str[64]; 4858 char str[64];
3944 u64 val; 4859 u64 val;
3945 int type, name, len;
4860 int name, len;
4861 enum res_type type;
3946 4862
3947 type = MEMFILE_TYPE(cft->private); 4863 type = MEMFILE_TYPE(cft->private);
3948 name = MEMFILE_ATTR(cft->private); 4864 name = MEMFILE_ATTR(cft->private);
@@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3963 else 4879 else
3964 val = res_counter_read_u64(&memcg->memsw, name); 4880 val = res_counter_read_u64(&memcg->memsw, name);
3965 break; 4881 break;
4882 case _KMEM:
4883 val = res_counter_read_u64(&memcg->kmem, name);
4884 break;
3966 default: 4885 default:
3967 BUG(); 4886 BUG();
3968 } 4887 }
@@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3970 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4889 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3971 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4890 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3972} 4891}
4892
4893static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{
4895 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't
4902 * be changed if the cgroup has children already, or if tasks had
4903 * already joined.
4904 *
4905 * If tasks join before we set the limit, a person looking at
4906 * kmem.usage_in_bytes will have no way to determine when it took
4907 * place, which makes the value quite meaningless.
4908 *
4909 * After it first became limited, changes in the value of the limit are
4910 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */
4918 cgroup_lock();
4919 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY;
4924 goto out;
4925 }
4926 ret = res_counter_set_limit(&memcg->kmem, val);
4927 VM_BUG_ON(ret);
4928
4929 ret = memcg_update_cache_sizes(memcg);
4930 if (ret) {
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out;
4933 }
4934 must_inc_static_branch = true;
4935 /*
4936 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page can contain objects from various
4938 * processes, so it is unfeasible to migrate them away. We
4939 * need to reference count the memcg because of that.
4940 */
4941 mem_cgroup_get(memcg);
4942 } else
4943 ret = res_counter_set_limit(&memcg->kmem, val);
4944out:
4945 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock();
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif
4969 return ret;
4970}
4971
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{
4974 int ret = 0;
4975 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4976 if (!parent)
4977 goto out;
4978
4979 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /*
4982 * When that happens, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to
4984 * complicate the code by keeping track of which memcgs were the ones
4985 * that actually enabled limits, and which ones got it from their
4986 * parents.
4987 *
4988 * It is a lot simpler just to do static_key_slow_inc() on every child
4989 * that is accounted.
4990 */
4991 if (!memcg_kmem_is_active(memcg))
4992 goto out;
4993
4994 /*
4995 * destroy(), called if we fail, will issue static_key_slow_inc() and
4996 * mem_cgroup_put() if kmem is enabled. We have to either call them
4997 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
4998 * this more consistent, since it always leads to the same destroy path
4999 */
5000 mem_cgroup_get(memcg);
5001 static_key_slow_inc(&memcg_kmem_enabled_key);
5002
5003 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex);
5006#endif
5007out:
5008 return ret;
5009}
5010
3973/* 5011/*
3974 * The user of this function is... 5012 * The user of this function is...
3975 * RES_LIMIT. 5013 * RES_LIMIT.
@@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3978 const char *buffer) 5016 const char *buffer)
3979{ 5017{
3980 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5018 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3981 int type, name;
5019 enum res_type type;
5020 int name;
3982 unsigned long long val; 5021 unsigned long long val;
3983 int ret; 5022 int ret;
3984 5023
@@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
4000 break; 5039 break;
4001 if (type == _MEM) 5040 if (type == _MEM)
4002 ret = mem_cgroup_resize_limit(memcg, val); 5041 ret = mem_cgroup_resize_limit(memcg, val);
4003 else
5042 else if (type == _MEMSWAP)
4004 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5043 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5044 else if (type == _KMEM)
5045 ret = memcg_update_kmem_limit(cont, val);
5046 else
5047 return -EINVAL;
4005 break; 5048 break;
4006 case RES_SOFT_LIMIT: 5049 case RES_SOFT_LIMIT:
4007 ret = res_counter_memparse_write_strategy(buffer, &val); 5050 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4054,7 +5097,8 @@ out:
4054static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5097static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4055{ 5098{
4056 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5099 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4057 int type, name;
5100 int name;
5101 enum res_type type;
4058 5102
4059 type = MEMFILE_TYPE(event); 5103 type = MEMFILE_TYPE(event);
4060 name = MEMFILE_ATTR(event); 5104 name = MEMFILE_ATTR(event);
@@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4066 case RES_MAX_USAGE: 5110 case RES_MAX_USAGE:
4067 if (type == _MEM) 5111 if (type == _MEM)
4068 res_counter_reset_max(&memcg->res); 5112 res_counter_reset_max(&memcg->res);
4069 else
5113 else if (type == _MEMSWAP)
4070 res_counter_reset_max(&memcg->memsw); 5114 res_counter_reset_max(&memcg->memsw);
5115 else if (type == _KMEM)
5116 res_counter_reset_max(&memcg->kmem);
5117 else
5118 return -EINVAL;
4071 break; 5119 break;
4072 case RES_FAILCNT: 5120 case RES_FAILCNT:
4073 if (type == _MEM) 5121 if (type == _MEM)
4074 res_counter_reset_failcnt(&memcg->res); 5122 res_counter_reset_failcnt(&memcg->res);
4075 else
5123 else if (type == _MEMSWAP)
4076 res_counter_reset_failcnt(&memcg->memsw); 5124 res_counter_reset_failcnt(&memcg->memsw);
5125 else if (type == _KMEM)
5126 res_counter_reset_failcnt(&memcg->kmem);
5127 else
5128 return -EINVAL;
4077 break; 5129 break;
4078 } 5130 }
4079 5131
@@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4390 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5442 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4391 struct mem_cgroup_thresholds *thresholds; 5443 struct mem_cgroup_thresholds *thresholds;
4392 struct mem_cgroup_threshold_ary *new; 5444 struct mem_cgroup_threshold_ary *new;
4393 int type = MEMFILE_TYPE(cft->private);
5445 enum res_type type = MEMFILE_TYPE(cft->private);
4394 u64 threshold, usage; 5446 u64 threshold, usage;
4395 int i, size, ret; 5447 int i, size, ret;
4396 5448
@@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4473 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5525 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4474 struct mem_cgroup_thresholds *thresholds; 5526 struct mem_cgroup_thresholds *thresholds;
4475 struct mem_cgroup_threshold_ary *new; 5527 struct mem_cgroup_threshold_ary *new;
4476 int type = MEMFILE_TYPE(cft->private);
5528 enum res_type type = MEMFILE_TYPE(cft->private);
4477 u64 usage; 5529 u64 usage;
4478 int i, j, size; 5530 int i, j, size;
4479 5531
@@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4551{ 5603{
4552 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5604 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4553 struct mem_cgroup_eventfd_list *event; 5605 struct mem_cgroup_eventfd_list *event;
4554 int type = MEMFILE_TYPE(cft->private);
5606 enum res_type type = MEMFILE_TYPE(cft->private);
4555 5607
4556 BUG_ON(type != _OOM_TYPE); 5608 BUG_ON(type != _OOM_TYPE);
4557 event = kmalloc(sizeof(*event), GFP_KERNEL); 5609 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4576{ 5628{
4577 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4578 struct mem_cgroup_eventfd_list *ev, *tmp; 5630 struct mem_cgroup_eventfd_list *ev, *tmp;
4579 int type = MEMFILE_TYPE(cft->private);
5631 enum res_type type = MEMFILE_TYPE(cft->private);
4580 5632
4581 BUG_ON(type != _OOM_TYPE); 5633 BUG_ON(type != _OOM_TYPE);
4582 5634
@@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4635#ifdef CONFIG_MEMCG_KMEM 5687#ifdef CONFIG_MEMCG_KMEM
4636static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5688static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4637{ 5689{
5690 int ret;
5691
5692 memcg->kmemcg_id = -1;
5693 ret = memcg_propagate_kmem(memcg);
5694 if (ret)
5695 return ret;
5696
4638 return mem_cgroup_sockets_init(memcg, ss); 5697 return mem_cgroup_sockets_init(memcg, ss);
4639}; 5698};
4640 5699
4641static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5700static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4642{ 5701{
4643 mem_cgroup_sockets_destroy(memcg); 5702 mem_cgroup_sockets_destroy(memcg);
5703
5704 memcg_kmem_mark_dead(memcg);
5705
5706 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5707 return;
5708
5709 /*
5710 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5711 * path here, being careful not to race with memcg_uncharge_kmem: it is
5712 * possible that the charges went down to 0 between mark_dead and the
5713 * res_counter read, so in that case, we don't need the put
5714 */
5715 if (memcg_kmem_test_and_clear_dead(memcg))
5716 mem_cgroup_put(memcg);
4644} 5717}
4645#else 5718#else
4646static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5719static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4749,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
4749 .read = mem_cgroup_read, 5822 .read = mem_cgroup_read,
4750 }, 5823 },
4751#endif 5824#endif
5825#ifdef CONFIG_MEMCG_KMEM
5826 {
5827 .name = "kmem.limit_in_bytes",
5828 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5829 .write_string = mem_cgroup_write,
5830 .read = mem_cgroup_read,
5831 },
5832 {
5833 .name = "kmem.usage_in_bytes",
5834 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5835 .read = mem_cgroup_read,
5836 },
5837 {
5838 .name = "kmem.failcnt",
5839 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5840 .trigger = mem_cgroup_reset,
5841 .read = mem_cgroup_read,
5842 },
5843 {
5844 .name = "kmem.max_usage_in_bytes",
5845 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5846 .trigger = mem_cgroup_reset,
5847 .read = mem_cgroup_read,
5848 },
5849#ifdef CONFIG_SLABINFO
5850 {
5851 .name = "kmem.slabinfo",
5852 .read_seq_string = mem_cgroup_slabinfo_read,
5853 },
5854#endif
5855#endif
4752 { }, /* terminate */ 5856 { }, /* terminate */
4753}; 5857};
4754 5858
@@ -4816,16 +5920,29 @@ out_free:
4816} 5920}
4817 5921
4818/* 5922/*
4819 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5923 * At destroying mem_cgroup, references from swap_cgroup can remain.
4820 * but in process context. The work_freeing structure is overlaid 5924 * (scanning all at force_empty is too costly...)
4821 * on the rcu_freeing structure, which itself is overlaid on memsw. 5925 *
5926 * Instead of clearing all references at force_empty, we remember
5927 * the number of reference from swap_cgroup and free mem_cgroup when
5928 * it goes down to 0.
5929 *
5930 * Removal of cgroup itself succeeds regardless of refs from swap.
4822 */ 5931 */
4823static void free_work(struct work_struct *work) 5932
5933static void __mem_cgroup_free(struct mem_cgroup *memcg)
4824{ 5934{
4825 struct mem_cgroup *memcg; 5935 int node;
4826 int size = sizeof(struct mem_cgroup); 5936 int size = sizeof(struct mem_cgroup);
4827 5937
4828 memcg = container_of(work, struct mem_cgroup, work_freeing); 5938 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css);
5940
5941 for_each_node(node)
5942 free_mem_cgroup_per_zone_info(memcg, node);
5943
5944 free_percpu(memcg->stat);
5945
4829 /* 5946 /*
4830 * We need to make sure that (at least for now), the jump label 5947 * We need to make sure that (at least for now), the jump label
4831 * destruction code runs outside of the cgroup lock. This is because 5948 * destruction code runs outside of the cgroup lock. This is because
@@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work)
4837 * to move this code around, and make sure it is outside 5954 * to move this code around, and make sure it is outside
4838 * the cgroup_lock. 5955 * the cgroup_lock.
4839 */ 5956 */
4840 disarm_sock_keys(memcg);
5957 disarm_static_keys(memcg);
4841 if (size < PAGE_SIZE) 5958 if (size < PAGE_SIZE)
4842 kfree(memcg); 5959 kfree(memcg);
4843 else 5960 else
4844 vfree(memcg); 5961 vfree(memcg);
4845} 5962}
4846 5963
4847static void free_rcu(struct rcu_head *rcu_head)
4848{
4849 struct mem_cgroup *memcg;
4850
4851 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4852 INIT_WORK(&memcg->work_freeing, free_work);
4853 schedule_work(&memcg->work_freeing);
4854}
4855 5964
4856/* 5965/*
4857 * At destroying mem_cgroup, references from swap_cgroup can remain. 5966 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4858 * (scanning all at force_empty is too costly...) 5967 * but in process context. The work_freeing structure is overlaid
4859 * 5968 * on the rcu_freeing structure, which itself is overlaid on memsw.
4860 * Instead of clearing all references at force_empty, we remember
4861 * the number of reference from swap_cgroup and free mem_cgroup when
4862 * it goes down to 0.
4863 *
4864 * Removal of cgroup itself succeeds regardless of refs from swap.
4865 */ 5969 */
4866 5970static void free_work(struct work_struct *work)
4867static void __mem_cgroup_free(struct mem_cgroup *memcg)
4868{ 5971{
4869 int node; 5972 struct mem_cgroup *memcg;
4870 5973
4871 mem_cgroup_remove_from_trees(memcg); 5974 memcg = container_of(work, struct mem_cgroup, work_freeing);
4872 free_css_id(&mem_cgroup_subsys, &memcg->css); 5975 __mem_cgroup_free(memcg);
5976}
4873 5977
4874 for_each_node(node) 5978static void free_rcu(struct rcu_head *rcu_head)
4875 free_mem_cgroup_per_zone_info(memcg, node); 5979{
5980 struct mem_cgroup *memcg;
4876 5981
4877 free_percpu(memcg->stat); 5982 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4878 call_rcu(&memcg->rcu_freeing, free_rcu); 5983 INIT_WORK(&memcg->work_freeing, free_work);
5984 schedule_work(&memcg->work_freeing);
4879} 5985}
4880 5986
4881static void mem_cgroup_get(struct mem_cgroup *memcg) 5987static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4887{ 5993{
4888 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5994 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4889 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5995 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4890 __mem_cgroup_free(memcg);
5996 call_rcu(&memcg->rcu_freeing, free_rcu);
4891 if (parent) 5997 if (parent)
4892 mem_cgroup_put(parent); 5998 mem_cgroup_put(parent);
4893 } 5999 }
@@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont)
4994 if (parent && parent->use_hierarchy) { 6100 if (parent && parent->use_hierarchy) {
4995 res_counter_init(&memcg->res, &parent->res); 6101 res_counter_init(&memcg->res, &parent->res);
4996 res_counter_init(&memcg->memsw, &parent->memsw); 6102 res_counter_init(&memcg->memsw, &parent->memsw);
6103 res_counter_init(&memcg->kmem, &parent->kmem);
6104
4997 /* 6105 /*
4998 * We increment refcnt of the parent to ensure that we can 6106 * We increment refcnt of the parent to ensure that we can
4999 * safely access it on res_counter_charge/uncharge. 6107 * safely access it on res_counter_charge/uncharge.
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
5004 } else { 6112 } else {
5005 res_counter_init(&memcg->res, NULL); 6113 res_counter_init(&memcg->res, NULL);
5006 res_counter_init(&memcg->memsw, NULL); 6114 res_counter_init(&memcg->memsw, NULL);
6115 res_counter_init(&memcg->kmem, NULL);
5007 /* 6116 /*
5008 * Deeper hierachy with use_hierarchy == false doesn't make 6117 * Deeper hierachy with use_hierarchy == false doesn't make
5009 * much sense so let cgroup subsystem know about this 6118 * much sense so let cgroup subsystem know about this
@@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
5043 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6152 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5044 6153
5045 mem_cgroup_reparent_charges(memcg); 6154 mem_cgroup_reparent_charges(memcg);
6155 mem_cgroup_destroy_all_caches(memcg);
5046} 6156}
5047 6157
5048static void mem_cgroup_css_free(struct cgroup *cont) 6158static void mem_cgroup_css_free(struct cgroup *cont)