author    Ingo Molnar <mingo@kernel.org>    2015-03-04 00:35:43 -0500
committer Ingo Molnar <mingo@kernel.org>    2015-03-04 00:35:43 -0500
commit    d2c032e3dc58137a7261a7824d3acce435db1d66 (patch)
tree      7eea1c7c6103eefe879f07472eec99b3c41eb792 /mm/memcontrol.c
parent    7e8e385aaf6ed5b64b5d9108081cfcdcdd021b78 (diff)
parent    13a7a6ac0a11197edcd0f756a035f472b42cdf8b (diff)
Merge tag 'v4.0-rc2' into x86/asm, to refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  1073
1 file changed, 580 insertions(+), 493 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f6893c2f01b..9fe07692eaad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
72#define MEM_CGROUP_RECLAIM_RETRIES 5 72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly; 73static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75/* Whether the swap controller is active */
75#ifdef CONFIG_MEMCG_SWAP 76#ifdef CONFIG_MEMCG_SWAP
76/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77int do_swap_account __read_mostly; 77int do_swap_account __read_mostly;
78
79/* for remember boot option*/
80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1;
82#else
83static int really_do_swap_account __initdata;
84#endif
85
86#else 78#else
87#define do_swap_account 0 79#define do_swap_account 0
88#endif 80#endif
89 81
90
91static const char * const mem_cgroup_stat_names[] = { 82static const char * const mem_cgroup_stat_names[] = {
92 "cache", 83 "cache",
93 "rss", 84 "rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
97 "swap", 88 "swap",
98}; 89};
99 90
100enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS,
106};
107
108static const char * const mem_cgroup_events_names[] = { 91static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 92 "pgpgin",
110 "pgpgout", 93 "pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
138 121
139struct mem_cgroup_stat_cpu { 122struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 123 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 124 unsigned long events[MEMCG_NR_EVENTS];
142 unsigned long nr_page_events; 125 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 126 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 127};
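For context: the enum deleted above moves to include/linux/memcontrol.h rather than going away, and in v4.0 it also grows the default-hierarchy events that later hunks start counting, which is why the per-cpu array above is now sized by MEMCG_NR_EVENTS. Roughly, as a sketch from the v4.0 header (not quoted from this patch):

/* include/linux/memcontrol.h (v4.0, approximate) */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
	/* default hierarchy events */
	MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS,
	MEMCG_HIGH,
	MEMCG_MAX,
	MEMCG_OOM,
	MEMCG_NR_EVENTS,
};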
@@ -284,6 +267,10 @@ struct mem_cgroup {
284 struct page_counter memsw; 267 struct page_counter memsw;
285 struct page_counter kmem; 268 struct page_counter kmem;
286 269
270 /* Normal memory consumption range */
271 unsigned long low;
272 unsigned long high;
273
287 unsigned long soft_limit; 274 unsigned long soft_limit;
288 275
289 /* vmpressure notifications */ 276 /* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
325 /* 312 /*
326 * set > 0 if pages under this cgroup are moving to other cgroup. 313 * set > 0 if pages under this cgroup are moving to other cgroup.
327 */ 314 */
328 atomic_t moving_account; 315 atomic_t moving_account;
329 /* taken only while moving_account > 0 */ 316 /* taken only while moving_account > 0 */
330 spinlock_t move_lock; 317 spinlock_t move_lock;
318 struct task_struct *move_lock_task;
319 unsigned long move_lock_flags;
331 /* 320 /*
332 * percpu counter. 321 * percpu counter.
333 */ 322 */
@@ -343,11 +332,10 @@ struct mem_cgroup {
343 struct cg_proto tcp_mem; 332 struct cg_proto tcp_mem;
344#endif 333#endif
345#if defined(CONFIG_MEMCG_KMEM) 334#if defined(CONFIG_MEMCG_KMEM)
346 /* analogous to slab_common's slab_caches list, but per-memcg; 335 /* Index in the kmem_cache->memcg_params.memcg_caches array */
347 * protected by memcg_slab_mutex */
348 struct list_head memcg_slab_caches;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 336 int kmemcg_id;
337 bool kmem_acct_activated;
338 bool kmem_acct_active;
351#endif 339#endif
352 340
353 int last_scanned_node; 341 int last_scanned_node;
@@ -366,29 +354,26 @@ struct mem_cgroup {
366}; 354};
367 355
368#ifdef CONFIG_MEMCG_KMEM 356#ifdef CONFIG_MEMCG_KMEM
369static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 357bool memcg_kmem_is_active(struct mem_cgroup *memcg)
370{ 358{
371 return memcg->kmemcg_id >= 0; 359 return memcg->kmem_acct_active;
372} 360}
373#endif 361#endif
374 362
375/* Stuffs for move charges at task migration. */ 363/* Stuffs for move charges at task migration. */
376/* 364/*
377 * Types of charges to be moved. "move_charge_at_immitgrate" and 365 * Types of charges to be moved.
378 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
379 */ 366 */
380enum move_type { 367#define MOVE_ANON 0x1U
381 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 368#define MOVE_FILE 0x2U
382 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 369#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
383 NR_MOVE_TYPE,
384};
385 370
386/* "mc" and its members are protected by cgroup_mutex */ 371/* "mc" and its members are protected by cgroup_mutex */
387static struct move_charge_struct { 372static struct move_charge_struct {
388 spinlock_t lock; /* for from, to */ 373 spinlock_t lock; /* for from, to */
389 struct mem_cgroup *from; 374 struct mem_cgroup *from;
390 struct mem_cgroup *to; 375 struct mem_cgroup *to;
391 unsigned long immigrate_flags; 376 unsigned long flags;
392 unsigned long precharge; 377 unsigned long precharge;
393 unsigned long moved_charge; 378 unsigned long moved_charge;
394 unsigned long moved_swap; 379 unsigned long moved_swap;
@@ -399,16 +384,6 @@ static struct move_charge_struct {
399 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 384 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
400}; 385};
401 386
402static bool move_anon(void)
403{
404 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
405}
406
407static bool move_file(void)
408{
409 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
410}
411
412/* 387/*
413 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 388 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
414 * limit reclaim to prevent infinite loops, if they ever occur. 389 * limit reclaim to prevent infinite loops, if they ever occur.
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
544} 519}
545EXPORT_SYMBOL(tcp_proto_cgroup); 520EXPORT_SYMBOL(tcp_proto_cgroup);
546 521
547static void disarm_sock_keys(struct mem_cgroup *memcg)
548{
549 if (!memcg_proto_activated(&memcg->tcp_mem))
550 return;
551 static_key_slow_dec(&memcg_socket_limit_enabled);
552}
553#else
554static void disarm_sock_keys(struct mem_cgroup *memcg)
555{
556}
557#endif 522#endif
558 523
559#ifdef CONFIG_MEMCG_KMEM 524#ifdef CONFIG_MEMCG_KMEM
560/* 525/*
561 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 526 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
562 * The main reason for not using cgroup id for this: 527 * The main reason for not using cgroup id for this:
563 * this works better in sparse environments, where we have a lot of memcgs, 528 * this works better in sparse environments, where we have a lot of memcgs,
564 * but only a few kmem-limited. Or also, if we have, for instance, 200 529 * but only a few kmem-limited. Or also, if we have, for instance, 200
565 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 530 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
566 * 200 entry array for that. 531 * 200 entry array for that.
567 * 532 *
568 * The current size of the caches array is stored in 533 * The current size of the caches array is stored in memcg_nr_cache_ids. It
569 * memcg_limited_groups_array_size. It will double each time we have to 534 * will double each time we have to increase it.
570 * increase it.
571 */ 535 */
572static DEFINE_IDA(kmem_limited_groups); 536static DEFINE_IDA(memcg_cache_ida);
573int memcg_limited_groups_array_size; 537int memcg_nr_cache_ids;
538
539/* Protects memcg_nr_cache_ids */
540static DECLARE_RWSEM(memcg_cache_ids_sem);
541
542void memcg_get_cache_ids(void)
543{
544 down_read(&memcg_cache_ids_sem);
545}
546
547void memcg_put_cache_ids(void)
548{
549 up_read(&memcg_cache_ids_sem);
550}
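The new memcg_get_cache_ids()/memcg_put_cache_ids() pair takes memcg_cache_ids_sem for reading, so memcg_nr_cache_ids cannot grow underneath a reader walking the per-memcg cache arrays. A minimal sketch of that reader-side usage (the loop body is illustrative, not taken from this patch):

/* Illustrative reader: pin memcg_nr_cache_ids while scanning per-memcg caches. */
static void scan_memcg_caches(struct kmem_cache *root_cache)
{
	int i;

	memcg_get_cache_ids();		/* down_read(&memcg_cache_ids_sem) */
	for (i = 0; i < memcg_nr_cache_ids; i++) {
		struct kmem_cache *c = cache_from_memcg_idx(root_cache, i);

		if (c)
			;		/* ... inspect the per-memcg cache ... */
	}
	memcg_put_cache_ids();		/* up_read(&memcg_cache_ids_sem) */
}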
574 551
575/* 552/*
576 * MIN_SIZE is different than 1, because we would like to avoid going through 553 * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size;
596struct static_key memcg_kmem_enabled_key; 573struct static_key memcg_kmem_enabled_key;
597EXPORT_SYMBOL(memcg_kmem_enabled_key); 574EXPORT_SYMBOL(memcg_kmem_enabled_key);
598 575
599static void memcg_free_cache_id(int id);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 memcg_free_cache_id(memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(page_counter_read(&memcg->kmem));
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */ 576#endif /* CONFIG_MEMCG_KMEM */
618 577
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
625static struct mem_cgroup_per_zone * 578static struct mem_cgroup_per_zone *
626mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 579mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
627{ 580{
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1368 return inactive * inactive_ratio < active; 1321 return inactive * inactive_ratio < active;
1369} 1322}
1370 1323
1324bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1325{
1326 struct mem_cgroup_per_zone *mz;
1327 struct mem_cgroup *memcg;
1328
1329 if (mem_cgroup_disabled())
1330 return true;
1331
1332 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1333 memcg = mz->memcg;
1334
1335 return !!(memcg->css.flags & CSS_ONLINE);
1336}
1337
1371#define mem_cgroup_from_counter(counter, member) \ 1338#define mem_cgroup_from_counter(counter, member) \
1372 container_of(counter, struct mem_cgroup, member) 1339 container_of(counter, struct mem_cgroup, member)
1373 1340
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1560 * quickly exit and free its memory. 1527 * quickly exit and free its memory.
1561 */ 1528 */
1562 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1529 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1563 set_thread_flag(TIF_MEMDIE); 1530 mark_tsk_oom_victim(current);
1564 return; 1531 return;
1565 } 1532 }
1566 1533
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
1934 if (!memcg) 1901 if (!memcg)
1935 return false; 1902 return false;
1936 1903
1937 if (!handle) 1904 if (!handle || oom_killer_disabled)
1938 goto cleanup; 1905 goto cleanup;
1939 1906
1940 owait.memcg = memcg; 1907 owait.memcg = memcg;
@@ -1980,34 +1947,33 @@ cleanup:
1980/** 1947/**
1981 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1948 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
1982 * @page: page that is going to change accounted state 1949 * @page: page that is going to change accounted state
1983 * @locked: &memcg->move_lock slowpath was taken
1984 * @flags: IRQ-state flags for &memcg->move_lock
1985 * 1950 *
1986 * This function must mark the beginning of an accounted page state 1951 * This function must mark the beginning of an accounted page state
1987 * change to prevent double accounting when the page is concurrently 1952 * change to prevent double accounting when the page is concurrently
1988 * being moved to another memcg: 1953 * being moved to another memcg:
1989 * 1954 *
1990 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1955 * memcg = mem_cgroup_begin_page_stat(page);
1991 * if (TestClearPageState(page)) 1956 * if (TestClearPageState(page))
1992 * mem_cgroup_update_page_stat(memcg, state, -1); 1957 * mem_cgroup_update_page_stat(memcg, state, -1);
1993 * mem_cgroup_end_page_stat(memcg, locked, flags); 1958 * mem_cgroup_end_page_stat(memcg);
1994 *
1995 * The RCU lock is held throughout the transaction. The fast path can
1996 * get away without acquiring the memcg->move_lock (@locked is false)
1997 * because page moving starts with an RCU grace period.
1998 *
1999 * The RCU lock also protects the memcg from being freed when the page
2000 * state that is going to change is the only thing preventing the page
2001 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
2002 * which allows migration to go ahead and uncharge the page before the
2003 * account transaction might be complete.
2004 */ 1959 */
2005struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 1960struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
2006 bool *locked,
2007 unsigned long *flags)
2008{ 1961{
2009 struct mem_cgroup *memcg; 1962 struct mem_cgroup *memcg;
1963 unsigned long flags;
2010 1964
1965 /*
1966 * The RCU lock is held throughout the transaction. The fast
1967 * path can get away without acquiring the memcg->move_lock
1968 * because page moving starts with an RCU grace period.
1969 *
1970 * The RCU lock also protects the memcg from being freed when
1971 * the page state that is going to change is the only thing
1972 * preventing the page from being uncharged.
1973 * E.g. end-writeback clearing PageWriteback(), which allows
1974 * migration to go ahead and uncharge the page before the
1975 * account transaction might be complete.
1976 */
2011 rcu_read_lock(); 1977 rcu_read_lock();
2012 1978
2013 if (mem_cgroup_disabled()) 1979 if (mem_cgroup_disabled())
@@ -2017,16 +1983,22 @@ again:
2017 if (unlikely(!memcg)) 1983 if (unlikely(!memcg))
2018 return NULL; 1984 return NULL;
2019 1985
2020 *locked = false;
2021 if (atomic_read(&memcg->moving_account) <= 0) 1986 if (atomic_read(&memcg->moving_account) <= 0)
2022 return memcg; 1987 return memcg;
2023 1988
2024 spin_lock_irqsave(&memcg->move_lock, *flags); 1989 spin_lock_irqsave(&memcg->move_lock, flags);
2025 if (memcg != page->mem_cgroup) { 1990 if (memcg != page->mem_cgroup) {
2026 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1991 spin_unlock_irqrestore(&memcg->move_lock, flags);
2027 goto again; 1992 goto again;
2028 } 1993 }
2029 *locked = true; 1994
1995 /*
1996 * When charge migration first begins, we can have locked and
1997 * unlocked page stat updates happening concurrently. Track
1998 * the task who has the lock for mem_cgroup_end_page_stat().
1999 */
2000 memcg->move_lock_task = current;
2001 memcg->move_lock_flags = flags;
2030 2002
2031 return memcg; 2003 return memcg;
2032} 2004}
@@ -2034,14 +2006,17 @@ again:
2034/** 2006/**
2035 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2007 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2036 * @memcg: the memcg that was accounted against 2008 * @memcg: the memcg that was accounted against
2037 * @locked: value received from mem_cgroup_begin_page_stat()
2038 * @flags: value received from mem_cgroup_begin_page_stat()
2039 */ 2009 */
2040void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, 2010void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2041 unsigned long *flags)
2042{ 2011{
2043 if (memcg && *locked) 2012 if (memcg && memcg->move_lock_task == current) {
2044 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2013 unsigned long flags = memcg->move_lock_flags;
2014
2015 memcg->move_lock_task = NULL;
2016 memcg->move_lock_flags = 0;
2017
2018 spin_unlock_irqrestore(&memcg->move_lock, flags);
2019 }
2045 2020
2046 rcu_read_unlock(); 2021 rcu_read_unlock();
2047} 2022}
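Spelling out the new calling convention documented in the comment above: callers no longer pass a locked/flags pair, since the lock state now lives in move_lock_task/move_lock_flags inside the memcg. A hedged sketch of a caller (the function name, page flag and stat index below are placeholders, not from this patch):

/* Illustrative page-state accounting transaction under the new API. */
static void account_clear_page_state(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_begin_page_stat(page);	/* takes RCU, maybe move_lock */
	if (TestClearPageReferenced(page))		/* placeholder page-state bit */
		mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_RSS, -1); /* placeholder index */
	mem_cgroup_end_page_stat(memcg);	/* drops move_lock if this task took it */
}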
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy)
2134 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2109 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2135} 2110}
2136 2111
2137static void __init memcg_stock_init(void)
2138{
2139 int cpu;
2140
2141 for_each_possible_cpu(cpu) {
2142 struct memcg_stock_pcp *stock =
2143 &per_cpu(memcg_stock, cpu);
2144 INIT_WORK(&stock->work, drain_local_stock);
2145 }
2146}
2147
2148/* 2112/*
2149 * Cache charges(val) to local per_cpu area. 2113 * Cache charges(val) to local per_cpu area.
2150 * This will be consumed by consume_stock() function, later. 2114 * This will be consumed by consume_stock() function, later.
@@ -2294,6 +2258,8 @@ retry:
2294 if (!(gfp_mask & __GFP_WAIT)) 2258 if (!(gfp_mask & __GFP_WAIT))
2295 goto nomem; 2259 goto nomem;
2296 2260
2261 mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
2262
2297 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2263 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2298 gfp_mask, may_swap); 2264 gfp_mask, may_swap);
2299 2265
@@ -2335,6 +2301,8 @@ retry:
2335 if (fatal_signal_pending(current)) 2301 if (fatal_signal_pending(current))
2336 goto bypass; 2302 goto bypass;
2337 2303
2304 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2305
2338 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2306 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2339nomem: 2307nomem:
2340 if (!(gfp_mask & __GFP_NOFAIL)) 2308 if (!(gfp_mask & __GFP_NOFAIL))
@@ -2346,6 +2314,16 @@ done_restock:
2346 css_get_many(&memcg->css, batch); 2314 css_get_many(&memcg->css, batch);
2347 if (batch > nr_pages) 2315 if (batch > nr_pages)
2348 refill_stock(memcg, batch - nr_pages); 2316 refill_stock(memcg, batch - nr_pages);
2317 /*
2318 * If the hierarchy is above the normal consumption range,
2319 * make the charging task trim their excess contribution.
2320 */
2321 do {
2322 if (page_counter_read(&memcg->memory) <= memcg->high)
2323 continue;
2324 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
2325 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2326 } while ((memcg = parent_mem_cgroup(memcg)));
2349done: 2327done:
2350 return ret; 2328 return ret;
2351} 2329}
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2476} 2454}
2477 2455
2478#ifdef CONFIG_MEMCG_KMEM 2456#ifdef CONFIG_MEMCG_KMEM
2479/* 2457int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2480 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2458 unsigned long nr_pages)
2481 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2482 */
2483static DEFINE_MUTEX(memcg_slab_mutex);
2484
2485/*
2486 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2487 * in the memcg_cache_params struct.
2488 */
2489static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2490{
2491 struct kmem_cache *cachep;
2492
2493 VM_BUG_ON(p->is_root_cache);
2494 cachep = p->root_cache;
2495 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2496}
2497
2498static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2499 unsigned long nr_pages)
2500{ 2459{
2501 struct page_counter *counter; 2460 struct page_counter *counter;
2502 int ret = 0; 2461 int ret = 0;
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2533 return ret; 2492 return ret;
2534} 2493}
2535 2494
2536static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2495void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2537 unsigned long nr_pages)
2538{ 2496{
2539 page_counter_uncharge(&memcg->memory, nr_pages); 2497 page_counter_uncharge(&memcg->memory, nr_pages);
2540 if (do_swap_account) 2498 if (do_swap_account)
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
2560 int id, size; 2518 int id, size;
2561 int err; 2519 int err;
2562 2520
2563 id = ida_simple_get(&kmem_limited_groups, 2521 id = ida_simple_get(&memcg_cache_ida,
2564 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2522 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2565 if (id < 0) 2523 if (id < 0)
2566 return id; 2524 return id;
2567 2525
2568 if (id < memcg_limited_groups_array_size) 2526 if (id < memcg_nr_cache_ids)
2569 return id; 2527 return id;
2570 2528
2571 /* 2529 /*
2572 * There's no space for the new id in memcg_caches arrays, 2530 * There's no space for the new id in memcg_caches arrays,
2573 * so we have to grow them. 2531 * so we have to grow them.
2574 */ 2532 */
2533 down_write(&memcg_cache_ids_sem);
2575 2534
2576 size = 2 * (id + 1); 2535 size = 2 * (id + 1);
2577 if (size < MEMCG_CACHES_MIN_SIZE) 2536 if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void)
2579 else if (size > MEMCG_CACHES_MAX_SIZE) 2538 else if (size > MEMCG_CACHES_MAX_SIZE)
2580 size = MEMCG_CACHES_MAX_SIZE; 2539 size = MEMCG_CACHES_MAX_SIZE;
2581 2540
2582 mutex_lock(&memcg_slab_mutex);
2583 err = memcg_update_all_caches(size); 2541 err = memcg_update_all_caches(size);
2584 mutex_unlock(&memcg_slab_mutex); 2542 if (!err)
2543 err = memcg_update_all_list_lrus(size);
2544 if (!err)
2545 memcg_nr_cache_ids = size;
2546
2547 up_write(&memcg_cache_ids_sem);
2585 2548
2586 if (err) { 2549 if (err) {
2587 ida_simple_remove(&kmem_limited_groups, id); 2550 ida_simple_remove(&memcg_cache_ida, id);
2588 return err; 2551 return err;
2589 } 2552 }
2590 return id; 2553 return id;
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void)
2592 2555
2593static void memcg_free_cache_id(int id) 2556static void memcg_free_cache_id(int id)
2594{ 2557{
2595 ida_simple_remove(&kmem_limited_groups, id); 2558 ida_simple_remove(&memcg_cache_ida, id);
2596} 2559}
2597 2560
2598/* 2561struct memcg_kmem_cache_create_work {
2599 * We should update the current array size iff all caches updates succeed. This
2600 * can only be done from the slab side. The slab mutex needs to be held when
2601 * calling this.
2602 */
2603void memcg_update_array_size(int num)
2604{
2605 memcg_limited_groups_array_size = num;
2606}
2607
2608static void memcg_register_cache(struct mem_cgroup *memcg,
2609 struct kmem_cache *root_cache)
2610{
2611 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
2612 memcg_slab_mutex */
2613 struct kmem_cache *cachep;
2614 int id;
2615
2616 lockdep_assert_held(&memcg_slab_mutex);
2617
2618 id = memcg_cache_id(memcg);
2619
2620 /*
2621 * Since per-memcg caches are created asynchronously on first
2622 * allocation (see memcg_kmem_get_cache()), several threads can try to
2623 * create the same cache, but only one of them may succeed.
2624 */
2625 if (cache_from_memcg_idx(root_cache, id))
2626 return;
2627
2628 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
2629 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2630 /*
2631 * If we could not create a memcg cache, do not complain, because
2632 * that's not critical at all as we can always proceed with the root
2633 * cache.
2634 */
2635 if (!cachep)
2636 return;
2637
2638 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2639
2640 /*
2641 * Since readers won't lock (see cache_from_memcg_idx()), we need a
2642 * barrier here to ensure nobody will see the kmem_cache partially
2643 * initialized.
2644 */
2645 smp_wmb();
2646
2647 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
2648 root_cache->memcg_params->memcg_caches[id] = cachep;
2649}
2650
2651static void memcg_unregister_cache(struct kmem_cache *cachep)
2652{
2653 struct kmem_cache *root_cache;
2654 struct mem_cgroup *memcg;
2655 int id;
2656
2657 lockdep_assert_held(&memcg_slab_mutex);
2658
2659 BUG_ON(is_root_cache(cachep));
2660
2661 root_cache = cachep->memcg_params->root_cache;
2662 memcg = cachep->memcg_params->memcg;
2663 id = memcg_cache_id(memcg);
2664
2665 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
2666 root_cache->memcg_params->memcg_caches[id] = NULL;
2667
2668 list_del(&cachep->memcg_params->list);
2669
2670 kmem_cache_destroy(cachep);
2671}
2672
2673int __memcg_cleanup_cache_params(struct kmem_cache *s)
2674{
2675 struct kmem_cache *c;
2676 int i, failed = 0;
2677
2678 mutex_lock(&memcg_slab_mutex);
2679 for_each_memcg_cache_index(i) {
2680 c = cache_from_memcg_idx(s, i);
2681 if (!c)
2682 continue;
2683
2684 memcg_unregister_cache(c);
2685
2686 if (cache_from_memcg_idx(s, i))
2687 failed++;
2688 }
2689 mutex_unlock(&memcg_slab_mutex);
2690 return failed;
2691}
2692
2693static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2694{
2695 struct kmem_cache *cachep;
2696 struct memcg_cache_params *params, *tmp;
2697
2698 if (!memcg_kmem_is_active(memcg))
2699 return;
2700
2701 mutex_lock(&memcg_slab_mutex);
2702 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
2703 cachep = memcg_params_to_cache(params);
2704 memcg_unregister_cache(cachep);
2705 }
2706 mutex_unlock(&memcg_slab_mutex);
2707}
2708
2709struct memcg_register_cache_work {
2710 struct mem_cgroup *memcg; 2562 struct mem_cgroup *memcg;
2711 struct kmem_cache *cachep; 2563 struct kmem_cache *cachep;
2712 struct work_struct work; 2564 struct work_struct work;
2713}; 2565};
2714 2566
2715static void memcg_register_cache_func(struct work_struct *w) 2567static void memcg_kmem_cache_create_func(struct work_struct *w)
2716{ 2568{
2717 struct memcg_register_cache_work *cw = 2569 struct memcg_kmem_cache_create_work *cw =
2718 container_of(w, struct memcg_register_cache_work, work); 2570 container_of(w, struct memcg_kmem_cache_create_work, work);
2719 struct mem_cgroup *memcg = cw->memcg; 2571 struct mem_cgroup *memcg = cw->memcg;
2720 struct kmem_cache *cachep = cw->cachep; 2572 struct kmem_cache *cachep = cw->cachep;
2721 2573
2722 mutex_lock(&memcg_slab_mutex); 2574 memcg_create_kmem_cache(memcg, cachep);
2723 memcg_register_cache(memcg, cachep);
2724 mutex_unlock(&memcg_slab_mutex);
2725 2575
2726 css_put(&memcg->css); 2576 css_put(&memcg->css);
2727 kfree(cw); 2577 kfree(cw);
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w)
2730/* 2580/*
2731 * Enqueue the creation of a per-memcg kmem_cache. 2581 * Enqueue the creation of a per-memcg kmem_cache.
2732 */ 2582 */
2733static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 2583static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2734 struct kmem_cache *cachep) 2584 struct kmem_cache *cachep)
2735{ 2585{
2736 struct memcg_register_cache_work *cw; 2586 struct memcg_kmem_cache_create_work *cw;
2737 2587
2738 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2588 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2739 if (!cw) 2589 if (!cw)
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
2743 2593
2744 cw->memcg = memcg; 2594 cw->memcg = memcg;
2745 cw->cachep = cachep; 2595 cw->cachep = cachep;
2596 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2746 2597
2747 INIT_WORK(&cw->work, memcg_register_cache_func);
2748 schedule_work(&cw->work); 2598 schedule_work(&cw->work);
2749} 2599}
2750 2600
2751static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 2601static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2752 struct kmem_cache *cachep) 2602 struct kmem_cache *cachep)
2753{ 2603{
2754 /* 2604 /*
2755 * We need to stop accounting when we kmalloc, because if the 2605 * We need to stop accounting when we kmalloc, because if the
2756 * corresponding kmalloc cache is not yet created, the first allocation 2606 * corresponding kmalloc cache is not yet created, the first allocation
2757 * in __memcg_schedule_register_cache will recurse. 2607 * in __memcg_schedule_kmem_cache_create will recurse.
2758 * 2608 *
2759 * However, it is better to enclose the whole function. Depending on 2609 * However, it is better to enclose the whole function. Depending on
2760 * the debugging options enabled, INIT_WORK(), for instance, can 2610 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
2763 * the safest choice is to do it like this, wrapping the whole function. 2613 * the safest choice is to do it like this, wrapping the whole function.
2764 */ 2614 */
2765 current->memcg_kmem_skip_account = 1; 2615 current->memcg_kmem_skip_account = 1;
2766 __memcg_schedule_register_cache(memcg, cachep); 2616 __memcg_schedule_kmem_cache_create(memcg, cachep);
2767 current->memcg_kmem_skip_account = 0; 2617 current->memcg_kmem_skip_account = 0;
2768} 2618}
2769 2619
2770int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
2771{
2772 unsigned int nr_pages = 1 << order;
2773
2774 return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
2775}
2776
2777void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
2778{
2779 unsigned int nr_pages = 1 << order;
2780
2781 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2782}
2783
2784/* 2620/*
2785 * Return the kmem_cache we're supposed to use for a slab allocation. 2621 * Return the kmem_cache we're supposed to use for a slab allocation.
2786 * We try to use the current memcg's version of the cache. 2622 * We try to use the current memcg's version of the cache.
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2798{ 2634{
2799 struct mem_cgroup *memcg; 2635 struct mem_cgroup *memcg;
2800 struct kmem_cache *memcg_cachep; 2636 struct kmem_cache *memcg_cachep;
2637 int kmemcg_id;
2801 2638
2802 VM_BUG_ON(!cachep->memcg_params); 2639 VM_BUG_ON(!is_root_cache(cachep));
2803 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
2804 2640
2805 if (current->memcg_kmem_skip_account) 2641 if (current->memcg_kmem_skip_account)
2806 return cachep; 2642 return cachep;
2807 2643
2808 memcg = get_mem_cgroup_from_mm(current->mm); 2644 memcg = get_mem_cgroup_from_mm(current->mm);
2809 if (!memcg_kmem_is_active(memcg)) 2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0)
2810 goto out; 2647 goto out;
2811 2648
2812 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2649 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2813 if (likely(memcg_cachep)) 2650 if (likely(memcg_cachep))
2814 return memcg_cachep; 2651 return memcg_cachep;
2815 2652
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2825 * could happen with the slab_mutex held. So it's better to 2662 * could happen with the slab_mutex held. So it's better to
2826 * defer everything. 2663 * defer everything.
2827 */ 2664 */
2828 memcg_schedule_register_cache(memcg, cachep); 2665 memcg_schedule_kmem_cache_create(memcg, cachep);
2829out: 2666out:
2830 css_put(&memcg->css); 2667 css_put(&memcg->css);
2831 return cachep; 2668 return cachep;
@@ -2834,7 +2671,7 @@ out:
2834void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2671void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2835{ 2672{
2836 if (!is_root_cache(cachep)) 2673 if (!is_root_cache(cachep))
2837 css_put(&cachep->memcg_params->memcg->css); 2674 css_put(&cachep->memcg_params.memcg->css);
2838} 2675}
2839 2676
2840/* 2677/*
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2899 memcg_uncharge_kmem(memcg, 1 << order); 2736 memcg_uncharge_kmem(memcg, 1 << order);
2900 page->mem_cgroup = NULL; 2737 page->mem_cgroup = NULL;
2901} 2738}
2739
2740struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2741{
2742 struct mem_cgroup *memcg = NULL;
2743 struct kmem_cache *cachep;
2744 struct page *page;
2745
2746 page = virt_to_head_page(ptr);
2747 if (PageSlab(page)) {
2748 cachep = page->slab_cache;
2749 if (!is_root_cache(cachep))
2750 memcg = cachep->memcg_params.memcg;
2751 } else
2752 /* page allocated by alloc_kmem_pages */
2753 memcg = page->mem_cgroup;
2754
2755 return memcg;
2756}
2902#endif /* CONFIG_MEMCG_KMEM */ 2757#endif /* CONFIG_MEMCG_KMEM */
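__mem_cgroup_from_kmem() resolves an arbitrary kmem-accounted pointer back to its owning memcg, whether the object came from a per-memcg slab cache or directly from alloc_kmem_pages(). Its consumer appears to be the per-memcg list_lru code; the helper below is a hypothetical illustration of that kind of lookup, not kernel source:

/*
 * Hypothetical illustration only: resolve a kmem-accounted object to the
 * id of its owning memcg (0 for root or unaccounted objects).
 */
static unsigned short object_memcg_id(void *ptr)
{
	struct mem_cgroup *memcg = __mem_cgroup_from_kmem(ptr);

	return memcg ? mem_cgroup_id(memcg) : 0;
}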
2903 2758
2904#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2759#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3433,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3433 int err = 0; 3288 int err = 0;
3434 int memcg_id; 3289 int memcg_id;
3435 3290
3436 if (memcg_kmem_is_active(memcg)) 3291 BUG_ON(memcg->kmemcg_id >= 0);
3437 return 0; 3292 BUG_ON(memcg->kmem_acct_activated);
3293 BUG_ON(memcg->kmem_acct_active);
3438 3294
3439 /* 3295 /*
3440 * For simplicity, we won't allow this to be disabled. It also can't 3296 * For simplicity, we won't allow this to be disabled. It also can't
@@ -3477,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3477 * patched. 3333 * patched.
3478 */ 3334 */
3479 memcg->kmemcg_id = memcg_id; 3335 memcg->kmemcg_id = memcg_id;
3336 memcg->kmem_acct_activated = true;
3337 memcg->kmem_acct_active = true;
3480out: 3338out:
3481 return err; 3339 return err;
3482} 3340}
@@ -3533,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3533 int ret; 3391 int ret;
3534 3392
3535 buf = strstrip(buf); 3393 buf = strstrip(buf);
3536 ret = page_counter_memparse(buf, &nr_pages); 3394 ret = page_counter_memparse(buf, "-1", &nr_pages);
3537 if (ret) 3395 if (ret)
3538 return ret; 3396 return ret;
3539 3397
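The new second argument to page_counter_memparse() names the token that means "no limit": the legacy files keep accepting "-1", while the cgroup2 files added later in this patch pass "max". A small sketch of the convention, assuming the v4.0 signature:

unsigned long nr_pages;
int err;

err = page_counter_memparse("-1", "-1", &nr_pages);	/* nr_pages = PAGE_COUNTER_MAX */
err = page_counter_memparse("max", "max", &nr_pages);	/* nr_pages = PAGE_COUNTER_MAX */
err = page_counter_memparse("64M", "max", &nr_pages);	/* 64M converted to pages */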
@@ -3609,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3609{ 3467{
3610 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3468 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3611 3469
3612 if (val >= (1 << NR_MOVE_TYPE)) 3470 if (val & ~MOVE_MASK)
3613 return -EINVAL; 3471 return -EINVAL;
3614 3472
3615 /* 3473 /*
@@ -3687,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3687 struct mem_cgroup *mi; 3545 struct mem_cgroup *mi;
3688 unsigned int i; 3546 unsigned int i;
3689 3547
3548 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3549 MEM_CGROUP_STAT_NSTATS);
3550 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3551 MEM_CGROUP_EVENTS_NSTATS);
3690 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3552 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3691 3553
3692 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3554 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3901,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3901 unsigned long usage; 3763 unsigned long usage;
3902 int i, size, ret; 3764 int i, size, ret;
3903 3765
3904 ret = page_counter_memparse(args, &threshold); 3766 ret = page_counter_memparse(args, "-1", &threshold);
3905 if (ret) 3767 if (ret)
3906 return ret; 3768 return ret;
3907 3769
@@ -4152,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4152 return mem_cgroup_sockets_init(memcg, ss); 4014 return mem_cgroup_sockets_init(memcg, ss);
4153} 4015}
4154 4016
4017static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4018{
4019 struct cgroup_subsys_state *css;
4020 struct mem_cgroup *parent, *child;
4021 int kmemcg_id;
4022
4023 if (!memcg->kmem_acct_active)
4024 return;
4025
4026 /*
4027 * Clear the 'active' flag before clearing memcg_caches arrays entries.
4028 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
4029 * guarantees no cache will be created for this cgroup after we are
4030 * done (see memcg_create_kmem_cache()).
4031 */
4032 memcg->kmem_acct_active = false;
4033
4034 memcg_deactivate_kmem_caches(memcg);
4035
4036 kmemcg_id = memcg->kmemcg_id;
4037 BUG_ON(kmemcg_id < 0);
4038
4039 parent = parent_mem_cgroup(memcg);
4040 if (!parent)
4041 parent = root_mem_cgroup;
4042
4043 /*
4044 * Change kmemcg_id of this cgroup and all its descendants to the
4045 * parent's id, and then move all entries from this cgroup's list_lrus
4046 * to ones of the parent. After we have finished, all list_lrus
4047 * corresponding to this cgroup are guaranteed to remain empty. The
4048 * ordering is imposed by list_lru_node->lock taken by
4049 * memcg_drain_all_list_lrus().
4050 */
4051 css_for_each_descendant_pre(css, &memcg->css) {
4052 child = mem_cgroup_from_css(css);
4053 BUG_ON(child->kmemcg_id != kmemcg_id);
4054 child->kmemcg_id = parent->kmemcg_id;
4055 if (!memcg->use_hierarchy)
4056 break;
4057 }
4058 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
4059
4060 memcg_free_cache_id(kmemcg_id);
4061}
4062
4155static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4063static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4156{ 4064{
4157 memcg_unregister_all_caches(memcg); 4065 if (memcg->kmem_acct_activated) {
4066 memcg_destroy_kmem_caches(memcg);
4067 static_key_slow_dec(&memcg_kmem_enabled_key);
4068 WARN_ON(page_counter_read(&memcg->kmem));
4069 }
4158 mem_cgroup_sockets_destroy(memcg); 4070 mem_cgroup_sockets_destroy(memcg);
4159} 4071}
4160#else 4072#else
@@ -4163,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4163 return 0; 4075 return 0;
4164} 4076}
4165 4077
4078static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4079{
4080}
4081
4166static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4082static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4167{ 4083{
4168} 4084}
@@ -4391,7 +4307,7 @@ out_kfree:
4391 return ret; 4307 return ret;
4392} 4308}
4393 4309
4394static struct cftype mem_cgroup_files[] = { 4310static struct cftype mem_cgroup_legacy_files[] = {
4395 { 4311 {
4396 .name = "usage_in_bytes", 4312 .name = "usage_in_bytes",
4397 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4313 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4502,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = {
4502 { }, /* terminate */ 4418 { }, /* terminate */
4503}; 4419};
4504 4420
4505#ifdef CONFIG_MEMCG_SWAP
4506static struct cftype memsw_cgroup_files[] = {
4507 {
4508 .name = "memsw.usage_in_bytes",
4509 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4510 .read_u64 = mem_cgroup_read_u64,
4511 },
4512 {
4513 .name = "memsw.max_usage_in_bytes",
4514 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4515 .write = mem_cgroup_reset,
4516 .read_u64 = mem_cgroup_read_u64,
4517 },
4518 {
4519 .name = "memsw.limit_in_bytes",
4520 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4521 .write = mem_cgroup_write,
4522 .read_u64 = mem_cgroup_read_u64,
4523 },
4524 {
4525 .name = "memsw.failcnt",
4526 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4527 .write = mem_cgroup_reset,
4528 .read_u64 = mem_cgroup_read_u64,
4529 },
4530 { }, /* terminate */
4531};
4532#endif
4533static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4421static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4534{ 4422{
4535 struct mem_cgroup_per_node *pn; 4423 struct mem_cgroup_per_node *pn;
@@ -4609,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4609 free_mem_cgroup_per_zone_info(memcg, node); 4497 free_mem_cgroup_per_zone_info(memcg, node);
4610 4498
4611 free_percpu(memcg->stat); 4499 free_percpu(memcg->stat);
4612
4613 disarm_static_keys(memcg);
4614 kfree(memcg); 4500 kfree(memcg);
4615} 4501}
4616 4502
@@ -4625,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4625} 4511}
4626EXPORT_SYMBOL(parent_mem_cgroup); 4512EXPORT_SYMBOL(parent_mem_cgroup);
4627 4513
4628static void __init mem_cgroup_soft_limit_tree_init(void)
4629{
4630 struct mem_cgroup_tree_per_node *rtpn;
4631 struct mem_cgroup_tree_per_zone *rtpz;
4632 int tmp, node, zone;
4633
4634 for_each_node(node) {
4635 tmp = node;
4636 if (!node_state(node, N_NORMAL_MEMORY))
4637 tmp = -1;
4638 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4639 BUG_ON(!rtpn);
4640
4641 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4642
4643 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4644 rtpz = &rtpn->rb_tree_per_zone[zone];
4645 rtpz->rb_root = RB_ROOT;
4646 spin_lock_init(&rtpz->lock);
4647 }
4648 }
4649}
4650
4651static struct cgroup_subsys_state * __ref 4514static struct cgroup_subsys_state * __ref
4652mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4515mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4653{ 4516{
@@ -4667,6 +4530,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4667 if (parent_css == NULL) { 4530 if (parent_css == NULL) {
4668 root_mem_cgroup = memcg; 4531 root_mem_cgroup = memcg;
4669 page_counter_init(&memcg->memory, NULL); 4532 page_counter_init(&memcg->memory, NULL);
4533 memcg->high = PAGE_COUNTER_MAX;
4670 memcg->soft_limit = PAGE_COUNTER_MAX; 4534 memcg->soft_limit = PAGE_COUNTER_MAX;
4671 page_counter_init(&memcg->memsw, NULL); 4535 page_counter_init(&memcg->memsw, NULL);
4672 page_counter_init(&memcg->kmem, NULL); 4536 page_counter_init(&memcg->kmem, NULL);
@@ -4682,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4682 spin_lock_init(&memcg->event_list_lock); 4546 spin_lock_init(&memcg->event_list_lock);
4683#ifdef CONFIG_MEMCG_KMEM 4547#ifdef CONFIG_MEMCG_KMEM
4684 memcg->kmemcg_id = -1; 4548 memcg->kmemcg_id = -1;
4685 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4686#endif 4549#endif
4687 4550
4688 return &memcg->css; 4551 return &memcg->css;
@@ -4713,6 +4576,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4713 4576
4714 if (parent->use_hierarchy) { 4577 if (parent->use_hierarchy) {
4715 page_counter_init(&memcg->memory, &parent->memory); 4578 page_counter_init(&memcg->memory, &parent->memory);
4579 memcg->high = PAGE_COUNTER_MAX;
4716 memcg->soft_limit = PAGE_COUNTER_MAX; 4580 memcg->soft_limit = PAGE_COUNTER_MAX;
4717 page_counter_init(&memcg->memsw, &parent->memsw); 4581 page_counter_init(&memcg->memsw, &parent->memsw);
4718 page_counter_init(&memcg->kmem, &parent->kmem); 4582 page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4723,6 +4587,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4723 */ 4587 */
4724 } else { 4588 } else {
4725 page_counter_init(&memcg->memory, NULL); 4589 page_counter_init(&memcg->memory, NULL);
4590 memcg->high = PAGE_COUNTER_MAX;
4726 memcg->soft_limit = PAGE_COUNTER_MAX; 4591 memcg->soft_limit = PAGE_COUNTER_MAX;
4727 page_counter_init(&memcg->memsw, NULL); 4592 page_counter_init(&memcg->memsw, NULL);
4728 page_counter_init(&memcg->kmem, NULL); 4593 page_counter_init(&memcg->kmem, NULL);
@@ -4768,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4768 spin_unlock(&memcg->event_list_lock); 4633 spin_unlock(&memcg->event_list_lock);
4769 4634
4770 vmpressure_cleanup(&memcg->vmpressure); 4635 vmpressure_cleanup(&memcg->vmpressure);
4636
4637 memcg_deactivate_kmem(memcg);
4771} 4638}
4772 4639
4773static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4640static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4798,6 +4665,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4798 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4665 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
4799 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4666 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
4800 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4667 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
4668 memcg->low = 0;
4669 memcg->high = PAGE_COUNTER_MAX;
4801 memcg->soft_limit = PAGE_COUNTER_MAX; 4670 memcg->soft_limit = PAGE_COUNTER_MAX;
4802} 4671}
4803 4672
@@ -4874,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4874 if (!page || !page_mapped(page)) 4743 if (!page || !page_mapped(page))
4875 return NULL; 4744 return NULL;
4876 if (PageAnon(page)) { 4745 if (PageAnon(page)) {
4877 /* we don't move shared anon */ 4746 if (!(mc.flags & MOVE_ANON))
4878 if (!move_anon())
4879 return NULL; 4747 return NULL;
4880 } else if (!move_file()) 4748 } else {
4881 /* we ignore mapcount for file pages */ 4749 if (!(mc.flags & MOVE_FILE))
4882 return NULL; 4750 return NULL;
4751 }
4883 if (!get_page_unless_zero(page)) 4752 if (!get_page_unless_zero(page))
4884 return NULL; 4753 return NULL;
4885 4754
@@ -4893,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4893 struct page *page = NULL; 4762 struct page *page = NULL;
4894 swp_entry_t ent = pte_to_swp_entry(ptent); 4763 swp_entry_t ent = pte_to_swp_entry(ptent);
4895 4764
4896 if (!move_anon() || non_swap_entry(ent)) 4765 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4897 return NULL; 4766 return NULL;
4898 /* 4767 /*
4899 * Because lookup_swap_cache() updates some statistics counter, 4768 * Because lookup_swap_cache() updates some statistics counter,
@@ -4922,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4922 4791
4923 if (!vma->vm_file) /* anonymous vma */ 4792 if (!vma->vm_file) /* anonymous vma */
4924 return NULL; 4793 return NULL;
4925 if (!move_file()) 4794 if (!(mc.flags & MOVE_FILE))
4926 return NULL; 4795 return NULL;
4927 4796
4928 mapping = vma->vm_file->f_mapping; 4797 mapping = vma->vm_file->f_mapping;
4929 if (pte_none(ptent)) 4798 pgoff = linear_page_index(vma, addr);
4930 pgoff = linear_page_index(vma, addr);
4931 else /* pte_file(ptent) is true */
4932 pgoff = pte_to_pgoff(ptent);
4933 4799
4934 /* page is moved even if it's not RSS of this task(page-faulted). */ 4800 /* page is moved even if it's not RSS of this task(page-faulted). */
4935#ifdef CONFIG_SWAP 4801#ifdef CONFIG_SWAP
@@ -4961,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4961 page = mc_handle_present_pte(vma, addr, ptent); 4827 page = mc_handle_present_pte(vma, addr, ptent);
4962 else if (is_swap_pte(ptent)) 4828 else if (is_swap_pte(ptent))
4963 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4829 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4964 else if (pte_none(ptent) || pte_file(ptent)) 4830 else if (pte_none(ptent))
4965 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4831 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4966 4832
4967 if (!page && !ent.val) 4833 if (!page && !ent.val)
@@ -5004,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5004 4870
5005 page = pmd_page(pmd); 4871 page = pmd_page(pmd);
5006 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4872 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5007 if (!move_anon()) 4873 if (!(mc.flags & MOVE_ANON))
5008 return ret; 4874 return ret;
5009 if (page->mem_cgroup == mc.from) { 4875 if (page->mem_cgroup == mc.from) {
5010 ret = MC_TARGET_PAGE; 4876 ret = MC_TARGET_PAGE;
@@ -5027,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5027 unsigned long addr, unsigned long end, 4893 unsigned long addr, unsigned long end,
5028 struct mm_walk *walk) 4894 struct mm_walk *walk)
5029{ 4895{
5030 struct vm_area_struct *vma = walk->private; 4896 struct vm_area_struct *vma = walk->vma;
5031 pte_t *pte; 4897 pte_t *pte;
5032 spinlock_t *ptl; 4898 spinlock_t *ptl;
5033 4899
@@ -5053,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5053static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4919static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5054{ 4920{
5055 unsigned long precharge; 4921 unsigned long precharge;
5056 struct vm_area_struct *vma;
5057 4922
4923 struct mm_walk mem_cgroup_count_precharge_walk = {
4924 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4925 .mm = mm,
4926 };
5058 down_read(&mm->mmap_sem); 4927 down_read(&mm->mmap_sem);
5059 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4928 walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
5060 struct mm_walk mem_cgroup_count_precharge_walk = {
5061 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5062 .mm = mm,
5063 .private = vma,
5064 };
5065 if (is_vm_hugetlb_page(vma))
5066 continue;
5067 walk_page_range(vma->vm_start, vma->vm_end,
5068 &mem_cgroup_count_precharge_walk);
5069 }
5070 up_read(&mm->mmap_sem); 4929 up_read(&mm->mmap_sem);
5071 4930
5072 precharge = mc.precharge; 4931 precharge = mc.precharge;
@@ -5146,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5146 struct task_struct *p = cgroup_taskset_first(tset); 5005 struct task_struct *p = cgroup_taskset_first(tset);
5147 int ret = 0; 5006 int ret = 0;
5148 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5007 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5149 unsigned long move_charge_at_immigrate; 5008 unsigned long move_flags;
5150 5009
5151 /* 5010 /*
5152 * We are now commited to this value whatever it is. Changes in this 5011 * We are now commited to this value whatever it is. Changes in this
5153 * tunable will only affect upcoming migrations, not the current one. 5012 * tunable will only affect upcoming migrations, not the current one.
5154 * So we need to save it, and keep it going. 5013 * So we need to save it, and keep it going.
5155 */ 5014 */
5156 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5015 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
5157 if (move_charge_at_immigrate) { 5016 if (move_flags) {
5158 struct mm_struct *mm; 5017 struct mm_struct *mm;
5159 struct mem_cgroup *from = mem_cgroup_from_task(p); 5018 struct mem_cgroup *from = mem_cgroup_from_task(p);
5160 5019
@@ -5174,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5174 spin_lock(&mc.lock); 5033 spin_lock(&mc.lock);
5175 mc.from = from; 5034 mc.from = from;
5176 mc.to = memcg; 5035 mc.to = memcg;
5177 mc.immigrate_flags = move_charge_at_immigrate; 5036 mc.flags = move_flags;
5178 spin_unlock(&mc.lock); 5037 spin_unlock(&mc.lock);
5179 /* We set mc.moving_task later */ 5038 /* We set mc.moving_task later */
5180 5039
@@ -5199,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5199 struct mm_walk *walk) 5058 struct mm_walk *walk)
5200{ 5059{
5201 int ret = 0; 5060 int ret = 0;
5202 struct vm_area_struct *vma = walk->private; 5061 struct vm_area_struct *vma = walk->vma;
5203 pte_t *pte; 5062 pte_t *pte;
5204 spinlock_t *ptl; 5063 spinlock_t *ptl;
5205 enum mc_target_type target_type; 5064 enum mc_target_type target_type;
@@ -5295,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */
5295 5154
5296static void mem_cgroup_move_charge(struct mm_struct *mm) 5155static void mem_cgroup_move_charge(struct mm_struct *mm)
5297{ 5156{
5298 struct vm_area_struct *vma; 5157 struct mm_walk mem_cgroup_move_charge_walk = {
5158 .pmd_entry = mem_cgroup_move_charge_pte_range,
5159 .mm = mm,
5160 };
5299 5161
5300 lru_add_drain_all(); 5162 lru_add_drain_all();
5301 /* 5163 /*
@@ -5318,24 +5180,11 @@ retry:
5318 cond_resched(); 5180 cond_resched();
5319 goto retry; 5181 goto retry;
5320 } 5182 }
5321 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5183 /*
5322 int ret; 5184 * When we have consumed all precharges and failed in doing
5323 struct mm_walk mem_cgroup_move_charge_walk = { 5185 * additional charge, the page walk just aborts.
5324 .pmd_entry = mem_cgroup_move_charge_pte_range, 5186 */
5325 .mm = mm, 5187 walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
5326 .private = vma,
5327 };
5328 if (is_vm_hugetlb_page(vma))
5329 continue;
5330 ret = walk_page_range(vma->vm_start, vma->vm_end,
5331 &mem_cgroup_move_charge_walk);
5332 if (ret)
5333 /*
5334 * means we have consumed all precharges and failed in
5335 * doing additional charge. Just abandon here.
5336 */
5337 break;
5338 }
5339 up_read(&mm->mmap_sem); 5188 up_read(&mm->mmap_sem);
5340 atomic_dec(&mc.from->moving_account); 5189 atomic_dec(&mc.from->moving_account);
5341} 5190}
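Both page walkers above now rely on the same convention: one walk_page_range(0, ~0UL, ...) call over the whole address space, with the current VMA delivered via walk->vma instead of walk->private. A self-contained sketch of that pattern (the callback is illustrative, not from this patch):

/* Illustrative: the mm_walk convention used by the two walkers above. */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;	/* provided by the core walker */

	/* ... operate on [addr, end) within vma ... */
	return 0;				/* non-zero aborts the walk */
}

static void example_walk(struct mm_struct *mm)
{
	struct mm_walk walk = {
		.pmd_entry	= example_pmd_entry,
		.mm		= mm,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(0, ~0UL, &walk);	/* visits every VMA in mm */
	up_read(&mm->mmap_sem);
}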
@@ -5386,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5386 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5235 mem_cgroup_from_css(root_css)->use_hierarchy = true;
5387} 5236}
5388 5237
5389struct cgroup_subsys memory_cgrp_subsys = { 5238static u64 memory_current_read(struct cgroup_subsys_state *css,
5390 .css_alloc = mem_cgroup_css_alloc, 5239 struct cftype *cft)
5391 .css_online = mem_cgroup_css_online, 5240{
5392 .css_offline = mem_cgroup_css_offline, 5241 return mem_cgroup_usage(mem_cgroup_from_css(css), false);
5393 .css_free = mem_cgroup_css_free, 5242}
5394 .css_reset = mem_cgroup_css_reset,
5395 .can_attach = mem_cgroup_can_attach,
5396 .cancel_attach = mem_cgroup_cancel_attach,
5397 .attach = mem_cgroup_move_task,
5398 .bind = mem_cgroup_bind,
5399 .legacy_cftypes = mem_cgroup_files,
5400 .early_init = 0,
5401};
5402 5243
5403#ifdef CONFIG_MEMCG_SWAP 5244static int memory_low_show(struct seq_file *m, void *v)
5404static int __init enable_swap_account(char *s)
5405{ 5245{
5406 if (!strcmp(s, "1")) 5246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5407 really_do_swap_account = 1; 5247 unsigned long low = ACCESS_ONCE(memcg->low);
5408 else if (!strcmp(s, "0")) 5248
5409		really_do_swap_account = 0;
5410	return 1;
5411}
5412__setup("swapaccount=", enable_swap_account);
5413
5414static void __init memsw_file_init(void)
5415{
5416	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5417					  memsw_cgroup_files));
5418}
5419
5420static void __init enable_swap_cgroup(void)
5421{
5422	if (!mem_cgroup_disabled() && really_do_swap_account) {
5423		do_swap_account = 1;
5424		memsw_file_init();
5425	}
5426}
5427
5428#else
5429static void __init enable_swap_cgroup(void)
5430{
5431}
5432#endif
5433
5434#ifdef CONFIG_MEMCG_SWAP
5435/**
5436 * mem_cgroup_swapout - transfer a memsw charge to swap
5437 * @page: page whose memsw charge to transfer
5438 * @entry: swap entry to move the charge to
5439 *
5440 * Transfer the memsw charge of @page to @entry.
5441 */
5442void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5443{
5444	struct mem_cgroup *memcg;
5445	unsigned short oldid;
5446
5447	VM_BUG_ON_PAGE(PageLRU(page), page);
5448	VM_BUG_ON_PAGE(page_count(page), page);
5449
5450	if (!do_swap_account)
5451		return;
5452
5453	memcg = page->mem_cgroup;
5454
5455	/* Readahead page, never charged */
5456	if (!memcg)
5457		return;
5458
5459	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5460	VM_BUG_ON_PAGE(oldid, page);
5461	mem_cgroup_swap_statistics(memcg, true);
5462
5463	page->mem_cgroup = NULL;
5464
5465	if (!mem_cgroup_is_root(memcg))
5466		page_counter_uncharge(&memcg->memory, 1);
5467
5468	/* XXX: caller holds IRQ-safe mapping->tree_lock */
5469	VM_BUG_ON(!irqs_disabled());
5470
5471	mem_cgroup_charge_statistics(memcg, page, -1);
5472	memcg_check_events(memcg, page);
5473}
5474
5475/**
5476 * mem_cgroup_uncharge_swap - uncharge a swap entry
5477 * @entry: swap entry to uncharge
5478 *
5479 * Drop the memsw charge associated with @entry.
5480 */
5481void mem_cgroup_uncharge_swap(swp_entry_t entry)
5482{
5483	struct mem_cgroup *memcg;
5484	unsigned short id;
5485
5486	if (!do_swap_account)
5487		return;
5488
5489	id = swap_cgroup_record(entry, 0);
5490	rcu_read_lock();
5491	memcg = mem_cgroup_lookup(id);
5492	if (memcg) {
5493		if (!mem_cgroup_is_root(memcg))
5494			page_counter_uncharge(&memcg->memsw, 1);
5495		mem_cgroup_swap_statistics(memcg, false);
5496		css_put(&memcg->css);
5497	}
5498	rcu_read_unlock();
5499}
5500#endif
5249	if (low == PAGE_COUNTER_MAX)
5250		seq_puts(m, "max\n");
5251	else
5252		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5253
5254	return 0;
5255}
5256
5257static ssize_t memory_low_write(struct kernfs_open_file *of,
5258				char *buf, size_t nbytes, loff_t off)
5259{
5260	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5261	unsigned long low;
5262	int err;
5263
5264	buf = strstrip(buf);
5265	err = page_counter_memparse(buf, "max", &low);
5266	if (err)
5267		return err;
5268
5269	memcg->low = low;
5270
5271	return nbytes;
5272}
5273
5274static int memory_high_show(struct seq_file *m, void *v)
5275{
5276	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5277	unsigned long high = ACCESS_ONCE(memcg->high);
5278
5279	if (high == PAGE_COUNTER_MAX)
5280		seq_puts(m, "max\n");
5281	else
5282		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5283
5284	return 0;
5285}
5286
5287static ssize_t memory_high_write(struct kernfs_open_file *of,
5288				char *buf, size_t nbytes, loff_t off)
5289{
5290	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5291	unsigned long high;
5292	int err;
5293
5294	buf = strstrip(buf);
5295	err = page_counter_memparse(buf, "max", &high);
5296	if (err)
5297		return err;
5298
5299	memcg->high = high;
5300
5301	return nbytes;
5302}
5303
5304static int memory_max_show(struct seq_file *m, void *v)
5305{
5306	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5307	unsigned long max = ACCESS_ONCE(memcg->memory.limit);
5308
5309	if (max == PAGE_COUNTER_MAX)
5310		seq_puts(m, "max\n");
5311	else
5312		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5313
5314	return 0;
5315}
5316
5317static ssize_t memory_max_write(struct kernfs_open_file *of,
5318				char *buf, size_t nbytes, loff_t off)
5319{
5320	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5321	unsigned long max;
5322	int err;
5323
5324	buf = strstrip(buf);
5325	err = page_counter_memparse(buf, "max", &max);
5326	if (err)
5327		return err;
5328
5329	err = mem_cgroup_resize_limit(memcg, max);
5330	if (err)
5331		return err;
5332
5333	return nbytes;
5334}
5335
5336static int memory_events_show(struct seq_file *m, void *v)
5337{
5338	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5339
5340	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5341	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5342	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5343	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5344
5345	return 0;
5346}
5347
5348static struct cftype memory_files[] = {
5349	{
5350		.name = "current",
5351		.read_u64 = memory_current_read,
5352	},
5353	{
5354		.name = "low",
5355		.flags = CFTYPE_NOT_ON_ROOT,
5356		.seq_show = memory_low_show,
5357		.write = memory_low_write,
5358	},
5359	{
5360		.name = "high",
5361		.flags = CFTYPE_NOT_ON_ROOT,
5362		.seq_show = memory_high_show,
5363		.write = memory_high_write,
5364	},
5365	{
5366		.name = "max",
5367		.flags = CFTYPE_NOT_ON_ROOT,
5368		.seq_show = memory_max_show,
5369		.write = memory_max_write,
5370	},
5371	{
5372		.name = "events",
5373		.flags = CFTYPE_NOT_ON_ROOT,
5374		.seq_show = memory_events_show,
5375	},
5376	{ }	/* terminate */
5377};
5378
5379struct cgroup_subsys memory_cgrp_subsys = {
5380	.css_alloc = mem_cgroup_css_alloc,
5381	.css_online = mem_cgroup_css_online,
5382	.css_offline = mem_cgroup_css_offline,
5383	.css_free = mem_cgroup_css_free,
5384	.css_reset = mem_cgroup_css_reset,
5385	.can_attach = mem_cgroup_can_attach,
5386	.cancel_attach = mem_cgroup_cancel_attach,
5387	.attach = mem_cgroup_move_task,
5388	.bind = mem_cgroup_bind,
5389	.dfl_cftypes = memory_files,
5390	.legacy_cftypes = mem_cgroup_legacy_files,
5391	.early_init = 0,
5392};
5393
5394/**
5395 * mem_cgroup_events - count memory events against a cgroup
5396 * @memcg: the memory cgroup
5397 * @idx: the event index
5398 * @nr: the number of events to account for
5399 */
5400void mem_cgroup_events(struct mem_cgroup *memcg,
5401		       enum mem_cgroup_events_index idx,
5402		       unsigned int nr)
5403{
5404	this_cpu_add(memcg->stat->events[idx], nr);
5405}
5406
5407/**
5408 * mem_cgroup_low - check if memory consumption is below the normal range
5409 * @root: the highest ancestor to consider
5410 * @memcg: the memory cgroup to check
5411 *
5412 * Returns %true if memory consumption of @memcg, and that of all
5413 * configurable ancestors up to @root, is below the normal range.
5414 */
5415bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5416{
5417	if (mem_cgroup_disabled())
5418		return false;
5419
5420	/*
5421	 * The toplevel group doesn't have a configurable range, so
5422	 * it's never low when looked at directly, and it is not
5423	 * considered an ancestor when assessing the hierarchy.
5424	 */
5425
5426	if (memcg == root_mem_cgroup)
5427		return false;
5428
5429	if (page_counter_read(&memcg->memory) >= memcg->low)
5430		return false;
5431
5432	while (memcg != root) {
5433		memcg = parent_mem_cgroup(memcg);
5434
5435		if (memcg == root_mem_cgroup)
5436			break;
5437
5438		if (page_counter_read(&memcg->memory) >= memcg->low)
5439			return false;
5440	}
5441	return true;
5442}
5501 5443
5502/** 5444/**
5503 * mem_cgroup_try_charge - try charging a page 5445 * mem_cgroup_try_charge - try charging a page
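The hunk above introduces the unified-hierarchy control files memory.current, memory.low, memory.high, memory.max and memory.events. The writable ones are parsed by page_counter_memparse(), which accepts either a byte count (with an optional K/M/G suffix via memparse()) or the literal string "max". The sketch below is a rough userspace illustration only; the cgroup2 mount point and the child group name are assumptions, not something this patch establishes.

/*
 * Sketch only: exercise the new cgroup2 memory interface files.
 * Assumes cgroup2 is mounted at /sys/fs/cgroup and that a child
 * group named "test" already exists; both are assumptions made
 * for this example, not something the patch provides.
 */
#include <stdio.h>

static int write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* protect up to 64M, start reclaim above 128M, hard-cap at 256M */
	write_file("/sys/fs/cgroup/test/memory.low", "64M");
	write_file("/sys/fs/cgroup/test/memory.high", "128M");
	write_file("/sys/fs/cgroup/test/memory.max", "256M");

	/* memory.events prints low/high/max/oom counters, one per line */
	f = fopen("/sys/fs/cgroup/test/memory.events", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}

Note that memory.low and memory.high writes simply record the new thresholds, while memory.max goes through mem_cgroup_resize_limit() and can fail, so a real tool should check that write for errors.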
@@ -5831,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5831 */ 5773 */
5832static int __init mem_cgroup_init(void) 5774static int __init mem_cgroup_init(void)
5833{ 5775{
5776 int cpu, node;
5777
5834	hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5778	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5835	enable_swap_cgroup();
5836	mem_cgroup_soft_limit_tree_init();
5837	memcg_stock_init();
5779
5780	for_each_possible_cpu(cpu)
5781		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5782 drain_local_stock);
5783
5784 for_each_node(node) {
5785 struct mem_cgroup_tree_per_node *rtpn;
5786 int zone;
5787
5788 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5789 node_online(node) ? node : NUMA_NO_NODE);
5790
5791 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5792 struct mem_cgroup_tree_per_zone *rtpz;
5793
5794 rtpz = &rtpn->rb_tree_per_zone[zone];
5795 rtpz->rb_root = RB_ROOT;
5796 spin_lock_init(&rtpz->lock);
5797 }
5798 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5799 }
5800
5838 return 0; 5801 return 0;
5839} 5802}
5840subsys_initcall(mem_cgroup_init); 5803subsys_initcall(mem_cgroup_init);
5804
5805#ifdef CONFIG_MEMCG_SWAP
5806/**
5807 * mem_cgroup_swapout - transfer a memsw charge to swap
5808 * @page: page whose memsw charge to transfer
5809 * @entry: swap entry to move the charge to
5810 *
5811 * Transfer the memsw charge of @page to @entry.
5812 */
5813void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5814{
5815 struct mem_cgroup *memcg;
5816 unsigned short oldid;
5817
5818 VM_BUG_ON_PAGE(PageLRU(page), page);
5819 VM_BUG_ON_PAGE(page_count(page), page);
5820
5821 if (!do_swap_account)
5822 return;
5823
5824 memcg = page->mem_cgroup;
5825
5826 /* Readahead page, never charged */
5827 if (!memcg)
5828 return;
5829
5830 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5831 VM_BUG_ON_PAGE(oldid, page);
5832 mem_cgroup_swap_statistics(memcg, true);
5833
5834 page->mem_cgroup = NULL;
5835
5836 if (!mem_cgroup_is_root(memcg))
5837 page_counter_uncharge(&memcg->memory, 1);
5838
5839 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5840 VM_BUG_ON(!irqs_disabled());
5841
5842 mem_cgroup_charge_statistics(memcg, page, -1);
5843 memcg_check_events(memcg, page);
5844}
5845
5846/**
5847 * mem_cgroup_uncharge_swap - uncharge a swap entry
5848 * @entry: swap entry to uncharge
5849 *
5850 * Drop the memsw charge associated with @entry.
5851 */
5852void mem_cgroup_uncharge_swap(swp_entry_t entry)
5853{
5854 struct mem_cgroup *memcg;
5855 unsigned short id;
5856
5857 if (!do_swap_account)
5858 return;
5859
5860 id = swap_cgroup_record(entry, 0);
5861 rcu_read_lock();
5862 memcg = mem_cgroup_lookup(id);
5863 if (memcg) {
5864 if (!mem_cgroup_is_root(memcg))
5865 page_counter_uncharge(&memcg->memsw, 1);
5866 mem_cgroup_swap_statistics(memcg, false);
5867 css_put(&memcg->css);
5868 }
5869 rcu_read_unlock();
5870}
5871
5872/* Remember the boot option */
5873#ifdef CONFIG_MEMCG_SWAP_ENABLED
5874static int really_do_swap_account __initdata = 1;
5875#else
5876static int really_do_swap_account __initdata;
5877#endif
5878
5879static int __init enable_swap_account(char *s)
5880{
5881 if (!strcmp(s, "1"))
5882 really_do_swap_account = 1;
5883 else if (!strcmp(s, "0"))
5884 really_do_swap_account = 0;
5885 return 1;
5886}
5887__setup("swapaccount=", enable_swap_account);
5888
5889static struct cftype memsw_cgroup_files[] = {
5890 {
5891 .name = "memsw.usage_in_bytes",
5892 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5893 .read_u64 = mem_cgroup_read_u64,
5894 },
5895 {
5896 .name = "memsw.max_usage_in_bytes",
5897 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5898 .write = mem_cgroup_reset,
5899 .read_u64 = mem_cgroup_read_u64,
5900 },
5901 {
5902 .name = "memsw.limit_in_bytes",
5903 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5904 .write = mem_cgroup_write,
5905 .read_u64 = mem_cgroup_read_u64,
5906 },
5907 {
5908 .name = "memsw.failcnt",
5909 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5910 .write = mem_cgroup_reset,
5911 .read_u64 = mem_cgroup_read_u64,
5912 },
5913 { }, /* terminate */
5914};
5915
5916static int __init mem_cgroup_swap_init(void)
5917{
5918 if (!mem_cgroup_disabled() && really_do_swap_account) {
5919 do_swap_account = 1;
5920 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5921 memsw_cgroup_files));
5922 }
5923 return 0;
5924}
5925subsys_initcall(mem_cgroup_swap_init);
5926
5927#endif /* CONFIG_MEMCG_SWAP */
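Since mem_cgroup_swap_init() registers the memsw.* files only when swap accounting ends up enabled, their presence under a legacy (v1) memory controller mount is a convenient runtime check. A minimal sketch, assuming the controller is mounted at /sys/fs/cgroup/memory; the mount point is an assumption, not something this patch establishes.

/*
 * Sketch only: detect whether memsw accounting is active by probing
 * for the legacy memory.memsw.usage_in_bytes file.  Adjust the path
 * to wherever the v1 memory controller is mounted.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/memory.memsw.usage_in_bytes";
	unsigned long long usage;
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("memsw files absent: swap accounting is disabled\n");
		return 0;
	}
	if (fscanf(f, "%llu", &usage) == 1)
		printf("memory+swap usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}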