path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	474
1 file changed, 262 insertions, 212 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba9138a4a1de..249671873aa9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page)
 	unsigned long ino = 0;
 
 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	if (PageHead(page) && PageSlab(page))
+		memcg = memcg_from_slab_page(page);
+	else
+		memcg = READ_ONCE(page->mem_cgroup);
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -1163,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	css_task_iter_start(&iter->css, 0, &it);
+	css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
 	while (!ret && (task = css_task_iter_next(&it)))
 		ret = fn(task, arg);
 	css_task_iter_end(&it);
@@ -1255,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
@@ -1356,27 +1334,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
-static const unsigned int memcg1_stats[] = {
-	MEMCG_CACHE,
-	MEMCG_RSS,
-	MEMCG_RSS_HUGE,
-	NR_SHMEM,
-	NR_FILE_MAPPED,
-	NR_FILE_DIRTY,
-	NR_WRITEBACK,
-	MEMCG_SWAP,
-};
-
-static const char *const memcg1_stat_names[] = {
-	"cache",
-	"rss",
-	"rss_huge",
-	"shmem",
-	"mapped_file",
-	"dirty",
-	"writeback",
-	"swap",
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+	struct seq_buf s;
+	int i;
+
+	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+	if (!s.buffer)
+		return NULL;
+
+	/*
+	 * Provide statistics on the state of the memory subsystem as
+	 * well as cumulative event counters that show past behavior.
+	 *
+	 * This list is ordered following a combination of these gradients:
+	 * 1) generic big picture -> specifics and details
+	 * 2) reflecting userspace activity -> reflecting kernel heuristics
+	 *
+	 * Current memory state:
+	 */
+
+	seq_buf_printf(&s, "anon %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_RSS) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "kernel_stack %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+		       1024);
+	seq_buf_printf(&s, "slab %llu\n",
+		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "sock %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+		       PAGE_SIZE);
+
+	seq_buf_printf(&s, "shmem %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SHMEM) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_mapped %llu\n",
+		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_dirty %llu\n",
+		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_writeback %llu\n",
+		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+		       PAGE_SIZE);
+
+	/*
+	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
+	 * arse because it requires migrating the work out of rmap to a place
+	 * where the page->mem_cgroup is set up and stable.
+	 */
+	seq_buf_printf(&s, "anon_thp %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+		       PAGE_SIZE);
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+			       PAGE_SIZE);
+
+	seq_buf_printf(&s, "slab_reclaimable %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+		       PAGE_SIZE);
+
+	/* Accumulated memory events */
+
+	seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+	seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+	seq_buf_printf(&s, "workingset_refault %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_REFAULT));
+	seq_buf_printf(&s, "workingset_activate %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+	seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+	seq_buf_printf(&s, "pgscan %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD) +
+		       memcg_events(memcg, PGSCAN_DIRECT));
+	seq_buf_printf(&s, "pgsteal %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD) +
+		       memcg_events(memcg, PGSTEAL_DIRECT));
+	seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+	seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+	seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+	seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+		       memcg_events(memcg, THP_FAULT_ALLOC));
+	seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+	/* The above should easily fit into one page */
+	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+	return s.buffer;
+}
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /**
@@ -1411,39 +1476,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
  */
 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *iter;
-	unsigned int i;
+	char *buf;
 
 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
 		K((u64)page_counter_read(&memcg->memory)),
 		K((u64)memcg->memory.max), memcg->memory.failcnt);
-	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memsw)),
-		K((u64)memcg->memsw.max), memcg->memsw.failcnt);
-	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->kmem)),
-		K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		pr_info("Memory cgroup stats for ");
-		pr_cont_cgroup_path(iter->css.cgroup);
-		pr_cont(":");
-
-		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
-				continue;
-			pr_cont(" %s:%luKB", memcg1_stat_names[i],
-				K(memcg_page_state_local(iter,
-							 memcg1_stats[i])));
-		}
-
-		for (i = 0; i < NR_LRU_LISTS; i++)
-			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-				K(memcg_page_state_local(iter,
-							 NR_LRU_BASE + i)));
-
-		pr_cont("\n");
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->swap)),
+			K((u64)memcg->swap.max), memcg->swap.failcnt);
+	else {
+		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memsw)),
+			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->kmem)),
+			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
 	}
+
+	pr_info("Memory cgroup stats for ");
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont(":");
+	buf = memory_stat_format(memcg);
+	if (!buf)
+		return;
+	pr_info("%s", buf);
+	kfree(buf);
 }
 
 /*
@@ -2279,7 +2337,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long nr_reclaimed;
 	bool may_swap = true;
 	bool drained = false;
-	bool oomed = false;
 	enum oom_status oom_status;
 
 	if (mem_cgroup_is_root(memcg))
@@ -2366,7 +2423,7 @@ retry:
 	if (nr_retries--)
 		goto retry;
 
-	if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+	if (gfp_mask & __GFP_RETRY_MAYFAIL)
 		goto nomem;
 
 	if (gfp_mask & __GFP_NOFAIL)
@@ -2385,7 +2442,6 @@ retry:
 	switch (oom_status) {
 	case OOM_SUCCESS:
 		nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		oomed = true;
 		goto retry;
 	case OOM_FAILED:
 		goto force;
@@ -2588,12 +2644,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2628,6 +2685,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
@@ -2635,14 +2693,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
+
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2655,10 +2727,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the fist step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
@@ -2669,7 +2751,7 @@ out:
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -2697,9 +2779,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 		cancel_charge(memcg, nr_pages);
 		return -ENOMEM;
 	}
-
-	page->mem_cgroup = memcg;
-
 	return 0;
 }
 
@@ -2722,12 +2801,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 	memcg = get_mem_cgroup_from_current();
 	if (!mem_cgroup_is_root(memcg)) {
 		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
-		if (!ret)
+		if (!ret) {
+			page->mem_cgroup = memcg;
 			__SetPageKmemcg(page);
+		}
 	}
 	css_put(&memcg->css);
 	return ret;
 }
+
+/**
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+				 unsigned int nr_pages)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_uncharge(&memcg->kmem, nr_pages);
+
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_uncharge(&memcg->memsw, nr_pages);
+}
 /**
  * __memcg_kmem_uncharge: uncharge a kmem page
  * @page: page to uncharge
@@ -2742,14 +2839,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 		return;
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		page_counter_uncharge(&memcg->kmem, nr_pages);
-
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (do_memsw_account())
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-
+	__memcg_kmem_uncharge_memcg(memcg, nr_pages);
 	page->mem_cgroup = NULL;
 
 	/* slab pages do not have PageKmemcg flag set */
@@ -3168,15 +3258,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 	 */
 	memcg->kmem_state = KMEM_ALLOCATED;
 
-	memcg_deactivate_kmem_caches(memcg);
-
-	kmemcg_id = memcg->kmemcg_id;
-	BUG_ON(kmemcg_id < 0);
-
 	parent = parent_mem_cgroup(memcg);
 	if (!parent)
 		parent = root_mem_cgroup;
 
+	memcg_deactivate_kmem_caches(memcg, parent);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
 	/*
 	 * Change kmemcg_id of this cgroup and all its descendants to the
 	 * parent's id, and then move all entries from this cgroup's list_lrus
@@ -3207,9 +3297,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 	memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
-		WARN_ON(page_counter_read(&memcg->kmem));
 	}
 }
 #else
@@ -3472,6 +3561,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
+static const unsigned int memcg1_stats[] = {
+	MEMCG_CACHE,
+	MEMCG_RSS,
+	MEMCG_RSS_HUGE,
+	NR_SHMEM,
+	NR_FILE_MAPPED,
+	NR_FILE_DIRTY,
+	NR_WRITEBACK,
+	MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+	"cache",
+	"rss",
+	"rss_huge",
+	"shmem",
+	"mapped_file",
+	"dirty",
+	"writeback",
+	"swap",
+};
+
 /* Universal VM events cgroup1 shows, original sort order */
 static const unsigned int memcg1_events[] = {
 	PGPGIN,
@@ -3530,12 +3641,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
-			   (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+			   PAGE_SIZE);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
-			   (u64)memcg_events(memcg, i));
+			   (u64)memcg_events(memcg, memcg1_events[i]));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
@@ -4634,6 +4746,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
 	/* The following stuff does not apply to the root */
 	if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+		INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
 		root_mem_cgroup = memcg;
 		return &memcg->css;
 	}
@@ -4793,7 +4908,7 @@ enum mc_target_type {
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
-	struct page *page = _vm_normal_page(vma, addr, ptent, true);
+	struct page *page = vm_normal_page(vma, addr, ptent);
 
 	if (!page || !page_mapped(page))
 		return NULL;
@@ -4994,8 +5109,8 @@ out:
  * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
  * For now we such page is charge like a regular page would be as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5029,8 +5144,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	 */
 	if (page->mem_cgroup == mc.from) {
 		ret = MC_TARGET_PAGE;
-		if (is_device_private_page(page) ||
-		    is_device_public_page(page))
+		if (is_device_private_page(page))
 			ret = MC_TARGET_DEVICE;
 		if (target)
 			target->page = page;
@@ -5101,8 +5215,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	if (ptl) {
 		/*
 		 * Note their can not be MC_TARGET_DEVICE for now as we do not
-		 * support transparent huge page with MEMORY_DEVICE_PUBLIC or
-		 * MEMORY_DEVICE_PRIVATE but this might change.
+		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+		 * this might change.
 		 */
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
@@ -5625,112 +5739,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+	seq_printf(m, "oom_kill %lu\n",
+		   atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	seq_printf(m, "low %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
-	seq_printf(m, "high %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
-	seq_printf(m, "max %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
-	seq_printf(m, "oom %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
-	seq_printf(m, "oom_kill %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
-
+	__memory_events_show(m, memcg->memory_events);
 	return 0;
 }
 
-static int memory_stat_show(struct seq_file *m, void *v)
+static int memory_events_local_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-	int i;
-
-	/*
-	 * Provide statistics on the state of the memory subsystem as
-	 * well as cumulative event counters that show past behavior.
-	 *
-	 * This list is ordered following a combination of these gradients:
-	 * 1) generic big picture -> specifics and details
-	 * 2) reflecting userspace activity -> reflecting kernel heuristics
-	 *
-	 * Current memory state:
-	 */
-
-	seq_printf(m, "anon %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
-	seq_printf(m, "file %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
-	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
-	seq_printf(m, "slab %llu\n",
-		   (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
-			 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
-		   PAGE_SIZE);
-	seq_printf(m, "sock %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
-
-	seq_printf(m, "shmem %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
-	seq_printf(m, "file_mapped %llu\n",
-		   (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
-	seq_printf(m, "file_dirty %llu\n",
-		   (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
-	seq_printf(m, "file_writeback %llu\n",
-		   (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
 
-	/*
-	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
-	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
-	 * arse because it requires migrating the work out of rmap to a place
-	 * where the page->mem_cgroup is set up and stable.
-	 */
-	seq_printf(m, "anon_thp %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
-
-	for (i = 0; i < NR_LRU_LISTS; i++)
-		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
-			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
-			   PAGE_SIZE);
-
-	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
-		   PAGE_SIZE);
-	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
-		   PAGE_SIZE);
-
-	/* Accumulated memory events */
-
-	seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
-	seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
-
-	seq_printf(m, "workingset_refault %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_REFAULT));
-	seq_printf(m, "workingset_activate %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_ACTIVATE));
-	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
-	seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
-	seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
-		   memcg_events(memcg, PGSCAN_DIRECT));
-	seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
-		   memcg_events(memcg, PGSTEAL_DIRECT));
-	seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
-	seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
-	seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
-	seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+	__memory_events_show(m, memcg->memory_events_local);
+	return 0;
+}
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	seq_printf(m, "thp_fault_alloc %lu\n",
-		   memcg_events(memcg, THP_FAULT_ALLOC));
-	seq_printf(m, "thp_collapse_alloc %lu\n",
-		   memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	char *buf;
 
+	buf = memory_stat_format(memcg);
+	if (!buf)
+		return -ENOMEM;
+	seq_puts(m, buf);
+	kfree(buf);
 	return 0;
 }
 
@@ -5802,6 +5846,12 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_events_show,
 	},
 	{
+		.name = "events.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct mem_cgroup, events_local_file),
+		.seq_show = memory_events_local_show,
+	},
+	{
 		.name = "stat",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = memory_stat_show,