Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 474
1 file changed, 262 insertions(+), 212 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba9138a4a1de..249671873aa9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page)
 	unsigned long ino = 0;
 
 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	if (PageHead(page) && PageSlab(page))
+		memcg = memcg_from_slab_page(page);
+	else
+		memcg = READ_ONCE(page->mem_cgroup);
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -1163,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	css_task_iter_start(&iter->css, 0, &it);
+	css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
 	while (!ret && (task = css_task_iter_next(&it)))
 		ret = fn(task, arg);
 	css_task_iter_end(&it);
@@ -1255,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
@@ -1356,27 +1334,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
-static const unsigned int memcg1_stats[] = {
-	MEMCG_CACHE,
-	MEMCG_RSS,
-	MEMCG_RSS_HUGE,
-	NR_SHMEM,
-	NR_FILE_MAPPED,
-	NR_FILE_DIRTY,
-	NR_WRITEBACK,
-	MEMCG_SWAP,
-};
-
-static const char *const memcg1_stat_names[] = {
-	"cache",
-	"rss",
-	"rss_huge",
-	"shmem",
-	"mapped_file",
-	"dirty",
-	"writeback",
-	"swap",
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+	struct seq_buf s;
+	int i;
+
+	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+	if (!s.buffer)
+		return NULL;
+
+	/*
+	 * Provide statistics on the state of the memory subsystem as
+	 * well as cumulative event counters that show past behavior.
+	 *
+	 * This list is ordered following a combination of these gradients:
+	 * 1) generic big picture -> specifics and details
+	 * 2) reflecting userspace activity -> reflecting kernel heuristics
+	 *
+	 * Current memory state:
+	 */
+
+	seq_buf_printf(&s, "anon %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_RSS) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "kernel_stack %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+		       1024);
+	seq_buf_printf(&s, "slab %llu\n",
+		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "sock %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+		       PAGE_SIZE);
+
+	seq_buf_printf(&s, "shmem %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SHMEM) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_mapped %llu\n",
+		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_dirty %llu\n",
+		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_writeback %llu\n",
+		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+		       PAGE_SIZE);
+
+	/*
+	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
+	 * arse because it requires migrating the work out of rmap to a place
+	 * where the page->mem_cgroup is set up and stable.
+	 */
+	seq_buf_printf(&s, "anon_thp %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+		       PAGE_SIZE);
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+			       PAGE_SIZE);
+
+	seq_buf_printf(&s, "slab_reclaimable %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+		       PAGE_SIZE);
+
+	/* Accumulated memory events */
+
+	seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+	seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+	seq_buf_printf(&s, "workingset_refault %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_REFAULT));
+	seq_buf_printf(&s, "workingset_activate %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+	seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+	seq_buf_printf(&s, "pgscan %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD) +
+		       memcg_events(memcg, PGSCAN_DIRECT));
+	seq_buf_printf(&s, "pgsteal %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD) +
+		       memcg_events(memcg, PGSTEAL_DIRECT));
+	seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+	seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+	seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+	seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+		       memcg_events(memcg, THP_FAULT_ALLOC));
+	seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+	/* The above should easily fit into one page */
+	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+	return s.buffer;
+}
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /**
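Aside, not part of the patch: memory_stat_format() above formats "name value" pairs into a single kmalloc'd page with the seq_buf API and checks for truncation only once, at the end. Below is a minimal userspace sketch of the same pattern, with snprintf standing in for seq_buf_printf(); the 4096-byte size, helper names, and sample numbers are assumptions made for illustration only.

#include <stdio.h>
#include <stdlib.h>

#define BUF_SZ 4096			/* one page, like the kernel's PAGE_SIZE buffer */

struct statbuf {
	char *buf;
	size_t len;			/* bytes written so far */
	int overflow;			/* set once a line no longer fits */
};

static void stat_printf(struct statbuf *s, const char *name, unsigned long long val)
{
	int n = snprintf(s->buf + s->len, BUF_SZ - s->len, "%s %llu\n", name, val);

	if (n < 0 || (size_t)n >= BUF_SZ - s->len)
		s->overflow = 1;	/* analogous to seq_buf_has_overflowed() */
	else
		s->len += n;
}

int main(void)
{
	struct statbuf s = { .buf = malloc(BUF_SZ), .len = 0, .overflow = 0 };

	if (!s.buf)
		return 1;
	/* Hypothetical byte counts, just to exercise the format. */
	stat_printf(&s, "anon", 1234 * 4096ULL);
	stat_printf(&s, "file", 5678 * 4096ULL);
	fputs(s.buf, stdout);
	if (s.overflow)
		fprintf(stderr, "buffer overflowed\n");
	free(s.buf);
	return 0;
}

The design point mirrored here is that every append is bounds-checked against the remaining space, so one overflow flag at the end is enough to tell whether the page was large enough.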
@@ -1411,39 +1476,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
  */
 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *iter;
-	unsigned int i;
+	char *buf;
 
 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
 		K((u64)page_counter_read(&memcg->memory)),
 		K((u64)memcg->memory.max), memcg->memory.failcnt);
-	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memsw)),
-		K((u64)memcg->memsw.max), memcg->memsw.failcnt);
-	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->kmem)),
-		K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		pr_info("Memory cgroup stats for ");
-		pr_cont_cgroup_path(iter->css.cgroup);
-		pr_cont(":");
-
-		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
-				continue;
-			pr_cont(" %s:%luKB", memcg1_stat_names[i],
-				K(memcg_page_state_local(iter,
-							 memcg1_stats[i])));
-		}
-
-		for (i = 0; i < NR_LRU_LISTS; i++)
-			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-				K(memcg_page_state_local(iter,
-							 NR_LRU_BASE + i)));
-
-		pr_cont("\n");
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->swap)),
+			K((u64)memcg->swap.max), memcg->swap.failcnt);
+	else {
+		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memsw)),
+			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->kmem)),
+			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
 	}
+
+	pr_info("Memory cgroup stats for ");
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont(":");
+	buf = memory_stat_format(memcg);
+	if (!buf)
+		return;
+	pr_info("%s", buf);
+	kfree(buf);
 }
 
 /*
@@ -2279,7 +2337,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long nr_reclaimed;
 	bool may_swap = true;
 	bool drained = false;
-	bool oomed = false;
 	enum oom_status oom_status;
 
 	if (mem_cgroup_is_root(memcg))
@@ -2366,7 +2423,7 @@ retry:
 	if (nr_retries--)
 		goto retry;
 
-	if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+	if (gfp_mask & __GFP_RETRY_MAYFAIL)
 		goto nomem;
 
 	if (gfp_mask & __GFP_NOFAIL)
@@ -2385,7 +2442,6 @@ retry:
 	switch (oom_status) {
 	case OOM_SUCCESS:
 		nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		oomed = true;
 		goto retry;
 	case OOM_FAILED:
 		goto force;
@@ -2588,12 +2644,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2628,6 +2685,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
@@ -2635,14 +2693,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
+
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2655,10 +2727,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the first step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
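Aside, not from the patch: the lookup above pins the per-memcg cache with percpu_ref_tryget(), which only succeeds while the reference count is still live, and otherwise falls back to scheduling cache creation. A rough userspace analogue of that tryget/put pattern using C11 atomics follows; the struct, function names, and cache_release() hook are illustrative assumptions, and a plain atomic counter stands in for the kernel's percpu_ref.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct cache_ref {
	atomic_long refcnt;		/* 0 means the object is on its way out */
};

/* Illustrative stand-in for the "last reference dropped" callback. */
static void cache_release(struct cache_ref *ref)
{
	free(ref);
}

/* Take a reference only if the object is still live, like percpu_ref_tryget(). */
static bool cache_tryget(struct cache_ref *ref)
{
	long old = atomic_load(&ref->refcnt);

	while (old > 0)
		if (atomic_compare_exchange_weak(&ref->refcnt, &old, old + 1))
			return true;
	return false;
}

static void cache_put(struct cache_ref *ref)
{
	if (atomic_fetch_sub(&ref->refcnt, 1) == 1)
		cache_release(ref);	/* last reference: reap the object */
}

int main(void)
{
	struct cache_ref *ref = malloc(sizeof(*ref));

	atomic_init(&ref->refcnt, 1);	/* initial reference held by the "owner" */
	if (cache_tryget(ref))		/* succeeds while the object is live */
		cache_put(ref);
	cache_put(ref);			/* drops the last reference and frees */
	printf("done\n");
	return 0;
}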
@@ -2669,7 +2751,7 @@ out:
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -2697,9 +2779,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 		cancel_charge(memcg, nr_pages);
 		return -ENOMEM;
 	}
-
-	page->mem_cgroup = memcg;
-
 	return 0;
 }
 
@@ -2722,12 +2801,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 	memcg = get_mem_cgroup_from_current();
 	if (!mem_cgroup_is_root(memcg)) {
 		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
-		if (!ret)
+		if (!ret) {
+			page->mem_cgroup = memcg;
 			__SetPageKmemcg(page);
+		}
 	}
 	css_put(&memcg->css);
 	return ret;
 }
+
+/**
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+				 unsigned int nr_pages)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_uncharge(&memcg->kmem, nr_pages);
+
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_uncharge(&memcg->memsw, nr_pages);
+}
 /**
  * __memcg_kmem_uncharge: uncharge a kmem page
  * @page: page to uncharge
@@ -2742,14 +2839,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 		return;
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		page_counter_uncharge(&memcg->kmem, nr_pages);
-
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (do_memsw_account())
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-
+	__memcg_kmem_uncharge_memcg(memcg, nr_pages);
 	page->mem_cgroup = NULL;
 
 	/* slab pages do not have PageKmemcg flag set */
@@ -3168,15 +3258,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 	 */
 	memcg->kmem_state = KMEM_ALLOCATED;
 
-	memcg_deactivate_kmem_caches(memcg);
-
-	kmemcg_id = memcg->kmemcg_id;
-	BUG_ON(kmemcg_id < 0);
-
 	parent = parent_mem_cgroup(memcg);
 	if (!parent)
 		parent = root_mem_cgroup;
 
+	memcg_deactivate_kmem_caches(memcg, parent);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
 	/*
 	 * Change kmemcg_id of this cgroup and all its descendants to the
 	 * parent's id, and then move all entries from this cgroup's list_lrus
@@ -3207,9 +3297,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 		memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
-		WARN_ON(page_counter_read(&memcg->kmem));
 	}
 }
 #else
@@ -3472,6 +3561,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
+static const unsigned int memcg1_stats[] = {
+	MEMCG_CACHE,
+	MEMCG_RSS,
+	MEMCG_RSS_HUGE,
+	NR_SHMEM,
+	NR_FILE_MAPPED,
+	NR_FILE_DIRTY,
+	NR_WRITEBACK,
+	MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+	"cache",
+	"rss",
+	"rss_huge",
+	"shmem",
+	"mapped_file",
+	"dirty",
+	"writeback",
+	"swap",
+};
+
 /* Universal VM events cgroup1 shows, original sort order */
 static const unsigned int memcg1_events[] = {
 	PGPGIN,
@@ -3530,12 +3641,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
-			   (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+			   PAGE_SIZE);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
-			   (u64)memcg_events(memcg, i));
+			   (u64)memcg_events(memcg, memcg1_events[i]));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
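Aside, not from the patch: the hunk above fixes an indexing bug where the loop counter i was passed to memcg_page_state()/memcg_events() instead of the table entries memcg1_stats[i]/memcg1_events[i]. A tiny standalone illustration of the difference; the item ids and the read_stat() stand-in are made up.

#include <stdio.h>

static const unsigned int stat_items[] = { 7, 12, 25 };	/* hypothetical item ids */

static unsigned long read_stat(unsigned int item)
{
	return item * 100UL;		/* stand-in for memcg_page_state() */
}

int main(void)
{
	for (unsigned int i = 0; i < sizeof(stat_items) / sizeof(stat_items[0]); i++) {
		printf("buggy:   %lu\n", read_stat(i));			/* wrong: uses the index */
		printf("correct: %lu\n", read_stat(stat_items[i]));	/* right: uses the item id */
	}
	return 0;
}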
@@ -4634,6 +4746,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
 	/* The following stuff does not apply to the root */
 	if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+		INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
 		root_mem_cgroup = memcg;
 		return &memcg->css;
 	}
@@ -4793,7 +4908,7 @@ enum mc_target_type {
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
-	struct page *page = _vm_normal_page(vma, addr, ptent, true);
+	struct page *page = vm_normal_page(vma, addr, ptent);
 
 	if (!page || !page_mapped(page))
 		return NULL;
@@ -4994,8 +5109,8 @@ out:
  * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
  * For now we such page is charge like a regular page would be as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5029,8 +5144,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page) ||
-			    is_device_public_page(page))
+			if (is_device_private_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
@@ -5101,8 +5215,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	if (ptl) {
 		/*
 		 * Note their can not be MC_TARGET_DEVICE for now as we do not
-		 * support transparent huge page with MEMORY_DEVICE_PUBLIC or
-		 * MEMORY_DEVICE_PRIVATE but this might change.
+		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+		 * this might change.
 		 */
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
@@ -5625,112 +5739,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+	seq_printf(m, "oom_kill %lu\n",
+		   atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	seq_printf(m, "low %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
-	seq_printf(m, "high %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
-	seq_printf(m, "max %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
-	seq_printf(m, "oom %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
-	seq_printf(m, "oom_kill %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
-
+	__memory_events_show(m, memcg->memory_events);
 	return 0;
 }
 
-static int memory_stat_show(struct seq_file *m, void *v)
+static int memory_events_local_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-	int i;
-
-	/*
-	 * Provide statistics on the state of the memory subsystem as
-	 * well as cumulative event counters that show past behavior.
-	 *
-	 * This list is ordered following a combination of these gradients:
-	 * 1) generic big picture -> specifics and details
-	 * 2) reflecting userspace activity -> reflecting kernel heuristics
-	 *
-	 * Current memory state:
-	 */
-
-	seq_printf(m, "anon %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
-	seq_printf(m, "file %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
-	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
-	seq_printf(m, "slab %llu\n",
-		   (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
-			 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
-		   PAGE_SIZE);
-	seq_printf(m, "sock %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
-
-	seq_printf(m, "shmem %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
-	seq_printf(m, "file_mapped %llu\n",
-		   (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
-	seq_printf(m, "file_dirty %llu\n",
-		   (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
-	seq_printf(m, "file_writeback %llu\n",
-		   (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
-
-	/*
-	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
-	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
-	 * arse because it requires migrating the work out of rmap to a place
-	 * where the page->mem_cgroup is set up and stable.
-	 */
-	seq_printf(m, "anon_thp %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
-
-	for (i = 0; i < NR_LRU_LISTS; i++)
-		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
-			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
-			   PAGE_SIZE);
-
-	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
-		   PAGE_SIZE);
-	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
-		   PAGE_SIZE);
-
-	/* Accumulated memory events */
-
-	seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
-	seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
-
-	seq_printf(m, "workingset_refault %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_REFAULT));
-	seq_printf(m, "workingset_activate %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_ACTIVATE));
-	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
-	seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
-	seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
-		   memcg_events(memcg, PGSCAN_DIRECT));
-	seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
-		   memcg_events(memcg, PGSTEAL_DIRECT));
-	seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
-	seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
-	seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
-	seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	seq_printf(m, "thp_fault_alloc %lu\n",
-		   memcg_events(memcg, THP_FAULT_ALLOC));
-	seq_printf(m, "thp_collapse_alloc %lu\n",
-		   memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+	__memory_events_show(m, memcg->memory_events_local);
+	return 0;
+}
+
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	char *buf;
+
+	buf = memory_stat_format(memcg);
+	if (!buf)
+		return -ENOMEM;
+	seq_puts(m, buf);
+	kfree(buf);
 	return 0;
 }
 
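Aside, not part of the diff: with this change the cgroup v2 memory.stat file is produced by memory_stat_format(), so every line is a plain "counter value" pair. Below is a small hedged reader for that format; the default path assumes the cgroup2 hierarchy is mounted at /sys/fs/cgroup, which can differ on a given system.

#include <stdio.h>

int main(int argc, char **argv)
{
	/* The path is an assumption; pass the real memory.stat path as argv[1] if needed. */
	const char *path = argc > 1 ? argv[1] : "/sys/fs/cgroup/memory.stat";
	char name[64];
	unsigned long long value;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line is "<counter> <value>\n", exactly as memory_stat_format() emits. */
	while (fscanf(f, "%63s %llu", name, &value) == 2)
		printf("%-28s %llu\n", name, value);
	fclose(f);
	return 0;
}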
@@ -5802,6 +5846,12 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_events_show,
 	},
 	{
+		.name = "events.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct mem_cgroup, events_local_file),
+		.seq_show = memory_events_local_show,
+	},
+	{
 		.name = "stat",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = memory_stat_show,