| author | Johannes Weiner <hannes@cmpxchg.org> | 2018-01-31 19:16:45 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 20:18:36 -0500 |
| commit | a983b5ebee57209c99f68c8327072f25e0e6e3da (patch) | |
| tree | ae227de941d587feb9fc94e40165607b24579266 /mm/memcontrol.c | |
| parent | 284542656e22c43fdada8c8cc0ca9ede8453eed7 (diff) | |
mm: memcontrol: fix excessive complexity in memory.stat reporting
We've seen memory.stat reads in top-level cgroups take up to fourteen
seconds during a userspace bug that created tens of thousands of ghost
cgroups pinned by lingering page cache.
Even with a more reasonable number of cgroups, aggregating memory.stat
is unnecessarily heavy. The complexity is this:
nr_cgroups * nr_stat_items * nr_possible_cpus
where the stat items are ~70 at this point. With 128 cgroups and 128
CPUs - decent, not enormous setups - reading the top-level memory.stat
has to aggregate over a million per-cpu counters. This doesn't scale.
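(Roughly: 128 * 70 * 128 ≈ 1.1 million per-cpu counters walked for a single memory.stat read.)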
Instead of spreading the source of truth across all CPUs, use the
per-cpu counters merely to batch updates to shared atomic counters.
This is the same as the per-cpu stocks we use for charging memory to the
shared atomic page_counters, and also the way the global vmstat counters
are implemented.
Vmstat has elaborate spilling thresholds that depend on the number of
CPUs, amount of memory, and memory pressure - carefully balancing the
cost of counter updates with the amount of per-cpu error. That's
because the vmstat counters are system-wide, but also used for decisions
inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is
true for the memory controller.
Use the same static batch size we already use for page_counter updates
during charging. The per-cpu error in the stats will be 128k, which is
an acceptable ratio of cores to memory accounting granularity.
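For context, the batch is 32 pages (the old CHARGE_BATCH, renamed to MEMCG_CHARGE_BATCH and moved to the header by this patch), so with 4kB pages each CPU can hold at most 32 * 4kB = 128kB of unreported delta per counter before spilling it into the shared atomic. The write-side helpers that do the spilling (__mod_memcg_state() and friends) live in include/linux/memcontrol.h and are therefore not visible in the diff below, which is limited to mm/memcontrol.c. What follows is only a minimal sketch of that batching pattern, assuming the stat_cpu/stat field names used in the diff, not the verbatim header change:

```c
/*
 * Sketch of the per-cpu batched update, assuming the stat_cpu/stat layout
 * from the diff below; the real helper is in include/linux/memcontrol.h
 * and may differ in detail.
 */
static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);

	/* Spill into the shared atomic once the local delta exceeds the batch. */
	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
		atomic_long_add(x, &memcg->stat[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->stat_cpu->count[idx], x);
}
```

Readers then take a single atomic_long_read() per counter instead of a for_each_possible_cpu() walk, which is exactly the memcg_sum_events() change in the diff.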
[hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls]
Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org
Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 101 |
1 file changed, 51 insertions(+), 50 deletions(-)
```diff
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23841af1d756..51d398f1363c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 				      int event)
 {
-	unsigned long val = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->events[event], cpu);
-	return val;
+	return atomic_long_read(&memcg->events[event]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -606,7 +577,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 }
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat->nr_page_events);
-	next = __this_cpu_read(memcg->stat->targets[target]);
+	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->stat->targets[target], next);
+		__this_cpu_write(memcg->stat_cpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -1707,11 +1678,6 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH 32U
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
@@ -1739,7 +1705,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	unsigned long flags;
 	bool ret = false;
 
-	if (nr_pages > CHARGE_BATCH)
+	if (nr_pages > MEMCG_CHARGE_BATCH)
 		return ret;
 
 	local_irq_save(flags);
@@ -1808,7 +1774,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	}
 	stock->nr_pages += nr_pages;
 
-	if (stock->nr_pages > CHARGE_BATCH)
+	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
 		drain_stock(stock);
 
 	local_irq_restore(flags);
@@ -1858,9 +1824,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *memcg;
 
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
+
+	for_each_mem_cgroup(memcg) {
+		int i;
+
+		for (i = 0; i < MEMCG_NR_STAT; i++) {
+			int nid;
+			long x;
+
+			x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+			if (x)
+				atomic_long_add(x, &memcg->stat[i]);
+
+			if (i >= NR_VM_NODE_STAT_ITEMS)
+				continue;
+
+			for_each_node(nid) {
+				struct mem_cgroup_per_node *pn;
+
+				pn = mem_cgroup_nodeinfo(memcg, nid);
+				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+				if (x)
+					atomic_long_add(x, &pn->lruvec_stat[i]);
+			}
+		}
+
+		for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+			long x;
+
+			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+			if (x)
+				atomic_long_add(x, &memcg->events[i]);
+		}
+	}
+
 	return 0;
 }
 
@@ -1881,7 +1882,7 @@ static void high_work_func(struct work_struct *work)
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(work, struct mem_cgroup, high_work);
-	reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
 
 /*
@@ -1905,7 +1906,7 @@ void mem_cgroup_handle_over_high(void)
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
-	unsigned int batch = max(CHARGE_BATCH, nr_pages);
+	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
@@ -4161,8 +4162,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
-	pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
-	if (!pn->lruvec_stat) {
+	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+	if (!pn->lruvec_stat_cpu) {
 		kfree(pn);
 		return 1;
 	}
@@ -4180,7 +4181,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
 
-	free_percpu(pn->lruvec_stat);
+	free_percpu(pn->lruvec_stat_cpu);
 	kfree(pn);
 }
 
@@ -4190,7 +4191,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
-	free_percpu(memcg->stat);
+	free_percpu(memcg->stat_cpu);
 	kfree(memcg);
 }
 
@@ -4219,8 +4220,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;
 
-	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!memcg->stat)
+	memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg->stat_cpu)
 		goto fail;
 
 	for_each_node(node)
@@ -5638,7 +5639,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
 	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
 	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 
```
