author     Greg Thelen <gthelen@google.com>        2015-05-22 17:13:16 -0400
committer  Jens Axboe <axboe@fb.com>               2015-06-02 10:33:33 -0400
commit     c4843a7593a9df3ff5b1806084cefdfa81dd7c79 (patch)
tree       29adcfe091e8b453e499614db7f1aee6aebbef04 /mm/memcontrol.c
parent     11f81becca04bb7d2826a9b65bb8d27b0a1bb543 (diff)
memcg: add per cgroup dirty page accounting
When modifying PG_dirty on cached file pages, update the new
MEM_CGROUP_STAT_DIRTY counter. This is done in the same places where
global NR_FILE_DIRTY is managed. The new memcg stat is visible in the
per memcg memory.stat cgroupfs file. The most recent past attempt at
this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632
The new accounting supports future efforts to add per cgroup dirty
page throttling and writeback. It also helps an administrator break
down a container's memory usage and provides evidence to understand
memcg oom kills (the new dirty count is included in memcg oom kill
messages).
The ability to move page accounting between memcg
(memory.move_charge_at_immigrate) makes this accounting more
complicated than the global counter. The existing
mem_cgroup_{begin,end}_page_stat() lock is used to serialize move
accounting with stat updates.
Typical update operation:
	memcg = mem_cgroup_begin_page_stat(page)
	if (TestSetPageDirty()) {
		[...]
		mem_cgroup_update_page_stat(memcg)
	}
	mem_cgroup_end_page_stat(memcg)
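As a concrete illustration of that sequence, here is a simplified C sketch
of a file-page dirtying helper. The helper name example_set_page_dirty() is
invented for illustration; the mem_cgroup_*() calls and the
MEM_CGROUP_STAT_DIRTY index are the interfaces this patch adds or uses, the
counter is only bumped when the dirty bit actually transitions, and real
call sites also update the global NR_FILE_DIRTY and related counters at the
same spot:

	#include <linux/memcontrol.h>
	#include <linux/mm.h>

	/*
	 * Illustrative sketch only -- not the body of any one kernel
	 * function.  mem_cgroup_begin_page_stat() pins the page's memcg and
	 * holds off move accounting until mem_cgroup_end_page_stat().
	 */
	static int example_set_page_dirty(struct page *page)
	{
		struct mem_cgroup *memcg;
		int newly_dirty = 0;

		memcg = mem_cgroup_begin_page_stat(page);
		if (!TestSetPageDirty(page)) {
			/* bit went clear -> set: account one more dirty page */
			mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
			/* real call sites bump NR_FILE_DIRTY etc. here too */
			newly_dirty = 1;
		}
		mem_cgroup_end_page_stat(memcg);

		return newly_dirty;
	}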
Summary of mem_cgroup_end_page_stat() overhead:
- Without CONFIG_MEMCG it's a no-op
- With CONFIG_MEMCG and no inter memcg task movement, it's just
rcu_read_lock()
- With CONFIG_MEMCG and inter memcg task movement, it's
rcu_read_lock() + spin_lock_irqsave()
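These costs belong to the begin/end pair; on the end side they show up as
the matching unlocks. A simplified sketch of that unlock side, written as
it would sit inside mm/memcontrol.c (the move_lock_task and move_lock_flags
bookkeeping fields are assumed from the surrounding memcg code and are not
visible in the diff below):

	/*
	 * Sketch: the unlock side is just rcu_read_unlock() unless this task
	 * took move_lock in mem_cgroup_begin_page_stat().
	 */
	static void sketch_end_page_stat(struct mem_cgroup *memcg)
	{
		if (memcg && memcg->move_lock_task == current) {
			unsigned long flags = memcg->move_lock_flags;

			memcg->move_lock_task = NULL;
			memcg->move_lock_flags = 0;
			spin_unlock_irqrestore(&memcg->move_lock, flags);
		}
		rcu_read_unlock();
	}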
A memcg parameter is added to several routines because their callers
now grab mem_cgroup_begin_page_stat(), which returns the memcg later
needed by mem_cgroup_update_page_stat().
Because mem_cgroup_begin_page_stat() may disable interrupts, some
adjustments are needed:
- move __mark_inode_dirty() from __set_page_dirty() to its caller.
__mark_inode_dirty() locking does not want interrupts disabled.
- use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in
  __delete_from_page_cache(), replace_page_cache_page(),
  invalidate_complete_page2(), and __remove_mapping() (see the sketch
  after this list).
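Schematically, a caller that nests the two locks after these adjustments
looks like the following. This is a simplified sketch of the
delete-from-page-cache shape only; the function name is invented and the
actual removal and accounting work is elided rather than reproduced:

	/*
	 * mem_cgroup_begin_page_stat() may already have disabled interrupts,
	 * so the inner tree_lock must save and restore the IRQ state instead
	 * of using the unconditional spin_lock_irq()/spin_unlock_irq() pair.
	 */
	static void example_delete_from_cache(struct page *page,
					      struct address_space *mapping)
	{
		struct mem_cgroup *memcg;
		unsigned long flags;

		memcg = mem_cgroup_begin_page_stat(page);
		spin_lock_irqsave(&mapping->tree_lock, flags);
		/*
		 * ... remove the page from the radix tree and update the
		 * global NR_FILE_DIRTY and per-memcg dirty counters ...
		 */
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		mem_cgroup_end_page_stat(memcg);
	}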
text data bss dec hex filename
8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before
8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after
+192 text bytes
8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before
8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after
+773 text bytes
Performance tests were run on v4.0-rc1-36-g4f671fe2f952. Lower is better
for all metrics; they are all wall-clock times or cycle counts. The read
and write fault benchmarks measure only fault time; they do not include
I/O time.
* CONFIG_MEMCG not set:
baseline patched
kbuild 1m25.030000(+-0.088% 3 samples) 1m25.426667(+-0.120% 3 samples)
dd write 100 MiB 0.859211561 +-15.10% 0.874162885 +-15.03%
dd write 200 MiB 1.670653105 +-17.87% 1.669384764 +-11.99%
dd write 1000 MiB 8.434691190 +-14.15% 8.474733215 +-14.77%
read fault cycles 254.0(+-0.000% 10 samples) 253.0(+-0.000% 10 samples)
write fault cycles 2021.2(+-3.070% 10 samples) 1984.5(+-1.036% 10 samples)
* CONFIG_MEMCG=y root_memcg:
baseline patched
kbuild 1m25.716667(+-0.105% 3 samples) 1m25.686667(+-0.153% 3 samples)
dd write 100 MiB 0.855650830 +-14.90% 0.887557919 +-14.90%
dd write 200 MiB 1.688322953 +-12.72% 1.667682724 +-13.33%
dd write 1000 MiB 8.418601605 +-14.30% 8.673532299 +-15.00%
read fault cycles 266.0(+-0.000% 10 samples) 266.0(+-0.000% 10 samples)
write fault cycles 2051.7(+-1.349% 10 samples) 2049.6(+-1.686% 10 samples)
* CONFIG_MEMCG=y non-root_memcg:
baseline patched
kbuild 1m26.120000(+-0.273% 3 samples) 1m25.763333(+-0.127% 3 samples)
dd write 100 MiB 0.861723964 +-15.25% 0.818129350 +-14.82%
dd write 200 MiB 1.669887569 +-13.30% 1.698645885 +-13.27%
dd write 1000 MiB 8.383191730 +-14.65% 8.351742280 +-14.52%
read fault cycles 265.7(+-0.172% 10 samples) 267.0(+-0.000% 10 samples)
write fault cycles 2070.6(+-1.512% 10 samples) 2084.4(+-2.148% 10 samples)
As expected, anon page faults are not affected by this patch.
tj: Updated to apply on top of the recent cancel_dirty_page() changes.
Signed-off-by: Sha Zhengju <handai.szj@gmail.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14c2f2017e37..c23c1a3e8e16 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -90,6 +90,7 @@ static const char * const mem_cgroup_stat_names[] = {
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"dirty",
 	"writeback",
 	"swap",
 };
@@ -2011,6 +2012,7 @@ again:
 
 	return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2029,6 +2031,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -4746,6 +4749,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
+	bool anon;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4771,15 +4775,33 @@ static int mem_cgroup_move_account(struct page *page,
 	if (page->mem_cgroup != from)
 		goto out_unlock;
 
+	anon = PageAnon(page);
+
 	spin_lock_irqsave(&from->move_lock, flags);
 
-	if (!PageAnon(page) && page_mapped(page)) {
+	if (!anon && page_mapped(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 	}
 
+	/*
+	 * move_lock grabbed above and caller set from->moving_account, so
+	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+	 * So mapping should be stable for dirty pages.
+	 */
+	if (!anon && PageDirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+
+		if (mapping_cap_account_dirty(mapping)) {
+			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+		}
+	}
+
 	if (PageWriteback(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
 			       nr_pages);