path: root/mm/swapfile.c
author    Johannes Weiner <hannes@cmpxchg.org>  2014-08-08 17:19:20 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-08-08 18:57:17 -0400
commit    00501b531c4723972aa11d6d4ebcf8d6552007c8 (patch)
tree      b3ad4850d58f137cf87b8424412d962fb251839f /mm/swapfile.c
parent    4449a51a7c281602d3a385044ab928322a122a02 (diff)
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg).

Before:

    15.36%     cat  [kernel.kallsyms]  [k] copy_user_generic_string
    13.31%     cat  [kernel.kallsyms]  [k] memset
    11.48%     cat  [kernel.kallsyms]  [k] do_mpage_readpage
     4.23%     cat  [kernel.kallsyms]  [k] get_page_from_freelist
     2.38%     cat  [kernel.kallsyms]  [k] put_page
     2.32%     cat  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     2.18% kswapd0  [kernel.kallsyms]  [k] __mem_cgroup_uncharge_common
     1.92% kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.86%     cat  [kernel.kallsyms]  [k] __radix_tree_lookup
     1.62%     cat  [kernel.kallsyms]  [k] __pagevec_lru_add_fn

After:

    15.67%     cat  [kernel.kallsyms]  [k] copy_user_generic_string
    13.48%     cat  [kernel.kallsyms]  [k] memset
    11.42%     cat  [kernel.kallsyms]  [k] do_mpage_readpage
     3.98%     cat  [kernel.kallsyms]  [k] get_page_from_freelist
     2.46%     cat  [kernel.kallsyms]  [k] put_page
     2.13% kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.88%     cat  [kernel.kallsyms]  [k] __radix_tree_lookup
     1.67%     cat  [kernel.kallsyms]  [k] __pagevec_lru_add_fn
     1.39% kswapd0  [kernel.kallsyms]  [k] free_pcppages_bulk
     1.30%     cat  [kernel.kallsyms]  [k] kfree

As you can see, the memcg footprint has shrunk quite a bit.

       text    data     bss     dec     hex filename
      37970    9892     400   48262    bc86 mm/memcontrol.o.old
      35239    9892     400   45531    b1db mm/memcontrol.o

This patch (of 4):

The memcg charge API charges pages before they are rmapped - i.e. have an actual "type" - and so every callsite needs its own set of charge and uncharge functions to know what type is being operated on. Worse, uncharge has to happen from a context that is still type-specific, rather than at the end of the page's lifetime with exclusive access, and so requires a lot of synchronization.

Rewrite the charge API to provide a generic set of try_charge(), commit_charge() and cancel_charge() transaction operations, much like what's currently done for swap-in:

  mem_cgroup_try_charge() attempts to reserve a charge, reclaiming pages from the memcg if necessary.

  mem_cgroup_commit_charge() commits the page to the charge once it has a valid page->mapping and PageAnon() reliably tells the type.

  mem_cgroup_cancel_charge() aborts the transaction.

This reduces the charge API and enables subsequent patches to drastically simplify uncharging.

As pages need to be committed after rmap is established but before they are added to the LRU, page_add_new_anon_rmap() must stop doing LRU additions again. Revive lru_cache_add_active_or_unevictable().

[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
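To make the transaction pattern concrete, here is a minimal sketch of how a caller that instantiates a fresh anonymous page would use the three operations, modelled on the unuse_pte() hunks shown below. The function and the example_install_pte() helper are purely illustrative, not part of this patch; only the mem_cgroup_*_charge(), page_add_new_anon_rmap() and lru_cache_add_active_or_unevictable() calls are the real API.

#include <linux/memcontrol.h>   /* mem_cgroup_{try,commit,cancel}_charge() */
#include <linux/mm.h>
#include <linux/rmap.h>         /* page_add_new_anon_rmap() */
#include <linux/swap.h>         /* lru_cache_add_active_or_unevictable() */

/* Illustrative stand-in for whatever maps the page into the page table. */
static bool example_install_pte(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr);

static int example_charge_transaction(struct page *page,
                                      struct vm_area_struct *vma,
                                      unsigned long addr)
{
        struct mem_cgroup *memcg;

        /* Reserve the charge up front; this may reclaim from the memcg. */
        if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg))
                return -ENOMEM;

        if (!example_install_pte(page, vma, addr)) {
                /* Bailed out before the page got a type: abort the transaction. */
                mem_cgroup_cancel_charge(page, memcg);
                return 0;
        }

        /* Rmap first, so page->mapping and PageAnon() are valid... */
        page_add_new_anon_rmap(page, vma, addr);
        /*
         * ...then commit; the last argument is false because the page is not
         * on the LRU yet (the swapcache case in the diff below passes true).
         */
        mem_cgroup_commit_charge(page, memcg, false);
        /* ...and only now put the page on the LRU. */
        lru_cache_add_active_or_unevictable(page, vma);
        return 0;
}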
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--  mm/swapfile.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4c524f7bd0bf..0883b4912ff7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1106,15 +1106,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	if (unlikely(!page))
 		return -ENOMEM;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
-					 GFP_KERNEL, &memcg)) {
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
 		ret = -ENOMEM;
 		goto out_nolock;
 	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
-		mem_cgroup_cancel_charge_swapin(memcg);
+		mem_cgroup_cancel_charge(page, memcg);
 		ret = 0;
 		goto out;
 	}
@@ -1124,11 +1123,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-	if (page == swapcache)
+	if (page == swapcache) {
 		page_add_anon_rmap(page, vma, addr);
-	else /* ksm created a completely new copy */
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr);
-		mem_cgroup_commit_charge_swapin(page, memcg);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not