author     Johannes Weiner <hannes@cmpxchg.org>          2014-08-08 17:19:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-08-08 18:57:17 -0400
commit     00501b531c4723972aa11d6d4ebcf8d6552007c8
tree       b3ad4850d58f137cf87b8424412d962fb251839f  /mm/memory.c
parent     4449a51a7c281602d3a385044ab928322a122a02
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code
and reduces charging and uncharging overhead.  The most expensive part
of charging and uncharging is the page_cgroup bit spinlock, which is
removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup
(i.e. executing in the root memcg).

Before:

    15.36%      cat  [kernel.kallsyms]  [k] copy_user_generic_string
    13.31%      cat  [kernel.kallsyms]  [k] memset
    11.48%      cat  [kernel.kallsyms]  [k] do_mpage_readpage
     4.23%      cat  [kernel.kallsyms]  [k] get_page_from_freelist
     2.38%      cat  [kernel.kallsyms]  [k] put_page
     2.32%      cat  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     2.18%  kswapd0  [kernel.kallsyms]  [k] __mem_cgroup_uncharge_common
     1.92%  kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.86%      cat  [kernel.kallsyms]  [k] __radix_tree_lookup
     1.62%      cat  [kernel.kallsyms]  [k] __pagevec_lru_add_fn

After:

    15.67%      cat  [kernel.kallsyms]  [k] copy_user_generic_string
    13.48%      cat  [kernel.kallsyms]  [k] memset
    11.42%      cat  [kernel.kallsyms]  [k] do_mpage_readpage
     3.98%      cat  [kernel.kallsyms]  [k] get_page_from_freelist
     2.46%      cat  [kernel.kallsyms]  [k] put_page
     2.13%  kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.88%      cat  [kernel.kallsyms]  [k] __radix_tree_lookup
     1.67%      cat  [kernel.kallsyms]  [k] __pagevec_lru_add_fn
     1.39%  kswapd0  [kernel.kallsyms]  [k] free_pcppages_bulk
     1.30%      cat  [kernel.kallsyms]  [k] kfree

As you can see, the memcg footprint has shrunk quite a bit.

     text    data     bss     dec     hex  filename
    37970    9892     400   48262    bc86  mm/memcontrol.o.old
    35239    9892     400   45531    b1db  mm/memcontrol.o

This patch (of 4):

The memcg charge API charges pages before they are rmapped - i.e. have
an actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on.  Worse,
uncharge has to happen from a context that is still type-specific,
rather than at the end of the page's lifetime with exclusive access, and
so requires a lot of synchronization.

Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:

  mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
  pages from the memcg if necessary.

  mem_cgroup_commit_charge() commits the page to the charge once it
  has a valid page->mapping and PageAnon() reliably tells the type.

  mem_cgroup_cancel_charge() aborts the transaction.

This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.

As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again.  Revive lru_cache_add_active_or_unevictable().

[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
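For reference, every call site changed below follows the same transaction
shape: try_charge() to reserve, then commit_charge() after rmap but before
the LRU addition on success, or cancel_charge() on any failure in between.
The following is only a sketch of that shape, not code from the patch:
charge_new_anon_page() and its pte_still_matches argument are hypothetical,
while the mem_cgroup_*(), rmap and LRU calls are the ones the hunks below
actually use.

    /* Sketch only: simplified anonymous-fault charging with the new API. */
    static int charge_new_anon_page(struct mm_struct *mm,
                                    struct vm_area_struct *vma,
                                    unsigned long address, struct page *page,
                                    bool pte_still_matches)
    {
            struct mem_cgroup *memcg;

            /* Reserve a charge; this may reclaim from the memcg and may fail. */
            if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
                    return VM_FAULT_OOM;

            if (!pte_still_matches) {
                    /* Any failure after try_charge() rolls the reservation back. */
                    mem_cgroup_cancel_charge(page, memcg);
                    return 0;
            }

            /* Rmap first, so page->mapping and PageAnon() identify the type, */
            page_add_new_anon_rmap(page, vma, address);
            /* then commit the charge (lrucare false: page is not on the LRU yet), */
            mem_cgroup_commit_charge(page, memcg, false);
            /* and only then add the page to the LRU. */
            lru_cache_add_active_or_unevictable(page, vma);
            return 0;
    }

The ordering is the one the changelog requires: commit after rmap is
established (so the page has a type) but before the LRU addition, which is
why page_add_new_anon_rmap() stops doing LRU additions and
lru_cache_add_active_or_unevictable() is revived.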
Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c | 41
1 file changed, 24 insertions(+), 17 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 5c55270729f7..6d7648773dc4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2049,6 +2049,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         struct page *dirty_page = NULL;
         unsigned long mmun_start = 0;  /* For mmu_notifiers */
         unsigned long mmun_end = 0;    /* For mmu_notifiers */
+        struct mem_cgroup *memcg;
 
         old_page = vm_normal_page(vma, address, orig_pte);
         if (!old_page) {
@@ -2204,7 +2205,7 @@ gotten:
         }
         __SetPageUptodate(new_page);
 
-        if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                 goto oom_free_new;
 
         mmun_start = address & PAGE_MASK;
@@ -2234,6 +2235,8 @@ gotten:
          */
         ptep_clear_flush(vma, address, page_table);
         page_add_new_anon_rmap(new_page, vma, address);
+        mem_cgroup_commit_charge(new_page, memcg, false);
+        lru_cache_add_active_or_unevictable(new_page, vma);
         /*
          * We call the notify macro here because, when using secondary
          * mmu page tables (such as kvm shadow page tables), we want the
@@ -2271,7 +2274,7 @@ gotten:
                 new_page = old_page;
                 ret |= VM_FAULT_WRITE;
         } else
-                mem_cgroup_uncharge_page(new_page);
+                mem_cgroup_cancel_charge(new_page, memcg);
 
         if (new_page)
                 page_cache_release(new_page);
@@ -2410,10 +2413,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
         spinlock_t *ptl;
         struct page *page, *swapcache;
+        struct mem_cgroup *memcg;
         swp_entry_t entry;
         pte_t pte;
         int locked;
-        struct mem_cgroup *ptr;
         int exclusive = 0;
         int ret = 0;
 
@@ -2489,7 +2492,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto out_page;
         }
 
-        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
+        if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
                 ret = VM_FAULT_OOM;
                 goto out_page;
         }
@@ -2514,10 +2517,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * while the page is counted on swap but not yet in mapcount i.e.
          * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
          * must be called after the swap_free(), or it will never succeed.
-         * Because delete_from_swap_page() may be called by reuse_swap_page(),
-         * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
-         * in page->private. In this case, a record in swap_cgroup is silently
-         * discarded at swap_free().
          */
 
         inc_mm_counter_fast(mm, MM_ANONPAGES);
@@ -2533,12 +2532,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (pte_swp_soft_dirty(orig_pte))
                 pte = pte_mksoft_dirty(pte);
         set_pte_at(mm, address, page_table, pte);
-        if (page == swapcache)
+        if (page == swapcache) {
                 do_page_add_anon_rmap(page, vma, address, exclusive);
-        else /* ksm created a completely new copy */
+                mem_cgroup_commit_charge(page, memcg, true);
+        } else { /* ksm created a completely new copy */
                 page_add_new_anon_rmap(page, vma, address);
-        /* It's better to call commit-charge after rmap is established */
-        mem_cgroup_commit_charge_swapin(page, ptr);
+                mem_cgroup_commit_charge(page, memcg, false);
+                lru_cache_add_active_or_unevictable(page, vma);
+        }
 
         swap_free(entry);
         if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2571,7 +2572,7 @@ unlock:
 out:
         return ret;
 out_nomap:
-        mem_cgroup_cancel_charge_swapin(ptr);
+        mem_cgroup_cancel_charge(page, memcg);
         pte_unmap_unlock(page_table, ptl);
 out_page:
         unlock_page(page);
@@ -2627,6 +2628,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long address, pte_t *page_table, pmd_t *pmd,
                 unsigned int flags)
 {
+        struct mem_cgroup *memcg;
         struct page *page;
         spinlock_t *ptl;
         pte_t entry;
@@ -2660,7 +2662,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
          */
         __SetPageUptodate(page);
 
-        if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
+        if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
                 goto oom_free_page;
 
         entry = mk_pte(page, vma->vm_page_prot);
@@ -2673,6 +2675,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
         inc_mm_counter_fast(mm, MM_ANONPAGES);
         page_add_new_anon_rmap(page, vma, address);
+        mem_cgroup_commit_charge(page, memcg, false);
+        lru_cache_add_active_or_unevictable(page, vma);
 setpte:
         set_pte_at(mm, address, page_table, entry);
 
@@ -2682,7 +2686,7 @@ unlock:
         pte_unmap_unlock(page_table, ptl);
         return 0;
 release:
-        mem_cgroup_uncharge_page(page);
+        mem_cgroup_cancel_charge(page, memcg);
         page_cache_release(page);
         goto unlock;
 oom_free_page:
@@ -2919,6 +2923,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
         struct page *fault_page, *new_page;
+        struct mem_cgroup *memcg;
         spinlock_t *ptl;
         pte_t *pte;
         int ret;
@@ -2930,7 +2935,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!new_page)
                 return VM_FAULT_OOM;
 
-        if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
+        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
                 page_cache_release(new_page);
                 return VM_FAULT_OOM;
         }
@@ -2950,12 +2955,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto uncharge_out;
         }
         do_set_pte(vma, address, new_page, pte, true, true);
+        mem_cgroup_commit_charge(new_page, memcg, false);
+        lru_cache_add_active_or_unevictable(new_page, vma);
         pte_unmap_unlock(pte, ptl);
         unlock_page(fault_page);
         page_cache_release(fault_page);
         return ret;
 uncharge_out:
-        mem_cgroup_uncharge_page(new_page);
+        mem_cgroup_cancel_charge(new_page, memcg);
         page_cache_release(new_page);
         return ret;
 }