author     Johannes Weiner <hannes@cmpxchg.org>            2014-08-08 17:19:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-08-08 18:57:17 -0400
commit     00501b531c4723972aa11d6d4ebcf8d6552007c8 (patch)
tree       b3ad4850d58f137cf87b8424412d962fb251839f
parent     4449a51a7c281602d3a385044ab928322a122a02 (diff)
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code
and reduces charging and uncharging overhead.  The most expensive part
of charging and uncharging is the page_cgroup bit spinlock, which is
removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup
(i.e. executing in the root memcg).

Before:

    15.36%  cat      [kernel.kallsyms]  [k] copy_user_generic_string
    13.31%  cat      [kernel.kallsyms]  [k] memset
    11.48%  cat      [kernel.kallsyms]  [k] do_mpage_readpage
     4.23%  cat      [kernel.kallsyms]  [k] get_page_from_freelist
     2.38%  cat      [kernel.kallsyms]  [k] put_page
     2.32%  cat      [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     2.18%  kswapd0  [kernel.kallsyms]  [k] __mem_cgroup_uncharge_common
     1.92%  kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.86%  cat      [kernel.kallsyms]  [k] __radix_tree_lookup
     1.62%  cat      [kernel.kallsyms]  [k] __pagevec_lru_add_fn

After:

    15.67%  cat      [kernel.kallsyms]  [k] copy_user_generic_string
    13.48%  cat      [kernel.kallsyms]  [k] memset
    11.42%  cat      [kernel.kallsyms]  [k] do_mpage_readpage
     3.98%  cat      [kernel.kallsyms]  [k] get_page_from_freelist
     2.46%  cat      [kernel.kallsyms]  [k] put_page
     2.13%  kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.88%  cat      [kernel.kallsyms]  [k] __radix_tree_lookup
     1.67%  cat      [kernel.kallsyms]  [k] __pagevec_lru_add_fn
     1.39%  kswapd0  [kernel.kallsyms]  [k] free_pcppages_bulk
     1.30%  cat      [kernel.kallsyms]  [k] kfree

As you can see, the memcg footprint has shrunk quite a bit.

      text    data     bss     dec     hex  filename
     37970    9892     400   48262    bc86  mm/memcontrol.o.old
     35239    9892     400   45531    b1db  mm/memcontrol.o

This patch (of 4):

The memcg charge API charges pages before they are rmapped - i.e. have
an actual "type" - and so every callsite needs its own set of charge
and uncharge functions to know what type is being operated on.  Worse,
uncharge has to happen from a context that is still type-specific,
rather than at the end of the page's lifetime with exclusive access,
and so requires a lot of synchronization.

Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:

  mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
  pages from the memcg if necessary.

  mem_cgroup_commit_charge() commits the page to the charge once it
  has a valid page->mapping and PageAnon() reliably tells the type.

  mem_cgroup_cancel_charge() aborts the transaction.

This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.

As pages need to be committed after rmap is established but before
they are added to the LRU, page_add_new_anon_rmap() must stop doing
LRU additions again.  Revive lru_cache_add_active_or_unevictable().

[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
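For orientation before the diff: below is a minimal sketch of how a caller
is expected to string the three operations together, modeled on the
do_anonymous_page() changes in mm/memory.c further down.  The helper name
and the trimmed argument list are hypothetical, and all fault-handling
details other than the charge transaction itself are elided.

	/*
	 * Sketch only: charge a freshly allocated anonymous page with the
	 * new transaction API (cf. do_anonymous_page() in the patch).
	 */
	static int charge_new_anon_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					struct page *page,
					pte_t *page_table, pte_t entry)
	{
		struct mem_cgroup *memcg;

		/* 1. Reserve the charge; this may reclaim from the memcg. */
		if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
			return VM_FAULT_OOM;

		/*
		 * If the fault is abandoned after this point (e.g. a racing
		 * thread already populated the pte), the reservation is
		 * undone with mem_cgroup_cancel_charge(page, memcg).
		 */

		/* 2. Establish rmap: page->mapping and PageAnon() become valid. */
		inc_mm_counter_fast(mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, address);

		/* 3. Commit, then add to the LRU; the commit must come first. */
		mem_cgroup_commit_charge(page, memcg, false);
		lru_cache_add_active_or_unevictable(page, vma);

		set_pte_at(mm, address, page_table, entry);
		return 0;
	}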
-rw-r--r--  Documentation/cgroups/memcg_test.txt  |  32
-rw-r--r--  include/linux/memcontrol.h            |  53
-rw-r--r--  include/linux/swap.h                  |   3
-rw-r--r--  kernel/events/uprobes.c               |  15
-rw-r--r--  mm/filemap.c                          |  21
-rw-r--r--  mm/huge_memory.c                      |  57
-rw-r--r--  mm/memcontrol.c                       | 407
-rw-r--r--  mm/memory.c                           |  41
-rw-r--r--  mm/rmap.c                             |  19
-rw-r--r--  mm/shmem.c                            |  37
-rw-r--r--  mm/swap.c                             |  34
-rw-r--r--  mm/swapfile.c                         |  14
12 files changed, 338 insertions, 395 deletions
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 80ac454704b8..bcf750d3cecd 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -24,24 +24,7 @@ Please note that implementation details can be changed.
24 24
25 a page/swp_entry may be charged (usage += PAGE_SIZE) at 25 a page/swp_entry may be charged (usage += PAGE_SIZE) at
26 26
27 mem_cgroup_charge_anon() 27 mem_cgroup_try_charge()
28 Called at new page fault and Copy-On-Write.
29
30 mem_cgroup_try_charge_swapin()
31 Called at do_swap_page() (page fault on swap entry) and swapoff.
32 Followed by charge-commit-cancel protocol. (With swap accounting)
33 At commit, a charge recorded in swap_cgroup is removed.
34
35 mem_cgroup_charge_file()
36 Called at add_to_page_cache()
37
38 mem_cgroup_cache_charge_swapin()
39 Called at shmem's swapin.
40
41 mem_cgroup_prepare_migration()
42 Called before migration. "extra" charge is done and followed by
43 charge-commit-cancel protocol.
44 At commit, charge against oldpage or newpage will be committed.
45 28
462. Uncharge 292. Uncharge
47 a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by 30 a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
@@ -69,19 +52,14 @@ Please note that implementation details can be changed.
69 to new page is committed. At failure, charge to old page is committed. 52 to new page is committed. At failure, charge to old page is committed.
70 53
713. charge-commit-cancel 543. charge-commit-cancel
72 In some case, we can't know this "charge" is valid or not at charging 55 Memcg pages are charged in two steps:
73 (because of races). 56 mem_cgroup_try_charge()
74 To handle such case, there are charge-commit-cancel functions. 57 mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
75 mem_cgroup_try_charge_XXX
76 mem_cgroup_commit_charge_XXX
77 mem_cgroup_cancel_charge_XXX
78 these are used in swap-in and migration.
79 58
80 At try_charge(), there are no flags to say "this page is charged". 59 At try_charge(), there are no flags to say "this page is charged".
81 at this point, usage += PAGE_SIZE. 60 at this point, usage += PAGE_SIZE.
82 61
83 At commit(), the function checks the page should be charged or not 62 At commit(), the page is associated with the memcg.
84 and set flags or avoid charging.(usage -= PAGE_SIZE)
85 63
86 At cancel(), simply usage -= PAGE_SIZE. 64 At cancel(), simply usage -= PAGE_SIZE.
87 65
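The same two-step protocol also covers page cache and swap cache pages.
Here is a condensed sketch of that variant, modeled on the
__add_to_page_cache_locked() and shmem_unuse() hunks further down; the
helper name and the insert_into_cache() stand-in are hypothetical, and
tree locking and error handling are simplified.

	/* Stand-in for the caller's real radix-tree insertion. */
	static int insert_into_cache(struct page *page);

	/* Sketch only: charge a page-cache page around its tree insertion. */
	static int charge_cache_page(struct page *page, struct mm_struct *mm,
				     gfp_t gfp_mask, bool lrucare)
	{
		struct mem_cgroup *memcg;
		int error;

		error = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
		if (error)
			return error;

		error = insert_into_cache(page);
		if (error) {
			/* Insertion failed: drop the reservation. */
			mem_cgroup_cancel_charge(page, memcg);
			return error;
		}

		/*
		 * page->mapping is now set up; commit.  Pass lrucare=true
		 * when the page may already be on the LRU (the swap cache
		 * case in shmem_unuse()), false for a fresh page.
		 */
		mem_cgroup_commit_charge(page, memcg, lrucare);
		return 0;
	}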
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index eb65d29516ca..1a9a096858e0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,28 +54,11 @@ struct mem_cgroup_reclaim_cookie {
54}; 54};
55 55
56#ifdef CONFIG_MEMCG 56#ifdef CONFIG_MEMCG
57/* 57int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
58 * All "charge" functions with gfp_mask should use GFP_KERNEL or 58 gfp_t gfp_mask, struct mem_cgroup **memcgp);
59 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't 59void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
60 * alloc memory but reclaims memory from all available zones. So, "where I want 60 bool lrucare);
61 * memory from" bits of gfp_mask has no meaning. So any bits of that field is 61void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
62 * available but adding a rule is better. charge functions' gfp_mask should
63 * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
64 * codes.
65 * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
66 */
67
68extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm,
69 gfp_t gfp_mask);
70/* for swap handling */
71extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
72 struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
73extern void mem_cgroup_commit_charge_swapin(struct page *page,
74 struct mem_cgroup *memcg);
75extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
76
77extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
78 gfp_t gfp_mask);
79 62
80struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 63struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
81struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 64struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
@@ -233,30 +216,22 @@ void mem_cgroup_print_bad_page(struct page *page);
233#else /* CONFIG_MEMCG */ 216#else /* CONFIG_MEMCG */
234struct mem_cgroup; 217struct mem_cgroup;
235 218
236static inline int mem_cgroup_charge_anon(struct page *page, 219static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
237 struct mm_struct *mm, gfp_t gfp_mask) 220 gfp_t gfp_mask,
238{ 221 struct mem_cgroup **memcgp)
239 return 0;
240}
241
242static inline int mem_cgroup_charge_file(struct page *page,
243 struct mm_struct *mm, gfp_t gfp_mask)
244{
245 return 0;
246}
247
248static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
249 struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
250{ 222{
223 *memcgp = NULL;
251 return 0; 224 return 0;
252} 225}
253 226
254static inline void mem_cgroup_commit_charge_swapin(struct page *page, 227static inline void mem_cgroup_commit_charge(struct page *page,
255 struct mem_cgroup *memcg) 228 struct mem_cgroup *memcg,
229 bool lrucare)
256{ 230{
257} 231}
258 232
259static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 233static inline void mem_cgroup_cancel_charge(struct page *page,
234 struct mem_cgroup *memcg)
260{ 235{
261} 236}
262 237
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1eb64043c076..46a649e4e8cd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -320,6 +320,9 @@ extern void swap_setup(void);
320 320
321extern void add_page_to_unevictable_list(struct page *page); 321extern void add_page_to_unevictable_list(struct page *page);
322 322
323extern void lru_cache_add_active_or_unevictable(struct page *page,
324 struct vm_area_struct *vma);
325
323/* linux/mm/vmscan.c */ 326/* linux/mm/vmscan.c */
324extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 327extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
325 gfp_t gfp_mask, nodemask_t *mask); 328 gfp_t gfp_mask, nodemask_t *mask);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
167 /* For mmu_notifiers */ 167 /* For mmu_notifiers */
168 const unsigned long mmun_start = addr; 168 const unsigned long mmun_start = addr;
169 const unsigned long mmun_end = addr + PAGE_SIZE; 169 const unsigned long mmun_end = addr + PAGE_SIZE;
170 struct mem_cgroup *memcg;
171
172 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
173 if (err)
174 return err;
170 175
171 /* For try_to_free_swap() and munlock_vma_page() below */ 176 /* For try_to_free_swap() and munlock_vma_page() below */
172 lock_page(page); 177 lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
179 184
180 get_page(kpage); 185 get_page(kpage);
181 page_add_new_anon_rmap(kpage, vma, addr); 186 page_add_new_anon_rmap(kpage, vma, addr);
187 mem_cgroup_commit_charge(kpage, memcg, false);
188 lru_cache_add_active_or_unevictable(kpage, vma);
182 189
183 if (!PageAnon(page)) { 190 if (!PageAnon(page)) {
184 dec_mm_counter(mm, MM_FILEPAGES); 191 dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
200 207
201 err = 0; 208 err = 0;
202 unlock: 209 unlock:
210 mem_cgroup_cancel_charge(kpage, memcg);
203 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 211 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
204 unlock_page(page); 212 unlock_page(page);
205 return err; 213 return err;
@@ -315,18 +323,11 @@ retry:
315 if (!new_page) 323 if (!new_page)
316 goto put_old; 324 goto put_old;
317 325
318 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
319 goto put_new;
320
321 __SetPageUptodate(new_page); 326 __SetPageUptodate(new_page);
322 copy_highpage(new_page, old_page); 327 copy_highpage(new_page, old_page);
323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 328 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
324 329
325 ret = __replace_page(vma, vaddr, old_page, new_page); 330 ret = __replace_page(vma, vaddr, old_page, new_page);
326 if (ret)
327 mem_cgroup_uncharge_page(new_page);
328
329put_new:
330 page_cache_release(new_page); 331 page_cache_release(new_page);
331put_old: 332put_old:
332 put_page(old_page); 333 put_page(old_page);
diff --git a/mm/filemap.c b/mm/filemap.c
index af19a6b079f5..349a40e35545 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -31,6 +31,7 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/cpuset.h> 32#include <linux/cpuset.h>
33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34#include <linux/hugetlb.h>
34#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
35#include <linux/cleancache.h> 36#include <linux/cleancache.h>
36#include <linux/rmap.h> 37#include <linux/rmap.h>
@@ -548,19 +549,24 @@ static int __add_to_page_cache_locked(struct page *page,
548 pgoff_t offset, gfp_t gfp_mask, 549 pgoff_t offset, gfp_t gfp_mask,
549 void **shadowp) 550 void **shadowp)
550{ 551{
552 int huge = PageHuge(page);
553 struct mem_cgroup *memcg;
551 int error; 554 int error;
552 555
553 VM_BUG_ON_PAGE(!PageLocked(page), page); 556 VM_BUG_ON_PAGE(!PageLocked(page), page);
554 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 557 VM_BUG_ON_PAGE(PageSwapBacked(page), page);
555 558
556 error = mem_cgroup_charge_file(page, current->mm, 559 if (!huge) {
557 gfp_mask & GFP_RECLAIM_MASK); 560 error = mem_cgroup_try_charge(page, current->mm,
558 if (error) 561 gfp_mask, &memcg);
559 return error; 562 if (error)
563 return error;
564 }
560 565
561 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 566 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
562 if (error) { 567 if (error) {
563 mem_cgroup_uncharge_cache_page(page); 568 if (!huge)
569 mem_cgroup_cancel_charge(page, memcg);
564 return error; 570 return error;
565 } 571 }
566 572
@@ -575,13 +581,16 @@ static int __add_to_page_cache_locked(struct page *page,
575 goto err_insert; 581 goto err_insert;
576 __inc_zone_page_state(page, NR_FILE_PAGES); 582 __inc_zone_page_state(page, NR_FILE_PAGES);
577 spin_unlock_irq(&mapping->tree_lock); 583 spin_unlock_irq(&mapping->tree_lock);
584 if (!huge)
585 mem_cgroup_commit_charge(page, memcg, false);
578 trace_mm_filemap_add_to_page_cache(page); 586 trace_mm_filemap_add_to_page_cache(page);
579 return 0; 587 return 0;
580err_insert: 588err_insert:
581 page->mapping = NULL; 589 page->mapping = NULL;
582 /* Leave page->index set: truncation relies upon it */ 590 /* Leave page->index set: truncation relies upon it */
583 spin_unlock_irq(&mapping->tree_lock); 591 spin_unlock_irq(&mapping->tree_lock);
584 mem_cgroup_uncharge_cache_page(page); 592 if (!huge)
593 mem_cgroup_cancel_charge(page, memcg);
585 page_cache_release(page); 594 page_cache_release(page);
586 return error; 595 return error;
587} 596}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3630d577e987..d9a21d06b862 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
715 unsigned long haddr, pmd_t *pmd, 715 unsigned long haddr, pmd_t *pmd,
716 struct page *page) 716 struct page *page)
717{ 717{
718 struct mem_cgroup *memcg;
718 pgtable_t pgtable; 719 pgtable_t pgtable;
719 spinlock_t *ptl; 720 spinlock_t *ptl;
720 721
721 VM_BUG_ON_PAGE(!PageCompound(page), page); 722 VM_BUG_ON_PAGE(!PageCompound(page), page);
723
724 if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
725 return VM_FAULT_OOM;
726
722 pgtable = pte_alloc_one(mm, haddr); 727 pgtable = pte_alloc_one(mm, haddr);
723 if (unlikely(!pgtable)) 728 if (unlikely(!pgtable)) {
729 mem_cgroup_cancel_charge(page, memcg);
724 return VM_FAULT_OOM; 730 return VM_FAULT_OOM;
731 }
725 732
726 clear_huge_page(page, haddr, HPAGE_PMD_NR); 733 clear_huge_page(page, haddr, HPAGE_PMD_NR);
727 /* 734 /*
@@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
734 ptl = pmd_lock(mm, pmd); 741 ptl = pmd_lock(mm, pmd);
735 if (unlikely(!pmd_none(*pmd))) { 742 if (unlikely(!pmd_none(*pmd))) {
736 spin_unlock(ptl); 743 spin_unlock(ptl);
737 mem_cgroup_uncharge_page(page); 744 mem_cgroup_cancel_charge(page, memcg);
738 put_page(page); 745 put_page(page);
739 pte_free(mm, pgtable); 746 pte_free(mm, pgtable);
740 } else { 747 } else {
@@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
742 entry = mk_huge_pmd(page, vma->vm_page_prot); 749 entry = mk_huge_pmd(page, vma->vm_page_prot);
743 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 750 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
744 page_add_new_anon_rmap(page, vma, haddr); 751 page_add_new_anon_rmap(page, vma, haddr);
752 mem_cgroup_commit_charge(page, memcg, false);
753 lru_cache_add_active_or_unevictable(page, vma);
745 pgtable_trans_huge_deposit(mm, pmd, pgtable); 754 pgtable_trans_huge_deposit(mm, pmd, pgtable);
746 set_pmd_at(mm, haddr, pmd, entry); 755 set_pmd_at(mm, haddr, pmd, entry);
747 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 756 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
827 count_vm_event(THP_FAULT_FALLBACK); 836 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
829 } 838 }
830 if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) {
831 put_page(page);
832 count_vm_event(THP_FAULT_FALLBACK);
833 return VM_FAULT_FALLBACK;
834 }
835 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { 839 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
836 mem_cgroup_uncharge_page(page);
837 put_page(page); 840 put_page(page);
838 count_vm_event(THP_FAULT_FALLBACK); 841 count_vm_event(THP_FAULT_FALLBACK);
839 return VM_FAULT_FALLBACK; 842 return VM_FAULT_FALLBACK;
@@ -979,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
979 struct page *page, 982 struct page *page,
980 unsigned long haddr) 983 unsigned long haddr)
981{ 984{
985 struct mem_cgroup *memcg;
982 spinlock_t *ptl; 986 spinlock_t *ptl;
983 pgtable_t pgtable; 987 pgtable_t pgtable;
984 pmd_t _pmd; 988 pmd_t _pmd;
@@ -999,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
999 __GFP_OTHER_NODE, 1003 __GFP_OTHER_NODE,
1000 vma, address, page_to_nid(page)); 1004 vma, address, page_to_nid(page));
1001 if (unlikely(!pages[i] || 1005 if (unlikely(!pages[i] ||
1002 mem_cgroup_charge_anon(pages[i], mm, 1006 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
1003 GFP_KERNEL))) { 1007 &memcg))) {
1004 if (pages[i]) 1008 if (pages[i])
1005 put_page(pages[i]); 1009 put_page(pages[i]);
1006 mem_cgroup_uncharge_start();
1007 while (--i >= 0) { 1010 while (--i >= 0) {
1008 mem_cgroup_uncharge_page(pages[i]); 1011 memcg = (void *)page_private(pages[i]);
1012 set_page_private(pages[i], 0);
1013 mem_cgroup_cancel_charge(pages[i], memcg);
1009 put_page(pages[i]); 1014 put_page(pages[i]);
1010 } 1015 }
1011 mem_cgroup_uncharge_end();
1012 kfree(pages); 1016 kfree(pages);
1013 ret |= VM_FAULT_OOM; 1017 ret |= VM_FAULT_OOM;
1014 goto out; 1018 goto out;
1015 } 1019 }
1020 set_page_private(pages[i], (unsigned long)memcg);
1016 } 1021 }
1017 1022
1018 for (i = 0; i < HPAGE_PMD_NR; i++) { 1023 for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1041,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1041 pte_t *pte, entry; 1046 pte_t *pte, entry;
1042 entry = mk_pte(pages[i], vma->vm_page_prot); 1047 entry = mk_pte(pages[i], vma->vm_page_prot);
1043 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1048 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1049 memcg = (void *)page_private(pages[i]);
1050 set_page_private(pages[i], 0);
1044 page_add_new_anon_rmap(pages[i], vma, haddr); 1051 page_add_new_anon_rmap(pages[i], vma, haddr);
1052 mem_cgroup_commit_charge(pages[i], memcg, false);
1053 lru_cache_add_active_or_unevictable(pages[i], vma);
1045 pte = pte_offset_map(&_pmd, haddr); 1054 pte = pte_offset_map(&_pmd, haddr);
1046 VM_BUG_ON(!pte_none(*pte)); 1055 VM_BUG_ON(!pte_none(*pte));
1047 set_pte_at(mm, haddr, pte, entry); 1056 set_pte_at(mm, haddr, pte, entry);
@@ -1065,12 +1074,12 @@ out:
1065out_free_pages: 1074out_free_pages:
1066 spin_unlock(ptl); 1075 spin_unlock(ptl);
1067 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1076 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1068 mem_cgroup_uncharge_start();
1069 for (i = 0; i < HPAGE_PMD_NR; i++) { 1077 for (i = 0; i < HPAGE_PMD_NR; i++) {
1070 mem_cgroup_uncharge_page(pages[i]); 1078 memcg = (void *)page_private(pages[i]);
1079 set_page_private(pages[i], 0);
1080 mem_cgroup_cancel_charge(pages[i], memcg);
1071 put_page(pages[i]); 1081 put_page(pages[i]);
1072 } 1082 }
1073 mem_cgroup_uncharge_end();
1074 kfree(pages); 1083 kfree(pages);
1075 goto out; 1084 goto out;
1076} 1085}
@@ -1081,6 +1090,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1081 spinlock_t *ptl; 1090 spinlock_t *ptl;
1082 int ret = 0; 1091 int ret = 0;
1083 struct page *page = NULL, *new_page; 1092 struct page *page = NULL, *new_page;
1093 struct mem_cgroup *memcg;
1084 unsigned long haddr; 1094 unsigned long haddr;
1085 unsigned long mmun_start; /* For mmu_notifiers */ 1095 unsigned long mmun_start; /* For mmu_notifiers */
1086 unsigned long mmun_end; /* For mmu_notifiers */ 1096 unsigned long mmun_end; /* For mmu_notifiers */
@@ -1132,7 +1142,8 @@ alloc:
1132 goto out; 1142 goto out;
1133 } 1143 }
1134 1144
1135 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { 1145 if (unlikely(mem_cgroup_try_charge(new_page, mm,
1146 GFP_TRANSHUGE, &memcg))) {
1136 put_page(new_page); 1147 put_page(new_page);
1137 if (page) { 1148 if (page) {
1138 split_huge_page(page); 1149 split_huge_page(page);
@@ -1161,7 +1172,7 @@ alloc:
1161 put_user_huge_page(page); 1172 put_user_huge_page(page);
1162 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1173 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1163 spin_unlock(ptl); 1174 spin_unlock(ptl);
1164 mem_cgroup_uncharge_page(new_page); 1175 mem_cgroup_cancel_charge(new_page, memcg);
1165 put_page(new_page); 1176 put_page(new_page);
1166 goto out_mn; 1177 goto out_mn;
1167 } else { 1178 } else {
@@ -1170,6 +1181,8 @@ alloc:
1170 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1181 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1171 pmdp_clear_flush(vma, haddr, pmd); 1182 pmdp_clear_flush(vma, haddr, pmd);
1172 page_add_new_anon_rmap(new_page, vma, haddr); 1183 page_add_new_anon_rmap(new_page, vma, haddr);
1184 mem_cgroup_commit_charge(new_page, memcg, false);
1185 lru_cache_add_active_or_unevictable(new_page, vma);
1173 set_pmd_at(mm, haddr, pmd, entry); 1186 set_pmd_at(mm, haddr, pmd, entry);
1174 update_mmu_cache_pmd(vma, address, pmd); 1187 update_mmu_cache_pmd(vma, address, pmd);
1175 if (!page) { 1188 if (!page) {
@@ -2413,6 +2426,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2413 spinlock_t *pmd_ptl, *pte_ptl; 2426 spinlock_t *pmd_ptl, *pte_ptl;
2414 int isolated; 2427 int isolated;
2415 unsigned long hstart, hend; 2428 unsigned long hstart, hend;
2429 struct mem_cgroup *memcg;
2416 unsigned long mmun_start; /* For mmu_notifiers */ 2430 unsigned long mmun_start; /* For mmu_notifiers */
2417 unsigned long mmun_end; /* For mmu_notifiers */ 2431 unsigned long mmun_end; /* For mmu_notifiers */
2418 2432
@@ -2423,7 +2437,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2423 if (!new_page) 2437 if (!new_page)
2424 return; 2438 return;
2425 2439
2426 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) 2440 if (unlikely(mem_cgroup_try_charge(new_page, mm,
2441 GFP_TRANSHUGE, &memcg)))
2427 return; 2442 return;
2428 2443
2429 /* 2444 /*
@@ -2510,6 +2525,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2510 spin_lock(pmd_ptl); 2525 spin_lock(pmd_ptl);
2511 BUG_ON(!pmd_none(*pmd)); 2526 BUG_ON(!pmd_none(*pmd));
2512 page_add_new_anon_rmap(new_page, vma, address); 2527 page_add_new_anon_rmap(new_page, vma, address);
2528 mem_cgroup_commit_charge(new_page, memcg, false);
2529 lru_cache_add_active_or_unevictable(new_page, vma);
2513 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2530 pgtable_trans_huge_deposit(mm, pmd, pgtable);
2514 set_pmd_at(mm, address, pmd, _pmd); 2531 set_pmd_at(mm, address, pmd, _pmd);
2515 update_mmu_cache_pmd(vma, address, pmd); 2532 update_mmu_cache_pmd(vma, address, pmd);
@@ -2523,7 +2540,7 @@ out_up_write:
2523 return; 2540 return;
2524 2541
2525out: 2542out:
2526 mem_cgroup_uncharge_page(new_page); 2543 mem_cgroup_cancel_charge(new_page, memcg);
2527 goto out_up_write; 2544 goto out_up_write;
2528} 2545}
2529 2546
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90dc501eaf3f..1cbe1e54ff5f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,17 +2551,8 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2551 return NOTIFY_OK; 2551 return NOTIFY_OK;
2552} 2552}
2553 2553
2554/** 2554static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2555 * mem_cgroup_try_charge - try charging a memcg 2555 unsigned int nr_pages)
2556 * @memcg: memcg to charge
2557 * @nr_pages: number of pages to charge
2558 *
2559 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2560 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2561 */
2562static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2563 gfp_t gfp_mask,
2564 unsigned int nr_pages)
2565{ 2556{
2566 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2557 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2567 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2558 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -2660,41 +2651,7 @@ done:
2660 return ret; 2651 return ret;
2661} 2652}
2662 2653
2663/** 2654static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2664 * mem_cgroup_try_charge_mm - try charging a mm
2665 * @mm: mm_struct to charge
2666 * @nr_pages: number of pages to charge
2667 * @oom: trigger OOM if reclaim fails
2668 *
2669 * Returns the charged mem_cgroup associated with the given mm_struct or
2670 * NULL the charge failed.
2671 */
2672static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2673 gfp_t gfp_mask,
2674 unsigned int nr_pages)
2675
2676{
2677 struct mem_cgroup *memcg;
2678 int ret;
2679
2680 memcg = get_mem_cgroup_from_mm(mm);
2681 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages);
2682 css_put(&memcg->css);
2683 if (ret == -EINTR)
2684 memcg = root_mem_cgroup;
2685 else if (ret)
2686 memcg = NULL;
2687
2688 return memcg;
2689}
2690
2691/*
2692 * Somemtimes we have to undo a charge we got by try_charge().
2693 * This function is for that and do uncharge, put css's refcnt.
2694 * gotten by try_charge().
2695 */
2696static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2697 unsigned int nr_pages)
2698{ 2655{
2699 unsigned long bytes = nr_pages * PAGE_SIZE; 2656 unsigned long bytes = nr_pages * PAGE_SIZE;
2700 2657
@@ -2760,17 +2717,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2760 return memcg; 2717 return memcg;
2761} 2718}
2762 2719
2763static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2720static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2764 struct page *page, 2721 unsigned int nr_pages, bool anon, bool lrucare)
2765 unsigned int nr_pages,
2766 enum charge_type ctype,
2767 bool lrucare)
2768{ 2722{
2769 struct page_cgroup *pc = lookup_page_cgroup(page); 2723 struct page_cgroup *pc = lookup_page_cgroup(page);
2770 struct zone *uninitialized_var(zone); 2724 struct zone *uninitialized_var(zone);
2771 struct lruvec *lruvec; 2725 struct lruvec *lruvec;
2772 bool was_on_lru = false; 2726 bool was_on_lru = false;
2773 bool anon;
2774 2727
2775 lock_page_cgroup(pc); 2728 lock_page_cgroup(pc);
2776 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
@@ -2807,11 +2760,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2807 spin_unlock_irq(&zone->lru_lock); 2760 spin_unlock_irq(&zone->lru_lock);
2808 } 2761 }
2809 2762
2810 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2811 anon = true;
2812 else
2813 anon = false;
2814
2815 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2763 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2816 unlock_page_cgroup(pc); 2764 unlock_page_cgroup(pc);
2817 2765
@@ -2882,21 +2830,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2882 if (ret) 2830 if (ret)
2883 return ret; 2831 return ret;
2884 2832
2885 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); 2833 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
2886 if (ret == -EINTR) { 2834 if (ret == -EINTR) {
2887 /* 2835 /*
2888 * mem_cgroup_try_charge() chosed to bypass to root due to 2836 * try_charge() chose to bypass to root due to OOM kill or
2889 * OOM kill or fatal signal. Since our only options are to 2837 * fatal signal. Since our only options are to either fail
2890 * either fail the allocation or charge it to this cgroup, do 2838 * the allocation or charge it to this cgroup, do it as a
2891 * it as a temporary condition. But we can't fail. From a 2839 * temporary condition. But we can't fail. From a kmem/slab
2892 * kmem/slab perspective, the cache has already been selected, 2840 * perspective, the cache has already been selected, by
2893 * by mem_cgroup_kmem_get_cache(), so it is too late to change 2841 * mem_cgroup_kmem_get_cache(), so it is too late to change
2894 * our minds. 2842 * our minds.
2895 * 2843 *
2896 * This condition will only trigger if the task entered 2844 * This condition will only trigger if the task entered
2897 * memcg_charge_kmem in a sane state, but was OOM-killed during 2845 * memcg_charge_kmem in a sane state, but was OOM-killed
2898 * mem_cgroup_try_charge() above. Tasks that were already 2846 * during try_charge() above. Tasks that were already dying
2899 * dying when the allocation triggers should have been already 2847 * when the allocation triggers should have been already
2900 * directed to the root cgroup in memcontrol.h 2848 * directed to the root cgroup in memcontrol.h
2901 */ 2849 */
2902 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2850 res_counter_charge_nofail(&memcg->res, size, &fail_res);
@@ -3618,164 +3566,6 @@ out:
3618 return ret; 3566 return ret;
3619} 3567}
3620 3568
3621int mem_cgroup_charge_anon(struct page *page,
3622 struct mm_struct *mm, gfp_t gfp_mask)
3623{
3624 unsigned int nr_pages = 1;
3625 struct mem_cgroup *memcg;
3626
3627 if (mem_cgroup_disabled())
3628 return 0;
3629
3630 VM_BUG_ON_PAGE(page_mapped(page), page);
3631 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3632 VM_BUG_ON(!mm);
3633
3634 if (PageTransHuge(page)) {
3635 nr_pages <<= compound_order(page);
3636 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3637 }
3638
3639 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages);
3640 if (!memcg)
3641 return -ENOMEM;
3642 __mem_cgroup_commit_charge(memcg, page, nr_pages,
3643 MEM_CGROUP_CHARGE_TYPE_ANON, false);
3644 return 0;
3645}
3646
3647/*
3648 * While swap-in, try_charge -> commit or cancel, the page is locked.
3649 * And when try_charge() successfully returns, one refcnt to memcg without
3650 * struct page_cgroup is acquired. This refcnt will be consumed by
3651 * "commit()" or removed by "cancel()"
3652 */
3653static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3654 struct page *page,
3655 gfp_t mask,
3656 struct mem_cgroup **memcgp)
3657{
3658 struct mem_cgroup *memcg = NULL;
3659 struct page_cgroup *pc;
3660 int ret;
3661
3662 pc = lookup_page_cgroup(page);
3663 /*
3664 * Every swap fault against a single page tries to charge the
3665 * page, bail as early as possible. shmem_unuse() encounters
3666 * already charged pages, too. The USED bit is protected by
3667 * the page lock, which serializes swap cache removal, which
3668 * in turn serializes uncharging.
3669 */
3670 if (PageCgroupUsed(pc))
3671 goto out;
3672 if (do_swap_account)
3673 memcg = try_get_mem_cgroup_from_page(page);
3674 if (!memcg)
3675 memcg = get_mem_cgroup_from_mm(mm);
3676 ret = mem_cgroup_try_charge(memcg, mask, 1);
3677 css_put(&memcg->css);
3678 if (ret == -EINTR)
3679 memcg = root_mem_cgroup;
3680 else if (ret)
3681 return ret;
3682out:
3683 *memcgp = memcg;
3684 return 0;
3685}
3686
3687int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3688 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3689{
3690 if (mem_cgroup_disabled()) {
3691 *memcgp = NULL;
3692 return 0;
3693 }
3694 /*
3695 * A racing thread's fault, or swapoff, may have already
3696 * updated the pte, and even removed page from swap cache: in
3697 * those cases unuse_pte()'s pte_same() test will fail; but
3698 * there's also a KSM case which does need to charge the page.
3699 */
3700 if (!PageSwapCache(page)) {
3701 struct mem_cgroup *memcg;
3702
3703 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
3704 if (!memcg)
3705 return -ENOMEM;
3706 *memcgp = memcg;
3707 return 0;
3708 }
3709 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3710}
3711
3712void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3713{
3714 if (mem_cgroup_disabled())
3715 return;
3716 if (!memcg)
3717 return;
3718 __mem_cgroup_cancel_charge(memcg, 1);
3719}
3720
3721static void
3722__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
3723 enum charge_type ctype)
3724{
3725 if (mem_cgroup_disabled())
3726 return;
3727 if (!memcg)
3728 return;
3729
3730 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
3731 /*
3732 * Now swap is on-memory. This means this page may be
3733 * counted both as mem and swap....double count.
3734 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
3735 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
3736 * may call delete_from_swap_cache() before reach here.
3737 */
3738 if (do_swap_account && PageSwapCache(page)) {
3739 swp_entry_t ent = {.val = page_private(page)};
3740 mem_cgroup_uncharge_swap(ent);
3741 }
3742}
3743
3744void mem_cgroup_commit_charge_swapin(struct page *page,
3745 struct mem_cgroup *memcg)
3746{
3747 __mem_cgroup_commit_charge_swapin(page, memcg,
3748 MEM_CGROUP_CHARGE_TYPE_ANON);
3749}
3750
3751int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3752 gfp_t gfp_mask)
3753{
3754 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3755 struct mem_cgroup *memcg;
3756 int ret;
3757
3758 if (mem_cgroup_disabled())
3759 return 0;
3760 if (PageCompound(page))
3761 return 0;
3762
3763 if (PageSwapCache(page)) { /* shmem */
3764 ret = __mem_cgroup_try_charge_swapin(mm, page,
3765 gfp_mask, &memcg);
3766 if (ret)
3767 return ret;
3768 __mem_cgroup_commit_charge_swapin(page, memcg, type);
3769 return 0;
3770 }
3771
3772 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
3773 if (!memcg)
3774 return -ENOMEM;
3775 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3776 return 0;
3777}
3778
3779static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 3569static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3780 unsigned int nr_pages, 3570 unsigned int nr_pages,
3781 const enum charge_type ctype) 3571 const enum charge_type ctype)
@@ -4122,7 +3912,6 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4122 struct mem_cgroup *memcg = NULL; 3912 struct mem_cgroup *memcg = NULL;
4123 unsigned int nr_pages = 1; 3913 unsigned int nr_pages = 1;
4124 struct page_cgroup *pc; 3914 struct page_cgroup *pc;
4125 enum charge_type ctype;
4126 3915
4127 *memcgp = NULL; 3916 *memcgp = NULL;
4128 3917
@@ -4184,16 +3973,12 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4184 * page. In the case new page is migrated but not remapped, new page's 3973 * page. In the case new page is migrated but not remapped, new page's
4185 * mapcount will be finally 0 and we call uncharge in end_migration(). 3974 * mapcount will be finally 0 and we call uncharge in end_migration().
4186 */ 3975 */
4187 if (PageAnon(page))
4188 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4189 else
4190 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4191 /* 3976 /*
4192 * The page is committed to the memcg, but it's not actually 3977 * The page is committed to the memcg, but it's not actually
4193 * charged to the res_counter since we plan on replacing the 3978 * charged to the res_counter since we plan on replacing the
4194 * old one and only one page is going to be left afterwards. 3979 * old one and only one page is going to be left afterwards.
4195 */ 3980 */
4196 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); 3981 commit_charge(newpage, memcg, nr_pages, PageAnon(page), false);
4197} 3982}
4198 3983
4199/* remove redundant charge if migration failed*/ 3984/* remove redundant charge if migration failed*/
@@ -4252,7 +4037,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
4252{ 4037{
4253 struct mem_cgroup *memcg = NULL; 4038 struct mem_cgroup *memcg = NULL;
4254 struct page_cgroup *pc; 4039 struct page_cgroup *pc;
4255 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4256 4040
4257 if (mem_cgroup_disabled()) 4041 if (mem_cgroup_disabled())
4258 return; 4042 return;
@@ -4278,7 +4062,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
4278 * the newpage may be on LRU(or pagevec for LRU) already. We lock 4062 * the newpage may be on LRU(or pagevec for LRU) already. We lock
4279 * LRU while we overwrite pc->mem_cgroup. 4063 * LRU while we overwrite pc->mem_cgroup.
4280 */ 4064 */
4281 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4065 commit_charge(newpage, memcg, 1, false, true);
4282} 4066}
4283 4067
4284#ifdef CONFIG_DEBUG_VM 4068#ifdef CONFIG_DEBUG_VM
@@ -6319,20 +6103,19 @@ static int mem_cgroup_do_precharge(unsigned long count)
6319 int ret; 6103 int ret;
6320 6104
6321 /* Try a single bulk charge without reclaim first */ 6105 /* Try a single bulk charge without reclaim first */
6322 ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 6106 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
6323 if (!ret) { 6107 if (!ret) {
6324 mc.precharge += count; 6108 mc.precharge += count;
6325 return ret; 6109 return ret;
6326 } 6110 }
6327 if (ret == -EINTR) { 6111 if (ret == -EINTR) {
6328 __mem_cgroup_cancel_charge(root_mem_cgroup, count); 6112 cancel_charge(root_mem_cgroup, count);
6329 return ret; 6113 return ret;
6330 } 6114 }
6331 6115
6332 /* Try charges one by one with reclaim */ 6116 /* Try charges one by one with reclaim */
6333 while (count--) { 6117 while (count--) {
6334 ret = mem_cgroup_try_charge(mc.to, 6118 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
6335 GFP_KERNEL & ~__GFP_NORETRY, 1);
6336 /* 6119 /*
6337 * In case of failure, any residual charges against 6120 * In case of failure, any residual charges against
6338 * mc.to will be dropped by mem_cgroup_clear_mc() 6121 * mc.to will be dropped by mem_cgroup_clear_mc()
@@ -6340,7 +6123,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
6340 * bypassed to root right away or they'll be lost. 6123 * bypassed to root right away or they'll be lost.
6341 */ 6124 */
6342 if (ret == -EINTR) 6125 if (ret == -EINTR)
6343 __mem_cgroup_cancel_charge(root_mem_cgroup, 1); 6126 cancel_charge(root_mem_cgroup, 1);
6344 if (ret) 6127 if (ret)
6345 return ret; 6128 return ret;
6346 mc.precharge++; 6129 mc.precharge++;
@@ -6609,7 +6392,7 @@ static void __mem_cgroup_clear_mc(void)
6609 6392
6610 /* we must uncharge all the leftover precharges from mc.to */ 6393 /* we must uncharge all the leftover precharges from mc.to */
6611 if (mc.precharge) { 6394 if (mc.precharge) {
6612 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 6395 cancel_charge(mc.to, mc.precharge);
6613 mc.precharge = 0; 6396 mc.precharge = 0;
6614 } 6397 }
6615 /* 6398 /*
@@ -6617,7 +6400,7 @@ static void __mem_cgroup_clear_mc(void)
6617 * we must uncharge here. 6400 * we must uncharge here.
6618 */ 6401 */
6619 if (mc.moved_charge) { 6402 if (mc.moved_charge) {
6620 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 6403 cancel_charge(mc.from, mc.moved_charge);
6621 mc.moved_charge = 0; 6404 mc.moved_charge = 0;
6622 } 6405 }
6623 /* we must fixup refcnts and charges */ 6406 /* we must fixup refcnts and charges */
@@ -6946,6 +6729,150 @@ static void __init enable_swap_cgroup(void)
6946} 6729}
6947#endif 6730#endif
6948 6731
6732/**
6733 * mem_cgroup_try_charge - try charging a page
6734 * @page: page to charge
6735 * @mm: mm context of the victim
6736 * @gfp_mask: reclaim mode
6737 * @memcgp: charged memcg return
6738 *
6739 * Try to charge @page to the memcg that @mm belongs to, reclaiming
6740 * pages according to @gfp_mask if necessary.
6741 *
6742 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
6743 * Otherwise, an error code is returned.
6744 *
6745 * After page->mapping has been set up, the caller must finalize the
6746 * charge with mem_cgroup_commit_charge(). Or abort the transaction
6747 * with mem_cgroup_cancel_charge() in case page instantiation fails.
6748 */
6749int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6750 gfp_t gfp_mask, struct mem_cgroup **memcgp)
6751{
6752 struct mem_cgroup *memcg = NULL;
6753 unsigned int nr_pages = 1;
6754 int ret = 0;
6755
6756 if (mem_cgroup_disabled())
6757 goto out;
6758
6759 if (PageSwapCache(page)) {
6760 struct page_cgroup *pc = lookup_page_cgroup(page);
6761 /*
6762 * Every swap fault against a single page tries to charge the
6763 * page, bail as early as possible. shmem_unuse() encounters
6764 * already charged pages, too. The USED bit is protected by
6765 * the page lock, which serializes swap cache removal, which
6766 * in turn serializes uncharging.
6767 */
6768 if (PageCgroupUsed(pc))
6769 goto out;
6770 }
6771
6772 if (PageTransHuge(page)) {
6773 nr_pages <<= compound_order(page);
6774 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6775 }
6776
6777 if (do_swap_account && PageSwapCache(page))
6778 memcg = try_get_mem_cgroup_from_page(page);
6779 if (!memcg)
6780 memcg = get_mem_cgroup_from_mm(mm);
6781
6782 ret = try_charge(memcg, gfp_mask, nr_pages);
6783
6784 css_put(&memcg->css);
6785
6786 if (ret == -EINTR) {
6787 memcg = root_mem_cgroup;
6788 ret = 0;
6789 }
6790out:
6791 *memcgp = memcg;
6792 return ret;
6793}
6794
6795/**
6796 * mem_cgroup_commit_charge - commit a page charge
6797 * @page: page to charge
6798 * @memcg: memcg to charge the page to
6799 * @lrucare: page might be on LRU already
6800 *
6801 * Finalize a charge transaction started by mem_cgroup_try_charge(),
6802 * after page->mapping has been set up. This must happen atomically
6803 * as part of the page instantiation, i.e. under the page table lock
6804 * for anonymous pages, under the page lock for page and swap cache.
6805 *
6806 * In addition, the page must not be on the LRU during the commit, to
6807 * prevent racing with task migration. If it might be, use @lrucare.
6808 *
6809 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6810 */
6811void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6812 bool lrucare)
6813{
6814 unsigned int nr_pages = 1;
6815
6816 VM_BUG_ON_PAGE(!page->mapping, page);
6817 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6818
6819 if (mem_cgroup_disabled())
6820 return;
6821 /*
6822 * Swap faults will attempt to charge the same page multiple
6823 * times. But reuse_swap_page() might have removed the page
6824 * from swapcache already, so we can't check PageSwapCache().
6825 */
6826 if (!memcg)
6827 return;
6828
6829 if (PageTransHuge(page)) {
6830 nr_pages <<= compound_order(page);
6831 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6832 }
6833
6834 commit_charge(page, memcg, nr_pages, PageAnon(page), lrucare);
6835
6836 if (do_swap_account && PageSwapCache(page)) {
6837 swp_entry_t entry = { .val = page_private(page) };
6838 /*
6839 * The swap entry might not get freed for a long time,
6840 * let's not wait for it. The page already received a
6841 * memory+swap charge, drop the swap entry duplicate.
6842 */
6843 mem_cgroup_uncharge_swap(entry);
6844 }
6845}
6846
6847/**
6848 * mem_cgroup_cancel_charge - cancel a page charge
6849 * @page: page to charge
6850 * @memcg: memcg to charge the page to
6851 *
6852 * Cancel a charge transaction started by mem_cgroup_try_charge().
6853 */
6854void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6855{
6856 unsigned int nr_pages = 1;
6857
6858 if (mem_cgroup_disabled())
6859 return;
6860 /*
6861 * Swap faults will attempt to charge the same page multiple
6862 * times. But reuse_swap_page() might have removed the page
6863 * from swapcache already, so we can't check PageSwapCache().
6864 */
6865 if (!memcg)
6866 return;
6867
6868 if (PageTransHuge(page)) {
6869 nr_pages <<= compound_order(page);
6870 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6871 }
6872
6873 cancel_charge(memcg, nr_pages);
6874}
6875
6949/* 6876/*
6950 * subsys_initcall() for memory controller. 6877 * subsys_initcall() for memory controller.
6951 * 6878 *
diff --git a/mm/memory.c b/mm/memory.c
index 5c55270729f7..6d7648773dc4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2049,6 +2049,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2049 struct page *dirty_page = NULL; 2049 struct page *dirty_page = NULL;
2050 unsigned long mmun_start = 0; /* For mmu_notifiers */ 2050 unsigned long mmun_start = 0; /* For mmu_notifiers */
2051 unsigned long mmun_end = 0; /* For mmu_notifiers */ 2051 unsigned long mmun_end = 0; /* For mmu_notifiers */
2052 struct mem_cgroup *memcg;
2052 2053
2053 old_page = vm_normal_page(vma, address, orig_pte); 2054 old_page = vm_normal_page(vma, address, orig_pte);
2054 if (!old_page) { 2055 if (!old_page) {
@@ -2204,7 +2205,7 @@ gotten:
2204 } 2205 }
2205 __SetPageUptodate(new_page); 2206 __SetPageUptodate(new_page);
2206 2207
2207 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) 2208 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2208 goto oom_free_new; 2209 goto oom_free_new;
2209 2210
2210 mmun_start = address & PAGE_MASK; 2211 mmun_start = address & PAGE_MASK;
@@ -2234,6 +2235,8 @@ gotten:
2234 */ 2235 */
2235 ptep_clear_flush(vma, address, page_table); 2236 ptep_clear_flush(vma, address, page_table);
2236 page_add_new_anon_rmap(new_page, vma, address); 2237 page_add_new_anon_rmap(new_page, vma, address);
2238 mem_cgroup_commit_charge(new_page, memcg, false);
2239 lru_cache_add_active_or_unevictable(new_page, vma);
2237 /* 2240 /*
2238 * We call the notify macro here because, when using secondary 2241 * We call the notify macro here because, when using secondary
2239 * mmu page tables (such as kvm shadow page tables), we want the 2242 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2271,7 +2274,7 @@ gotten:
2271 new_page = old_page; 2274 new_page = old_page;
2272 ret |= VM_FAULT_WRITE; 2275 ret |= VM_FAULT_WRITE;
2273 } else 2276 } else
2274 mem_cgroup_uncharge_page(new_page); 2277 mem_cgroup_cancel_charge(new_page, memcg);
2275 2278
2276 if (new_page) 2279 if (new_page)
2277 page_cache_release(new_page); 2280 page_cache_release(new_page);
@@ -2410,10 +2413,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2410{ 2413{
2411 spinlock_t *ptl; 2414 spinlock_t *ptl;
2412 struct page *page, *swapcache; 2415 struct page *page, *swapcache;
2416 struct mem_cgroup *memcg;
2413 swp_entry_t entry; 2417 swp_entry_t entry;
2414 pte_t pte; 2418 pte_t pte;
2415 int locked; 2419 int locked;
2416 struct mem_cgroup *ptr;
2417 int exclusive = 0; 2420 int exclusive = 0;
2418 int ret = 0; 2421 int ret = 0;
2419 2422
@@ -2489,7 +2492,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2489 goto out_page; 2492 goto out_page;
2490 } 2493 }
2491 2494
2492 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2495 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
2493 ret = VM_FAULT_OOM; 2496 ret = VM_FAULT_OOM;
2494 goto out_page; 2497 goto out_page;
2495 } 2498 }
@@ -2514,10 +2517,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2514 * while the page is counted on swap but not yet in mapcount i.e. 2517 * while the page is counted on swap but not yet in mapcount i.e.
2515 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 2518 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2516 * must be called after the swap_free(), or it will never succeed. 2519 * must be called after the swap_free(), or it will never succeed.
2517 * Because delete_from_swap_page() may be called by reuse_swap_page(),
2518 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2519 * in page->private. In this case, a record in swap_cgroup is silently
2520 * discarded at swap_free().
2521 */ 2520 */
2522 2521
2523 inc_mm_counter_fast(mm, MM_ANONPAGES); 2522 inc_mm_counter_fast(mm, MM_ANONPAGES);
@@ -2533,12 +2532,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2533 if (pte_swp_soft_dirty(orig_pte)) 2532 if (pte_swp_soft_dirty(orig_pte))
2534 pte = pte_mksoft_dirty(pte); 2533 pte = pte_mksoft_dirty(pte);
2535 set_pte_at(mm, address, page_table, pte); 2534 set_pte_at(mm, address, page_table, pte);
2536 if (page == swapcache) 2535 if (page == swapcache) {
2537 do_page_add_anon_rmap(page, vma, address, exclusive); 2536 do_page_add_anon_rmap(page, vma, address, exclusive);
2538 else /* ksm created a completely new copy */ 2537 mem_cgroup_commit_charge(page, memcg, true);
2538 } else { /* ksm created a completely new copy */
2539 page_add_new_anon_rmap(page, vma, address); 2539 page_add_new_anon_rmap(page, vma, address);
2540 /* It's better to call commit-charge after rmap is established */ 2540 mem_cgroup_commit_charge(page, memcg, false);
2541 mem_cgroup_commit_charge_swapin(page, ptr); 2541 lru_cache_add_active_or_unevictable(page, vma);
2542 }
2542 2543
2543 swap_free(entry); 2544 swap_free(entry);
2544 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2545 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2571,7 +2572,7 @@ unlock:
2571out: 2572out:
2572 return ret; 2573 return ret;
2573out_nomap: 2574out_nomap:
2574 mem_cgroup_cancel_charge_swapin(ptr); 2575 mem_cgroup_cancel_charge(page, memcg);
2575 pte_unmap_unlock(page_table, ptl); 2576 pte_unmap_unlock(page_table, ptl);
2576out_page: 2577out_page:
2577 unlock_page(page); 2578 unlock_page(page);
@@ -2627,6 +2628,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2627 unsigned long address, pte_t *page_table, pmd_t *pmd, 2628 unsigned long address, pte_t *page_table, pmd_t *pmd,
2628 unsigned int flags) 2629 unsigned int flags)
2629{ 2630{
2631 struct mem_cgroup *memcg;
2630 struct page *page; 2632 struct page *page;
2631 spinlock_t *ptl; 2633 spinlock_t *ptl;
2632 pte_t entry; 2634 pte_t entry;
@@ -2660,7 +2662,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2660 */ 2662 */
2661 __SetPageUptodate(page); 2663 __SetPageUptodate(page);
2662 2664
2663 if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) 2665 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2664 goto oom_free_page; 2666 goto oom_free_page;
2665 2667
2666 entry = mk_pte(page, vma->vm_page_prot); 2668 entry = mk_pte(page, vma->vm_page_prot);
@@ -2673,6 +2675,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2673 2675
2674 inc_mm_counter_fast(mm, MM_ANONPAGES); 2676 inc_mm_counter_fast(mm, MM_ANONPAGES);
2675 page_add_new_anon_rmap(page, vma, address); 2677 page_add_new_anon_rmap(page, vma, address);
2678 mem_cgroup_commit_charge(page, memcg, false);
2679 lru_cache_add_active_or_unevictable(page, vma);
2676setpte: 2680setpte:
2677 set_pte_at(mm, address, page_table, entry); 2681 set_pte_at(mm, address, page_table, entry);
2678 2682
@@ -2682,7 +2686,7 @@ unlock:
2682 pte_unmap_unlock(page_table, ptl); 2686 pte_unmap_unlock(page_table, ptl);
2683 return 0; 2687 return 0;
2684release: 2688release:
2685 mem_cgroup_uncharge_page(page); 2689 mem_cgroup_cancel_charge(page, memcg);
2686 page_cache_release(page); 2690 page_cache_release(page);
2687 goto unlock; 2691 goto unlock;
2688oom_free_page: 2692oom_free_page:
@@ -2919,6 +2923,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2919 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2923 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2920{ 2924{
2921 struct page *fault_page, *new_page; 2925 struct page *fault_page, *new_page;
2926 struct mem_cgroup *memcg;
2922 spinlock_t *ptl; 2927 spinlock_t *ptl;
2923 pte_t *pte; 2928 pte_t *pte;
2924 int ret; 2929 int ret;
@@ -2930,7 +2935,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2930 if (!new_page) 2935 if (!new_page)
2931 return VM_FAULT_OOM; 2936 return VM_FAULT_OOM;
2932 2937
2933 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { 2938 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
2934 page_cache_release(new_page); 2939 page_cache_release(new_page);
2935 return VM_FAULT_OOM; 2940 return VM_FAULT_OOM;
2936 } 2941 }
@@ -2950,12 +2955,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2950 goto uncharge_out; 2955 goto uncharge_out;
2951 } 2956 }
2952 do_set_pte(vma, address, new_page, pte, true, true); 2957 do_set_pte(vma, address, new_page, pte, true, true);
2958 mem_cgroup_commit_charge(new_page, memcg, false);
2959 lru_cache_add_active_or_unevictable(new_page, vma);
2953 pte_unmap_unlock(pte, ptl); 2960 pte_unmap_unlock(pte, ptl);
2954 unlock_page(fault_page); 2961 unlock_page(fault_page);
2955 page_cache_release(fault_page); 2962 page_cache_release(fault_page);
2956 return ret; 2963 return ret;
2957uncharge_out: 2964uncharge_out:
2958 mem_cgroup_uncharge_page(new_page); 2965 mem_cgroup_cancel_charge(new_page, memcg);
2959 page_cache_release(new_page); 2966 page_cache_release(new_page);
2960 return ret; 2967 return ret;
2961} 2968}
diff --git a/mm/rmap.c b/mm/rmap.c
index 22a4a7699cdb..f56b5ed78128 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1032,25 +1032,6 @@ void page_add_new_anon_rmap(struct page *page,
1032 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1032 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1033 hpage_nr_pages(page)); 1033 hpage_nr_pages(page));
1034 __page_set_anon_rmap(page, vma, address, 1); 1034 __page_set_anon_rmap(page, vma, address, 1);
1035
1036 VM_BUG_ON_PAGE(PageLRU(page), page);
1037 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
1038 SetPageActive(page);
1039 lru_cache_add(page);
1040 return;
1041 }
1042
1043 if (!TestSetPageMlocked(page)) {
1044 /*
1045 * We use the irq-unsafe __mod_zone_page_stat because this
1046 * counter is not modified from interrupt context, and the pte
1047 * lock is held(spinlock), which implies preemption disabled.
1048 */
1049 __mod_zone_page_state(page_zone(page), NR_MLOCK,
1050 hpage_nr_pages(page));
1051 count_vm_event(UNEVICTABLE_PGMLOCKED);
1052 }
1053 add_page_to_unevictable_list(page);
1054} 1035}
1055 1036
1056/** 1037/**
diff --git a/mm/shmem.c b/mm/shmem.c
index 302d1cf7ad07..1f1a8085538b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -621,7 +621,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
 	if (index == -1)
-		return 0;
+		return -EAGAIN;	/* tell shmem_unuse we found nothing */
 
 	/*
 	 * Move _head_ to start search for next from here.
@@ -680,7 +680,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 			spin_unlock(&info->lock);
 			swap_free(swap);
 		}
-		error = 1;	/* not an error, but entry was found */
 	}
 	return error;
 }
@@ -692,7 +691,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 {
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
-	int found = 0;
+	struct mem_cgroup *memcg;
 	int error = 0;
 
 	/*
@@ -707,26 +706,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
 	 */
-	error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
+	error = -EAGAIN;
 
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, &page);
+			error = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
-		if (found)
+		if (error != -EAGAIN)
 			break;
+		/* found nothing in this: move on to search the next */
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (found < 0)
-		error = found;
+	if (error) {
+		if (error != -ENOMEM)
+			error = 0;
+		mem_cgroup_cancel_charge(page, memcg);
+	} else
+		mem_cgroup_commit_charge(page, memcg, true);
 out:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1030,6 +1035,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
+	struct mem_cgroup *memcg;
 	struct page *page;
 	swp_entry_t swap;
 	int error;
@@ -1108,8 +1114,7 @@ repeat:
 				goto failed;
 		}
 
-		error = mem_cgroup_charge_file(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						swp_to_radix_entry(swap));
@@ -1125,12 +1130,16 @@ repeat:
 			 * Reset swap.val? No, leave it so "failed" goes back to
 			 * "repeat": reading a hole and writing should succeed.
 			 */
-			if (error)
+			if (error) {
+				mem_cgroup_cancel_charge(page, memcg);
 				delete_from_swap_cache(page);
+			}
 		}
 		if (error)
 			goto failed;
 
+		mem_cgroup_commit_charge(page, memcg, true);
+
 		spin_lock(&info->lock);
 		info->swapped--;
 		shmem_recalc_inode(inode);
@@ -1168,8 +1177,7 @@ repeat:
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
-		error = mem_cgroup_charge_file(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (error)
 			goto decused;
 		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1179,9 +1187,10 @@ repeat:
 			radix_tree_preload_end();
 		}
 		if (error) {
-			mem_cgroup_uncharge_cache_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			goto decused;
 		}
+		mem_cgroup_commit_charge(page, memcg, false);
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
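
Both values of mem_cgroup_commit_charge()'s third argument appear in the shmem hunks above: true on the swap-in paths, where the page comes out of the swap cache and may already sit on the LRU, and false for a freshly allocated page that only reaches the LRU via lru_cache_add_anon() afterwards. The snippet below is an illustration only; shmem_commit_sketch() and its flag are hypothetical, the two call shapes are the ones visible in this file.

/* Illustration of the two commit flavours used by shmem above. */
static void shmem_commit_sketch(struct page *page, struct mem_cgroup *memcg,
				bool from_swap_cache)
{
	if (from_swap_cache) {
		/* Page may already be on the LRU: commit has to cope with that. */
		mem_cgroup_commit_charge(page, memcg, true);
	} else {
		/* Brand-new page: commit first, only then expose it to the LRU. */
		mem_cgroup_commit_charge(page, memcg, false);
		lru_cache_add_anon(page);
	}
}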
diff --git a/mm/swap.c b/mm/swap.c
index c789d01c9ec3..3baca701bb78 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -687,6 +687,40 @@ void add_page_to_unevictable_list(struct page *page)
 	spin_unlock_irq(&zone->lru_lock);
 }
 
+/**
+ * lru_cache_add_active_or_unevictable
+ * @page: the page to be added to LRU
+ * @vma: vma in which page is mapped for determining reclaimability
+ *
+ * Place @page on the active or unevictable LRU list, depending on its
+ * evictability. Note that if the page is not evictable, it goes
+ * directly back onto it's zone's unevictable list, it does NOT use a
+ * per cpu pagevec.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+					 struct vm_area_struct *vma)
+{
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+		SetPageActive(page);
+		lru_cache_add(page);
+		return;
+	}
+
+	if (!TestSetPageMlocked(page)) {
+		/*
+		 * We use the irq-unsafe __mod_zone_page_stat because this
+		 * counter is not modified from interrupt context, and the pte
+		 * lock is held(spinlock), which implies preemption disabled.
+		 */
+		__mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
+	add_page_to_unevictable_list(page);
+}
+
 /*
  * If the page can not be invalidated, it is moved to the
  * inactive list to speed up its reclaim.  It is moved to the
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4c524f7bd0bf..0883b4912ff7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1106,15 +1106,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	if (unlikely(!page))
 		return -ENOMEM;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
-					 GFP_KERNEL, &memcg)) {
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
 		ret = -ENOMEM;
 		goto out_nolock;
 	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
-		mem_cgroup_cancel_charge_swapin(memcg);
+		mem_cgroup_cancel_charge(page, memcg);
 		ret = 0;
 		goto out;
 	}
@@ -1124,11 +1123,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-	if (page == swapcache)
+	if (page == swapcache) {
 		page_add_anon_rmap(page, vma, addr);
-	else /* ksm created a completely new copy */
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr);
-	mem_cgroup_commit_charge_swapin(page, memcg);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not