author     Johannes Weiner <hannes@cmpxchg.org>            2014-08-08 17:19:22 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-08-08 18:57:17 -0400
commit     0a31bc97c80c3fa87b32c091d9a930ac19cd0c40 (patch)
tree       06dafd237309f9b8ded980eb420a5377989e2c0b
parent     00501b531c4723972aa11d6d4ebcf8d6552007c8 (diff)
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.

Because anonymous and file pages were always charged before they had
their page->mapping established, uncharges had to happen when the page
type could still be known from the context; as in unmap for anonymous,
page cache removal for file and shmem pages, and swap cache truncation
for swap pages.

However, these operations happen well before the page is actually freed,
and so a lot of synchronization is necessary:

- Charging, uncharging, page migration, and charge migration all need
  to take a per-page bit spinlock as they could race with uncharging.

- Swap cache truncation happens during both swap-in and swap-out, and
  possibly repeatedly before the page is actually freed.  This means
  that the memcg swapout code is called from many contexts that make
  no sense and it has to figure out the direction from page state to
  make sure memory and memory+swap are always correctly charged.

- On page migration, the old page might be unmapped but then reused,
  so memcg code has to prevent untimely uncharging in that case.
  Because this code - which should be a simple charge transfer - is so
  special-cased, it is not reusable for replace_page_cache().

But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when
we know for sure that nobody is looking at the page anymore.

For page migration, introduce mem_cgroup_migrate(), which is called
after the migration is successful and the new page is fully rmapped.
Because the old page is no longer uncharged after migration, prevent
double charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge.  The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are
transferred to the new page during migration.

mem_cgroup_migrate() is suitable for replace_page_cache() as well, which
gets rid of mem_cgroup_replace_page_cache().  However, care needs to be
taken because both the source and the target page can already be charged
and on the LRU when fuse is splicing: grab the page lock on the charge
moving side to prevent changing pc->mem_cgroup of a page under
migration.  Also, the lruvecs of both pages change as we uncharge the
old and charge the new during migration, and putback may race with us,
so grab the lru lock and isolate the pages iff on LRU to prevent races
and ensure the pages are on the right lruvec afterward.

Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout()
can transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.

Finally, page_cgroup changes are now protected by whatever protection
the page itself offers: anonymous pages are charged under the page table
lock, whereas page cache insertions, swapin, and migration hold the page
lock.  Uncharging happens under full exclusion with no outstanding
references.  Charging and uncharging also ensure that the page is
off-LRU, which serializes against charge migration.  Remove the very
costly page_cgroup lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  Documentation/cgroups/memcg_test.txt  128
-rw-r--r--  include/linux/memcontrol.h             49
-rw-r--r--  include/linux/page_cgroup.h            43
-rw-r--r--  include/linux/swap.h                   12
-rw-r--r--  mm/filemap.c                            4
-rw-r--r--  mm/memcontrol.c                       828
-rw-r--r--  mm/memory.c                             2
-rw-r--r--  mm/migrate.c                           38
-rw-r--r--  mm/rmap.c                               1
-rw-r--r--  mm/shmem.c                              8
-rw-r--r--  mm/swap.c                               6
-rw-r--r--  mm/swap_state.c                         8
-rw-r--r--  mm/swapfile.c                           7
-rw-r--r--  mm/truncate.c                           9
-rw-r--r--  mm/vmscan.c                            12
-rw-r--r--  mm/zswap.c                              2
16 files changed, 389 insertions, 768 deletions
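
For orientation before the per-file hunks: the memcg page-accounting entry
points as they look after this patch, condensed from the header changes
below.  This summary is an editor's illustration, not part of the patch;
mem_cgroup_try_charge()'s signature comes from the parent "rewrite charge
API" commit, and the comments paraphrase the commit message.

    /* Charging is two-step (try, then commit or cancel): */
    int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                              gfp_t gfp_mask, struct mem_cgroup **memcgp);
    void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
                                  bool lrucare);
    void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);

    /* One uncharge hook, called after the final put_page() instead of from
     * unmap, page cache deletion, and swap cache truncation: */
    void mem_cgroup_uncharge(struct page *page);

    /* Coalesce res_counter uncharges around bulk unmap/truncate loops: */
    void mem_cgroup_uncharge_start(void);
    void mem_cgroup_uncharge_end(void);

    /* Transfer pc->mem_cgroup and the PCG_MEM/PCG_MEMSW charges to the new
     * page once migration has succeeded and the new page is rmapped: */
    void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
                            bool lrucare);

    /* Hand the memory+swap charge (PCG_MEMSW) over to the swap entry before
     * page reclaim drops the last reference: */
    void mem_cgroup_swapout(struct page *page, swp_entry_t entry);

    /* Drop the memsw charge when the swap entry itself is freed: */
    void mem_cgroup_uncharge_swap(swp_entry_t entry);
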
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index bcf750d3cecd..8870b0212150 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -29,28 +29,13 @@ Please note that implementation details can be changed.
292. Uncharge 292. Uncharge
30 a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by 30 a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
31 31
32 mem_cgroup_uncharge_page() 32 mem_cgroup_uncharge()
33 Called when an anonymous page is fully unmapped. I.e., mapcount goes 33 Called when a page's refcount goes down to 0.
34 to 0. If the page is SwapCache, uncharge is delayed until
35 mem_cgroup_uncharge_swapcache().
36
37 mem_cgroup_uncharge_cache_page()
38 Called when a page-cache is deleted from radix-tree. If the page is
39 SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache().
40
41 mem_cgroup_uncharge_swapcache()
42 Called when SwapCache is removed from radix-tree. The charge itself
43 is moved to swap_cgroup. (If mem+swap controller is disabled, no
44 charge to swap occurs.)
45 34
46 mem_cgroup_uncharge_swap() 35 mem_cgroup_uncharge_swap()
47 Called when swp_entry's refcnt goes down to 0. A charge against swap 36 Called when swp_entry's refcnt goes down to 0. A charge against swap
48 disappears. 37 disappears.
49 38
50 mem_cgroup_end_migration(old, new)
51 At success of migration old is uncharged (if necessary), a charge
52 to new page is committed. At failure, charge to old page is committed.
53
543. charge-commit-cancel 393. charge-commit-cancel
55 Memcg pages are charged in two steps: 40 Memcg pages are charged in two steps:
56 mem_cgroup_try_charge() 41 mem_cgroup_try_charge()
@@ -69,18 +54,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
69 Anonymous page is newly allocated at 54 Anonymous page is newly allocated at
70 - page fault into MAP_ANONYMOUS mapping. 55 - page fault into MAP_ANONYMOUS mapping.
71 - Copy-On-Write. 56 - Copy-On-Write.
72 It is charged right after it's allocated before doing any page table
73 related operations. Of course, it's uncharged when another page is used
74 for the fault address.
75
76 At freeing anonymous page (by exit() or munmap()), zap_pte() is called
77 and pages for ptes are freed one by one.(see mm/memory.c). Uncharges
78 are done at page_remove_rmap() when page_mapcount() goes down to 0.
79
80 Another page freeing is by page-reclaim (vmscan.c) and anonymous
81 pages are swapped out. In this case, the page is marked as
82 PageSwapCache(). uncharge() routine doesn't uncharge the page marked
83 as SwapCache(). It's delayed until __delete_from_swap_cache().
84 57
85 4.1 Swap-in. 58 4.1 Swap-in.
86 At swap-in, the page is taken from swap-cache. There are 2 cases. 59 At swap-in, the page is taken from swap-cache. There are 2 cases.
@@ -89,41 +62,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
89 (b) If the SwapCache has been mapped by processes, it has been 62 (b) If the SwapCache has been mapped by processes, it has been
90 charged already. 63 charged already.
91 64
92 This swap-in is one of the most complicated work. In do_swap_page(),
93 following events occur when pte is unchanged.
94
95 (1) the page (SwapCache) is looked up.
96 (2) lock_page()
97 (3) try_charge_swapin()
98 (4) reuse_swap_page() (may call delete_swap_cache())
99 (5) commit_charge_swapin()
100 (6) swap_free().
101
102 Considering following situation for example.
103
104 (A) The page has not been charged before (2) and reuse_swap_page()
105 doesn't call delete_from_swap_cache().
106 (B) The page has not been charged before (2) and reuse_swap_page()
107 calls delete_from_swap_cache().
108 (C) The page has been charged before (2) and reuse_swap_page() doesn't
109 call delete_from_swap_cache().
110 (D) The page has been charged before (2) and reuse_swap_page() calls
111 delete_from_swap_cache().
112
113 memory.usage/memsw.usage changes to this page/swp_entry will be
114 Case (A) (B) (C) (D)
115 Event
116 Before (2) 0/ 1 0/ 1 1/ 1 1/ 1
117 ===========================================
118 (3) +1/+1 +1/+1 +1/+1 +1/+1
119 (4) - 0/ 0 - -1/ 0
120 (5) 0/-1 0/ 0 -1/-1 0/ 0
121 (6) - 0/-1 - 0/-1
122 ===========================================
123 Result 1/ 1 1/ 1 1/ 1 1/ 1
124
125 In any cases, charges to this page should be 1/ 1.
126
127 4.2 Swap-out. 65 4.2 Swap-out.
128 At swap-out, typical state transition is below. 66 At swap-out, typical state transition is below.
129 67
@@ -136,28 +74,20 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
136 swp_entry's refcnt -= 1. 74 swp_entry's refcnt -= 1.
137 75
138 76
139 At (b), the page is marked as SwapCache and not uncharged.
140 At (d), the page is removed from SwapCache and a charge in page_cgroup
141 is moved to swap_cgroup.
142
143 Finally, at task exit, 77 Finally, at task exit,
144 (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. 78 (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
145 Here, a charge in swap_cgroup disappears.
146 79
1475. Page Cache 805. Page Cache
148 Page Cache is charged at 81 Page Cache is charged at
149 - add_to_page_cache_locked(). 82 - add_to_page_cache_locked().
150 83
151 uncharged at
152 - __remove_from_page_cache().
153
154 The logic is very clear. (About migration, see below) 84 The logic is very clear. (About migration, see below)
155 Note: __remove_from_page_cache() is called by remove_from_page_cache() 85 Note: __remove_from_page_cache() is called by remove_from_page_cache()
156 and __remove_mapping(). 86 and __remove_mapping().
157 87
1586. Shmem(tmpfs) Page Cache 886. Shmem(tmpfs) Page Cache
159 Memcg's charge/uncharge have special handlers of shmem. The best way 89 The best way to understand shmem's page state transition is to read
160 to understand shmem's page state transition is to read mm/shmem.c. 90 mm/shmem.c.
161 But brief explanation of the behavior of memcg around shmem will be 91 But brief explanation of the behavior of memcg around shmem will be
162 helpful to understand the logic. 92 helpful to understand the logic.
163 93
@@ -170,56 +100,10 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
170 It's charged when... 100 It's charged when...
171 - A new page is added to shmem's radix-tree. 101 - A new page is added to shmem's radix-tree.
172 - A swp page is read. (move a charge from swap_cgroup to page_cgroup) 102 - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
173 It's uncharged when
174 - A page is removed from radix-tree and not SwapCache.
175 - When SwapCache is removed, a charge is moved to swap_cgroup.
176 - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup
177 disappears.
178 103
1797. Page Migration 1047. Page Migration
180 One of the most complicated functions is page-migration-handler. 105
181 Memcg has 2 routines. Assume that we are migrating a page's contents 106 mem_cgroup_migrate()
182 from OLDPAGE to NEWPAGE.
183
184 Usual migration logic is..
185 (a) remove the page from LRU.
186 (b) allocate NEWPAGE (migration target)
187 (c) lock by lock_page().
188 (d) unmap all mappings.
189 (e-1) If necessary, replace entry in radix-tree.
190 (e-2) move contents of a page.
191 (f) map all mappings again.
192 (g) pushback the page to LRU.
193 (-) OLDPAGE will be freed.
194
195 Before (g), memcg should complete all necessary charge/uncharge to
196 NEWPAGE/OLDPAGE.
197
198 The point is....
199 - If OLDPAGE is anonymous, all charges will be dropped at (d) because
200 try_to_unmap() drops all mapcount and the page will not be
201 SwapCache.
202
203 - If OLDPAGE is SwapCache, charges will be kept at (g) because
204 __delete_from_swap_cache() isn't called at (e-1)
205
206 - If OLDPAGE is page-cache, charges will be kept at (g) because
207 remove_from_swap_cache() isn't called at (e-1)
208
209 memcg provides following hooks.
210
211 - mem_cgroup_prepare_migration(OLDPAGE)
212 Called after (b) to account a charge (usage += PAGE_SIZE) against
213 memcg which OLDPAGE belongs to.
214
215 - mem_cgroup_end_migration(OLDPAGE, NEWPAGE)
216 Called after (f) before (g).
217 If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already
218 charged, a charge by prepare_migration() is automatically canceled.
219 If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE.
220
221 But zap_pte() (by exit or munmap) can be called while migration,
222 we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
223 107
2248. LRU 1088. LRU
225 Each memcg has its own private LRU. Now, its handling is under global 109 Each memcg has its own private LRU. Now, its handling is under global
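
The updated section 3 above still describes charging as a two-step
try/commit, with cancel on failure.  A minimal sketch of that sequence
follows; example_add_page() and insert_page_somewhere() are hypothetical
names used for illustration, not call sites from this patch.

    #include <linux/memcontrol.h>
    #include <linux/mm.h>

    static int insert_page_somewhere(struct page *page); /* hypothetical */

    /* Hypothetical caller illustrating charge-commit-cancel. */
    static int example_add_page(struct page *page, struct mm_struct *mm,
                                gfp_t gfp)
    {
            struct mem_cgroup *memcg;
            int err;

            err = mem_cgroup_try_charge(page, mm, gfp, &memcg);
            if (err)
                    return err;     /* charge would exceed the limit */

            err = insert_page_somewhere(page);  /* rmap or radix-tree insertion */
            if (err) {
                    mem_cgroup_cancel_charge(page, memcg); /* roll the charge back */
                    return err;
            }

            /* Bind pc->mem_cgroup and set PCG_USED/PCG_MEM(/PCG_MEMSW). */
            mem_cgroup_commit_charge(page, memcg, false);
            return 0;
    }
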
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1a9a096858e0..806b8fa15c5f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -60,15 +60,17 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
60 bool lrucare); 60 bool lrucare);
61void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); 61void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
62 62
63struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 63void mem_cgroup_uncharge(struct page *page);
64struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 64
65/* Batched uncharging */
66void mem_cgroup_uncharge_start(void);
67void mem_cgroup_uncharge_end(void);
65 68
66/* For coalescing uncharge for reducing memcg' overhead*/ 69void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
67extern void mem_cgroup_uncharge_start(void); 70 bool lrucare);
68extern void mem_cgroup_uncharge_end(void);
69 71
70extern void mem_cgroup_uncharge_page(struct page *page); 72struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
71extern void mem_cgroup_uncharge_cache_page(struct page *page); 73struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
72 74
73bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 75bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
74 struct mem_cgroup *memcg); 76 struct mem_cgroup *memcg);
@@ -96,12 +98,6 @@ bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
96 98
97extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); 99extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
98 100
99extern void
100mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
101 struct mem_cgroup **memcgp);
102extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
103 struct page *oldpage, struct page *newpage, bool migration_ok);
104
105struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 101struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
106 struct mem_cgroup *, 102 struct mem_cgroup *,
107 struct mem_cgroup_reclaim_cookie *); 103 struct mem_cgroup_reclaim_cookie *);
@@ -116,8 +112,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
116void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); 112void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
117extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 113extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
118 struct task_struct *p); 114 struct task_struct *p);
119extern void mem_cgroup_replace_page_cache(struct page *oldpage,
120 struct page *newpage);
121 115
122static inline void mem_cgroup_oom_enable(void) 116static inline void mem_cgroup_oom_enable(void)
123{ 117{
@@ -235,19 +229,21 @@ static inline void mem_cgroup_cancel_charge(struct page *page,
235{ 229{
236} 230}
237 231
238static inline void mem_cgroup_uncharge_start(void) 232static inline void mem_cgroup_uncharge(struct page *page)
239{ 233{
240} 234}
241 235
242static inline void mem_cgroup_uncharge_end(void) 236static inline void mem_cgroup_uncharge_start(void)
243{ 237{
244} 238}
245 239
246static inline void mem_cgroup_uncharge_page(struct page *page) 240static inline void mem_cgroup_uncharge_end(void)
247{ 241{
248} 242}
249 243
250static inline void mem_cgroup_uncharge_cache_page(struct page *page) 244static inline void mem_cgroup_migrate(struct page *oldpage,
245 struct page *newpage,
246 bool lrucare)
251{ 247{
252} 248}
253 249
@@ -286,17 +282,6 @@ static inline struct cgroup_subsys_state
286 return NULL; 282 return NULL;
287} 283}
288 284
289static inline void
290mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
291 struct mem_cgroup **memcgp)
292{
293}
294
295static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
296 struct page *oldpage, struct page *newpage, bool migration_ok)
297{
298}
299
300static inline struct mem_cgroup * 285static inline struct mem_cgroup *
301mem_cgroup_iter(struct mem_cgroup *root, 286mem_cgroup_iter(struct mem_cgroup *root,
302 struct mem_cgroup *prev, 287 struct mem_cgroup *prev,
@@ -392,10 +377,6 @@ static inline
392void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 377void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
393{ 378{
394} 379}
395static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
396 struct page *newpage)
397{
398}
399#endif /* CONFIG_MEMCG */ 380#endif /* CONFIG_MEMCG */
400 381
401#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) 382#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
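
The batched uncharge pair kept in the header above is meant to bracket
bulk-release loops (unmap, truncate, invalidate) so the res_counter is
updated once per batch rather than once per page.  A sketch, assuming a
hypothetical helper; only the start()/end() bracketing reflects the API.

    #include <linux/memcontrol.h>
    #include <linux/mm.h>

    /* Hypothetical bulk release, bracketed by the batching API. */
    static void example_release_pages(struct page **pages, int nr)
    {
            int i;

            mem_cgroup_uncharge_start();
            for (i = 0; i < nr; i++) {
                    /* ...page already detached from its mapping... */
                    put_page(pages[i]);     /* final put; the uncharge itself runs
                                             * on the freeing side via
                                             * mem_cgroup_uncharge() */
            }
            mem_cgroup_uncharge_end();      /* flush coalesced res_counter uncharges */
    }
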
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 777a524716db..9bfb8e68a595 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -3,9 +3,9 @@
3 3
4enum { 4enum {
5 /* flags for mem_cgroup */ 5 /* flags for mem_cgroup */
6 PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ 6 PCG_USED = 0x01, /* This page is charged to a memcg */
7 PCG_USED, /* this object is in use. */ 7 PCG_MEM = 0x02, /* This page holds a memory charge */
8 PCG_MIGRATION, /* under page migration */ 8 PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */
9 __NR_PCG_FLAGS, 9 __NR_PCG_FLAGS,
10}; 10};
11 11
@@ -44,42 +44,9 @@ static inline void __init page_cgroup_init(void)
44struct page_cgroup *lookup_page_cgroup(struct page *page); 44struct page_cgroup *lookup_page_cgroup(struct page *page);
45struct page *lookup_cgroup_page(struct page_cgroup *pc); 45struct page *lookup_cgroup_page(struct page_cgroup *pc);
46 46
47#define TESTPCGFLAG(uname, lname) \ 47static inline int PageCgroupUsed(struct page_cgroup *pc)
48static inline int PageCgroup##uname(struct page_cgroup *pc) \
49 { return test_bit(PCG_##lname, &pc->flags); }
50
51#define SETPCGFLAG(uname, lname) \
52static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
53 { set_bit(PCG_##lname, &pc->flags); }
54
55#define CLEARPCGFLAG(uname, lname) \
56static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
57 { clear_bit(PCG_##lname, &pc->flags); }
58
59#define TESTCLEARPCGFLAG(uname, lname) \
60static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
61 { return test_and_clear_bit(PCG_##lname, &pc->flags); }
62
63TESTPCGFLAG(Used, USED)
64CLEARPCGFLAG(Used, USED)
65SETPCGFLAG(Used, USED)
66
67SETPCGFLAG(Migration, MIGRATION)
68CLEARPCGFLAG(Migration, MIGRATION)
69TESTPCGFLAG(Migration, MIGRATION)
70
71static inline void lock_page_cgroup(struct page_cgroup *pc)
72{
73 /*
74 * Don't take this lock in IRQ context.
75 * This lock is for pc->mem_cgroup, USED, MIGRATION
76 */
77 bit_spin_lock(PCG_LOCK, &pc->flags);
78}
79
80static inline void unlock_page_cgroup(struct page_cgroup *pc)
81{ 48{
82 bit_spin_unlock(PCG_LOCK, &pc->flags); 49 return !!(pc->flags & PCG_USED);
83} 50}
84 51
85#else /* CONFIG_MEMCG */ 52#else /* CONFIG_MEMCG */
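
With PCG_LOCK and PCG_MIGRATION gone, pc->flags is a plain word written
under whatever exclusion the page itself provides.  For orientation, the
two writers of the new bits, excerpted from the mm/memcontrol.c hunks
further down:

    /* commit_charge(): associate and charge in one plain store */
    pc->mem_cgroup = memcg;
    pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);

    /* mem_cgroup_swapout(): the memory+swap charge moves to the swap entry */
    pc->flags &= ~PCG_MEMSW;
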
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46a649e4e8cd..1b72060f093a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -381,9 +381,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
381} 381}
382#endif 382#endif
383#ifdef CONFIG_MEMCG_SWAP 383#ifdef CONFIG_MEMCG_SWAP
384extern void mem_cgroup_uncharge_swap(swp_entry_t ent); 384extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
385extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
385#else 386#else
386static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) 387static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
388{
389}
390static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
387{ 391{
388} 392}
389#endif 393#endif
@@ -443,7 +447,7 @@ extern void swap_shmem_alloc(swp_entry_t);
443extern int swap_duplicate(swp_entry_t); 447extern int swap_duplicate(swp_entry_t);
444extern int swapcache_prepare(swp_entry_t); 448extern int swapcache_prepare(swp_entry_t);
445extern void swap_free(swp_entry_t); 449extern void swap_free(swp_entry_t);
446extern void swapcache_free(swp_entry_t, struct page *page); 450extern void swapcache_free(swp_entry_t);
447extern int free_swap_and_cache(swp_entry_t); 451extern int free_swap_and_cache(swp_entry_t);
448extern int swap_type_of(dev_t, sector_t, struct block_device **); 452extern int swap_type_of(dev_t, sector_t, struct block_device **);
449extern unsigned int count_swap_pages(int, int); 453extern unsigned int count_swap_pages(int, int);
@@ -507,7 +511,7 @@ static inline void swap_free(swp_entry_t swp)
507{ 511{
508} 512}
509 513
510static inline void swapcache_free(swp_entry_t swp, struct page *page) 514static inline void swapcache_free(swp_entry_t swp)
511{ 515{
512} 516}
513 517
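
mm/vmscan.c and mm/swap_state.c appear in the diffstat but not in this
excerpt; they are where the new mem_cgroup_swapout() hook and the narrowed
swapcache_free() meet.  The sketch below paraphrases that ordering from the
commit message; it is illustrative, not a verbatim hunk.

    #include <linux/mm.h>
    #include <linux/swap.h>

    /* Illustrative reclaim-side ordering for a page in swap cache. */
    static void example_reclaim_swapcache(struct page *page)
    {
            swp_entry_t entry = { .val = page_private(page) };

            mem_cgroup_swapout(page, entry); /* PCG_MEMSW -> swap_cgroup record */
            __delete_from_swap_cache(page);
            swapcache_free(entry);           /* no struct page argument anymore */

            /* The memory charge (PCG_MEM) stays with the page and is dropped
             * by mem_cgroup_uncharge() after the final put_page(). */
    }
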
diff --git a/mm/filemap.c b/mm/filemap.c
index 349a40e35545..f501b56ec2c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -234,7 +234,6 @@ void delete_from_page_cache(struct page *page)
234 spin_lock_irq(&mapping->tree_lock); 234 spin_lock_irq(&mapping->tree_lock);
235 __delete_from_page_cache(page, NULL); 235 __delete_from_page_cache(page, NULL);
236 spin_unlock_irq(&mapping->tree_lock); 236 spin_unlock_irq(&mapping->tree_lock);
237 mem_cgroup_uncharge_cache_page(page);
238 237
239 if (freepage) 238 if (freepage)
240 freepage(page); 239 freepage(page);
@@ -490,8 +489,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
490 if (PageSwapBacked(new)) 489 if (PageSwapBacked(new))
491 __inc_zone_page_state(new, NR_SHMEM); 490 __inc_zone_page_state(new, NR_SHMEM);
492 spin_unlock_irq(&mapping->tree_lock); 491 spin_unlock_irq(&mapping->tree_lock);
493 /* mem_cgroup codes must not be called under tree_lock */ 492 mem_cgroup_migrate(old, new, true);
494 mem_cgroup_replace_page_cache(old, new);
495 radix_tree_preload_end(); 493 radix_tree_preload_end();
496 if (freepage) 494 if (freepage)
497 freepage(old); 495 freepage(old);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1cbe1e54ff5f..9106f1b12f56 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -754,9 +754,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
754static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 754static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
755 struct mem_cgroup_tree_per_zone *mctz) 755 struct mem_cgroup_tree_per_zone *mctz)
756{ 756{
757 spin_lock(&mctz->lock); 757 unsigned long flags;
758
759 spin_lock_irqsave(&mctz->lock, flags);
758 __mem_cgroup_remove_exceeded(mz, mctz); 760 __mem_cgroup_remove_exceeded(mz, mctz);
759 spin_unlock(&mctz->lock); 761 spin_unlock_irqrestore(&mctz->lock, flags);
760} 762}
761 763
762 764
@@ -779,7 +781,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
779 * mem is over its softlimit. 781 * mem is over its softlimit.
780 */ 782 */
781 if (excess || mz->on_tree) { 783 if (excess || mz->on_tree) {
782 spin_lock(&mctz->lock); 784 unsigned long flags;
785
786 spin_lock_irqsave(&mctz->lock, flags);
783 /* if on-tree, remove it */ 787 /* if on-tree, remove it */
784 if (mz->on_tree) 788 if (mz->on_tree)
785 __mem_cgroup_remove_exceeded(mz, mctz); 789 __mem_cgroup_remove_exceeded(mz, mctz);
@@ -788,7 +792,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
788 * If excess is 0, no tree ops. 792 * If excess is 0, no tree ops.
789 */ 793 */
790 __mem_cgroup_insert_exceeded(mz, mctz, excess); 794 __mem_cgroup_insert_exceeded(mz, mctz, excess);
791 spin_unlock(&mctz->lock); 795 spin_unlock_irqrestore(&mctz->lock, flags);
792 } 796 }
793 } 797 }
794} 798}
@@ -839,9 +843,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
839{ 843{
840 struct mem_cgroup_per_zone *mz; 844 struct mem_cgroup_per_zone *mz;
841 845
842 spin_lock(&mctz->lock); 846 spin_lock_irq(&mctz->lock);
843 mz = __mem_cgroup_largest_soft_limit_node(mctz); 847 mz = __mem_cgroup_largest_soft_limit_node(mctz);
844 spin_unlock(&mctz->lock); 848 spin_unlock_irq(&mctz->lock);
845 return mz; 849 return mz;
846} 850}
847 851
@@ -882,13 +886,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
882 return val; 886 return val;
883} 887}
884 888
885static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
886 bool charge)
887{
888 int val = (charge) ? 1 : -1;
889 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
890}
891
892static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 889static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
893 enum mem_cgroup_events_index idx) 890 enum mem_cgroup_events_index idx)
894{ 891{
@@ -909,13 +906,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
909 906
910static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 907static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
911 struct page *page, 908 struct page *page,
912 bool anon, int nr_pages) 909 int nr_pages)
913{ 910{
914 /* 911 /*
915 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 912 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
916 * counted as CACHE even if it's on ANON LRU. 913 * counted as CACHE even if it's on ANON LRU.
917 */ 914 */
918 if (anon) 915 if (PageAnon(page))
919 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 916 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
920 nr_pages); 917 nr_pages);
921 else 918 else
@@ -1013,7 +1010,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1013 */ 1010 */
1014static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1011static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1015{ 1012{
1016 preempt_disable();
1017 /* threshold event is triggered in finer grain than soft limit */ 1013 /* threshold event is triggered in finer grain than soft limit */
1018 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1014 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1019 MEM_CGROUP_TARGET_THRESH))) { 1015 MEM_CGROUP_TARGET_THRESH))) {
@@ -1026,8 +1022,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1026 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1022 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1027 MEM_CGROUP_TARGET_NUMAINFO); 1023 MEM_CGROUP_TARGET_NUMAINFO);
1028#endif 1024#endif
1029 preempt_enable();
1030
1031 mem_cgroup_threshold(memcg); 1025 mem_cgroup_threshold(memcg);
1032 if (unlikely(do_softlimit)) 1026 if (unlikely(do_softlimit))
1033 mem_cgroup_update_tree(memcg, page); 1027 mem_cgroup_update_tree(memcg, page);
@@ -1035,8 +1029,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1035 if (unlikely(do_numainfo)) 1029 if (unlikely(do_numainfo))
1036 atomic_inc(&memcg->numainfo_events); 1030 atomic_inc(&memcg->numainfo_events);
1037#endif 1031#endif
1038 } else 1032 }
1039 preempt_enable();
1040} 1033}
1041 1034
1042struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1035struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -1347,20 +1340,6 @@ out:
1347 return lruvec; 1340 return lruvec;
1348} 1341}
1349 1342
1350/*
1351 * Following LRU functions are allowed to be used without PCG_LOCK.
1352 * Operations are called by routine of global LRU independently from memcg.
1353 * What we have to take care of here is validness of pc->mem_cgroup.
1354 *
1355 * Changes to pc->mem_cgroup happens when
1356 * 1. charge
1357 * 2. moving account
1358 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
1359 * It is added to LRU before charge.
1360 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1361 * When moving account, the page is not on LRU. It's isolated.
1362 */
1363
1364/** 1343/**
1365 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1344 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1366 * @page: the page 1345 * @page: the page
@@ -2261,22 +2240,14 @@ cleanup:
2261 * 2240 *
2262 * Notes: Race condition 2241 * Notes: Race condition
2263 * 2242 *
2264 * We usually use lock_page_cgroup() for accessing page_cgroup member but 2243 * Charging occurs during page instantiation, while the page is
2265 * it tends to be costly. But considering some conditions, we doesn't need 2244 * unmapped and locked in page migration, or while the page table is
2266 * to do so _always_. 2245 * locked in THP migration. No race is possible.
2267 *
2268 * Considering "charge", lock_page_cgroup() is not required because all
2269 * file-stat operations happen after a page is attached to radix-tree. There
2270 * are no race with "charge".
2271 * 2246 *
2272 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2247 * Uncharge happens to pages with zero references, no race possible.
2273 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
2274 * if there are race with "uncharge". Statistics itself is properly handled
2275 * by flags.
2276 * 2248 *
2277 * Considering "move", this is an only case we see a race. To make the race 2249 * Charge moving between groups is protected by checking mm->moving
2278 * small, we check memcg->moving_account and detect there are possibility 2250 * account and taking the move_lock in the slowpath.
2279 * of race or not. If there is, we take a lock.
2280 */ 2251 */
2281 2252
2282void __mem_cgroup_begin_update_page_stat(struct page *page, 2253void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2689,6 +2660,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2689 return mem_cgroup_from_id(id); 2660 return mem_cgroup_from_id(id);
2690} 2661}
2691 2662
2663/*
2664 * try_get_mem_cgroup_from_page - look up page's memcg association
2665 * @page: the page
2666 *
2667 * Look up, get a css reference, and return the memcg that owns @page.
2668 *
2669 * The page must be locked to prevent racing with swap-in and page
2670 * cache charges. If coming from an unlocked page table, the caller
2671 * must ensure the page is on the LRU or this can race with charging.
2672 */
2692struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2673struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2693{ 2674{
2694 struct mem_cgroup *memcg = NULL; 2675 struct mem_cgroup *memcg = NULL;
@@ -2699,7 +2680,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2699 VM_BUG_ON_PAGE(!PageLocked(page), page); 2680 VM_BUG_ON_PAGE(!PageLocked(page), page);
2700 2681
2701 pc = lookup_page_cgroup(page); 2682 pc = lookup_page_cgroup(page);
2702 lock_page_cgroup(pc);
2703 if (PageCgroupUsed(pc)) { 2683 if (PageCgroupUsed(pc)) {
2704 memcg = pc->mem_cgroup; 2684 memcg = pc->mem_cgroup;
2705 if (memcg && !css_tryget_online(&memcg->css)) 2685 if (memcg && !css_tryget_online(&memcg->css))
@@ -2713,19 +2693,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2713 memcg = NULL; 2693 memcg = NULL;
2714 rcu_read_unlock(); 2694 rcu_read_unlock();
2715 } 2695 }
2716 unlock_page_cgroup(pc);
2717 return memcg; 2696 return memcg;
2718} 2697}
2719 2698
2699static void lock_page_lru(struct page *page, int *isolated)
2700{
2701 struct zone *zone = page_zone(page);
2702
2703 spin_lock_irq(&zone->lru_lock);
2704 if (PageLRU(page)) {
2705 struct lruvec *lruvec;
2706
2707 lruvec = mem_cgroup_page_lruvec(page, zone);
2708 ClearPageLRU(page);
2709 del_page_from_lru_list(page, lruvec, page_lru(page));
2710 *isolated = 1;
2711 } else
2712 *isolated = 0;
2713}
2714
2715static void unlock_page_lru(struct page *page, int isolated)
2716{
2717 struct zone *zone = page_zone(page);
2718
2719 if (isolated) {
2720 struct lruvec *lruvec;
2721
2722 lruvec = mem_cgroup_page_lruvec(page, zone);
2723 VM_BUG_ON_PAGE(PageLRU(page), page);
2724 SetPageLRU(page);
2725 add_page_to_lru_list(page, lruvec, page_lru(page));
2726 }
2727 spin_unlock_irq(&zone->lru_lock);
2728}
2729
2720static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2730static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2721 unsigned int nr_pages, bool anon, bool lrucare) 2731 unsigned int nr_pages, bool lrucare)
2722{ 2732{
2723 struct page_cgroup *pc = lookup_page_cgroup(page); 2733 struct page_cgroup *pc = lookup_page_cgroup(page);
2724 struct zone *uninitialized_var(zone); 2734 int isolated;
2725 struct lruvec *lruvec;
2726 bool was_on_lru = false;
2727 2735
2728 lock_page_cgroup(pc);
2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2736 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2730 /* 2737 /*
2731 * we don't need page_cgroup_lock about tail pages, becase they are not 2738 * we don't need page_cgroup_lock about tail pages, becase they are not
@@ -2736,39 +2743,38 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2736 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2743 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2737 * may already be on some other mem_cgroup's LRU. Take care of it. 2744 * may already be on some other mem_cgroup's LRU. Take care of it.
2738 */ 2745 */
2739 if (lrucare) { 2746 if (lrucare)
2740 zone = page_zone(page); 2747 lock_page_lru(page, &isolated);
2741 spin_lock_irq(&zone->lru_lock);
2742 if (PageLRU(page)) {
2743 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2744 ClearPageLRU(page);
2745 del_page_from_lru_list(page, lruvec, page_lru(page));
2746 was_on_lru = true;
2747 }
2748 }
2749 2748
2749 /*
2750 * Nobody should be changing or seriously looking at
2751 * pc->mem_cgroup and pc->flags at this point:
2752 *
2753 * - the page is uncharged
2754 *
2755 * - the page is off-LRU
2756 *
2757 * - an anonymous fault has exclusive page access, except for
2758 * a locked page table
2759 *
2760 * - a page cache insertion, a swapin fault, or a migration
2761 * have the page locked
2762 */
2750 pc->mem_cgroup = memcg; 2763 pc->mem_cgroup = memcg;
2751 SetPageCgroupUsed(pc); 2764 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2752
2753 if (lrucare) {
2754 if (was_on_lru) {
2755 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2756 VM_BUG_ON_PAGE(PageLRU(page), page);
2757 SetPageLRU(page);
2758 add_page_to_lru_list(page, lruvec, page_lru(page));
2759 }
2760 spin_unlock_irq(&zone->lru_lock);
2761 }
2762 2765
2763 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2766 if (lrucare)
2764 unlock_page_cgroup(pc); 2767 unlock_page_lru(page, isolated);
2765 2768
2769 local_irq_disable();
2770 mem_cgroup_charge_statistics(memcg, page, nr_pages);
2766 /* 2771 /*
2767 * "charge_statistics" updated event counter. Then, check it. 2772 * "charge_statistics" updated event counter. Then, check it.
2768 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2773 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2769 * if they exceeds softlimit. 2774 * if they exceeds softlimit.
2770 */ 2775 */
2771 memcg_check_events(memcg, page); 2776 memcg_check_events(memcg, page);
2777 local_irq_enable();
2772} 2778}
2773 2779
2774static DEFINE_MUTEX(set_limit_mutex); 2780static DEFINE_MUTEX(set_limit_mutex);
@@ -3395,7 +3401,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3395 3401
3396#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3402#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3397 3403
3398#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3399/* 3404/*
3400 * Because tail pages are not marked as "used", set it. We're under 3405 * Because tail pages are not marked as "used", set it. We're under
3401 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3406 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -3416,7 +3421,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3416 for (i = 1; i < HPAGE_PMD_NR; i++) { 3421 for (i = 1; i < HPAGE_PMD_NR; i++) {
3417 pc = head_pc + i; 3422 pc = head_pc + i;
3418 pc->mem_cgroup = memcg; 3423 pc->mem_cgroup = memcg;
3419 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3424 pc->flags = head_pc->flags;
3420 } 3425 }
3421 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3426 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3422 HPAGE_PMD_NR); 3427 HPAGE_PMD_NR);
@@ -3446,7 +3451,6 @@ static int mem_cgroup_move_account(struct page *page,
3446{ 3451{
3447 unsigned long flags; 3452 unsigned long flags;
3448 int ret; 3453 int ret;
3449 bool anon = PageAnon(page);
3450 3454
3451 VM_BUG_ON(from == to); 3455 VM_BUG_ON(from == to);
3452 VM_BUG_ON_PAGE(PageLRU(page), page); 3456 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -3460,15 +3464,21 @@ static int mem_cgroup_move_account(struct page *page,
3460 if (nr_pages > 1 && !PageTransHuge(page)) 3464 if (nr_pages > 1 && !PageTransHuge(page))
3461 goto out; 3465 goto out;
3462 3466
3463 lock_page_cgroup(pc); 3467 /*
3468 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
3469 * of its source page while we change it: page migration takes
3470 * both pages off the LRU, but page cache replacement doesn't.
3471 */
3472 if (!trylock_page(page))
3473 goto out;
3464 3474
3465 ret = -EINVAL; 3475 ret = -EINVAL;
3466 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3476 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3467 goto unlock; 3477 goto out_unlock;
3468 3478
3469 move_lock_mem_cgroup(from, &flags); 3479 move_lock_mem_cgroup(from, &flags);
3470 3480
3471 if (!anon && page_mapped(page)) { 3481 if (!PageAnon(page) && page_mapped(page)) {
3472 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3482 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3473 nr_pages); 3483 nr_pages);
3474 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3484 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3482,20 +3492,25 @@ static int mem_cgroup_move_account(struct page *page,
3482 nr_pages); 3492 nr_pages);
3483 } 3493 }
3484 3494
3485 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3495 /*
3496 * It is safe to change pc->mem_cgroup here because the page
3497 * is referenced, charged, and isolated - we can't race with
3498 * uncharging, charging, migration, or LRU putback.
3499 */
3486 3500
3487 /* caller should have done css_get */ 3501 /* caller should have done css_get */
3488 pc->mem_cgroup = to; 3502 pc->mem_cgroup = to;
3489 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3490 move_unlock_mem_cgroup(from, &flags); 3503 move_unlock_mem_cgroup(from, &flags);
3491 ret = 0; 3504 ret = 0;
3492unlock: 3505
3493 unlock_page_cgroup(pc); 3506 local_irq_disable();
3494 /* 3507 mem_cgroup_charge_statistics(to, page, nr_pages);
3495 * check events
3496 */
3497 memcg_check_events(to, page); 3508 memcg_check_events(to, page);
3509 mem_cgroup_charge_statistics(from, page, -nr_pages);
3498 memcg_check_events(from, page); 3510 memcg_check_events(from, page);
3511 local_irq_enable();
3512out_unlock:
3513 unlock_page(page);
3499out: 3514out:
3500 return ret; 3515 return ret;
3501} 3516}
@@ -3566,193 +3581,6 @@ out:
3566 return ret; 3581 return ret;
3567} 3582}
3568 3583
3569static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3570 unsigned int nr_pages,
3571 const enum charge_type ctype)
3572{
3573 struct memcg_batch_info *batch = NULL;
3574 bool uncharge_memsw = true;
3575
3576 /* If swapout, usage of swap doesn't decrease */
3577 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3578 uncharge_memsw = false;
3579
3580 batch = &current->memcg_batch;
3581 /*
3582 * In usual, we do css_get() when we remember memcg pointer.
3583 * But in this case, we keep res->usage until end of a series of
3584 * uncharges. Then, it's ok to ignore memcg's refcnt.
3585 */
3586 if (!batch->memcg)
3587 batch->memcg = memcg;
3588 /*
3589 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3590 * In those cases, all pages freed continuously can be expected to be in
3591 * the same cgroup and we have chance to coalesce uncharges.
3592 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
3593 * because we want to do uncharge as soon as possible.
3594 */
3595
3596 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
3597 goto direct_uncharge;
3598
3599 if (nr_pages > 1)
3600 goto direct_uncharge;
3601
3602 /*
3603 * In typical case, batch->memcg == mem. This means we can
3604 * merge a series of uncharges to an uncharge of res_counter.
3605 * If not, we uncharge res_counter ony by one.
3606 */
3607 if (batch->memcg != memcg)
3608 goto direct_uncharge;
3609 /* remember freed charge and uncharge it later */
3610 batch->nr_pages++;
3611 if (uncharge_memsw)
3612 batch->memsw_nr_pages++;
3613 return;
3614direct_uncharge:
3615 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3616 if (uncharge_memsw)
3617 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3618 if (unlikely(batch->memcg != memcg))
3619 memcg_oom_recover(memcg);
3620}
3621
3622/*
3623 * uncharge if !page_mapped(page)
3624 */
3625static struct mem_cgroup *
3626__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
3627 bool end_migration)
3628{
3629 struct mem_cgroup *memcg = NULL;
3630 unsigned int nr_pages = 1;
3631 struct page_cgroup *pc;
3632 bool anon;
3633
3634 if (mem_cgroup_disabled())
3635 return NULL;
3636
3637 if (PageTransHuge(page)) {
3638 nr_pages <<= compound_order(page);
3639 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3640 }
3641 /*
3642 * Check if our page_cgroup is valid
3643 */
3644 pc = lookup_page_cgroup(page);
3645 if (unlikely(!PageCgroupUsed(pc)))
3646 return NULL;
3647
3648 lock_page_cgroup(pc);
3649
3650 memcg = pc->mem_cgroup;
3651
3652 if (!PageCgroupUsed(pc))
3653 goto unlock_out;
3654
3655 anon = PageAnon(page);
3656
3657 switch (ctype) {
3658 case MEM_CGROUP_CHARGE_TYPE_ANON:
3659 /*
3660 * Generally PageAnon tells if it's the anon statistics to be
3661 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
3662 * used before page reached the stage of being marked PageAnon.
3663 */
3664 anon = true;
3665 /* fallthrough */
3666 case MEM_CGROUP_CHARGE_TYPE_DROP:
3667 /* See mem_cgroup_prepare_migration() */
3668 if (page_mapped(page))
3669 goto unlock_out;
3670 /*
3671 * Pages under migration may not be uncharged. But
3672 * end_migration() /must/ be the one uncharging the
3673 * unused post-migration page and so it has to call
3674 * here with the migration bit still set. See the
3675 * res_counter handling below.
3676 */
3677 if (!end_migration && PageCgroupMigration(pc))
3678 goto unlock_out;
3679 break;
3680 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3681 if (!PageAnon(page)) { /* Shared memory */
3682 if (page->mapping && !page_is_file_cache(page))
3683 goto unlock_out;
3684 } else if (page_mapped(page)) /* Anon */
3685 goto unlock_out;
3686 break;
3687 default:
3688 break;
3689 }
3690
3691 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
3692
3693 ClearPageCgroupUsed(pc);
3694 /*
3695 * pc->mem_cgroup is not cleared here. It will be accessed when it's
3696 * freed from LRU. This is safe because uncharged page is expected not
3697 * to be reused (freed soon). Exception is SwapCache, it's handled by
3698 * special functions.
3699 */
3700
3701 unlock_page_cgroup(pc);
3702 /*
3703 * even after unlock, we have memcg->res.usage here and this memcg
3704 * will never be freed, so it's safe to call css_get().
3705 */
3706 memcg_check_events(memcg, page);
3707 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3708 mem_cgroup_swap_statistics(memcg, true);
3709 css_get(&memcg->css);
3710 }
3711 /*
3712 * Migration does not charge the res_counter for the
3713 * replacement page, so leave it alone when phasing out the
3714 * page that is unused after the migration.
3715 */
3716 if (!end_migration)
3717 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3718
3719 return memcg;
3720
3721unlock_out:
3722 unlock_page_cgroup(pc);
3723 return NULL;
3724}
3725
3726void mem_cgroup_uncharge_page(struct page *page)
3727{
3728 /* early check. */
3729 if (page_mapped(page))
3730 return;
3731 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3732 /*
3733 * If the page is in swap cache, uncharge should be deferred
3734 * to the swap path, which also properly accounts swap usage
3735 * and handles memcg lifetime.
3736 *
3737 * Note that this check is not stable and reclaim may add the
3738 * page to swap cache at any time after this. However, if the
3739 * page is not in swap cache by the time page->mapcount hits
3740 * 0, there won't be any page table references to the swap
3741 * slot, and reclaim will free it and not actually write the
3742 * page to disk.
3743 */
3744 if (PageSwapCache(page))
3745 return;
3746 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3747}
3748
3749void mem_cgroup_uncharge_cache_page(struct page *page)
3750{
3751 VM_BUG_ON_PAGE(page_mapped(page), page);
3752 VM_BUG_ON_PAGE(page->mapping, page);
3753 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3754}
3755
3756/* 3584/*
3757 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 3585 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
3758 * In that cases, pages are freed continuously and we can expect pages 3586 * In that cases, pages are freed continuously and we can expect pages
@@ -3763,6 +3591,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
3763 3591
3764void mem_cgroup_uncharge_start(void) 3592void mem_cgroup_uncharge_start(void)
3765{ 3593{
3594 unsigned long flags;
3595
3596 local_irq_save(flags);
3766 current->memcg_batch.do_batch++; 3597 current->memcg_batch.do_batch++;
3767 /* We can do nest. */ 3598 /* We can do nest. */
3768 if (current->memcg_batch.do_batch == 1) { 3599 if (current->memcg_batch.do_batch == 1) {
@@ -3770,21 +3601,18 @@ void mem_cgroup_uncharge_start(void)
3770 current->memcg_batch.nr_pages = 0; 3601 current->memcg_batch.nr_pages = 0;
3771 current->memcg_batch.memsw_nr_pages = 0; 3602 current->memcg_batch.memsw_nr_pages = 0;
3772 } 3603 }
3604 local_irq_restore(flags);
3773} 3605}
3774 3606
3775void mem_cgroup_uncharge_end(void) 3607void mem_cgroup_uncharge_end(void)
3776{ 3608{
3777 struct memcg_batch_info *batch = &current->memcg_batch; 3609 struct memcg_batch_info *batch = &current->memcg_batch;
3610 unsigned long flags;
3778 3611
3779 if (!batch->do_batch) 3612 local_irq_save(flags);
3780 return; 3613 VM_BUG_ON(!batch->do_batch);
3781 3614 if (--batch->do_batch) /* If stacked, do nothing */
3782 batch->do_batch--; 3615 goto out;
3783 if (batch->do_batch) /* If stacked, do nothing. */
3784 return;
3785
3786 if (!batch->memcg)
3787 return;
3788 /* 3616 /*
3789 * This "batch->memcg" is valid without any css_get/put etc... 3617 * This "batch->memcg" is valid without any css_get/put etc...
3790 * bacause we hide charges behind us. 3618 * bacause we hide charges behind us.
@@ -3796,61 +3624,16 @@ void mem_cgroup_uncharge_end(void)
3796 res_counter_uncharge(&batch->memcg->memsw, 3624 res_counter_uncharge(&batch->memcg->memsw,
3797 batch->memsw_nr_pages * PAGE_SIZE); 3625 batch->memsw_nr_pages * PAGE_SIZE);
3798 memcg_oom_recover(batch->memcg); 3626 memcg_oom_recover(batch->memcg);
3799 /* forget this pointer (for sanity check) */ 3627out:
3800 batch->memcg = NULL; 3628 local_irq_restore(flags);
3801}
3802
3803#ifdef CONFIG_SWAP
3804/*
3805 * called after __delete_from_swap_cache() and drop "page" account.
3806 * memcg information is recorded to swap_cgroup of "ent"
3807 */
3808void
3809mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3810{
3811 struct mem_cgroup *memcg;
3812 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3813
3814 if (!swapout) /* this was a swap cache but the swap is unused ! */
3815 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3816
3817 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3818
3819 /*
3820 * record memcg information, if swapout && memcg != NULL,
3821 * css_get() was called in uncharge().
3822 */
3823 if (do_swap_account && swapout && memcg)
3824 swap_cgroup_record(ent, mem_cgroup_id(memcg));
3825} 3629}
3826#endif
3827 3630
3828#ifdef CONFIG_MEMCG_SWAP 3631#ifdef CONFIG_MEMCG_SWAP
3829/* 3632static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3830 * called from swap_entry_free(). remove record in swap_cgroup and 3633 bool charge)
3831 * uncharge "memsw" account.
3832 */
3833void mem_cgroup_uncharge_swap(swp_entry_t ent)
3834{ 3634{
3835 struct mem_cgroup *memcg; 3635 int val = (charge) ? 1 : -1;
3836 unsigned short id; 3636 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
3837
3838 if (!do_swap_account)
3839 return;
3840
3841 id = swap_cgroup_record(ent, 0);
3842 rcu_read_lock();
3843 memcg = mem_cgroup_lookup(id);
3844 if (memcg) {
3845 /*
3846 * We uncharge this because swap is freed. This memcg can
3847 * be obsolete one. We avoid calling css_tryget_online().
3848 */
3849 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3850 mem_cgroup_swap_statistics(memcg, false);
3851 css_put(&memcg->css);
3852 }
3853 rcu_read_unlock();
3854} 3637}
3855 3638
3856/** 3639/**
@@ -3902,169 +3685,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3902} 3685}
3903#endif 3686#endif
3904 3687
3905/*
3906 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3907 * page belongs to.
3908 */
3909void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3910 struct mem_cgroup **memcgp)
3911{
3912 struct mem_cgroup *memcg = NULL;
3913 unsigned int nr_pages = 1;
3914 struct page_cgroup *pc;
3915
3916 *memcgp = NULL;
3917
3918 if (mem_cgroup_disabled())
3919 return;
3920
3921 if (PageTransHuge(page))
3922 nr_pages <<= compound_order(page);
3923
3924 pc = lookup_page_cgroup(page);
3925 lock_page_cgroup(pc);
3926 if (PageCgroupUsed(pc)) {
3927 memcg = pc->mem_cgroup;
3928 css_get(&memcg->css);
3929 /*
3930 * At migrating an anonymous page, its mapcount goes down
3931 * to 0 and uncharge() will be called. But, even if it's fully
3932 * unmapped, migration may fail and this page has to be
3933 * charged again. We set MIGRATION flag here and delay uncharge
3934 * until end_migration() is called
3935 *
3936 * Corner Case Thinking
3937 * A)
3938 * When the old page was mapped as Anon and it's unmap-and-freed
3939 * while migration was ongoing.
3940 * If unmap finds the old page, uncharge() of it will be delayed
3941 * until end_migration(). If unmap finds a new page, it's
3942 * uncharged when it make mapcount to be 1->0. If unmap code
3943 * finds swap_migration_entry, the new page will not be mapped
3944 * and end_migration() will find it(mapcount==0).
3945 *
3946 * B)
3947 * When the old page was mapped but migraion fails, the kernel
3948 * remaps it. A charge for it is kept by MIGRATION flag even
3949 * if mapcount goes down to 0. We can do remap successfully
3950 * without charging it again.
3951 *
3952 * C)
3953 * The "old" page is under lock_page() until the end of
3954 * migration, so, the old page itself will not be swapped-out.
3955 * If the new page is swapped out before end_migraton, our
3956 * hook to usual swap-out path will catch the event.
3957 */
3958 if (PageAnon(page))
3959 SetPageCgroupMigration(pc);
3960 }
3961 unlock_page_cgroup(pc);
3962 /*
3963 * If the page is not charged at this point,
3964 * we return here.
3965 */
3966 if (!memcg)
3967 return;
3968
3969 *memcgp = memcg;
3970 /*
3971 * We charge new page before it's used/mapped. So, even if unlock_page()
3972 * is called before end_migration, we can catch all events on this new
3973 * page. In the case new page is migrated but not remapped, new page's
3974 * mapcount will be finally 0 and we call uncharge in end_migration().
3975 */
3976 /*
3977 * The page is committed to the memcg, but it's not actually
3978 * charged to the res_counter since we plan on replacing the
3979 * old one and only one page is going to be left afterwards.
3980 */
3981 commit_charge(newpage, memcg, nr_pages, PageAnon(page), false);
3982}
3983
3984/* remove redundant charge if migration failed*/
3985void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3986 struct page *oldpage, struct page *newpage, bool migration_ok)
3987{
3988 struct page *used, *unused;
3989 struct page_cgroup *pc;
3990 bool anon;
3991
3992 if (!memcg)
3993 return;
3994
3995 if (!migration_ok) {
3996 used = oldpage;
3997 unused = newpage;
3998 } else {
3999 used = newpage;
4000 unused = oldpage;
4001 }
4002 anon = PageAnon(used);
4003 __mem_cgroup_uncharge_common(unused,
4004 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4005 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4006 true);
4007 css_put(&memcg->css);
4008 /*
4009 * We disallowed uncharge of pages under migration because mapcount
4010 * of the page goes down to zero, temporarly.
4011 * Clear the flag and check the page should be charged.
4012 */
4013 pc = lookup_page_cgroup(oldpage);
4014 lock_page_cgroup(pc);
4015 ClearPageCgroupMigration(pc);
4016 unlock_page_cgroup(pc);
4017
4018 /*
4019 * If a page is a file cache, radix-tree replacement is very atomic
4020 * and we can skip this check. When it was an Anon page, its mapcount
4021 * goes down to 0. But because we added MIGRATION flage, it's not
4022 * uncharged yet. There are several case but page->mapcount check
4023 * and USED bit check in mem_cgroup_uncharge_page() will do enough
4024 * check. (see prepare_charge() also)
4025 */
4026 if (anon)
4027 mem_cgroup_uncharge_page(used);
4028}
4029
4030/*
4031 * At replace page cache, newpage is not under any memcg but it's on
4032 * LRU. So, this function doesn't touch res_counter but handles LRU
4033 * in correct way. Both pages are locked so we cannot race with uncharge.
4034 */
4035void mem_cgroup_replace_page_cache(struct page *oldpage,
4036 struct page *newpage)
4037{
4038 struct mem_cgroup *memcg = NULL;
4039 struct page_cgroup *pc;
4040
4041 if (mem_cgroup_disabled())
4042 return;
4043
4044 pc = lookup_page_cgroup(oldpage);
4045 /* fix accounting on old pages */
4046 lock_page_cgroup(pc);
4047 if (PageCgroupUsed(pc)) {
4048 memcg = pc->mem_cgroup;
4049 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4050 ClearPageCgroupUsed(pc);
4051 }
4052 unlock_page_cgroup(pc);
4053
4054 /*
4055 * When called from shmem_replace_page(), in some cases the
4056 * oldpage has already been charged, and in some cases not.
4057 */
4058 if (!memcg)
4059 return;
4060 /*
4061 * Even if newpage->mapping was NULL before starting replacement,
4062 * the newpage may be on LRU(or pagevec for LRU) already. We lock
4063 * LRU while we overwrite pc->mem_cgroup.
4064 */
4065 commit_charge(newpage, memcg, 1, false, true);
4066}
4067
4068#ifdef CONFIG_DEBUG_VM 3688#ifdef CONFIG_DEBUG_VM
4069static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3689static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4070{ 3690{
@@ -4263,7 +3883,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4263 gfp_mask, &nr_scanned); 3883 gfp_mask, &nr_scanned);
4264 nr_reclaimed += reclaimed; 3884 nr_reclaimed += reclaimed;
4265 *total_scanned += nr_scanned; 3885 *total_scanned += nr_scanned;
4266 spin_lock(&mctz->lock); 3886 spin_lock_irq(&mctz->lock);
4267 3887
4268 /* 3888 /*
4269 * If we failed to reclaim anything from this memory cgroup 3889 * If we failed to reclaim anything from this memory cgroup
@@ -4303,7 +3923,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4303 */ 3923 */
4304 /* If excess == 0, no tree ops */ 3924 /* If excess == 0, no tree ops */
4305 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3925 __mem_cgroup_insert_exceeded(mz, mctz, excess);
4306 spin_unlock(&mctz->lock); 3926 spin_unlock_irq(&mctz->lock);
4307 css_put(&mz->memcg->css); 3927 css_put(&mz->memcg->css);
4308 loop++; 3928 loop++;
4309 /* 3929 /*
@@ -6265,9 +5885,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6265 if (page) { 5885 if (page) {
6266 pc = lookup_page_cgroup(page); 5886 pc = lookup_page_cgroup(page);
6267 /* 5887 /*
6268 * Do only loose check w/o page_cgroup lock. 5888 * Do only loose check w/o serialization.
6269 * mem_cgroup_move_account() checks the pc is valid or not under 5889 * mem_cgroup_move_account() checks the pc is valid or
6270 * the lock. 5890 * not under LRU exclusion.
6271 */ 5891 */
6272 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5892 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6273 ret = MC_TARGET_PAGE; 5893 ret = MC_TARGET_PAGE;
@@ -6729,6 +6349,67 @@ static void __init enable_swap_cgroup(void)
6729} 6349}
6730#endif 6350#endif
6731 6351
6352#ifdef CONFIG_MEMCG_SWAP
6353/**
6354 * mem_cgroup_swapout - transfer a memsw charge to swap
6355 * @page: page whose memsw charge to transfer
6356 * @entry: swap entry to move the charge to
6357 *
6358 * Transfer the memsw charge of @page to @entry.
6359 */
6360void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6361{
6362 struct page_cgroup *pc;
6363 unsigned short oldid;
6364
6365 VM_BUG_ON_PAGE(PageLRU(page), page);
6366 VM_BUG_ON_PAGE(page_count(page), page);
6367
6368 if (!do_swap_account)
6369 return;
6370
6371 pc = lookup_page_cgroup(page);
6372
6373 /* Readahead page, never charged */
6374 if (!PageCgroupUsed(pc))
6375 return;
6376
6377 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
6378
6379 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6380 VM_BUG_ON_PAGE(oldid, page);
6381
6382 pc->flags &= ~PCG_MEMSW;
6383 css_get(&pc->mem_cgroup->css);
6384 mem_cgroup_swap_statistics(pc->mem_cgroup, true);
6385}
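For context, a hedged sketch of the caller side, mirroring the mm/vmscan.c hunk near the end of this patch. The wrapper name is invented, and it assumes what __remove_mapping() provides: the page is locked, already off the LRU, and its reference count has been frozen to zero, which is what the VM_BUG_ONs above demand.

/* Sketch only; the preconditions noted above are assumed, not enforced. */
static void evict_swapcache_page(struct address_space *mapping,
				 struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };

	spin_lock_irq(&mapping->tree_lock);
	mem_cgroup_swapout(page, swap);		/* park memsw charge on the entry */
	__delete_from_swap_cache(page);
	spin_unlock_irq(&mapping->tree_lock);

	swapcache_free(swap);			/* drop the swapcache reference */
}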
6386
6387/**
6388 * mem_cgroup_uncharge_swap - uncharge a swap entry
6389 * @entry: swap entry to uncharge
6390 *
6391 * Drop the memsw charge associated with @entry.
6392 */
6393void mem_cgroup_uncharge_swap(swp_entry_t entry)
6394{
6395 struct mem_cgroup *memcg;
6396 unsigned short id;
6397
6398 if (!do_swap_account)
6399 return;
6400
6401 id = swap_cgroup_record(entry, 0);
6402 rcu_read_lock();
6403 memcg = mem_cgroup_lookup(id);
6404 if (memcg) {
6405 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
6406 mem_cgroup_swap_statistics(memcg, false);
6407 css_put(&memcg->css);
6408 }
6409 rcu_read_unlock();
6410}
6411#endif
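Taken together, the two helpers cover the swap half of a charge's lifetime: mem_cgroup_swapout() records the owning memcg for the entry and clears PCG_MEMSW, and mem_cgroup_uncharge_swap() later reads that record back and releases memcg->memsw. A minimal sketch of the pairing; the driver function is hypothetical, and naming swap_entry_free() as the real release point is an assumption, not something shown in this patch:

/* Hypothetical driver, to illustrate the pairing only. */
static void memsw_swap_lifecycle(struct page *page, swp_entry_t entry)
{
	/* swap-out: the charge moves from the page to the swap entry */
	mem_cgroup_swapout(page, entry);

	/*
	 * much later, when the last reference to the swap entry goes away
	 * (presumably via swap_entry_free()), the parked charge is dropped:
	 */
	mem_cgroup_uncharge_swap(entry);
}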
6412
6732/** 6413/**
6733 * mem_cgroup_try_charge - try charging a page 6414 * mem_cgroup_try_charge - try charging a page
6734 * @page: page to charge 6415 * @page: page to charge
@@ -6831,7 +6512,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6831 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6512 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6832 } 6513 }
6833 6514
6834 commit_charge(page, memcg, nr_pages, PageAnon(page), lrucare); 6515 commit_charge(page, memcg, nr_pages, lrucare);
6835 6516
6836 if (do_swap_account && PageSwapCache(page)) { 6517 if (do_swap_account && PageSwapCache(page)) {
6837 swp_entry_t entry = { .val = page_private(page) }; 6518 swp_entry_t entry = { .val = page_private(page) };
@@ -6873,6 +6554,139 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6873 cancel_charge(memcg, nr_pages); 6554 cancel_charge(memcg, nr_pages);
6874} 6555}
6875 6556
6557/**
6558 * mem_cgroup_uncharge - uncharge a page
6559 * @page: page to uncharge
6560 *
6561 * Uncharge a page previously charged with mem_cgroup_try_charge() and
6562 * mem_cgroup_commit_charge().
6563 */
6564void mem_cgroup_uncharge(struct page *page)
6565{
6566 struct memcg_batch_info *batch;
6567 unsigned int nr_pages = 1;
6568 struct mem_cgroup *memcg;
6569 struct page_cgroup *pc;
6570 unsigned long pc_flags;
6571 unsigned long flags;
6572
6573 VM_BUG_ON_PAGE(PageLRU(page), page);
6574 VM_BUG_ON_PAGE(page_count(page), page);
6575
6576 if (mem_cgroup_disabled())
6577 return;
6578
6579 pc = lookup_page_cgroup(page);
6580
6581 /* Every final put_page() ends up here */
6582 if (!PageCgroupUsed(pc))
6583 return;
6584
6585 if (PageTransHuge(page)) {
6586 nr_pages <<= compound_order(page);
6587 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6588 }
6589 /*
6590 * Nobody should be changing or seriously looking at
6591 * pc->mem_cgroup and pc->flags at this point; we have fully
6592 * exclusive access to the page.
6593 */
6594 memcg = pc->mem_cgroup;
6595 pc_flags = pc->flags;
6596 pc->flags = 0;
6597
6598 local_irq_save(flags);
6599
6600 if (nr_pages > 1)
6601 goto direct;
6602 if (unlikely(test_thread_flag(TIF_MEMDIE)))
6603 goto direct;
6604 batch = &current->memcg_batch;
6605 if (!batch->do_batch)
6606 goto direct;
6607 if (batch->memcg && batch->memcg != memcg)
6608 goto direct;
6609 if (!batch->memcg)
6610 batch->memcg = memcg;
6611 if (pc_flags & PCG_MEM)
6612 batch->nr_pages++;
6613 if (pc_flags & PCG_MEMSW)
6614 batch->memsw_nr_pages++;
6615 goto out;
6616direct:
6617 if (pc_flags & PCG_MEM)
6618 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
6619 if (pc_flags & PCG_MEMSW)
6620 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
6621 memcg_oom_recover(memcg);
6622out:
6623 mem_cgroup_charge_statistics(memcg, page, -nr_pages);
6624 memcg_check_events(memcg, page);
6625
6626 local_irq_restore(flags);
6627}
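Uncharges issued between mem_cgroup_uncharge_start() and mem_cgroup_uncharge_end() are accumulated in current->memcg_batch and settled when the batch is closed, while huge pages, OOM victims, and pages belonging to a different memcg than the current batch fall through to the direct path. A hedged sketch of a freeing loop driving this, in the spirit of the mm/swap.c and mm/vmscan.c hunks below; the helper name is invented, and each page is assumed to be off the LRU with a zero refcount, as the VM_BUG_ONs above require:

/* Sketch only: uncharge a pile of already-isolated, zero-reference pages. */
static void uncharge_page_list(struct list_head *pages)
{
	struct page *page, *next;

	mem_cgroup_uncharge_start();		/* enable current->memcg_batch */
	list_for_each_entry_safe(page, next, pages, lru) {
		list_del(&page->lru);
		mem_cgroup_uncharge(page);	/* batched, or direct if it can't be */
	}
	mem_cgroup_uncharge_end();		/* flush the accumulated charges */
}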
6628
6629/**
6630 * mem_cgroup_migrate - migrate a charge to another page
6631 * @oldpage: currently charged page
6632 * @newpage: page to transfer the charge to
6633 * @lrucare: both pages might be on the LRU already
6634 *
6635 * Migrate the charge from @oldpage to @newpage.
6636 *
6637 * Both pages must be locked, and @newpage->mapping must be set up.
6638 */
6639void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6640 bool lrucare)
6641{
6642 unsigned int nr_pages = 1;
6643 struct page_cgroup *pc;
6644 int isolated;
6645
6646 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6647 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6648 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
6649 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
6650 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6651
6652 if (mem_cgroup_disabled())
6653 return;
6654
6655 /* Page cache replacement: new page already charged? */
6656 pc = lookup_page_cgroup(newpage);
6657 if (PageCgroupUsed(pc))
6658 return;
6659
6660 /* Re-entrant migration: old page already uncharged? */
6661 pc = lookup_page_cgroup(oldpage);
6662 if (!PageCgroupUsed(pc))
6663 return;
6664
6665 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6666 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6667
6668 if (PageTransHuge(oldpage)) {
6669 nr_pages <<= compound_order(oldpage);
6670 VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage);
6671 VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage);
6672 }
6673
6674 if (lrucare)
6675 lock_page_lru(oldpage, &isolated);
6676
6677 pc->flags = 0;
6678
6679 if (lrucare)
6680 unlock_page_lru(oldpage, isolated);
6681
6682 local_irq_disable();
6683 mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages);
6684 memcg_check_events(pc->mem_cgroup, oldpage);
6685 local_irq_enable();
6686
6687 commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare);
6688}
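The mm/migrate.c and mm/shmem.c hunks below pass lrucare=false because their new page is not on the LRU yet (shmem adds it right afterwards with lru_cache_add_anon()). A caller whose pages may both be on the LRU already, per @lrucare above, would pass true; a minimal, hypothetical wrapper for that shape:

/* Hypothetical caller, illustration only: both pages locked by the caller. */
static void replace_cached_page_charge(struct page *oldpage,
				       struct page *newpage)
{
	/* the new page may already be on the LRU, so lrucare must be true */
	mem_cgroup_migrate(oldpage, newpage, true);
}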
6689
6876/* 6690/*
6877 * subsys_initcall() for memory controller. 6691 * subsys_initcall() for memory controller.
6878 * 6692 *
diff --git a/mm/memory.c b/mm/memory.c
index 6d7648773dc4..2a899e4e82ba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1292,7 +1292,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
1292 details = NULL; 1292 details = NULL;
1293 1293
1294 BUG_ON(addr >= end); 1294 BUG_ON(addr >= end);
1295 mem_cgroup_uncharge_start();
1296 tlb_start_vma(tlb, vma); 1295 tlb_start_vma(tlb, vma);
1297 pgd = pgd_offset(vma->vm_mm, addr); 1296 pgd = pgd_offset(vma->vm_mm, addr);
1298 do { 1297 do {
@@ -1302,7 +1301,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
1302 next = zap_pud_range(tlb, vma, pgd, addr, next, details); 1301 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1303 } while (pgd++, addr = next, addr != end); 1302 } while (pgd++, addr = next, addr != end);
1304 tlb_end_vma(tlb, vma); 1303 tlb_end_vma(tlb, vma);
1305 mem_cgroup_uncharge_end();
1306} 1304}
1307 1305
1308 1306
diff --git a/mm/migrate.c b/mm/migrate.c
index be6dbf995c0c..f78ec9bd454d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -780,6 +780,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
780 if (rc != MIGRATEPAGE_SUCCESS) { 780 if (rc != MIGRATEPAGE_SUCCESS) {
781 newpage->mapping = NULL; 781 newpage->mapping = NULL;
782 } else { 782 } else {
783 mem_cgroup_migrate(page, newpage, false);
783 if (remap_swapcache) 784 if (remap_swapcache)
784 remove_migration_ptes(page, newpage); 785 remove_migration_ptes(page, newpage);
785 page->mapping = NULL; 786 page->mapping = NULL;
@@ -795,7 +796,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
795{ 796{
796 int rc = -EAGAIN; 797 int rc = -EAGAIN;
797 int remap_swapcache = 1; 798 int remap_swapcache = 1;
798 struct mem_cgroup *mem;
799 struct anon_vma *anon_vma = NULL; 799 struct anon_vma *anon_vma = NULL;
800 800
801 if (!trylock_page(page)) { 801 if (!trylock_page(page)) {
@@ -821,9 +821,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
821 lock_page(page); 821 lock_page(page);
822 } 822 }
823 823
824 /* charge against new page */
825 mem_cgroup_prepare_migration(page, newpage, &mem);
826
827 if (PageWriteback(page)) { 824 if (PageWriteback(page)) {
828 /* 825 /*
829 * Only in the case of a full synchronous migration is it 826 * Only in the case of a full synchronous migration is it
@@ -833,10 +830,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
833 */ 830 */
834 if (mode != MIGRATE_SYNC) { 831 if (mode != MIGRATE_SYNC) {
835 rc = -EBUSY; 832 rc = -EBUSY;
836 goto uncharge; 833 goto out_unlock;
837 } 834 }
838 if (!force) 835 if (!force)
839 goto uncharge; 836 goto out_unlock;
840 wait_on_page_writeback(page); 837 wait_on_page_writeback(page);
841 } 838 }
842 /* 839 /*
@@ -872,7 +869,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
872 */ 869 */
873 remap_swapcache = 0; 870 remap_swapcache = 0;
874 } else { 871 } else {
875 goto uncharge; 872 goto out_unlock;
876 } 873 }
877 } 874 }
878 875
@@ -885,7 +882,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
885 * the page migration right away (protected by page lock). 882
886 */ 883 */
887 rc = balloon_page_migrate(newpage, page, mode); 884 rc = balloon_page_migrate(newpage, page, mode);
888 goto uncharge; 885 goto out_unlock;
889 } 886 }
890 887
891 /* 888 /*
@@ -904,7 +901,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
904 VM_BUG_ON_PAGE(PageAnon(page), page); 901 VM_BUG_ON_PAGE(PageAnon(page), page);
905 if (page_has_private(page)) { 902 if (page_has_private(page)) {
906 try_to_free_buffers(page); 903 try_to_free_buffers(page);
907 goto uncharge; 904 goto out_unlock;
908 } 905 }
909 goto skip_unmap; 906 goto skip_unmap;
910 } 907 }
@@ -923,10 +920,7 @@ skip_unmap:
923 if (anon_vma) 920 if (anon_vma)
924 put_anon_vma(anon_vma); 921 put_anon_vma(anon_vma);
925 922
926uncharge: 923out_unlock:
927 mem_cgroup_end_migration(mem, page, newpage,
928 (rc == MIGRATEPAGE_SUCCESS ||
929 rc == MIGRATEPAGE_BALLOON_SUCCESS));
930 unlock_page(page); 924 unlock_page(page);
931out: 925out:
932 return rc; 926 return rc;
@@ -1786,7 +1780,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1786 pg_data_t *pgdat = NODE_DATA(node); 1780 pg_data_t *pgdat = NODE_DATA(node);
1787 int isolated = 0; 1781 int isolated = 0;
1788 struct page *new_page = NULL; 1782 struct page *new_page = NULL;
1789 struct mem_cgroup *memcg = NULL;
1790 int page_lru = page_is_file_cache(page); 1783 int page_lru = page_is_file_cache(page);
1791 unsigned long mmun_start = address & HPAGE_PMD_MASK; 1784 unsigned long mmun_start = address & HPAGE_PMD_MASK;
1792 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; 1785 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
@@ -1852,15 +1845,6 @@ fail_putback:
1852 goto out_unlock; 1845 goto out_unlock;
1853 } 1846 }
1854 1847
1855 /*
1856 * Traditional migration needs to prepare the memcg charge
1857 * transaction early to prevent the old page from being
1858 * uncharged when installing migration entries. Here we can
1859 * save the potential rollback and start the charge transfer
1860 * only when migration is already known to end successfully.
1861 */
1862 mem_cgroup_prepare_migration(page, new_page, &memcg);
1863
1864 orig_entry = *pmd; 1848 orig_entry = *pmd;
1865 entry = mk_pmd(new_page, vma->vm_page_prot); 1849 entry = mk_pmd(new_page, vma->vm_page_prot);
1866 entry = pmd_mkhuge(entry); 1850 entry = pmd_mkhuge(entry);
@@ -1888,14 +1872,10 @@ fail_putback:
1888 goto fail_putback; 1872 goto fail_putback;
1889 } 1873 }
1890 1874
1875 mem_cgroup_migrate(page, new_page, false);
1876
1891 page_remove_rmap(page); 1877 page_remove_rmap(page);
1892 1878
1893 /*
1894 * Finish the charge transaction under the page table lock to
1895 * prevent split_huge_page() from dividing up the charge
1896 * before it's fully transferred to the new page.
1897 */
1898 mem_cgroup_end_migration(memcg, page, new_page, true);
1899 spin_unlock(ptl); 1879 spin_unlock(ptl);
1900 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1880 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1901 1881
diff --git a/mm/rmap.c b/mm/rmap.c
index f56b5ed78128..3e8491c504f8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1089,7 +1089,6 @@ void page_remove_rmap(struct page *page)
1089 if (unlikely(PageHuge(page))) 1089 if (unlikely(PageHuge(page)))
1090 goto out; 1090 goto out;
1091 if (anon) { 1091 if (anon) {
1092 mem_cgroup_uncharge_page(page);
1093 if (PageTransHuge(page)) 1092 if (PageTransHuge(page))
1094 __dec_zone_page_state(page, 1093 __dec_zone_page_state(page,
1095 NR_ANON_TRANSPARENT_HUGEPAGES); 1094 NR_ANON_TRANSPARENT_HUGEPAGES);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1f1a8085538b..6dc80d298f9d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -419,7 +419,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
419 pvec.pages, indices); 419 pvec.pages, indices);
420 if (!pvec.nr) 420 if (!pvec.nr)
421 break; 421 break;
422 mem_cgroup_uncharge_start();
423 for (i = 0; i < pagevec_count(&pvec); i++) { 422 for (i = 0; i < pagevec_count(&pvec); i++) {
424 struct page *page = pvec.pages[i]; 423 struct page *page = pvec.pages[i];
425 424
@@ -447,7 +446,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
447 } 446 }
448 pagevec_remove_exceptionals(&pvec); 447 pagevec_remove_exceptionals(&pvec);
449 pagevec_release(&pvec); 448 pagevec_release(&pvec);
450 mem_cgroup_uncharge_end();
451 cond_resched(); 449 cond_resched();
452 index++; 450 index++;
453 } 451 }
@@ -495,7 +493,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
495 index = start; 493 index = start;
496 continue; 494 continue;
497 } 495 }
498 mem_cgroup_uncharge_start();
499 for (i = 0; i < pagevec_count(&pvec); i++) { 496 for (i = 0; i < pagevec_count(&pvec); i++) {
500 struct page *page = pvec.pages[i]; 497 struct page *page = pvec.pages[i];
501 498
@@ -531,7 +528,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
531 } 528 }
532 pagevec_remove_exceptionals(&pvec); 529 pagevec_remove_exceptionals(&pvec);
533 pagevec_release(&pvec); 530 pagevec_release(&pvec);
534 mem_cgroup_uncharge_end();
535 index++; 531 index++;
536 } 532 }
537 533
@@ -835,7 +831,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
835 } 831 }
836 832
837 mutex_unlock(&shmem_swaplist_mutex); 833 mutex_unlock(&shmem_swaplist_mutex);
838 swapcache_free(swap, NULL); 834 swapcache_free(swap);
839redirty: 835redirty:
840 set_page_dirty(page); 836 set_page_dirty(page);
841 if (wbc->for_reclaim) 837 if (wbc->for_reclaim)
@@ -1008,7 +1004,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1008 */ 1004 */
1009 oldpage = newpage; 1005 oldpage = newpage;
1010 } else { 1006 } else {
1011 mem_cgroup_replace_page_cache(oldpage, newpage); 1007 mem_cgroup_migrate(oldpage, newpage, false);
1012 lru_cache_add_anon(newpage); 1008 lru_cache_add_anon(newpage);
1013 *pagep = newpage; 1009 *pagep = newpage;
1014 } 1010 }
diff --git a/mm/swap.c b/mm/swap.c
index 3baca701bb78..00523fffa5ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -62,6 +62,7 @@ static void __page_cache_release(struct page *page)
62 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 62 del_page_from_lru_list(page, lruvec, page_off_lru(page));
63 spin_unlock_irqrestore(&zone->lru_lock, flags); 63 spin_unlock_irqrestore(&zone->lru_lock, flags);
64 } 64 }
65 mem_cgroup_uncharge(page);
65} 66}
66 67
67static void __put_single_page(struct page *page) 68static void __put_single_page(struct page *page)
@@ -907,6 +908,8 @@ void release_pages(struct page **pages, int nr, bool cold)
907 struct lruvec *lruvec; 908 struct lruvec *lruvec;
908 unsigned long uninitialized_var(flags); 909 unsigned long uninitialized_var(flags);
909 910
911 mem_cgroup_uncharge_start();
912
910 for (i = 0; i < nr; i++) { 913 for (i = 0; i < nr; i++) {
911 struct page *page = pages[i]; 914 struct page *page = pages[i];
912 915
@@ -938,6 +941,7 @@ void release_pages(struct page **pages, int nr, bool cold)
938 __ClearPageLRU(page); 941 __ClearPageLRU(page);
939 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 942 del_page_from_lru_list(page, lruvec, page_off_lru(page));
940 } 943 }
944 mem_cgroup_uncharge(page);
941 945
942 /* Clear Active bit in case of parallel mark_page_accessed */ 946 /* Clear Active bit in case of parallel mark_page_accessed */
943 __ClearPageActive(page); 947 __ClearPageActive(page);
@@ -947,6 +951,8 @@ void release_pages(struct page **pages, int nr, bool cold)
947 if (zone) 951 if (zone)
948 spin_unlock_irqrestore(&zone->lru_lock, flags); 952 spin_unlock_irqrestore(&zone->lru_lock, flags);
949 953
954 mem_cgroup_uncharge_end();
955
950 free_hot_cold_page_list(&pages_to_free, cold); 956 free_hot_cold_page_list(&pages_to_free, cold);
951} 957}
952EXPORT_SYMBOL(release_pages); 958EXPORT_SYMBOL(release_pages);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2972eee184a4..e160151da6b8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -176,7 +176,7 @@ int add_to_swap(struct page *page, struct list_head *list)
176 176
177 if (unlikely(PageTransHuge(page))) 177 if (unlikely(PageTransHuge(page)))
178 if (unlikely(split_huge_page_to_list(page, list))) { 178 if (unlikely(split_huge_page_to_list(page, list))) {
179 swapcache_free(entry, NULL); 179 swapcache_free(entry);
180 return 0; 180 return 0;
181 } 181 }
182 182
@@ -202,7 +202,7 @@ int add_to_swap(struct page *page, struct list_head *list)
202 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 202 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
203 * clear SWAP_HAS_CACHE flag. 203 * clear SWAP_HAS_CACHE flag.
204 */ 204 */
205 swapcache_free(entry, NULL); 205 swapcache_free(entry);
206 return 0; 206 return 0;
207 } 207 }
208} 208}
@@ -225,7 +225,7 @@ void delete_from_swap_cache(struct page *page)
225 __delete_from_swap_cache(page); 225 __delete_from_swap_cache(page);
226 spin_unlock_irq(&address_space->tree_lock); 226 spin_unlock_irq(&address_space->tree_lock);
227 227
228 swapcache_free(entry, page); 228 swapcache_free(entry);
229 page_cache_release(page); 229 page_cache_release(page);
230} 230}
231 231
@@ -386,7 +386,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
386 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 386 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
387 * clear SWAP_HAS_CACHE flag. 387 * clear SWAP_HAS_CACHE flag.
388 */ 388 */
389 swapcache_free(entry, NULL); 389 swapcache_free(entry);
390 } while (err != -ENOMEM); 390 } while (err != -ENOMEM);
391 391
392 if (new_page) 392 if (new_page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0883b4912ff7..8798b2e0ac59 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -843,16 +843,13 @@ void swap_free(swp_entry_t entry)
843/* 843/*
844 * Called after dropping swapcache to decrease refcnt to swap entries. 844 * Called after dropping swapcache to decrease refcnt to swap entries.
845 */ 845 */
846void swapcache_free(swp_entry_t entry, struct page *page) 846void swapcache_free(swp_entry_t entry)
847{ 847{
848 struct swap_info_struct *p; 848 struct swap_info_struct *p;
849 unsigned char count;
850 849
851 p = swap_info_get(entry); 850 p = swap_info_get(entry);
852 if (p) { 851 if (p) {
853 count = swap_entry_free(p, entry, SWAP_HAS_CACHE); 852 swap_entry_free(p, entry, SWAP_HAS_CACHE);
854 if (page)
855 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
856 spin_unlock(&p->lock); 853 spin_unlock(&p->lock);
857 } 854 }
858} 855}
diff --git a/mm/truncate.c b/mm/truncate.c
index eda247307164..96d167372d89 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -281,7 +281,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
281 while (index < end && pagevec_lookup_entries(&pvec, mapping, index, 281 while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
282 min(end - index, (pgoff_t)PAGEVEC_SIZE), 282 min(end - index, (pgoff_t)PAGEVEC_SIZE),
283 indices)) { 283 indices)) {
284 mem_cgroup_uncharge_start();
285 for (i = 0; i < pagevec_count(&pvec); i++) { 284 for (i = 0; i < pagevec_count(&pvec); i++) {
286 struct page *page = pvec.pages[i]; 285 struct page *page = pvec.pages[i];
287 286
@@ -307,7 +306,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
307 } 306 }
308 pagevec_remove_exceptionals(&pvec); 307 pagevec_remove_exceptionals(&pvec);
309 pagevec_release(&pvec); 308 pagevec_release(&pvec);
310 mem_cgroup_uncharge_end();
311 cond_resched(); 309 cond_resched();
312 index++; 310 index++;
313 } 311 }
@@ -369,7 +367,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
369 pagevec_release(&pvec); 367 pagevec_release(&pvec);
370 break; 368 break;
371 } 369 }
372 mem_cgroup_uncharge_start();
373 for (i = 0; i < pagevec_count(&pvec); i++) { 370 for (i = 0; i < pagevec_count(&pvec); i++) {
374 struct page *page = pvec.pages[i]; 371 struct page *page = pvec.pages[i];
375 372
@@ -394,7 +391,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
394 } 391 }
395 pagevec_remove_exceptionals(&pvec); 392 pagevec_remove_exceptionals(&pvec);
396 pagevec_release(&pvec); 393 pagevec_release(&pvec);
397 mem_cgroup_uncharge_end();
398 index++; 394 index++;
399 } 395 }
400 cleancache_invalidate_inode(mapping); 396 cleancache_invalidate_inode(mapping);
@@ -493,7 +489,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
493 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, 489 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
494 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 490 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
495 indices)) { 491 indices)) {
496 mem_cgroup_uncharge_start();
497 for (i = 0; i < pagevec_count(&pvec); i++) { 492 for (i = 0; i < pagevec_count(&pvec); i++) {
498 struct page *page = pvec.pages[i]; 493 struct page *page = pvec.pages[i];
499 494
@@ -522,7 +517,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
522 } 517 }
523 pagevec_remove_exceptionals(&pvec); 518 pagevec_remove_exceptionals(&pvec);
524 pagevec_release(&pvec); 519 pagevec_release(&pvec);
525 mem_cgroup_uncharge_end();
526 cond_resched(); 520 cond_resched();
527 index++; 521 index++;
528 } 522 }
@@ -553,7 +547,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
553 BUG_ON(page_has_private(page)); 547 BUG_ON(page_has_private(page));
554 __delete_from_page_cache(page, NULL); 548 __delete_from_page_cache(page, NULL);
555 spin_unlock_irq(&mapping->tree_lock); 549 spin_unlock_irq(&mapping->tree_lock);
556 mem_cgroup_uncharge_cache_page(page);
557 550
558 if (mapping->a_ops->freepage) 551 if (mapping->a_ops->freepage)
559 mapping->a_ops->freepage(page); 552 mapping->a_ops->freepage(page);
@@ -602,7 +595,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
602 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, 595 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
603 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 596 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
604 indices)) { 597 indices)) {
605 mem_cgroup_uncharge_start();
606 for (i = 0; i < pagevec_count(&pvec); i++) { 598 for (i = 0; i < pagevec_count(&pvec); i++) {
607 struct page *page = pvec.pages[i]; 599 struct page *page = pvec.pages[i];
608 600
@@ -655,7 +647,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
655 } 647 }
656 pagevec_remove_exceptionals(&pvec); 648 pagevec_remove_exceptionals(&pvec);
657 pagevec_release(&pvec); 649 pagevec_release(&pvec);
658 mem_cgroup_uncharge_end();
659 cond_resched(); 650 cond_resched();
660 index++; 651 index++;
661 } 652 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d2f65c856350..7068e838d22b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -577,9 +577,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
577 577
578 if (PageSwapCache(page)) { 578 if (PageSwapCache(page)) {
579 swp_entry_t swap = { .val = page_private(page) }; 579 swp_entry_t swap = { .val = page_private(page) };
580 mem_cgroup_swapout(page, swap);
580 __delete_from_swap_cache(page); 581 __delete_from_swap_cache(page);
581 spin_unlock_irq(&mapping->tree_lock); 582 spin_unlock_irq(&mapping->tree_lock);
582 swapcache_free(swap, page); 583 swapcache_free(swap);
583 } else { 584 } else {
584 void (*freepage)(struct page *); 585 void (*freepage)(struct page *);
585 void *shadow = NULL; 586 void *shadow = NULL;
@@ -600,7 +601,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
600 shadow = workingset_eviction(mapping, page); 601 shadow = workingset_eviction(mapping, page);
601 __delete_from_page_cache(page, shadow); 602 __delete_from_page_cache(page, shadow);
602 spin_unlock_irq(&mapping->tree_lock); 603 spin_unlock_irq(&mapping->tree_lock);
603 mem_cgroup_uncharge_cache_page(page);
604 604
605 if (freepage != NULL) 605 if (freepage != NULL)
606 freepage(page); 606 freepage(page);
@@ -1103,6 +1103,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1103 */ 1103 */
1104 __clear_page_locked(page); 1104 __clear_page_locked(page);
1105free_it: 1105free_it:
1106 mem_cgroup_uncharge(page);
1106 nr_reclaimed++; 1107 nr_reclaimed++;
1107 1108
1108 /* 1109 /*
@@ -1132,12 +1133,13 @@ keep:
1132 list_add(&page->lru, &ret_pages); 1133 list_add(&page->lru, &ret_pages);
1133 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); 1134 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1134 } 1135 }
1136 mem_cgroup_uncharge_end();
1135 1137
1136 free_hot_cold_page_list(&free_pages, true); 1138 free_hot_cold_page_list(&free_pages, true);
1137 1139
1138 list_splice(&ret_pages, page_list); 1140 list_splice(&ret_pages, page_list);
1139 count_vm_events(PGACTIVATE, pgactivate); 1141 count_vm_events(PGACTIVATE, pgactivate);
1140 mem_cgroup_uncharge_end(); 1142
1141 *ret_nr_dirty += nr_dirty; 1143 *ret_nr_dirty += nr_dirty;
1142 *ret_nr_congested += nr_congested; 1144 *ret_nr_congested += nr_congested;
1143 *ret_nr_unqueued_dirty += nr_unqueued_dirty; 1145 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
@@ -1435,6 +1437,8 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1435 __ClearPageActive(page); 1437 __ClearPageActive(page);
1436 del_page_from_lru_list(page, lruvec, lru); 1438 del_page_from_lru_list(page, lruvec, lru);
1437 1439
1440 mem_cgroup_uncharge(page);
1441
1438 if (unlikely(PageCompound(page))) { 1442 if (unlikely(PageCompound(page))) {
1439 spin_unlock_irq(&zone->lru_lock); 1443 spin_unlock_irq(&zone->lru_lock);
1440 (*get_compound_page_dtor(page))(page); 1444 (*get_compound_page_dtor(page))(page);
@@ -1656,6 +1660,8 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
1656 __ClearPageActive(page); 1660 __ClearPageActive(page);
1657 del_page_from_lru_list(page, lruvec, lru); 1661 del_page_from_lru_list(page, lruvec, lru);
1658 1662
1663 mem_cgroup_uncharge(page);
1664
1659 if (unlikely(PageCompound(page))) { 1665 if (unlikely(PageCompound(page))) {
1660 spin_unlock_irq(&zone->lru_lock); 1666 spin_unlock_irq(&zone->lru_lock);
1661 (*get_compound_page_dtor(page))(page); 1667 (*get_compound_page_dtor(page))(page);
diff --git a/mm/zswap.c b/mm/zswap.c
index 032c21eeab2b..9da56af24df5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -507,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
507 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 507 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
508 * clear SWAP_HAS_CACHE flag. 508 * clear SWAP_HAS_CACHE flag.
509 */ 509 */
510 swapcache_free(entry, NULL); 510 swapcache_free(entry);
511 } while (err != -ENOMEM); 511 } while (err != -ENOMEM);
512 512
513 if (new_page) 513 if (new_page)