author		Balbir Singh <balbir@linux.vnet.ibm.com>	2008-02-07 03:13:53 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:18 -0500
commit		8a9f3ccd24741b50200c3f33d62534c7271f3dfc
tree		066aabd8d2952299501f067a91cbfd6f47ee62f6
parent		78fb74669e80883323391090e4d26d17fe29488f
Memory controller: memory accounting
Add the accounting hooks.  The accounting is carried out for RSS and Page
Cache (unmapped) pages.  There is now a common limit and accounting for both.
RSS is accounted at page_add_*_rmap() and page_remove_rmap() time; page cache
is accounted at add_to_page_cache() and __delete_from_page_cache().  Swap
cache is also accounted for.

Each page's page_cgroup is protected with the last bit of the page_cgroup
pointer; this makes handling of race conditions involving simultaneous
mappings of a page easier.  A reference count is kept in the page_cgroup to
deal with cases where a page might be unmapped from the RSS of all tasks, but
still lives in the page cache.

Credits go to Vaidyanathan Srinivasan for helping with the reference counting
work of the page cgroup.  Almost all of the page cache accounting code has
help from Vaidyanathan Srinivasan.

[hugh@veritas.com: fix swapoff breakage]
[akpm@linux-foundation.org: fix locking]
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: <Valdis.Kletnieks@vt.edu>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
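The locking trick described above is worth seeing in isolation. Below is a minimal userspace sketch in plain C (the names pc_demo, slot, assign and get are invented for illustration; this is not the kernel code in the patch): because a page_cgroup allocation is at least two-byte aligned, bit 0 of the pointer is always zero and can carry the lock state, so the pointer and the lock fit in a single word. The patch applies the same idea with PAGE_CGROUP_LOCK_BIT and bit_spin_lock() in mm/memcontrol.c below.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LOCK_BIT 0x1UL			/* bit 0 doubles as the "lock" flag */

struct pc_demo {			/* stand-in for struct page_cgroup */
	int ref_cnt;
};

static uintptr_t slot;			/* plays the role of page->page_cgroup */

static void assign(struct pc_demo *pc)
{
	uintptr_t locked = slot & LOCK_BIT;	/* preserve the lock bit */
	slot = (uintptr_t)pc | locked;		/* replace only the pointer bits */
}

static struct pc_demo *get(void)
{
	return (struct pc_demo *)(slot & ~LOCK_BIT);	/* mask the lock bit off */
}

int main(void)
{
	static struct pc_demo pc = { .ref_cnt = 1 };

	assert(((uintptr_t)&pc & LOCK_BIT) == 0);	/* alignment frees bit 0 */
	slot |= LOCK_BIT;				/* "take" the lock */
	assign(&pc);
	printf("pc=%p ref_cnt=%d locked=%lu\n", (void *)get(), get()->ref_cnt,
	       (unsigned long)(slot & LOCK_BIT));
	slot &= ~LOCK_BIT;				/* "release" the lock */
	return 0;
}

The kernel variant uses bit_spin_lock()/bit_spin_unlock() on that same bit; the sketch only shows the masking arithmetic that lets the pointer and the lock share one word.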
-rw-r--r--	include/linux/memcontrol.h	 20
-rw-r--r--	mm/filemap.c	 12
-rw-r--r--	mm/memcontrol.c	166
-rw-r--r--	mm/memory.c	 47
-rw-r--r--	mm/migrate.c	  6
-rw-r--r--	mm/page_alloc.c	  3
-rw-r--r--	mm/rmap.c	 17
-rw-r--r--	mm/swap_state.c	 10
-rw-r--r--	mm/swapfile.c	 41
9 files changed, 295 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7d1f119c796e..f5b47efab48b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,13 @@ extern void mm_free_cgroup(struct mm_struct *mm);
 extern void page_assign_page_cgroup(struct page *page,
 					struct page_cgroup *pc);
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
+extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
+extern void mem_cgroup_uncharge(struct page_cgroup *pc);
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+	mem_cgroup_uncharge(page_get_page_cgroup(page));
+}
 
 #else /* CONFIG_CGROUP_MEM_CONT */
 static inline void mm_init_cgroup(struct mm_struct *mm,
@@ -51,6 +58,19 @@ static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
 	return NULL;
 }
 
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+}
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 81fb9bff0d4f..b7a01e927953 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -118,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -461,6 +463,11 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
+
+		error = mem_cgroup_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
@@ -470,10 +477,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			page->index = offset;
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_cgroup_uncharge_page(page);
+
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a cgroup at it's low water mark, this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is atleast two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -56,6 +69,8 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
 	css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+					&page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	page->page_cgroup = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_cgroup we might not hold the
+	 * page_cgroup lock. free_hot_cold_page() is an example
+	 * of such a scenario
+	 */
+	if (pc)
+		VM_BUG_ON(!page_cgroup_locked(page));
+	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+	page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return page->page_cgroup;
+	return (struct page_cgroup *)
+			(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+	struct page_cgroup *pc, *race_pc;
+
+	/*
+	 * Should page_cgroup's go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the cgroup page already has a page_cgroup associated
+	 * with it
+	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and the page has already been accounted
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_cgroup(page);
+
+	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the cgroup the mm_struct belongs to
+	 * the mm_struct's mem_cgroup changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set, if so charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_cgroup);
+	/*
+	 * For every charge from the cgroup, increment reference
+	 * count
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_cgroup, we should free it on exceeding
+	 * the cgroup limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_cgroup(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_cgroup
+	 */
+	race_pc = page_get_page_cgroup(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_cgroup = mem;
+	pc->page = page;
+	page_assign_page_cgroup(page, pc);
+
+done:
+	unlock_page_cgroup(page);
+	return 0;
+free_pc:
+	kfree(pc);
+	return -ENOMEM;
+err:
+	unlock_page_cgroup(page);
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_cgroup(page);
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		page_assign_page_cgroup(page, NULL);
+		unlock_page_cgroup(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 
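To make the reference-count balance in mem_cgroup_charge()/mem_cgroup_uncharge() above easier to follow, here is a minimal userspace sketch of the same lifecycle (the demo_* names are invented for illustration and are not kernel APIs): the first charge allocates the tracking structure and charges the counter once, later charges only bump ref_cnt, and the final uncharge returns the charge and frees the structure.

#include <stdio.h>
#include <stdlib.h>

struct demo_counter { long usage, limit; };

struct demo_page_cgroup {
	int ref_cnt;
	struct demo_counter *res;
};

static int demo_charge(struct demo_page_cgroup **slot, struct demo_counter *res)
{
	if (*slot) {				/* already accounted: just take a ref */
		(*slot)->ref_cnt++;
		return 0;
	}
	if (res->usage + 1 > res->limit)	/* over limit: fail like -ENOMEM */
		return -1;
	*slot = calloc(1, sizeof(**slot));
	if (!*slot)
		return -1;
	(*slot)->ref_cnt = 1;
	(*slot)->res = res;
	res->usage++;				/* one charge per page, not per mapping */
	return 0;
}

static void demo_uncharge(struct demo_page_cgroup **slot)
{
	struct demo_page_cgroup *pc = *slot;

	if (!pc)
		return;
	if (--pc->ref_cnt == 0) {		/* last user: give the charge back */
		pc->res->usage--;
		*slot = NULL;
		free(pc);
	}
}

int main(void)
{
	struct demo_counter res = { .usage = 0, .limit = 1 };
	struct demo_page_cgroup *pc = NULL;

	demo_charge(&pc, &res);		/* first user: allocate, usage = 1 */
	demo_charge(&pc, &res);		/* second user: ref_cnt = 2, usage stays 1 */
	demo_uncharge(&pc);		/* one user gone: still charged */
	demo_uncharge(&pc);		/* last user gone: usage back to 0 */
	printf("usage=%ld pc=%p\n", res.usage, (void *)pc);
	return 0;
}

This mirrors why page_add_*_rmap() can uncharge on the "already mapped" path and why a page unmapped from all tasks but still in the page cache keeps exactly one charge.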
diff --git a/mm/memory.c b/mm/memory.c
index 9d073fa0a2d0..0ba224ea6ba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1144,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 {
 	int retval;
 	pte_t *pte;
 	spinlock_t *ptl;
+
+	retval = mem_cgroup_charge(page, mm);
+	if (retval)
+		goto out;
 
 	retval = -EINVAL;
 	if (PageAnon(page))
-		goto out;
+		goto out_uncharge;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
-		goto out;
+		goto out_uncharge;
 	retval = -EBUSY;
 	if (!pte_none(*pte))
 		goto out_unlock;
@@ -1165,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
+	pte_unmap_unlock(pte, ptl);
+	return retval;
out_unlock:
 	pte_unmap_unlock(pte, ptl);
+out_uncharge:
+	mem_cgroup_uncharge_page(page);
out:
 	return retval;
 }
@@ -1641,6 +1650,9 @@ gotten:
 	cow_user_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
+	if (mem_cgroup_charge(new_page, mm))
+		goto oom_free_new;
+
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
@@ -1672,7 +1684,9 @@ gotten:
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
-	}
+	} else
+		mem_cgroup_uncharge_page(new_page);
+
 	if (new_page)
 		page_cache_release(new_page);
 	if (old_page)
@@ -1696,6 +1710,8 @@ unlock:
 		put_page(dirty_page);
 	}
 	return ret;
+oom_free_new:
+	__free_page(new_page);
oom:
 	if (old_page)
 		page_cache_release(old_page);
@@ -2036,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(PGMAJFAULT);
 	}
 
+	if (mem_cgroup_charge(page, mm)) {
+		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	mark_page_accessed(page);
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2073,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access) {
 		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+			mem_cgroup_uncharge_page(page);
 			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 
@@ -2085,6 +2109,7 @@ unlock:
out:
 	return ret;
out_nomap:
+	mem_cgroup_uncharge_page(page);
 	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
@@ -2114,6 +2139,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
+	if (mem_cgroup_charge(page, mm))
+		goto oom_free_page;
+
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
@@ -2131,8 +2159,11 @@ unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
release:
+	mem_cgroup_uncharge_page(page);
 	page_cache_release(page);
 	goto unlock;
+oom_free_page:
+	__free_page(page);
oom:
 	return VM_FAULT_OOM;
 }
@@ -2246,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	}
 
+	if (mem_cgroup_charge(page, mm)) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/*
@@ -2281,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
 	} else {
+		mem_cgroup_uncharge_page(page);
 		if (anon)
 			page_cache_release(page);
 		else
diff --git a/mm/migrate.c b/mm/migrate.c
index 857a987e3690..417bbda14e5b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
@@ -152,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 		return;
 	}
 
+	if (mem_cgroup_charge(new, mm)) {
+		pte_unmap(ptep);
+		return;
+	}
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 37576b822f06..26a54a17dc9f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -987,6 +988,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+	VM_BUG_ON(page_get_page_cgroup(page));
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
@@ -2525,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		page_assign_page_cgroup(page, NULL);
 		SetPageReserved(page);
 
 		/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 57ad276900c9..4a3487921eff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -48,6 +48,7 @@
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 
@@ -554,8 +555,14 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (atomic_inc_and_test(&page->_mapcount))
 		__page_set_anon_rmap(page, vma, address);
-	else
+	else {
 		__page_check_anon_rmap(page, vma, address);
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_cgroup_uncharge_page(page);
+	}
 }
 
 /*
@@ -586,6 +593,12 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
+	else
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_cgroup_uncharge_page(page);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -646,6 +659,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 			page_clear_dirty(page);
 			set_page_dirty(page);
 		}
+		mem_cgroup_uncharge_page(page);
+
 		__dec_zone_page_state(page,
 			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 	}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ec42f01a8d02..f96e3ff1e791 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
@@ -76,6 +77,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
+
+		error = mem_cgroup_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
@@ -86,10 +92,13 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			INC_CACHE_INFO(add_total);
+		} else {
+			mem_cgroup_uncharge_page(page);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 
@@ -104,6 +113,7 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&swapper_space.page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index afae7b1f680b..fddc4cc4149b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,12 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	if (mem_cgroup_charge(page, vma->vm_mm))
+		return -ENOMEM;
+
 	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +524,7 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 	 * immediately swapped out again after swapon.
 	 */
 	activate_page(page);
+	return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -529,7 +534,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
 	spinlock_t *ptl;
-	int found = 0;
+	int ret = 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
@@ -538,13 +543,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
+			ret = unuse_pte(vma, pte++, addr, entry, page);
 			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
-	return found;
+	return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -553,14 +557,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int ret;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -571,14 +577,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int ret;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-			return 1;
+		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -588,6 +596,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 {
 	pgd_t *pgd;
 	unsigned long addr, end, next;
+	int ret;
 
 	if (page->mapping) {
 		addr = page_address_in_vma(page, vma);
@@ -605,8 +614,9 @@ static int unuse_vma(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -615,6 +625,7 @@ static int unuse_mm(struct mm_struct *mm,
 		swp_entry_t entry, struct page *page)
 {
 	struct vm_area_struct *vma;
+	int ret = 0;
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
@@ -627,15 +638,11 @@ static int unuse_mm(struct mm_struct *mm,
 		lock_page(page);
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (vma->anon_vma && unuse_vma(vma, entry, page))
+		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
 			break;
 	}
 	up_read(&mm->mmap_sem);
-	/*
-	 * Currently unuse_mm cannot fail, but leave error handling
-	 * at call sites for now, since we change it from time to time.
-	 */
-	return 0;
+	return (ret < 0)? ret: 0;
 }
 
 /*