Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     |  12
-rw-r--r--  mm/memcontrol.c  | 166
-rw-r--r--  mm/memory.c      |  47
-rw-r--r--  mm/migrate.c     |   6
-rw-r--r--  mm/page_alloc.c  |   3
-rw-r--r--  mm/rmap.c        |  17
-rw-r--r--  mm/swap_state.c  |  10
-rw-r--r--  mm/swapfile.c    |  41
8 files changed, 275 insertions, 27 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 81fb9bff0d4f..b7a01e927953 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -118,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -461,6 +463,11 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
+
+		error = mem_cgroup_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
@@ -470,10 +477,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			page->index = offset;
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_cgroup_uncharge_page(page);
+
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a cgroup at it's low water mark, this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is atleast two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -56,6 +69,8 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
 	css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+					&page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	page->page_cgroup = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_cgroup we might not hold the
+	 * page_cgroup lock. free_hot_cold_page() is an example
+	 * of such a scenario
+	 */
+	if (pc)
+		VM_BUG_ON(!page_cgroup_locked(page));
+	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+	page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return page->page_cgroup;
+	return (struct page_cgroup *)
+		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+	struct page_cgroup *pc, *race_pc;
+
+	/*
+	 * Should page_cgroup's go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the cgroup page already has a page_cgroup associated
+	 * with it
+	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and the page has already been accounted
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_cgroup(page);
+
+	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the cgroup the mm_struct belongs to
+	 * the mm_struct's mem_cgroup changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set, if so charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_cgroup);
+	/*
+	 * For every charge from the cgroup, increment reference
+	 * count
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_cgroup, we should free it on exceeding
+	 * the cgroup limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_cgroup(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_cgroup
+	 */
+	race_pc = page_get_page_cgroup(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_cgroup = mem;
+	pc->page = page;
+	page_assign_page_cgroup(page, pc);
+
+done:
+	unlock_page_cgroup(page);
+	return 0;
+free_pc:
+	kfree(pc);
+	return -ENOMEM;
+err:
+	unlock_page_cgroup(page);
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_cgroup(page);
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		page_assign_page_cgroup(page, NULL);
+		unlock_page_cgroup(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 
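Note: the memcontrol.c hunks above fold a spinlock into bit 0 of the page->page_cgroup word, so struct page needs no extra lock field. As a rough user-space analogue of that packing (not part of the patch; it uses C11 atomics instead of the kernel's bit_spin_lock, and every name below is invented for illustration):

/*
 * Illustrative sketch only: keep a lock bit in bit 0 of an aligned pointer,
 * the way page_assign_page_cgroup()/page_get_page_cgroup() do above.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define LOCK_BIT	0x1UL

/* One tagged word: the upper bits hold the pointer, bit 0 is the lock. */
static _Atomic uintptr_t slot;

static void slot_lock(void)
{
	uintptr_t old;
	/* Spin until we are the thread that flips the lock bit from 0 to 1. */
	do {
		old = atomic_load(&slot) & ~LOCK_BIT;
	} while (!atomic_compare_exchange_weak(&slot, &old, old | LOCK_BIT));
}

static void slot_unlock(void)
{
	atomic_fetch_and(&slot, ~LOCK_BIT);
}

/* Store a new pointer while preserving the current lock bit; call with the
 * lock held, like the non-NULL case of page_assign_page_cgroup(). */
static void slot_assign(void *p)
{
	uintptr_t locked = atomic_load(&slot) & LOCK_BIT;
	atomic_store(&slot, (uintptr_t)p | locked);
}

/* Read the pointer with the lock bit masked off, like page_get_page_cgroup(). */
static void *slot_get(void)
{
	return (void *)(atomic_load(&slot) & ~LOCK_BIT);
}

int main(void)
{
	int *payload = malloc(sizeof(*payload));	/* malloc() is at least 2-byte aligned */
	*payload = 42;

	slot_lock();
	slot_assign(payload);
	slot_unlock();

	printf("stored %d\n", *(int *)slot_get());
	free(payload);
	return 0;
}
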
diff --git a/mm/memory.c b/mm/memory.c
index 9d073fa0a2d0..0ba224ea6ba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1144,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 {
 	int retval;
 	pte_t *pte;
 	spinlock_t *ptl;
+
+	retval = mem_cgroup_charge(page, mm);
+	if (retval)
+		goto out;
 
 	retval = -EINVAL;
 	if (PageAnon(page))
-		goto out;
+		goto out_uncharge;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
-		goto out;
+		goto out_uncharge;
 	retval = -EBUSY;
 	if (!pte_none(*pte))
 		goto out_unlock;
@@ -1165,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
+	pte_unmap_unlock(pte, ptl);
+	return retval;
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
+out_uncharge:
+	mem_cgroup_uncharge_page(page);
 out:
 	return retval;
 }
@@ -1641,6 +1650,9 @@ gotten:
 	cow_user_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
+	if (mem_cgroup_charge(new_page, mm))
+		goto oom_free_new;
+
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
@@ -1672,7 +1684,9 @@ gotten:
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
-	}
+	} else
+		mem_cgroup_uncharge_page(new_page);
+
 	if (new_page)
 		page_cache_release(new_page);
 	if (old_page)
@@ -1696,6 +1710,8 @@ unlock:
 		put_page(dirty_page);
 	}
 	return ret;
+oom_free_new:
+	__free_page(new_page);
 oom:
 	if (old_page)
 		page_cache_release(old_page);
@@ -2036,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(PGMAJFAULT);
 	}
 
+	if (mem_cgroup_charge(page, mm)) {
+		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	mark_page_accessed(page);
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2073,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access) {
 		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+			mem_cgroup_uncharge_page(page);
 			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 
@@ -2085,6 +2109,7 @@ unlock:
 out:
 	return ret;
 out_nomap:
+	mem_cgroup_uncharge_page(page);
 	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
@@ -2114,6 +2139,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
+	if (mem_cgroup_charge(page, mm))
+		goto oom_free_page;
+
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
@@ -2131,8 +2159,11 @@ unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
 release:
+	mem_cgroup_uncharge_page(page);
 	page_cache_release(page);
 	goto unlock;
+oom_free_page:
+	__free_page(page);
 oom:
 	return VM_FAULT_OOM;
 }
@@ -2246,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	}
 
+	if (mem_cgroup_charge(page, mm)) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/*
@@ -2281,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
 	} else {
+		mem_cgroup_uncharge_page(page);
 		if (anon)
 			page_cache_release(page);
 		else
diff --git a/mm/migrate.c b/mm/migrate.c
index 857a987e3690..417bbda14e5b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
@@ -152,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 		return;
 	}
 
+	if (mem_cgroup_charge(new, mm)) {
+		pte_unmap(ptep);
+		return;
+	}
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 37576b822f06..26a54a17dc9f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -987,6 +988,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+	VM_BUG_ON(page_get_page_cgroup(page));
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
@@ -2525,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		page_assign_page_cgroup(page, NULL);
 		SetPageReserved(page);
 
 		/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 57ad276900c9..4a3487921eff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -48,6 +48,7 @@
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 
@@ -554,8 +555,14 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (atomic_inc_and_test(&page->_mapcount))
 		__page_set_anon_rmap(page, vma, address);
-	else
+	else {
 		__page_check_anon_rmap(page, vma, address);
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_cgroup_uncharge_page(page);
+	}
 }
 
 /*
@@ -586,6 +593,12 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
+	else
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_cgroup_uncharge_page(page);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -646,6 +659,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 			page_clear_dirty(page);
 			set_page_dirty(page);
 		}
+		mem_cgroup_uncharge_page(page);
+
 		__dec_zone_page_state(page,
 				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 	}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ec42f01a8d02..f96e3ff1e791 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
@@ -76,6 +77,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
+
+		error = mem_cgroup_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
@@ -86,10 +92,13 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			INC_CACHE_INFO(add_total);
+		} else {
+			mem_cgroup_uncharge_page(page);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 
@@ -104,6 +113,7 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&swapper_space.page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index afae7b1f680b..fddc4cc4149b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,12 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	if (mem_cgroup_charge(page, vma->vm_mm))
+		return -ENOMEM;
+
 	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +524,7 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 	 * immediately swapped out again after swapon.
 	 */
 	activate_page(page);
+	return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -529,7 +534,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
 	spinlock_t *ptl;
-	int found = 0;
+	int ret = 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
@@ -538,13 +543,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
+			ret = unuse_pte(vma, pte++, addr, entry, page);
 			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
-	return found;
+	return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -553,14 +557,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int ret;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -571,14 +577,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int ret;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-			return 1;
+		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -588,6 +596,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 {
 	pgd_t *pgd;
 	unsigned long addr, end, next;
+	int ret;
 
 	if (page->mapping) {
 		addr = page_address_in_vma(page, vma);
@@ -605,8 +614,9 @@ static int unuse_vma(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -615,6 +625,7 @@ static int unuse_mm(struct mm_struct *mm,
 			swp_entry_t entry, struct page *page)
 {
 	struct vm_area_struct *vma;
+	int ret = 0;
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
@@ -627,15 +638,11 @@ static int unuse_mm(struct mm_struct *mm,
 		lock_page(page);
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (vma->anon_vma && unuse_vma(vma, entry, page))
+		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
 			break;
 	}
 	up_read(&mm->mmap_sem);
-	/*
-	 * Currently unuse_mm cannot fail, but leave error handling
-	 * at call sites for now, since we change it from time to time.
-	 */
-	return 0;
+	return (ret < 0)? ret: 0;
 }
 
 /*
641/* 648/*