 include/linux/memcontrol.h |  20
 mm/filemap.c               |  12
 mm/memcontrol.c            | 166
 mm/memory.c                |  47
 mm/migrate.c               |   6
 mm/page_alloc.c            |   3
 mm/rmap.c                  |  17
 mm/swap_state.c            |  10
 mm/swapfile.c              |  41
 9 files changed, 295 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7d1f119c796e..f5b47efab48b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,13 @@ extern void mm_free_cgroup(struct mm_struct *mm);
 extern void page_assign_page_cgroup(struct page *page,
                                         struct page_cgroup *pc);
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
+extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
+extern void mem_cgroup_uncharge(struct page_cgroup *pc);
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+        mem_cgroup_uncharge(page_get_page_cgroup(page));
+}
 
 #else /* CONFIG_CGROUP_MEM_CONT */
 static inline void mm_init_cgroup(struct mm_struct *mm,
@@ -51,6 +58,19 @@ static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
 	return NULL;
 }
 
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+        return 0;
+}
+
+static inline void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+}
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
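
For reference, every caller converted below uses this interface in the same charge-before-commit, uncharge-on-failure pattern. A minimal sketch of that pattern (illustration only, not part of the patch; example_insert() and insert_into_cache() are placeholders, not kernel functions):

        int example_insert(struct page *page, struct mm_struct *mm)
        {
                int error;

                /* account the page to mm's cgroup before it becomes visible */
                error = mem_cgroup_charge(page, mm);
                if (error)
                        return error;   /* the cgroup is over its limit */

                error = insert_into_cache(page);        /* placeholder for the real insertion */
                if (error)
                        mem_cgroup_uncharge_page(page); /* roll the charge back */
                return error;
        }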
diff --git a/mm/filemap.c b/mm/filemap.c
index 81fb9bff0d4f..b7a01e927953 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -118,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
         struct address_space *mapping = page->mapping;
 
+        mem_cgroup_uncharge_page(page);
         radix_tree_delete(&mapping->page_tree, page->index);
         page->mapping = NULL;
         mapping->nrpages--;
@@ -461,6 +463,11 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
         int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
         if (error == 0) {
+
+                error = mem_cgroup_charge(page, current->mm);
+                if (error)
+                        goto out;
+
                 write_lock_irq(&mapping->tree_lock);
                 error = radix_tree_insert(&mapping->page_tree, offset, page);
                 if (!error) {
@@ -470,10 +477,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
                         page->index = offset;
                         mapping->nrpages++;
                         __inc_zone_page_state(page, NR_FILE_PAGES);
-                }
+                } else
+                        mem_cgroup_uncharge_page(page);
+
                 write_unlock_irq(&mapping->tree_lock);
                 radix_tree_preload_end();
         }
+out:
         return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a cgroup at it's low water mark, this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
         struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is atleast two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CGROUP_LOCK_BIT    0x0
+#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -56,6 +69,8 @@ struct page_cgroup {
         struct list_head lru;           /* per cgroup LRU list */
         struct page *page;
         struct mem_cgroup *mem_cgroup;
+        atomic_t ref_cnt;               /* Helpful when pages move b/w  */
+                                        /* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
         css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+                                        &page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-        page->page_cgroup = (unsigned long)pc;
+        int locked;
+
+        /*
+         * While resetting the page_cgroup we might not hold the
+         * page_cgroup lock. free_hot_cold_page() is an example
+         * of such a scenario
+         */
+        if (pc)
+                VM_BUG_ON(!page_cgroup_locked(page));
+        locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+        page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-        return page->page_cgroup;
+        return (struct page_cgroup *)
+                (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+        VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+        struct mem_cgroup *mem;
+        struct page_cgroup *pc, *race_pc;
+
+        /*
+         * Should page_cgroup's go to their own slab?
+         * One could optimize the performance of the charging routine
+         * by saving a bit in the page_flags and using it as a lock
+         * to see if the cgroup page already has a page_cgroup associated
+         * with it
+         */
+        lock_page_cgroup(page);
+        pc = page_get_page_cgroup(page);
+        /*
+         * The page_cgroup exists and the page has already been accounted
+         */
+        if (pc) {
+                atomic_inc(&pc->ref_cnt);
+                goto done;
+        }
+
+        unlock_page_cgroup(page);
+
+        pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+        if (pc == NULL)
+                goto err;
+
+        rcu_read_lock();
+        /*
+         * We always charge the cgroup the mm_struct belongs to
+         * the mm_struct's mem_cgroup changes on task migration if the
+         * thread group leader migrates. It's possible that mm is not
+         * set, if so charge the init_mm (happens for pagecache usage).
+         */
+        if (!mm)
+                mm = &init_mm;
+
+        mem = rcu_dereference(mm->mem_cgroup);
+        /*
+         * For every charge from the cgroup, increment reference
+         * count
+         */
+        css_get(&mem->css);
+        rcu_read_unlock();
+
+        /*
+         * If we created the page_cgroup, we should free it on exceeding
+         * the cgroup limit.
+         */
+        if (res_counter_charge(&mem->res, 1)) {
+                css_put(&mem->css);
+                goto free_pc;
+        }
+
+        lock_page_cgroup(page);
+        /*
+         * Check if somebody else beat us to allocating the page_cgroup
+         */
+        race_pc = page_get_page_cgroup(page);
+        if (race_pc) {
+                kfree(pc);
+                pc = race_pc;
+                atomic_inc(&pc->ref_cnt);
+                res_counter_uncharge(&mem->res, 1);
+                css_put(&mem->css);
+                goto done;
+        }
+
+        atomic_set(&pc->ref_cnt, 1);
+        pc->mem_cgroup = mem;
+        pc->page = page;
+        page_assign_page_cgroup(page, pc);
+
+done:
+        unlock_page_cgroup(page);
+        return 0;
+free_pc:
+        kfree(pc);
+        return -ENOMEM;
+err:
+        unlock_page_cgroup(page);
+        return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+        struct mem_cgroup *mem;
+        struct page *page;
+
+        if (!pc)
+                return;
+
+        if (atomic_dec_and_test(&pc->ref_cnt)) {
+                page = pc->page;
+                lock_page_cgroup(page);
+                mem = pc->mem_cgroup;
+                css_put(&mem->css);
+                page_assign_page_cgroup(page, NULL);
+                unlock_page_cgroup(page);
+                res_counter_uncharge(&mem->res, 1);
+                kfree(pc);
+        }
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                 return NULL;
 
         res_counter_init(&mem->res);
+        INIT_LIST_HEAD(&mem->active_list);
+        INIT_LIST_HEAD(&mem->inactive_list);
         return &mem->css;
 }
 
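
Because bit 0 of page->page_cgroup doubles as the bit spin lock, the pointer must always be masked before use, and the lock bit must be preserved when the pointer is reassigned. A rough illustration of the encoding (assuming, as the comment above does, that page_cgroup allocations are at least two-byte aligned so bit 0 of the pointer is always free):

        /* page->page_cgroup layout (illustration only):
         *
         *   bits N..1 : pointer to the struct page_cgroup, or 0 if none
         *   bit  0    : PAGE_CGROUP_LOCK, taken with bit_spin_lock()
         */
        unsigned long word = page->page_cgroup;
        struct page_cgroup *pc = (struct page_cgroup *)(word & ~PAGE_CGROUP_LOCK);
        int locked = word & PAGE_CGROUP_LOCK;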
diff --git a/mm/memory.c b/mm/memory.c
index 9d073fa0a2d0..0ba224ea6ba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1144,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 {
         int retval;
         pte_t *pte;
         spinlock_t *ptl;
+
+        retval = mem_cgroup_charge(page, mm);
+        if (retval)
+                goto out;
 
         retval = -EINVAL;
         if (PageAnon(page))
-                goto out;
+                goto out_uncharge;
         retval = -ENOMEM;
         flush_dcache_page(page);
         pte = get_locked_pte(mm, addr, &ptl);
         if (!pte)
-                goto out;
+                goto out_uncharge;
         retval = -EBUSY;
         if (!pte_none(*pte))
                 goto out_unlock;
@@ -1165,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
         set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
         retval = 0;
+        pte_unmap_unlock(pte, ptl);
+        return retval;
 out_unlock:
         pte_unmap_unlock(pte, ptl);
+out_uncharge:
+        mem_cgroup_uncharge_page(page);
 out:
         return retval;
 }
@@ -1641,6 +1650,9 @@ gotten:
         cow_user_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);
 
+        if (mem_cgroup_charge(new_page, mm))
+                goto oom_free_new;
+
         /*
          * Re-check the pte - we dropped the lock
          */
@@ -1672,7 +1684,9 @@ gotten:
                 /* Free the old page.. */
                 new_page = old_page;
                 ret |= VM_FAULT_WRITE;
-        }
+        } else
+                mem_cgroup_uncharge_page(new_page);
+
         if (new_page)
                 page_cache_release(new_page);
         if (old_page)
@@ -1696,6 +1710,8 @@ unlock:
                 put_page(dirty_page);
         }
         return ret;
+oom_free_new:
+        __free_page(new_page);
 oom:
         if (old_page)
                 page_cache_release(old_page);
@@ -2036,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 count_vm_event(PGMAJFAULT);
         }
 
+        if (mem_cgroup_charge(page, mm)) {
+                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                ret = VM_FAULT_OOM;
+                goto out;
+        }
+
         mark_page_accessed(page);
         lock_page(page);
         delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2073,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (write_access) {
                 /* XXX: We could OR the do_wp_page code with this one? */
                 if (do_wp_page(mm, vma, address,
-                                page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+                                page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+                        mem_cgroup_uncharge_page(page);
                         ret = VM_FAULT_OOM;
+                }
                 goto out;
         }
 
@@ -2085,6 +2109,7 @@ unlock:
 out:
         return ret;
 out_nomap:
+        mem_cgroup_uncharge_page(page);
         pte_unmap_unlock(page_table, ptl);
         unlock_page(page);
         page_cache_release(page);
@@ -2114,6 +2139,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto oom;
         __SetPageUptodate(page);
 
+        if (mem_cgroup_charge(page, mm))
+                goto oom_free_page;
+
         entry = mk_pte(page, vma->vm_page_prot);
         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
@@ -2131,8 +2159,11 @@ unlock:
         pte_unmap_unlock(page_table, ptl);
         return 0;
 release:
+        mem_cgroup_uncharge_page(page);
         page_cache_release(page);
         goto unlock;
+oom_free_page:
+        __free_page(page);
 oom:
         return VM_FAULT_OOM;
 }
@@ -2246,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
         }
 
+        if (mem_cgroup_charge(page, mm)) {
+                ret = VM_FAULT_OOM;
+                goto out;
+        }
+
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
         /*
@@ -2281,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 /* no need to invalidate: a not-present page won't be cached */
                 update_mmu_cache(vma, address, entry);
         } else {
+                mem_cgroup_uncharge_page(page);
                 if (anon)
                         page_cache_release(page);
                 else
diff --git a/mm/migrate.c b/mm/migrate.c
index 857a987e3690..417bbda14e5b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
@@ -152,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
                 return;
         }
 
+        if (mem_cgroup_charge(new, mm)) {
+                pte_unmap(ptep);
+                return;
+        }
+
         ptl = pte_lockptr(mm, pmd);
         spin_lock(ptl);
         pte = *ptep;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 37576b822f06..26a54a17dc9f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -987,6 +988,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 
         if (!PageHighMem(page))
                 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+        VM_BUG_ON(page_get_page_cgroup(page));
         arch_free_page(page, 0);
         kernel_map_pages(page, 1, 0);
 
@@ -2525,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 set_page_links(page, zone, nid, pfn);
                 init_page_count(page);
                 reset_page_mapcount(page);
+                page_assign_page_cgroup(page, NULL);
                 SetPageReserved(page);
 
                 /*
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -48,6 +48,7 @@
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 
@@ -554,8 +555,14 @@ void page_add_anon_rmap(struct page *page,
         VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
         if (atomic_inc_and_test(&page->_mapcount))
                 __page_set_anon_rmap(page, vma, address);
-        else
+        else {
                 __page_check_anon_rmap(page, vma, address);
+                /*
+                 * We unconditionally charged during prepare, we uncharge here
+                 * This takes care of balancing the reference counts
+                 */
+                mem_cgroup_uncharge_page(page);
+        }
 }
 
 /*
@@ -586,6 +593,12 @@ void page_add_file_rmap(struct page *page)
 {
         if (atomic_inc_and_test(&page->_mapcount))
                 __inc_zone_page_state(page, NR_FILE_MAPPED);
+        else
+                /*
+                 * We unconditionally charged during prepare, we uncharge here
+                 * This takes care of balancing the reference counts
+                 */
+                mem_cgroup_uncharge_page(page);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -646,6 +659,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
                         page_clear_dirty(page);
                         set_page_dirty(page);
                 }
+                mem_cgroup_uncharge_page(page);
+
                 __dec_zone_page_state(page,
                         PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
         }
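
The fault paths above charge a page unconditionally while it is being prepared; if the rmap code then finds the page was already mapped (the mapcount increment does not cross zero), the extra charge is dropped here so the page_cgroup reference count stays balanced. Roughly (an illustration of the intended flow, not patch code):

        /*
         * mem_cgroup_charge(page, mm);                      pc->ref_cnt: 1 -> 2
         * ...
         * page_add_file_rmap(page);
         *   atomic_inc_and_test(&page->_mapcount) is false  (page already mapped)
         *   mem_cgroup_uncharge_page(page);                  pc->ref_cnt: 2 -> 1
         */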
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ec42f01a8d02..f96e3ff1e791 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
@@ -76,6 +77,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
         BUG_ON(PagePrivate(page));
         error = radix_tree_preload(gfp_mask);
         if (!error) {
+
+                error = mem_cgroup_charge(page, current->mm);
+                if (error)
+                        goto out;
+
                 write_lock_irq(&swapper_space.tree_lock);
                 error = radix_tree_insert(&swapper_space.page_tree,
                                                 entry.val, page);
@@ -86,10 +92,13 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
                         total_swapcache_pages++;
                         __inc_zone_page_state(page, NR_FILE_PAGES);
                         INC_CACHE_INFO(add_total);
+                } else {
+                        mem_cgroup_uncharge_page(page);
                 }
                 write_unlock_irq(&swapper_space.tree_lock);
                 radix_tree_preload_end();
         }
+out:
         return error;
 }
 
@@ -104,6 +113,7 @@ void __delete_from_swap_cache(struct page *page)
         BUG_ON(PageWriteback(page));
         BUG_ON(PagePrivate(page));
 
+        mem_cgroup_uncharge_page(page);
         radix_tree_delete(&swapper_space.page_tree, page_private(page));
         set_page_private(page, 0);
         ClearPageSwapCache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index afae7b1f680b..fddc4cc4149b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,12 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
                 unsigned long addr, swp_entry_t entry, struct page *page)
 {
+        if (mem_cgroup_charge(page, vma->vm_mm))
+                return -ENOMEM;
+
         inc_mm_counter(vma->vm_mm, anon_rss);
         get_page(page);
         set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +524,7 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
          * immediately swapped out again after swapon.
          */
         activate_page(page);
+        return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -529,7 +534,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
         pte_t swp_pte = swp_entry_to_pte(entry);
         pte_t *pte;
         spinlock_t *ptl;
-        int found = 0;
+        int ret = 0;
 
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         do {
@@ -538,13 +543,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                  * Test inline before going to call unuse_pte.
                  */
                 if (unlikely(pte_same(*pte, swp_pte))) {
-                        unuse_pte(vma, pte++, addr, entry, page);
-                        found = 1;
+                        ret = unuse_pte(vma, pte++, addr, entry, page);
                         break;
                 }
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap_unlock(pte - 1, ptl);
-        return found;
+        return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -553,14 +557,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
         pmd_t *pmd;
         unsigned long next;
+        int ret;
 
         pmd = pmd_offset(pud, addr);
         do {
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd))
                         continue;
-                if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pmd++, addr = next, addr != end);
         return 0;
 }
@@ -571,14 +577,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 {
         pud_t *pud;
         unsigned long next;
+        int ret;
 
         pud = pud_offset(pgd, addr);
         do {
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-                if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pud++, addr = next, addr != end);
         return 0;
 }
@@ -588,6 +596,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 {
         pgd_t *pgd;
         unsigned long addr, end, next;
+        int ret;
 
         if (page->mapping) {
                 addr = page_address_in_vma(page, vma);
@@ -605,8 +614,9 @@ static int unuse_vma(struct vm_area_struct *vma,
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-                if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pgd++, addr = next, addr != end);
         return 0;
 }
@@ -615,6 +625,7 @@ static int unuse_mm(struct mm_struct *mm,
                 swp_entry_t entry, struct page *page)
 {
         struct vm_area_struct *vma;
+        int ret = 0;
 
         if (!down_read_trylock(&mm->mmap_sem)) {
                 /*
@@ -627,15 +638,11 @@ static int unuse_mm(struct mm_struct *mm,
                 lock_page(page);
         }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                if (vma->anon_vma && unuse_vma(vma, entry, page))
+                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
                         break;
         }
         up_read(&mm->mmap_sem);
-        /*
-         * Currently unuse_mm cannot fail, but leave error handling
-         * at call sites for now, since we change it from time to time.
-         */
-        return 0;
+        return (ret < 0)? ret: 0;
 }
 
 /*
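
With these changes unuse_pte() can fail, so the swapoff page-table walk now propagates a tri-state result instead of a boolean. A summary of the new convention (illustration, not patch code):

        /*
         * unuse_pte()        returns 1 on success, -ENOMEM if mem_cgroup_charge() fails
         * unuse_pte_range()  returns 0 if the swap entry was not found in the range,
         *                    otherwise whatever unuse_pte() returned
         * unuse_pmd_range(), unuse_pud_range(), unuse_vma()
         *                    pass the first non-zero result back up the walk
         * unuse_mm()         returns (ret < 0) ? ret : 0, so the caller only sees
         *                    real errors
         */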