Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile      |   2
-rw-r--r--  mm/allocpercpu.c |  15
-rw-r--r--  mm/hugetlb.c     |  43
-rw-r--r--  mm/memcontrol.c  | 365
-rw-r--r--  mm/memory.c      |  13
-rw-r--r--  mm/migrate.c     |  19
-rw-r--r--  mm/oom_kill.c    |   2
-rw-r--r--  mm/page_alloc.c  |  21
-rw-r--r--  mm/rmap.c        |   4
-rw-r--r--  mm/shmem.c       |   9
-rw-r--r--  mm/swap.c        |   2
-rw-r--r--  mm/vmscan.c      |   9
12 files changed, 236 insertions(+), 268 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 9f117bab5322..a5b0dd93427a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 7e58322b7134..b0012e27fea8 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -6,6 +6,10 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 
+#ifndef cache_line_size
+#define cache_line_size()	L1_CACHE_BYTES
+#endif
+
 /**
  * percpu_depopulate - depopulate per-cpu data for given cpu
  * @__pdata: per-cpu data to depopulate
@@ -52,6 +56,11 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 	int node = cpu_to_node(cpu);
 
+	/*
+	 * We should make sure each CPU gets private memory.
+	 */
+	size = roundup(size, cache_line_size());
+
 	BUG_ON(pdata->ptrs[cpu]);
 	if (node_online(node))
 		pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
@@ -98,7 +107,11 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
  */
 void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
-	void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
+	/*
+	 * We allocate whole cache lines to avoid false sharing
+	 */
+	size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
+	void *pdata = kzalloc(sz, gfp);
 	void *__pdata = __percpu_disguise(pdata);
 
 	if (unlikely(!pdata))
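
Note on the allocpercpu change above: rounding every per-cpu object up to a cache-line multiple guarantees no two CPUs' objects share a line, which would otherwise bounce that line between caches on every write (false sharing). A minimal user-space sketch of the same arithmetic, with a hard-coded 64-byte line standing in for the kernel's cache_line_size():

	#include <stdio.h>

	#define CACHE_LINE 64	/* assumed line size; the kernel queries cache_line_size() */

	/* Round a size up to the next cache-line multiple, as roundup() does. */
	static size_t roundup_line(size_t size)
	{
		return (size + CACHE_LINE - 1) / CACHE_LINE * CACHE_LINE;
	}

	int main(void)
	{
		size_t sizes[] = { 1, 63, 64, 65, 100 };
		size_t i;

		for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
			printf("%3zu -> %3zu\n", sizes[i], roundup_line(sizes[i]));
		return 0;
	}

With this rounding, kmalloc_node() hands each CPU memory that starts and ends on its own cache line.
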
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 89e6286a7f57..dcacc811e70e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -71,7 +71,25 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+static struct page *dequeue_huge_page(void)
+{
+	int nid;
+	struct page *page = NULL;
+
+	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
+		if (!list_empty(&hugepage_freelists[nid])) {
+			page = list_entry(hugepage_freelists[nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			free_huge_pages--;
+			free_huge_pages_node[nid]--;
+			break;
+		}
+	}
+	return page;
+}
+
+static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 						unsigned long address)
 {
 	int nid;
@@ -296,8 +314,10 @@ static int gather_surplus_pages(int delta)
 	int needed, allocated;
 
 	needed = (resv_huge_pages + delta) - free_huge_pages;
-	if (needed <= 0)
+	if (needed <= 0) {
+		resv_huge_pages += delta;
 		return 0;
+	}
 
 	allocated = 0;
 	INIT_LIST_HEAD(&surplus_list);
@@ -335,9 +355,12 @@ retry:
 	 * The surplus_list now contains _at_least_ the number of extra pages
 	 * needed to accomodate the reservation. Add the appropriate number
 	 * of pages to the hugetlb pool and free the extras back to the buddy
-	 * allocator.
+	 * allocator.  Commit the entire reservation here to prevent another
+	 * process from stealing the pages as they are added to the pool but
+	 * before they are reserved.
 	 */
 	needed += allocated;
+	resv_huge_pages += delta;
 	ret = 0;
 free:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
@@ -371,6 +394,9 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 	struct page *page;
 	unsigned long nr_pages;
 
+	/* Uncommit the reservation */
+	resv_huge_pages -= unused_resv_pages;
+
 	nr_pages = min(unused_resv_pages, surplus_huge_pages);
 
 	while (nr_pages) {
@@ -402,7 +428,7 @@ static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
 	struct page *page;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
+	page = dequeue_huge_page_vma(vma, addr);
 	spin_unlock(&hugetlb_lock);
 	return page ? page : ERR_PTR(-VM_FAULT_OOM);
 }
@@ -417,7 +443,7 @@ static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
 
 	spin_lock(&hugetlb_lock);
 	if (free_huge_pages > resv_huge_pages)
-		page = dequeue_huge_page(vma, addr);
+		page = dequeue_huge_page_vma(vma, addr);
 	spin_unlock(&hugetlb_lock);
 	if (!page) {
 		page = alloc_buddy_huge_page(vma, addr);
@@ -570,7 +596,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	min_count = max(count, min_count);
 	try_to_free_low(min_count);
 	while (min_count < persistent_huge_pages) {
-		struct page *page = dequeue_huge_page(NULL, 0);
+		struct page *page = dequeue_huge_page();
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -1205,12 +1231,13 @@ static int hugetlb_acct_memory(long delta)
 	if (gather_surplus_pages(delta) < 0)
 		goto out;
 
-	if (delta > cpuset_mems_nr(free_huge_pages_node))
+	if (delta > cpuset_mems_nr(free_huge_pages_node)) {
+		return_unused_surplus_pages(delta);
 		goto out;
+	}
 	}
 
 	ret = 0;
-	resv_huge_pages += delta;
 	if (delta < 0)
 		return_unused_surplus_pages((unsigned long) -delta);
 
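
Note on the hugetlb change above: resv_huge_pages is now committed inside gather_surplus_pages() and uncommitted in return_unused_surplus_pages(), so the reservation count changes in the same hugetlb_lock section that adds or removes the pages themselves, and no other task can dequeue pages between "added to the pool" and "reserved". A schematic sketch of that commit-under-the-same-lock pattern, with a pthread mutex standing in for hugetlb_lock and all names invented for illustration:

	#include <pthread.h>

	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
	static long free_pages;	/* pages sitting in the pool */
	static long resv_pages;	/* pages promised to reservations */

	static int reserve_pages(long delta)
	{
		int ret = -1;

		pthread_mutex_lock(&pool_lock);
		if (free_pages - resv_pages >= delta) {
			/*
			 * Commit before dropping the lock: if we unlocked
			 * first, another thread could grab the very pages
			 * we just counted as available.
			 */
			resv_pages += delta;
			ret = 0;
		}
		pthread_mutex_unlock(&pool_lock);
		return ret;
	}

	int main(void)
	{
		free_pages = 10;
		return reserve_pages(4);	/* 0 on success */
	}
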
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 631002d085d1..8b9f6cae938e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -137,14 +137,21 @@ struct mem_cgroup {
 	 */
 	struct mem_cgroup_stat stat;
 };
+static struct mem_cgroup init_mem_cgroup;
 
 /*
  * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is atleast two
- * byte aligned (based on comments from Nick Piggin)
+ * lock. We need to ensure that page->page_cgroup is at least two
+ * byte aligned (based on comments from Nick Piggin). But since
+ * bit_spin_lock doesn't actually set that lock bit in a non-debug
+ * uniprocessor kernel, we should avoid setting it here too.
  */
 #define PAGE_CGROUP_LOCK_BIT	0x0
-#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+#else
+#define PAGE_CGROUP_LOCK	0x0
+#endif
 
 /*
  * A page_cgroup page is associated with every page descriptor. The
@@ -154,37 +161,27 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
-	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
-					/* mapped and cached states     */
-	int flags;
+	int ref_cnt;			/* cached, mapped, migrating */
+	int flags;
 };
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
 
-static inline int page_cgroup_nid(struct page_cgroup *pc)
+static int page_cgroup_nid(struct page_cgroup *pc)
 {
 	return page_to_nid(pc->page);
 }
 
-static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
+static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 {
 	return page_zonenum(pc->page);
 }
 
-enum {
-	MEM_CGROUP_TYPE_UNSPEC = 0,
-	MEM_CGROUP_TYPE_MAPPED,
-	MEM_CGROUP_TYPE_CACHED,
-	MEM_CGROUP_TYPE_ALL,
-	MEM_CGROUP_TYPE_MAX,
-};
-
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
 };
 
-
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
@@ -193,23 +190,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
 {
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
-	VM_BUG_ON(!irqs_disabled());
 
+	VM_BUG_ON(!irqs_disabled());
 	if (flags & PAGE_CGROUP_FLAG_CACHE)
-		__mem_cgroup_stat_add_safe(stat,
-				MEM_CGROUP_STAT_CACHE, val);
+		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
 	else
 		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
 }
 
-static inline struct mem_cgroup_per_zone *
+static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 {
-	BUG_ON(!mem->info.nodeinfo[nid]);
 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
-static inline struct mem_cgroup_per_zone *
+static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct page_cgroup *pc)
 {
 	struct mem_cgroup *mem = pc->mem_cgroup;
@@ -234,18 +229,14 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
 	return total;
 }
 
-static struct mem_cgroup init_mem_cgroup;
-
-static inline
-struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
 	return container_of(cgroup_subsys_state(cont,
 				mem_cgroup_subsys_id), struct mem_cgroup,
 				css);
 }
 
-static inline
-struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
@@ -267,81 +258,33 @@ void mm_free_cgroup(struct mm_struct *mm)
 
 static inline int page_cgroup_locked(struct page *page)
 {
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
-					&page->page_cgroup);
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
-void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	int locked;
-
-	/*
-	 * While resetting the page_cgroup we might not hold the
-	 * page_cgroup lock. free_hot_cold_page() is an example
-	 * of such a scenario
-	 */
-	if (pc)
-		VM_BUG_ON(!page_cgroup_locked(page));
-	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
-	page->page_cgroup = ((unsigned long)pc | locked);
+	VM_BUG_ON(!page_cgroup_locked(page));
+	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return (struct page_cgroup *)
-		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
 }
 
-static void __always_inline lock_page_cgroup(struct page *page)
+static void lock_page_cgroup(struct page *page)
 {
 	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-	VM_BUG_ON(!page_cgroup_locked(page));
-}
-
-static void __always_inline unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
-/*
- * Tie new page_cgroup to struct page under lock_page_cgroup()
- * This can fail if the page has been tied to a page_cgroup.
- * If success, returns 0.
- */
-static int page_cgroup_assign_new_page_cgroup(struct page *page,
-						struct page_cgroup *pc)
+static int try_lock_page_cgroup(struct page *page)
 {
-	int ret = 0;
-
-	lock_page_cgroup(page);
-	if (!page_get_page_cgroup(page))
-		page_assign_page_cgroup(page, pc);
-	else	/* A page is tied to other pc. */
-		ret = 1;
-	unlock_page_cgroup(page);
-	return ret;
+	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
-/*
- * Clear page->page_cgroup member under lock_page_cgroup().
- * If given "pc" value is different from one page->page_cgroup,
- * page->cgroup is not cleared.
- * Returns a value of page->page_cgroup at lock taken.
- * A can can detect failure of clearing by following
- *      clear_page_cgroup(page, pc) == pc
- */
-
-static struct page_cgroup *clear_page_cgroup(struct page *page,
-					struct page_cgroup *pc)
+static void unlock_page_cgroup(struct page *page)
 {
-	struct page_cgroup *ret;
-	/* lock and clear */
-	lock_page_cgroup(page);
-	ret = page_get_page_cgroup(page);
-	if (likely(ret == pc))
-		page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-	return ret;
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
 static void __mem_cgroup_remove_list(struct page_cgroup *pc)
@@ -399,7 +342,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	int ret;
 
 	task_lock(task);
-	ret = task->mm && vm_match_cgroup(task->mm, mem);
+	ret = task->mm && mm_match_cgroup(task->mm, mem);
 	task_unlock(task);
 	return ret;
 }
@@ -407,18 +350,30 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 /*
  * This routine assumes that the appropriate zone's lru lock is already held
  */
-void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+void mem_cgroup_move_lists(struct page *page, bool active)
 {
+	struct page_cgroup *pc;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
 
-	if (!pc)
+	/*
+	 * We cannot lock_page_cgroup while holding zone's lru_lock,
+	 * because other holders of lock_page_cgroup can be interrupted
+	 * with an attempt to rotate_reclaimable_page.  But we cannot
+	 * safely get to page_cgroup without it, so just try_lock it:
+	 * mem_cgroup_isolate_pages allows for page left on wrong list.
+	 */
+	if (!try_lock_page_cgroup(page))
 		return;
 
-	mz = page_cgroup_zoneinfo(pc);
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_move_lists(pc, active);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	pc = page_get_page_cgroup(page);
+	if (pc) {
+		mz = page_cgroup_zoneinfo(pc);
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		__mem_cgroup_move_lists(pc, active);
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+	}
+	unlock_page_cgroup(page);
 }
 
 /*
@@ -437,6 +392,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	return (int)((rss * 100L) / total);
 }
+
 /*
  * This function is called from vmscan.c. In page reclaiming loop. balance
  * between active and inactive list is calculated. For memory controller
@@ -500,7 +456,6 @@ long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 
 	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
-
 	return (nr_inactive >> priority);
 }
 
@@ -586,26 +541,21 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	 * with it
 	 */
 retry:
-	if (page) {
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
-		/*
-		 * The page_cgroup exists and
-		 * the page has already been accounted.
-		 */
-		if (pc) {
-			if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
-				/* this page is under being uncharged ? */
-				unlock_page_cgroup(page);
-				cpu_relax();
-				goto retry;
-			} else {
-				unlock_page_cgroup(page);
-				goto done;
-			}
-		}
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and
+	 * the page has already been accounted.
+	 */
+	if (pc) {
+		VM_BUG_ON(pc->page != page);
+		VM_BUG_ON(pc->ref_cnt <= 0);
+
+		pc->ref_cnt++;
 		unlock_page_cgroup(page);
+		goto done;
 	}
+	unlock_page_cgroup(page);
 
 	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
 	if (pc == NULL)
@@ -623,16 +573,11 @@ retry:
 	rcu_read_lock();
 	mem = rcu_dereference(mm->mem_cgroup);
 	/*
-	 * For every charge from the cgroup, increment reference
-	 * count
+	 * For every charge from the cgroup, increment reference count
 	 */
 	css_get(&mem->css);
 	rcu_read_unlock();
 
-	/*
-	 * If we created the page_cgroup, we should free it on exceeding
-	 * the cgroup limit.
-	 */
 	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (!(gfp_mask & __GFP_WAIT))
 			goto out;
@@ -641,12 +586,12 @@ retry:
 		continue;
 
 		/*
-                 * try_to_free_mem_cgroup_pages() might not give us a full
-                 * picture of reclaim. Some pages are reclaimed and might be
-                 * moved to swap cache or just unmapped from the cgroup.
-                 * Check the limit again to see if the reclaim reduced the
-                 * current usage of the cgroup before giving up
-                 */
+		 * try_to_free_mem_cgroup_pages() might not give us a full
+		 * picture of reclaim. Some pages are reclaimed and might be
+		 * moved to swap cache or just unmapped from the cgroup.
+		 * Check the limit again to see if the reclaim reduced the
+		 * current usage of the cgroup before giving up
+		 */
 		if (res_counter_check_under_limit(&mem->res))
 			continue;
 
@@ -657,14 +602,16 @@ retry:
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	atomic_set(&pc->ref_cnt, 1);
+	pc->ref_cnt = 1;
 	pc->mem_cgroup = mem;
 	pc->page = page;
 	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
 		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
 
-	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
+	lock_page_cgroup(page);
+	if (page_get_page_cgroup(page)) {
+		unlock_page_cgroup(page);
 		/*
 		 * Another charge has been added to this page already.
 		 * We take lock_page_cgroup(page) again and read
@@ -673,17 +620,16 @@ retry:
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
 		kfree(pc);
-		if (!page)
-			goto done;
 		goto retry;
 	}
+	page_assign_page_cgroup(page, pc);
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
-	/* Update statistics vector */
 	__mem_cgroup_add_list(pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 
+	unlock_page_cgroup(page);
 done:
 	return 0;
 out:
@@ -693,70 +639,61 @@ err:
 	return -ENOMEM;
 }
 
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-			gfp_t gfp_mask)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
 			MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
-/*
- * See if the cached pages should be charged at all?
- */
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
-	int ret = 0;
 	if (!mm)
 		mm = &init_mm;
-
-	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
+	return mem_cgroup_charge_common(page, mm, gfp_mask,
 				MEM_CGROUP_CHARGE_TYPE_CACHE);
-	return ret;
 }
 
 /*
  * Uncharging is always a welcome operation, we never complain, simply
- * uncharge. This routine should be called with lock_page_cgroup held
+ * uncharge.
  */
-void mem_cgroup_uncharge(struct page_cgroup *pc)
+void mem_cgroup_uncharge_page(struct page *page)
 {
+	struct page_cgroup *pc;
 	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
-	struct page *page;
 	unsigned long flags;
 
 	/*
 	 * Check if our page_cgroup is valid
 	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
 	if (!pc)
-		return;
+		goto unlock;
 
-	if (atomic_dec_and_test(&pc->ref_cnt)) {
-		page = pc->page;
+	VM_BUG_ON(pc->page != page);
+	VM_BUG_ON(pc->ref_cnt <= 0);
+
+	if (--(pc->ref_cnt) == 0) {
 		mz = page_cgroup_zoneinfo(pc);
-		/*
-		 * get page->cgroup and clear it under lock.
-		 * force_empty can drop page->cgroup without checking refcnt.
-		 */
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		__mem_cgroup_remove_list(pc);
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+		page_assign_page_cgroup(page, NULL);
 		unlock_page_cgroup(page);
-		if (clear_page_cgroup(page, pc) == pc) {
-			mem = pc->mem_cgroup;
-			css_put(&mem->css);
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			spin_lock_irqsave(&mz->lru_lock, flags);
-			__mem_cgroup_remove_list(pc);
-			spin_unlock_irqrestore(&mz->lru_lock, flags);
-			kfree(pc);
-		}
-		lock_page_cgroup(page);
+
+		mem = pc->mem_cgroup;
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+
+		kfree(pc);
+		return;
 	}
-}
 
-void mem_cgroup_uncharge_page(struct page *page)
-{
-	lock_page_cgroup(page);
-	mem_cgroup_uncharge(page_get_page_cgroup(page));
+unlock:
 	unlock_page_cgroup(page);
 }
 
@@ -764,63 +701,59 @@ void mem_cgroup_uncharge_page(struct page *page)
  * Returns non-zero if a page (under migration) has valid page_cgroup member.
  * Refcnt of page_cgroup is incremented.
  */
-
 int mem_cgroup_prepare_migration(struct page *page)
 {
 	struct page_cgroup *pc;
-	int ret = 0;
+
 	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
-	if (pc && atomic_inc_not_zero(&pc->ref_cnt))
-		ret = 1;
+	if (pc)
+		pc->ref_cnt++;
 	unlock_page_cgroup(page);
-	return ret;
+	return pc != NULL;
 }
 
 void mem_cgroup_end_migration(struct page *page)
 {
-	struct page_cgroup *pc;
-
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	mem_cgroup_uncharge(pc);
-	unlock_page_cgroup(page);
+	mem_cgroup_uncharge_page(page);
 }
+
 /*
- * We know both *page* and *newpage* are now not-on-LRU and Pg_locked.
+ * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
  * And no race with uncharge() routines because page_cgroup for *page*
  * has extra one reference by mem_cgroup_prepare_migration.
  */
-
 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
-	unsigned long flags;
 	struct mem_cgroup_per_zone *mz;
-retry:
+	unsigned long flags;
+
+	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
-	if (!pc)
+	if (!pc) {
+		unlock_page_cgroup(page);
 		return;
-	mem = pc->mem_cgroup;
+	}
+
 	mz = page_cgroup_zoneinfo(pc);
-	if (clear_page_cgroup(page, pc) != pc)
-		goto retry;
 	spin_lock_irqsave(&mz->lru_lock, flags);
-
 	__mem_cgroup_remove_list(pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 
+	page_assign_page_cgroup(page, NULL);
+	unlock_page_cgroup(page);
+
 	pc->page = newpage;
 	lock_page_cgroup(newpage);
 	page_assign_page_cgroup(newpage, pc);
-	unlock_page_cgroup(newpage);
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-	return;
+
+	unlock_page_cgroup(newpage);
 }
 
 /*
@@ -829,14 +762,13 @@ retry:
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
  */
 #define FORCE_UNCHARGE_BATCH	(128)
-static void
-mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			struct mem_cgroup_per_zone *mz,
 			int active)
 {
 	struct page_cgroup *pc;
 	struct page *page;
-	int count;
+	int count = FORCE_UNCHARGE_BATCH;
 	unsigned long flags;
 	struct list_head *list;
 
@@ -845,46 +777,36 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	else
 		list = &mz->inactive_list;
 
-	if (list_empty(list))
-		return;
-retry:
-	count = FORCE_UNCHARGE_BATCH;
 	spin_lock_irqsave(&mz->lru_lock, flags);
-
-	while (--count && !list_empty(list)) {
+	while (!list_empty(list)) {
 		pc = list_entry(list->prev, struct page_cgroup, lru);
 		page = pc->page;
-		/* Avoid race with charge */
-		atomic_set(&pc->ref_cnt, 0);
-		if (clear_page_cgroup(page, pc) == pc) {
-			css_put(&mem->css);
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			__mem_cgroup_remove_list(pc);
-			kfree(pc);
-		} else	/* being uncharged ? ...do relax */
-			break;
+		get_page(page);
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+		mem_cgroup_uncharge_page(page);
+		put_page(page);
+		if (--count <= 0) {
+			count = FORCE_UNCHARGE_BATCH;
+			cond_resched();
+		}
+		spin_lock_irqsave(&mz->lru_lock, flags);
 	}
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-	if (!list_empty(list)) {
-		cond_resched();
-		goto retry;
-	}
-	return;
 }
 
 /*
  * make mem_cgroup's charge to be 0 if there is no task.
  * This enables deleting this mem_cgroup.
  */
-
-int mem_cgroup_force_empty(struct mem_cgroup *mem)
+static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 {
 	int ret = -EBUSY;
 	int node, zid;
+
 	css_get(&mem->css);
 	/*
 	 * page reclaim code (kswapd etc..) will move pages between
 	 * active_list <-> inactive_list while we don't take a lock.
 	 * So, we have to do loop here until all lists are empty.
 	 */
 	while (mem->res.usage > 0) {
@@ -906,9 +828,7 @@ out:
 	return ret;
 }
 
-
-
-int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
 {
 	*tmp = memparse(buf, &buf);
 	if (*buf != '\0')
@@ -945,8 +865,7 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
 				size_t nbytes, loff_t *ppos)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-	int ret;
-	ret = mem_cgroup_force_empty(mem);
+	int ret = mem_cgroup_force_empty(mem);
 	if (!ret)
 		ret = nbytes;
 	return ret;
@@ -955,7 +874,6 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
 /*
  * Note: This should be removed if cgroup supports write-only file.
  */
-
 static ssize_t mem_force_empty_read(struct cgroup *cont,
 				struct cftype *cft,
 				struct file *file, char __user *userbuf,
@@ -964,7 +882,6 @@ static ssize_t mem_force_empty_read(struct cgroup *cont,
 	return -EINVAL;
 }
 
-
 static const struct mem_cgroup_stat_desc {
 	const char *msg;
 	u64 unit;
@@ -1017,8 +934,6 @@ static int mem_control_stat_open(struct inode *unused, struct file *file)
 	return single_open(file, mem_control_stat_show, cont);
 }
 
-
-
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -1084,9 +999,6 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 	kfree(mem->info.nodeinfo[node]);
 }
 
-
-static struct mem_cgroup init_mem_cgroup;
-
 static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -1176,7 +1088,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 
 out:
 	mmput(mm);
-	return;
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {
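
Note on the locking scheme above: page->page_cgroup packs a pointer and a lock into one word — bit 0 is the bit-spinlock, the remaining bits are the (at least two-byte-aligned) page_cgroup pointer — and the new try_lock_page_cgroup() is what lets mem_cgroup_move_lists() avoid deadlock while the zone's lru_lock is held. A user-space sketch of the encoding, using GCC __atomic builtins in place of bit_spin_lock (illustrative only, not the kernel implementation):

	#include <stdint.h>

	#define LOCK_BIT 0x1UL	/* the pointer's low bit doubles as the lock */

	/* Spin until we own bit 0; the pointer bits are never modified. */
	static void word_lock(volatile unsigned long *word)
	{
		while (__atomic_fetch_or(word, LOCK_BIT, __ATOMIC_ACQUIRE) & LOCK_BIT)
			;	/* spin */
	}

	/* One attempt only: returns nonzero if we took the lock. */
	static int word_trylock(volatile unsigned long *word)
	{
		return !(__atomic_fetch_or(word, LOCK_BIT, __ATOMIC_ACQUIRE) & LOCK_BIT);
	}

	static void word_unlock(volatile unsigned long *word)
	{
		__atomic_fetch_and(word, ~LOCK_BIT, __ATOMIC_RELEASE);
	}

	/* Strip the lock bit to recover the pointer, as page_get_page_cgroup() does. */
	static void *word_ptr(unsigned long word)
	{
		return (void *)(word & ~LOCK_BIT);
	}

	int main(void)
	{
		static int value = 42;
		volatile unsigned long word = (unsigned long)&value;	/* bit 0 clear */

		word_lock(&word);
		*(int *)word_ptr(word) += 1;	/* pointer still usable while locked */
		word_unlock(&word);
		return !word_trylock(&word);	/* trylock succeeds on an unlocked word */
	}

This only works because page_cgroup is at least two-byte aligned, so bit 0 of a valid pointer is always zero.
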
diff --git a/mm/memory.c b/mm/memory.c
index ce3c9e4492d8..0d14d1e58a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1711,7 +1711,7 @@ unlock:
 	}
 	return ret;
 oom_free_new:
-	__free_page(new_page);
+	page_cache_release(new_page);
 oom:
 	if (old_page)
 		page_cache_release(old_page);
@@ -2093,12 +2093,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unlock_page(page);
 
 	if (write_access) {
-		/* XXX: We could OR the do_wp_page code with this one? */
-		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
-			mem_cgroup_uncharge_page(page);
-			ret = VM_FAULT_OOM;
-		}
+		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		if (ret & VM_FAULT_ERROR)
+			ret &= VM_FAULT_ERROR;
 		goto out;
 	}
 
@@ -2163,7 +2160,7 @@ release:
 	page_cache_release(page);
 	goto unlock;
 oom_free_page:
-	__free_page(page);
+	page_cache_release(page);
 oom:
 	return VM_FAULT_OOM;
 }
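
Note on the memory.c change above: do_swap_page() now ORs do_wp_page()'s return flags into ret and, if any error bit is set, masks ret down to just the error bits, so the caller sees the failure without stale success flags. A tiny sketch of that flag-accumulation idiom, with invented flag values:

	#include <stdio.h>

	#define FAULT_MAJOR	0x1	/* success detail: major fault */
	#define FAULT_OOM	0x2
	#define FAULT_SIGBUS	0x4
	#define FAULT_ERROR	(FAULT_OOM | FAULT_SIGBUS)

	static int nested_step(void)
	{
		return FAULT_OOM;	/* pretend the nested fault handler OOMed */
	}

	int main(void)
	{
		int ret = FAULT_MAJOR;		/* flags collected so far */

		ret |= nested_step();		/* fold in the nested result */
		if (ret & FAULT_ERROR)
			ret &= FAULT_ERROR;	/* on error, keep only error bits */

		printf("ret=%#x\n", ret);	/* prints ret=0x2 */
		return 0;
	}
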
diff --git a/mm/migrate.c b/mm/migrate.c
index a73504ff5ab9..4e0eccca5e26 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -153,11 +153,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 		return;
 	}
 
-	if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
-		pte_unmap(ptep);
-		return;
-	}
-
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
@@ -169,6 +164,20 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 		goto out;
 
+	/*
+	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
+	 * Failure is not an option here: we're now expected to remove every
+	 * migration pte, and will cause crashes otherwise.  Normally this
+	 * is not an issue: mem_cgroup_prepare_migration bumped up the old
+	 * page_cgroup count for safety, that's now attached to the new page,
+	 * so this charge should just be another incrementation of the count,
+	 * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
+	 * there's been a force_empty, those reference counts may no longer
+	 * be reliable, and this charge can actually fail: oh well, we don't
+	 * make the situation any worse by proceeding as if it had succeeded.
+	 */
+	mem_cgroup_charge(new, mm, GFP_ATOMIC);
+
 	get_page(new);
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4194b9db0104..44b2da11bf43 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,7 +412,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	return oom_kill_task(p);
 }
 
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
 	unsigned long points = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8896e874a67d..402a504f1228 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,6 +19,7 @@
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
+#include <linux/jiffies.h>
 #include <linux/bootmem.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -221,13 +222,19 @@ static inline int bad_range(struct zone *zone, struct page *page)
 
 static void bad_page(struct page *page)
 {
-	printk(KERN_EMERG "Bad page state in process '%s'\n"
-		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
-		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
-		KERN_EMERG "Backtrace:\n",
+	void *pc = page_get_page_cgroup(page);
+
+	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
+		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
 		current->comm, page, (int)(2*sizeof(unsigned long)),
 		(unsigned long)page->flags, page->mapping,
 		page_mapcount(page), page_count(page));
+	if (pc) {
+		printk(KERN_EMERG "cgroup:%p\n", pc);
+		page_reset_bad_cgroup(page);
+	}
+	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+		KERN_EMERG "Backtrace:\n");
 	dump_stack();
 	page->flags &= ~(1 << PG_lru |
 			1 << PG_private |
@@ -453,6 +460,7 @@ static inline int free_pages_check(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL) |
+		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0) |
 		(page->flags & (
 			1 << PG_lru |
@@ -602,6 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL) |
+		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0) |
 		(page->flags & (
 			1 << PG_lru |
@@ -988,7 +997,6 @@ static void free_hot_cold_page(struct page *page, int cold)
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-	VM_BUG_ON(page_get_page_cgroup(page));
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
@@ -1276,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 	if (!zlc)
 		return NULL;
 
-	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
 		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 		zlc->last_full_zap = jiffies;
 	}
@@ -2527,7 +2535,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
-		page_assign_page_cgroup(page, NULL);
 		SetPageReserved(page);
 
 		/*
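
Note on the zlc_setup() change above: comparing jiffies with time_after() instead of open-coded subtraction keeps the one-second zap interval correct when the jiffies counter wraps around. The macro subtracts in unsigned arithmetic and tests the sign of the result; a self-contained sketch of the same trick using a 32-bit counter:

	#include <stdio.h>

	/*
	 * Same idea as the kernel's time_after(a, b): reduce b - a modulo
	 * 2^32, then let the signed test decide which side of the wrap
	 * we are on.
	 */
	#define my_time_after(a, b) ((int)((unsigned int)(b) - (unsigned int)(a)) < 0)

	int main(void)
	{
		unsigned int stamp = 0xfffffff0u;	/* "last zap", just before wrap */
		unsigned int now = stamp + 0x20;	/* wraps past zero */

		printf("%d\n", my_time_after(now, stamp));	/* 1: now is later */
		printf("%d\n", my_time_after(stamp, now));	/* 0 */
		return 0;
	}
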
diff --git a/mm/rmap.c b/mm/rmap.c
index 8fd527c4e2bf..0c9a2df06c39 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -321,7 +321,7 @@ static int page_referenced_anon(struct page *page,
 		 * counting on behalf of references from different
 		 * cgroups
 		 */
-		if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont))
+		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
 			continue;
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
@@ -382,7 +382,7 @@ static int page_referenced_file(struct page *page,
 		 * counting on behalf of references from different
 		 * cgroups
 		 */
-		if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont))
+		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
 			continue;
 		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 				== (VM_LOCKED|VM_MAYSHARE)) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 90b576cbc06e..3372bc579e89 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1370,14 +1370,17 @@ repeat:
 			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
 			unlock_page(swappage);
-			page_cache_release(swappage);
 			if (error == -ENOMEM) {
 				/* allow reclaim from this memory cgroup */
-				error = mem_cgroup_cache_charge(NULL,
+				error = mem_cgroup_cache_charge(swappage,
 					current->mm, gfp & ~__GFP_HIGHMEM);
-				if (error)
+				if (error) {
+					page_cache_release(swappage);
 					goto failed;
+				}
+				mem_cgroup_uncharge_page(swappage);
 			}
+			page_cache_release(swappage);
 			goto repeat;
 		}
 	} else if (sgp == SGP_READ && !filepage) {
diff --git a/mm/swap.c b/mm/swap.c
index 710a20bb9749..d4ec59aa5c46 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -176,7 +176,7 @@ void activate_page(struct page *page)
 		SetPageActive(page);
 		add_page_to_active_list(zone, page);
 		__count_vm_event(PGACTIVATE);
-		mem_cgroup_move_lists(page_get_page_cgroup(page), true);
+		mem_cgroup_move_lists(page, true);
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a26dabd62fed..45711585684e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -126,7 +126,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 #define scan_global_lru(sc)	(!(sc)->mem_cgroup)
 #else
 #define scan_global_lru(sc)	(1)
@@ -1128,7 +1128,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->inactive_list);
-		mem_cgroup_move_lists(page_get_page_cgroup(page), false);
+		mem_cgroup_move_lists(page, false);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1156,8 +1156,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 		VM_BUG_ON(!PageActive(page));
+
 		list_move(&page->lru, &zone->active_list);
-		mem_cgroup_move_lists(page_get_page_cgroup(page), true);
+		mem_cgroup_move_lists(page, true);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1427,7 +1428,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 	return do_try_to_free_pages(zones, gfp_mask, &sc);
 }
 
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 						gfp_t gfp_mask)