Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 466 ++++++++++++++++++++++++------------------------------
 1 file changed, 215 insertions(+), 251 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36896f3eb7f..d4a92b63e98 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,11 +32,12 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/uaccess.h>
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
 /*
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
-static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
+static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
 		enum mem_cgroup_stat_index idx, int val)
 {
-	int cpu = smp_processor_id();
-	stat->cpustat[cpu].count[idx] += val;
+	stat->count[idx] += val;
 }
 
 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
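A note on the hunk above: __mem_cgroup_stat_add_safe() now receives one CPU's counter block rather than the whole per-cgroup stat structure, so smp_processor_id() is looked up once by the caller. A sketch of the layout this assumes, following the mem_cgroup_stat/mem_cgroup_stat_cpu pair used elsewhere in this file (field spelling is an assumption):

    /* Sketch: one cache-line-aligned counter block per CPU. */
    struct mem_cgroup_stat_cpu {
            s64 count[MEM_CGROUP_STAT_NSTATS];
    } ____cacheline_aligned_in_smp;

    struct mem_cgroup_stat {
            struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
    };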
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 /*
  * per-zone information in memory controller.
  */
-
-enum mem_cgroup_zstat_index {
-	MEM_CGROUP_ZSTAT_ACTIVE,
-	MEM_CGROUP_ZSTAT_INACTIVE,
-
-	NR_MEM_CGROUP_ZSTAT,
-};
-
 struct mem_cgroup_per_zone {
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
 	spinlock_t lru_lock;
-	struct list_head active_list;
-	struct list_head inactive_list;
-	unsigned long count[NR_MEM_CGROUP_ZSTAT];
+	struct list_head lists[NR_LRU_LISTS];
+	unsigned long count[NR_LRU_LISTS];
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
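The new lists[NR_LRU_LISTS] and count[NR_LRU_LISTS] arrays reuse the global split-LRU indexing, so a single integer selects among anon/file, active/inactive, and the unevictable list. For reference, a sketch of the index scheme assumed throughout this patch (mirroring enum lru_list in include/linux/mmzone.h from this series):

    #define LRU_BASE   0
    #define LRU_ACTIVE 1
    #define LRU_FILE   2

    enum lru_list {
            LRU_INACTIVE_ANON = LRU_BASE,
            LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,
            LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
            LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,
            LRU_UNEVICTABLE,
            NR_LRU_LISTS
    };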
@@ -144,69 +135,52 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
 
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin). But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	int flags;
-};
-#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */
-
-static int page_cgroup_nid(struct page_cgroup *pc)
-{
-	return page_to_nid(pc->page);
-}
-
-static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
-{
-	return page_zonenum(pc->page);
-}
-
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
+	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	NR_CHARGE_TYPE,
+};
+
+/* only for here (for easy reading.) */
+#define PCGF_CACHE	(1UL << PCG_CACHE)
+#define PCGF_USED	(1UL << PCG_USED)
+#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
+#define PCGF_LOCK	(1UL << PCG_LOCK)
+#define PCGF_FILE	(1UL << PCG_FILE)
+static const unsigned long
+pcg_default_flags[NR_CHARGE_TYPE] = {
+	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
+	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
+	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+	0, /* FORCE */
 };
 
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
-					bool charge)
+static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+					 struct page_cgroup *pc,
+					 bool charge)
 {
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
+	struct mem_cgroup_stat_cpu *cpustat;
 
 	VM_BUG_ON(!irqs_disabled());
-	if (flags & PAGE_CGROUP_FLAG_CACHE)
-		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
+
+	cpustat = &stat->cpustat[smp_processor_id()];
+	if (PageCgroupCache(pc))
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 	else
-		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 
 	if (charge)
-		__mem_cgroup_stat_add_safe(stat,
+		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 	else
-		__mem_cgroup_stat_add_safe(stat,
+		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 }
 
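The PageCgroupCache()/PageCgroupUsed() tests and the PCG_* bits above come from the new <linux/page_cgroup.h>. A minimal sketch of the flag-helper pattern assumed here, modeled on the kernel's page-flags macro idiom (the exact bit order in the real header may differ):

    enum page_cgroup_flags {
            PCG_LOCK,   /* per-page_cgroup lock bit */
            PCG_CACHE,  /* charged as page cache */
            PCG_USED,   /* accounted to a cgroup */
            PCG_ACTIVE, /* on an active LRU list */
            PCG_FILE,   /* file-backed page */
            PCG_UNEVICTABLE,
    };

    #define TESTPCGFLAG(uname, lname)                                \
    static inline int PageCgroup##uname(struct page_cgroup *pc)     \
            { return test_bit(PCG_##lname, &pc->flags); }

    #define SETPCGFLAG(uname, lname)                                 \
    static inline void SetPageCgroup##uname(struct page_cgroup *pc) \
            { set_bit(PCG_##lname, &pc->flags); }

    TESTPCGFLAG(Cache, CACHE)
    TESTPCGFLAG(Used, USED)
    SETPCGFLAG(Active, ACTIVE)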
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
 }
 
 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
-					enum mem_cgroup_zstat_index idx)
+					enum lru_list idx)
 {
 	int nid, zid;
 	struct mem_cgroup_per_zone *mz;
@@ -262,85 +236,77 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
-{
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-
-static void lock_page_cgroup(struct page *page)
-{
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = LRU_BASE;
+
+	if (PageCgroupUnevictable(pc))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageCgroupActive(pc))
+			lru += LRU_ACTIVE;
+		if (PageCgroupFile(pc))
+			lru += LRU_FILE;
+	}
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
 	list_del(&pc->lru);
 }
 
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
-	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = LRU_BASE;
 
-	if (!to) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		list_add(&pc->lru, &mz->inactive_list);
-	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		list_add(&pc->lru, &mz->active_list);
+	if (PageCgroupUnevictable(pc))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageCgroupActive(pc))
+			lru += LRU_ACTIVE;
+		if (PageCgroupFile(pc))
+			lru += LRU_FILE;
 	}
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_add(&pc->lru, &mz->lists[lru]);
+
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 }
 
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+	int active = PageCgroupActive(pc);
+	int file = PageCgroupFile(pc);
+	int unevictable = PageCgroupUnevictable(pc);
+	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
+				(LRU_FILE * !!file + !!active);
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	if (lru == from)
+		return;
 
-	if (active) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->active_list);
+	MEM_CGROUP_ZSTAT(mz, from) -= 1;
+	/*
+	 * However this is done under mz->lru_lock, another flags, which
+	 * are not related to LRU, will be modified from out-of-lock.
+	 * We have to use atomic set/clear flags.
+	 */
+	if (is_unevictable_lru(lru)) {
+		ClearPageCgroupActive(pc);
+		SetPageCgroupUnevictable(pc);
 	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->inactive_list);
+		if (is_active_lru(lru))
+			SetPageCgroupActive(pc);
+		else
+			ClearPageCgroupActive(pc);
+		ClearPageCgroupUnevictable(pc);
 	}
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_move(&pc->lru, &mz->lists[lru]);
 }
 
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
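The `LRU_FILE * !!file + !!active` expression computing `from` above is plain index arithmetic over the lru_list sketch earlier: file adds two, active adds one, and unevictable overrides both. A standalone illustration (ordinary C, hypothetical values matching that sketch):

    #include <stdio.h>

    #define LRU_ACTIVE 1
    #define LRU_FILE   2

    int main(void)
    {
            /* 0: inactive_anon, 1: active_anon,
             * 2: inactive_file, 3: active_file */
            for (int file = 0; file <= 1; file++)
                    for (int active = 0; active <= 1; active++)
                            printf("file=%d active=%d -> lru %d\n",
                                   file, active,
                                   LRU_FILE * !!file + !!active);
            return 0;
    }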
@@ -356,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 /*
  * This routine assumes that the appropriate zone's lru lock is already held
  */
-void mem_cgroup_move_lists(struct page *page, bool active)
+void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup_per_zone *mz;
@@ -372,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = lookup_page_cgroup(page);
+	if (!trylock_page_cgroup(pc))
 		return;
-
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	if (pc && PageCgroupUsed(pc)) {
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, active);
+		__mem_cgroup_move_lists(pc, lru);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 }
 
 /*
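lock/trylock/unlock now operate on the page_cgroup itself instead of a bit in page->page_cgroup. A sketch of the presumed replacements in <linux/page_cgroup.h>, assuming pc->flags is an unsigned long carrying the PCG_LOCK bit:

    static inline void lock_page_cgroup(struct page_cgroup *pc)
    {
            bit_spin_lock(PCG_LOCK, &pc->flags);
    }

    static inline int trylock_page_cgroup(struct page_cgroup *pc)
    {
            return bit_spin_trylock(PCG_LOCK, &pc->flags);
    }

    static inline void unlock_page_cgroup(struct page_cgroup *pc)
    {
            bit_spin_unlock(PCG_LOCK, &pc->flags);
    }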
@@ -403,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 }
 
 /*
- * This function is called from vmscan.c. In page reclaiming loop. balance
- * between active and inactive list is calculated. For memory controller
- * page reclaiming, we should use using mem_cgroup's imbalance rather than
- * zone's global lru imbalance.
- */
-long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
-{
-	unsigned long active, inactive;
-	/* active and inactive are the number of pages. 'long' is ok.*/
-	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
-	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (long) (active / (inactive + 1));
-}
-
-/*
  * prev_priority control...this will be used in memory reclaim path.
  */
 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -444,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
  * (see include/linux/mmzone.h)
  */
 
-long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-				struct zone *zone, int priority)
+long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru)
 {
-	long nr_active;
+	long nr_pages;
 	int nid = zone->zone_pgdat->node_id;
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 
-	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
-	return (nr_active >> priority);
-}
+	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 
-long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-				struct zone *zone, int priority)
-{
-	long nr_inactive;
-	int nid = zone->zone_pgdat->node_id;
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
-
-	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (nr_inactive >> priority);
+	return (nr_pages >> priority);
 }
 
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
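mem_cgroup_calc_reclaim() folds the old active/inactive pair into one helper: a list's scan target is just its page count shifted down by the reclaim priority. A worked example of the shift (plain C, illustrative numbers):

    #include <stdio.h>

    int main(void)
    {
            long nr_pages = 8192;   /* pages on one per-zone LRU list */

            printf("priority 2:  scan %ld pages\n", nr_pages >> 2);
            printf("priority 12: scan %ld pages\n", nr_pages >> 12);
            return 0;
    }
    /* prints 2048 and 2 */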
@@ -473,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -484,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int nid = z->zone_pgdat->node_id;
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
+	int lru = LRU_FILE * !!file + !!active;
 
 	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
-	if (active)
-		src = &mz->active_list;
-	else
-		src = &mz->inactive_list;
-
+	src = &mz->lists[lru];
 
 	spin_lock(&mz->lru_lock);
 	scan = 0;
 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 		if (scan >= nr_to_scan)
 			break;
+		if (unlikely(!PageCgroupUsed(pc)))
+			continue;
 		page = pc->page;
 
 		if (unlikely(!PageLRU(page)))
 			continue;
 
-		if (PageActive(page) && !active) {
-			__mem_cgroup_move_lists(pc, true);
-			continue;
-		}
-		if (!PageActive(page) && active) {
-			__mem_cgroup_move_lists(pc, false);
+		/*
+		 * TODO: play better with lumpy reclaim, grabbing anything.
+		 */
+		if (PageUnevictable(page) ||
+		    (PageActive(page) && !active) ||
+		    (!PageActive(page) && active)) {
+			__mem_cgroup_move_lists(pc, page_lru(page));
 			continue;
 		}
 
 		scan++;
 		list_move(&pc->lru, &pc_list);
 
-		if (__isolate_lru_page(page, mode) == 0) {
+		if (__isolate_lru_page(page, mode, file) == 0) {
 			list_move(&page->lru, dst);
 			nr_taken++;
 		}
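Pages found on the wrong list during isolation are now drifted back with __mem_cgroup_move_lists(pc, page_lru(page)), where page_lru() derives a page's correct list index from its own flags. A sketch of that helper under the index scheme above (the real one in this series may test PageSwapBacked() instead of calling page_is_file_cache() directly):

    static inline enum lru_list page_lru(struct page *page)
    {
            enum lru_list lru = LRU_BASE;

            if (PageUnevictable(page))
                    return LRU_UNEVICTABLE;
            if (PageActive(page))
                    lru += LRU_ACTIVE;
            if (page_is_file_cache(page))
                    lru += LRU_FILE;
            return lru;
    }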
@@ -540,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
-	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
 
-	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
-	if (unlikely(pc == NULL))
-		goto err;
-
+	pc = lookup_page_cgroup(page);
+	/* can happen at boot */
+	if (unlikely(!pc))
+		return 0;
+	prefetchw(pc);
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
+
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 		if (unlikely(!mem)) {
 			rcu_read_unlock();
-			kmem_cache_free(page_cgroup_cache, pc);
 			return 0;
 		}
 		/*
@@ -572,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		css_get(&memcg->css);
 	}
 
-	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
 		if (!(gfp_mask & __GFP_WAIT))
 			goto out;
 
@@ -595,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		}
 	}
 
-	pc->mem_cgroup = mem;
-	pc->page = page;
-	/*
-	 * If a page is accounted as a page cache, insert to inactive list.
-	 * If anon, insert to active list.
-	 */
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
-		pc->flags = PAGE_CGROUP_FLAG_CACHE;
-	else
-		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
-		kmem_cache_free(page_cgroup_cache, pc);
+
 		goto done;
 	}
-	page_assign_page_cgroup(page, pc);
+	pc->mem_cgroup = mem;
+	/*
+	 * If a page is accounted as a page cache, insert to inactive list.
+	 * If anon, insert to active list.
+	 */
+	pc->flags = pcg_default_flags[ctype];
 
 	mz = page_cgroup_zoneinfo(pc);
+
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	unlock_page_cgroup(page);
 done:
 	return 0;
 out:
 	css_put(&mem->css);
-	kmem_cache_free(page_cgroup_cache, pc);
-err:
 	return -ENOMEM;
 }
 
@@ -635,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * If already mapped, we don't have to account.
 	 * If page cache, page->mapping has address_space.
@@ -656,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * Corner case handling. This is called from add_to_page_cache()
 	 * in usual. But some FS (shmem) precharges this page before calling it
@@ -669,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
-		if (pc) {
-			VM_BUG_ON(pc->page != page);
-			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+
+		pc = lookup_page_cgroup(page);
+		if (!pc)
+			return 0;
+		lock_page_cgroup(pc);
+		if (PageCgroupUsed(pc)) {
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
+		unlock_page_cgroup(pc);
 	}
 
 	if (unlikely(!mm))
 		mm = &init_mm;
 
-	return mem_cgroup_charge_common(page, mm, gfp_mask,
+	if (page_is_file_cache(page))
+		return mem_cgroup_charge_common(page, mm, gfp_mask,
 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+	else
+		return mem_cgroup_charge_common(page, mm, gfp_mask,
+				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
 /*
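The charge path now separates true file cache from shmem: shmem is swap-backed, so it is charged as SHMEM and ends up on the anon LRUs. A sketch of the page_is_file_cache() predicate assumed here (modeled on include/linux/mm_inline.h in this series):

    static inline int page_is_file_cache(struct page *page)
    {
            if (PageSwapBacked(page))
                    return 0;

            /* page cache backed by a regular filesystem */
            return 1;
    }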
@@ -704,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (unlikely(!pc))
-		goto unlock;
-
-	VM_BUG_ON(pc->page != page);
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc || !PageCgroupUsed(pc)))
+		return;
 
-	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
-	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
-		|| page_mapped(page)))
-		goto unlock;
+	lock_page_cgroup(pc);
+	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
+	     || !PageCgroupUsed(pc)) {
+		/* This happens at race in zap_pte_range() and do_swap_page()*/
+		unlock_page_cgroup(pc);
+		return;
+	}
+	ClearPageCgroupUsed(pc);
+	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-
-	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	kmem_cache_free(page_cgroup_cache, pc);
 	return;
-unlock:
-	unlock_page_cgroup(page);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
 {
+	/* early check. */
+	if (page_mapped(page))
+		return;
+	if (page->mapping && !PageAnon(page))
+		return;
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
+	VM_BUG_ON(page->mapping);
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
@@ -758,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	pc = lookup_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
-		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
-			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		if (PageCgroupCache(pc)) {
+			if (page_is_file_cache(page))
+				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+			else
+				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+		}
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
@@ -791,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
 	 */
 	if (!newpage->mapping)
 		__mem_cgroup_uncharge_common(newpage,
-			MEM_CGROUP_CHARGE_TYPE_FORCE);
+				MEM_CGROUP_CHARGE_TYPE_FORCE);
 	else if (PageAnon(newpage))
 		mem_cgroup_uncharge_page(newpage);
 }
@@ -863,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 #define FORCE_UNCHARGE_BATCH	(128)
 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			    struct mem_cgroup_per_zone *mz,
-			    int active)
+			    enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct page *page;
@@ -871,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	unsigned long flags;
 	struct list_head *list;
 
-	if (active)
-		list = &mz->active_list;
-	else
-		list = &mz->inactive_list;
+	list = &mz->lists[lru];
 
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	while (!list_empty(list)) {
 		pc = list_entry(list->prev, struct page_cgroup, lru);
 		page = pc->page;
+		if (!PageCgroupUsed(pc))
+			break;
 		get_page(page);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 		/*
@@ -894,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			count = FORCE_UNCHARGE_BATCH;
 			cond_resched();
 		}
-	} else
-		cond_resched();
+	} else {
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		break;
+	}
 		spin_lock_irqsave(&mz->lru_lock, flags);
 	}
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -919,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 	while (mem->res.usage > 0) {
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
+		/* This is for making all *used* pages to be on LRU. */
+		lru_add_drain_all();
 		for_each_node_state(node, N_POSSIBLE)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
+				enum lru_list l;
 				mz = mem_cgroup_zoneinfo(mem, node, zid);
-				/* drop all page_cgroup in active_list */
-				mem_cgroup_force_empty_list(mem, mz, 1);
-				/* drop all page_cgroup in inactive_list */
-				mem_cgroup_force_empty_list(mem, mz, 0);
+				for_each_lru(l)
+					mem_cgroup_force_empty_list(mem, mz, l);
 			}
+		cond_resched();
 	}
 	ret = 0;
 out:
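force_empty now walks all five lists with for_each_lru(), after lru_add_drain_all() has flushed the per-CPU pagevecs so every used page is actually on an LRU. The iterator assumed here is the usual one-liner:

    /* Sketch, mirroring mmzone.h: visit each lru_list index in turn. */
    #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)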
@@ -1012,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	}
 	/* showing # of active pages */
 	{
-		unsigned long active, inactive;
-
-		inactive = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_INACTIVE);
-		active = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_ACTIVE);
-		cb->fill(cb, "active", (active) * PAGE_SIZE);
-		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
+		unsigned long active_anon, inactive_anon;
+		unsigned long active_file, inactive_file;
+		unsigned long unevictable;
+
+		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_ANON);
+		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_ANON);
+		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_FILE);
+		active_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_FILE);
+		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_UNEVICTABLE);
+
+		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
+		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
+		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
+		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
+		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
+
 	}
 	return 0;
 }
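With the split LRU, memory.stat reports five list sizes (in bytes) instead of the old active/inactive pair. Illustrative output only; the values below are made up, chosen as multiples of a 4 KiB page:

    active_anon 4096000
    inactive_anon 1228800
    active_file 8192000
    inactive_file 16384000
    unevictable 0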
@@ -1062,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
+	enum lru_list l;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -1082,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		INIT_LIST_HEAD(&mz->active_list);
-		INIT_LIST_HEAD(&mz->inactive_list);
 		spin_lock_init(&mz->lru_lock);
+		for_each_lru(l)
+			INIT_LIST_HEAD(&mz->lists[l]);
 	}
 	return 0;
 }
@@ -1124,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
+		page_cgroup_init();
 		mem = &init_mem_cgroup;
-		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)