author		Ingo Molnar <mingo@elte.hu>	2008-10-28 11:54:49 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:54:49 -0400
commit		d1a76187a5be4f89c6cb19d800cb5fb7aac735c5 (patch)
tree		2fac3ffbfffc7560eeef8364b541d0d7a0057920	/mm/memcontrol.c
parent		c7e78cff6b7518212247fb20b1dc6411540dc9af (diff)
parent		0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into core/locking
Conflicts:
	arch/um/include/asm/system.h
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	481
1 file changed, 231 insertions(+), 250 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f1f7a7374ba..866dcc7eeb0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,11 +32,12 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/uaccess.h>
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
 /*
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
-static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
+static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
 		enum mem_cgroup_stat_index idx, int val)
 {
-	int cpu = smp_processor_id();
-	stat->cpustat[cpu].count[idx] += val;
+	stat->count[idx] += val;
 }
 
 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 /*
  * per-zone information in memory controller.
  */
-
-enum mem_cgroup_zstat_index {
-	MEM_CGROUP_ZSTAT_ACTIVE,
-	MEM_CGROUP_ZSTAT_INACTIVE,
-
-	NR_MEM_CGROUP_ZSTAT,
-};
-
 struct mem_cgroup_per_zone {
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
 	spinlock_t		lru_lock;
-	struct list_head	active_list;
-	struct list_head	inactive_list;
-	unsigned long		count[NR_MEM_CGROUP_ZSTAT];
+	struct list_head	lists[NR_LRU_LISTS];
+	unsigned long		count[NR_LRU_LISTS];
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
 
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	int flags;
-};
-#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */
-
-static int page_cgroup_nid(struct page_cgroup *pc)
-{
-	return page_to_nid(pc->page);
-}
-
-static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
-{
-	return page_zonenum(pc->page);
-}
-
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
+	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	NR_CHARGE_TYPE,
+};
+
+/* only for here (for easy reading.) */
+#define PCGF_CACHE	(1UL << PCG_CACHE)
+#define PCGF_USED	(1UL << PCG_USED)
+#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
+#define PCGF_LOCK	(1UL << PCG_LOCK)
+#define PCGF_FILE	(1UL << PCG_FILE)
+static const unsigned long
+pcg_default_flags[NR_CHARGE_TYPE] = {
+	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
+	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
+	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+	0, /* FORCE */
 };
 
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
-					bool charge)
+static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+					 struct page_cgroup *pc,
+					 bool charge)
 {
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
+	struct mem_cgroup_stat_cpu *cpustat;
 
 	VM_BUG_ON(!irqs_disabled());
-	if (flags & PAGE_CGROUP_FLAG_CACHE)
-		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
+
+	cpustat = &stat->cpustat[smp_processor_id()];
+	if (PageCgroupCache(pc))
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 	else
-		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 
 	if (charge)
-		__mem_cgroup_stat_add_safe(stat,
+		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 	else
-		__mem_cgroup_stat_add_safe(stat,
+		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 }
 
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
 }
 
 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
-					enum mem_cgroup_zstat_index idx)
+					enum lru_list idx)
 {
 	int nid, zid;
 	struct mem_cgroup_per_zone *mz;
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
+	/*
+	 * mm_update_next_owner() may clear mm->owner to NULL
+	 * if it races with swapoff, page migration, etc.
+	 * So this can be called with p == NULL.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
-{
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-
-static void lock_page_cgroup(struct page *page)
-{
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = LRU_BASE;
+
+	if (PageCgroupUnevictable(pc))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageCgroupActive(pc))
+			lru += LRU_ACTIVE;
+		if (PageCgroupFile(pc))
+			lru += LRU_FILE;
+	}
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
 	list_del(&pc->lru);
 }
 
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 				struct page_cgroup *pc)
 {
-	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = LRU_BASE;
 
-	if (!to) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		list_add(&pc->lru, &mz->inactive_list);
-	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		list_add(&pc->lru, &mz->active_list);
+	if (PageCgroupUnevictable(pc))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageCgroupActive(pc))
+			lru += LRU_ACTIVE;
+		if (PageCgroupFile(pc))
+			lru += LRU_FILE;
 	}
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_add(&pc->lru, &mz->lists[lru]);
+
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 }
 
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+	int active = PageCgroupActive(pc);
+	int file = PageCgroupFile(pc);
+	int unevictable = PageCgroupUnevictable(pc);
+	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
+				(LRU_FILE * !!file + !!active);
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	if (lru == from)
+		return;
 
-	if (active) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->active_list);
+	MEM_CGROUP_ZSTAT(mz, from) -= 1;
+	/*
+	 * However this is done under mz->lru_lock, another flags, which
+	 * are not related to LRU, will be modified from out-of-lock.
+	 * We have to use atomic set/clear flags.
+	 */
+	if (is_unevictable_lru(lru)) {
+		ClearPageCgroupActive(pc);
+		SetPageCgroupUnevictable(pc);
 	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->inactive_list);
+		if (is_active_lru(lru))
+			SetPageCgroupActive(pc);
+		else
+			ClearPageCgroupActive(pc);
+		ClearPageCgroupUnevictable(pc);
 	}
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_move(&pc->lru, &mz->lists[lru]);
 }
 
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -348,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 /*
  * This routine assumes that the appropriate zone's lru lock is already held
  */
-void mem_cgroup_move_lists(struct page *page, bool active)
+void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup_per_zone *mz;
@@ -364,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = lookup_page_cgroup(page);
+	if (!trylock_page_cgroup(pc))
 		return;
-
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	if (pc && PageCgroupUsed(pc)) {
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, active);
+		__mem_cgroup_move_lists(pc, lru);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -395,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 }
 
 /*
- * This function is called from vmscan.c. In page reclaiming loop. balance
- * between active and inactive list is calculated. For memory controller
- * page reclaiming, we should use using mem_cgroup's imbalance rather than
- * zone's global lru imbalance.
- */
-long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
-{
-	unsigned long active, inactive;
-	/* active and inactive are the number of pages. 'long' is ok.*/
-	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
-	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (long) (active / (inactive + 1));
-}
-
-/*
  * prev_priority control...this will be used in memory reclaim path.
  */
 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -436,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
  * (see include/linux/mmzone.h)
  */
 
-long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
+long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru)
 {
-	long nr_active;
+	long nr_pages;
 	int nid = zone->zone_pgdat->node_id;
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 
-	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
-	return (nr_active >> priority);
-}
-
-long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
-{
-	long nr_inactive;
-	int nid = zone->zone_pgdat->node_id;
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 
-	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (nr_inactive >> priority);
+	return (nr_pages >> priority);
 }
 
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -465,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -476,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int nid = z->zone_pgdat->node_id;
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
+	int lru = LRU_FILE * !!file + !!active;
 
 	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
-	if (active)
-		src = &mz->active_list;
-	else
-		src = &mz->inactive_list;
-
+	src = &mz->lists[lru];
 
 	spin_lock(&mz->lru_lock);
 	scan = 0;
 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 		if (scan >= nr_to_scan)
 			break;
+		if (unlikely(!PageCgroupUsed(pc)))
+			continue;
 		page = pc->page;
 
 		if (unlikely(!PageLRU(page)))
 			continue;
 
-		if (PageActive(page) && !active) {
-			__mem_cgroup_move_lists(pc, true);
-			continue;
-		}
-		if (!PageActive(page) && active) {
-			__mem_cgroup_move_lists(pc, false);
+		/*
+		 * TODO: play better with lumpy reclaim, grabbing anything.
+		 */
+		if (PageUnevictable(page) ||
+		    (PageActive(page) && !active) ||
+		    (!PageActive(page) && active)) {
+			__mem_cgroup_move_lists(pc, page_lru(page));
 			continue;
 		}
 
 		scan++;
 		list_move(&pc->lru, &pc_list);
 
-		if (__isolate_lru_page(page, mode) == 0) {
+		if (__isolate_lru_page(page, mode, file) == 0) {
 			list_move(&page->lru, dst);
 			nr_taken++;
 		}
@@ -532,23 +479,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
-	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
 
-	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
-	if (unlikely(pc == NULL))
-		goto err;
-
+	pc = lookup_page_cgroup(page);
+	/* can happen at boot */
+	if (unlikely(!pc))
+		return 0;
+	prefetchw(pc);
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
+
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!mem)) {
+			rcu_read_unlock();
+			return 0;
+		}
 		/*
 		 * For every charge from the cgroup, increment reference count
 		 */
@@ -559,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		css_get(&memcg->css);
 	}
 
-	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
 		if (!(gfp_mask & __GFP_WAIT))
 			goto out;
 
@@ -582,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		}
 	}
 
-	pc->mem_cgroup = mem;
-	pc->page = page;
-	/*
-	 * If a page is accounted as a page cache, insert to inactive list.
-	 * If anon, insert to active list.
-	 */
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
-		pc->flags = PAGE_CGROUP_FLAG_CACHE;
-	else
-		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
-		kmem_cache_free(page_cgroup_cache, pc);
+
 		goto done;
 	}
-	page_assign_page_cgroup(page, pc);
+	pc->mem_cgroup = mem;
+	/*
+	 * If a page is accounted as a page cache, insert to inactive list.
+	 * If anon, insert to active list.
+	 */
+	pc->flags = pcg_default_flags[ctype];
 
 	mz = page_cgroup_zoneinfo(pc);
+
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	unlock_page_cgroup(page);
 done:
 	return 0;
 out:
 	css_put(&mem->css);
-	kmem_cache_free(page_cgroup_cache, pc);
-err:
 	return -ENOMEM;
 }
 
@@ -622,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * If already mapped, we don't have to account.
 	 * If page cache, page->mapping has address_space.
@@ -643,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * Corner case handling. This is called from add_to_page_cache()
 	 * in usual. But some FS (shmem) precharges this page before calling it
@@ -656,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
-		if (pc) {
-			VM_BUG_ON(pc->page != page);
-			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+
+		pc = lookup_page_cgroup(page);
+		if (!pc)
+			return 0;
+		lock_page_cgroup(pc);
+		if (PageCgroupUsed(pc)) {
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
+		unlock_page_cgroup(pc);
 	}
 
 	if (unlikely(!mm))
 		mm = &init_mm;
 
-	return mem_cgroup_charge_common(page, mm, gfp_mask,
+	if (page_is_file_cache(page))
+		return mem_cgroup_charge_common(page, mm, gfp_mask,
 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+	else
+		return mem_cgroup_charge_common(page, mm, gfp_mask,
+				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
 /*
@@ -691,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (unlikely(!pc))
-		goto unlock;
-
-	VM_BUG_ON(pc->page != page);
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc || !PageCgroupUsed(pc)))
+		return;
 
-	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
-	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
-		|| page_mapped(page)))
-		goto unlock;
+	lock_page_cgroup(pc);
+	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
+	    || !PageCgroupUsed(pc)) {
+		/* This happens at race in zap_pte_range() and do_swap_page()*/
+		unlock_page_cgroup(pc);
+		return;
+	}
+	ClearPageCgroupUsed(pc);
+	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-
-	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	kmem_cache_free(page_cgroup_cache, pc);
 	return;
-unlock:
-	unlock_page_cgroup(page);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
 {
+	/* early check. */
+	if (page_mapped(page))
+		return;
+	if (page->mapping && !PageAnon(page))
+		return;
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
+	VM_BUG_ON(page->mapping);
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
@@ -745,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	pc = lookup_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
-		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
-			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		if (PageCgroupCache(pc)) {
+			if (page_is_file_cache(page))
+				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+			else
+				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+		}
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 						ctype, mem);
@@ -778,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
 	 */
 	if (!newpage->mapping)
 		__mem_cgroup_uncharge_common(newpage,
-				MEM_CGROUP_CHARGE_TYPE_FORCE);
+			MEM_CGROUP_CHARGE_TYPE_FORCE);
 	else if (PageAnon(newpage))
 		mem_cgroup_uncharge_page(newpage);
 }
@@ -801,11 +761,16 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	rcu_read_lock();
 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!mem)) {
+		rcu_read_unlock();
+		return 0;
+	}
 	css_get(&mem->css);
 	rcu_read_unlock();
 
 	do {
 		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
 	css_put(&mem->css);
@@ -845,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 #define FORCE_UNCHARGE_BATCH	(128)
 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			    struct mem_cgroup_per_zone *mz,
-			    int active)
+			    enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct page *page;
@@ -853,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	unsigned long flags;
 	struct list_head *list;
 
-	if (active)
-		list = &mz->active_list;
-	else
-		list = &mz->inactive_list;
+	list = &mz->lists[lru];
 
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	while (!list_empty(list)) {
 		pc = list_entry(list->prev, struct page_cgroup, lru);
 		page = pc->page;
+		if (!PageCgroupUsed(pc))
+			break;
 		get_page(page);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 		/*
@@ -876,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 				count = FORCE_UNCHARGE_BATCH;
 				cond_resched();
 			}
-		} else
-			cond_resched();
+		} else {
+			spin_lock_irqsave(&mz->lru_lock, flags);
+			break;
+		}
 		spin_lock_irqsave(&mz->lru_lock, flags);
 	}
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -901,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 	while (mem->res.usage > 0) {
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
+		/* This is for making all *used* pages to be on LRU. */
+		lru_add_drain_all();
 		for_each_node_state(node, N_POSSIBLE)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
+				enum lru_list l;
 				mz = mem_cgroup_zoneinfo(mem, node, zid);
-				/* drop all page_cgroup in active_list */
-				mem_cgroup_force_empty_list(mem, mz, 1);
-				/* drop all page_cgroup in inactive_list */
-				mem_cgroup_force_empty_list(mem, mz, 0);
+				for_each_lru(l)
+					mem_cgroup_force_empty_list(mem, mz, l);
 			}
+		cond_resched();
 	}
 	ret = 0;
 out:
@@ -994,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	}
 	/* showing # of active pages */
 	{
-		unsigned long active, inactive;
-
-		inactive = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_INACTIVE);
-		active = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_ACTIVE);
-		cb->fill(cb, "active", (active) * PAGE_SIZE);
-		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
+		unsigned long active_anon, inactive_anon;
+		unsigned long active_file, inactive_file;
+		unsigned long unevictable;
+
+		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_ANON);
+		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_ANON);
+		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_FILE);
+		active_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_FILE);
+		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_UNEVICTABLE);
+
+		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
+		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
+		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
+		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
+		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
+
 	}
 	return 0;
 }
@@ -1044,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
+	enum lru_list l;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -1064,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		INIT_LIST_HEAD(&mz->active_list);
-		INIT_LIST_HEAD(&mz->inactive_list);
 		spin_lock_init(&mz->lru_lock);
+		for_each_lru(l)
+			INIT_LIST_HEAD(&mz->lists[l]);
 	}
 	return 0;
 }
@@ -1107,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
-		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)