author		Balbir Singh <balbir@linux.vnet.ibm.com>	2009-09-23 18:56:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-24 10:20:58 -0400
commit		4b3bde4c983de36c59e6c1a24701f6fe816f9f55 (patch)
tree		e759c5ceccf57331b868b7feac61cae5e932c6d4
parent		be367d09927023d081f9199665c8500f69f14d22 (diff)
memcg: remove the overhead associated with the root cgroup
Change the memory cgroup to remove the overhead associated with accounting
all pages in the root cgroup.  As a side-effect, we can no longer set a
memory hard limit in the root cgroup.

A new flag to track whether the page has been accounted or not has been
added as well.  Flags are now set atomically for page_cgroup,
pcg_default_flags is now obsolete and removed.

[akpm@linux-foundation.org: fix a few documentation glitches]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
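As background for the flag changes in this patch: pcg_default_flags was a precomputed table written into pc->flags with a plain assignment, and it is replaced below by per-bit setters plus a new PCG_ACCT_LRU bit that records whether a page has been accounted on a per-memcg LRU. The following is a minimal user-space sketch of that test-and-clear flag pattern, not kernel code; the names page_cgroup_sketch, set_flag and test_and_clear_flag are invented for illustration, and C11 atomics stand in for the kernel's set_bit()/test_and_clear_bit() on pc->flags.

/*
 * Illustrative sketch only: per-object flag bits manipulated atomically,
 * with a test-and-clear helper in the spirit of TESTCLEARPCGFLAG().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { PCG_USED, PCG_ACCT_LRU };	/* bit positions, mirroring the enum in the patch */

struct page_cgroup_sketch {
	atomic_ulong flags;
};

static void set_flag(struct page_cgroup_sketch *pc, int bit)
{
	atomic_fetch_or(&pc->flags, 1UL << bit);
}

static bool test_and_clear_flag(struct page_cgroup_sketch *pc, int bit)
{
	unsigned long mask = 1UL << bit;

	/* returns true only for the caller that actually cleared the bit */
	return atomic_fetch_and(&pc->flags, ~mask) & mask;
}

int main(void)
{
	struct page_cgroup_sketch pc;

	atomic_init(&pc.flags, 0);
	set_flag(&pc, PCG_ACCT_LRU);	/* add-to-LRU path marks the page */
	/* del-from-LRU path: only the first caller observes the bit set */
	printf("first del:  %d\n", test_and_clear_flag(&pc, PCG_ACCT_LRU));	/* 1 */
	printf("second del: %d\n", test_and_clear_flag(&pc, PCG_ACCT_LRU));	/* 0 */
	return 0;
}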
-rw-r--r--	Documentation/cgroups/memory.txt	4
-rw-r--r--	include/linux/page_cgroup.h	13
-rw-r--r--	mm/memcontrol.c	54
3 files changed, 57 insertions(+), 14 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 23d1262c0775..ab0a02172cf4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.
 
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
 2. Locking
 
 The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
 
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index ada779f24178..4b938d4f3ac2 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -38,6 +38,7 @@ enum {
 	PCG_LOCK,  /* page cgroup is locked */
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
+	PCG_ACCT_LRU, /* page has been accounted for */
 };
 
 #define TESTPCGFLAG(uname, lname)	\
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ clear_bit(PCG_##lname, &pc->flags); }
 
+#define TESTCLEARPCGFLAG(uname, lname)	\
+static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
+CLEARPCGFLAG(Cache, CACHE)
+SETPCGFLAG(Cache, CACHE)
 
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
+SETPCGFLAG(Used, USED)
+
+SETPCGFLAG(AcctLRU, ACCT_LRU)
+CLEARPCGFLAG(AcctLRU, ACCT_LRU)
+TESTPCGFLAG(AcctLRU, ACCT_LRU)
+TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf2e717f5c12..b0757660663f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -43,6 +43,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
+struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -200,13 +201,8 @@ enum charge_type {
 #define PCGF_CACHE	(1UL << PCG_CACHE)
 #define PCGF_USED	(1UL << PCG_USED)
 #define PCGF_LOCK	(1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-	PCGF_USED | PCGF_LOCK, /* Anon */
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-	0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT	(1UL << PCG_ACCT)
 
 /* for encoding cft->private value on file */
 #define _MEM			(0)
@@ -354,6 +350,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 	return ret;
 }
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+	return (mem == root_mem_cgroup);
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +372,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
 	/* can happen while we handle swapcache. */
-	if (list_empty(&pc->lru) || !pc->mem_cgroup)
+	if (!TestClearPageCgroupAcctLRU(pc))
 		return;
+	VM_BUG_ON(!pc->mem_cgroup);
 	/*
 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	mem = pc->mem_cgroup;
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
+	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
 	return;
 }
@@ -410,8 +413,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
 	 */
 	smp_rmb();
-	/* unused page is not rotated. */
-	if (!PageCgroupUsed(pc))
+	/* unused or root page is not rotated. */
+	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +428,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
+	VM_BUG_ON(PageCgroupAcctLRU(pc));
 	/*
 	 * Used bit is set without atomic ops but after smp_wmb().
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +439,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 
 	mz = page_cgroup_zoneinfo(pc);
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	SetPageCgroupAcctLRU(pc);
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
 	list_add(&pc->lru, &mz->lists[lru]);
 }
 
@@ -469,7 +476,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	/* link when the page is linked to LRU but page_cgroup isn't */
-	if (PageLRU(page) && list_empty(&pc->lru))
+	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
 		mem_cgroup_add_lru_list(page, page_lru(page));
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
@@ -1125,9 +1132,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		css_put(&mem->css);
 		return;
 	}
+
 	pc->mem_cgroup = mem;
 	smp_wmb();
-	pc->flags = pcg_default_flags[ctype];
+	switch (ctype) {
+	case MEM_CGROUP_CHARGE_TYPE_CACHE:
+	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+		SetPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+		ClearPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	default:
+		break;
+	}
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
@@ -2083,6 +2103,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	name = MEMFILE_ATTR(cft->private);
 	switch (name) {
 	case RES_LIMIT:
+		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
@@ -2549,6 +2573,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
+		root_mem_cgroup = mem;
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +2602,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
+	root_mem_cgroup = NULL;
 	return ERR_PTR(error);
 }
 
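For readers skimming the LRU hunks above, the net behaviour of mem_cgroup_add_lru_list()/mem_cgroup_del_lru_list() after this patch is: per-zone statistics are still updated for every page, but pages charged to the root cgroup are never linked onto (or unlinked from) a per-memcg LRU list. Below is a hedged, stand-alone sketch of that control flow in plain C; struct memcg_sketch, struct page_desc and their fields are simplified stand-ins invented for illustration, not the kernel's data structures.

/* Simplified stand-ins; not the kernel's types or API. */
#include <stdbool.h>
#include <stdio.h>

struct memcg_sketch { bool is_root; long nr_lru_pages; };

struct page_desc {
	struct memcg_sketch *memcg;
	bool acct_lru;		/* mirrors PCG_ACCT_LRU */
	bool on_memcg_list;	/* mirrors list_add()/list_del_init() */
};

/* mem_cgroup_add_lru_list(), boiled down */
static void add_lru(struct page_desc *p)
{
	p->memcg->nr_lru_pages++;	/* MEM_CGROUP_ZSTAT(mz, lru) += 1 */
	p->acct_lru = true;		/* SetPageCgroupAcctLRU(pc) */
	if (p->memcg->is_root)
		return;			/* root pages skip the per-memcg list */
	p->on_memcg_list = true;	/* list_add(&pc->lru, ...) */
}

/* mem_cgroup_del_lru_list(), boiled down */
static void del_lru(struct page_desc *p)
{
	if (!p->acct_lru)		/* TestClearPageCgroupAcctLRU(pc) */
		return;
	p->acct_lru = false;
	p->memcg->nr_lru_pages--;	/* MEM_CGROUP_ZSTAT(mz, lru) -= 1 */
	if (p->memcg->is_root)
		return;
	p->on_memcg_list = false;	/* list_del_init(&pc->lru) */
}

int main(void)
{
	struct memcg_sketch root = { .is_root = true };
	struct page_desc p = { .memcg = &root };

	add_lru(&p);
	printf("stat=%ld listed=%d\n", root.nr_lru_pages, p.on_memcg_list);	/* 1 0 */
	del_lru(&p);
	printf("stat=%ld listed=%d\n", root.nr_lru_pages, p.on_memcg_list);	/* 0 0 */
	return 0;
}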