diff options
author | Balbir Singh <balbir@linux.vnet.ibm.com> | 2009-09-23 18:56:32 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-24 10:20:58 -0400 |
commit | 4b3bde4c983de36c59e6c1a24701f6fe816f9f55 (patch) | |
tree | e759c5ceccf57331b868b7feac61cae5e932c6d4 | |
parent | be367d09927023d081f9199665c8500f69f14d22 (diff) |
memcg: remove the overhead associated with the root cgroup
Change the memory cgroup to remove the overhead associated with accounting
all pages in the root cgroup. As a side-effect, we can no longer set a
memory hard limit in the root cgroup.
A new flag, PCG_ACCT_LRU, has also been added to track whether the page has
been accounted or not. Flags are now set atomically for page_cgroup;
pcg_default_flags is now obsolete and has been removed.
[akpm@linux-foundation.org: fix a few documentation glitches]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/cgroups/memory.txt | 4 | ||||
-rw-r--r-- | include/linux/page_cgroup.h | 13 | ||||
-rw-r--r-- | mm/memcontrol.c | 54 |
3 files changed, 57 insertions, 14 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 23d1262c0775..ab0a02172cf4 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that | |||
179 | pages that are selected for reclaiming come from the per cgroup LRU | 179 | pages that are selected for reclaiming come from the per cgroup LRU |
180 | list. | 180 | list. |
181 | 181 | ||
182 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any | ||
183 | limits on the root cgroup. | ||
184 | |||
182 | 2. Locking | 185 | 2. Locking |
183 | 186 | ||
184 | The memory controller uses the following hierarchy | 187 | The memory controller uses the following hierarchy |
@@ -210,6 +213,7 @@ We can alter the memory limit: | |||
210 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | 213 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, |
211 | mega or gigabytes. | 214 | mega or gigabytes. |
212 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). | 215 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). |
216 | NOTE: We cannot set limits on the root cgroup any more. | ||
213 | 217 | ||
214 | # cat /cgroups/0/memory.limit_in_bytes | 218 | # cat /cgroups/0/memory.limit_in_bytes |
215 | 4194304 | 219 | 4194304 |
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index ada779f24178..4b938d4f3ac2 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h | |||
@@ -38,6 +38,7 @@ enum { | |||
38 | PCG_LOCK, /* page cgroup is locked */ | 38 | PCG_LOCK, /* page cgroup is locked */ |
39 | PCG_CACHE, /* charged as cache */ | 39 | PCG_CACHE, /* charged as cache */ |
40 | PCG_USED, /* this object is in use. */ | 40 | PCG_USED, /* this object is in use. */ |
41 | PCG_ACCT_LRU, /* page has been accounted for */ | ||
41 | }; | 42 | }; |
42 | 43 | ||
43 | #define TESTPCGFLAG(uname, lname) \ | 44 | #define TESTPCGFLAG(uname, lname) \ |
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\ | |||
52 | static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ | 53 | static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ |
53 | { clear_bit(PCG_##lname, &pc->flags); } | 54 | { clear_bit(PCG_##lname, &pc->flags); } |
54 | 55 | ||
56 | #define TESTCLEARPCGFLAG(uname, lname) \ | ||
57 | static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ | ||
58 | { return test_and_clear_bit(PCG_##lname, &pc->flags); } | ||
59 | |||
55 | /* Cache flag is set only once (at allocation) */ | 60 | /* Cache flag is set only once (at allocation) */ |
56 | TESTPCGFLAG(Cache, CACHE) | 61 | TESTPCGFLAG(Cache, CACHE) |
62 | CLEARPCGFLAG(Cache, CACHE) | ||
63 | SETPCGFLAG(Cache, CACHE) | ||
57 | 64 | ||
58 | TESTPCGFLAG(Used, USED) | 65 | TESTPCGFLAG(Used, USED) |
59 | CLEARPCGFLAG(Used, USED) | 66 | CLEARPCGFLAG(Used, USED) |
67 | SETPCGFLAG(Used, USED) | ||
68 | |||
69 | SETPCGFLAG(AcctLRU, ACCT_LRU) | ||
70 | CLEARPCGFLAG(AcctLRU, ACCT_LRU) | ||
71 | TESTPCGFLAG(AcctLRU, ACCT_LRU) | ||
72 | TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) | ||
60 | 73 | ||
61 | static inline int page_cgroup_nid(struct page_cgroup *pc) | 74 | static inline int page_cgroup_nid(struct page_cgroup *pc) |
62 | { | 75 | { |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf2e717f5c12..b0757660663f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -43,6 +43,7 @@ | |||
43 | 43 | ||
44 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 44 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
46 | struct mem_cgroup *root_mem_cgroup __read_mostly; | ||
46 | 47 | ||
47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 48 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 49 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
@@ -200,13 +201,8 @@ enum charge_type { | |||
200 | #define PCGF_CACHE (1UL << PCG_CACHE) | 201 | #define PCGF_CACHE (1UL << PCG_CACHE) |
201 | #define PCGF_USED (1UL << PCG_USED) | 202 | #define PCGF_USED (1UL << PCG_USED) |
202 | #define PCGF_LOCK (1UL << PCG_LOCK) | 203 | #define PCGF_LOCK (1UL << PCG_LOCK) |
203 | static const unsigned long | 204 | /* Not used, but added here for completeness */ |
204 | pcg_default_flags[NR_CHARGE_TYPE] = { | 205 | #define PCGF_ACCT (1UL << PCG_ACCT) |
205 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
206 | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
207 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
208 | 0, /* FORCE */ | ||
209 | }; | ||
210 | 206 | ||
211 | /* for encoding cft->private value on file */ | 207 | /* for encoding cft->private value on file */ |
212 | #define _MEM (0) | 208 | #define _MEM (0) |
@@ -354,6 +350,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
354 | return ret; | 350 | return ret; |
355 | } | 351 | } |
356 | 352 | ||
353 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | ||
354 | { | ||
355 | return (mem == root_mem_cgroup); | ||
356 | } | ||
357 | |||
357 | /* | 358 | /* |
358 | * Following LRU functions are allowed to be used without PCG_LOCK. | 359 | * Following LRU functions are allowed to be used without PCG_LOCK. |
359 | * Operations are called by routine of global LRU independently from memcg. | 360 | * Operations are called by routine of global LRU independently from memcg. |
@@ -371,22 +372,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
371 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 372 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
372 | { | 373 | { |
373 | struct page_cgroup *pc; | 374 | struct page_cgroup *pc; |
374 | struct mem_cgroup *mem; | ||
375 | struct mem_cgroup_per_zone *mz; | 375 | struct mem_cgroup_per_zone *mz; |
376 | 376 | ||
377 | if (mem_cgroup_disabled()) | 377 | if (mem_cgroup_disabled()) |
378 | return; | 378 | return; |
379 | pc = lookup_page_cgroup(page); | 379 | pc = lookup_page_cgroup(page); |
380 | /* can happen while we handle swapcache. */ | 380 | /* can happen while we handle swapcache. */ |
381 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | 381 | if (!TestClearPageCgroupAcctLRU(pc)) |
382 | return; | 382 | return; |
383 | VM_BUG_ON(!pc->mem_cgroup); | ||
383 | /* | 384 | /* |
384 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 385 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
385 | * removed from global LRU. | 386 | * removed from global LRU. |
386 | */ | 387 | */ |
387 | mz = page_cgroup_zoneinfo(pc); | 388 | mz = page_cgroup_zoneinfo(pc); |
388 | mem = pc->mem_cgroup; | ||
389 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 389 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
390 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
391 | return; | ||
392 | VM_BUG_ON(list_empty(&pc->lru)); | ||
390 | list_del_init(&pc->lru); | 393 | list_del_init(&pc->lru); |
391 | return; | 394 | return; |
392 | } | 395 | } |
@@ -410,8 +413,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
410 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 413 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
411 | */ | 414 | */ |
412 | smp_rmb(); | 415 | smp_rmb(); |
413 | /* unused page is not rotated. */ | 416 | /* unused or root page is not rotated. */ |
414 | if (!PageCgroupUsed(pc)) | 417 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) |
415 | return; | 418 | return; |
416 | mz = page_cgroup_zoneinfo(pc); | 419 | mz = page_cgroup_zoneinfo(pc); |
417 | list_move(&pc->lru, &mz->lists[lru]); | 420 | list_move(&pc->lru, &mz->lists[lru]); |
@@ -425,6 +428,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
425 | if (mem_cgroup_disabled()) | 428 | if (mem_cgroup_disabled()) |
426 | return; | 429 | return; |
427 | pc = lookup_page_cgroup(page); | 430 | pc = lookup_page_cgroup(page); |
431 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
428 | /* | 432 | /* |
429 | * Used bit is set without atomic ops but after smp_wmb(). | 433 | * Used bit is set without atomic ops but after smp_wmb(). |
430 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 434 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
@@ -435,6 +439,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
435 | 439 | ||
436 | mz = page_cgroup_zoneinfo(pc); | 440 | mz = page_cgroup_zoneinfo(pc); |
437 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 441 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
442 | SetPageCgroupAcctLRU(pc); | ||
443 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
444 | return; | ||
438 | list_add(&pc->lru, &mz->lists[lru]); | 445 | list_add(&pc->lru, &mz->lists[lru]); |
439 | } | 446 | } |
440 | 447 | ||
@@ -469,7 +476,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | |||
469 | 476 | ||
470 | spin_lock_irqsave(&zone->lru_lock, flags); | 477 | spin_lock_irqsave(&zone->lru_lock, flags); |
471 | /* link when the page is linked to LRU but page_cgroup isn't */ | 478 | /* link when the page is linked to LRU but page_cgroup isn't */ |
472 | if (PageLRU(page) && list_empty(&pc->lru)) | 479 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
473 | mem_cgroup_add_lru_list(page, page_lru(page)); | 480 | mem_cgroup_add_lru_list(page, page_lru(page)); |
474 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 481 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
475 | } | 482 | } |
@@ -1125,9 +1132,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1125 | css_put(&mem->css); | 1132 | css_put(&mem->css); |
1126 | return; | 1133 | return; |
1127 | } | 1134 | } |
1135 | |||
1128 | pc->mem_cgroup = mem; | 1136 | pc->mem_cgroup = mem; |
1129 | smp_wmb(); | 1137 | smp_wmb(); |
1130 | pc->flags = pcg_default_flags[ctype]; | 1138 | switch (ctype) { |
1139 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
1140 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
1141 | SetPageCgroupCache(pc); | ||
1142 | SetPageCgroupUsed(pc); | ||
1143 | break; | ||
1144 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
1145 | ClearPageCgroupCache(pc); | ||
1146 | SetPageCgroupUsed(pc); | ||
1147 | break; | ||
1148 | default: | ||
1149 | break; | ||
1150 | } | ||
1131 | 1151 | ||
1132 | mem_cgroup_charge_statistics(mem, pc, true); | 1152 | mem_cgroup_charge_statistics(mem, pc, true); |
1133 | 1153 | ||
@@ -2083,6 +2103,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2083 | name = MEMFILE_ATTR(cft->private); | 2103 | name = MEMFILE_ATTR(cft->private); |
2084 | switch (name) { | 2104 | switch (name) { |
2085 | case RES_LIMIT: | 2105 | case RES_LIMIT: |
2106 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | ||
2107 | ret = -EINVAL; | ||
2108 | break; | ||
2109 | } | ||
2086 | /* This function does all necessary parse...reuse it */ | 2110 | /* This function does all necessary parse...reuse it */ |
2087 | ret = res_counter_memparse_write_strategy(buffer, &val); | 2111 | ret = res_counter_memparse_write_strategy(buffer, &val); |
2088 | if (ret) | 2112 | if (ret) |
@@ -2549,6 +2573,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2549 | if (cont->parent == NULL) { | 2573 | if (cont->parent == NULL) { |
2550 | enable_swap_cgroup(); | 2574 | enable_swap_cgroup(); |
2551 | parent = NULL; | 2575 | parent = NULL; |
2576 | root_mem_cgroup = mem; | ||
2552 | } else { | 2577 | } else { |
2553 | parent = mem_cgroup_from_cont(cont->parent); | 2578 | parent = mem_cgroup_from_cont(cont->parent); |
2554 | mem->use_hierarchy = parent->use_hierarchy; | 2579 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -2577,6 +2602,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2577 | return &mem->css; | 2602 | return &mem->css; |
2578 | free_out: | 2603 | free_out: |
2579 | __mem_cgroup_free(mem); | 2604 | __mem_cgroup_free(mem); |
2605 | root_mem_cgroup = NULL; | ||
2580 | return ERR_PTR(error); | 2606 | return ERR_PTR(error); |
2581 | } | 2607 | } |
2582 | 2608 | ||