author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-01-07 21:07:53 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-08 11:31:04 -0500
commit	f817ed48535ac6510ebae7c4116f24a5f9268834
tree	5ce9cc72c1adc0427f7efeefb3bc51b9b392ea09
parent	0753b0ef3b301895234fed02bea2c099c7ff4feb
memcg: move all accounting to parent at rmdir()
This patch provides a function to move the accounting information of a page
between mem_cgroups, and rewrites force_empty to make use of it.

The moving of a page_cgroup is done while
 - the lru_lock of the source/destination mem_cgroup is held, and
 - lock_page_cgroup() is held.

Therefore, a routine that touches pc->mem_cgroup without lock_page_cgroup()
must confirm that pc->mem_cgroup is still valid. Typical code looks like the
following (while the page is not under lock_page()):

	mem = pc->mem_cgroup;
	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	if (pc->mem_cgroup == mem)
		... /* some list handling */
	spin_unlock_irqrestore(&mz->lru_lock, flags);

Of course, the better way is

	lock_page_cgroup(pc);
	...
	unlock_page_cgroup(pc);

but then you must check the lock nesting and avoid deadlock.

If you handle a page_cgroup from a mem_cgroup's LRU under mz->lru_lock, you
do not have to worry about what pc->mem_cgroup points to. Moved pages are
added to the tail of the LRU, not to the head (see the new "hot" parameter
of __mem_cgroup_add_list()).

Expected users of this routine are:
 - force_empty (rmdir)
 - moving tasks between cgroups (to move accounting information)
 - hierarchy support (maybe useful)

force_empty (rmdir) uses this move_account and moves pages to the cgroup's
parent. The move will not cause OOM (an "oom" parameter is added to
try_charge()). If the parent is busy (not enough memory), force_empty calls
try_to_free_mem_cgroup_pages() to reduce usage.

The purpose of this behavior is to
 - fix the "forget all" behavior of force_empty and avoid leaking
   accounting, and
 - keep pages in memory as long as possible by "moving first, freeing only
   if necessary".

Adding a switch to change the behavior of force_empty to
 - free first, move if necessary, or
 - free all; if there are mlocked/busy pages, return -EBUSY
is under consideration. (I'll add it if someone requests it.)

This patch also removes the memory.force_empty file, a brutal debug-only
interface.

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
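The locking contract of mem_cgroup_move_account() is easiest to see from the
caller's side. The sketch below mirrors the new mem_cgroup_move_parent() from
the diff, trimmed to the lock discipline only; the pre-charge of the parent
and the uncharge-on-failure are left out, so read it as an illustration of
the rules above (the function name is made up for this sketch), not as part
of the patch:

	/*
	 * Illustrative caller of mem_cgroup_move_account(): irqs must be
	 * disabled and the source group's per-zone lru_lock held, which
	 * spin_lock_irqsave() provides in one step.
	 */
	static int move_to_parent_sketch(struct page_cgroup *pc,
					 struct mem_cgroup *child,
					 struct mem_cgroup *parent)
	{
		struct mem_cgroup_per_zone *mz;
		unsigned long flags;
		int ret;

		mz = mem_cgroup_zoneinfo(child,
				page_cgroup_nid(pc), page_cgroup_zid(pc));

		spin_lock_irqsave(&mz->lru_lock, flags);
		ret = mem_cgroup_move_account(pc, child, parent);
		spin_unlock_irqrestore(&mz->lru_lock, flags);

		/* 0 on success, -EBUSY when pc is contended or unstable */
		return ret;
	}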
-rw-r--r--	Documentation/controllers/memory.txt	12
-rw-r--r--	mm/memcontrol.c	277
2 files changed, 214 insertions(+), 75 deletions(-)
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 1c07547d3f81..58f32c166fac 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -207,12 +207,6 @@ exceeded.
 The memory.stat file gives accounting information. Now, the number of
 caches, RSS and Active pages/Inactive pages are shown.
 
-The memory.force_empty gives an interface to drop *all* charges by force.
-
-# echo 1 > memory.force_empty
-
-will drop all charges in cgroup. Currently, this is maintained for test.
-
 4. Testing
 
 Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11].
@@ -242,8 +236,10 @@ reclaimed.
 
 A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
 cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. Such charges are automatically dropped at
-rmdir() if there are no tasks.
+tasks have migrated away from it.
+Such charges are moved to its parent as much as possible and freed if parent
+is full. Both of RSS and CACHES are moved to parent.
+If both of them are busy, rmdir() returns -EBUSY.
 
 5. TODO
 
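With the documentation change above, the user-visible contract of rmdir() on
a memory cgroup becomes: remaining charges are migrated to the parent where
possible, and -EBUSY is returned only when pages can be neither moved nor
freed. A minimal user-space check of that contract could look like this (the
cgroup path is hypothetical and depends on where the memory controller is
mounted):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* hypothetical mount point and group name */
		const char *group = "/cgroups/my_group";

		if (rmdir(group) == 0)
			puts("removed: remaining charges moved to the parent");
		else if (errno == EBUSY)
			puts("-EBUSY: pages could be neither moved nor freed");
		else
			fprintf(stderr, "rmdir: %s\n", strerror(errno));
		return 0;
	}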
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b71195e8198b..49234d93988a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -257,7 +257,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 }
 
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
-				struct page_cgroup *pc)
+				struct page_cgroup *pc, bool hot)
 {
 	int lru = LRU_BASE;
 
@@ -271,7 +271,10 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 	}
 
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
-	list_add(&pc->lru, &mz->lists[lru]);
+	if (hot)
+		list_add(&pc->lru, &mz->lists[lru]);
+	else
+		list_add_tail(&pc->lru, &mz->lists[lru]);
 
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 }
@@ -467,21 +470,12 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	return nr_taken;
 }
 
-
-/**
- * mem_cgroup_try_charge - get charge of PAGE_SIZE.
- * @mm: an mm_struct which is charged against. (when *memcg is NULL)
- * @gfp_mask: gfp_mask for reclaim.
- * @memcg: a pointer to memory cgroup which is charged against.
- *
- * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
- * memory cgroup from @mm is got and stored in *memcg.
- *
- * Returns 0 if success. -ENOMEM at failure.
- */
-
-int mem_cgroup_try_charge(struct mm_struct *mm,
-			gfp_t gfp_mask, struct mem_cgroup **memcg)
+/*
+ * Unlike exported interface, "oom" parameter is added. if oom==true,
+ * oom-killer can be invoked.
+ */
+static int __mem_cgroup_try_charge(struct mm_struct *mm,
+			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
 	struct mem_cgroup *mem;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -528,7 +522,8 @@ int mem_cgroup_try_charge(struct mm_struct *mm,
 			continue;
 
 		if (!nr_retries--) {
-			mem_cgroup_out_of_memory(mem, gfp_mask);
+			if (oom)
+				mem_cgroup_out_of_memory(mem, gfp_mask);
 			goto nomem;
 		}
 	}
@@ -538,6 +533,25 @@ nomem:
 	return -ENOMEM;
 }
 
+/**
+ * mem_cgroup_try_charge - get charge of PAGE_SIZE.
+ * @mm: an mm_struct which is charged against. (when *memcg is NULL)
+ * @gfp_mask: gfp_mask for reclaim.
+ * @memcg: a pointer to memory cgroup which is charged against.
+ *
+ * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
+ * memory cgroup from @mm is got and stored in *memcg.
+ *
+ * Returns 0 if success. -ENOMEM at failure.
+ * This call can invoke OOM-Killer.
+ */
+
+int mem_cgroup_try_charge(struct mm_struct *mm,
+			  gfp_t mask, struct mem_cgroup **memcg)
+{
+	return __mem_cgroup_try_charge(mm, mask, memcg, true);
+}
+
 /*
  * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
  * USED state. If already USED, uncharge and return.
@@ -571,11 +585,109 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	mz = page_cgroup_zoneinfo(pc);
 
 	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_add_list(mz, pc);
+	__mem_cgroup_add_list(mz, pc, true);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 	unlock_page_cgroup(pc);
 }
 
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @pc: page_cgroup of the page.
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * 1. disable irq.
+ * 2. lru_lock of old mem_cgroup(@from) should be held.
+ *
+ * returns 0 at success,
+ * returns -EBUSY when lock is busy or "pc" is unstable.
+ *
+ * This function does "uncharge" from old cgroup but doesn't do "charge" to
+ * new cgroup. It should be done by a caller.
+ */
+
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+	struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	struct mem_cgroup_per_zone *from_mz, *to_mz;
+	int nid, zid;
+	int ret = -EBUSY;
+
+	VM_BUG_ON(!irqs_disabled());
+	VM_BUG_ON(from == to);
+
+	nid = page_cgroup_nid(pc);
+	zid = page_cgroup_zid(pc);
+	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
+	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
+
+
+	if (!trylock_page_cgroup(pc))
+		return ret;
+
+	if (!PageCgroupUsed(pc))
+		goto out;
+
+	if (pc->mem_cgroup != from)
+		goto out;
+
+	if (spin_trylock(&to_mz->lru_lock)) {
+		__mem_cgroup_remove_list(from_mz, pc);
+		css_put(&from->css);
+		res_counter_uncharge(&from->res, PAGE_SIZE);
+		pc->mem_cgroup = to;
+		css_get(&to->css);
+		__mem_cgroup_add_list(to_mz, pc, false);
+		ret = 0;
+		spin_unlock(&to_mz->lru_lock);
+	}
+out:
+	unlock_page_cgroup(pc);
+	return ret;
+}
+
+/*
+ * move charges to its parent.
+ */
+
+static int mem_cgroup_move_parent(struct page_cgroup *pc,
+				  struct mem_cgroup *child,
+				  gfp_t gfp_mask)
+{
+	struct cgroup *cg = child->css.cgroup;
+	struct cgroup *pcg = cg->parent;
+	struct mem_cgroup *parent;
+	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
+	int ret;
+
+	/* Is ROOT ? */
+	if (!pcg)
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(pcg);
+
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	if (ret)
+		return ret;
+
+	mz = mem_cgroup_zoneinfo(child,
+			page_cgroup_nid(pc), page_cgroup_zid(pc));
+
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	ret = mem_cgroup_move_account(pc, child, parent);
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+	/* drop extra refcnt */
+	css_put(&parent->css);
+	/* uncharge if move fails */
+	if (ret)
+		res_counter_uncharge(&parent->res, PAGE_SIZE);
+
+	return ret;
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -597,7 +709,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	prefetchw(pc);
 
 	mem = memcg;
-	ret = mem_cgroup_try_charge(mm, gfp_mask, &mem);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 	if (ret)
 		return ret;
 
@@ -899,46 +1011,52 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
  */
-#define FORCE_UNCHARGE_BATCH	(128)
-static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			    struct mem_cgroup_per_zone *mz,
 			    enum lru_list lru)
 {
-	struct page_cgroup *pc;
-	struct page *page;
-	int count = FORCE_UNCHARGE_BATCH;
+	struct page_cgroup *pc, *busy;
 	unsigned long flags;
+	unsigned long loop;
 	struct list_head *list;
+	int ret = 0;
 
 	list = &mz->lists[lru];
 
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	while (!list_empty(list)) {
-		pc = list_entry(list->prev, struct page_cgroup, lru);
-		page = pc->page;
-		if (!PageCgroupUsed(pc))
+	loop = MEM_CGROUP_ZSTAT(mz, lru);
+	/* give some margin against EBUSY etc...*/
+	loop += 256;
+	busy = NULL;
+	while (loop--) {
+		ret = 0;
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		if (list_empty(list)) {
+			spin_unlock_irqrestore(&mz->lru_lock, flags);
 			break;
-		get_page(page);
+		}
+		pc = list_entry(list->prev, struct page_cgroup, lru);
+		if (busy == pc) {
+			list_move(&pc->lru, list);
+			busy = 0;
+			spin_unlock_irqrestore(&mz->lru_lock, flags);
+			continue;
+		}
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
-		/*
-		 * Check if this page is on LRU. !LRU page can be found
-		 * if it's under page migration.
-		 */
-		if (PageLRU(page)) {
-			__mem_cgroup_uncharge_common(page,
-					MEM_CGROUP_CHARGE_TYPE_FORCE);
-			put_page(page);
-			if (--count <= 0) {
-				count = FORCE_UNCHARGE_BATCH;
-				cond_resched();
-			}
-		} else {
-			spin_lock_irqsave(&mz->lru_lock, flags);
+
+		ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
+		if (ret == -ENOMEM)
 			break;
-		}
-		spin_lock_irqsave(&mz->lru_lock, flags);
+
+		if (ret == -EBUSY || ret == -EINVAL) {
+			/* found lock contention or "pc" is obsolete. */
+			busy = pc;
+			cond_resched();
+		} else
+			busy = NULL;
 	}
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	if (!ret && !list_empty(list))
+		return -EBUSY;
+	return ret;
 }
 
 /*
@@ -947,34 +1065,68 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 {
-	int ret = -EBUSY;
-	int node, zid;
+	int ret;
+	int node, zid, shrink;
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
 	css_get(&mem->css);
-	/*
-	 * page reclaim code (kswapd etc..) will move pages between
-	 * active_list <-> inactive_list while we don't take a lock.
-	 * So, we have to do loop here until all lists are empty.
-	 */
+
+	shrink = 0;
+move_account:
 	while (mem->res.usage > 0) {
+		ret = -EBUSY;
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
+
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
-		for_each_node_state(node, N_POSSIBLE)
-			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		ret = 0;
+		for_each_node_state(node, N_POSSIBLE) {
+			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
 				enum lru_list l;
 				mz = mem_cgroup_zoneinfo(mem, node, zid);
-				for_each_lru(l)
-					mem_cgroup_force_empty_list(mem, mz, l);
+				for_each_lru(l) {
+					ret = mem_cgroup_force_empty_list(mem,
+								mz, l);
+					if (ret)
+						break;
+				}
 			}
+			if (ret)
+				break;
+		}
+		/* it seems parent cgroup doesn't have enough mem */
+		if (ret == -ENOMEM)
+			goto try_to_free;
 		cond_resched();
 	}
 	ret = 0;
 out:
 	css_put(&mem->css);
 	return ret;
+
+try_to_free:
+	/* returns EBUSY if we come here twice. */
+	if (shrink) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/* try to free all pages in this cgroup */
+	shrink = 1;
+	while (nr_retries && mem->res.usage > 0) {
+		int progress;
+		progress = try_to_free_mem_cgroup_pages(mem,
+						GFP_HIGHUSER_MOVABLE);
+		if (!progress)
+			nr_retries--;
+
+	}
+	/* try move_account...there may be some *locked* pages. */
+	if (mem->res.usage)
+		goto move_account;
+	ret = 0;
+	goto out;
 }
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
@@ -1023,11 +1175,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 	return 0;
 }
 
-static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
-{
-	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
-}
-
 static const struct mem_cgroup_stat_desc {
 	const char *msg;
 	u64 unit;
@@ -1104,10 +1251,6 @@ static struct cftype mem_cgroup_files[] = {
 		.read_u64 = mem_cgroup_read,
 	},
 	{
-		.name = "force_empty",
-		.trigger = mem_force_empty_write,
-	},
-	{
 		.name = "stat",
 		.read_map = mem_control_stat_show,
 	},