author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-01-07 21:07:53 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-08 11:31:04 -0500
commit		f817ed48535ac6510ebae7c4116f24a5f9268834 (patch)
tree		5ce9cc72c1adc0427f7efeefb3bc51b9b392ea09 /mm
parent		0753b0ef3b301895234fed02bea2c099c7ff4feb (diff)
memcg: move all accounting to parent at rmdir()
This patch provides a function to move the accounting information of a page
between mem_cgroups, and rewrites force_empty to make use of it.

This moving of page_cgroup is done under
 - lru_lock of the source/destination mem_cgroup is held.
 - lock_page_cgroup() is held.

Then, a routine which touches pc->mem_cgroup without lock_page_cgroup() should
confirm whether pc->mem_cgroup is still valid or not. Typical code can be the
following.

(while page is not under lock_page())
	mem = pc->mem_cgroup;
	mz = page_cgroup_zoneinfo(pc)
	spin_lock_irqsave(&mz->lru_lock);
	if (pc->mem_cgroup == mem)
		...../* some list handling */
	spin_unlock_irqrestore(&mz->lru_lock);

Of course, a better way is
	lock_page_cgroup(pc);
	....
	unlock_page_cgroup(pc);

But you should confirm the nesting of locks and avoid deadlock.

If you handle page_cgroup from a mem_cgroup's LRU under mz->lru_lock, you
don't have to worry about what pc->mem_cgroup points to. Moved pages are
added to head of lru, not to tail.

Expected users of this routine are:
 - force_empty (rmdir)
 - moving tasks between cgroups (for moving account information.)
 - hierarchy (maybe useful.)

force_empty (rmdir) uses this move_account and moves pages to its parent.
This "move" will not cause OOM (I added an "oom" parameter to try_charge().)

If the parent is busy (not enough memory), force_empty calls try_to_free_page()
and reduces usage.

The purpose of this behavior is
 - Fix the "forget all" behavior of force_empty and avoid leaking accounting.
 - By "moving first, free if necessary", keep pages in memory as much as
   possible.

Adding a switch to change the behavior of force_empty to
 - free first, move if necessary
 - free all; if there are mlocked/busy pages, return -EBUSY.
is under consideration. (I'll add it if someone requests.)

This patch also removes the memory.force_empty file, a brutal debug-only
interface.

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
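As a minimal sketch of the locking rule described in the changelog above (the
helper name lru_touch_page_cgroup is hypothetical and only illustrates the
pattern; page_cgroup_zoneinfo(), mz->lru_lock and pc->mem_cgroup are the
objects the changelog refers to):

	/* Sketch only: revalidate pc->mem_cgroup after taking mz->lru_lock. */
	static void lru_touch_page_cgroup(struct page_cgroup *pc)
	{
		struct mem_cgroup *mem;
		struct mem_cgroup_per_zone *mz;
		unsigned long flags;

		mem = pc->mem_cgroup;		/* snapshot; may become stale */
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		if (pc->mem_cgroup == mem) {
			/* pc still belongs to "mem"; LRU list handling is safe here */
		}
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}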
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	277
1 file changed, 210 insertions(+), 67 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b71195e8198b..49234d93988a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -257,7 +257,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 }
 
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
-				struct page_cgroup *pc)
+				struct page_cgroup *pc, bool hot)
 {
 	int lru = LRU_BASE;
 
@@ -271,7 +271,10 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 	}
 
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
-	list_add(&pc->lru, &mz->lists[lru]);
+	if (hot)
+		list_add(&pc->lru, &mz->lists[lru]);
+	else
+		list_add_tail(&pc->lru, &mz->lists[lru]);
 
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 }
@@ -467,21 +470,12 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	return nr_taken;
 }
 
-
-/**
- * mem_cgroup_try_charge - get charge of PAGE_SIZE.
- * @mm: an mm_struct which is charged against. (when *memcg is NULL)
- * @gfp_mask: gfp_mask for reclaim.
- * @memcg: a pointer to memory cgroup which is charged against.
- *
- * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
- * memory cgroup from @mm is got and stored in *memcg.
- *
- * Returns 0 if success. -ENOMEM at failure.
+/*
+ * Unlike exported interface, "oom" parameter is added. if oom==true,
+ * oom-killer can be invoked.
  */
-
-int mem_cgroup_try_charge(struct mm_struct *mm,
-			gfp_t gfp_mask, struct mem_cgroup **memcg)
+static int __mem_cgroup_try_charge(struct mm_struct *mm,
+			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
 	struct mem_cgroup *mem;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -528,7 +522,8 @@ int mem_cgroup_try_charge(struct mm_struct *mm,
 			continue;
 
 		if (!nr_retries--) {
-			mem_cgroup_out_of_memory(mem, gfp_mask);
+			if (oom)
+				mem_cgroup_out_of_memory(mem, gfp_mask);
 			goto nomem;
 		}
 	}
@@ -538,6 +533,25 @@ nomem:
 	return -ENOMEM;
 }
 
+/**
+ * mem_cgroup_try_charge - get charge of PAGE_SIZE.
+ * @mm: an mm_struct which is charged against. (when *memcg is NULL)
+ * @gfp_mask: gfp_mask for reclaim.
+ * @memcg: a pointer to memory cgroup which is charged against.
+ *
+ * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
+ * memory cgroup from @mm is got and stored in *memcg.
+ *
+ * Returns 0 if success. -ENOMEM at failure.
+ * This call can invoke OOM-Killer.
+ */
+
+int mem_cgroup_try_charge(struct mm_struct *mm,
+			gfp_t mask, struct mem_cgroup **memcg)
+{
+	return __mem_cgroup_try_charge(mm, mask, memcg, true);
+}
+
 /*
  * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
  * USED state. If already USED, uncharge and return.
@@ -571,11 +585,109 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	mz = page_cgroup_zoneinfo(pc);
 
 	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_add_list(mz, pc);
+	__mem_cgroup_add_list(mz, pc, true);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 	unlock_page_cgroup(pc);
 }
 
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @pc:	page_cgroup of the page.
+ * @from: mem_cgroup which the page is moved from.
+ * @to:	mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * 1. disable irq.
+ * 2. lru_lock of old mem_cgroup(@from) should be held.
+ *
+ * returns 0 at success,
+ * returns -EBUSY when lock is busy or "pc" is unstable.
+ *
+ * This function does "uncharge" from old cgroup but doesn't do "charge" to
+ * new cgroup. It should be done by a caller.
+ */
+
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+	struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	struct mem_cgroup_per_zone *from_mz, *to_mz;
+	int nid, zid;
+	int ret = -EBUSY;
+
+	VM_BUG_ON(!irqs_disabled());
+	VM_BUG_ON(from == to);
+
+	nid = page_cgroup_nid(pc);
+	zid = page_cgroup_zid(pc);
+	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
+	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
+
+
+	if (!trylock_page_cgroup(pc))
+		return ret;
+
+	if (!PageCgroupUsed(pc))
+		goto out;
+
+	if (pc->mem_cgroup != from)
+		goto out;
+
+	if (spin_trylock(&to_mz->lru_lock)) {
+		__mem_cgroup_remove_list(from_mz, pc);
+		css_put(&from->css);
+		res_counter_uncharge(&from->res, PAGE_SIZE);
+		pc->mem_cgroup = to;
+		css_get(&to->css);
+		__mem_cgroup_add_list(to_mz, pc, false);
+		ret = 0;
+		spin_unlock(&to_mz->lru_lock);
+	}
+out:
+	unlock_page_cgroup(pc);
+	return ret;
+}
+
+/*
+ * move charges to its parent.
+ */
+
+static int mem_cgroup_move_parent(struct page_cgroup *pc,
+				  struct mem_cgroup *child,
+				  gfp_t gfp_mask)
+{
+	struct cgroup *cg = child->css.cgroup;
+	struct cgroup *pcg = cg->parent;
+	struct mem_cgroup *parent;
+	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
+	int ret;
+
+	/* Is ROOT ? */
+	if (!pcg)
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(pcg);
+
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	if (ret)
+		return ret;
+
+	mz = mem_cgroup_zoneinfo(child,
+			page_cgroup_nid(pc), page_cgroup_zid(pc));
+
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	ret = mem_cgroup_move_account(pc, child, parent);
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+	/* drop extra refcnt */
+	css_put(&parent->css);
+	/* uncharge if move fails */
+	if (ret)
+		res_counter_uncharge(&parent->res, PAGE_SIZE);
+
+	return ret;
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -597,7 +709,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	prefetchw(pc);
 
 	mem = memcg;
-	ret = mem_cgroup_try_charge(mm, gfp_mask, &mem);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 	if (ret)
 		return ret;
 
@@ -899,46 +1011,52 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
  */
-#define FORCE_UNCHARGE_BATCH	(128)
-static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			struct mem_cgroup_per_zone *mz,
 			enum lru_list lru)
 {
-	struct page_cgroup *pc;
-	struct page *page;
-	int count = FORCE_UNCHARGE_BATCH;
+	struct page_cgroup *pc, *busy;
 	unsigned long flags;
+	unsigned long loop;
 	struct list_head *list;
+	int ret = 0;
 
 	list = &mz->lists[lru];
 
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	while (!list_empty(list)) {
-		pc = list_entry(list->prev, struct page_cgroup, lru);
-		page = pc->page;
-		if (!PageCgroupUsed(pc))
+	loop = MEM_CGROUP_ZSTAT(mz, lru);
+	/* give some margin against EBUSY etc...*/
+	loop += 256;
+	busy = NULL;
+	while (loop--) {
+		ret = 0;
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		if (list_empty(list)) {
+			spin_unlock_irqrestore(&mz->lru_lock, flags);
 			break;
-		get_page(page);
+		}
+		pc = list_entry(list->prev, struct page_cgroup, lru);
+		if (busy == pc) {
+			list_move(&pc->lru, list);
+			busy = 0;
+			spin_unlock_irqrestore(&mz->lru_lock, flags);
+			continue;
+		}
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
-		/*
-		 * Check if this page is on LRU. !LRU page can be found
-		 * if it's under page migration.
-		 */
-		if (PageLRU(page)) {
-			__mem_cgroup_uncharge_common(page,
-					MEM_CGROUP_CHARGE_TYPE_FORCE);
-			put_page(page);
-			if (--count <= 0) {
-				count = FORCE_UNCHARGE_BATCH;
-				cond_resched();
-			}
-		} else {
-			spin_lock_irqsave(&mz->lru_lock, flags);
+
+		ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
+		if (ret == -ENOMEM)
 			break;
-		}
-		spin_lock_irqsave(&mz->lru_lock, flags);
+
+		if (ret == -EBUSY || ret == -EINVAL) {
+			/* found lock contention or "pc" is obsolete. */
+			busy = pc;
+			cond_resched();
+		} else
+			busy = NULL;
 	}
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	if (!ret && !list_empty(list))
+		return -EBUSY;
+	return ret;
 }
 
 /*
@@ -947,34 +1065,68 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 {
-	int ret = -EBUSY;
-	int node, zid;
+	int ret;
+	int node, zid, shrink;
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
 	css_get(&mem->css);
-	/*
-	 * page reclaim code (kswapd etc..) will move pages between
-	 * active_list <-> inactive_list while we don't take a lock.
-	 * So, we have to do loop here until all lists are empty.
-	 */
+
+	shrink = 0;
+move_account:
 	while (mem->res.usage > 0) {
+		ret = -EBUSY;
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
+
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
-		for_each_node_state(node, N_POSSIBLE)
-			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		ret = 0;
+		for_each_node_state(node, N_POSSIBLE) {
+			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
 				enum lru_list l;
 				mz = mem_cgroup_zoneinfo(mem, node, zid);
-				for_each_lru(l)
-					mem_cgroup_force_empty_list(mem, mz, l);
+				for_each_lru(l) {
+					ret = mem_cgroup_force_empty_list(mem,
+								mz, l);
+					if (ret)
+						break;
+				}
 			}
+			if (ret)
+				break;
+		}
+		/* it seems parent cgroup doesn't have enough mem */
+		if (ret == -ENOMEM)
+			goto try_to_free;
 		cond_resched();
 	}
 	ret = 0;
 out:
 	css_put(&mem->css);
 	return ret;
+
+try_to_free:
+	/* returns EBUSY if we come here twice. */
+	if (shrink) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/* try to free all pages in this cgroup */
+	shrink = 1;
+	while (nr_retries && mem->res.usage > 0) {
+		int progress;
+		progress = try_to_free_mem_cgroup_pages(mem,
+						GFP_HIGHUSER_MOVABLE);
+		if (!progress)
+			nr_retries--;
+
+	}
+	/* try move_account...there may be some *locked* pages. */
+	if (mem->res.usage)
+		goto move_account;
+	ret = 0;
+	goto out;
 }
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
@@ -1023,11 +1175,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 	return 0;
 }
 
-static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
-{
-	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
-}
-
 static const struct mem_cgroup_stat_desc {
 	const char *msg;
 	u64 unit;
@@ -1104,10 +1251,6 @@ static struct cftype mem_cgroup_files[] = {
 		.read_u64 = mem_cgroup_read,
 	},
 	{
-		.name = "force_empty",
-		.trigger = mem_force_empty_write,
-	},
-	{
 		.name = "stat",
 		.read_map = mem_control_stat_show,
 	},