diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-01-07 21:07:53 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-08 11:31:04 -0500 |
commit | f817ed48535ac6510ebae7c4116f24a5f9268834 (patch) | |
tree | 5ce9cc72c1adc0427f7efeefb3bc51b9b392ea09 /mm | |
parent | 0753b0ef3b301895234fed02bea2c099c7ff4feb (diff) |
memcg: move all accounting to parent at rmdir()
This patch provides a function to move account information of a page
between mem_cgroups and rewrites force_empty to make use of this.
This moving of page_cgroup is done while both of the following hold:
- lru_lock of source/destination mem_cgroup is held.
- lock_page_cgroup() is held.
Therefore, a routine which touches pc->mem_cgroup without holding
lock_page_cgroup() must confirm that pc->mem_cgroup is still valid.
Typical code looks like the following.
(while page is not under lock_page())
mem = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(pc)
spin_lock_irqsave(&mz->lru_lock);
if (pc->mem_cgroup == mem)
...../* some list handling */
spin_unlock_irqrestore(&mz->lru_lock);
Of course, a better way is
lock_page_cgroup(pc);
....
unlock_page_cgroup(pc);
But you must check the lock nesting order to avoid deadlock.
If you handle a page_cgroup taken from a mem_cgroup's LRU under mz->lru_lock,
you don't have to worry about what pc->mem_cgroup points to.
moved pages are added to head of lru, not to tail.
Expected users of this routine are:
- force_empty (rmdir)
- moving tasks between cgroup (for moving account information.)
- hierarchy (maybe useful.)
force_empty (rmdir) uses this move_account and moves pages to its parent.
This "move" will not cause OOM (I added "oom" parameter to try_charge().)
If the parent is busy (not enough memory), force_empty calls
try_to_free_mem_cgroup_pages() and reduces usage.
Purpose of this behavior is
- Fix "forget all" behavior of force_empty and avoid leak of accounting.
- By "moving first, free if necessary", keep pages in memory as much as
possible.
Adding a switch to change behavior of force_empty to
- free first, move if necessary
- free all, if there are mlocked/busy pages, return -EBUSY.
is under consideration. (I'll add it if someone requests it.)
This patch also removes memory.force_empty file, a brutal debug-only interface.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memcontrol.c | 277 |
1 files changed, 210 insertions, 67 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b71195e8198b..49234d93988a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -257,7 +257,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | |||
257 | } | 257 | } |
258 | 258 | ||
259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
260 | struct page_cgroup *pc) | 260 | struct page_cgroup *pc, bool hot) |
261 | { | 261 | { |
262 | int lru = LRU_BASE; | 262 | int lru = LRU_BASE; |
263 | 263 | ||
@@ -271,7 +271,10 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | |||
271 | } | 271 | } |
272 | 272 | ||
273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
274 | list_add(&pc->lru, &mz->lists[lru]); | 274 | if (hot) |
275 | list_add(&pc->lru, &mz->lists[lru]); | ||
276 | else | ||
277 | list_add_tail(&pc->lru, &mz->lists[lru]); | ||
275 | 278 | ||
276 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | 279 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); |
277 | } | 280 | } |
@@ -467,21 +470,12 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
467 | return nr_taken; | 470 | return nr_taken; |
468 | } | 471 | } |
469 | 472 | ||
470 | 473 | /* | |
471 | /** | 474 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
472 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. | 475 | * oom-killer can be invoked. |
473 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) | ||
474 | * @gfp_mask: gfp_mask for reclaim. | ||
475 | * @memcg: a pointer to memory cgroup which is charged against. | ||
476 | * | ||
477 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | ||
478 | * memory cgroup from @mm is got and stored in *memcg. | ||
479 | * | ||
480 | * Returns 0 if success. -ENOMEM at failure. | ||
481 | */ | 476 | */ |
482 | 477 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | |
483 | int mem_cgroup_try_charge(struct mm_struct *mm, | 478 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
484 | gfp_t gfp_mask, struct mem_cgroup **memcg) | ||
485 | { | 479 | { |
486 | struct mem_cgroup *mem; | 480 | struct mem_cgroup *mem; |
487 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 481 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
@@ -528,7 +522,8 @@ int mem_cgroup_try_charge(struct mm_struct *mm, | |||
528 | continue; | 522 | continue; |
529 | 523 | ||
530 | if (!nr_retries--) { | 524 | if (!nr_retries--) { |
531 | mem_cgroup_out_of_memory(mem, gfp_mask); | 525 | if (oom) |
526 | mem_cgroup_out_of_memory(mem, gfp_mask); | ||
532 | goto nomem; | 527 | goto nomem; |
533 | } | 528 | } |
534 | } | 529 | } |
@@ -538,6 +533,25 @@ nomem: | |||
538 | return -ENOMEM; | 533 | return -ENOMEM; |
539 | } | 534 | } |
540 | 535 | ||
536 | /** | ||
537 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. | ||
538 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) | ||
539 | * @gfp_mask: gfp_mask for reclaim. | ||
540 | * @memcg: a pointer to memory cgroup which is charged against. | ||
541 | * | ||
542 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | ||
543 | * memory cgroup from @mm is got and stored in *memcg. | ||
544 | * | ||
545 | * Returns 0 if success. -ENOMEM at failure. | ||
546 | * This call can invoke OOM-Killer. | ||
547 | */ | ||
548 | |||
549 | int mem_cgroup_try_charge(struct mm_struct *mm, | ||
550 | gfp_t mask, struct mem_cgroup **memcg) | ||
551 | { | ||
552 | return __mem_cgroup_try_charge(mm, mask, memcg, true); | ||
553 | } | ||
554 | |||
541 | /* | 555 | /* |
542 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be | 556 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be |
543 | * USED state. If already USED, uncharge and return. | 557 | * USED state. If already USED, uncharge and return. |
@@ -571,11 +585,109 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
571 | mz = page_cgroup_zoneinfo(pc); | 585 | mz = page_cgroup_zoneinfo(pc); |
572 | 586 | ||
573 | spin_lock_irqsave(&mz->lru_lock, flags); | 587 | spin_lock_irqsave(&mz->lru_lock, flags); |
574 | __mem_cgroup_add_list(mz, pc); | 588 | __mem_cgroup_add_list(mz, pc, true); |
575 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 589 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
576 | unlock_page_cgroup(pc); | 590 | unlock_page_cgroup(pc); |
577 | } | 591 | } |
578 | 592 | ||
593 | /** | ||
594 | * mem_cgroup_move_account - move account of the page | ||
595 | * @pc: page_cgroup of the page. | ||
596 | * @from: mem_cgroup which the page is moved from. | ||
597 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
598 | * | ||
599 | * The caller must confirm following. | ||
600 | * 1. disable irq. | ||
601 | * 2. lru_lock of old mem_cgroup(@from) should be held. | ||
602 | * | ||
603 | * returns 0 at success, | ||
604 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
605 | * | ||
606 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | ||
607 | * new cgroup. It should be done by a caller. | ||
608 | */ | ||
609 | |||
610 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
611 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
612 | { | ||
613 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
614 | int nid, zid; | ||
615 | int ret = -EBUSY; | ||
616 | |||
617 | VM_BUG_ON(!irqs_disabled()); | ||
618 | VM_BUG_ON(from == to); | ||
619 | |||
620 | nid = page_cgroup_nid(pc); | ||
621 | zid = page_cgroup_zid(pc); | ||
622 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
623 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
624 | |||
625 | |||
626 | if (!trylock_page_cgroup(pc)) | ||
627 | return ret; | ||
628 | |||
629 | if (!PageCgroupUsed(pc)) | ||
630 | goto out; | ||
631 | |||
632 | if (pc->mem_cgroup != from) | ||
633 | goto out; | ||
634 | |||
635 | if (spin_trylock(&to_mz->lru_lock)) { | ||
636 | __mem_cgroup_remove_list(from_mz, pc); | ||
637 | css_put(&from->css); | ||
638 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
639 | pc->mem_cgroup = to; | ||
640 | css_get(&to->css); | ||
641 | __mem_cgroup_add_list(to_mz, pc, false); | ||
642 | ret = 0; | ||
643 | spin_unlock(&to_mz->lru_lock); | ||
644 | } | ||
645 | out: | ||
646 | unlock_page_cgroup(pc); | ||
647 | return ret; | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * move charges to its parent. | ||
652 | */ | ||
653 | |||
654 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | ||
655 | struct mem_cgroup *child, | ||
656 | gfp_t gfp_mask) | ||
657 | { | ||
658 | struct cgroup *cg = child->css.cgroup; | ||
659 | struct cgroup *pcg = cg->parent; | ||
660 | struct mem_cgroup *parent; | ||
661 | struct mem_cgroup_per_zone *mz; | ||
662 | unsigned long flags; | ||
663 | int ret; | ||
664 | |||
665 | /* Is ROOT ? */ | ||
666 | if (!pcg) | ||
667 | return -EINVAL; | ||
668 | |||
669 | parent = mem_cgroup_from_cont(pcg); | ||
670 | |||
671 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | ||
672 | if (ret) | ||
673 | return ret; | ||
674 | |||
675 | mz = mem_cgroup_zoneinfo(child, | ||
676 | page_cgroup_nid(pc), page_cgroup_zid(pc)); | ||
677 | |||
678 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
679 | ret = mem_cgroup_move_account(pc, child, parent); | ||
680 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
681 | |||
682 | /* drop extra refcnt */ | ||
683 | css_put(&parent->css); | ||
684 | /* uncharge if move fails */ | ||
685 | if (ret) | ||
686 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
687 | |||
688 | return ret; | ||
689 | } | ||
690 | |||
579 | /* | 691 | /* |
580 | * Charge the memory controller for page usage. | 692 | * Charge the memory controller for page usage. |
581 | * Return | 693 | * Return |
@@ -597,7 +709,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
597 | prefetchw(pc); | 709 | prefetchw(pc); |
598 | 710 | ||
599 | mem = memcg; | 711 | mem = memcg; |
600 | ret = mem_cgroup_try_charge(mm, gfp_mask, &mem); | 712 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
601 | if (ret) | 713 | if (ret) |
602 | return ret; | 714 | return ret; |
603 | 715 | ||
@@ -899,46 +1011,52 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
899 | * This routine traverse page_cgroup in given list and drop them all. | 1011 | * This routine traverse page_cgroup in given list and drop them all. |
900 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 1012 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
901 | */ | 1013 | */ |
902 | #define FORCE_UNCHARGE_BATCH (128) | 1014 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
903 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | ||
904 | struct mem_cgroup_per_zone *mz, | 1015 | struct mem_cgroup_per_zone *mz, |
905 | enum lru_list lru) | 1016 | enum lru_list lru) |
906 | { | 1017 | { |
907 | struct page_cgroup *pc; | 1018 | struct page_cgroup *pc, *busy; |
908 | struct page *page; | ||
909 | int count = FORCE_UNCHARGE_BATCH; | ||
910 | unsigned long flags; | 1019 | unsigned long flags; |
1020 | unsigned long loop; | ||
911 | struct list_head *list; | 1021 | struct list_head *list; |
1022 | int ret = 0; | ||
912 | 1023 | ||
913 | list = &mz->lists[lru]; | 1024 | list = &mz->lists[lru]; |
914 | 1025 | ||
915 | spin_lock_irqsave(&mz->lru_lock, flags); | 1026 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
916 | while (!list_empty(list)) { | 1027 | /* give some margin against EBUSY etc...*/ |
917 | pc = list_entry(list->prev, struct page_cgroup, lru); | 1028 | loop += 256; |
918 | page = pc->page; | 1029 | busy = NULL; |
919 | if (!PageCgroupUsed(pc)) | 1030 | while (loop--) { |
1031 | ret = 0; | ||
1032 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
1033 | if (list_empty(list)) { | ||
1034 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
920 | break; | 1035 | break; |
921 | get_page(page); | 1036 | } |
1037 | pc = list_entry(list->prev, struct page_cgroup, lru); | ||
1038 | if (busy == pc) { | ||
1039 | list_move(&pc->lru, list); | ||
1040 | busy = 0; | ||
1041 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
1042 | continue; | ||
1043 | } | ||
922 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1044 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
923 | /* | 1045 | |
924 | * Check if this page is on LRU. !LRU page can be found | 1046 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); |
925 | * if it's under page migration. | 1047 | if (ret == -ENOMEM) |
926 | */ | ||
927 | if (PageLRU(page)) { | ||
928 | __mem_cgroup_uncharge_common(page, | ||
929 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
930 | put_page(page); | ||
931 | if (--count <= 0) { | ||
932 | count = FORCE_UNCHARGE_BATCH; | ||
933 | cond_resched(); | ||
934 | } | ||
935 | } else { | ||
936 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
937 | break; | 1048 | break; |
938 | } | 1049 | |
939 | spin_lock_irqsave(&mz->lru_lock, flags); | 1050 | if (ret == -EBUSY || ret == -EINVAL) { |
1051 | /* found lock contention or "pc" is obsolete. */ | ||
1052 | busy = pc; | ||
1053 | cond_resched(); | ||
1054 | } else | ||
1055 | busy = NULL; | ||
940 | } | 1056 | } |
941 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1057 | if (!ret && !list_empty(list)) |
1058 | return -EBUSY; | ||
1059 | return ret; | ||
942 | } | 1060 | } |
943 | 1061 | ||
944 | /* | 1062 | /* |
@@ -947,34 +1065,68 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
947 | */ | 1065 | */ |
948 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | 1066 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) |
949 | { | 1067 | { |
950 | int ret = -EBUSY; | 1068 | int ret; |
951 | int node, zid; | 1069 | int node, zid, shrink; |
1070 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
952 | 1071 | ||
953 | css_get(&mem->css); | 1072 | css_get(&mem->css); |
954 | /* | 1073 | |
955 | * page reclaim code (kswapd etc..) will move pages between | 1074 | shrink = 0; |
956 | * active_list <-> inactive_list while we don't take a lock. | 1075 | move_account: |
957 | * So, we have to do loop here until all lists are empty. | ||
958 | */ | ||
959 | while (mem->res.usage > 0) { | 1076 | while (mem->res.usage > 0) { |
1077 | ret = -EBUSY; | ||
960 | if (atomic_read(&mem->css.cgroup->count) > 0) | 1078 | if (atomic_read(&mem->css.cgroup->count) > 0) |
961 | goto out; | 1079 | goto out; |
1080 | |||
962 | /* This is for making all *used* pages to be on LRU. */ | 1081 | /* This is for making all *used* pages to be on LRU. */ |
963 | lru_add_drain_all(); | 1082 | lru_add_drain_all(); |
964 | for_each_node_state(node, N_POSSIBLE) | 1083 | ret = 0; |
965 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 1084 | for_each_node_state(node, N_POSSIBLE) { |
1085 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | ||
966 | struct mem_cgroup_per_zone *mz; | 1086 | struct mem_cgroup_per_zone *mz; |
967 | enum lru_list l; | 1087 | enum lru_list l; |
968 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 1088 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
969 | for_each_lru(l) | 1089 | for_each_lru(l) { |
970 | mem_cgroup_force_empty_list(mem, mz, l); | 1090 | ret = mem_cgroup_force_empty_list(mem, |
1091 | mz, l); | ||
1092 | if (ret) | ||
1093 | break; | ||
1094 | } | ||
971 | } | 1095 | } |
1096 | if (ret) | ||
1097 | break; | ||
1098 | } | ||
1099 | /* it seems parent cgroup doesn't have enough mem */ | ||
1100 | if (ret == -ENOMEM) | ||
1101 | goto try_to_free; | ||
972 | cond_resched(); | 1102 | cond_resched(); |
973 | } | 1103 | } |
974 | ret = 0; | 1104 | ret = 0; |
975 | out: | 1105 | out: |
976 | css_put(&mem->css); | 1106 | css_put(&mem->css); |
977 | return ret; | 1107 | return ret; |
1108 | |||
1109 | try_to_free: | ||
1110 | /* returns EBUSY if we come here twice. */ | ||
1111 | if (shrink) { | ||
1112 | ret = -EBUSY; | ||
1113 | goto out; | ||
1114 | } | ||
1115 | /* try to free all pages in this cgroup */ | ||
1116 | shrink = 1; | ||
1117 | while (nr_retries && mem->res.usage > 0) { | ||
1118 | int progress; | ||
1119 | progress = try_to_free_mem_cgroup_pages(mem, | ||
1120 | GFP_HIGHUSER_MOVABLE); | ||
1121 | if (!progress) | ||
1122 | nr_retries--; | ||
1123 | |||
1124 | } | ||
1125 | /* try move_account...there may be some *locked* pages. */ | ||
1126 | if (mem->res.usage) | ||
1127 | goto move_account; | ||
1128 | ret = 0; | ||
1129 | goto out; | ||
978 | } | 1130 | } |
979 | 1131 | ||
980 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 1132 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
@@ -1023,11 +1175,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
1023 | return 0; | 1175 | return 0; |
1024 | } | 1176 | } |
1025 | 1177 | ||
1026 | static int mem_force_empty_write(struct cgroup *cont, unsigned int event) | ||
1027 | { | ||
1028 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); | ||
1029 | } | ||
1030 | |||
1031 | static const struct mem_cgroup_stat_desc { | 1178 | static const struct mem_cgroup_stat_desc { |
1032 | const char *msg; | 1179 | const char *msg; |
1033 | u64 unit; | 1180 | u64 unit; |
@@ -1104,10 +1251,6 @@ static struct cftype mem_cgroup_files[] = { | |||
1104 | .read_u64 = mem_cgroup_read, | 1251 | .read_u64 = mem_cgroup_read, |
1105 | }, | 1252 | }, |
1106 | { | 1253 | { |
1107 | .name = "force_empty", | ||
1108 | .trigger = mem_force_empty_write, | ||
1109 | }, | ||
1110 | { | ||
1111 | .name = "stat", | 1254 | .name = "stat", |
1112 | .read_map = mem_control_stat_show, | 1255 | .read_map = mem_control_stat_show, |
1113 | }, | 1256 | }, |