diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-01-07 21:08:33 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-08 11:31:10 -0500 |
commit | 54595fe2652f04dc8f5b985312c7cef5aa7bf722 (patch) | |
tree | 4e63df850afb307a170c045217b2097aae271b78 | |
parent | a7ba0eef3af51cd1b6fc4028e4705b3ea2ea9469 (diff) |
memcg: use css_tryget in memcg
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
css_tryget() has been newly added; with it we can tell whether a css is
alive and take a refcnt on it in a safe way. ("alive" here means that
"rmdir/destroy" has not been called.)
This patch replaces css_get() with css_tryget() in the places where I cannot
explain why css_get() is safe, and removes the memcg->obsolete flag.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/memcontrol.c | 98 |
1 files changed, 62 insertions, 36 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f9a9c5a02e2..b311f19bbe01 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -162,7 +162,6 @@ struct mem_cgroup { | |||
162 | */ | 162 | */ |
163 | bool use_hierarchy; | 163 | bool use_hierarchy; |
164 | unsigned long last_oom_jiffies; | 164 | unsigned long last_oom_jiffies; |
165 | int obsolete; | ||
166 | atomic_t refcnt; | 165 | atomic_t refcnt; |
167 | 166 | ||
168 | unsigned int swappiness; | 167 | unsigned int swappiness; |
@@ -283,6 +282,31 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
283 | struct mem_cgroup, css); | 282 | struct mem_cgroup, css); |
284 | } | 283 | } |
285 | 284 | ||
285 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | ||
286 | { | ||
287 | struct mem_cgroup *mem = NULL; | ||
288 | /* | ||
289 | * Because we hold no locks, mm->owner may be in the middle of moving | ||
290 | * to another cgroup. We use css_tryget() here even if this looks | ||
291 | * pessimistic (rather than adding locks here). | ||
292 | */ | ||
293 | rcu_read_lock(); | ||
294 | do { | ||
295 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
296 | if (unlikely(!mem)) | ||
297 | break; | ||
298 | } while (!css_tryget(&mem->css)); | ||
299 | rcu_read_unlock(); | ||
300 | return mem; | ||
301 | } | ||
302 | |||
303 | static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) | ||
304 | { | ||
305 | if (!mem) | ||
306 | return true; | ||
307 | return css_is_removed(&mem->css); | ||
308 | } | ||
309 | |||
286 | /* | 310 | /* |
287 | * Following LRU functions are allowed to be used without PCG_LOCK. | 311 | * Following LRU functions are allowed to be used without PCG_LOCK. |
288 | * Operations are called by routine of global LRU independently from memcg. | 312 | * Operations are called by routine of global LRU independently from memcg. |
@@ -622,8 +646,9 @@ mem_cgroup_get_first_node(struct mem_cgroup *root_mem) | |||
622 | { | 646 | { |
623 | struct cgroup *cgroup; | 647 | struct cgroup *cgroup; |
624 | struct mem_cgroup *ret; | 648 | struct mem_cgroup *ret; |
625 | bool obsolete = (root_mem->last_scanned_child && | 649 | bool obsolete; |
626 | root_mem->last_scanned_child->obsolete); | 650 | |
651 | obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child); | ||
627 | 652 | ||
628 | /* | 653 | /* |
629 | * Scan all children under the mem_cgroup mem | 654 | * Scan all children under the mem_cgroup mem |
@@ -636,7 +661,7 @@ mem_cgroup_get_first_node(struct mem_cgroup *root_mem) | |||
636 | 661 | ||
637 | if (!root_mem->last_scanned_child || obsolete) { | 662 | if (!root_mem->last_scanned_child || obsolete) { |
638 | 663 | ||
639 | if (obsolete) | 664 | if (obsolete && root_mem->last_scanned_child) |
640 | mem_cgroup_put(root_mem->last_scanned_child); | 665 | mem_cgroup_put(root_mem->last_scanned_child); |
641 | 666 | ||
642 | cgroup = list_first_entry(&root_mem->css.cgroup->children, | 667 | cgroup = list_first_entry(&root_mem->css.cgroup->children, |
@@ -711,7 +736,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
711 | next_mem = mem_cgroup_get_first_node(root_mem); | 736 | next_mem = mem_cgroup_get_first_node(root_mem); |
712 | 737 | ||
713 | while (next_mem != root_mem) { | 738 | while (next_mem != root_mem) { |
714 | if (next_mem->obsolete) { | 739 | if (mem_cgroup_is_obsolete(next_mem)) { |
715 | mem_cgroup_put(next_mem); | 740 | mem_cgroup_put(next_mem); |
716 | cgroup_lock(); | 741 | cgroup_lock(); |
717 | next_mem = mem_cgroup_get_first_node(root_mem); | 742 | next_mem = mem_cgroup_get_first_node(root_mem); |
@@ -769,23 +794,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
769 | * thread group leader migrates. It's possible that mm is not | 794 | * thread group leader migrates. It's possible that mm is not |
770 | * set, if so charge the init_mm (happens for pagecache usage). | 795 | * set, if so charge the init_mm (happens for pagecache usage). |
771 | */ | 796 | */ |
772 | if (likely(!*memcg)) { | 797 | mem = *memcg; |
773 | rcu_read_lock(); | 798 | if (likely(!mem)) { |
774 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 799 | mem = try_get_mem_cgroup_from_mm(mm); |
775 | if (unlikely(!mem)) { | ||
776 | rcu_read_unlock(); | ||
777 | return 0; | ||
778 | } | ||
779 | /* | ||
780 | * For every charge from the cgroup, increment reference count | ||
781 | */ | ||
782 | css_get(&mem->css); | ||
783 | *memcg = mem; | 800 | *memcg = mem; |
784 | rcu_read_unlock(); | ||
785 | } else { | 801 | } else { |
786 | mem = *memcg; | ||
787 | css_get(&mem->css); | 802 | css_get(&mem->css); |
788 | } | 803 | } |
804 | if (unlikely(!mem)) | ||
805 | return 0; | ||
806 | |||
807 | VM_BUG_ON(mem_cgroup_is_obsolete(mem)); | ||
789 | 808 | ||
790 | while (1) { | 809 | while (1) { |
791 | int ret; | 810 | int ret; |
@@ -1072,12 +1091,19 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
1072 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | 1091 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); |
1073 | } | 1092 | } |
1074 | 1093 | ||
1094 | /* | ||
1095 | * During swap-in (try_charge -> commit or cancel), the page is locked. | ||
1096 | * And when try_charge() successfully returns, one refcnt to memcg without | ||
1097 | * struct page_cgroup is acquired. This refcnt will be consumed by | ||
1098 | * "commit()" or removed by "cancel()" | ||
1099 | */ | ||
1075 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 1100 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
1076 | struct page *page, | 1101 | struct page *page, |
1077 | gfp_t mask, struct mem_cgroup **ptr) | 1102 | gfp_t mask, struct mem_cgroup **ptr) |
1078 | { | 1103 | { |
1079 | struct mem_cgroup *mem; | 1104 | struct mem_cgroup *mem; |
1080 | swp_entry_t ent; | 1105 | swp_entry_t ent; |
1106 | int ret; | ||
1081 | 1107 | ||
1082 | if (mem_cgroup_disabled()) | 1108 | if (mem_cgroup_disabled()) |
1083 | return 0; | 1109 | return 0; |
@@ -1096,10 +1122,15 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1096 | ent.val = page_private(page); | 1122 | ent.val = page_private(page); |
1097 | 1123 | ||
1098 | mem = lookup_swap_cgroup(ent); | 1124 | mem = lookup_swap_cgroup(ent); |
1099 | if (!mem || mem->obsolete) | 1125 | if (!mem) |
1126 | goto charge_cur_mm; | ||
1127 | if (!css_tryget(&mem->css)) | ||
1100 | goto charge_cur_mm; | 1128 | goto charge_cur_mm; |
1101 | *ptr = mem; | 1129 | *ptr = mem; |
1102 | return __mem_cgroup_try_charge(NULL, mask, ptr, true); | 1130 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1131 | /* drop extra refcnt from tryget */ | ||
1132 | css_put(&mem->css); | ||
1133 | return ret; | ||
1103 | charge_cur_mm: | 1134 | charge_cur_mm: |
1104 | if (unlikely(!mm)) | 1135 | if (unlikely(!mm)) |
1105 | mm = &init_mm; | 1136 | mm = &init_mm; |
@@ -1130,13 +1161,18 @@ int mem_cgroup_cache_charge_swapin(struct page *page, | |||
1130 | ent.val = page_private(page); | 1161 | ent.val = page_private(page); |
1131 | if (do_swap_account) { | 1162 | if (do_swap_account) { |
1132 | mem = lookup_swap_cgroup(ent); | 1163 | mem = lookup_swap_cgroup(ent); |
1133 | if (mem && mem->obsolete) | 1164 | if (mem) { |
1134 | mem = NULL; | 1165 | if (css_tryget(&mem->css)) |
1135 | if (mem) | 1166 | mm = NULL; /* charge to recorded */ |
1136 | mm = NULL; | 1167 | else |
1168 | mem = NULL; /* charge to current */ | ||
1169 | } | ||
1137 | } | 1170 | } |
1138 | ret = mem_cgroup_charge_common(page, mm, mask, | 1171 | ret = mem_cgroup_charge_common(page, mm, mask, |
1139 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); | 1172 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); |
1173 | /* drop extra refcnt from tryget */ | ||
1174 | if (mem) | ||
1175 | css_put(&mem->css); | ||
1140 | 1176 | ||
1141 | if (!ret && do_swap_account) { | 1177 | if (!ret && do_swap_account) { |
1142 | /* avoid double counting */ | 1178 | /* avoid double counting */ |
@@ -1178,7 +1214,6 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | |||
1178 | struct mem_cgroup *memcg; | 1214 | struct mem_cgroup *memcg; |
1179 | memcg = swap_cgroup_record(ent, NULL); | 1215 | memcg = swap_cgroup_record(ent, NULL); |
1180 | if (memcg) { | 1216 | if (memcg) { |
1181 | /* If memcg is obsolete, memcg can be != ptr */ | ||
1182 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1217 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); |
1183 | mem_cgroup_put(memcg); | 1218 | mem_cgroup_put(memcg); |
1184 | } | 1219 | } |
@@ -1421,14 +1456,9 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | |||
1421 | if (!mm) | 1456 | if (!mm) |
1422 | return 0; | 1457 | return 0; |
1423 | 1458 | ||
1424 | rcu_read_lock(); | 1459 | mem = try_get_mem_cgroup_from_mm(mm); |
1425 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1460 | if (unlikely(!mem)) |
1426 | if (unlikely(!mem)) { | ||
1427 | rcu_read_unlock(); | ||
1428 | return 0; | 1461 | return 0; |
1429 | } | ||
1430 | css_get(&mem->css); | ||
1431 | rcu_read_unlock(); | ||
1432 | 1462 | ||
1433 | do { | 1463 | do { |
1434 | progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); | 1464 | progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); |
@@ -2086,9 +2116,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
2086 | * the number of reference from swap_cgroup and free mem_cgroup when | 2116 | * the number of reference from swap_cgroup and free mem_cgroup when |
2087 | * it goes down to 0. | 2117 | * it goes down to 0. |
2088 | * | 2118 | * |
2089 | * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and | ||
2090 | * entry which points to this memcg will be ignore at swapin. | ||
2091 | * | ||
2092 | * Removal of cgroup itself succeeds regardless of refs from swap. | 2119 | * Removal of cgroup itself succeeds regardless of refs from swap. |
2093 | */ | 2120 | */ |
2094 | 2121 | ||
@@ -2174,7 +2201,6 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | |||
2174 | struct cgroup *cont) | 2201 | struct cgroup *cont) |
2175 | { | 2202 | { |
2176 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2203 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2177 | mem->obsolete = 1; | ||
2178 | mem_cgroup_force_empty(mem, false); | 2204 | mem_cgroup_force_empty(mem, false); |
2179 | } | 2205 | } |
2180 | 2206 | ||