diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2012-01-12 20:19:01 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-12 23:13:07 -0500 |
commit | 38c5d72f3ebe5ddd57d2f08dc035070fc6c9a287 (patch) | |
tree | 6461c0dfec95dccc92191f059fbe99b5660a8860 | |
parent | 4e5f01c2b9b94321992acb09c35d34f5ee5bb274 (diff) |
memcg: simplify LRU handling by new rule
Currently, during LRU handling, the memory cgroup code has to do complicated
work to obtain a valid pc->mem_cgroup, which may be overwritten.
This patch relaxes the protocol. It guarantees that
- when pc->mem_cgroup is overwritten, the page must not be on the LRU.
With this rule, the LRU routines can trust pc->mem_cgroup and do not need to
check bits in pc->flags. This new rule may add a small overhead to swapin, but
in most cases LRU handling gets faster.
After this patch, PCG_ACCT_LRU bit is obsolete and removed.
[akpm@linux-foundation.org: remove unneeded VM_BUG_ON(), restore hannes's christmas tree]
[akpm@linux-foundation.org: clean up code comment]
[hughd@google.com: fix NULL mem_cgroup_try_charge]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/page_cgroup.h | 8 | ||||
-rw-r--r-- | mm/memcontrol.c | 123 |
2 files changed, 54 insertions, 77 deletions
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 1153095ee457..a2d11771c84b 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h | |||
@@ -10,8 +10,6 @@ enum { | |||
10 | /* flags for mem_cgroup and file and I/O status */ | 10 | /* flags for mem_cgroup and file and I/O status */ |
11 | PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ | 11 | PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ |
12 | PCG_FILE_MAPPED, /* page is accounted as "mapped" */ | 12 | PCG_FILE_MAPPED, /* page is accounted as "mapped" */ |
13 | /* No lock in page_cgroup */ | ||
14 | PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ | ||
15 | __NR_PCG_FLAGS, | 13 | __NR_PCG_FLAGS, |
16 | }; | 14 | }; |
17 | 15 | ||
@@ -75,12 +73,6 @@ TESTPCGFLAG(Used, USED) | |||
75 | CLEARPCGFLAG(Used, USED) | 73 | CLEARPCGFLAG(Used, USED) |
76 | SETPCGFLAG(Used, USED) | 74 | SETPCGFLAG(Used, USED) |
77 | 75 | ||
78 | SETPCGFLAG(AcctLRU, ACCT_LRU) | ||
79 | CLEARPCGFLAG(AcctLRU, ACCT_LRU) | ||
80 | TESTPCGFLAG(AcctLRU, ACCT_LRU) | ||
81 | TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) | ||
82 | |||
83 | |||
84 | SETPCGFLAG(FileMapped, FILE_MAPPED) | 76 | SETPCGFLAG(FileMapped, FILE_MAPPED) |
85 | CLEARPCGFLAG(FileMapped, FILE_MAPPED) | 77 | CLEARPCGFLAG(FileMapped, FILE_MAPPED) |
86 | TESTPCGFLAG(FileMapped, FILE_MAPPED) | 78 | TESTPCGFLAG(FileMapped, FILE_MAPPED) |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c74102d6eb5a..ff051ee8fb4b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1040,30 +1040,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1040 | return &zone->lruvec; | 1040 | return &zone->lruvec; |
1041 | 1041 | ||
1042 | pc = lookup_page_cgroup(page); | 1042 | pc = lookup_page_cgroup(page); |
1043 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | 1043 | memcg = pc->mem_cgroup; |
1044 | /* | ||
1045 | * putback: charge: | ||
1046 | * SetPageLRU SetPageCgroupUsed | ||
1047 | * smp_mb smp_mb | ||
1048 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1049 | * | ||
1050 | * Ensure that one of the two sides adds the page to the memcg | ||
1051 | * LRU during a race. | ||
1052 | */ | ||
1053 | smp_mb(); | ||
1054 | /* | ||
1055 | * If the page is uncharged, it may be freed soon, but it | ||
1056 | * could also be swap cache (readahead, swapoff) that needs to | ||
1057 | * be reclaimable in the future. root_mem_cgroup will babysit | ||
1058 | * it for the time being. | ||
1059 | */ | ||
1060 | if (PageCgroupUsed(pc)) { | ||
1061 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1062 | smp_rmb(); | ||
1063 | memcg = pc->mem_cgroup; | ||
1064 | SetPageCgroupAcctLRU(pc); | ||
1065 | } else | ||
1066 | memcg = root_mem_cgroup; | ||
1067 | mz = page_cgroup_zoneinfo(memcg, page); | 1044 | mz = page_cgroup_zoneinfo(memcg, page); |
1068 | /* compound_order() is stabilized through lru_lock */ | 1045 | /* compound_order() is stabilized through lru_lock */ |
1069 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 1046 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); |
@@ -1090,18 +1067,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | |||
1090 | return; | 1067 | return; |
1091 | 1068 | ||
1092 | pc = lookup_page_cgroup(page); | 1069 | pc = lookup_page_cgroup(page); |
1093 | /* | 1070 | memcg = pc->mem_cgroup; |
1094 | * root_mem_cgroup babysits uncharged LRU pages, but | 1071 | VM_BUG_ON(!memcg); |
1095 | * PageCgroupUsed is cleared when the page is about to get | ||
1096 | * freed. PageCgroupAcctLRU remembers whether the | ||
1097 | * LRU-accounting happened against pc->mem_cgroup or | ||
1098 | * root_mem_cgroup. | ||
1099 | */ | ||
1100 | if (TestClearPageCgroupAcctLRU(pc)) { | ||
1101 | VM_BUG_ON(!pc->mem_cgroup); | ||
1102 | memcg = pc->mem_cgroup; | ||
1103 | } else | ||
1104 | memcg = root_mem_cgroup; | ||
1105 | mz = page_cgroup_zoneinfo(memcg, page); | 1072 | mz = page_cgroup_zoneinfo(memcg, page); |
1106 | /* huge page split is done under lru_lock. so, we have no races. */ | 1073 | /* huge page split is done under lru_lock. so, we have no races. */ |
1107 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 1074 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); |
@@ -2217,8 +2184,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2217 | } | 2184 | } |
2218 | 2185 | ||
2219 | /* | 2186 | /* |
2220 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 2187 | * __mem_cgroup_try_charge() does |
2221 | * oom-killer can be invoked. | 2188 | * 1. detect memcg to be charged against from passed *mm and *ptr, |
2189 | * 2. update res_counter | ||
2190 | * 3. call memory reclaim if necessary. | ||
2191 | * | ||
2192 | * In some special case, if the task is fatal, fatal_signal_pending() or | ||
2193 | * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup | ||
2194 | * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon | ||
2195 | * as possible without any hazards. 2: all pages should have a valid | ||
2196 | * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg | ||
2197 | * pointer, that is treated as a charge to root_mem_cgroup. | ||
2198 | * | ||
2199 | * So __mem_cgroup_try_charge() will return | ||
2200 | * 0 ... on success, filling *ptr with a valid memcg pointer. | ||
2201 | * -ENOMEM ... charge failure because of resource limits. | ||
2202 | * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. | ||
2203 | * | ||
2204 | * Unlike the exported interface, an "oom" parameter is added. if oom==true, | ||
2205 | * the oom-killer can be invoked. | ||
2222 | */ | 2206 | */ |
2223 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2207 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
2224 | gfp_t gfp_mask, | 2208 | gfp_t gfp_mask, |
@@ -2247,7 +2231,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2247 | * set, if so charge the init_mm (happens for pagecache usage). | 2231 | * set, if so charge the init_mm (happens for pagecache usage). |
2248 | */ | 2232 | */ |
2249 | if (!*ptr && !mm) | 2233 | if (!*ptr && !mm) |
2250 | goto bypass; | 2234 | *ptr = root_mem_cgroup; |
2251 | again: | 2235 | again: |
2252 | if (*ptr) { /* css should be a valid one */ | 2236 | if (*ptr) { /* css should be a valid one */ |
2253 | memcg = *ptr; | 2237 | memcg = *ptr; |
@@ -2273,7 +2257,9 @@ again: | |||
2273 | * task-struct. So, mm->owner can be NULL. | 2257 | * task-struct. So, mm->owner can be NULL. |
2274 | */ | 2258 | */ |
2275 | memcg = mem_cgroup_from_task(p); | 2259 | memcg = mem_cgroup_from_task(p); |
2276 | if (!memcg || mem_cgroup_is_root(memcg)) { | 2260 | if (!memcg) |
2261 | memcg = root_mem_cgroup; | ||
2262 | if (mem_cgroup_is_root(memcg)) { | ||
2277 | rcu_read_unlock(); | 2263 | rcu_read_unlock(); |
2278 | goto done; | 2264 | goto done; |
2279 | } | 2265 | } |
@@ -2348,8 +2334,8 @@ nomem: | |||
2348 | *ptr = NULL; | 2334 | *ptr = NULL; |
2349 | return -ENOMEM; | 2335 | return -ENOMEM; |
2350 | bypass: | 2336 | bypass: |
2351 | *ptr = NULL; | 2337 | *ptr = root_mem_cgroup; |
2352 | return 0; | 2338 | return -EINTR; |
2353 | } | 2339 | } |
2354 | 2340 | ||
2355 | /* | 2341 | /* |
@@ -2457,6 +2443,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2457 | 2443 | ||
2458 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); | 2444 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); |
2459 | unlock_page_cgroup(pc); | 2445 | unlock_page_cgroup(pc); |
2446 | WARN_ON_ONCE(PageLRU(page)); | ||
2460 | /* | 2447 | /* |
2461 | * "charge_statistics" updated event counter. Then, check it. | 2448 | * "charge_statistics" updated event counter. Then, check it. |
2462 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2449 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
@@ -2468,7 +2455,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2468 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2455 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2469 | 2456 | ||
2470 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2457 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ |
2471 | (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) | 2458 | (1 << PCG_MIGRATION)) |
2472 | /* | 2459 | /* |
2473 | * Because tail pages are not marked as "used", set it. We're under | 2460 | * Because tail pages are not marked as "used", set it. We're under |
2474 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2461 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
@@ -2478,7 +2465,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2478 | void mem_cgroup_split_huge_fixup(struct page *head) | 2465 | void mem_cgroup_split_huge_fixup(struct page *head) |
2479 | { | 2466 | { |
2480 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 2467 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
2468 | struct mem_cgroup_per_zone *mz; | ||
2481 | struct page_cgroup *pc; | 2469 | struct page_cgroup *pc; |
2470 | enum lru_list lru; | ||
2482 | int i; | 2471 | int i; |
2483 | 2472 | ||
2484 | if (mem_cgroup_disabled()) | 2473 | if (mem_cgroup_disabled()) |
@@ -2487,23 +2476,15 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
2487 | pc = head_pc + i; | 2476 | pc = head_pc + i; |
2488 | pc->mem_cgroup = head_pc->mem_cgroup; | 2477 | pc->mem_cgroup = head_pc->mem_cgroup; |
2489 | smp_wmb();/* see __commit_charge() */ | 2478 | smp_wmb();/* see __commit_charge() */ |
2490 | /* | ||
2491 | * LRU flags cannot be copied because we need to add tail | ||
2492 | * page to LRU by generic call and our hooks will be called. | ||
2493 | */ | ||
2494 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 2479 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
2495 | } | 2480 | } |
2496 | 2481 | /* | |
2497 | if (PageCgroupAcctLRU(head_pc)) { | 2482 | * Tail pages will be added to LRU. |
2498 | enum lru_list lru; | 2483 | * We hold lru_lock,then,reduce counter directly. |
2499 | struct mem_cgroup_per_zone *mz; | 2484 | */ |
2500 | /* | 2485 | lru = page_lru(head); |
2501 | * We hold lru_lock, then, reduce counter directly. | 2486 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); |
2502 | */ | 2487 | MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; |
2503 | lru = page_lru(head); | ||
2504 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); | ||
2505 | MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; | ||
2506 | } | ||
2507 | } | 2488 | } |
2508 | #endif | 2489 | #endif |
2509 | 2490 | ||
@@ -2620,7 +2601,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2620 | 2601 | ||
2621 | parent = mem_cgroup_from_cont(pcg); | 2602 | parent = mem_cgroup_from_cont(pcg); |
2622 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); | 2603 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2623 | if (ret || !parent) | 2604 | if (ret) |
2624 | goto put_back; | 2605 | goto put_back; |
2625 | 2606 | ||
2626 | if (nr_pages > 1) | 2607 | if (nr_pages > 1) |
@@ -2667,9 +2648,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2667 | 2648 | ||
2668 | pc = lookup_page_cgroup(page); | 2649 | pc = lookup_page_cgroup(page); |
2669 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 2650 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2670 | if (ret || !memcg) | 2651 | if (ret == -ENOMEM) |
2671 | return ret; | 2652 | return ret; |
2672 | |||
2673 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); | 2653 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); |
2674 | return 0; | 2654 | return 0; |
2675 | } | 2655 | } |
@@ -2736,10 +2716,9 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2736 | if (!page_is_file_cache(page)) | 2716 | if (!page_is_file_cache(page)) |
2737 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 2717 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
2738 | 2718 | ||
2739 | if (!PageSwapCache(page)) { | 2719 | if (!PageSwapCache(page)) |
2740 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | 2720 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); |
2741 | WARN_ON_ONCE(PageLRU(page)); | 2721 | else { /* page is swapcache/shmem */ |
2742 | } else { /* page is swapcache/shmem */ | ||
2743 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | 2722 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); |
2744 | if (!ret) | 2723 | if (!ret) |
2745 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | 2724 | __mem_cgroup_commit_charge_swapin(page, memcg, type); |
@@ -2781,11 +2760,16 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2781 | *memcgp = memcg; | 2760 | *memcgp = memcg; |
2782 | ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); | 2761 | ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); |
2783 | css_put(&memcg->css); | 2762 | css_put(&memcg->css); |
2763 | if (ret == -EINTR) | ||
2764 | ret = 0; | ||
2784 | return ret; | 2765 | return ret; |
2785 | charge_cur_mm: | 2766 | charge_cur_mm: |
2786 | if (unlikely(!mm)) | 2767 | if (unlikely(!mm)) |
2787 | mm = &init_mm; | 2768 | mm = &init_mm; |
2788 | return __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 2769 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2770 | if (ret == -EINTR) | ||
2771 | ret = 0; | ||
2772 | return ret; | ||
2789 | } | 2773 | } |
2790 | 2774 | ||
2791 | static void | 2775 | static void |
@@ -3245,7 +3229,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3245 | *memcgp = memcg; | 3229 | *memcgp = memcg; |
3246 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); | 3230 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); |
3247 | css_put(&memcg->css);/* drop extra refcnt */ | 3231 | css_put(&memcg->css);/* drop extra refcnt */ |
3248 | if (ret || *memcgp == NULL) { | 3232 | if (ret) { |
3249 | if (PageAnon(page)) { | 3233 | if (PageAnon(page)) { |
3250 | lock_page_cgroup(pc); | 3234 | lock_page_cgroup(pc); |
3251 | ClearPageCgroupMigration(pc); | 3235 | ClearPageCgroupMigration(pc); |
@@ -3255,6 +3239,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3255 | */ | 3239 | */ |
3256 | mem_cgroup_uncharge_page(page); | 3240 | mem_cgroup_uncharge_page(page); |
3257 | } | 3241 | } |
3242 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3258 | return -ENOMEM; | 3243 | return -ENOMEM; |
3259 | } | 3244 | } |
3260 | /* | 3245 | /* |
@@ -3674,7 +3659,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3674 | pc = lookup_page_cgroup(page); | 3659 | pc = lookup_page_cgroup(page); |
3675 | 3660 | ||
3676 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3661 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); |
3677 | if (ret == -ENOMEM) | 3662 | if (ret == -ENOMEM || ret == -EINTR) |
3678 | break; | 3663 | break; |
3679 | 3664 | ||
3680 | if (ret == -EBUSY || ret == -EINVAL) { | 3665 | if (ret == -EBUSY || ret == -EINVAL) { |
@@ -5065,9 +5050,9 @@ one_by_one: | |||
5065 | } | 5050 | } |
5066 | ret = __mem_cgroup_try_charge(NULL, | 5051 | ret = __mem_cgroup_try_charge(NULL, |
5067 | GFP_KERNEL, 1, &memcg, false); | 5052 | GFP_KERNEL, 1, &memcg, false); |
5068 | if (ret || !memcg) | 5053 | if (ret) |
5069 | /* mem_cgroup_clear_mc() will do uncharge later */ | 5054 | /* mem_cgroup_clear_mc() will do uncharge later */ |
5070 | return -ENOMEM; | 5055 | return ret; |
5071 | mc.precharge++; | 5056 | mc.precharge++; |
5072 | } | 5057 | } |
5073 | return ret; | 5058 | return ret; |