author    Johannes Weiner <hannes@cmpxchg.org>        2014-08-06 19:05:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-08-06 21:01:17 -0400
commit    6539cc053869bd32a2db731b215b7c73b11f68d3 (patch)
tree      8dc4998ae344345bab4fd2bafeb77d5cc4e4fd85
parent    2f3e442ccceb85c51c7dffd3799bfd84de213874 (diff)
mm: memcontrol: fold mem_cgroup_do_charge()
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code
and reduces charging and uncharging overhead.  The most expensive part
of charging and uncharging is the page_cgroup bit spinlock, which is
removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup
(i.e. executing in the root memcg):

Before:

    15.36%     cat      [kernel.kallsyms]  [k] copy_user_generic_string
    13.31%     cat      [kernel.kallsyms]  [k] memset
    11.48%     cat      [kernel.kallsyms]  [k] do_mpage_readpage
     4.23%     cat      [kernel.kallsyms]  [k] get_page_from_freelist
     2.38%     cat      [kernel.kallsyms]  [k] put_page
     2.32%     cat      [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     2.18%     kswapd0  [kernel.kallsyms]  [k] __mem_cgroup_uncharge_common
     1.92%     kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.86%     cat      [kernel.kallsyms]  [k] __radix_tree_lookup
     1.62%     cat      [kernel.kallsyms]  [k] __pagevec_lru_add_fn

After:

    15.67%     cat      [kernel.kallsyms]  [k] copy_user_generic_string
    13.48%     cat      [kernel.kallsyms]  [k] memset
    11.42%     cat      [kernel.kallsyms]  [k] do_mpage_readpage
     3.98%     cat      [kernel.kallsyms]  [k] get_page_from_freelist
     2.46%     cat      [kernel.kallsyms]  [k] put_page
     2.13%     kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.88%     cat      [kernel.kallsyms]  [k] __radix_tree_lookup
     1.67%     cat      [kernel.kallsyms]  [k] __pagevec_lru_add_fn
     1.39%     kswapd0  [kernel.kallsyms]  [k] free_pcppages_bulk
     1.30%     cat      [kernel.kallsyms]  [k] kfree

As you can see, the memcg footprint has shrunk quite a bit:

      text    data   bss     dec    hex   filename
     37970    9892   400   48262   bc86   mm/memcontrol.o.old
     35239    9892   400   45531   b1db   mm/memcontrol.o

This patch (of 13):

This function was split out because mem_cgroup_try_charge() got too
big.  But having essentially one sequence of operations arbitrarily
split in half is not good for reworking the code.  Fold it back in.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
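As context for reading the diff below, here is a minimal user-space C
sketch of the linear control flow that mem_cgroup_try_charge() ends up
with after the fold: charge a batch first, retry with the exact request
when batching fails, reclaim, and only then give up.  All names and
values in the sketch (counter_t, try_charge, the batch size, the retry
count) are illustrative stand-ins, not kernel code.

/*
 * Illustrative sketch only -- a simplified, single-threaded stand-in
 * for the folded charge path; not the kernel implementation.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define CHARGE_BATCH  32U   /* charge ahead to amortize counter updates */
#define MAX_RETRIES   5

typedef struct {
        unsigned long usage;   /* bytes currently charged */
        unsigned long limit;   /* hard limit in bytes */
} counter_t;

/* Charge size bytes unless that would exceed the limit. */
static bool counter_charge(counter_t *c, unsigned long size)
{
        if (c->usage + size > c->limit)
                return false;
        c->usage += size;
        return true;
}

/* Stand-in for reclaim: pretend we can free one page per call. */
static unsigned long reclaim_pages(counter_t *c)
{
        if (c->usage < PAGE_SIZE)
                return 0;
        c->usage -= PAGE_SIZE;
        return 1;
}

/*
 * One straight-line sequence, as after the fold: batch charge, fall
 * back to the exact request, reclaim and retry a bounded number of
 * times, else fail.
 */
static int try_charge(counter_t *c, unsigned int nr_pages)
{
        unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;
        int retries = MAX_RETRIES;

retry:
        if (counter_charge(c, batch * PAGE_SIZE))
                return 0;       /* surplus pages would refill a local stock */

        if (batch > nr_pages) { /* never reclaim for optional batching */
                batch = nr_pages;
                goto retry;
        }

        if (reclaim_pages(c) && retries-- > 0)
                goto retry;

        return -1;              /* the kernel returns -ENOMEM here */
}

int main(void)
{
        counter_t c = { .usage = 0, .limit = 64 * PAGE_SIZE };

        printf("charge 1 page: %d\n", try_charge(&c, 1));  /* succeeds */
        printf("charge 64:     %d\n", try_charge(&c, 64)); /* reclaim path */
        return 0;
}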
-rw-r--r--  mm/memcontrol.c  166
1 file changed, 64 insertions(+), 102 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f009a14918d2..fe3ad310656d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,80 +2551,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-
-/* See mem_cgroup_try_charge() for details */
-enum {
-	CHARGE_OK,		/* success */
-	CHARGE_RETRY,		/* need to retry but retry is not bad */
-	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
-	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-				unsigned int nr_pages, unsigned int min_pages,
-				bool invoke_oom)
-{
-	unsigned long csize = nr_pages * PAGE_SIZE;
-	struct mem_cgroup *mem_over_limit;
-	struct res_counter *fail_res;
-	unsigned long flags = 0;
-	int ret;
-
-	ret = res_counter_charge(&memcg->res, csize, &fail_res);
-
-	if (likely(!ret)) {
-		if (!do_swap_account)
-			return CHARGE_OK;
-		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
-		if (likely(!ret))
-			return CHARGE_OK;
-
-		res_counter_uncharge(&memcg->res, csize);
-		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
-		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-	} else
-		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-	/*
-	 * Never reclaim on behalf of optional batching, retry with a
-	 * single page instead.
-	 */
-	if (nr_pages > min_pages)
-		return CHARGE_RETRY;
-
-	if (!(gfp_mask & __GFP_WAIT))
-		return CHARGE_WOULDBLOCK;
-
-	if (gfp_mask & __GFP_NORETRY)
-		return CHARGE_NOMEM;
-
-	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
-	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
-		return CHARGE_RETRY;
-	/*
-	 * Even though the limit is exceeded at this point, reclaim
-	 * may have been able to free some pages.  Retry the charge
-	 * before killing the task.
-	 *
-	 * Only for regular pages, though: huge pages are rather
-	 * unlikely to succeed so close to the limit, and we fall back
-	 * to regular pages anyway in case of failure.
-	 */
-	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
-		return CHARGE_RETRY;
-
-	/*
-	 * At task move, charge accounts can be doubly counted. So, it's
-	 * better to wait until the end of task_move if something is going on.
-	 */
-	if (mem_cgroup_wait_acct_move(mem_over_limit))
-		return CHARGE_RETRY;
-
-	if (invoke_oom)
-		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
-
-	return CHARGE_NOMEM;
-}
-
 /**
  * mem_cgroup_try_charge - try charging a memcg
  * @memcg: memcg to charge
@@ -2641,7 +2567,11 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 {
 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	int ret;
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long nr_reclaimed;
+	unsigned long flags = 0;
+	unsigned long long size;
 
 	if (mem_cgroup_is_root(memcg))
 		goto done;
@@ -2661,44 +2591,76 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 
 	if (gfp_mask & __GFP_NOFAIL)
 		oom = false;
-again:
+retry:
 	if (consume_stock(memcg, nr_pages))
 		goto done;
 
-	do {
-		bool invoke_oom = oom && !nr_oom_retries;
+	size = batch * PAGE_SIZE;
+	if (!res_counter_charge(&memcg->res, size, &fail_res)) {
+		if (!do_swap_account)
+			goto done_restock;
+		if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+			goto done_restock;
+		res_counter_uncharge(&memcg->res, size);
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
 
-		/* If killed, bypass charge */
-		if (fatal_signal_pending(current))
-			goto bypass;
+	if (batch > nr_pages) {
+		batch = nr_pages;
+		goto retry;
+	}
 
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
-					   nr_pages, invoke_oom);
-		switch (ret) {
-		case CHARGE_OK:
-			break;
-		case CHARGE_RETRY: /* not in OOM situation but retry */
-			batch = nr_pages;
-			goto again;
-		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-			goto nomem;
-		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom || invoke_oom)
-				goto nomem;
-			nr_oom_retries--;
-			break;
-		}
-	} while (ret != CHARGE_OK);
+	if (!(gfp_mask & __GFP_WAIT))
+		goto nomem;
 
-	if (batch > nr_pages)
-		refill_stock(memcg, batch - nr_pages);
-done:
-	return 0;
+	if (gfp_mask & __GFP_NORETRY)
+		goto nomem;
+
+	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+
+	if (mem_cgroup_margin(mem_over_limit) >= batch)
+		goto retry;
+	/*
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
+	 */
+	if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+		goto retry;
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		goto retry;
+
+	if (fatal_signal_pending(current))
+		goto bypass;
+
+	if (!oom)
+		goto nomem;
+
+	if (nr_oom_retries--)
+		goto retry;
+
+	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
 nomem:
 	if (!(gfp_mask & __GFP_NOFAIL))
 		return -ENOMEM;
 bypass:
 	return -EINTR;
+
+done_restock:
+	if (batch > nr_pages)
+		refill_stock(memcg, batch - nr_pages);
+done:
+	return 0;
 }
 
 /**