author    Johannes Weiner <hannes@cmpxchg.org>        2014-08-06 19:05:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-08-06 21:01:17 -0400
commit    6539cc053869bd32a2db731b215b7c73b11f68d3 (patch)
tree      8dc4998ae344345bab4fd2bafeb77d5cc4e4fd85
parent    2f3e442ccceb85c51c7dffd3799bfd84de213874 (diff)
mm: memcontrol: fold mem_cgroup_do_charge()
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code
and reduces charging and uncharging overhead.  The most expensive part
of charging and uncharging is the page_cgroup bit spinlock, which is
removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup
(i.e. executing in the root memcg):

Before:

    15.36%     cat      [kernel.kallsyms]  [k] copy_user_generic_string
    13.31%     cat      [kernel.kallsyms]  [k] memset
    11.48%     cat      [kernel.kallsyms]  [k] do_mpage_readpage
     4.23%     cat      [kernel.kallsyms]  [k] get_page_from_freelist
     2.38%     cat      [kernel.kallsyms]  [k] put_page
     2.32%     cat      [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     2.18%     kswapd0  [kernel.kallsyms]  [k] __mem_cgroup_uncharge_common
     1.92%     kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.86%     cat      [kernel.kallsyms]  [k] __radix_tree_lookup
     1.62%     cat      [kernel.kallsyms]  [k] __pagevec_lru_add_fn

After:

    15.67%     cat      [kernel.kallsyms]  [k] copy_user_generic_string
    13.48%     cat      [kernel.kallsyms]  [k] memset
    11.42%     cat      [kernel.kallsyms]  [k] do_mpage_readpage
     3.98%     cat      [kernel.kallsyms]  [k] get_page_from_freelist
     2.46%     cat      [kernel.kallsyms]  [k] put_page
     2.13%     kswapd0  [kernel.kallsyms]  [k] shrink_page_list
     1.88%     cat      [kernel.kallsyms]  [k] __radix_tree_lookup
     1.67%     cat      [kernel.kallsyms]  [k] __pagevec_lru_add_fn
     1.39%     kswapd0  [kernel.kallsyms]  [k] free_pcppages_bulk
     1.30%     cat      [kernel.kallsyms]  [k] kfree

As you can see, the memcg footprint has shrunk quite a bit:

      text    data   bss     dec    hex   filename
     37970    9892   400   48262   bc86   mm/memcontrol.o.old
     35239    9892   400   45531   b1db   mm/memcontrol.o

This patch (of 13):

This function was split out because mem_cgroup_try_charge() got too
big.  But having essentially one sequence of operations arbitrarily
split in half is not good for reworking the code.  Fold it back in.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
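As context for reading the diff below, here is a minimal user-space C
sketch of the linear control flow that mem_cgroup_try_charge() ends up
with after the fold: charge a batch first, retry with the exact request
when batching fails, reclaim, and only then give up.  All names and
values in the sketch (counter_t, try_charge, the batch size, the retry
count) are illustrative stand-ins, not kernel code.

/*
 * Illustrative sketch only -- a simplified, single-threaded stand-in
 * for the folded charge path; not the kernel implementation.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define CHARGE_BATCH  32U   /* charge ahead to amortize counter updates */
#define MAX_RETRIES   5

typedef struct {
        unsigned long usage;   /* bytes currently charged */
        unsigned long limit;   /* hard limit in bytes */
} counter_t;

/* Charge size bytes unless that would exceed the limit. */
static bool counter_charge(counter_t *c, unsigned long size)
{
        if (c->usage + size > c->limit)
                return false;
        c->usage += size;
        return true;
}

/* Stand-in for reclaim: pretend we can free one page per call. */
static unsigned long reclaim_pages(counter_t *c)
{
        if (c->usage < PAGE_SIZE)
                return 0;
        c->usage -= PAGE_SIZE;
        return 1;
}

/*
 * One straight-line sequence, as after the fold: batch charge, fall
 * back to the exact request, reclaim and retry a bounded number of
 * times, else fail.
 */
static int try_charge(counter_t *c, unsigned int nr_pages)
{
        unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;
        int retries = MAX_RETRIES;

retry:
        if (counter_charge(c, batch * PAGE_SIZE))
                return 0;       /* surplus pages would refill a local stock */

        if (batch > nr_pages) { /* never reclaim for optional batching */
                batch = nr_pages;
                goto retry;
        }

        if (reclaim_pages(c) && retries-- > 0)
                goto retry;

        return -1;              /* the kernel returns -ENOMEM here */
}

int main(void)
{
        counter_t c = { .usage = 0, .limit = 64 * PAGE_SIZE };

        printf("charge 1 page: %d\n", try_charge(&c, 1));  /* succeeds */
        printf("charge 64:     %d\n", try_charge(&c, 64)); /* reclaim path */
        return 0;
}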
-rw-r--r--  mm/memcontrol.c  166
1 file changed, 64 insertions(+), 102 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f009a14918d2..fe3ad310656d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,80 +2551,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-
-/* See mem_cgroup_try_charge() for details */
-enum {
-	CHARGE_OK,		/* success */
-	CHARGE_RETRY,		/* need to retry but retry is not bad */
-	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
-	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-				unsigned int nr_pages, unsigned int min_pages,
-				bool invoke_oom)
-{
-	unsigned long csize = nr_pages * PAGE_SIZE;
-	struct mem_cgroup *mem_over_limit;
-	struct res_counter *fail_res;
-	unsigned long flags = 0;
-	int ret;
-
-	ret = res_counter_charge(&memcg->res, csize, &fail_res);
-
-	if (likely(!ret)) {
-		if (!do_swap_account)
-			return CHARGE_OK;
-		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
-		if (likely(!ret))
-			return CHARGE_OK;
-
-		res_counter_uncharge(&memcg->res, csize);
-		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
-		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-	} else
-		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-	/*
-	 * Never reclaim on behalf of optional batching, retry with a
-	 * single page instead.
-	 */
-	if (nr_pages > min_pages)
-		return CHARGE_RETRY;
-
-	if (!(gfp_mask & __GFP_WAIT))
-		return CHARGE_WOULDBLOCK;
-
-	if (gfp_mask & __GFP_NORETRY)
-		return CHARGE_NOMEM;
-
-	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
-	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
-		return CHARGE_RETRY;
-	/*
-	 * Even though the limit is exceeded at this point, reclaim
-	 * may have been able to free some pages.  Retry the charge
-	 * before killing the task.
-	 *
-	 * Only for regular pages, though: huge pages are rather
-	 * unlikely to succeed so close to the limit, and we fall back
-	 * to regular pages anyway in case of failure.
-	 */
-	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
-		return CHARGE_RETRY;
-
-	/*
-	 * At task move, charge accounts can be doubly counted. So, it's
-	 * better to wait until the end of task_move if something is going on.
-	 */
-	if (mem_cgroup_wait_acct_move(mem_over_limit))
-		return CHARGE_RETRY;
-
-	if (invoke_oom)
-		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
-
-	return CHARGE_NOMEM;
-}
-
 /**
  * mem_cgroup_try_charge - try charging a memcg
  * @memcg: memcg to charge
@@ -2641,7 +2567,11 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 {
 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	int ret;
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long nr_reclaimed;
+	unsigned long flags = 0;
+	unsigned long long size;
 
 	if (mem_cgroup_is_root(memcg))
 		goto done;
@@ -2661,44 +2591,76 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 
 	if (gfp_mask & __GFP_NOFAIL)
 		oom = false;
-again:
+retry:
 	if (consume_stock(memcg, nr_pages))
 		goto done;
 
-	do {
-		bool invoke_oom = oom && !nr_oom_retries;
+	size = batch * PAGE_SIZE;
+	if (!res_counter_charge(&memcg->res, size, &fail_res)) {
+		if (!do_swap_account)
+			goto done_restock;
+		if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+			goto done_restock;
+		res_counter_uncharge(&memcg->res, size);
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
 
-		/* If killed, bypass charge */
-		if (fatal_signal_pending(current))
-			goto bypass;
+	if (batch > nr_pages) {
+		batch = nr_pages;
+		goto retry;
+	}
 
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
-					   nr_pages, invoke_oom);
-		switch (ret) {
-		case CHARGE_OK:
-			break;
-		case CHARGE_RETRY: /* not in OOM situation but retry */
-			batch = nr_pages;
-			goto again;
-		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-			goto nomem;
-		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom || invoke_oom)
-				goto nomem;
-			nr_oom_retries--;
-			break;
-		}
-	} while (ret != CHARGE_OK);
+	if (!(gfp_mask & __GFP_WAIT))
+		goto nomem;
 
-	if (batch > nr_pages)
-		refill_stock(memcg, batch - nr_pages);
-done:
-	return 0;
+	if (gfp_mask & __GFP_NORETRY)
+		goto nomem;
+
+	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+
+	if (mem_cgroup_margin(mem_over_limit) >= batch)
+		goto retry;
+	/*
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
+	 */
+	if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+		goto retry;
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		goto retry;
+
+	if (fatal_signal_pending(current))
+		goto bypass;
+
+	if (!oom)
+		goto nomem;
+
+	if (nr_oom_retries--)
+		goto retry;
+
+	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
 nomem:
 	if (!(gfp_mask & __GFP_NOFAIL))
 		return -ENOMEM;
 bypass:
 	return -EINTR;
+
+done_restock:
+	if (batch > nr_pages)
+		refill_stock(memcg, batch - nr_pages);
+done:
+	return 0;
 }
 
 /**