 include/linux/memcontrol.h | 110
 mm/memcontrol.c            | 170
 2 files changed, 280 insertions(+), 0 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e98a74c0c9c0..afa2ad40457e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
 #define _LINUX_MEMCONTROL_H
 #include <linux/cgroup.h>
 #include <linux/vm_event_item.h>
+#include <linux/hardirq.h>
 
 struct mem_cgroup;
 struct page_cgroup;
@@ -414,5 +415,114 @@ static inline void sock_release_memcg(struct sock *sk)
 {
 }
 #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool memcg_kmem_enabled(void)
+{
+	return true;
+}
+
+/*
+ * In general, we'll do everything in our power to not incur any overhead
+ * for non-memcg users of the kmem functions. Not even a function call, if we
+ * can avoid it.
+ *
+ * Therefore, we'll inline all those functions so that in the best case, we'll
+ * see that kmemcg is off for everybody and proceed quickly. If it is on,
+ * we'll still do most of the flag checking inline. We check a lot of
+ * conditions, but because they are pretty simple, they are expected to be
+ * fast.
+ */
+bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
+					int order);
+void __memcg_kmem_commit_charge(struct page *page,
+				struct mem_cgroup *memcg, int order);
+void __memcg_kmem_uncharge_pages(struct page *page, int order);
+
+/**
+ * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
+ * @gfp: the gfp allocation flags.
+ * @memcg: a pointer to the memcg this was charged against.
+ * @order: allocation order.
+ *
+ * Returns true if the memcg to which the current task belongs can hold this
+ * allocation.
+ *
+ * We return true automatically if this allocation is not to be accounted to
+ * any memcg.
+ */
+static inline bool
+memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+{
+	if (!memcg_kmem_enabled())
+		return true;
+
+	/*
+	 * __GFP_NOFAIL allocations will move on even if charging is not
+	 * possible. Therefore we don't even try, and leave this allocation
+	 * unaccounted. We could in theory charge it with
+	 * res_counter_charge_nofail, but we hope those allocations are rare,
+	 * and won't be worth the trouble.
+	 */
+	if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+		return true;
+	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
+		return true;
+
+	/* If the task is dying, just let it go. */
+	if (unlikely(fatal_signal_pending(current)))
+		return true;
+
+	return __memcg_kmem_newpage_charge(gfp, memcg, order);
+}
+
+/**
+ * memcg_kmem_uncharge_pages: uncharge pages from memcg
+ * @page: pointer to struct page being freed
+ * @order: allocation order.
+ *
+ * There is no need to specify the memcg here; it is embedded in page_cgroup.
+ */
+static inline void
+memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_uncharge_pages(page, order);
+}
+
+/**
+ * memcg_kmem_commit_charge: embeds correct memcg in a page
+ * @page: pointer to struct page recently allocated
+ * @memcg: the memcg structure we charged against
+ * @order: allocation order.
+ *
+ * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
+ * failure of the allocation. If @page is NULL, this function will revert the
+ * charges. Otherwise, it will commit the memcg given by @memcg to the
+ * corresponding page_cgroup.
+ */
+static inline void
+memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+{
+	if (memcg_kmem_enabled() && memcg)
+		__memcg_kmem_commit_charge(page, memcg, order);
+}
+
+#else
+static inline bool
+memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+{
+	return true;
+}
+
+static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+}
+
+static inline void
+memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
 
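Taken together, these wrappers define a three-step protocol around page allocation: charge before allocating, commit unconditionally afterwards, uncharge at free time. A minimal sketch of an allocation-side caller, assuming a hypothetical helper name (alloc_accounted_pages is not part of this patch; only the memcg_kmem_* calls are):

/*
 * Hypothetical illustration only: a sketch of how an allocation site
 * would use the API above. Accounting only happens when the gfp mask
 * carries __GFP_KMEMCG; otherwise memcg_kmem_newpage_charge() returns
 * true immediately and nothing is charged.
 */
static struct page *alloc_accounted_pages(gfp_t gfp, int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page *page;

	if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;	/* memcg over its kmem limit: fail */

	page = alloc_pages(gfp, order);
	/* Must run even on failure: a NULL page reverts the charge. */
	memcg_kmem_commit_charge(page, memcg, order);
	return page;
}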
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bba1cb4bbb82..b9afa060b8d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
  * Copyright (C) 2009 Nokia Corporation
  * Author: Kirill A. Shutemov
  *
+ * Kernel Memory Controller
+ * Copyright (C) 2012 Parallels Inc. and Google Inc.
+ * Authors: Glauber Costa and Suleiman Souhlal
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -2661,6 +2665,172 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	memcg_check_events(memcg, page);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
+{
+	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
+		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+}
+
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+{
+	struct res_counter *fail_res;
+	struct mem_cgroup *_memcg;
+	int ret = 0;
+	bool may_oom;
+
+	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
+	if (ret)
+		return ret;
+
+	/*
+	 * Conditions under which we can wait for the oom_killer. Those are
+	 * the same conditions tested by the core page allocator.
+	 */
+	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
+
+	_memcg = memcg;
+	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
+				      &_memcg, may_oom);
+
+	if (ret == -EINTR) {
+		/*
+		 * __mem_cgroup_try_charge() chose to bypass to root due to
+		 * OOM kill or fatal signal. Since our only options are to
+		 * either fail the allocation or charge it to this cgroup, do
+		 * it as a temporary condition. But we can't fail. From a
+		 * kmem/slab perspective, the cache has already been selected
+		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
+		 * our minds.
+		 *
+		 * This condition will only trigger if the task entered
+		 * memcg_charge_kmem in a sane state, but was OOM-killed during
+		 * __mem_cgroup_try_charge() above. Tasks that were already
+		 * dying when the allocation triggers should already have been
+		 * directed to the root cgroup in memcontrol.h.
+		 */
+		res_counter_charge_nofail(&memcg->res, size, &fail_res);
+		if (do_swap_account)
+			res_counter_charge_nofail(&memcg->memsw, size,
+						  &fail_res);
+		ret = 0;
+	} else if (ret)
+		res_counter_uncharge(&memcg->kmem, size);
+
+	return ret;
+}
+
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+{
+	res_counter_uncharge(&memcg->kmem, size);
+	res_counter_uncharge(&memcg->res, size);
+	if (do_swap_account)
+		res_counter_uncharge(&memcg->memsw, size);
+}
+
+/*
+ * We need to verify if the allocation against current->mm->owner's memcg is
+ * possible for the given order. But the page is not allocated yet, so we'll
+ * need a further commit step to do the final arrangements.
+ *
+ * It is possible for the task to switch cgroups in the meantime, so at
+ * commit time, we can't rely on task conversion any longer. We'll then use
+ * the handle argument to return to the caller which cgroup we should commit
+ * against. We could also return the memcg directly and avoid the pointer
+ * passing, but a boolean return value gives better semantics considering
+ * the compiled-out case as well.
+ *
+ * Returning true means the allocation is possible.
+ */
+bool
+__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+{
+	struct mem_cgroup *memcg;
+	int ret;
+
+	*_memcg = NULL;
+	memcg = try_get_mem_cgroup_from_mm(current->mm);
+
+	/*
+	 * Very rare case described in mem_cgroup_from_task. Unfortunately
+	 * there isn't much we can do without complicating this too much,
+	 * and it would be gfp-dependent anyway. Just let it go.
+	 */
+	if (unlikely(!memcg))
+		return true;
+
+	if (!memcg_can_account_kmem(memcg)) {
+		css_put(&memcg->css);
+		return true;
+	}
+
+	mem_cgroup_get(memcg);
+
+	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+	if (!ret)
+		*_memcg = memcg;
+	else
+		mem_cgroup_put(memcg);
+
+	css_put(&memcg->css);
+	return (ret == 0);
+}
+
+void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
+				int order)
+{
+	struct page_cgroup *pc;
+
+	VM_BUG_ON(mem_cgroup_is_root(memcg));
+
+	/* The page allocation failed. Revert. */
+	if (!page) {
+		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+		mem_cgroup_put(memcg);
+		return;
+	}
+
+	pc = lookup_page_cgroup(page);
+	lock_page_cgroup(pc);
+	pc->mem_cgroup = memcg;
+	SetPageCgroupUsed(pc);
+	unlock_page_cgroup(pc);
+}
+
+void __memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+	struct mem_cgroup *memcg = NULL;
+	struct page_cgroup *pc;
+
+
+	pc = lookup_page_cgroup(page);
+	/*
+	 * Fast unlocked return. The flag could theoretically have changed
+	 * since, so we check again after taking the lock.
+	 */
+	if (!PageCgroupUsed(pc))
+		return;
+
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
+		memcg = pc->mem_cgroup;
+		ClearPageCgroupUsed(pc);
+	}
+	unlock_page_cgroup(pc);
+
+	/*
+	 * We trust that a page has a memcg associated with it only if it is
+	 * a valid (accounted) allocation.
+	 */
+	if (!memcg)
+		return;
+
+	VM_BUG_ON(mem_cgroup_is_root(memcg));
+	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+	mem_cgroup_put(memcg);
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
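
On the free side, the requirement is symmetric: accounted pages must pass through memcg_kmem_uncharge_pages() before going back to the page allocator, so that the memcg recovered from the page_cgroup is uncharged and the reference taken by mem_cgroup_get() at charge time is dropped; this pairing is what lets accounted pages safely outlive the removal of their cgroup. A minimal sketch of the free-side counterpart, again with a hypothetical helper name (free_accounted_pages is not part of this patch):

/*
 * Hypothetical illustration only: the free-side counterpart to the
 * charge/commit sketch above. No memcg argument is needed, because
 * __memcg_kmem_uncharge_pages() recovers it from the page_cgroup and
 * drops the reference taken at charge time.
 */
static void free_accounted_pages(struct page *page, int order)
{
	memcg_kmem_uncharge_pages(page, order);
	__free_pages(page, order);
}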