author    Balbir Singh <balbir@linux.vnet.ibm.com>    2008-04-29 04:00:16 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2008-04-29 11:06:10 -0400
commit    cf475ad28ac35cc9ba612d67158f29b73b38b05d
tree      2c7cd568d00357bd42643ea602884e731cc24f26 /kernel
parent    29486df325e1fe6e1764afcb19e3370804c2b002
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead add an owner.
This approach was suggested by Paul Menage. Its advantage is that, once
mm->owner is known, the corresponding cgroup can be determined using the
subsystem id. It also allows several control groups that are virtually
grouped by mm_struct to exist independently of the memory controller,
i.e., without adding a mem_cgroup-style member to mm_struct for each
controller.
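For illustration only (the memory-controller side lives in mm/memcontrol.c
and is outside the kernel/ diff shown below; the helper name here is made
up), a controller can resolve its cgroup state from mm->owner roughly like
this:

/*
 * Sketch: derive the memory controller's state from mm->owner via the
 * generic task_subsys_state() helper and the controller's subsystem id.
 * mm->owner replaces the old mm->mem_cgroup pointer.
 */
static struct mem_cgroup *mem_cgroup_from_mm_owner(struct mm_struct *mm)
{
	return container_of(task_subsys_state(mm->owner, mem_cgroup_subsys_id),
			    struct mem_cgroup, css);
}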
A new config option, CONFIG_MM_OWNER, is added, and the memory resource
controller selects it.
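Outside the kernel/ diffstat shown below, the header side of this option is
expected to provide no-op stubs when CONFIG_MM_OWNER is off, so the
unconditional callers in kernel/fork.c and kernel/exit.c compile either way.
A sketch, assuming the declarations live in include/linux/sched.h:

#ifdef CONFIG_MM_OWNER
extern void mm_update_next_owner(struct mm_struct *mm);
extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
#else
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}

static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
}
#endif /* CONFIG_MM_OWNER */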
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_owner_changed callback is invoked with the task_lock() of
the new owner task held, just prior to changing mm->owner.
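As a hedged illustration of the new hook (the subsystem name and functions
below are hypothetical, and other mandatory cgroup_subsys fields are
omitted; only the three-argument signature comes from the kernel/cgroup.c
diff below), a controller would register the callback roughly like this:

/*
 * Hypothetical "foo" subsystem wiring up the new mm_owner_changed hook.
 * The callback runs with the new owner's task_lock() held, so it must
 * not sleep; it only migrates foo's per-mm accounting between cgroups.
 */
static void foo_mm_owner_changed(struct cgroup_subsys *ss,
				 struct cgroup *old, struct cgroup *new)
{
	/* move any per-mm charges from 'old' to 'new' here */
}

struct cgroup_subsys foo_subsys = {
	.name = "foo",
	.mm_owner_changed = foo_mm_owner_changed,
};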
I am indebted to Paul Menage for his several reviews of this patchset and
for helping me make it lighter and simpler.
This patch was tested on a powerpc box; it was compiled with the MM_OWNER
config option both turned on and off.
After the thread group leader exits, it is moved to the init_css_set by
cgroup_exit(), so all future charges from running threads are redirected
to the init_css_set's subsystems.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
 kernel/cgroup.c | 30
 kernel/exit.c   | 83
 kernel/fork.c   | 11
 3 files changed, 121 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index abc433772e5a..b9d467d83fc1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -119,6 +119,7 @@ static int root_count;
  * be called.
  */
 static int need_forkexit_callback;
+static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -2498,6 +2499,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
 	need_forkexit_callback |= ss->fork || ss->exit;
+	need_mm_owner_callback |= !!ss->mm_owner_changed;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -2748,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
 	}
 }
 
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @p: the new owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not
+ * invoke this routine, since it assigns the mm->owner the first time
+ * and does not change it.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+	struct cgroup *oldcgrp, *newcgrp;
+
+	if (need_mm_owner_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			oldcgrp = task_cgroup(old, ss->subsys_id);
+			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (oldcgrp == newcgrp)
+				continue;
+			if (ss->mm_owner_changed)
+				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+		}
+	}
+}
+#endif /* CONFIG_MM_OWNER */
+
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..ae0f2c4e452b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
 
 EXPORT_SYMBOL_GPL(exit_fs);
 
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm, lets find a new owner for it
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	/*
+	 * If there are other users of the mm and the owner (us) is exiting
+	 * we need to find a new owner to take on the responsibility.
+	 */
+	if (!mm)
+		return 0;
+	if (atomic_read(&mm->mm_users) <= 1)
+		return 0;
+	if (mm->owner != p)
+		return 0;
+	return 1;
+}
+
+void mm_update_next_owner(struct mm_struct *mm)
+{
+	struct task_struct *c, *g, *p = current;
+
+retry:
+	if (!mm_need_new_owner(mm, p))
+		return;
+
+	read_lock(&tasklist_lock);
+	/*
+	 * Search in the children
+	 */
+	list_for_each_entry(c, &p->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search in the siblings
+	 */
+	list_for_each_entry(c, &p->parent->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search through everything else. We should not get
+	 * here often
+	 */
+	do_each_thread(g, c) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	} while_each_thread(g, c);
+
+	read_unlock(&tasklist_lock);
+	return;
+
+assign_new_owner:
+	BUG_ON(c == p);
+	get_task_struct(c);
+	/*
+	 * The task_lock protects c->mm from changing.
+	 * We always want mm->owner->mm == mm
+	 */
+	task_lock(c);
+	/*
+	 * Delay read_unlock() till we have the task_lock()
+	 * to ensure that c does not slip away underneath us
+	 */
+	read_unlock(&tasklist_lock);
+	if (c->mm != mm) {
+		task_unlock(c);
+		put_task_struct(c);
+		goto retry;
+	}
+	cgroup_mm_owner_callbacks(mm->owner, c);
+	mm->owner = c;
+	task_unlock(c);
+	put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
 	task_unlock(tsk);
+	mm_update_next_owner(mm);
 	mmput(mm);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..156db96ff754 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->ioctx_list = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
-	mm_init_cgroup(mm, p);
+	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
 		return mm;
 	}
 
-	mm_free_cgroup(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -438,7 +437,6 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
-		mm_free_cgroup(mm);
 		mmdrop(mm);
 	}
 }
@@ -982,6 +980,13 @@ static void rt_mutex_init_task(struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.