Task Control Groups: basic task cgroup framework

Generic Process Control Groups
--------------------------

There have recently been various proposals floating around for resource
management/accounting and other task grouping subsystems in the kernel,
including ResGroups, User BeanCounters, NSProxy cgroups, and others. These
all need the basic abstraction of being able to group together multiple
processes in an aggregate, in order to track/limit the resources permitted
to those processes, or control other behaviour of the processes, and all
implement this grouping in different ways.

This patchset provides a framework for tracking and grouping processes into
arbitrary "cgroups" and assigning arbitrary state to those groupings, in
order to control the behaviour of the cgroup as an aggregate.

The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup clients, with the
result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test e.g. the ResGroups CPU controller in conjunction with
  the BeanCounters memory controller, or use either of them as the
  resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource
  management systems is substantially reduced, since it doesn't need to
  provide process grouping/containment, hence improving their chances of
  getting into the kernel

This patch:

Add the main task cgroups framework - the cgroup filesystem, and the basic
structures for tracking membership and associating subsystem state objects
to tasks.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Menage <menage@google.com> 2007-10-19 02:39:30 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:36 -0400
commit: ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (patch)
tree: 0881a031e669582f819d572339e955b04abfc3d2 /kernel
parent: 55a230aae650157720becc09cadb7d10efbf5013 (diff)
2 files changed, 1199 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 001bd3b65dd1..ea8c8a12e19a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000000..6ba857bec71b
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,1198 @@
+/*
+ *  kernel/cgroup.c
+ *
+ *  Generic process-grouping system.
+ *
+ *  Based originally on the cpuset system, extracted by Paul Menage
+ *  Copyright (C) 2006 Google, Inc
+ *
+ *  Copyright notices from the original cpuset code:
+ *  --------------------------------------------------
+ *  Copyright (C) 2003 BULL SA.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ *  Portions derived from Patrick Mochel's sysfs code.
+ *  sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ *  2003-10-10 Written by Simon Derr.
+ *  2003-10-22 Updates by Stephen Hemminger.
+ *  2004 May-July Rework by Paul Jackson.
+ *  ---------------------------------------------------
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <asm/atomic.h>
+/*
+ * Generate an array of cgroup subsystem pointers.
+ *
+ * SUBSYS(_x) pastes its argument onto "_subsys" and takes the address,
+ * i.e. SUBSYS(foo) expands to "&foo_subsys,". Including
+ * <linux/cgroup_subsys.h> inside the initializer below therefore emits
+ * one array element per SUBSYS() line in that header (presumably that
+ * is all the header contains - confirm there).
+ */
+#define SUBSYS(_x) &_x ## _subsys,
+static struct cgroup_subsys *subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+/*
+ * A cgroupfs_root represents the root of a cgroup hierarchy,
+ * and may be associated with a superblock to form an active
+ * hierarchy; the sb member is the pointer to that superblock.
+ */
+struct cgroupfs_root {
+        struct super_block *sb;
+        /*
+         * The bitmask of subsystems intended to be attached to this
+         * hierarchy
+         */
+        unsigned long subsys_bits;
+        /* The bitmask of subsystems currently attached to this hierarchy */
+        unsigned long actual_subsys_bits;
+        /*
+         * A list running through the attached subsystems; entries are
+         * linked through their "sibling" member (see for_each_subsys())
+         */
+        struct list_head subsys_list;
+        /* The root cgroup for this hierarchy (embedded, not a pointer) */
+        struct cgroup top_cgroup;
+        /* Tracks how many cgroups are currently defined in hierarchy. */
+        int number_of_cgroups;
+        /*
+         * A list running through the mounted hierarchies; this is the
+         * link used on the global "roots" list (see for_each_root())
+         */
+        struct list_head root_list;
+        /* Hierarchy-specific flags (the ROOT_* bits, e.g. ROOT_NOPREFIX) */
+        unsigned long flags;
+};
+/*
+ * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
+ * subsystems that are otherwise unattached - it never has more than a
+ * single cgroup, and all tasks are part of that cgroup.
+ */
+static struct cgroupfs_root rootnode;
+/*
+ * The list of hierarchy roots: cgroupfs_root structures are chained on
+ * it through their root_list member; iterate with for_each_root().
+ */
+static LIST_HEAD(roots);
+/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
+#define dummytop (&rootnode.top_cgroup)
+/*
+ * This flag indicates whether tasks in the fork and exit paths should
+ * take callback_mutex and check for fork/exit handlers to call. This
+ * avoids us having to do extra work in the fork/exit path if none of the
+ * subsystems need to be called.
+ *
+ * NOTE(review): no callback_mutex is visible in this part of the file;
+ * the locking overview further down speaks of a single global
+ * cgroup_mutex. The name may be a leftover from the cpuset code this
+ * file was extracted from - confirm which lock is meant.
+ */
+static int need_forkexit_callback;
+/*
+ * Bits in struct cgroup flags field. The values are bit numbers, not
+ * masks: they are handed to test_bit() (see cgroup_is_removed()).
+ */
+enum {
+        CONT_REMOVED,
+};
+/*
+ * Convenient tests for these bits.
+ *
+ * cgroup_is_removed() returns the result of test_bit() on the
+ * CONT_REMOVED bit of cont->flags, i.e. nonzero when that bit is set.
+ * It only reads the cgroup, hence the const pointer. Where the bit gets
+ * set is not visible in this part of the file.
+ */
+inline int cgroup_is_removed(const struct cgroup *cont)
+{
+        return test_bit(CONT_REMOVED, &cont->flags);
+}
+/*
+ * Bits in struct cgroupfs_root flags field (presumably bit numbers like
+ * the struct cgroup flag bits - confirm at the users, none of which are
+ * visible in this part of the file)
+ */
+enum {
+        ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
+};
+/*
+ * for_each_subsys() allows you to iterate on each subsystem attached to
+ * an active hierarchy
+ *
+ * _root: pointer to the hierarchy's cgroupfs_root. It is expanded as
+ *        &_root->subsys_list without parentheses, so pass a plain
+ *        pointer variable rather than a compound expression.
+ * _ss:   loop cursor, set to each entry on _root->subsys_list in turn;
+ *        entries are linked through their "sibling" member.
+ */
+#define for_each_subsys(_root, _ss) \
+list_for_each_entry(_ss, &_root->subsys_list, sibling)
+/*
+ * for_each_root() allows you to iterate across the active hierarchies
+ *
+ * _root: loop cursor, set to each cgroupfs_root on the global "roots"
+ *        list in turn (entries are linked through root_list).
+ */
+#define for_each_root(_root) \
+list_for_each_entry(_root, &roots, root_list)
+/*
+ * There is one global cgroup mutex. We also require taking
+ * task_lock() when dereferencing a task's cgroup subsys pointers.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold cgroup_mutex to modify cgroups.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding cgroup_mutex can't rely on the count
+ * field not changing.  However, if the count goes to zero, then only
+ * attach_task() can increment it again.  Because a count of zero
+ * means that no tasks are currently attached, therefore there is no
+ * way a task attached to that cgroup can fork (the other way to
+ * increment the count).  So code holding cgroup_mutex can safely
+ * assume that if the count is zero, it will stay zero. Similarly, if
+ * a task holds cgroup_mutex on a cgroup with zero count, it
+ * knows that the cgroup won't be removed, as cgroup_rmdir()
+ * needs that mutex.
+ *
+ * The cgroup_common_file_write handler for operations that modify
+ * the cgroup hierarchy holds cgroup_mutex across the entire operation,
+ * single threading all such cgroup modifications across the system.
+ *
+ * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
+ * (usually) take cgroup_mutex.  These are the two most performance
+ * critical pieces of code here.  The exception occurs on cgroup_exit(),
+ * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
+ * is taken, and if the cgroup count is zero, a usermode call made
+ * to /sbin/cgroup_release_agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * A cgroup can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cgroups is empty.  Since all
+ * tasks in the system use _some_ cgroup, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * always has either children cgroups and/or using tasks.  So we don't
+ * need a special hack to ensure that top_cgroup cannot be deleted.
+ *
+ *      The task_lock() exception
+ *
+ * The need for this exception arises from the action of
+ * attach_task(), which overwrites one task's cgroup pointer with
+ * another.  It does so using cgroup_mutex, however there are
+ * several performance critical places that need to reference
+ * task->cgroup without the expense of grabbing a system global
+ * mutex.  Therefore except as noted below, when dereferencing or, as
+ * in attach_task(), modifying a task's cgroup pointer we use
+ * task_lock(), which acts on a spinlock (task->alloc_lock) already in
author	Paul Menage <menage@google.com>	2007-10-19 02:39:30 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:36 -0400
commit	ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (patch)
tree	0881a031e669582f819d572339e955b04abfc3d2 /kernel
parent	55a230aae650157720becc09cadb7d10efbf5013 (diff)

diff --git a/kernel/Makefile b/kernel/Makefile index 001bd3b65dd1..ea8c8a12e19a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_PM) += power/
36	obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o	36	obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
37	obj-$(CONFIG_KEXEC) += kexec.o	37	obj-$(CONFIG_KEXEC) += kexec.o
38	obj-$(CONFIG_COMPAT) += compat.o	38	obj-$(CONFIG_COMPAT) += compat.o
		39	obj-$(CONFIG_CGROUPS) += cgroup.o
39	obj-$(CONFIG_CPUSETS) += cpuset.o	40	obj-$(CONFIG_CPUSETS) += cpuset.o
40	obj-$(CONFIG_IKCONFIG) += configs.o	41	obj-$(CONFIG_IKCONFIG) += configs.o
41	obj-$(CONFIG_STOP_MACHINE) += stop_machine.o	42	obj-$(CONFIG_STOP_MACHINE) += stop_machine.o


diff --git a/kernel/cgroup.c b/kernel/cgroup.c new file mode 100644 index 000000000000..6ba857bec71b --- /dev/null +++ b/kernel/cgroup.c
@@ -0,0 +1,1198 @@
		1	/*
		2	* kernel/cgroup.c
		3	*
		4	* Generic process-grouping system.
		5	*
		6	* Based originally on the cpuset system, extracted by Paul Menage
		7	* Copyright (C) 2006 Google, Inc
		8	*
		9	* Copyright notices from the original cpuset code:
		10	* --------------------------------------------------
		11	* Copyright (C) 2003 BULL SA.
		12	* Copyright (C) 2004-2006 Silicon Graphics, Inc.
		13	*
		14	* Portions derived from Patrick Mochel's sysfs code.
		15	* sysfs is Copyright (c) 2001-3 Patrick Mochel
		16	*
		17	* 2003-10-10 Written by Simon Derr.
		18	* 2003-10-22 Updates by Stephen Hemminger.
		19	* 2004 May-July Rework by Paul Jackson.
		20	* ---------------------------------------------------
		21	*
		22	* This file is subject to the terms and conditions of the GNU General Public
		23	* License. See the file COPYING in the main directory of the Linux
		24	* distribution for more details.
		25	*/
		26
		27	#include <linux/cgroup.h>
		28	#include <linux/errno.h>
		29	#include <linux/fs.h>
		30	#include <linux/kernel.h>
		31	#include <linux/list.h>
		32	#include <linux/mm.h>
		33	#include <linux/mutex.h>
		34	#include <linux/mount.h>
		35	#include <linux/pagemap.h>
		36	#include <linux/rcupdate.h>
		37	#include <linux/sched.h>
		38	#include <linux/seq_file.h>
		39	#include <linux/slab.h>
		40	#include <linux/magic.h>
		41	#include <linux/spinlock.h>
		42	#include <linux/string.h>
		43
		44	#include <asm/atomic.h>
		45
		46	/* Generate an array of cgroup subsystem pointers */
		47	#define SUBSYS(_x) &_x ## _subsys,
		48
		49	static struct cgroup_subsys *subsys[] = {
		50	#include <linux/cgroup_subsys.h>
		51	};
		52
		53	/*
		54	* A cgroupfs_root represents the root of a cgroup hierarchy,
		55	* and may be associated with a superblock to form an active
		56	* hierarchy
		57	*/
		58	struct cgroupfs_root {
		59	struct super_block *sb;
		60
		61	/*
		62	* The bitmask of subsystems intended to be attached to this
		63	* hierarchy
		64	*/
		65	unsigned long subsys_bits;
		66
		67	/* The bitmask of subsystems currently attached to this hierarchy */
		68	unsigned long actual_subsys_bits;
		69
		70	/* A list running through the attached subsystems */
		71	struct list_head subsys_list;
		72
		73	/* The root cgroup for this hierarchy */
		74	struct cgroup top_cgroup;
		75
		76	/* Tracks how many cgroups are currently defined in hierarchy.*/
		77	int number_of_cgroups;
		78
		79	/* A list running through the mounted hierarchies */
		80	struct list_head root_list;
		81
		82	/* Hierarchy-specific flags */
		83	unsigned long flags;
		84	};
		85
		86
		87	/*
		88	* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
		89	* subsystems that are otherwise unattached - it never has more than a
		90	* single cgroup, and all tasks are part of that cgroup.
		91	*/
		92	static struct cgroupfs_root rootnode;
		93
		94	/* The list of hierarchy roots */
		95
		96	static LIST_HEAD(roots);
		97
		98	/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
		99	#define dummytop (&rootnode.top_cgroup)
		100
		101	/* This flag indicates whether tasks in the fork and exit paths should
		102	* take callback_mutex and check for fork/exit handlers to call. This
		103	* avoids us having to do extra work in the fork/exit path if none of the
		104	* subsystems need to be called.
		105	*/
		106	static int need_forkexit_callback;
		107
		108	/* bits in struct cgroup flags field */
		109	enum {
		110	CONT_REMOVED,
		111	};
		112
		113	/* convenient tests for these bits */
		114	inline int cgroup_is_removed(const struct cgroup *cont)
		115	{
		116	return test_bit(CONT_REMOVED, &cont->flags);
		117	}
		118
		119	/* bits in struct cgroupfs_root flags field */
		120	enum {
		121	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
		122	};
		123
		124	/*
		125	* for_each_subsys() allows you to iterate on each subsystem attached to
		126	* an active hierarchy
		127	*/
		128	#define for_each_subsys(_root, _ss) \
		129	list_for_each_entry(_ss, &_root->subsys_list, sibling)
		130
		131	/* for_each_root() allows you to iterate across the active hierarchies */
		132	#define for_each_root(_root) \
		133	list_for_each_entry(_root, &roots, root_list)
		134
		135	/*
		136	* There is one global cgroup mutex. We also require taking
		137	* task_lock() when dereferencing a task's cgroup subsys pointers.
		138	* See "The task_lock() exception", at the end of this comment.
		139	*
		140	* A task must hold cgroup_mutex to modify cgroups.
		141	*
		142	* Any task can increment and decrement the count field without lock.
		143	* So in general, code holding cgroup_mutex can't rely on the count
		144	* field not changing. However, if the count goes to zero, then only
		145	* attach_task() can increment it again. Because a count of zero
		146	* means that no tasks are currently attached, therefore there is no
		147	* way a task attached to that cgroup can fork (the other way to
		148	* increment the count). So code holding cgroup_mutex can safely
		149	* assume that if the count is zero, it will stay zero. Similarly, if
		150	* a task holds cgroup_mutex on a cgroup with zero count, it
		151	* knows that the cgroup won't be removed, as cgroup_rmdir()
		152	* needs that mutex.
		153	*
		154	* The cgroup_common_file_write handler for operations that modify
		155	* the cgroup hierarchy holds cgroup_mutex across the entire operation,
		156	* single threading all such cgroup modifications across the system.
		157	*
		158	* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
		159	* (usually) take cgroup_mutex. These are the two most performance
		160	* critical pieces of code here. The exception occurs on cgroup_exit(),
		161	* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
		162	* is taken, and if the cgroup count is zero, a usermode call made
		163	* to /sbin/cgroup_release_agent with the name of the cgroup (path
		164	* relative to the root of cgroup file system) as the argument.
		165	*
		166	* A cgroup can only be deleted if both its 'count' of using tasks
		167	* is zero, and its list of 'children' cgroups is empty. Since all
		168	* tasks in the system use _some_ cgroup, and since there is always at
		169	* least one task in the system (init, pid == 1), therefore, top_cgroup
		170	* always has either children cgroups and/or using tasks. So we don't
		171	* need a special hack to ensure that top_cgroup cannot be deleted.
		172	*
		173	* The task_lock() exception
		174	*
		175	* The need for this exception arises from the action of
		176	* attach_task(), which overwrites one task's cgroup pointer with
		177	* another. It does so using cgroup_mutex, however there are
		178	* several performance critical places that need to reference
		179	* task->cgroup without the expense of grabbing a system global
		180	* mutex. Therefore except as noted below, when dereferencing or, as
		181	* in attach_task(), modifying a task's cgroup pointer we use
		182	* task_lock(), which acts on a spinlock (task->alloc_lock) already in