2 files changed, 1199 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 001bd3b65dd1..ea8c8a12e19a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000000..6ba857bec71b
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,1198 @@
+/*
+ *  kernel/cgroup.c
+ *
+ *  Generic process-grouping system.
+ *
+ *  Based originally on the cpuset system, extracted by Paul Menage
+ *  Copyright (C) 2006 Google, Inc
+ *
+ *  Copyright notices from the original cpuset code:
+ *  --------------------------------------------------
+ *  Copyright (C) 2003 BULL SA.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ *  Portions derived from Patrick Mochel's sysfs code.
+ *  sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ *  2003-10-10 Written by Simon Derr.
+ *  2003-10-22 Updates by Stephen Hemminger.
+ *  2004 May-July Rework by Paul Jackson.
+ *  ---------------------------------------------------
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <asm/atomic.h>
+/*
+ * Generate an array of cgroup subsystem pointers.
+ *
+ * SUBSYS(foo) expands to the initializer element &foo_subsys followed by
+ * a comma, so including <linux/cgroup_subsys.h> inside the braces below
+ * fills subsys[] with one pointer per SUBSYS() line in that header
+ * (presumably the header is just a list of SUBSYS() invocations - confirm).
+ */
+#define SUBSYS(_x) &_x ## _subsys,
+static struct cgroup_subsys *subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+/*
+ * A cgroupfs_root represents the root of a cgroup hierarchy,
+ * and may be associated with a superblock to form an active
+ * hierarchy.
+ */
+struct cgroupfs_root {
+        struct super_block *sb; /* superblock associated with this hierarchy, if any */
+        /*
+         * The bitmask of subsystems intended to be attached to this
+         * hierarchy
+         */
+        unsigned long subsys_bits;
+        /* The bitmask of subsystems currently attached to this hierarchy */
+        unsigned long actual_subsys_bits;
+        /*
+         * A list running through the attached subsystems; walked by
+         * for_each_subsys() through each entry's sibling member.
+         */
+        struct list_head subsys_list;
+        /* The root cgroup for this hierarchy */
+        struct cgroup top_cgroup;
+        /* Tracks how many cgroups are currently defined in hierarchy. */
+        int number_of_cgroups;
+        /*
+         * A list running through the mounted hierarchies; this is the
+         * link that for_each_root() follows from the global roots list.
+         */
+        struct list_head root_list;
+        /* Hierarchy-specific flags (bits such as ROOT_NOPREFIX) */
+        unsigned long flags;
+};
+/*
+ * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
+ * subsystems that are otherwise unattached - it never has more than a
+ * single cgroup, and all tasks are part of that cgroup.
+ */
+static struct cgroupfs_root rootnode;
+/* The list of hierarchy roots; iterated with for_each_root() */
+static LIST_HEAD(roots);
+/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
+#define dummytop (&rootnode.top_cgroup)
+/*
+ * This flag indicates whether tasks in the fork and exit paths should
+ * take callback_mutex and check for fork/exit handlers to call. This
+ * avoids us having to do extra work in the fork/exit path if none of the
+ * subsystems need to be called.
+ *
+ * NOTE(review): the only mutex defined in the visible part of this file
+ * is cgroup_mutex; "callback_mutex" may be a leftover from the cpuset
+ * code this was extracted from - confirm which lock is meant.
+ */
+static int need_forkexit_callback;
+/*
+ * Bits in struct cgroup flags field.  These are bit numbers, not masks:
+ * cgroup_is_removed() passes CONT_REMOVED to test_bit().
+ */
+enum {
+        CONT_REMOVED,
+};
+/* Report whether the CONT_REMOVED bit is set in the given cgroup's flags */
+inline int cgroup_is_removed(const struct cgroup *cont)
+{
+        const int removed = test_bit(CONT_REMOVED, &cont->flags);
+        return removed;
+}
+/*
+ * Bits in struct cgroupfs_root flags field.  Presumably bit numbers for
+ * the test_bit() family, like CONT_REMOVED - no user is visible here.
+ */
+enum {
+        ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
+};
+/*
+ * for_each_subsys() allows you to iterate on each subsystem attached to
+ * an active hierarchy.
+ *
+ * @_root: pointer to the struct cgroupfs_root whose subsys_list is walked
+ * @_ss:   iteration cursor handed to list_for_each_entry(); entries are
+ *         linked through their sibling member
+ *
+ * _root is parenthesized so that any pointer expression (not just a plain
+ * identifier) can be passed without operator-precedence surprises.
+ */
+#define for_each_subsys(_root, _ss) \
+        list_for_each_entry(_ss, &(_root)->subsys_list, sibling)
+/*
+ * for_each_root() allows you to iterate across the active hierarchies.
+ * _root is the cursor; it walks the global roots list through each
+ * entry's root_list member.
+ */
+#define for_each_root(_root) \
+list_for_each_entry(_root, &roots, root_list)
+/*
+ * There is one global cgroup mutex. We also require taking
+ * task_lock() when dereferencing a task's cgroup subsys pointers.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold cgroup_mutex to modify cgroups.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding cgroup_mutex can't rely on the count
+ * field not changing.  However, if the count goes to zero, then only
+ * attach_task() can increment it again.  Because a count of zero
+ * means that no tasks are currently attached, therefore there is no
+ * way a task attached to that cgroup can fork (the other way to
+ * increment the count).  So code holding cgroup_mutex can safely
+ * assume that if the count is zero, it will stay zero. Similarly, if
+ * a task holds cgroup_mutex on a cgroup with zero count, it
+ * knows that the cgroup won't be removed, as cgroup_rmdir()
+ * needs that mutex.
+ *
+ * The cgroup_common_file_write handler for operations that modify
+ * the cgroup hierarchy holds cgroup_mutex across the entire operation,
+ * single threading all such cgroup modifications across the system.
+ *
+ * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
+ * (usually) take cgroup_mutex.  These are the two most performance
+ * critical pieces of code here.  The exception occurs on cgroup_exit(),
+ * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
+ * is taken, and if the cgroup count is zero, a usermode call is made
+ * to /sbin/cgroup_release_agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * A cgroup can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cgroups is empty.  Since all
+ * tasks in the system use _some_ cgroup, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * always has either children cgroups and/or using tasks.  So we don't
+ * need a special hack to ensure that top_cgroup cannot be deleted.
+ *
+ *      The task_lock() exception
+ *
+ * The need for this exception arises from the action of
+ * attach_task(), which overwrites one task's cgroup pointer with
+ * another.  It does so using cgroup_mutex, however there are
+ * several performance critical places that need to reference
+ * task->cgroup without the expense of grabbing a system global
+ * mutex.  Therefore except as noted below, when dereferencing or, as
+ * in attach_task(), modifying a task's cgroup pointer we use
+ * task_lock(), which acts on a spinlock (task->alloc_lock) already in
+ * the task_struct routinely used for such matters.
+ *
+ * P.S.  One more locking exception.  RCU is used to guard the
+ * update of a task's cgroup pointer by attach_task().
+ */
+static DEFINE_MUTEX(cgroup_mutex);
+/**
+ * cgroup_lock - lock out any changes to cgroup structures
+ *
+ */
+void cgroup_lock(void)
+{

diff --git a/kernel/Makefile b/kernel/Makefile index 001bd3b65dd1..ea8c8a12e19a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_PM) += power/
36	obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o	36	obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
37	obj-$(CONFIG_KEXEC) += kexec.o	37	obj-$(CONFIG_KEXEC) += kexec.o
38	obj-$(CONFIG_COMPAT) += compat.o	38	obj-$(CONFIG_COMPAT) += compat.o
		39	obj-$(CONFIG_CGROUPS) += cgroup.o
39	obj-$(CONFIG_CPUSETS) += cpuset.o	40	obj-$(CONFIG_CPUSETS) += cpuset.o
40	obj-$(CONFIG_IKCONFIG) += configs.o	41	obj-$(CONFIG_IKCONFIG) += configs.o
41	obj-$(CONFIG_STOP_MACHINE) += stop_machine.o	42	obj-$(CONFIG_STOP_MACHINE) += stop_machine.o


diff --git a/kernel/cgroup.c b/kernel/cgroup.c new file mode 100644 index 000000000000..6ba857bec71b --- /dev/null +++ b/kernel/cgroup.c
@@ -0,0 +1,1198 @@
		1	/*
		2	* kernel/cgroup.c
		3	*
		4	* Generic process-grouping system.
		5	*
		6	* Based originally on the cpuset system, extracted by Paul Menage
		7	* Copyright (C) 2006 Google, Inc
		8	*
		9	* Copyright notices from the original cpuset code:
		10	* --------------------------------------------------
		11	* Copyright (C) 2003 BULL SA.
		12	* Copyright (C) 2004-2006 Silicon Graphics, Inc.
		13	*
		14	* Portions derived from Patrick Mochel's sysfs code.
		15	* sysfs is Copyright (c) 2001-3 Patrick Mochel
		16	*
		17	* 2003-10-10 Written by Simon Derr.
		18	* 2003-10-22 Updates by Stephen Hemminger.
		19	* 2004 May-July Rework by Paul Jackson.
		20	* ---------------------------------------------------
		21	*
		22	* This file is subject to the terms and conditions of the GNU General Public
		23	* License. See the file COPYING in the main directory of the Linux
		24	* distribution for more details.
		25	*/
		26
		27	#include <linux/cgroup.h>
		28	#include <linux/errno.h>
		29	#include <linux/fs.h>
		30	#include <linux/kernel.h>
		31	#include <linux/list.h>
		32	#include <linux/mm.h>
		33	#include <linux/mutex.h>
		34	#include <linux/mount.h>
		35	#include <linux/pagemap.h>
		36	#include <linux/rcupdate.h>
		37	#include <linux/sched.h>
		38	#include <linux/seq_file.h>
		39	#include <linux/slab.h>
		40	#include <linux/magic.h>
		41	#include <linux/spinlock.h>
		42	#include <linux/string.h>
		43
		44	#include <asm/atomic.h>
		45
		46	/* Generate an array of cgroup subsystem pointers */
		47	#define SUBSYS(_x) &_x ## _subsys,
		48
		49	static struct cgroup_subsys *subsys[] = {
		50	#include <linux/cgroup_subsys.h>
		51	};
		52
		53	/*
		54	* A cgroupfs_root represents the root of a cgroup hierarchy,
		55	* and may be associated with a superblock to form an active
		56	* hierarchy
		57	*/
		58	struct cgroupfs_root {
		59	struct super_block *sb;
		60
		61	/*
		62	* The bitmask of subsystems intended to be attached to this
		63	* hierarchy
		64	*/
		65	unsigned long subsys_bits;
		66
		67	/* The bitmask of subsystems currently attached to this hierarchy */
		68	unsigned long actual_subsys_bits;
		69
		70	/* A list running through the attached subsystems */
		71	struct list_head subsys_list;
		72
		73	/* The root cgroup for this hierarchy */
		74	struct cgroup top_cgroup;
		75
		76	/* Tracks how many cgroups are currently defined in hierarchy.*/
		77	int number_of_cgroups;
		78
		79	/* A list running through the mounted hierarchies */
		80	struct list_head root_list;
		81
		82	/* Hierarchy-specific flags */
		83	unsigned long flags;
		84	};
		85
		86
		87	/*
		88	* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
		89	* subsystems that are otherwise unattached - it never has more than a
		90	* single cgroup, and all tasks are part of that cgroup.
		91	*/
		92	static struct cgroupfs_root rootnode;
		93
		94	/* The list of hierarchy roots */
		95
		96	static LIST_HEAD(roots);
		97
		98	/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
		99	#define dummytop (&rootnode.top_cgroup)
		100
		101	/* This flag indicates whether tasks in the fork and exit paths should
		102	* take callback_mutex and check for fork/exit handlers to call. This
		103	* avoids us having to do extra work in the fork/exit path if none of the
		104	* subsystems need to be called.
		105	*/
		106	static int need_forkexit_callback;
		107
		108	/* bits in struct cgroup flags field */
		109	enum {
		110	CONT_REMOVED,
		111	};
		112
		113	/* convenient tests for these bits */
		114	inline int cgroup_is_removed(const struct cgroup *cont)
		115	{
		116	return test_bit(CONT_REMOVED, &cont->flags);
		117	}
		118
		119	/* bits in struct cgroupfs_root flags field */
		120	enum {
		121	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
		122	};
		123
		124	/*
		125	* for_each_subsys() allows you to iterate on each subsystem attached to
		126	* an active hierarchy
		127	*/
		128	#define for_each_subsys(_root, _ss) \
		129	list_for_each_entry(_ss, &_root->subsys_list, sibling)
		130
		131	/* for_each_root() allows you to iterate across the active hierarchies */
		132	#define for_each_root(_root) \
		133	list_for_each_entry(_root, &roots, root_list)
		134
		135	/*
		136	* There is one global cgroup mutex. We also require taking
		137	* task_lock() when dereferencing a task's cgroup subsys pointers.
		138	* See "The task_lock() exception", at the end of this comment.
		139	*
		140	* A task must hold cgroup_mutex to modify cgroups.
		141	*
		142	* Any task can increment and decrement the count field without lock.
		143	* So in general, code holding cgroup_mutex can't rely on the count
		144	* field not changing. However, if the count goes to zero, then only
		145	* attach_task() can increment it again. Because a count of zero
		146	* means that no tasks are currently attached, therefore there is no
		147	* way a task attached to that cgroup can fork (the other way to
		148	* increment the count). So code holding cgroup_mutex can safely
		149	* assume that if the count is zero, it will stay zero. Similarly, if
		150	* a task holds cgroup_mutex on a cgroup with zero count, it
		151	* knows that the cgroup won't be removed, as cgroup_rmdir()
		152	* needs that mutex.
		153	*
		154	* The cgroup_common_file_write handler for operations that modify
		155	* the cgroup hierarchy holds cgroup_mutex across the entire operation,
		156	* single threading all such cgroup modifications across the system.
		157	*
		158	* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
		159	* (usually) take cgroup_mutex. These are the two most performance
		160	* critical pieces of code here. The exception occurs on cgroup_exit(),
		161	* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
		162	* is taken, and if the cgroup count is zero, a usermode call is made
		163	* to /sbin/cgroup_release_agent with the name of the cgroup (path
		164	* relative to the root of cgroup file system) as the argument.
		165	*
		166	* A cgroup can only be deleted if both its 'count' of using tasks
		167	* is zero, and its list of 'children' cgroups is empty. Since all
		168	* tasks in the system use _some_ cgroup, and since there is always at
		169	* least one task in the system (init, pid == 1), therefore, top_cgroup
		170	* always has either children cgroups and/or using tasks. So we don't
		171	* need a special hack to ensure that top_cgroup cannot be deleted.
		172	*
		173	* The task_lock() exception
		174	*
		175	* The need for this exception arises from the action of
		176	* attach_task(), which overwrites one task's cgroup pointer with
		177	* another. It does so using cgroup_mutex, however there are
		178	* several performance critical places that need to reference
		179	* task->cgroup without the expense of grabbing a system global
		180	* mutex. Therefore except as noted below, when dereferencing or, as
		181	* in attach_task(), modifying a task's cgroup pointer we use
		182	* task_lock(), which acts on a spinlock (task->alloc_lock) already in
		183	* the task_struct routinely used for such matters.
		184	*
		185	* P.S. One more locking exception. RCU is used to guard the
		186	* update of a task's cgroup pointer by attach_task().
		187	*/
		188
		189	static DEFINE_MUTEX(cgroup_mutex);
		190
		191	/**
		192	* cgroup_lock - lock out any changes to cgroup structures
		193	*
		194	*/
		195
		196	void cgroup_lock(void)
		197	{