Task Control Groups: add tasks file interface

Add the per-directory "tasks" file for cgroupfs mounts; this allows the user to determine which tasks are members of a cgroup by reading a cgroup's "tasks", and to move a task into a cgroup by writing its pid to its "tasks". Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Menage <menage@google.com> 2007-10-19 02:39:32 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:36 -0400
commit: bbcb81d09104f0d440974b994c1fc508ccbe9503 (patch)
tree: 6d9ef3e2c611bb0a8f63519196f7bd7725b7ea1a
parent: ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (diff)
2 files changed, 368 insertions, 1 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 60735dcf427a..e2dd44f68f97 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -144,6 +144,16 @@ int cgroup_is_removed(const struct cgroup *cont);
 int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
+int __cgroup_task_count(const struct cgroup *cont);
+/*
+ * Count the tasks in 'cont' by calling __cgroup_task_count() inside an
+ * RCU read-side critical section.
+ */
+static inline int cgroup_task_count(const struct cgroup *cont)
+{
+        int nr_tasks;
+
+        rcu_read_lock();
+        nr_tasks = __cgroup_task_count(cont);
+        rcu_read_unlock();
+
+        return nr_tasks;
+}
 /* Return true if the cgroup is a descendant of the current cgroup */
 int cgroup_is_descendant(const struct cgroup *cont);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6ba857bec71b..356c40d5d20a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,7 +40,7 @@
 #include <linux/magic.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/sort.h>
 #include <asm/atomic.h>
 /* Generate an array of cgroup subsystem pointers */
@@ -700,6 +700,127 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen)
        return 0;
 }
+/*
+ * Look up the subsystem at the head of the subsys_list of the hierarchy
+ * that 'cont' belongs to.  When 'css' is non-NULL it receives the state
+ * object 'cont' holds for that subsystem; when 'subsys_id' is non-NULL
+ * it receives the subsystem's id.  BUGs if the hierarchy has no
+ * subsystems, or if the requested state pointer is NULL.
+ */
+static void get_first_subsys(const struct cgroup *cont,
+                        struct cgroup_subsys_state **css, int *subsys_id)
+{
+        const struct cgroupfs_root *hier = cont->root;
+        const struct cgroup_subsys *first;
+        int id;
+
+        BUG_ON(list_empty(&hier->subsys_list));
+        first = list_entry(hier->subsys_list.next,
+                           struct cgroup_subsys, sibling);
+        id = first->subsys_id;
+
+        if (css) {
+                *css = cont->subsys[id];
+                BUG_ON(!*css);
+        }
+        if (subsys_id)
+                *subsys_id = id;
+}
+/*
+ * Attach task 'tsk' to cgroup 'cont'
+ *
+ * Call holding cgroup_mutex.  May take task_lock of
+ * the task 'tsk' during call.
+ *
+ * Returns 0 on success or when 'tsk' is already in 'cont', the non-zero
+ * value returned by the first subsystem whose can_attach() callback
+ * refuses the move, or -ESRCH if 'tsk' has PF_EXITING set.
+ */
+static int attach_task(struct cgroup *cont, struct task_struct *tsk)
+{
+        int retval = 0;
+        struct cgroup_subsys *ss;
+        struct cgroup *oldcont;
+        struct css_set *cg = &tsk->cgroups;
+        struct cgroupfs_root *root = cont->root;
+        int i;
+        int subsys_id;
+        get_first_subsys(cont, NULL, &subsys_id);
+        /* Nothing to do if the task is already in that cgroup */
+        oldcont = task_cgroup(tsk, subsys_id);
+        if (cont == oldcont)
+                return 0;
+        for_each_subsys(root, ss) { /* every subsystem may veto before any state is changed */
+                if (ss->can_attach) {
+                        retval = ss->can_attach(ss, cont, tsk);
+                        if (retval) {
+                                return retval;
+                        }
+                }
+        }
+        task_lock(tsk); /* PF_EXITING check and pointer updates happen under task_lock */
+        if (tsk->flags & PF_EXITING) {
+                task_unlock(tsk);
+                return -ESRCH;
+        }
+        /* Update the css_set pointers for the subsystems in this
+         * hierarchy */
+        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                if (root->subsys_bits & (1ull << i)) {
+                        /* Subsystem is in this hierarchy. So we want
+                         * the subsystem state from the new
+                         * cgroup. Transfer the refcount from the
+                         * old to the new */
+                        atomic_inc(&cont->count);
+                        atomic_dec(&cg->subsys[i]->cgroup->count);
+                        rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
+                }
+        }
+        task_unlock(tsk);
+        for_each_subsys(root, ss) { /* notify subsystems after the pointers are switched */
+                if (ss->attach) {
+                        ss->attach(ss, cont, oldcont, tsk);
+                }
+        }
+        synchronize_rcu(); /* NOTE(review): presumably lets RCU readers of the old pointers drain - confirm */
+        return 0;
+}
+/*
+ * Parse a decimal pid from 'pidbuf' and attach that task to 'cont'.
+ * A pid of 0 selects the calling task.  Call with cgroup_mutex held;
+ * may take task_lock of the task.
+ *
+ * Returns -EIO if no pid can be parsed, -ESRCH if no such task exists
+ * or it is exiting, -EACCES if the caller's euid is non-zero and equals
+ * neither the target's uid nor its suid, otherwise whatever
+ * attach_task() returns.
+ */
+static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
+{
+        struct task_struct *target;
+        pid_t pid;
+        int err;
+
+        if (sscanf(pidbuf, "%d", &pid) != 1)
+                return -EIO;
+
+        if (!pid) {
+                /* pid 0: operate on the writer itself */
+                target = current;
+                get_task_struct(target);
+        } else {
+                rcu_read_lock();
+                target = find_task_by_pid(pid);
+                if (!target || (target->flags & PF_EXITING)) {
+                        rcu_read_unlock();
+                        return -ESRCH;
+                }
+                /* Take a reference before leaving the RCU section */
+                get_task_struct(target);
+                rcu_read_unlock();
+
+                if (current->euid != 0 &&
+                    current->euid != target->uid &&
+                    current->euid != target->suid) {
+                        put_task_struct(target);
+                        return -EACCES;
+                }
+        }
+
+        err = attach_task(cont, target);
+        put_task_struct(target);
+        return err;
+}
 /* The various types of files and directories in a cgroup file system */
 enum cgroup_filetype {
@@ -708,6 +829,55 @@ enum cgroup_filetype {
        FILE_TASKLIST,
 };
+/*
+ * Write handler for the common cgroup files; cft->private selects the
+ * file type.  The user data is copied into a nul-terminated kernel
+ * buffer and then handled with cgroup_mutex held.
+ *
+ * Returns nbytes on success.  Fails with -E2BIG when nbytes >= PATH_MAX,
+ * -ENOMEM or -EFAULT when the buffer cannot be set up, -ENODEV when the
+ * cgroup has been removed, -EINVAL for an unknown file type, or the
+ * error returned by the type-specific handler.
+ */
+static ssize_t cgroup_common_file_write(struct cgroup *cont,
+                                           struct cftype *cft,
+                                           struct file *file,
+                                           const char __user *userbuf,
+                                           size_t nbytes, loff_t *unused_ppos)
+{
+        enum cgroup_filetype type = cft->private;
+        char *kbuf;
+        int ret = 0;
+
+        if (nbytes >= PATH_MAX)
+                return -E2BIG;
+
+        /* one extra byte for the terminating nul */
+        kbuf = kmalloc(nbytes + 1, GFP_KERNEL);
+        if (!kbuf)
+                return -ENOMEM;
+
+        if (copy_from_user(kbuf, userbuf, nbytes)) {
+                ret = -EFAULT;
+                goto out_free;
+        }
+        kbuf[nbytes] = 0;
+
+        mutex_lock(&cgroup_mutex);
+
+        if (cgroup_is_removed(cont)) {
+                ret = -ENODEV;
+                goto out_unlock;
+        }
+
+        /* The tasks file is the only type handled here */
+        if (type != FILE_TASKLIST) {
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+
+        ret = attach_task_by_pid(cont, kbuf);
+        if (!ret)
+                ret = nbytes;
+out_unlock:
+        mutex_unlock(&cgroup_mutex);
+out_free:
+        kfree(kbuf);
+        return ret;
+}
 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
                                                size_t nbytes, loff_t *ppos)
 {
@@ -914,6 +1084,189 @@ int cgroup_add_files(struct cgroup *cont,
        return 0;
 }
+/*
+ * Count the tasks in a cgroup by walking every thread and comparing its
+ * state for the hierarchy's first subsystem with that of 'cont'.
+ * Could be made more time-efficient but less space-efficient with more
+ * linked lists running through each cgroup and the css_set structures
+ * that referenced it.  Must be called with tasklist_lock held for read
+ * or write or in an rcu critical section.
+ */
+int __cgroup_task_count(const struct cgroup *cont)
+{
+        struct cgroup_subsys_state *want;
+        struct task_struct *leader, *t;
+        int id;
+        int nr = 0;
+
+        get_first_subsys(cont, &want, &id);
+
+        do_each_thread(leader, t) {
+                if (task_subsys_state(t, id) == want)
+                        nr++;
+        } while_each_thread(leader, t);
+
+        return nr;
+}
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ * Upon tasks file open(), a struct ctr_struct is allocated, that
+ * will have a pointer to an array (also allocated here).  The struct
+ * ctr_struct * is stored in file->private_data.  Its resources will
+ * be freed by release() when the file is closed.  The array is used
+ * to sprintf the PIDs and then used by read().
+ */
+struct ctr_struct {
+        char *buf;      /* pid listing text built at open(); 0 when the cgroup had no tasks */
+        int bufsz;      /* number of bytes of 'buf' made available to read() */
+};
+/*
+ * Store in 'pidarray' the pids of at most 'npids' tasks that use cgroup
+ * 'cont', and return how many were stored.  No need to task_lock(p)
+ * when reading out p->cgroups, since we're in an RCU read section, so
+ * the css_set can't go away, and is immutable after creation.
+ */
+static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
+{
+        struct cgroup_subsys_state *want;
+        struct task_struct *leader, *t;
+        int id;
+        int filled = 0;
+
+        get_first_subsys(cont, &want, &id);
+
+        rcu_read_lock();
+        do_each_thread(leader, t) {
+                if (task_subsys_state(t, id) == want) {
+                        pidarray[filled++] = pid_nr(task_pid(t));
+                        /* goto is needed to leave both loops of the walk */
+                        if (unlikely(filled == npids))
+                                goto out_unlock;
+                }
+        } while_each_thread(leader, t);
+
+out_unlock:
+        rcu_read_unlock();
+        return filled;
+}
+/*
+ * sort() comparison callback ordering two pid_t values ascending.
+ * Returns a negative, zero or positive value when *a is less than,
+ * equal to or greater than *b.  Uses comparisons instead of a
+ * subtraction so the result cannot overflow, and reads the arguments
+ * through const-qualified pointers rather than casting const away.
+ */
+static int cmppid(const void *a, const void *b)
+{
+        const pid_t lhs = *(const pid_t *)a;
+        const pid_t rhs = *(const pid_t *)b;
+
+        return (lhs > rhs) - (lhs < rhs);
+}
+/*
+ * Format the 'npids' entries of 'a' into 'buf' as decimal pids, one per
+ * line.  At most 'sz' chars are written; the return value is the
+ * number of chars that would be written if buf were large enough.
+ */
+static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+{
+        int total = 0;
+        int idx = 0;
+
+        while (idx < npids) {
+                /* once the buffer is full, keep counting with zero room */
+                total += snprintf(buf + total, max(sz - total, 0),
+                                  "%d\n", a[idx]);
+                idx++;
+        }
+        return total;
+}
+/*
+ * Handle an open on 'tasks' file.  For readers, build a text buffer
+ * listing the sorted pids of the tasks currently attached to the cgroup
+ * and hang it off file->private_data.  Opens without FMODE_READ
+ * allocate nothing.
+ *
+ * Does not require any specific cgroup mutexes, and does not take any.
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+        struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
+        struct ctr_struct *ctr;
+        pid_t *pids;
+        int count;
+        char scratch;
+
+        if (!(file->f_mode & FMODE_READ))
+                return 0;
+
+        ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
+        if (!ctr)
+                return -ENOMEM;
+
+        /*
+         * If cgroup gets more users after we read count, we won't have
+         * enough space - tough.  This race is indistinguishable to the
+         * caller from the case that the additional cgroup users didn't
+         * show up until sometime later on.
+         */
+        count = cgroup_task_count(cont);
+        if (!count) {
+                ctr->buf = NULL;
+                ctr->bufsz = 0;
+        } else {
+                pids = kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+                if (!pids)
+                        goto out_free_ctr;
+
+                count = pid_array_load(pids, count, cont);
+                sort(pids, count, sizeof(pid_t), cmppid, NULL);
+
+                /* First pass only measures; second pass fills the buffer */
+                ctr->bufsz = pid_array_to_buf(&scratch, sizeof(scratch),
+                                              pids, count) + 1;
+                ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
+                if (!ctr->buf)
+                        goto out_free_pids;
+                ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz,
+                                              pids, count);
+
+                kfree(pids);
+        }
+        file->private_data = ctr;
+        return 0;
+
+out_free_pids:
+        kfree(pids);
+out_free_ctr:
+        kfree(ctr);
+        return -ENOMEM;
+}
+/*
+ * Read handler for the 'tasks' file: copy out of the buffer stored in
+ * file->private_data, honouring the file position in 'ppos'.
+ */
+static ssize_t cgroup_tasks_read(struct cgroup *cont,
+                                    struct cftype *cft,
+                                    struct file *file, char __user *buf,
+                                    size_t nbytes, loff_t *ppos)
+{
+        struct ctr_struct *snapshot = file->private_data;
+
+        return simple_read_from_buffer(buf, nbytes, ppos,
+                                       snapshot->buf, snapshot->bufsz);
+}
+/*
+ * Release handler for the 'tasks' file: free the ctr_struct and its
+ * buffer for files that were opened for reading.  Always returns 0.
+ */
+static int cgroup_tasks_release(struct inode *unused_inode,
+                                        struct file *file)
+{
+        struct ctr_struct *snapshot;
+
+        /* Nothing was allocated for opens without FMODE_READ */
+        if (!(file->f_mode & FMODE_READ))
+                return 0;
+
+        snapshot = file->private_data;
+        kfree(snapshot->buf);
+        kfree(snapshot);
+        return 0;
+}
+/*
+ * for the common functions, 'private' gives the type of file
+ *
+ * Descriptor for the per-cgroup "tasks" file.  open() prepares the pid
+ * listing, read() returns it, write() goes through the common write
+ * handler (which dispatches on FILE_TASKLIST), and release() frees the
+ * listing.
+ */
+static struct cftype cft_tasks = {
+        .name = "tasks",
+        .open = cgroup_tasks_open,
+        .read = cgroup_tasks_read,
+        .write = cgroup_common_file_write,
+        .release = cgroup_tasks_release,
+        .private = FILE_TASKLIST,
+};
 static int cgroup_populate_dir(struct cgroup *cont)
 {
        int err;
@@ -922,6 +1275,10 @@ static int cgroup_populate_dir(struct cgroup *cont)
        /* First clear out any existing files */
        cgroup_clear_directory(cont->dentry);
+        err = cgroup_add_file(cont, NULL, &cft_tasks);
+        if (err < 0)
+                return err;
        for_each_subsys(cont->root, ss) {
                if (ss->populate && (err = ss->populate(ss, cont)) < 0)
                        return err;
author	Paul Menage <menage@google.com>	2007-10-19 02:39:32 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:36 -0400
commit	bbcb81d09104f0d440974b994c1fc508ccbe9503 (patch)
tree	6d9ef3e2c611bb0a8f63519196f7bd7725b7ea1a
parent	ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (diff)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 60735dcf427a..e2dd44f68f97 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h
@@ -144,6 +144,16 @@ int cgroup_is_removed(const struct cgroup *cont);
144		144
145	int cgroup_path(const struct cgroup *cont, char *buf, int buflen);	145	int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
146		146
		147	int __cgroup_task_count(const struct cgroup *cont);
		148	static inline int cgroup_task_count(const struct cgroup *cont)
		149	{
		150	int task_count;
		151	rcu_read_lock();
		152	task_count = __cgroup_task_count(cont);
		153	rcu_read_unlock();
		154	return task_count;
		155	}
		156
147	/* Return true if the cgroup is a descendant of the current cgroup */	157	/* Return true if the cgroup is a descendant of the current cgroup */
148	int cgroup_is_descendant(const struct cgroup *cont);	158	int cgroup_is_descendant(const struct cgroup *cont);
149		159


diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ba857bec71b..356c40d5d20a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c
@@ -40,7 +40,7 @@
40	#include <linux/magic.h>	40	#include <linux/magic.h>
41	#include <linux/spinlock.h>	41	#include <linux/spinlock.h>
42	#include <linux/string.h>	42	#include <linux/string.h>
43		43	#include <linux/sort.h>
44	#include <asm/atomic.h>	44	#include <asm/atomic.h>
45		45
46	/* Generate an array of cgroup subsystem pointers */	46	/* Generate an array of cgroup subsystem pointers */
@@ -700,6 +700,127 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen)
700	return 0;	700	return 0;
701	}	701	}
702		702
		703	/*
		704	* Return the first subsystem attached to a cgroup's hierarchy, and
		705	* its subsystem id.
		706	*/
		707
		708	static void get_first_subsys(const struct cgroup *cont,
		709	struct cgroup_subsys_state **css, int *subsys_id)
		710	{
		711	const struct cgroupfs_root *root = cont->root;
		712	const struct cgroup_subsys *test_ss;
		713	BUG_ON(list_empty(&root->subsys_list));
		714	test_ss = list_entry(root->subsys_list.next,
		715	struct cgroup_subsys, sibling);
		716	if (css) {
		717	*css = cont->subsys[test_ss->subsys_id];
		718	BUG_ON(!*css);
		719	}
		720	if (subsys_id)
		721	*subsys_id = test_ss->subsys_id;
		722	}
		723
		724	/*
		725	* Attach task 'tsk' to cgroup 'cont'
		726	*
		727	* Call holding cgroup_mutex. May take task_lock of
		728	* the task 'pid' during call.
		729	*/
		730	static int attach_task(struct cgroup *cont, struct task_struct *tsk)
		731	{
		732	int retval = 0;
		733	struct cgroup_subsys *ss;
		734	struct cgroup *oldcont;
		735	struct css_set *cg = &tsk->cgroups;
		736	struct cgroupfs_root *root = cont->root;
		737	int i;
		738	int subsys_id;
		739
		740	get_first_subsys(cont, NULL, &subsys_id);
		741
		742	/* Nothing to do if the task is already in that cgroup */
		743	oldcont = task_cgroup(tsk, subsys_id);
		744	if (cont == oldcont)
		745	return 0;
		746
		747	for_each_subsys(root, ss) {
		748	if (ss->can_attach) {
		749	retval = ss->can_attach(ss, cont, tsk);
		750	if (retval) {
		751	return retval;
		752	}
		753	}
		754	}
		755
		756	task_lock(tsk);
		757	if (tsk->flags & PF_EXITING) {
		758	task_unlock(tsk);
		759	return -ESRCH;
		760	}
		761	/* Update the css_set pointers for the subsystems in this
		762	* hierarchy */
		763	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		764	if (root->subsys_bits & (1ull << i)) {
		765	/* Subsystem is in this hierarchy. So we want
		766	* the subsystem state from the new
		767	* cgroup. Transfer the refcount from the
		768	* old to the new */
		769	atomic_inc(&cont->count);
		770	atomic_dec(&cg->subsys[i]->cgroup->count);
		771	rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
		772	}
		773	}
		774	task_unlock(tsk);
		775
		776	for_each_subsys(root, ss) {
		777	if (ss->attach) {
		778	ss->attach(ss, cont, oldcont, tsk);
		779	}
		780	}
		781
		782	synchronize_rcu();
		783	return 0;
		784	}
		785
		786	/*
		787	* Attach task with pid 'pid' to cgroup 'cont'. Call with
		788	* cgroup_mutex, may take task_lock of task
		789	*/
		790	static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
		791	{
		792	pid_t pid;
		793	struct task_struct *tsk;
		794	int ret;
		795
		796	if (sscanf(pidbuf, "%d", &pid) != 1)
		797	return -EIO;
		798
		799	if (pid) {
		800	rcu_read_lock();
		801	tsk = find_task_by_pid(pid);
		802	if (!tsk || tsk->flags & PF_EXITING) {
		803	rcu_read_unlock();
		804	return -ESRCH;
		805	}
		806	get_task_struct(tsk);
		807	rcu_read_unlock();
		808
		809	if ((current->euid) && (current->euid != tsk->uid)
		810	&& (current->euid != tsk->suid)) {
		811	put_task_struct(tsk);
		812	return -EACCES;
		813	}
		814	} else {
		815	tsk = current;
		816	get_task_struct(tsk);
		817	}
		818
		819	ret = attach_task(cont, tsk);
		820	put_task_struct(tsk);
		821	return ret;
		822	}
		823
703	/* The various types of files and directories in a cgroup file system */	824	/* The various types of files and directories in a cgroup file system */
704		825
705	enum cgroup_filetype {	826	enum cgroup_filetype {
@@ -708,6 +829,55 @@ enum cgroup_filetype {
708	FILE_TASKLIST,	829	FILE_TASKLIST,
709	};	830	};
710		831
		832	static ssize_t cgroup_common_file_write(struct cgroup *cont,
		833	struct cftype *cft,
		834	struct file *file,
		835	const char __user *userbuf,
		836	size_t nbytes, loff_t *unused_ppos)
		837	{
		838	enum cgroup_filetype type = cft->private;
		839	char *buffer;
		840	int retval = 0;
		841
		842	if (nbytes >= PATH_MAX)
		843	return -E2BIG;
		844
		845	/* +1 for nul-terminator */
		846	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		847	if (buffer == NULL)
		848	return -ENOMEM;
		849
		850	if (copy_from_user(buffer, userbuf, nbytes)) {
		851	retval = -EFAULT;
		852	goto out1;
		853	}
		854	buffer[nbytes] = 0; /* nul-terminate */
		855
		856	mutex_lock(&cgroup_mutex);
		857
		858	if (cgroup_is_removed(cont)) {
		859	retval = -ENODEV;
		860	goto out2;
		861	}
		862
		863	switch (type) {
		864	case FILE_TASKLIST:
		865	retval = attach_task_by_pid(cont, buffer);
		866	break;
		867	default:
		868	retval = -EINVAL;
		869	goto out2;
		870	}
		871
		872	if (retval == 0)
		873	retval = nbytes;
		874	out2:
		875	mutex_unlock(&cgroup_mutex);
		876	out1:
		877	kfree(buffer);
		878	return retval;
		879	}
		880
711	static ssize_t cgroup_file_write(struct file *file, const char __user *buf,	881	static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
712	size_t nbytes, loff_t *ppos)	882	size_t nbytes, loff_t *ppos)
713	{	883	{
@@ -914,6 +1084,189 @@ int cgroup_add_files(struct cgroup *cont,
914	return 0;	1084	return 0;
915	}	1085	}
916		1086
		1087	/* Count the number of tasks in a cgroup. Could be made more
		1088	* time-efficient but less space-efficient with more linked lists
		1089	* running through each cgroup and the css_set structures that
		1090	* referenced it. Must be called with tasklist_lock held for read or
		1091	* write or in an rcu critical section.
		1092	*/
		1093	int __cgroup_task_count(const struct cgroup *cont)
		1094	{
		1095	int count = 0;
		1096	struct task_struct *g, *p;
		1097	struct cgroup_subsys_state *css;
		1098	int subsys_id;
		1099
		1100	get_first_subsys(cont, &css, &subsys_id);
		1101	do_each_thread(g, p) {
		1102	if (task_subsys_state(p, subsys_id) == css)
		1103	count ++;
		1104	} while_each_thread(g, p);
		1105	return count;
		1106	}
		1107
		1108	/*
		1109	* Stuff for reading the 'tasks' file.
		1110	*
		1111	* Reading this file can return large amounts of data if a cgroup has
		1112	* *lots* of attached tasks. So it may need several calls to read(),
		1113	* but we cannot guarantee that the information we produce is correct
		1114	* unless we produce it entirely atomically.
		1115	*
		1116	* Upon tasks file open(), a struct ctr_struct is allocated, that
		1117	* will have a pointer to an array (also allocated here). The struct
		1118	* ctr_struct * is stored in file->private_data. Its resources will
		1119	* be freed by release() when the file is closed. The array is used
		1120	* to sprintf the PIDs and then used by read().
		1121	*/
		1122	struct ctr_struct {
		1123	char *buf;
		1124	int bufsz;
		1125	};
		1126
		1127	/*
		1128	* Load into 'pidarray' up to 'npids' of the tasks using cgroup
		1129	* 'cont'. Return actual number of pids loaded. No need to
		1130	* task_lock(p) when reading out p->cgroup, since we're in an RCU
		1131	* read section, so the css_set can't go away, and is
		1132	* immutable after creation.
		1133	*/
		1134	static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
		1135	{
		1136	int n = 0;
		1137	struct task_struct *g, *p;
		1138	struct cgroup_subsys_state *css;
		1139	int subsys_id;
		1140
		1141	get_first_subsys(cont, &css, &subsys_id);
		1142	rcu_read_lock();
		1143	do_each_thread(g, p) {
		1144	if (task_subsys_state(p, subsys_id) == css) {
		1145	pidarray[n++] = pid_nr(task_pid(p));
		1146	if (unlikely(n == npids))
		1147	goto array_full;
		1148	}
		1149	} while_each_thread(g, p);
		1150
		1151	array_full:
		1152	rcu_read_unlock();
		1153	return n;
		1154	}
		1155
		1156	static int cmppid(const void *a, const void *b)
		1157	{
		1158	return *(pid_t *)a - *(pid_t *)b;
		1159	}
		1160
		1161	/*
		1162	* Convert array 'a' of 'npids' pid_t's to a string of newline separated
		1163	* decimal pids in 'buf'. Don't write more than 'sz' chars, but return
		1164	* count 'cnt' of how many chars would be written if buf were large enough.
		1165	*/
		1166	static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
		1167	{
		1168	int cnt = 0;
		1169	int i;
		1170
		1171	for (i = 0; i < npids; i++)
		1172	cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
		1173	return cnt;
		1174	}
		1175
		1176	/*
		1177	* Handle an open on 'tasks' file. Prepare a buffer listing the
		1178	* process id's of tasks currently attached to the cgroup being opened.
		1179	*
		1180	* Does not require any specific cgroup mutexes, and does not take any.
		1181	*/
		1182	static int cgroup_tasks_open(struct inode *unused, struct file *file)
		1183	{
		1184	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
		1185	struct ctr_struct *ctr;
		1186	pid_t *pidarray;
		1187	int npids;
		1188	char c;
		1189
		1190	if (!(file->f_mode & FMODE_READ))
		1191	return 0;
		1192
		1193	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
		1194	if (!ctr)
		1195	goto err0;
		1196
		1197	/*
		1198	* If cgroup gets more users after we read count, we won't have
		1199	* enough space - tough. This race is indistinguishable to the
		1200	* caller from the case that the additional cgroup users didn't
		1201	* show up until sometime later on.
		1202	*/
		1203	npids = cgroup_task_count(cont);
		1204	if (npids) {
		1205	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
		1206	if (!pidarray)
		1207	goto err1;
		1208
		1209	npids = pid_array_load(pidarray, npids, cont);
		1210	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
		1211
		1212	/* Call pid_array_to_buf() twice, first just to get bufsz */
		1213	ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
		1214	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
		1215	if (!ctr->buf)
		1216	goto err2;
		1217	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
		1218
		1219	kfree(pidarray);
		1220	} else {
		1221	ctr->buf = 0;
		1222	ctr->bufsz = 0;
		1223	}
		1224	file->private_data = ctr;
		1225	return 0;
		1226
		1227	err2:
		1228	kfree(pidarray);
		1229	err1:
		1230	kfree(ctr);
		1231	err0:
		1232	return -ENOMEM;
		1233	}
		1234
		1235	static ssize_t cgroup_tasks_read(struct cgroup *cont,
		1236	struct cftype *cft,
		1237	struct file *file, char __user *buf,
		1238	size_t nbytes, loff_t *ppos)
		1239	{
		1240	struct ctr_struct *ctr = file->private_data;
		1241
		1242	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
		1243	}
		1244
		1245	static int cgroup_tasks_release(struct inode *unused_inode,
		1246	struct file *file)
		1247	{
		1248	struct ctr_struct *ctr;
		1249
		1250	if (file->f_mode & FMODE_READ) {
		1251	ctr = file->private_data;
		1252	kfree(ctr->buf);
		1253	kfree(ctr);
		1254	}
		1255	return 0;
		1256	}
		1257
		1258	/*
		1259	* for the common functions, 'private' gives the type of file
		1260	*/
		1261	static struct cftype cft_tasks = {
		1262	.name = "tasks",
		1263	.open = cgroup_tasks_open,
		1264	.read = cgroup_tasks_read,
		1265	.write = cgroup_common_file_write,
		1266	.release = cgroup_tasks_release,
		1267	.private = FILE_TASKLIST,
		1268	};
		1269
917	static int cgroup_populate_dir(struct cgroup *cont)	1270	static int cgroup_populate_dir(struct cgroup *cont)
918	{	1271	{
919	int err;	1272	int err;
@@ -922,6 +1275,10 @@ static int cgroup_populate_dir(struct cgroup *cont)
922	/* First clear out any existing files */	1275	/* First clear out any existing files */
923	cgroup_clear_directory(cont->dentry);	1276	cgroup_clear_directory(cont->dentry);
924		1277
		1278	err = cgroup_add_file(cont, NULL, &cft_tasks);
		1279	if (err < 0)
		1280	return err;
		1281
925	for_each_subsys(cont->root, ss) {	1282	for_each_subsys(cont->root, ss) {
926	if (ss->populate && (err = ss->populate(ss, cont)) < 0)	1283	if (ss->populate && (err = ss->populate(ss, cont)) < 0)
927	return err;	1284	return err;