author		Ben Blum <bblum@andrew.cmu.edu>		2011-05-26 19:25:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-26 20:12:34 -0400
commit		74a1166dfe1135dcc168d35fa5261aa7e087011b (patch)
tree		a7add70f0344e2352b8d0d6beb10aef85c6585f7
parent		f780bdb7c1c73009cb57adcf99ef50027d80bf3c (diff)
cgroups: make procs file writable
Make procs file writable to move all threads by tgid at once.
Add functionality that enables users to move all threads in a threadgroup
at once to a cgroup by writing the tgid to the 'cgroup.procs' file. The
current implementation uses a per-threadgroup rwsem that is taken
for reading in the fork() path to prevent newly forking threads within the
threadgroup from "escaping" while the move is in progress.
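For reference, the locking described above pairs a read-side in the fork() path
against a write-side taken around the migration. A minimal sketch of the assumed
helper shapes follows; the rwsem and the read-side helpers come from the earlier
patches in this series (this patch itself only calls
threadgroup_fork_write_lock/unlock), so their exact definitions here are an
assumption, not part of this diff:

    /*
     * Sketch (assumed shape, per the earlier patches in this series):
     * fork() holds the per-threadgroup rwsem for reading while a new
     * thread is being added, so cgroup_procs_write, which takes it for
     * writing, sees a stable thread list during the migration.
     */
    static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
    {
            down_read(&tsk->signal->threadgroup_fork_lock);
    }

    static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
    {
            up_read(&tsk->signal->threadgroup_fork_lock);
    }

    static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
    {
            down_write(&tsk->signal->threadgroup_fork_lock);
    }

    static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
    {
            up_write(&tsk->signal->threadgroup_fork_lock);
    }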
Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	Documentation/cgroups/cgroups.txt	  9
-rw-r--r--	kernel/cgroup.c				439
2 files changed, 401 insertions(+), 47 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index b3bd3bdbe202..8c4f3466c894 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -236,7 +236,8 @@ containing the following files describing that cgroup:
  - cgroup.procs: list of tgids in the cgroup. This list is not
    guaranteed to be sorted or free of duplicate tgids, and userspace
    should sort/uniquify the list if this property is required.
-   This is a read-only file, for now.
+   Writing a thread group id into this file moves all threads in that
+   group into this cgroup.
  - notify_on_release flag: run the release agent on exit?
  - release_agent: the path to use for release notifications (this file
    exists in the top cgroup only)
@@ -430,6 +431,12 @@ You can attach the current shell task by echoing 0:
 
 # echo 0 > tasks
 
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
 Note: Since every task is always a member of exactly one cgroup in each
 mounted hierarchy, to remove a task from its current cgroup you must
 move it into a new cgroup (possibly the root cgroup) by writing to the
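For concreteness, a minimal userspace sketch of the interface documented above;
the cgroup mount point /dev/cgroup/foo is a hypothetical example path, and only
the cgroup.procs write semantics come from this patch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Move every thread of threadgroup 'tgid' into the cgroup directory
     * 'cgrp_dir'. A tgid of 0 moves the calling task's own threadgroup,
     * mirroring the behaviour of the per-thread 'tasks' file. */
    static int move_threadgroup(const char *cgrp_dir, pid_t tgid)
    {
            char path[256], buf[32];
            int fd, ret = 0;

            snprintf(path, sizeof(path), "%s/cgroup.procs", cgrp_dir);
            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            snprintf(buf, sizeof(buf), "%d", (int)tgid);
            if (write(fd, buf, strlen(buf)) < 0)
                    ret = -1;
            close(fd);
            return ret;
    }

    int main(void)
    {
            /* move this process (all of its threads) at once */
            return move_threadgroup("/dev/cgroup/foo", 0) ? 1 : 0;
    }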
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 38fb0ad1cb46..5e6a9745f0e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1735,6 +1735,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1745,11 +1815,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1780,36 +1848,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list))
-		list_move(&tsk->cg_list, &newcg->tasks);
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
 		if (ss->pre_attach)
@@ -1819,9 +1860,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		if (ss->attach)
 			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1871,49 +1911,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct task_struct **group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		group[i] = tsk;
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				retval = ss->can_attach_task(cgrp, group[i]);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = group[i];
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = group[i];
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++)
+		put_task_struct(group[i]);
+out_free_group_list:
+	kfree(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
 			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
+			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
@@ -3270,9 +3617,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
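Finally, the documentation hunk above notes that reads of cgroup.procs are not
guaranteed to be sorted or free of duplicate tgids. A small consumer sketch
(same hypothetical mount point as before) that sorts and uniquifies the list,
as the documentation advises:

    #include <stdio.h>
    #include <stdlib.h>

    static int cmp_int(const void *a, const void *b)
    {
            return *(const int *)a - *(const int *)b;
    }

    int main(void)
    {
            /* the path is a hypothetical example mount point */
            FILE *f = fopen("/dev/cgroup/foo/cgroup.procs", "r");
            int pids[4096], n = 0, i;

            if (!f)
                    return 1;
            while (n < 4096 && fscanf(f, "%d", &pids[n]) == 1)
                    n++;
            fclose(f);

            /* the kernel guarantees neither order nor uniqueness */
            qsort(pids, n, sizeof(pids[0]), cmp_int);
            for (i = 0; i < n; i++)
                    if (i == 0 || pids[i] != pids[i - 1])
                            printf("%d\n", pids[i]);
            return 0;
    }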