path: root/kernel/cgroup.c
Diffstat (limited to 'kernel/cgroup.c')
 kernel/cgroup.c | 587
 1 file changed, 422 insertions(+), 165 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a35510af5..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
 
 #include <asm/atomic.h>
 
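[Annotation: the new include pulls in the flex_array API that cgroup_attach_proc below relies on. A minimal sketch of just those calls, mirroring the alloc/prealloc/put/get/free usage in this patch; the function demo_snapshot is hypothetical:

static int demo_snapshot(struct task_struct *tsk, int nr)
{
	struct flex_array *fa;
	int ret;

	fa = flex_array_alloc(sizeof(struct task_struct *), nr, GFP_KERNEL);
	if (!fa)
		return -ENOMEM;
	/* reserve backing pages for slots 0..nr-1 so later puts cannot fail */
	ret = flex_array_prealloc(fa, 0, nr - 1, GFP_KERNEL);
	if (!ret) {
		/* after prealloc, GFP_ATOMIC puts are guaranteed to succeed */
		flex_array_put_ptr(fa, 0, tsk, GFP_ATOMIC);
		WARN_ON(flex_array_get_ptr(fa, 0) != tsk);
	}
	flex_array_free(fa);
	return ret;
}
]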
@@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
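[Annotation: the new helper has two modes keyed by 'guarantee': the single-task attach path below passes false and tolerates -ENOMEM, while the threadgroup path prefetches css_sets first and passes true. A minimal caller sketch, assuming cgroup_mutex is already held as the real callers do; the wrapper name move_one_task is hypothetical:

static int move_one_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	struct cgroup *oldcgrp = task_cgroup_from_root(tsk, cgrp->root);
	int ret;

	if (cgrp == oldcgrp)
		return 0;	/* nothing to do: already in that cgroup */
	/* guarantee=false: may sleep in find_css_set() and return -ENOMEM */
	ret = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
	/* -ESRCH means tsk is exiting: fatal for one task, skippable in bulk */
	return ret;
}
]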
@@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk, false);
+			retval = ss->can_attach(ss, cgrp, tsk);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
+		if (ss->can_attach_task) {
+			retval = ss->can_attach_task(cgrp, tsk);
+			if (retval) {
+				failed_ss = ss;
+				goto out;
+			}
+		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list))
-		list_move(&tsk->cg_list, &newcg->tasks);
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+		if (ss->attach_task)
+			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk, false);
+			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1829,7 +1881,7 @@ out:
 			 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk, false);
+				ss->cancel_attach(ss, cgrp, tsk);
 		}
 	}
 	return retval;
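[Annotation: these hunks also establish the new callback order: per-process can_attach() runs first, then the per-thread can_attach_task(), and on the commit side pre_attach()/attach_task() run before attach(). A hedged stub of a subsystem wired to the new hooks; the subsystem "example_subsys" and its functions are hypothetical, but the signatures follow the calls visible in this diff:

static int example_can_attach_task(struct cgroup *cgrp,
				   struct task_struct *tsk)
{
	/* per-thread admission check; called once per thread being moved */
	return 0;
}

static void example_pre_attach(struct cgroup *cgrp)
{
	/* one-time setup before any thread is moved */
}

static void example_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* per-thread commit work; runs after migration can no longer fail */
}

struct cgroup_subsys example_subsys = {
	.name		 = "example",
	.can_attach_task = example_can_attach_task,
	.pre_attach	 = example_pre_attach,
	.attach_task	 = example_attach_task,
};
]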
@@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct flex_array *group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	/* flex_array supports very large thread-groups better than kmalloc. */
+	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+				 GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+	/* pre-allocate to guarantee space while iterating in rcu read-side. */
+	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	if (retval)
+		goto out_free_group_list;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
+		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		BUG_ON(retval != 0);
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				tsk = flex_array_get_ptr(group, i);
+				retval = ss->can_attach_task(cgrp, tsk);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		put_task_struct(tsk);
+	}
+out_free_group_list:
+	flex_array_free(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
 			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
+			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
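[Annotation: the core of this hunk is a reserve-then-commit pattern: step 2 prefetches every css_set the move could need, so the step-3 commit loop cannot hit -ENOMEM after tasks have started moving. A distilled sketch of that shape using the helpers added above; "tasks"/"nr" stand in for the flex_array snapshot, move_group_sketch is hypothetical, and the real code also skips PF_EXITING threads while prefetching:

static int move_group_sketch(struct cgroup *cgrp, struct cgroupfs_root *root,
			     struct task_struct **tasks, int nr)
{
	LIST_HEAD(newcg_list);
	struct cg_list_entry *ent, *tmp;
	struct css_set *oldcg;
	struct cgroup *oldcgrp;
	int i, retval = 0;

	/* phase 1: reserve every css_set we could need; may fail harmlessly */
	for (i = 0; i < nr; i++) {
		if (task_cgroup_from_root(tasks[i], root) == cgrp)
			continue;	/* already there */
		task_lock(tasks[i]);
		oldcg = tasks[i]->cgroups;
		get_css_set(oldcg);
		task_unlock(tasks[i]);
		if (!css_set_check_fetched(cgrp, tasks[i], oldcg, &newcg_list))
			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
		put_css_set(oldcg);
		if (retval)
			goto out;	/* -ENOMEM, and nothing has moved yet */
	}
	/* phase 2: commit; guarantee=true means no allocation can fail now */
	for (i = 0; i < nr; i++) {
		oldcgrp = task_cgroup_from_root(tasks[i], root);
		if (oldcgrp == cgrp)
			continue;
		/* -ESRCH (task exiting) is the only possible failure: skip */
		cgroup_task_migrate(cgrp, oldcgrp, tasks[i], true);
	}
out:
	/* drop the prefetched references whether we committed or bailed */
	list_for_each_entry_safe(ent, tmp, &newcg_list, links) {
		list_del(&ent->links);
		put_css_set(ent->cg);
		kfree(ent);
	}
	return retval;
}
]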
@@ -3259,9 +3632,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
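[Annotation: with the write handler wired up and S_IWUSR added to the mode, the owner can now move an entire threadgroup by writing its TGID to the procs file, e.g. "echo $TGID > /cgroups/foo/cgroup.procs" (mount point assumed for illustration); the -EAGAIN loop in cgroup_procs_write makes this robust against a concurrent exec() changing the group leader mid-attach.]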
@@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 }
 
 /**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
-		 char *nodename)
-{
-	struct dentry *dentry;
-	int ret = 0;
-	struct cgroup *parent, *child;
-	struct inode *inode;
-	struct css_set *cg;
-	struct cgroupfs_root *root;
-	struct cgroup_subsys *ss;
-
-	/* We shouldn't be called by an unregistered subsystem */
-	BUG_ON(!subsys->active);
-
-	/* First figure out what hierarchy and cgroup we're dealing
-	 * with, and pin them so we can drop cgroup_mutex */
-	mutex_lock(&cgroup_mutex);
- again:
-	root = subsys->root;
-	if (root == &rootnode) {
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Pin the hierarchy */
-	if (!atomic_inc_not_zero(&root->sb->s_active)) {
-		/* We race with the final deactivate_super() */
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Keep the cgroup alive */
-	task_lock(tsk);
-	parent = task_cgroup(tsk, subsys->subsys_id);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-
-	mutex_unlock(&cgroup_mutex);
-
-	/* Now do the VFS work to create a cgroup */
-	inode = parent->dentry->d_inode;
-
-	/* Hold the parent directory mutex across this operation to
-	 * stop anyone else deleting the new cgroup */
-	mutex_lock(&inode->i_mutex);
-	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
-	if (IS_ERR(dentry)) {
-		printk(KERN_INFO
-		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
-		       PTR_ERR(dentry));
-		ret = PTR_ERR(dentry);
-		goto out_release;
-	}
-
-	/* Create the cgroup directory, which also creates the cgroup */
-	ret = vfs_mkdir(inode, dentry, 0755);
-	child = __d_cgrp(dentry);
-	dput(dentry);
-	if (ret) {
-		printk(KERN_INFO
-		       "Failed to create cgroup %s: %d\n", nodename,
-		       ret);
-		goto out_release;
-	}
-
-	/* The cgroup now exists. Retake cgroup_mutex and check
-	 * that we're still in the same state that we thought we
-	 * were. */
-	mutex_lock(&cgroup_mutex);
-	if ((root != subsys->root) ||
-	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
-		/* Aargh, we raced ... */
-		mutex_unlock(&inode->i_mutex);
-		put_css_set(cg);
-
-		deactivate_super(root->sb);
-		/* The cgroup is still accessible in the VFS, but
-		 * we're not going to try to rmdir() it at this
-		 * point. */
-		printk(KERN_INFO
-		       "Race in cgroup_clone() - leaking cgroup %s\n",
-		       nodename);
-		goto again;
-	}
-
-	/* do any required auto-setup */
-	for_each_subsys(root, ss) {
-		if (ss->post_clone)
-			ss->post_clone(ss, child);
-	}
-
-	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = cgroup_attach_task(child, tsk);
-	mutex_unlock(&cgroup_mutex);
-
- out_release:
-	mutex_unlock(&inode->i_mutex);
-
-	mutex_lock(&cgroup_mutex);
-	put_css_set(cg);
-	mutex_unlock(&cgroup_mutex);
-	deactivate_super(root->sb);
-	return ret;
-}
-
-/**
  * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
  * @task: the task in question