1 files changed, 63 insertions, 22 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..bc1dcabe9217 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+/*
 * Generate an array of cgroup subsystem pointers. At boot time, this is
 * populated with the built in subsystems, and modular subsystems are
 * registered after that. The mutable section of this array is protected by
@@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);
+static int cgroup_file_release(struct inode *inode, struct file *file);
 /**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
        struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
        INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
-        schedule_work(&cgrp->destroy_work);
+        queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -881,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                struct cgroup *cgrp = dentry->d_fsdata;
                BUG_ON(!(cgroup_is_dead(cgrp)));
+                /*
+                 * XXX: cgrp->id is only used to look up css's.  As cgroup
+                 * and css's lifetimes will be decoupled, it should be made
+                 * per-subsystem and moved to css->id so that lookups are
+                 * successful until the target css is released.
+                 */
+                idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+                cgrp->id = -1;
                call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
@@ -2421,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = {
        .read = seq_read,
        .write = cgroup_file_write,
        .llseek = seq_lseek,
-        .release = single_release,
+        .release = cgroup_file_release,
 };
 static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2482,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
                ret = cft->release(inode, file);
        if (css->ss)
                css_put(css);
+        if (file->f_op == &cgroup_seqfile_operations)
+                single_release(inode, file);
        return ret;
 }
@@ -4249,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
         * css_put().  dput() requires process context which we don't have.
         */
        INIT_WORK(&css->destroy_work, css_free_work_fn);
-        schedule_work(&css->destroy_work);
+        queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 static void css_release(struct percpu_ref *ref)
@@ -4257,6 +4278,7 @@ static void css_release(struct percpu_ref *ref)
        struct cgroup_subsys_state *css =
                container_of(ref, struct cgroup_subsys_state, refcnt);
+        rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
        call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
@@ -4415,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
        root->number_of_cgroups++;
-        /* each css holds a ref to the cgroup's dentry and the parent css */
-        for_each_root_subsys(root, ss) {
-                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-                dget(dentry);
-                css_get(css->parent);
-        }
        /* hold a ref to the parent's dentry */
        dget(parent->dentry);
@@ -4434,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                if (err)
                        goto err_destroy;
+                /* each css holds a ref to the cgroup's dentry and parent css */
+                dget(dentry);
+                css_get(css->parent);
+                /* mark it consumed so the error path skips it */
+                css_ar[ss->subsys_id] = NULL;
                if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
                    parent->parent) {
                        pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4480,6 +4501,14 @@ err_free_cgrp:
        return err;
 err_destroy:
+        for_each_root_subsys(root, ss) {
+                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
+                if (css) {
+                        percpu_ref_cancel_init(&css->refcnt);
+                        ss->css_free(css);
+                }
+        }
        cgroup_destroy_locked(cgrp);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4539,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
                container_of(ref, struct cgroup_subsys_state, refcnt);
        INIT_WORK(&css->destroy_work, css_killed_work_fn);
-        schedule_work(&css->destroy_work);
+        queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 /**
@@ -4641,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         * will be invoked to perform the rest of destruction once the
         * percpu refs of all css's are confirmed to be killed.
         */
-        for_each_root_subsys(cgrp->root, ss)
+        for_each_root_subsys(cgrp->root, ss) {
-                kill_css(cgroup_css(cgrp, ss));
+                struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+                if (css)
+                        kill_css(css);
+        }
        /*
         * Mark @cgrp dead.  This prevents further task migration and child
@@ -4711,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
        /* delete this cgroup from parent->children */
        list_del_rcu(&cgrp->sibling);
-        /*
-         * We should remove the cgroup object from idr before its grace
-         * period starts, so we won't be looking up a cgroup while the
-         * cgroup is being freed.
-         */
-        idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
-        cgrp->id = -1;
        dput(d);
        set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -5063,6 +5088,22 @@ out:
        return err;
 }
+static int __init cgroup_wq_init(void)
+{
+        /*
+         * There isn't much point in executing the destruction path in
+         * parallel.  A good chunk is serialized with cgroup_mutex anyway.
+         * Use 1 for @max_active.
+         *
+         * We would prefer to do this in cgroup_init() above, but that
+         * is called before init_workqueues(), so defer it to an initcall.
+         */
+        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+        BUG_ON(!cgroup_destroy_wq);
+        return 0;
+}
+core_initcall(cgroup_wq_init);
 /*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy