1 files changed, 168 insertions, 120 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..b5c64327e712 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
 * account cgroups in empty hierarchies.
 */
 #define CSS_SET_HASH_BITS       7
-#define CSS_SET_TABLE_SIZE      (1 << CSS_SET_HASH_BITS)
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
        int i;
-        int index;
+        unsigned long key = 0UL;
-        unsigned long tmp = 0UL;
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-                tmp += (unsigned long)css[i];
+                key += (unsigned long)css[i];   /* sum the css pointer values */
-        tmp = (tmp >> 16) ^ tmp;
+        key = (key >> 16) ^ key;        /* xor in the bits above bit 15 */
-        index = hash_long(tmp, CSS_SET_HASH_BITS);
+        return key;     /* raw key: callers hand it to hash_add() etc. */
-        return &css_set_table[index];
 }
 /* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
        }
        /* This css_set is dead. unlink it and release cgroup refcounts */
-        hlist_del(&cg->hlist);
+        hash_del(&cg->hlist);
        css_set_count--;
        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                struct cgroup *cgrp = link->cgrp;
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
+                /*
+                 * We may not be holding cgroup_mutex, and if cgrp->count is
+                 * dropped to 0 the cgroup can be destroyed at any time, hence
+                 * rcu_read_lock is used to keep it alive.
+                 */
+                rcu_read_lock();
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
+                rcu_read_unlock();
                kfree(link);
        }
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
 {
        int i;
        struct cgroupfs_root *root = cgrp->root;
-        struct hlist_head *hhead;
        struct hlist_node *node;
        struct css_set *cg;
+        unsigned long key;
        /*
         * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(
                }
        }
-        hhead = css_set_hash(template);
+        key = css_set_hash(template);
-        hlist_for_each_entry(cg, node, hhead, hlist) {
+        hash_for_each_possible(css_set_table, cg, node, hlist, key) {
                if (!compare_css_sets(cg, oldcg, cgrp, template))
                        continue;
@@ -657,8 +661,8 @@ static struct css_set *find_css_set(
        struct list_head tmp_cg_links;
-        struct hlist_head *hhead;
        struct cg_cgroup_link *link;
+        unsigned long key;
        /* First see if we already have a cgroup group that matches
         * the desired set */
@@ -704,8 +708,8 @@ static struct css_set *find_css_set(
        css_set_count++;
        /* Add this cgroup group to the hash table */
-        hhead = css_set_hash(res->subsys);
+        key = css_set_hash(res->subsys);
-        hlist_add_head(&res->hlist, hhead);
+        hash_add(css_set_table, &res->hlist, key);
        write_unlock(&css_set_lock);
@@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
        return inode;
 }
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
 {
-        /* is dentry a directory ? if so, kfree() associated cgroup */
+        struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
-        if (S_ISDIR(inode->i_mode)) {
+        struct cgroup_subsys *ss;
-                struct cgroup *cgrp = dentry->d_fsdata;
-                struct cgroup_subsys *ss;
-                BUG_ON(!(cgroup_is_removed(cgrp)));
-                /* It's possible for external users to be holding css
-                 * reference counts on a cgroup; css_put() needs to
-                 * be able to access the cgroup after decrementing
-                 * the reference count in order to know if it needs to
-                 * queue the cgroup to be handled by the release
-                 * agent */
-                synchronize_rcu();
-                mutex_lock(&cgroup_mutex);
+        mutex_lock(&cgroup_mutex);
-                /*
+        /*
-                 * Release the subsystem state objects.
+         * Release the subsystem state objects.
-                 */
+         */
-                for_each_subsys(cgrp->root, ss)
+        for_each_subsys(cgrp->root, ss)
-                        ss->css_free(cgrp);
+                ss->css_free(cgrp);
-                cgrp->root->number_of_cgroups--;
+        cgrp->root->number_of_cgroups--;
-                mutex_unlock(&cgroup_mutex);
+        mutex_unlock(&cgroup_mutex);
-                /*
+        /*
-                 * Drop the active superblock reference that we took when we
+         * Drop the active superblock reference that we took when we
-                 * created the cgroup
+         * created the cgroup
-                 */
+         */
-                deactivate_super(cgrp->root->sb);
+        deactivate_super(cgrp->root->sb);
-                /*
+        /*
-                 * if we're getting rid of the cgroup, refcount should ensure
+         * if we're getting rid of the cgroup, refcount should ensure
-                 * that there are no pidlists left.
+         * that there are no pidlists left.
-                 */
+         */
-                BUG_ON(!list_empty(&cgrp->pidlists));
+        BUG_ON(!list_empty(&cgrp->pidlists));
-                simple_xattrs_free(&cgrp->xattrs);
+        simple_xattrs_free(&cgrp->xattrs);
-                ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+        ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-                kfree_rcu(cgrp, rcu_head);
+        kfree(cgrp);    /* work queued by call_rcu() callback */
+}
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+        struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+        schedule_work(&cgrp->free_work);        /* runs cgroup_free_fn() */
+}
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+        /* is dentry a directory? if so, RCU-defer freeing of its cgroup */
+        if (S_ISDIR(inode->i_mode)) {
+                struct cgroup *cgrp = dentry->d_fsdata;
+                BUG_ON(!(cgroup_is_removed(cgrp)));
+                call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
                struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
        dput(parent);
 }
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
        struct cfent *cfe;
        lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
        lockdep_assert_held(&cgroup_mutex);
+        /*
+         * If we're doing cleanup due to failure of cgroup_create(),
+         * the corresponding @cfe may not exist.
+         */
        list_for_each_entry(cfe, &cgrp->files, node) {
                struct dentry *d = cfe->dentry;
@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
                list_del_init(&cfe->node);
                dput(d);
-                return 0;
+                break;
        }
-        return -ENOENT;
 }
 /**
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                }
        }
        root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-        synchronize_rcu();
        return 0;
 }
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->allcg_node);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
+        INIT_WORK(&cgrp->free_work, cgroup_free_fn);
        mutex_init(&cgrp->pidlist_mutex);
        INIT_LIST_HEAD(&cgrp->event_list);
        spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                struct cgroupfs_root *existing_root;
                const struct cred *cred;
                int i;
+                struct hlist_node *node;
+                struct css_set *cg;
                BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                /* Link the top cgroup in this hierarchy into all
                 * the css_set objects */
                write_lock(&css_set_lock);
-                for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+                hash_for_each(css_set_table, i, node, cg, hlist)
-                        struct hlist_head *hhead = &css_set_table[i];
+                        link_css_set(&tmp_cg_links, cg, root_cgrp);
-                        struct hlist_node *node;
-                        struct css_set *cg;
-                        hlist_for_each_entry(cg, node, hhead, hlist)
-                                link_css_set(&tmp_cg_links, cg, root_cgrp);
-                }
                write_unlock(&css_set_lock);
                free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
                           "cgroup_path() called without proper locking");
-        if (!dentry || cgrp == dummytop) {
+        if (cgrp == dummytop) {
                /*
                 * Inactive subsystems have no dentry for their root
                 * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                        ss->attach(cgrp, &tset);
        }
-        synchronize_rcu();
 out:
        if (retval) {
                for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        /*
         * step 5: success! and cleanup
         */
-        synchronize_rcu();
        retval = 0;
 out_put_css_set_refs:
        if (retval) {
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
                        continue;
-                if (is_add)
+                if (is_add) {
                        err = cgroup_add_file(cgrp, subsys, cft);
-                else
+                        if (err)
-                        err = cgroup_rm_file(cgrp, cft);
+                                pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
-                if (err) {
+                                        cft->name, err);
-                        pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-                                   is_add ? "add" : "remove", cft->name, err);
                        ret = err;
+                } else {
+                        cgroup_rm_file(cgrp, cft);
                }
        }
        return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 }
 EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos.  If there's no descendant,
+ * @pos is returned.  This can be used during pre-order traversal to skip
+ * subtree of @pos.  Warns once if not called under rcu_read_lock().
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+        struct cgroup *last, *tmp;
+        WARN_ON_ONCE(!rcu_read_lock_held());
+        do {
+                last = pos;
+                /* ->prev isn't RCU safe, walk ->next till the end */
+                pos = NULL;
+                list_for_each_entry_rcu(tmp, &last->children, sibling)
+                        pos = tmp;      /* ends up as the last child of @last */
+        } while (pos);  /* stop once @last has no children */
+        return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
 static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
 {
        struct cgroup *last;
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
                        remove);
        struct cgroup *cgrp = event->cgrp;
+        remove_wait_queue(event->wqh, &event->wait);
        event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+        /* Notify userspace the event is going away. */
+        eventfd_signal(event->eventfd, 1);
        eventfd_ctx_put(event->eventfd);
        kfree(event);
        dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
        unsigned long flags = (unsigned long)key;
        if (flags & POLLHUP) {
-                __remove_wait_queue(event->wqh, &event->wait);
-                spin_lock(&cgrp->event_list_lock);
-                list_del_init(&event->list);
-                spin_unlock(&cgrp->event_list_lock);
                /*
-                 * We are in atomic context, but cgroup_event_remove() may
+                 * If the event has been detached at cgroup removal, we
-                 * sleep, so we have to call it in workqueue.
+                 * can simply return knowing the other side will clean up
+                 * for us.
+                 *
+                 * We can't race against event freeing since the other
+                 * side will require wqh->lock via remove_wait_queue(),
+                 * which we hold.
                 */
-                schedule_work(&event->remove);
+                spin_lock(&cgrp->event_list_lock);
+                if (!list_empty(&event->list)) {
+                        list_del_init(&event->list);
+                        /*
+                         * We are in atomic context, but cgroup_event_remove()
+                         * may sleep, so we have to call it in workqueue.
+                         */
+                        schedule_work(&event->remove);
+                }
+                spin_unlock(&cgrp->event_list_lock);
        }
        return 0;
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                                      const char *buffer)
 {
        struct cgroup_event *event = NULL;
+        struct cgroup *cgrp_cfile;
        unsigned int efd, cfd;
        struct file *efile = NULL;
        struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                goto fail;
        }
+        /*
+         * The file to be monitored must be in the same cgroup as
+         * cgroup.event_control is.
+         */
+        cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+        if (cgrp_cfile != cgrp) {
+                ret = -EINVAL;
+                goto fail;
+        }
        if (!event->cft->register_event || !event->cft->unregister_event) {
                ret = -EINVAL;
                goto fail;
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        init_cgroup_housekeeping(cgrp);
+        dentry->d_fsdata = cgrp;
+        cgrp->dentry = dentry;
        cgrp->parent = parent;
        cgrp->root = parent->root;
        cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        lockdep_assert_held(&dentry->d_inode->i_mutex);
        /* allocation complete, commit to creation */
-        dentry->d_fsdata = cgrp;
-        cgrp->dentry = dentry;
        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
        list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
        root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        /*
         * Unregister events and notify userspace.
         * Notify userspace about cgroup removing only after rmdir of cgroup
-         * directory to avoid race between userspace and kernelspace. Use
+         * directory to avoid race between userspace and kernelspace.
-         * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
-         * cgroup_event_wake() is called with the wait queue head locked,
-         * remove_wait_queue() cannot be called while holding event_list_lock.
         */
        spin_lock(&cgrp->event_list_lock);
-        list_splice_init(&cgrp->event_list, &tmp_list);
+        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-        spin_unlock(&cgrp->event_list_lock);
-        list_for_each_entry_safe(event, tmp, &tmp_list, list) {
                list_del_init(&event->list);
-                remove_wait_queue(event->wqh, &event->wait);
-                eventfd_signal(event->eventfd, 1);
                schedule_work(&event->remove);
        }
+        spin_unlock(&cgrp->event_list_lock);
        return 0;
 }
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
        struct cgroup_subsys_state *css;
        int i, ret;
+        struct hlist_node *node, *tmp;
+        struct css_set *cg;
+        unsigned long key;
        /* check name and function validity */
        if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         * this is all done under the css_set_lock.
         */
        write_lock(&css_set_lock);
-        for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+        hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
-                struct css_set *cg;
+                /* skip entries that we already rehashed */
-                struct hlist_node *node, *tmp;
+                if (cg->subsys[ss->subsys_id])
-                struct hlist_head *bucket = &css_set_table[i], *new_bucket;
+                        continue;
+                /* remove existing entry */
-                hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
+                hash_del(&cg->hlist);
-                        /* skip entries that we already rehashed */
+                /* set new value */
-                        if (cg->subsys[ss->subsys_id])
+                cg->subsys[ss->subsys_id] = css;
-                                continue;
+                /* recompute hash and restore entry */
-                        /* remove existing entry */
+                key = css_set_hash(cg->subsys);
-                        hlist_del(&cg->hlist);
+                hash_add(css_set_table, node, key);
-                        /* set new value */
-                        cg->subsys[ss->subsys_id] = css;
-                        /* recompute hash and restore entry */
-                        new_bucket = css_set_hash(cg->subsys);
-                        hlist_add_head(&cg->hlist, new_bucket);
-                }
        }
        write_unlock(&css_set_lock);
@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
        struct cg_cgroup_link *link;
-        struct hlist_head *hhead;
        BUG_ON(ss->module == NULL);
@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
        write_lock(&css_set_lock);
        list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
                struct css_set *cg = link->cg;
+                unsigned long key;
-                hlist_del(&cg->hlist);
+                hash_del(&cg->hlist);
                cg->subsys[ss->subsys_id] = NULL;
-                hhead = css_set_hash(cg->subsys);
+                key = css_set_hash(cg->subsys);
-                hlist_add_head(&cg->hlist, hhead);
+                hash_add(css_set_table, &cg->hlist, key);
        }
        write_unlock(&css_set_lock);
@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
        list_add(&init_css_set_link.cg_link_list,
                 &init_css_set.cg_links);
-        for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-                INIT_HLIST_HEAD(&css_set_table[i]);
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
 {
        int err;
        int i;
-        struct hlist_head *hhead;
+        unsigned long key;
        err = bdi_init(&cgroup_backing_dev_info);
        if (err)
@@ -4686,8 +4735,8 @@ int __init cgroup_init(void)
        }
        /* Add init_css_set to the hash table */
-        hhead = css_set_hash(init_css_set.subsys);
+        key = css_set_hash(init_css_set.subsys);
-        hlist_add_head(&init_css_set.hlist, hhead);
+        hash_add(css_set_table, &init_css_set.hlist, key);
        BUG_ON(!init_root_id(&rootnode));
        cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        }
        task_unlock(tsk);
-        if (cg)
+        put_css_set_taskexit(cg);
-                put_css_set_taskexit(cg);
 }
 /**