path: root/kernel/cgroup.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 17:51:32 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 17:51:32 -0500
commit    69234acee54407962a20bedf90ef9c96326994b5 (patch)
tree      5e979b1a489d866691c2c65ac3f46b4f29feef68 /kernel/cgroup.c
parent    11eaaadb3ea376c6c194491c2e9bddd647f9d253 (diff)
parent    d57456753787ab158f906f1f8eb58d54a2ccd9f4 (diff)
Merge branch 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "The cgroup core saw several significant updates this cycle:

  - percpu_rwsem for threadgroup locking is reinstated. This was
    temporarily dropped due to down_write latency issues. Oleg's rework
    of percpu_rwsem which is scheduled to be merged in this merge window
    resolves the issue.

  - On the v2 hierarchy, when controllers are enabled and disabled, all
    operations are atomic and can fail and revert cleanly. This allows
    ->can_attach() failure which is necessary for cpu RT slices.

  - Tasks now stay associated with the original cgroups after exit until
    released. This allows tracking resources held by zombies (e.g. pids)
    and makes it easy to find out where zombies came from on the v2
    hierarchy. The pids controller was broken before these changes as
    zombies escaped the limits; unfortunately, updating this behavior
    required too many invasive changes and I don't think it's a good
    idea to backport them, so the pids controller on 4.3, the first
    version which included the pids controller, will stay broken at
    least until I'm sure about the cgroup core changes.

  - Optimization of a couple common tests using static_key"

* 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (38 commits)
  cgroup: fix race condition around termination check in css_task_iter_next()
  blkcg: don't create "io.stat" on the root cgroup
  cgroup: drop cgroup__DEVEL__legacy_files_on_dfl
  cgroup: replace error handling in cgroup_init() with WARN_ON()s
  cgroup: add cgroup_subsys->free() method and use it to fix pids controller
  cgroup: keep zombies associated with their original cgroups
  cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock
  cgroup: don't hold css_set_rwsem across css task iteration
  cgroup: reorganize css_task_iter functions
  cgroup: factor out css_set_move_task()
  cgroup: keep css_set and task lists in chronological order
  cgroup: make cgroup_destroy_locked() test cgroup_is_populated()
  cgroup: make css_sets pin the associated cgroups
  cgroup: relocate cgroup_[try]get/put()
  cgroup: move check_for_release() invocation
  cgroup: replace cgroup_has_tasks() with cgroup_is_populated()
  cgroup: make cgroup->nr_populated count the number of populated css_sets
  cgroup: remove an unused parameter from cgroup_task_migrate()
  cgroup: fix too early usage of static_branch_disable()
  cgroup: make cgroup_update_dfl_csses() migrate all target processes atomically
  ...
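The static_key optimization called out above is the DEFINE_STATIC_KEY_TRUE()
machinery visible in the diff below. A minimal sketch of the pattern, using
only the <linux/jump_label.h> API; the example_* names are illustrative, not
the exact kernel/cgroup.c code:

	#include <linux/jump_label.h>

	/* One always-true key per subsystem; a boot parameter such as
	 * cgroup_disable= flips it off once, patching the branch out of
	 * the hot path for the rest of the system's lifetime. */
	DEFINE_STATIC_KEY_TRUE(example_cgrp_subsys_enabled_key);

	static bool example_subsys_enabled(void)
	{
		/* compiles to a patched jump/no-op, not a load-and-test */
		return static_branch_likely(&example_cgrp_subsys_enabled_key);
	}

	static void example_subsys_disable(void)
	{
		/* rewrites the branch sites at runtime */
		static_branch_disable(&example_cgrp_subsys_enabled_key);
	}

Because the enable/disable events are rare (boot-time or controller rebind)
while the tests run on hot paths like fork and exit, trading an expensive
runtime patch for a free per-call test is the right direction.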
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 1297
1 file changed, 725 insertions(+), 572 deletions(-)
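The reinstated percpu_rwsem appears below as cgroup_threadgroup_rwsem, taken
for writing in __cgroup_procs_write(). A minimal sketch of the locking split,
assuming the <linux/percpu-rwsem.h> API; the example_* helpers are
illustrative stand-ins for cgroup_threadgroup_change_begin()/end():

	#include <linux/percpu-rwsem.h>

	/* percpu_init_rwsem() must run on this before first use */
	static struct percpu_rw_semaphore example_threadgroup_rwsem;

	/* fork/exit side: a cheap per-cpu read lock keeps the threadgroup
	 * stable while any migration is in flight (this replaces the
	 * per-signal group_rwsem helpers removed in the diff) */
	static void example_change_begin(void)
	{
		percpu_down_read(&example_threadgroup_rwsem);
	}

	static void example_change_end(void)
	{
		percpu_up_read(&example_threadgroup_rwsem);
	}

	/* migration side: one exclusive writer section spans the whole
	 * operation, as __cgroup_procs_write() does below */
	static void example_migrate_threadgroup(void)
	{
		percpu_down_write(&example_threadgroup_rwsem);
		/* ... move every task in the threadgroup ... */
		percpu_up_write(&example_threadgroup_rwsem);
	}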
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2c9eae6ad970..b9d0cce3f9ce 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -75,7 +75,7 @@
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -83,12 +83,12 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
+DEFINE_SPINLOCK(css_set_lock);
 EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
+EXPORT_SYMBOL_GPL(css_set_lock);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
+static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -103,6 +103,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()			\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&		\
 			   !lockdep_is_held(&cgroup_mutex),	\
@@ -136,6 +138,27 @@ static const char *cgroup_subsys_name[] = {
 };
 #undef SUBSYS
 
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x)							\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);		\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);		\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);		\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
@@ -150,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_root_visible;
 
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
 /* some controllers are not supported in the default hierarchy */
 static unsigned long cgrp_dfl_root_inhibit_ss_mask;
 
@@ -183,6 +200,7 @@ static u64 css_serial_nr_next = 1;
  */
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
+static unsigned long have_free_callback __read_mostly;
 
 /* Ditto for the can_fork callback. */
 static unsigned long have_canfork_callback __read_mostly;
@@ -192,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
 static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names which
+ * is fine for individual subsystems but unsuitable for cgroup core. This
+ * is a slower static_key_enabled() based test indexed by @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ *   and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ *   "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ *   recycled in between reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ *   notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ *   and its descendants contain no task; otherwise, 1. The file also
+ *   generates kernfs notification which can be monitored through poll and
+ *   [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ *   take masks of ancestors with non-empty cpus/mems, instead of being
+ *   moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ *   masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ *   is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	return cgrp->root == &cgrp_dfl_root;
+}
+
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 			    gfp_t gfp_mask)
@@ -332,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 	return !(cgrp->self.flags & CSS_ONLINE);
 }
 
+static void cgroup_get(struct cgroup *cgrp)
+{
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
+static void cgroup_put(struct cgroup *cgrp)
+{
+	css_put(&cgrp->self);
+}
+
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
 	struct cgroup *cgrp = of->kn->parent->priv;
@@ -481,19 +588,31 @@ struct css_set init_css_set = {
 	.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+	.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count = 1;	/* 1 for init_css_set */
 
 /**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
+/**
  * cgroup_update_populated - update the populated count of a cgroup
  * @cgrp: the target cgroup
  * @populated: inc or dec populated count
  *
- * @cgrp is either getting the first task (css_set) or losing the last.
- * Update @cgrp->populated_cnt accordingly. The count is propagated
- * towards root so that a given cgroup's populated_cnt is zero iff the
- * cgroup and all its descendants are empty.
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last. Update @cgrp->populated_cnt accordingly. The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
  *
  * @cgrp's interface file "cgroup.populated" is zero if
  * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -503,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
  */
 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	do {
 		bool trigger;
@@ -516,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
-		if (cgrp->populated_kn)
-			kernfs_notify(cgrp->populated_kn);
+		check_for_release(cgrp);
+		cgroup_file_notify(&cgrp->events_file);
+
 		cgrp = cgroup_parent(cgrp);
 	} while (cgrp);
 }
 
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last. Update the
+ * ->populated_cnt of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+	struct cgrp_cset_link *link;
+
+	lockdep_assert_held(&css_set_lock);
+
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+		cgroup_update_populated(link->cgrp, populated);
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset. If @task didn't belong to any
+ * css_set, @from_cset can be NULL. If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+			      struct css_set *from_cset, struct css_set *to_cset,
+			      bool use_mg_tasks)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	if (from_cset) {
+		struct css_task_iter *it, *pos;
+
+		WARN_ON_ONCE(list_empty(&task->cg_list));
+
+		/*
+		 * @task is leaving, advance task iterators which are
+		 * pointing to it so that they can resume at the next
+		 * position. Advancing an iterator might remove it from
+		 * the list, use safe walk. See css_task_iter_advance*()
+		 * for details.
+		 */
+		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+					 iters_node)
+			if (it->task_pos == &task->cg_list)
+				css_task_iter_advance(it);
+
+		list_del_init(&task->cg_list);
+		if (!css_set_populated(from_cset))
+			css_set_update_populated(from_cset, false);
+	} else {
+		WARN_ON_ONCE(!list_empty(&task->cg_list));
+	}
+
+	if (to_cset) {
+		/*
+		 * We are synchronized through cgroup_threadgroup_rwsem
+		 * against PF_EXITING setting such that we can't race
+		 * against cgroup_exit() changing the css_set to
+		 * init_css_set and dropping the old one.
+		 */
+		WARN_ON_ONCE(task->flags & PF_EXITING);
+
+		if (!css_set_populated(to_cset))
+			css_set_update_populated(to_cset, true);
+		rcu_assign_pointer(task->cgroups, to_cset);
+		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+							     &to_cset->tasks);
+	}
+}
+
 /*
  * hash table for cgroup groups. This improves the performance to find
  * an existing css_set. This hash doesn't (currently) take into
@@ -549,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (!atomic_dec_and_test(&cset->refcount))
 		return;
@@ -561,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
 	css_set_count--;
 
 	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *cgrp = link->cgrp;
-
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
-
-		/* @cgrp can't go away while we're holding css_set_rwsem */
-		if (list_empty(&cgrp->cset_links)) {
-			cgroup_update_populated(cgrp, false);
-			check_for_release(cgrp);
-		}
-
+		if (cgroup_parent(link->cgrp))
+			cgroup_put(link->cgrp);
 		kfree(link);
 	}
 
@@ -588,9 +781,9 @@ static void put_css_set(struct css_set *cset)
 	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	put_css_set_locked(cset);
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /*
@@ -779,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	link->cset = cset;
 	link->cgrp = cgrp;
 
-	if (list_empty(&cgrp->cset_links))
-		cgroup_update_populated(cgrp, true);
-	list_move(&link->cset_link, &cgrp->cset_links);
-
 	/*
-	 * Always add links to the tail of the list so that the list
-	 * is sorted by order of hierarchy creation
+	 * Always add links to the tail of the lists so that the lists are
+	 * in chronological order.
 	 */
+	list_move_tail(&link->cset_link, &cgrp->cset_links);
 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+	if (cgroup_parent(cgrp))
+		cgroup_get(cgrp);
 }
 
 /**
@@ -813,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	if (cset)
 		return cset;
@@ -838,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->mg_preload_node);
 	INIT_LIST_HEAD(&cset->mg_node);
+	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -866,53 +1060,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 			list_add_tail(&cset->e_cset_node[ssid],
 				      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	return cset;
 }
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid. This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization. While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -972,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
 		kfree(link);
 	}
-	up_write(&css_set_rwsem);
+
+	spin_unlock_bh(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
@@ -1001,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 	struct cgroup *res = NULL;
 
 	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (cset == &init_css_set) {
 		res = &root->cgrp;
@@ -1024,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
+ * called with cgroup_mutex and css_set_lock held.
  */
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroup_root *root)
@@ -1063,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -1086,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
  *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * S_IRUGO for read, S_IWUSR for write.
  */
 static umode_t cgroup_file_mode(const struct cftype *cft)
 {
 	umode_t mode = 0;
 
-	if (cft->mode)
-		return cft->mode;
-
 	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 		mode |= S_IRUGO;
 
-	if (cft->write_u64 || cft->write_s64 || cft->write)
-		mode |= S_IWUSR;
+	if (cft->write_u64 || cft->write_s64 || cft->write) {
+		if (cft->flags & CFTYPE_WORLD_WRITABLE)
+			mode |= S_IWUGO;
+		else
+			mode |= S_IWUSR;
+	}
 
 	return mode;
 }
 
-static void cgroup_get(struct cgroup *cgrp)
-{
-	WARN_ON_ONCE(cgroup_is_dead(cgrp));
-	css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
-	return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
-	css_put(&cgrp->self);
-}
-
 /**
  * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
  * @cgrp: the target cgroup
@@ -1263,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_dir - remove subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be removed
+ * css_clear_dir - remove subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void css_clear_dir(struct cgroup_subsys_state *css,
+			  struct cgroup *cgrp_override)
 {
-	struct cgroup_subsys *ss;
-	int i;
+	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+	struct cftype *cfts;
 
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
+	list_for_each_entry(cfts, &css->ss->cfts, node)
+		cgroup_addrm_files(css, cgrp, cfts, false);
+}
 
-		if (!(subsys_mask & (1 << i)))
-			continue;
-		list_for_each_entry(cfts, &ss->cfts, node)
-			cgroup_addrm_files(cgrp, cfts, false);
+/**
+ * css_populate_dir - create subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
+ *
+ * On failure, no file is added.
+ */
+static int css_populate_dir(struct cgroup_subsys_state *css,
+			    struct cgroup *cgrp_override)
+{
+	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+	struct cftype *cfts, *failed_cfts;
+	int ret;
+
+	if (!css->ss) {
+		if (cgroup_on_dfl(cgrp))
+			cfts = cgroup_dfl_base_files;
+		else
+			cfts = cgroup_legacy_base_files;
+
+		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+	}
+
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		ret = cgroup_addrm_files(css, cgrp, cfts, true);
+		if (ret < 0) {
+			failed_cfts = cfts;
+			goto err;
+		}
 	}
+	return 0;
+err:
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		if (cfts == failed_cfts)
+			break;
+		cgroup_addrm_files(css, cgrp, cfts, false);
+	}
+	return ret;
 }
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask)
 {
+	struct cgroup *dcgrp = &dst_root->cgrp;
 	struct cgroup_subsys *ss;
 	unsigned long tmp_ss_mask;
 	int ssid, i, ret;
@@ -1306,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 	if (dst_root == &cgrp_dfl_root)
 		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
 
-	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
-	if (ret) {
-		if (dst_root != &cgrp_dfl_root)
-			return ret;
+	for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+		struct cgroup *scgrp = &ss->root->cgrp;
+		int tssid;
+
+		ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
+		if (!ret)
+			continue;
 
 		/*
 		 * Rebinding back to the default root is not allowed to
@@ -1317,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 		 * be rare. Moving subsystems back and forth even more so.
 		 * Just warn about it and continue.
 		 */
-		if (cgrp_dfl_root_visible) {
-			pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
-				ret, ss_mask);
-			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+		if (dst_root == &cgrp_dfl_root) {
+			if (cgrp_dfl_root_visible) {
+				pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
+					ret, ss_mask);
+				pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+			}
+			continue;
+		}
+
+		for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
+			if (tssid == ssid)
+				break;
+			css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
 		}
+		return ret;
 	}
 
 	/*
 	 * Nothing can fail from this point on. Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
-	for_each_subsys_which(ss, ssid, &ss_mask)
-		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-
 	for_each_subsys_which(ss, ssid, &ss_mask) {
-		struct cgroup_root *src_root;
-		struct cgroup_subsys_state *css;
+		struct cgroup_root *src_root = ss->root;
+		struct cgroup *scgrp = &src_root->cgrp;
+		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
 		struct css_set *cset;
 
-		src_root = ss->root;
-		css = cgroup_css(&src_root->cgrp, ss);
+		WARN_ON(!css || cgroup_css(dcgrp, ss));
 
-		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+		css_clear_dir(css, NULL);
 
-		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
-		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
+		rcu_assign_pointer(dcgrp->subsys[ssid], css);
 		ss->root = dst_root;
-		css->cgroup = &dst_root->cgrp;
+		css->cgroup = dcgrp;
 
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		hash_for_each(css_set_table, i, cset, hlist)
 			list_move_tail(&cset->e_cset_node[ss->id],
-				       &dst_root->cgrp.e_csets[ss->id]);
-		up_write(&css_set_rwsem);
+				       &dcgrp->e_csets[ss->id]);
+		spin_unlock_bh(&css_set_lock);
 
 		src_root->subsys_mask &= ~(1 << ssid);
-		src_root->cgrp.subtree_control &= ~(1 << ssid);
-		cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+		scgrp->subtree_control &= ~(1 << ssid);
+		cgroup_refresh_child_subsys_mask(scgrp);
 
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
-		if (dst_root != &cgrp_dfl_root) {
-			dst_root->cgrp.subtree_control |= 1 << ssid;
-			cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+		if (dst_root == &cgrp_dfl_root) {
+			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
+		} else {
+			dcgrp->subtree_control |= 1 << ssid;
+			cgroup_refresh_child_subsys_mask(dcgrp);
+			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
 		}
 
 		if (ss->bind)
 			ss->bind(css);
 	}
 
-	kernfs_activate(dst_root->cgrp.kn);
+	kernfs_activate(dcgrp->kn);
 	return 0;
 }
 
@@ -1497,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	for_each_subsys(ss, i) {
 		if (strcmp(token, ss->legacy_name))
 			continue;
-		if (ss->disabled)
+		if (!cgroup_ssid_enabled(i))
 			continue;
 
 		/* Mutually exclusive option 'all' + subsystem name */
@@ -1528,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 */
 	if (all_ss || (!one_ss && !opts->none && !opts->name))
 		for_each_subsys(ss, i)
-			if (!ss->disabled)
+			if (cgroup_ssid_enabled(i))
 				opts->subsys_mask |= (1 << i);
 
 	/*
@@ -1624,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	if (use_task_css_set_links)
 		goto out_unlock;
@@ -1654,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
 		if (!(p->flags & PF_EXITING)) {
 			struct css_set *cset = task_css_set(p);
 
-			list_add(&p->cg_list, &cset->tasks);
+			if (!css_set_populated(cset))
+				css_set_update_populated(cset, true);
+			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
 		spin_unlock_irq(&p->sighand->siglock);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 out_unlock:
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1671,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
 	INIT_LIST_HEAD(&cgrp->self.sibling);
 	INIT_LIST_HEAD(&cgrp->self.children);
+	INIT_LIST_HEAD(&cgrp->self.files);
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -1708,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
-	struct cftype *base_files;
 	struct css_set *cset;
 	int i, ret;
 
@@ -1725,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 		goto out;
 
 	/*
-	 * We're accessing css_set_count without locking css_set_rwsem here,
+	 * We're accessing css_set_count without locking css_set_lock here,
 	 * but that's OK - it can only be increased by someone holding
 	 * cgroup_lock, and that's us. The worst that can happen is that we
 	 * have some link structures left over
@@ -1747,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	}
 	root_cgrp->kn = root->kf_root->kn;
 
-	if (root == &cgrp_dfl_root)
-		base_files = cgroup_dfl_base_files;
-	else
-		base_files = cgroup_legacy_base_files;
-
-	ret = cgroup_addrm_files(root_cgrp, base_files, true);
+	ret = css_populate_dir(&root_cgrp->self, NULL);
 	if (ret)
 		goto destroy_root;
 
@@ -1772,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	 * Link the root cgroup in this hierarchy into all the css_set
 	 * objects.
 	 */
-	down_write(&css_set_rwsem);
-	hash_for_each(css_set_table, i, cset, hlist)
+	spin_lock_bh(&css_set_lock);
+	hash_for_each(css_set_table, i, cset, hlist) {
 		link_css_set(&tmp_links, cset, root_cgrp);
-	up_write(&css_set_rwsem);
+		if (css_set_populated(cset))
+			cgroup_update_populated(root_cgrp, true);
+	}
+	spin_unlock_bh(&css_set_lock);
 
 	BUG_ON(!list_empty(&root_cgrp->self.children));
 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2008,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 	char *path = NULL;
 
 	mutex_lock(&cgroup_mutex);
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
 
@@ -2021,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 		path = buf;
 	}
 
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	return path;
 }
@@ -2049,6 +2232,49 @@ struct cgroup_taskset {
 	struct task_struct	*cur_task;
 };
 
+#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
+	.csets			= &tset.src_csets,			\
+}
+
+/**
+ * cgroup_taskset_add - try to add a migration target task to a taskset
+ * @task: target task
+ * @tset: target taskset
+ *
+ * Add @task, which is a migration target, to @tset. This function becomes
+ * a noop if @task doesn't need to be migrated. @task's css_set should have
+ * been added as a migration source and @task->cg_list will be moved from
+ * the css_set's tasks list to mg_tasks one.
+ */
+static void cgroup_taskset_add(struct task_struct *task,
+			       struct cgroup_taskset *tset)
+{
+	struct css_set *cset;
+
+	lockdep_assert_held(&css_set_lock);
+
+	/* @task either already exited or can't exit until the end */
+	if (task->flags & PF_EXITING)
+		return;
+
+	/* leave @task alone if post_fork() hasn't linked it yet */
+	if (list_empty(&task->cg_list))
+		return;
+
+	cset = task_css_set(task);
+	if (!cset->mg_src_cgrp)
+		return;
+
+	list_move_tail(&task->cg_list, &cset->mg_tasks);
+	if (list_empty(&cset->mg_node))
+		list_add_tail(&cset->mg_node, &tset->src_csets);
+	if (list_empty(&cset->mg_dst_cset->mg_node))
+		list_move_tail(&cset->mg_dst_cset->mg_node,
+			       &tset->dst_csets);
+}
+
 /**
  * cgroup_taskset_first - reset taskset and return the first task
  * @tset: taskset of interest
@@ -2096,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 }
 
 /**
- * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp: the cgroup @tsk is being migrated from
- * @tsk: the task being migrated
- * @new_cset: the new css_set @tsk is being attached to
+ * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * @tset: target taskset
+ * @dst_cgrp: destination cgroup
  *
- * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
+ * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
+ * ->can_attach callbacks fails and guarantees that either all or none of
+ * the tasks in @tset are migrated. @tset is consumed regardless of
+ * success.
  */
-static void cgroup_task_migrate(struct cgroup *old_cgrp,
-				struct task_struct *tsk,
-				struct css_set *new_cset)
+static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
+				  struct cgroup *dst_cgrp)
 {
-	struct css_set *old_cset;
-
-	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
-
-	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
-	 */
-	WARN_ON_ONCE(tsk->flags & PF_EXITING);
-	old_cset = task_css_set(tsk);
+	struct cgroup_subsys_state *css, *failed_css = NULL;
+	struct task_struct *task, *tmp_task;
+	struct css_set *cset, *tmp_cset;
+	int i, ret;
 
-	get_css_set(new_cset);
-	rcu_assign_pointer(tsk->cgroups, new_cset);
+	/* methods shouldn't be called if no task is actually migrating */
+	if (list_empty(&tset->src_csets))
+		return 0;
 
-	/*
-	 * Use move_tail so that cgroup_taskset_first() still returns the
-	 * leader after migration. This works because cgroup_migrate()
-	 * ensures that the dst_cset of the leader is the first on the
-	 * tset's dst_csets list.
-	 */
-	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+	/* check that we can legitimately attach to the cgroup */
+	for_each_e_css(css, i, dst_cgrp) {
+		if (css->ss->can_attach) {
+			ret = css->ss->can_attach(css, tset);
+			if (ret) {
+				failed_css = css;
+				goto out_cancel_attach;
+			}
+		}
+	}
 
 	/*
-	 * We just gained a reference on old_cset by taking it from the
-	 * task. As trading it for new_cset is protected by cgroup_mutex,
-	 * we're safe to drop it here; it will be freed under RCU.
+	 * Now that we're guaranteed success, proceed to move all tasks to
+	 * the new cgroup. There are no failure cases after here, so this
+	 * is the commit point.
 	 */
-	put_css_set_locked(old_cset);
+	spin_lock_bh(&css_set_lock);
+	list_for_each_entry(cset, &tset->src_csets, mg_node) {
+		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+			struct css_set *from_cset = task_css_set(task);
+			struct css_set *to_cset = cset->mg_dst_cset;
+
+			get_css_set(to_cset);
+			css_set_move_task(task, from_cset, to_cset, true);
+			put_css_set_locked(from_cset);
+		}
+	}
+	spin_unlock_bh(&css_set_lock);
+
+	/*
+	 * Migration is committed, all target tasks are now on dst_csets.
+	 * Nothing is sensitive to fork() after this point. Notify
+	 * controllers that migration is complete.
+	 */
+	tset->csets = &tset->dst_csets;
+
+	for_each_e_css(css, i, dst_cgrp)
+		if (css->ss->attach)
+			css->ss->attach(css, tset);
+
+	ret = 0;
+	goto out_release_tset;
+
+out_cancel_attach:
+	for_each_e_css(css, i, dst_cgrp) {
+		if (css == failed_css)
+			break;
+		if (css->ss->cancel_attach)
+			css->ss->cancel_attach(css, tset);
+	}
+out_release_tset:
+	spin_lock_bh(&css_set_lock);
+	list_splice_init(&tset->dst_csets, &tset->src_csets);
+	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
+		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+		list_del_init(&cset->mg_node);
+	}
+	spin_unlock_bh(&css_set_lock);
+	return ret;
 }
@@ -2152,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -2172,10 +2437,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2184,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 	struct cgroup *src_cgrp;
 
 	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
@@ -2273,12 +2539,12 @@ err:
 
 /**
  * cgroup_migrate - migrate a process or task to a cgroup
- * @cgrp: the destination cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
+ * @cgrp: the destination cgroup
  *
  * Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
+ * process, the caller must be holding cgroup_threadgroup_rwsem. The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2289,115 +2555,29 @@ err:
  * decided for all targets by invoking group_migrate_prepare_dst() before
  * actually starting migrating.
  */
-static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
-			  bool threadgroup)
+static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+			  struct cgroup *cgrp)
 {
-	struct cgroup_taskset tset = {
-		.src_csets = LIST_HEAD_INIT(tset.src_csets),
-		.dst_csets = LIST_HEAD_INIT(tset.dst_csets),
-		.csets = &tset.src_csets,
-	};
-	struct cgroup_subsys_state *css, *failed_css = NULL;
-	struct css_set *cset, *tmp_cset;
-	struct task_struct *task, *tmp_task;
-	int i, ret;
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+	struct task_struct *task;
 
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
-		/* @task either already exited or can't exit until the end */
-		if (task->flags & PF_EXITING)
-			goto next;
-
-		/* leave @task alone if post_fork() hasn't linked it yet */
-		if (list_empty(&task->cg_list))
-			goto next;
-
-		cset = task_css_set(task);
-		if (!cset->mg_src_cgrp)
-			goto next;
-
-		/*
-		 * cgroup_taskset_first() must always return the leader.
-		 * Take care to avoid disturbing the ordering.
-		 */
-		list_move_tail(&task->cg_list, &cset->mg_tasks);
-		if (list_empty(&cset->mg_node))
-			list_add_tail(&cset->mg_node, &tset.src_csets);
-		if (list_empty(&cset->mg_dst_cset->mg_node))
-			list_move_tail(&cset->mg_dst_cset->mg_node,
-				       &tset.dst_csets);
-	next:
+		cgroup_taskset_add(task, &tset);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_write(&css_set_rwsem);
-
-	/* methods shouldn't be called if no task is actually migrating */
-	if (list_empty(&tset.src_csets))
-		return 0;
-
-	/* check that we can legitimately attach to the cgroup */
-	for_each_e_css(css, i, cgrp) {
-		if (css->ss->can_attach) {
-			ret = css->ss->can_attach(css, &tset);
-			if (ret) {
-				failed_css = css;
-				goto out_cancel_attach;
-			}
-		}
-	}
-
-	/*
-	 * Now that we're guaranteed success, proceed to move all tasks to
-	 * the new cgroup. There are no failure cases after here, so this
-	 * is the commit point.
-	 */
-	down_write(&css_set_rwsem);
-	list_for_each_entry(cset, &tset.src_csets, mg_node) {
-		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
-			cgroup_task_migrate(cset->mg_src_cgrp, task,
-					    cset->mg_dst_cset);
-	}
-	up_write(&css_set_rwsem);
-
-	/*
-	 * Migration is committed, all target tasks are now on dst_csets.
-	 * Nothing is sensitive to fork() after this point. Notify
-	 * controllers that migration is complete.
-	 */
-	tset.csets = &tset.dst_csets;
-
-	for_each_e_css(css, i, cgrp)
-		if (css->ss->attach)
-			css->ss->attach(css, &tset);
-
-	ret = 0;
-	goto out_release_tset;
+	spin_unlock_bh(&css_set_lock);
 
-out_cancel_attach:
-	for_each_e_css(css, i, cgrp) {
-		if (css == failed_css)
-			break;
-		if (css->ss->cancel_attach)
-			css->ss->cancel_attach(css, &tset);
-	}
-out_release_tset:
-	down_write(&css_set_rwsem);
-	list_splice_init(&tset.dst_csets, &tset.src_csets);
-	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
-		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
-		list_del_init(&cset->mg_node);
-	}
-	up_write(&css_set_rwsem);
-	return ret;
+	return cgroup_taskset_migrate(&tset, cgrp);
 }
 
 /**
@@ -2406,7 +2586,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2416,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	int ret;
 
 	/* look up all src csets */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2426,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/* prepare dst csets and commit */
 	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
 	if (!ret)
-		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
 
 	cgroup_migrate_finish(&preloaded_csets);
 	return ret;
@@ -2459,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 	struct cgroup *cgrp;
 	struct inode *inode;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	while (!cgroup_is_descendant(dst_cgrp, cgrp))
 		cgrp = cgroup_parent(cgrp);
 
 	ret = -ENOMEM;
-	inode = kernfs_get_inode(sb, cgrp->procs_kn);
+	inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
 	if (inode) {
 		ret = inode_permission(inode, MAY_WRITE);
 		iput(inode);
@@ -2498,14 +2678,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	if (!cgrp)
 		return -ENODEV;
 
-retry_find_task:
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
-			rcu_read_unlock();
 			ret = -ESRCH;
-			goto out_unlock_cgroup;
+			goto out_unlock_rcu;
 		}
 	} else {
 		tsk = current;
@@ -2521,37 +2700,23 @@ retry_find_task:
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
-		rcu_read_unlock();
-		goto out_unlock_cgroup;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
-	if (threadgroup) {
-		if (!thread_group_leader(tsk)) {
-			/*
-			 * a race with de_thread from another thread's exec()
-			 * may strip us of our leadership, if this happens,
-			 * there is no choice but to throw this task away and
-			 * try again; this is
-			 * "double-double-toil-and-trouble-check locking".
-			 */
-			threadgroup_unlock(tsk);
-			put_task_struct(tsk);
-			goto retry_find_task;
-		}
-	}
-
 	ret = cgroup_procs_write_permission(tsk, cgrp, of);
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	threadgroup_unlock(tsk);
-
 	put_task_struct(tsk);
-out_unlock_cgroup:
+	goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+	rcu_read_unlock();
+out_unlock_threadgroup:
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
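With cgroup_threadgroup_rwsem held for writing across the whole operation, fork and exec cannot change the threadgroup underneath the attach, which is why the retry_find_task loop and its leadership re-check disappear above. A rough userspace analogue of the locking split, using a pthread rwlock in place of the kernel's percpu_rwsem (illustrative only; the real read side is per-CPU and nearly free):

#include <pthread.h>

static pthread_rwlock_t threadgroup_lock = PTHREAD_RWLOCK_INITIALIZER;

static void fork_path(void)
{
	pthread_rwlock_rdlock(&threadgroup_lock);	/* cheap, concurrent */
	/* ... link the new task into its group ... */
	pthread_rwlock_unlock(&threadgroup_lock);
}

static int migrate_path(void)
{
	int ret = 0;

	pthread_rwlock_wrlock(&threadgroup_lock);	/* excludes all forks */
	/* ... look up the leader and attach the whole group; the group
	 *     cannot change under us, so no leadership re-check ... */
	pthread_rwlock_unlock(&threadgroup_lock);
	return ret;
}
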
@@ -2573,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (root == &cgrp_dfl_root)
 			continue;
 
-		down_read(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		from_cgrp = task_cgroup_from_root(from, root);
-		up_read(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 
 		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
@@ -2690,14 +2855,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
 	LIST_HEAD(preloaded_csets);
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct cgroup_subsys_state *css;
 	struct css_set *src_cset;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
 	/* look up all csses currently attached to @cgrp's subtree */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
 		struct cgrp_cset_link *link;
 
@@ -2709,68 +2877,31 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			cgroup_migrate_add_src(link->cset, cgrp,
 					       &preloaded_csets);
 	}
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
 	if (ret)
 		goto out_finish;
 
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
-		struct task_struct *last_task = NULL, *task;
+		struct task_struct *task, *ntask;
 
 		/* src_csets precede dst_csets, break on the first dst_cset */
 		if (!src_cset->mg_src_cgrp)
 			break;
 
-		/*
-		 * All tasks in src_cset need to be migrated to the
-		 * matching dst_cset.  Empty it process by process.  We
-		 * walk tasks but migrate processes.  The leader might even
-		 * belong to a different cset but such src_cset would also
-		 * be among the target src_csets because the default
-		 * hierarchy enforces per-process membership.
-		 */
-		while (true) {
-			down_read(&css_set_rwsem);
-			task = list_first_entry_or_null(&src_cset->tasks,
-						struct task_struct, cg_list);
-			if (task) {
-				task = task->group_leader;
-				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
-				get_task_struct(task);
-			}
-			up_read(&css_set_rwsem);
-
-			if (!task)
-				break;
-
-			/* guard against possible infinite loop */
-			if (WARN(last_task == task,
-				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
-				goto out_finish;
-			last_task = task;
-
-			threadgroup_lock(task);
-			/* raced against de_thread() from another thread? */
-			if (!thread_group_leader(task)) {
-				threadgroup_unlock(task);
-				put_task_struct(task);
-				continue;
-			}
-
-			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
-
-			threadgroup_unlock(task);
-			put_task_struct(task);
-
-			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
-				goto out_finish;
-		}
+		/* all tasks in src_csets need to be migrated */
+		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+			cgroup_taskset_add(task, &tset);
 	}
+	spin_unlock_bh(&css_set_lock);
 
+	ret = cgroup_taskset_migrate(&tset, cgrp);
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 
@@ -2797,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 		if (tok[0] == '\0')
 			continue;
 		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
-			if (ss->disabled || strcmp(tok + 1, ss->name))
+			if (!cgroup_ssid_enabled(ssid) ||
+			    strcmp(tok + 1, ss->name))
 				continue;
 
 			if (*tok == '+') {
@@ -2921,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 				ret = create_css(child, ss,
 					cgrp->subtree_control & (1 << ssid));
 			else
-				ret = cgroup_populate_dir(child, 1 << ssid);
+				ret = css_populate_dir(cgroup_css(child, ss),
+						       NULL);
 			if (ret)
 				goto err_undo_css;
 		}
@@ -2954,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 			if (css_disable & (1 << ssid)) {
 				kill_css(css);
 			} else {
-				cgroup_clear_dir(child, 1 << ssid);
+				css_clear_dir(css, NULL);
 				if (ss->css_reset)
 					ss->css_reset(css);
 			}
@@ -3002,15 +3135,16 @@ err_undo_css:
 			if (css_enable & (1 << ssid))
 				kill_css(css);
 			else
-				cgroup_clear_dir(child, 1 << ssid);
+				css_clear_dir(css, NULL);
 		}
 	}
 	goto out_unlock;
 }
 
-static int cgroup_populated_show(struct seq_file *seq, void *v)
+static int cgroup_events_show(struct seq_file *seq, void *v)
 {
-	seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+	seq_printf(seq, "populated %d\n",
+		   cgroup_is_populated(seq_css(seq)->cgroup));
 	return 0;
 }
 
@@ -3153,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 	return kernfs_setattr(kn, &iattr);
 }
 
-static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+			   struct cftype *cft)
 {
 	char name[CGROUP_FILE_NAME_MAX];
 	struct kernfs_node *kn;
@@ -3175,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 		return ret;
 	}
 
-	if (cft->write == cgroup_procs_write)
-		cgrp->procs_kn = kn;
-	else if (cft->seq_show == cgroup_populated_show)
-		cgrp->populated_kn = kn;
+	if (cft->file_offset) {
+		struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+		kernfs_get(kn);
+		cfile->kn = kn;
+		list_add(&cfile->node, &css->files);
+	}
+
 	return 0;
 }
 
 /**
  * cgroup_addrm_files - add or remove files to a cgroup directory
- * @cgrp: the target cgroup
+ * @css: the target css
+ * @cgrp: the target cgroup (usually css->cgroup)
  * @cfts: array of cftypes to be added
  * @is_add: whether to add or remove
  *
  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * For removals, this function never fails.  If addition fails, this
- * function doesn't remove files already added.  The caller is responsible
- * for cleaning up.
+ * For removals, this function never fails.
  */
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add)
 {
-	struct cftype *cft;
+	struct cftype *cft, *cft_end = NULL;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+restart:
+	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
@@ -3213,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			continue;
 
 		if (is_add) {
-			ret = cgroup_add_file(cgrp, cft);
+			ret = cgroup_add_file(css, cgrp, cft);
 			if (ret) {
 				pr_warn("%s: failed to add %s, err=%d\n",
 					__func__, cft->name, ret);
-				return ret;
+				cft_end = cft;
+				is_add = false;
+				goto restart;
 			}
 		} else {
 			cgroup_rm_file(cgrp, cft);
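Note how cgroup_addrm_files() now cleans up after itself: on a failed add it records the failing entry in cft_end, flips is_add, and restarts the loop in removal mode, which is why the "caller is responsible for cleaning up" contract could be deleted from the comment above. The idiom in isolation (item/add_item/remove_item are hypothetical, sketch only):

struct item { const char *name; int added; };

static int add_item(struct item *it)     { it->added = 1; return 0; }
static void remove_item(struct item *it) { it->added = 0; }

static int addrm(struct item *items, int n, int is_add)
{
	int i, end = n, ret = 0;
restart:
	for (i = 0; i < end; i++) {
		if (is_add) {
			ret = add_item(&items[i]);
			if (ret) {
				end = i;	/* stop before the failure */
				is_add = 0;	/* and undo what was added */
				goto restart;
			}
		} else {
			remove_item(&items[i]);
		}
	}
	return ret;	/* 0, or the error that triggered the rollback */
}
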
@@ -3243,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 		if (cgroup_is_dead(cgrp))
 			continue;
 
-		ret = cgroup_addrm_files(cgrp, cfts, is_add);
+		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
 		if (ret)
 			break;
 	}
@@ -3355,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	int ret;
 
-	if (ss->disabled)
+	if (!cgroup_ssid_enabled(ss->id))
 		return 0;
 
 	if (!cfts || cfts[0].name[0] == '\0')
@@ -3405,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype *cft;
 
-	/*
-	 * If legacy_files_on_dfl, we want to show the legacy files on the
-	 * dfl hierarchy but iff the target subsystem hasn't been updated
-	 * for the dfl hierarchy yet.
-	 */
-	if (!cgroup_legacy_files_on_dfl ||
-	    ss->dfl_cftypes != ss->legacy_cftypes) {
-		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-			cft->flags |= __CFTYPE_NOT_ON_DFL;
-	}
-
+	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+		cft->flags |= __CFTYPE_NOT_ON_DFL;
 	return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -3430,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 	int count = 0;
 	struct cgrp_cset_link *link;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return count;
 }
 
@@ -3665,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 }
 
 /**
- * css_advance_task_iter - advance a task iterator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
  * @it: the iterator to advance
  *
  * Advance @it to the next css_set to walk.
  */
-static void css_advance_task_iter(struct css_task_iter *it)
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
 {
 	struct list_head *l = it->cset_pos;
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
+	lockdep_assert_held(&css_set_lock);
+
 	/* Advance to the next non-empty css_set */
 	do {
 		l = l->next;
 		if (l == it->cset_head) {
 			it->cset_pos = NULL;
+			it->task_pos = NULL;
 			return;
 		}
 
@@ -3691,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
 			link = list_entry(l, struct cgrp_cset_link, cset_link);
 			cset = link->cset;
 		}
-	} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+	} while (!css_set_populated(cset));
 
 	it->cset_pos = l;
 
@@ -3702,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
+
+	/*
+	 * We don't keep css_sets locked across iteration steps and thus
+	 * need to take steps to ensure that iteration can be resumed after
+	 * the lock is re-acquired.  Iteration is performed at two levels -
+	 * css_sets and tasks in them.
+	 *
+	 * Once created, a css_set never leaves its cgroup lists, so a
+	 * pinned css_set is guaranteed to stay put and we can resume
+	 * iteration afterwards.
+	 *
+	 * Tasks may leave @cset across iteration steps.  This is resolved
+	 * by registering each iterator with the css_set currently being
+	 * walked and making css_set_move_task() advance iterators whose
+	 * next task is leaving.
+	 */
+	if (it->cur_cset) {
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+	}
+	get_css_set(cset);
+	it->cur_cset = cset;
+	list_add(&it->iters_node, &cset->task_iters);
+}
+
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+	struct list_head *l = it->task_pos;
+
+	lockdep_assert_held(&css_set_lock);
+	WARN_ON_ONCE(!l);
+
+	/*
+	 * Advance iterator to find next entry.  cset->tasks is consumed
+	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
+	 * next cset.
+	 */
+	l = l->next;
+
+	if (l == it->tasks_head)
+		l = it->mg_tasks_head->next;
+
+	if (l == it->mg_tasks_head)
+		css_task_iter_advance_css_set(it);
+	else
+		it->task_pos = l;
 }
 
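The comment block added above is the heart of the new iteration scheme: since css_set_lock is dropped between steps, each live iterator pins its current css_set and registers itself on the set's task_iters list so that css_set_move_task() can advance any iterator whose next task is about to leave. The same cursor-registration pattern in a self-contained sketch (plain singly linked list, no kernel types):

struct node   { struct node *next; };
struct cursor { struct node *pos; struct cursor *next_cursor; };

static struct cursor *active_cursors;	/* all registered cursors */

static void remove_node(struct node **head, struct node *victim)
{
	struct node **pp;
	struct cursor *c;

	/* advance any cursor parked on @victim before unlinking it */
	for (c = active_cursors; c; c = c->next_cursor)
		if (c->pos == victim)
			c->pos = victim->next;

	for (pp = head; *pp; pp = &(*pp)->next)
		if (*pp == victim) {
			*pp = victim->next;
			break;
		}
}
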
 /**
@@ -3713,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes.  The caller can't sleep while iteration is in
- * progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it)
-	__acquires(css_set_rwsem)
 {
 	/* no one should try to iterate before mounting cgroups */
 	WARN_ON_ONCE(!use_task_css_set_links);
 
-	down_read(&css_set_rwsem);
+	memset(it, 0, sizeof(*it));
+
+	spin_lock_bh(&css_set_lock);
 
 	it->ss = css->ss;
 
@@ -3736,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	it->cset_head = it->cset_pos;
 
-	css_advance_task_iter(it);
+	css_task_iter_advance_css_set(it);
+
+	spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -3749,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-	struct task_struct *res;
-	struct list_head *l = it->task_pos;
-
-	/* If the iterator cg is NULL, we have no tasks */
-	if (!it->cset_pos)
-		return NULL;
-	res = list_entry(l, struct task_struct, cg_list);
-
-	/*
-	 * Advance iterator to find next entry.  cset->tasks is consumed
-	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
-	 * next cset.
-	 */
-	l = l->next;
-
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
-
-	if (l == it->mg_tasks_head)
-		css_advance_task_iter(it);
-	else
-		it->task_pos = l;
-
-	return res;
+	if (it->cur_task) {
+		put_task_struct(it->cur_task);
+		it->cur_task = NULL;
+	}
+
+	spin_lock_bh(&css_set_lock);
+
+	if (it->task_pos) {
+		it->cur_task = list_entry(it->task_pos, struct task_struct,
+					  cg_list);
+		get_task_struct(it->cur_task);
+		css_task_iter_advance(it);
+	}
+
+	spin_unlock_bh(&css_set_lock);
+
+	return it->cur_task;
 }
 
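A practical consequence of the rewrite above: the lock is now confined to each css_task_iter_next() call and the returned task is pinned by the iterator, so callers may block between steps. A usage fragment for the three iterator entry points shown in this patch (assumes a struct cgroup_subsys_state *css in scope; process_one() is a hypothetical stand-in for caller work):

	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it))) {
		/* @task stays pinned until the next step or
		 * css_task_iter_end(); sleeping here is now permitted */
		process_one(task);
	}
	css_task_iter_end(&it);
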
 /**
@@ -3782,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  * Finish task iteration started by css_task_iter_start().
  */
 void css_task_iter_end(struct css_task_iter *it)
-	__releases(css_set_rwsem)
 {
-	up_read(&css_set_rwsem);
+	if (it->cur_cset) {
+		spin_lock_bh(&css_set_lock);
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+		spin_unlock_bh(&css_set_lock);
+	}
+
+	if (it->cur_task)
+		put_task_struct(it->cur_task);
 }
 
 /**
@@ -3809,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	mutex_lock(&cgroup_mutex);
 
 	/* all tasks in @from are being moved, all csets are source */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
 	if (ret)
@@ -3830,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 		css_task_iter_end(&it);
 
 		if (task) {
-			ret = cgroup_migrate(to, task, false);
+			ret = cgroup_migrate(task, false, to);
 			put_task_struct(task);
 		}
 	} while (task && !ret);
@@ -4327,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 static struct cftype cgroup_dfl_base_files[] = {
 	{
 		.name = "cgroup.procs",
+		.file_offset = offsetof(struct cgroup, procs_file),
 		.seq_start = cgroup_pidlist_start,
 		.seq_next = cgroup_pidlist_next,
 		.seq_stop = cgroup_pidlist_stop,
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "cgroup.controllers",
@@ -4351,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
 		.write = cgroup_subtree_control_write,
 	},
 	{
-		.name = "cgroup.populated",
+		.name = "cgroup.events",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cgroup_populated_show,
+		.file_offset = offsetof(struct cgroup, events_file),
+		.seq_show = cgroup_events_show,
 	},
 	{ }	/* terminate */
 };
@@ -4368,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "cgroup.clone_children",
@@ -4388,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_TASKS,
 		.write = cgroup_tasks_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
@@ -4405,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 	{ }	/* terminate */
 };
 
-/**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
- *
- * On failure, no file is added.
- */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i, ret = 0;
-
-	/* process cftsets of each subsystem */
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
-
-		if (!(subsys_mask & (1 << i)))
-			continue;
-
-		list_for_each_entry(cfts, &ss->cfts, node) {
-			ret = cgroup_addrm_files(cgrp, cfts, true);
-			if (ret < 0)
-				goto err;
-		}
-	}
-	return 0;
-err:
-	cgroup_clear_dir(cgrp, subsys_mask);
-	return ret;
-}
-
 /*
  * css destruction is four-stage process.
  *
@@ -4464,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
+	struct cgroup_file *cfile;
 
 	percpu_ref_exit(&css->refcnt);
 
+	list_for_each_entry(cfile, &css->files, node)
+		kernfs_put(cfile->kn);
+
 	if (ss) {
 		/* css free path */
 		int id = css->id;
@@ -4571,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	css->ss = ss;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
+	INIT_LIST_HEAD(&css->files);
 	css->serial_nr = css_serial_nr_next++;
 
 	if (cgroup_parent(cgrp)) {
@@ -4653,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 	css->id = err;
 
 	if (visible) {
-		err = cgroup_populate_dir(cgrp, 1 << ss->id);
+		err = css_populate_dir(css, NULL);
 		if (err)
 			goto err_free_id;
 	}
@@ -4679,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 err_list_del:
 	list_del_rcu(&css->sibling);
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+	css_clear_dir(css, NULL);
 err_free_id:
 	cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
@@ -4696,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	struct cgroup_root *root;
 	struct cgroup_subsys *ss;
 	struct kernfs_node *kn;
-	struct cftype *base_files;
 	int ssid, ret;
 
 	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4772,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	if (ret)
 		goto out_destroy;
 
-	if (cgroup_on_dfl(cgrp))
-		base_files = cgroup_dfl_base_files;
-	else
-		base_files = cgroup_legacy_base_files;
-
-	ret = cgroup_addrm_files(cgrp, base_files, true);
+	ret = css_populate_dir(&cgrp->self, NULL);
 	if (ret)
 		goto out_destroy;
 
@@ -4864,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
 	 * This must happen before css is disassociated with its cgroup.
 	 * See seq_css() for details.
 	 */
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+	css_clear_dir(css, NULL);
 
 	/*
 	 * Killing would put the base ref, but we need to keep it alive
@@ -4913,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup_subsys_state *css;
-	bool empty;
 	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	/*
-	 * css_set_rwsem synchronizes access to ->cset_links and prevents
-	 * @cgrp from being removed while put_css_set() is in progress.
+	 * Only migration can raise populated from zero and we're already
+	 * holding cgroup_mutex.
 	 */
-	down_read(&css_set_rwsem);
-	empty = list_empty(&cgrp->cset_links);
-	up_read(&css_set_rwsem);
-	if (!empty)
+	if (cgroup_is_populated(cgrp))
 		return -EBUSY;
 
 	/*
@@ -5023,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_free_callback |= (bool)ss->free << ss->id;
 	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
@@ -5071,6 +5216,8 @@ int __init cgroup_init_early(void)
 	return 0;
 }
 
+static unsigned long cgroup_disable_mask __initdata;
+
 /**
  * cgroup_init - cgroup initialization
  *
@@ -5081,8 +5228,9 @@ int __init cgroup_init(void)
 {
 	struct cgroup_subsys *ss;
 	unsigned long key;
-	int ssid, err;
+	int ssid;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
@@ -5116,14 +5264,15 @@ int __init cgroup_init(void)
 		 * disabled flag and cftype registration needs kmalloc,
 		 * both of which aren't available during early_init.
 		 */
-		if (ss->disabled)
+		if (cgroup_disable_mask & (1 << ssid)) {
+			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
+			printk(KERN_INFO "Disabling %s control group subsystem\n",
+			       ss->name);
 			continue;
+		}
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
-			ss->dfl_cftypes = ss->legacy_cftypes;
-
 		if (!ss->dfl_cftypes)
 			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
 
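cgroup_ssid_enabled() sits on top of per-subsystem static keys, so the common "is this controller enabled?" test compiles down to a patchable jump rather than a memory load; boot-time cgroup_disable= handling simply flips the key once, as the hunk above shows. A sketch of the underlying jump-label pattern (the my_subsys_* names are invented; the static-branch API itself is the real kernel one used here):

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>

DEFINE_STATIC_KEY_TRUE(my_subsys_enabled);

static inline bool my_subsys_on(void)
{
	/* compiles to a no-op fall-through while the key is enabled */
	return static_branch_likely(&my_subsys_enabled);
}

static void __init my_subsys_boot_disable(void)
{
	/* patches every my_subsys_on() call site to take the off path */
	static_branch_disable(&my_subsys_enabled);
}
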
@@ -5138,17 +5287,10 @@ int __init cgroup_init(void)
 			ss->bind(init_css_set.subsys[ssid]);
 	}
 
-	err = sysfs_create_mount_point(fs_kobj, "cgroup");
-	if (err)
-		return err;
-
-	err = register_filesystem(&cgroup_fs_type);
-	if (err < 0) {
-		sysfs_remove_mount_point(fs_kobj, "cgroup");
-		return err;
-	}
-
-	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+	WARN_ON(register_filesystem(&cgroup_fs_type));
+	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
+
 	return 0;
 }
 
@@ -5195,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		goto out;
 
 	mutex_lock(&cgroup_mutex);
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	for_each_root(root) {
 		struct cgroup_subsys *ss;
@@ -5215,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
+
 		cgrp = task_cgroup_from_root(tsk, root);
-		path = cgroup_path(cgrp, buf, PATH_MAX);
-		if (!path) {
-			retval = -ENAMETOOLONG;
-			goto out_unlock;
+
+		/*
+		 * On traditional hierarchies, all zombie tasks show up as
+		 * belonging to the root cgroup.  On the default hierarchy,
+		 * while a zombie doesn't show up in "cgroup.procs" and
+		 * thus can't be migrated, its /proc/PID/cgroup keeps
+		 * reporting the cgroup it belonged to before exiting.  If
+		 * the cgroup is removed before the zombie is reaped,
+		 * " (deleted)" is appended to the cgroup path.
+		 */
+		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+			path = cgroup_path(cgrp, buf, PATH_MAX);
+			if (!path) {
+				retval = -ENAMETOOLONG;
+				goto out_unlock;
+			}
+		} else {
+			path = "/";
 		}
+
 		seq_puts(m, path);
-		seq_putc(m, '\n');
+
+		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+			seq_puts(m, " (deleted)\n");
+		else
+			seq_putc(m, '\n');
 	}
 
 	retval = 0;
 out_unlock:
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	kfree(buf);
 out:
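The visible effect on /proc/PID/cgroup for a zombie: a legacy hierarchy keeps reporting the root cgroup ("/"), while the default hierarchy keeps the pre-exit path and appends " (deleted)" once the cgroup is removed. A purely hypothetical illustration (hierarchy IDs, controller fields, and paths invented; the exact first fields vary by kernel and configuration):

	# cat /proc/$ZOMBIE_PID/cgroup		(legacy hierarchy)
	3:memory:/
	# cat /proc/$ZOMBIE_PID/cgroup		(default hierarchy, cgroup removed)
	0::/batch/job7 (deleted)
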
@@ -5251,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+			   atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
 
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -5372,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
 	 * @child during its iteration.
 	 *
 	 * If we won the race, @child is associated with %current's
-	 * css_set.  Grabbing css_set_rwsem guarantees both that the
+	 * css_set.  Grabbing css_set_lock guarantees both that the
 	 * association is stable, and, on completion of the parent's
 	 * migration, @child is visible in the source of migration or
 	 * already in the destination cgroup.  This guarantee is necessary
@@ -5387,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
 	if (use_task_css_set_links) {
 		struct css_set *cset;
 
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
-			rcu_assign_pointer(child->cgroups, cset);
-			list_add(&child->cg_list, &cset->tasks);
 			get_css_set(cset);
+			css_set_move_task(child, NULL, cset, false);
 		}
-		up_write(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 	}
 
 	/*
@@ -5429,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
-	bool put_cset = false;
 	int i;
 
 	/*
 	 * Unlink @tsk from its css_set.  As migration path can't race
-	 * with us, we can check cg_list without grabbing css_set_rwsem.
+	 * with us, we can check css_set and cg_list without synchronization.
 	 */
+	cset = task_css_set(tsk);
+
 	if (!list_empty(&tsk->cg_list)) {
-		down_write(&css_set_rwsem);
-		list_del_init(&tsk->cg_list);
-		up_write(&css_set_rwsem);
-		put_cset = true;
+		spin_lock_bh(&css_set_lock);
+		css_set_move_task(tsk, cset, NULL, false);
+		spin_unlock_bh(&css_set_lock);
+	} else {
+		get_css_set(cset);
 	}
 
-	/* Reassign the task to the init_css_set. */
-	cset = task_css_set(tsk);
-	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
 	/* see cgroup_post_fork() for details */
-	for_each_subsys_which(ss, i, &have_exit_callback) {
-		struct cgroup_subsys_state *old_css = cset->subsys[i];
-		struct cgroup_subsys_state *css = task_css(tsk, i);
-
-		ss->exit(css, old_css, tsk);
-	}
-
-	if (put_cset)
-		put_css_set(cset);
+	for_each_subsys_which(ss, i, &have_exit_callback)
+		ss->exit(tsk);
+}
+
+void cgroup_free(struct task_struct *task)
+{
+	struct css_set *cset = task_css_set(task);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	for_each_subsys_which(ss, ssid, &have_free_callback)
+		ss->free(task);
+
+	put_css_set(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
 {
-	if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
 		schedule_work(&cgrp->release_agent_work);
 }
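With zombies keeping their css_set pinned, per-task teardown is now split in two: ->exit() runs from cgroup_exit() while the task is a zombie still visible in its old cgroup, and the new ->free() runs from cgroup_free() when the task_struct is finally released, which is the hook that lets a controller like pids drop its charge only at the true end of a task's life. A sketch of a controller filling both callbacks (my_subsys_* names invented; the signatures match the calls introduced above):

#include <linux/cgroup.h>
#include <linux/sched.h>

static void my_subsys_exit(struct task_struct *task)
{
	/* task is exiting but still associated with its old cgroup */
}

static void my_subsys_free(struct task_struct *task)
{
	/* final per-task cleanup, e.g. uncharging a pids-style count */
}

struct cgroup_subsys my_subsys_cgrp_subsys = {
	.exit	= my_subsys_exit,
	.free	= my_subsys_free,
};
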
@@ -5540,25 +5705,13 @@ static int __init cgroup_disable(char *str)
 			if (strcmp(token, ss->name) &&
 			    strcmp(token, ss->legacy_name))
 				continue;
-
-			ss->disabled = 1;
-			printk(KERN_INFO "Disabling %s control group subsystem\n",
-			       ss->name);
-			break;
+			cgroup_disable_mask |= 1 << i;
 		}
 	}
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
-	printk("cgroup: using legacy files on the default hierarchy\n");
-	cgroup_legacy_files_on_dfl = true;
-	return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -5662,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 	if (!name_buf)
 		return -ENOMEM;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5673,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	kfree(name_buf);
 	return 0;
 }
@@ -5684,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
@@ -5707,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
  overflow:
 		seq_puts(seq, "  ...\n");
 	}
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return 0;
 }
 
 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return (!cgroup_has_tasks(css->cgroup) &&
+	return (!cgroup_is_populated(css->cgroup) &&
 		!css_has_online_children(&css->cgroup->self));
 }
 