Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	754
1 file changed, 386 insertions(+), 368 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..f34c41bfaa37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
+	/* IDs for cgroups in this hierarchy */
+	struct ida cgroup_ida;
+
 	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
 
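
Note: struct ida is the kernel's small-integer ID allocator from <linux/idr.h>. A minimal sketch of the lifecycle this hunk sets up for the per-hierarchy allocator (ida_init() when the root is created, ida_simple_get()/ida_simple_remove() per cgroup, ida_destroy() at root teardown); the demo_* names are illustrative, not part of the patch:

#include <linux/gfp.h>
#include <linux/idr.h>

/* illustrative container owning a private ID space */
struct demo_root {
	struct ida id_space;	/* ida_init(&root->id_space) at setup */
};

static int demo_alloc_id(struct demo_root *root)
{
	/* start at 1 so that 0 stays reserved for the root object */
	return ida_simple_get(&root->id_space, 1, 0, GFP_KERNEL);
}

static void demo_free_id(struct demo_root *root, int id)
{
	ida_simple_remove(&root->id_space, id);
}
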
@@ -171,8 +174,8 @@ struct css_id {
 	 * The css to which this ID points. This pointer is set to valid value
 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
 	 * This pointer is expected to be RCU-safe because destroy()
-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
-	 * css_tryget() should be used for avoiding race.
+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
+	 * should be used for avoiding race.
 	 */
 	struct cgroup_subsys_state __rcu *css;
 	/*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  */
 static int need_forkexit_callback __read_mostly;
 
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      struct cftype cfts[], bool is_add);
+
 #ifdef CONFIG_PROVE_LOCKING
 int cgroup_lock_is_held(void)
 {
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-static int clone_children(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
  * another.  It does so using cgroup_mutex, however there are
  * several performance critical places that need to reference
  * task->cgroup without the expense of grabbing a system global
  * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
  * the task_struct routinely used for such matters.
  *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
-	struct cgroup_subsys *ss;
-	int ret = 0;
-
-	for_each_subsys(cgrp->root, ss) {
-		if (!ss->pre_destroy)
-			continue;
-
-		ret = ss->pre_destroy(cgrp);
-		if (ret) {
-			/* ->pre_destroy() failure is being deprecated */
-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-			break;
-		}
-	}
-
-	return ret;
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 * Release the subsystem state objects.
 		 */
 		for_each_subsys(cgrp->root, ss)
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 
 		cgrp->root->number_of_cgroups--;
 		mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 
 		simple_xattrs_free(&cgrp->xattrs);
 
+		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
 		kfree_rcu(cgrp, rcu_head);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 			if (!test_bit(ss->subsys_id, &subsys_mask))
 				continue;
 			list_for_each_entry(set, &ss->cftsets, node)
-				cgroup_rm_file(cgrp, set->cfts);
+				cgroup_addrm_files(cgrp, NULL, set->cfts, false);
 		}
 		if (base_files) {
 			while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 }
 
 /*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-		wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-	css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-	cgroup_wakeup_rmdir_waiter(css->cgroup);
-	css_put(css);
-}
-
-/*
  * Call with cgroup_mutex held. Drops reference counts on modules, including
  * any duplicate ones that parse_cgroupfs_options took. If this function
  * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
-	if (clone_children(&root->top_cgroup))
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
 	unsigned long subsys_mask;
 	unsigned long flags;
 	char *release_agent;
-	bool clone_children;
+	bool cpuset_clone_children;
 	char *name;
 	/* User explicitly requested empty subsystem */
 	bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
-			opts->clone_children = true;
+			opts->cpuset_clone_children = true;
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
+	/*
+	 * Clear out the files of subsystems that should be removed, do
+	 * this before rebind_subsystems, since rebind_subsystems may
+	 * change this hierarchy's subsys_list.
+	 */
+	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+
 	ret = rebind_subsystems(root, opts.subsys_mask);
 	if (ret) {
+		/* rebind_subsystems failed, re-populate the removed files */
+		cgroup_populate_dir(cgrp, false, removed_mask);
 		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
-	/* clear out any existing files and repopulate subsystem files */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 	/* re-populate subsystem files */
 	cgroup_populate_dir(cgrp, false, added_mask);
 
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
 static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
+	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
-	if (opts->clone_children)
-		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+	if (opts->cpuset_clone_children)
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
 	return root;
 }
 
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
 	spin_lock(&hierarchy_id_lock);
 	ida_remove(&hierarchy_ida, root->hierarchy_id);
 	spin_unlock(&hierarchy_id_lock);
+	ida_destroy(&root->cgroup_ida);
 	kfree(root);
 }
 
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 	free_cg_links(&tmp_cg_links);
 
-	BUG_ON(!list_empty(&root_cgrp->sibling));
 	BUG_ON(!list_empty(&root_cgrp->children));
 	BUG_ON(root->number_of_cgroups != 1);
 
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
-	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
+	struct dentry *dentry = cgrp->dentry;
 	char *start;
-	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
-						      cgroup_lock_is_held());
+
+	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+			   "cgroup_path() called without proper locking");
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		return 0;
 	}
 
-	start = buf + buflen;
+	start = buf + buflen - 1;
 
-	*--start = '\0';
+	*start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
 
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		if (!cgrp)
 			break;
 
-		dentry = rcu_dereference_check(cgrp->dentry,
-					       cgroup_lock_is_held());
+		dentry = cgrp->dentry;
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
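
The rewritten cgroup_path() fills the buffer from the end: the NUL goes at buf + buflen - 1 and each name is prepended while walking toward the root, so no component ever has to be shifted. A self-contained userspace sketch of the same right-to-left technique (struct node and node_path() are hypothetical, not kernel code):

#include <stdio.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;	/* NULL parent marks the root */
};

/* Build "/a/b/c" by walking child -> root, writing right to left. */
static int node_path(const struct node *n, char *buf, int buflen)
{
	char *start = buf + buflen - 1;

	*start = '\0';
	if (!n->parent) {		/* the root is just "/" */
		if (buflen < 2)
			return -1;
		strcpy(buf, "/");
		return 0;
	}
	for (; n->parent; n = n->parent) {
		int len = strlen(n->name);

		if (start - buf < len + 1)
			return -1;	/* buffer too small */
		start -= len;
		memcpy(start, n->name, len);
		*--start = '/';
	}
	/* shift the finished string to the front of the buffer */
	memmove(buf, start, buf + buflen - start);
	return 0;
}

int main(void)
{
	struct node root = { "", NULL };
	struct node usr  = { "usr", &root };
	struct node bin  = { "bin", &usr };
	char buf[32];

	if (!node_path(&bin, buf, sizeof(buf)))
		printf("%s\n", buf);	/* prints /usr/bin */
	return 0;
}
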
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex and threadgroup locked.
  */
 static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 				struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	}
 
 	synchronize_rcu();
-
-	/*
-	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
-	 * is no longer empty.
-	 */
-	cgroup_wakeup_rmdir_waiter(cgrp);
 out:
 	if (retval) {
 		for_each_subsys(root, ss) {
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 5: success! and cleanup
 	 */
 	synchronize_rcu();
-	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
 out_put_css_set_refs:
 	if (retval) {
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 
 		/* start off with i_nlink == 2 (for "." entry) */
 		inc_nlink(inode);
+		inc_nlink(dentry->d_parent->d_inode);
 
-		/* start with the directory inode held, so that we can
-		 * populate it without racing with another mkdir */
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		/*
+		 * Control reaches here with cgroup_mutex held.
+		 * @inode->i_mutex should nest outside cgroup_mutex but we
+		 * want to populate it immediately without releasing
+		 * cgroup_mutex.  As @inode isn't visible to anyone else
+		 * yet, trylock will always succeed without affecting
+		 * lockdep checks.
+		 */
+		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 	return 0;
 }
 
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- *        ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-			     umode_t mode)
-{
-	struct dentry *parent;
-	int error = 0;
-
-	parent = cgrp->parent->dentry;
-	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
-	if (!error) {
-		dentry->d_fsdata = cgrp;
-		inc_nlink(parent->d_inode);
-		rcu_assign_pointer(cgrp->dentry, dentry);
-		dget(dentry);
-	}
-	dput(dentry);
-
-	return error;
-}
-
 /**
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
 	simple_xattrs_init(&cft->xattrs);
 
-	/* does @cft->flags tell us to skip creation on @cgrp? */
-	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
-		return 0;
-	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
-		return 0;
-
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	int err, ret = 0;
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+			continue;
+		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+			continue;
+
 		if (is_add)
 			err = cgroup_add_file(cgrp, subsys, cft);
 		else
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
 	write_unlock(&css_set_lock);
 }
 
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre().  Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+					  struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, pretend we just visited @cgroup */
+	if (!pos) {
+		if (list_empty(&cgroup->children))
+			return NULL;
+		pos = cgroup;
+	}
+
+	/* visit the first child if exists */
+	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+	if (next)
+		return next;
+
+	/* no child, visit my or the closest ancestor's next sibling */
+	do {
+		next = list_entry_rcu(pos->sibling.next, struct cgroup,
+				      sibling);
+		if (&next->sibling != &pos->parent->children)
+			return next;
+
+		pos = pos->parent;
+	} while (pos != cgroup);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+	struct cgroup *last;
+
+	do {
+		last = pos;
+		pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+					     sibling);
+	} while (pos);
+
+	return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post().  Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+					   struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, visit the leftmost descendant */
+	if (!pos) {
+		next = cgroup_leftmost_descendant(cgroup);
+		return next != cgroup ? next : NULL;
+	}
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+	if (&next->sibling != &pos->parent->children)
+		return cgroup_leftmost_descendant(next);
+
+	/* no sibling left, visit parent */
+	next = pos->parent;
+	return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	__acquires(css_set_lock)
 {
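
Both walkers are designed to be driven in a loop with pos starting at NULL, under rcu_read_lock() (the WARN_ON_ONCE above enforces this). A sketch of how a caller might visit a subtree in pre-order; walk_subtree() is hypothetical and the per-cgroup work is elided:

static void walk_subtree(struct cgroup *root)
{
	struct cgroup *pos = NULL;

	rcu_read_lock();
	while ((pos = cgroup_next_descendant_pre(pos, root))) {
		/* pre-order: @pos is visited before any of its children */
		/* ... per-cgroup work that is safe under RCU ... */
	}
	rcu_read_unlock();
}

Substituting cgroup_next_descendant_post() yields the post-order variant, where children are seen before their parent - the order a teardown pass would want.
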
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	if (flags & POLLHUP) {
 		__remove_wait_queue(event->wqh, &event->wait);
 		spin_lock(&cgrp->event_list_lock);
-		list_del(&event->list);
+		list_del_init(&event->list);
 		spin_unlock(&cgrp->event_list_lock);
 		/*
 		 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3913,7 @@ fail:
 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
 				      struct cftype *cft)
 {
-	return clone_children(cgrp);
+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 }
 
 static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 					u64 val)
 {
 	if (val)
-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	else
-		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	return 0;
 }
 
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
-		set_bit(CSS_ROOT, &css->flags);
+		css->flags |= CSS_ROOT;
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
 
 	/*
-	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
-	 * which is put on the last css_put(). dput() requires process
-	 * context, which css_put() may be called without. @css->dput_work
-	 * will be used to invoke dput() asynchronously from css_put().
+	 * css holds an extra ref to @cgrp->dentry which is put on the last
+	 * css_put(). dput() requires process context, which css_put() may
+	 * be called without. @css->dput_work will be used to invoke
+	 * dput() asynchronously from css_put().
 	 */
 	INIT_WORK(&css->dput_work, css_dput_fn);
-	if (ss->__DEPRECATED_clear_css_refs)
-		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/* invoke ->post_create() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	int ret = 0;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (ss->css_online)
+		ret = ss->css_online(cgrp);
+	if (!ret)
+		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+	return ret;
+}
+
+/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+	struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (!(css->flags & CSS_ONLINE))
+		return;
+
+	/*
+	 * css_offline() should be called with cgroup_mutex unlocked. See
+	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+	 * details. This temporary unlocking should go away once
+	 * cgroup_mutex is unexported from controllers.
+	 */
+	if (ss->css_offline) {
+		mutex_unlock(&cgroup_mutex);
+		ss->css_offline(cgrp);
+		mutex_lock(&cgroup_mutex);
+	}
+
+	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
 
 /*
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
+	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;
 
+	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+	if (cgrp->id < 0)
+		goto err_free_cgrp;
+
+	/*
+	 * Only live parents can have children.  Note that the liveliness
+	 * check isn't strictly necessary because cgroup_mkdir() and
+	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+	 * anyway so that locking is contained inside cgroup proper and we
+	 * don't get nasty surprises if we ever grow another caller.
+	 */
+	if (!cgroup_lock_live_group(parent)) {
+		err = -ENODEV;
+		goto err_free_id;
+	}
+
 	/* Grab a reference on the superblock so the hierarchy doesn't
 	 * get deleted on unmount if there are child cgroups. This
 	 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	 * fs */
 	atomic_inc(&sb->s_active);
 
-	mutex_lock(&cgroup_mutex);
-
 	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
-	if (clone_children(parent))
-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css;
 
-		css = ss->create(cgrp);
+		css = ss->css_alloc(cgrp);
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
-			goto err_destroy;
+			goto err_free_all;
 		}
 		init_cgroup_css(css, ss, cgrp);
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
-				goto err_destroy;
+				goto err_free_all;
 		}
-		/* At error, ->destroy() callback has to free assigned ID. */
-		if (clone_children(parent) && ss->post_clone)
-			ss->post_clone(cgrp);
+	}
+
+	/*
+	 * Create directory.  cgroup_create_file() returns with the new
+	 * directory locked on success so that it can be populated without
+	 * dropping cgroup_mutex.
+	 */
+	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+	if (err < 0)
+		goto err_free_all;
+	lockdep_assert_held(&dentry->d_inode->i_mutex);
+
+	/* allocation complete, commit to creation */
+	dentry->d_fsdata = cgrp;
+	cgrp->dentry = dentry;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+	root->number_of_cgroups++;
+
+	/* each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		dget(dentry);
+
+	/* creation succeeded, notify subsystems */
+	for_each_subsys(root, ss) {
+		err = online_css(ss, cgrp);
+		if (err)
+			goto err_destroy;
 
 		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
 		    parent->parent) {
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
-	list_add(&cgrp->sibling, &cgrp->parent->children);
-	root->number_of_cgroups++;
-
-	err = cgroup_create_dir(cgrp, dentry, mode);
-	if (err < 0)
-		goto err_remove;
-
-	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
-	for_each_subsys(root, ss)
-		if (!ss->__DEPRECATED_clear_css_refs)
-			dget(dentry);
-
-	/* The cgroup directory was pre-locked for us */
-	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
-
 	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
-	/* If err < 0, we have a half-filled directory - oh well ;) */
+	if (err)
+		goto err_destroy;
 
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	return 0;
 
- err_remove:
-
-	list_del(&cgrp->sibling);
-	root->number_of_cgroups--;
-
- err_destroy:
-
+err_free_all:
 	for_each_subsys(root, ss) {
 		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 	}
-
 	mutex_unlock(&cgroup_mutex);
-
 	/* Release the reference count that we took on the superblock */
 	deactivate_super(sb);
-
+err_free_id:
+	ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_cgrp:
 	kfree(cgrp);
 	return err;
+
+err_destroy:
+	cgroup_destroy_locked(cgrp);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	return err;
 }
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	return 0;
 }
 
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
+	struct dentry *d = cgrp->dentry;
+	struct cgroup *parent = cgrp->parent;
+	DEFINE_WAIT(wait);
+	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
-	unsigned long flags;
-	bool failed = false;
+	LIST_HEAD(tmp_list);
+
+	lockdep_assert_held(&d->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
 
-	local_irq_save(flags);
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt. If all refcnts
-	 * for subsystems w/ clear_css_refs set were 1 at the moment of
-	 * deactivation, we succeeded.
+	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
+	 * removed.  This makes future css_tryget() and child creation
+	 * attempts fail thus maintaining the removal conditions verified
+	 * above.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
-		if (ss->__DEPRECATED_clear_css_refs)
-			failed |= css_refcnt(css) != 1;
-	}
-
-	/*
-	 * If succeeded, set REMOVED and put all the base refs; otherwise,
-	 * restore refcnts to positive values. Either way, all in-progress
-	 * css_tryget() will be released.
-	 */
-	for_each_subsys(cgrp->root, ss) {
-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
-		if (!failed) {
-			set_bit(CSS_REMOVED, &css->flags);
-			css_put(css);
-		} else {
-			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
-		}
 	}
+	set_bit(CGRP_REMOVED, &cgrp->flags);
 
-	local_irq_restore(flags);
-	return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
-	struct cgroup *cgrp = dentry->d_fsdata;
-	struct dentry *d;
-	struct cgroup *parent;
-	DEFINE_WAIT(wait);
-	struct cgroup_event *event, *tmp;
-	int ret;
-
-	/* the vfs holds both inode->i_mutex already */
-again:
-	mutex_lock(&cgroup_mutex);
-	if (atomic_read(&cgrp->count) != 0) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	if (!list_empty(&cgrp->children)) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	mutex_unlock(&cgroup_mutex);
-
-	/*
-	 * In general, subsystem has no css->refcnt after pre_destroy(). But
-	 * in racy cases, subsystem may have to get css->refcnt after
-	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
-	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
-	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
-	 * and subsystem's reference count handling. Please see css_get/put
-	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
-	 */
-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	/* tell subsystems to initate destruction */
+	for_each_subsys(cgrp->root, ss)
+		offline_css(ss, cgrp);
 
 	/*
-	 * Call pre_destroy handlers of subsys. Notify subsystems
-	 * that rmdir() request comes.
+	 * Put all the base refs.  Each css holds an extra reference to the
+	 * cgroup's dentry and cgroup removal proceeds regardless of css
+	 * refs.  On the last put of each css, whenever that may be, the
+	 * extra dentry ref is put so that dentry destruction happens only
+	 * after all css's are released.
 	 */
-	ret = cgroup_call_pre_destroy(cgrp);
-	if (ret) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		return ret;
-	}
-
-	mutex_lock(&cgroup_mutex);
-	parent = cgrp->parent;
-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-	if (!cgroup_clear_css_refs(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		/*
-		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
-		 * prepare_to_wait(), we need to check this flag.
-		 */
-		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
-			schedule();
-		finish_wait(&cgroup_rmdir_waitq, &wait);
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		if (signal_pending(current))
-			return -EINTR;
-		goto again;
-	}
-	/* NO css_tryget() can success after here. */
-	finish_wait(&cgroup_rmdir_waitq, &wait);
-	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	for_each_subsys(cgrp->root, ss)
+		css_put(cgrp->subsys[ss->subsys_id]);
 
 	raw_spin_lock(&release_list_lock);
-	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
 	/* delete this cgroup from parent->children */
-	list_del_init(&cgrp->sibling);
-
+	list_del_rcu(&cgrp->sibling);
 	list_del_init(&cgrp->allcg_node);
 
-	d = dget(cgrp->dentry);
-
+	dget(d);
 	cgroup_d_remove_dir(d);
 	dput(d);
 
@@ -4353,21 +4341,35 @@ again:
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace
+	 * directory to avoid race between userspace and kernelspace. Use
+	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
+	 * cgroup_event_wake() is called with the wait queue head locked,
+	 * remove_wait_queue() cannot be called while holding event_list_lock.
 	 */
 	spin_lock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-		list_del(&event->list);
+	list_splice_init(&cgrp->event_list, &tmp_list);
+	spin_unlock(&cgrp->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+		list_del_init(&event->list);
 		remove_wait_queue(event->wqh, &event->wait);
 		eventfd_signal(event->eventfd, 1);
 		schedule_work(&event->remove);
 	}
-	spin_unlock(&cgrp->event_list_lock);
 
-	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = cgroup_destroy_locked(dentry->d_fsdata);
+	mutex_unlock(&cgroup_mutex);
+
+	return ret;
+}
+
 static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
 {
 	INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	mutex_lock(&cgroup_mutex);
+
 	/* init base cftset */
 	cgroup_init_cftsets(ss);
 
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
-	css = ss->create(dummytop);
+	css = ss->css_alloc(dummytop);
 	/* We don't handle early failures gracefully */
 	BUG_ON(IS_ERR(css));
 	init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	 * pointer to this state - since the subsystem is
 	 * newly registered, all tasks and hence the
 	 * init_css_set is in the subsystem's top cgroup. */
-	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+	init_css_set.subsys[ss->subsys_id] = css;
 
 	need_forkexit_callback |= ss->fork || ss->exit;
 
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(!list_empty(&init_task.tasks));
 
 	ss->active = 1;
+	BUG_ON(online_css(ss, dummytop));
+
+	mutex_unlock(&cgroup_mutex);
 
 	/* this function shouldn't be used with modular subsystems, since they
 	 * need to register a subsys_id, among other things */
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  */
 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
-	int i;
 	struct cgroup_subsys_state *css;
+	int i, ret;
 
 	/* check name and function validity */
 	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
-	    ss->create == NULL || ss->destroy == NULL)
+	    ss->css_alloc == NULL || ss->css_free == NULL)
 		return -EINVAL;
 
 	/*
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	subsys[ss->subsys_id] = ss;
 
 	/*
-	 * no ss->create seems to need anything important in the ss struct, so
-	 * this can happen first (i.e. before the rootnode attachment).
+	 * no ss->css_alloc seems to need anything important in the ss
+	 * struct, so this can happen first (i.e. before the rootnode
+	 * attachment).
 	 */
-	css = ss->create(dummytop);
+	css = ss->css_alloc(dummytop);
 	if (IS_ERR(css)) {
 		/* failure case - need to deassign the subsys[] slot. */
 		subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	init_cgroup_css(css, ss, dummytop);
 	/* init_idr must be after init_cgroup_css because it sets css->id. */
 	if (ss->use_id) {
-		int ret = cgroup_init_idr(ss, css);
-		if (ret) {
-			dummytop->subsys[ss->subsys_id] = NULL;
-			ss->destroy(dummytop);
-			subsys[ss->subsys_id] = NULL;
-			mutex_unlock(&cgroup_mutex);
-			return ret;
-		}
+		ret = cgroup_init_idr(ss, css);
+		if (ret)
+			goto err_unload;
 	}
 
 	/*
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	write_unlock(&css_set_lock);
 
 	ss->active = 1;
+	ret = online_css(ss, dummytop);
+	if (ret)
+		goto err_unload;
 
 	/* success! */
 	mutex_unlock(&cgroup_mutex);
 	return 0;
+
+err_unload:
+	mutex_unlock(&cgroup_mutex);
+	/* @ss can't be mounted here as try_module_get() would fail */
+	cgroup_unload_subsys(ss);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	BUG_ON(ss->root != &rootnode);
 
 	mutex_lock(&cgroup_mutex);
+
+	offline_css(ss, dummytop);
+	ss->active = 0;
+
+	if (ss->use_id) {
+		idr_remove_all(&ss->idr);
+		idr_destroy(&ss->idr);
+	}
+
 	/* deassign the subsys_id */
 	subsys[ss->subsys_id] = NULL;
 
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 		struct css_set *cg = link->cg;
 
 		hlist_del(&cg->hlist);
-		BUG_ON(!cg->subsys[ss->subsys_id]);
 		cg->subsys[ss->subsys_id] = NULL;
 		hhead = css_set_hash(cg->subsys);
 		hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	write_unlock(&css_set_lock);
 
 	/*
-	 * remove subsystem's css from the dummytop and free it - need to free
-	 * before marking as null because ss->destroy needs the cgrp->subsys
-	 * pointer to find their state. note that this also takes care of
-	 * freeing the css_id.
+	 * remove subsystem's css from the dummytop and free it - need to
+	 * free before marking as null because ss->css_free needs the
+	 * cgrp->subsys pointer to find their state. note that this also
+	 * takes care of freeing the css_id.
 	 */
-	ss->destroy(dummytop);
+	ss->css_free(dummytop);
 	dummytop->subsys[ss->subsys_id] = NULL;
 
 	mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void)
 
 		BUG_ON(!ss->name);
 		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
-		BUG_ON(!ss->create);
-		BUG_ON(!ss->destroy);
+		BUG_ON(!ss->css_alloc);
+		BUG_ON(!ss->css_free);
 		if (ss->subsys_id != i) {
 			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
 			       ss->name, ss->subsys_id);
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child)
 }
 
 /**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
-	if (need_forkexit_callback) {
-		int i;
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-
-			/*
-			 * forkexit callbacks are only supported for
-			 * builtin subsystems.
-			 */
-			if (!ss || ss->module)
-				continue;
-
-			if (ss->fork)
-				ss->fork(child);
-		}
-	}
-}
-
-/**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
  *
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks.  Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+	int i;
+
 	/*
 	 * use_task_css_set_links is set to 1 before we walk the tasklist
 	 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child)
 		task_unlock(child);
 		write_unlock(&css_set_lock);
 	}
+
+	/*
+	 * Call ss->fork().  This must happen after @child is linked on
+	 * css_set; otherwise, @child might change state between ->fork()
+	 * and addition to css_set.
+	 */
+	if (need_forkexit_callback) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+
+			/*
+			 * fork/exit callbacks are supported only for
+			 * builtin subsystems and we don't need further
+			 * synchronization as they never go away.
+			 */
+			if (!ss || ss->module)
+				continue;
+
+			if (ss->fork)
+				ss->fork(child);
+		}
+	}
 }
+
 /**
  * cgroup_exit - detach cgroup from exiting task
  * @tsk: pointer to task_struct of exiting process
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp)
 /* Caller must verify that the css is not for root cgroup */
 bool __css_tryget(struct cgroup_subsys_state *css)
 {
-	do {
-		int v = css_refcnt(css);
+	while (true) {
+		int t, v;
 
-		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+		v = css_refcnt(css);
+		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+		if (likely(t == v))
 			return true;
+		else if (t < 0)
+			return false;
 		cpu_relax();
-	} while (!test_bit(CSS_REMOVED, &css->flags));
-
-	return false;
+	}
 }
 EXPORT_SYMBOL_GPL(__css_tryget);
 
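The rewritten loop folds the removal check into the cmpxchg result: cgroup_destroy_locked() adds CSS_DEACT_BIAS to the refcount, driving it negative, so a getter that observes t < 0 fails instead of spinning (the old code consulted a separate CSS_REMOVED bit). A userspace analogue of the biased-refcount tryget using C11 atomics; all names and the bias value are illustrative, not the kernel's:

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

/* once added to refcnt, every read of the counter comes back negative */
#define DEACT_BIAS	(INT_MIN / 2)

struct ref {
	atomic_int refcnt;	/* starts at 1: the owner's base reference */
};

/* try to take a reference; fails once the owner has deactivated @r */
static bool ref_tryget(struct ref *r)
{
	while (true) {
		int v = atomic_load(&r->refcnt);

		if (v < 0)
			return false;	/* deactivated: no new references */
		if (atomic_compare_exchange_weak(&r->refcnt, &v, v + 1))
			return true;
		/* raced with another get/put: retry with a fresh value */
	}
}

/* owner side: block all future tryget()s in one atomic step */
static void ref_deactivate(struct ref *r)
{
	atomic_fetch_add(&r->refcnt, DEACT_BIAS);
}
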
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css)
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		cgroup_wakeup_rmdir_waiter(cgrp);
 		break;
 	case 0:
-		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
-			schedule_work(&css->dput_work);
+		schedule_work(&css->dput_work);
 		break;
 	}
 	rcu_read_unlock();
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
 {
 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
 
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
 	return css;
 }
 
-static void debug_destroy(struct cgroup *cont)
+static void debug_css_free(struct cgroup *cont)
 {
 	kfree(cont->subsys[debug_subsys_id]);
 }
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = {
 
 struct cgroup_subsys debug_subsys = {
 	.name = "debug",
-	.create = debug_create,
-	.destroy = debug_destroy,
+	.css_alloc = debug_css_alloc,
+	.css_free = debug_css_free,
 	.subsys_id = debug_subsys_id,
 	.base_cftypes = debug_files,
 };