path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c          | 754
-rw-r--r--  kernel/cgroup_freezer.c  | 514
-rw-r--r--  kernel/cpuset.c          |  90
-rw-r--r--  kernel/events/core.c     |   8
-rw-r--r--  kernel/fork.c            |   9
-rw-r--r--  kernel/freezer.c         |  11
-rw-r--r--  kernel/power/process.c   |  13
-rw-r--r--  kernel/sched/core.c      |  16
-rw-r--r--  kernel/signal.c          |  20
9 files changed, 758 insertions, 677 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..f34c41bfaa37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
138 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
139 unsigned long flags; 139 unsigned long flags;
140 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
141 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
142 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
143 146
@@ -171,8 +174,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
176 */ 179 */
177 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
178 /* 181 /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
242 */ 245 */
243static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
244 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
245#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
246int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
247{ 254{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
294 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295} 302}
296 303
297static int clone_children(const struct cgroup *cgrp)
298{
299 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300}
301
302/* 304/*
303 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
304 * an active hierarchy 306 * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
782 * The task_lock() exception 784 * The task_lock() exception
783 * 785 *
784 * The need for this exception arises from the action of 786 * The need for this exception arises from the action of
785 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786 * another. It does so using cgroup_mutex, however there are 788 * another. It does so using cgroup_mutex, however there are
787 * several performance critical places that need to reference 789 * several performance critical places that need to reference
788 * task->cgroup without the expense of grabbing a system global 790 * task->cgroup without the expense of grabbing a system global
789 * mutex. Therefore except as noted below, when dereferencing or, as 791 * mutex. Therefore except as noted below, when dereferencing or, as
790 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792 * the task_struct routinely used for such matters. 794 * the task_struct routinely used for such matters.
793 * 795 *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 856 return inode;
855} 857}
856 858
857/*
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{
863 struct cgroup_subsys *ss;
864 int ret = 0;
865
866 for_each_subsys(cgrp->root, ss) {
867 if (!ss->pre_destroy)
868 continue;
869
870 ret = ss->pre_destroy(cgrp);
871 if (ret) {
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877
878 return ret;
879}
880
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 859static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882{ 860{
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 861 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
898 * Release the subsystem state objects. 876 * Release the subsystem state objects.
899 */ 877 */
900 for_each_subsys(cgrp->root, ss) 878 for_each_subsys(cgrp->root, ss)
901 ss->destroy(cgrp); 879 ss->css_free(cgrp);
902 880
903 cgrp->root->number_of_cgroups--; 881 cgrp->root->number_of_cgroups--;
904 mutex_unlock(&cgroup_mutex); 882 mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
917 895
918 simple_xattrs_free(&cgrp->xattrs); 896 simple_xattrs_free(&cgrp->xattrs);
919 897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
920 kfree_rcu(cgrp, rcu_head); 899 kfree_rcu(cgrp, rcu_head);
921 } else { 900 } else {
922 struct cfent *cfe = __d_cfe(dentry); 901 struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
987 if (!test_bit(ss->subsys_id, &subsys_mask)) 966 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue; 967 continue;
989 list_for_each_entry(set, &ss->cftsets, node) 968 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts); 969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
991 } 970 }
992 if (base_files) { 971 if (base_files) {
993 while (!list_empty(&cgrp->files)) 972 while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 994}
1016 995
1017/* 996/*
1018 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to goes down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 997 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 998 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 999 * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1150 seq_puts(seq, ",xattr"); 1102 seq_puts(seq, ",xattr");
1151 if (strlen(root->release_agent_path)) 1103 if (strlen(root->release_agent_path))
1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153 if (clone_children(&root->top_cgroup)) 1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1154 seq_puts(seq, ",clone_children"); 1106 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1107 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1108 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
1162 unsigned long subsys_mask; 1114 unsigned long subsys_mask;
1163 unsigned long flags; 1115 unsigned long flags;
1164 char *release_agent; 1116 char *release_agent;
1165 bool clone_children; 1117 bool cpuset_clone_children;
1166 char *name; 1118 char *name;
1167 /* User explicitly requested empty subsystem */ 1119 /* User explicitly requested empty subsystem */
1168 bool none; 1120 bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1213 continue; 1165 continue;
1214 } 1166 }
1215 if (!strcmp(token, "clone_children")) { 1167 if (!strcmp(token, "clone_children")) {
1216 opts->clone_children = true; 1168 opts->cpuset_clone_children = true;
1217 continue; 1169 continue;
1218 } 1170 }
1219 if (!strcmp(token, "xattr")) { 1171 if (!strcmp(token, "xattr")) {
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1397 goto out_unlock; 1349 goto out_unlock;
1398 } 1350 }
1399 1351
1352 /*
1353 * Clear out the files of subsystems that should be removed, do
1354 * this before rebind_subsystems, since rebind_subsystems may
1355 * change this hierarchy's subsys_list.
1356 */
1357 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1358
1400 ret = rebind_subsystems(root, opts.subsys_mask); 1359 ret = rebind_subsystems(root, opts.subsys_mask);
1401 if (ret) { 1360 if (ret) {
1361 /* rebind_subsystems failed, re-populate the removed files */
1362 cgroup_populate_dir(cgrp, false, removed_mask);
1402 drop_parsed_module_refcounts(opts.subsys_mask); 1363 drop_parsed_module_refcounts(opts.subsys_mask);
1403 goto out_unlock; 1364 goto out_unlock;
1404 } 1365 }
1405 1366
1406 /* clear out any existing files and repopulate subsystem files */
1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408 /* re-populate subsystem files */ 1367 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask); 1368 cgroup_populate_dir(cgrp, false, added_mask);
1410 1369
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1432 INIT_LIST_HEAD(&cgrp->children); 1391 INIT_LIST_HEAD(&cgrp->children);
1433 INIT_LIST_HEAD(&cgrp->files); 1392 INIT_LIST_HEAD(&cgrp->files);
1434 INIT_LIST_HEAD(&cgrp->css_sets); 1393 INIT_LIST_HEAD(&cgrp->css_sets);
1394 INIT_LIST_HEAD(&cgrp->allcg_node);
1435 INIT_LIST_HEAD(&cgrp->release_list); 1395 INIT_LIST_HEAD(&cgrp->release_list);
1436 INIT_LIST_HEAD(&cgrp->pidlists); 1396 INIT_LIST_HEAD(&cgrp->pidlists);
1437 mutex_init(&cgrp->pidlist_mutex); 1397 mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1450 root->number_of_cgroups = 1; 1410 root->number_of_cgroups = 1;
1451 cgrp->root = root; 1411 cgrp->root = root;
1452 cgrp->top_cgroup = cgrp; 1412 cgrp->top_cgroup = cgrp;
1453 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454 init_cgroup_housekeeping(cgrp); 1413 init_cgroup_housekeeping(cgrp);
1414 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1455} 1415}
1456 1416
1457static bool init_root_id(struct cgroupfs_root *root) 1417static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1518 1478
1519 root->subsys_mask = opts->subsys_mask; 1479 root->subsys_mask = opts->subsys_mask;
1520 root->flags = opts->flags; 1480 root->flags = opts->flags;
1481 ida_init(&root->cgroup_ida);
1521 if (opts->release_agent) 1482 if (opts->release_agent)
1522 strcpy(root->release_agent_path, opts->release_agent); 1483 strcpy(root->release_agent_path, opts->release_agent);
1523 if (opts->name) 1484 if (opts->name)
1524 strcpy(root->name, opts->name); 1485 strcpy(root->name, opts->name);
1525 if (opts->clone_children) 1486 if (opts->cpuset_clone_children)
1526 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1487 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1527 return root; 1488 return root;
1528} 1489}
1529 1490
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1536 spin_lock(&hierarchy_id_lock); 1497 spin_lock(&hierarchy_id_lock);
1537 ida_remove(&hierarchy_ida, root->hierarchy_id); 1498 ida_remove(&hierarchy_ida, root->hierarchy_id);
1538 spin_unlock(&hierarchy_id_lock); 1499 spin_unlock(&hierarchy_id_lock);
1500 ida_destroy(&root->cgroup_ida);
1539 kfree(root); 1501 kfree(root);
1540} 1502}
1541 1503
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1701 1663
1702 free_cg_links(&tmp_cg_links); 1664 free_cg_links(&tmp_cg_links);
1703 1665
1704 BUG_ON(!list_empty(&root_cgrp->sibling));
1705 BUG_ON(!list_empty(&root_cgrp->children)); 1666 BUG_ON(!list_empty(&root_cgrp->children));
1706 BUG_ON(root->number_of_cgroups != 1); 1667 BUG_ON(root->number_of_cgroups != 1);
1707 1668
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1750 1711
1751 BUG_ON(root->number_of_cgroups != 1); 1712 BUG_ON(root->number_of_cgroups != 1);
1752 BUG_ON(!list_empty(&cgrp->children)); 1713 BUG_ON(!list_empty(&cgrp->children));
1753 BUG_ON(!list_empty(&cgrp->sibling));
1754 1714
1755 mutex_lock(&cgroup_mutex); 1715 mutex_lock(&cgroup_mutex);
1756 mutex_lock(&cgroup_root_mutex); 1716 mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
1808 */ 1768 */
1809int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1769int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810{ 1770{
1771 struct dentry *dentry = cgrp->dentry;
1811 char *start; 1772 char *start;
1812 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1773
1813 cgroup_lock_is_held()); 1774 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1775 "cgroup_path() called without proper locking");
1814 1776
1815 if (!dentry || cgrp == dummytop) { 1777 if (!dentry || cgrp == dummytop) {
1816 /* 1778 /*
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1821 return 0; 1783 return 0;
1822 } 1784 }
1823 1785
1824 start = buf + buflen; 1786 start = buf + buflen - 1;
1825 1787
1826 *--start = '\0'; 1788 *start = '\0';
1827 for (;;) { 1789 for (;;) {
1828 int len = dentry->d_name.len; 1790 int len = dentry->d_name.len;
1829 1791
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1834 if (!cgrp) 1796 if (!cgrp)
1835 break; 1797 break;
1836 1798
1837 dentry = rcu_dereference_check(cgrp->dentry, 1799 dentry = cgrp->dentry;
1838 cgroup_lock_is_held());
1839 if (!cgrp->parent) 1800 if (!cgrp->parent)
1840 continue; 1801 continue;
1841 if (--start < buf) 1802 if (--start < buf)
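The reworked cgroup_path() writes the terminating NUL first and then prepends each component while walking parent pointers toward the root, so the path is assembled right to left in a single pass. A minimal user-space sketch of that construction over a toy node type; struct node, node_path() and the buffer handling here are illustrative, not part of the patch:

#include <stdio.h>
#include <string.h>

/* toy stand-in for struct cgroup: just a name and a parent pointer */
struct node {
        const char *name;
        struct node *parent;
};

/*
 * Build "/a/b" into buf by walking from the leaf toward the root and
 * filling the buffer from its end -- the same right-to-left scheme the
 * reworked cgroup_path() uses, so no reversal pass is needed.
 */
static int node_path(const struct node *n, char *buf, int buflen)
{
        char *start = buf + buflen - 1;

        *start = '\0';
        for (; n->parent; n = n->parent) {
                int len = strlen(n->name);

                if (start - buf < len + 1)
                        return -1;      /* buffer too small */
                start -= len;
                memcpy(start, n->name, len);
                *--start = '/';
        }
        memmove(buf, start, buf + buflen - start);
        return 0;
}

int main(void)
{
        struct node root = { "", NULL };
        struct node a = { "a", &root };
        struct node b = { "b", &a };
        char buf[64];

        if (!node_path(&b, buf, sizeof(buf)))
                printf("%s\n", buf);    /* prints /a/b */
        return 0;
}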
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1930/* 1891/*
1931 * cgroup_task_migrate - move a task from one cgroup to another. 1892 * cgroup_task_migrate - move a task from one cgroup to another.
1932 * 1893 *
1933 * 'guarantee' is set if the caller promises that a new css_set for the task 1894 * Must be called with cgroup_mutex and threadgroup locked.
1934 * will already exist. If not set, this function might sleep, and can fail with
1935 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1936 */ 1895 */
1937static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1896static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938 struct task_struct *tsk, struct css_set *newcg) 1897 struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2025 } 1984 }
2026 1985
2027 synchronize_rcu(); 1986 synchronize_rcu();
2028
2029 /*
2030 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031 * is no longer empty.
2032 */
2033 cgroup_wakeup_rmdir_waiter(cgrp);
2034out: 1987out:
2035 if (retval) { 1988 if (retval) {
2036 for_each_subsys(root, ss) { 1989 for_each_subsys(root, ss) {
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 * step 5: success! and cleanup 2153 * step 5: success! and cleanup
2201 */ 2154 */
2202 synchronize_rcu(); 2155 synchronize_rcu();
2203 cgroup_wakeup_rmdir_waiter(cgrp);
2204 retval = 0; 2156 retval = 0;
2205out_put_css_set_refs: 2157out_put_css_set_refs:
2206 if (retval) { 2158 if (retval) {
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2711 2663
2712 /* start off with i_nlink == 2 (for "." entry) */ 2664 /* start off with i_nlink == 2 (for "." entry) */
2713 inc_nlink(inode); 2665 inc_nlink(inode);
2666 inc_nlink(dentry->d_parent->d_inode);
2714 2667
2715 /* start with the directory inode held, so that we can 2668 /*
2716 * populate it without racing with another mkdir */ 2669 * Control reaches here with cgroup_mutex held.
2717 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2670 * @inode->i_mutex should nest outside cgroup_mutex but we
2671 * want to populate it immediately without releasing
2672 * cgroup_mutex. As @inode isn't visible to anyone else
2673 * yet, trylock will always succeed without affecting
2674 * lockdep checks.
2675 */
2676 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2718 } else if (S_ISREG(mode)) { 2677 } else if (S_ISREG(mode)) {
2719 inode->i_size = 0; 2678 inode->i_size = 0;
2720 inode->i_fop = &cgroup_file_operations; 2679 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2725 return 0; 2684 return 0;
2726} 2685}
2727 2686
2728/*
2729 * cgroup_create_dir - create a directory for an object.
2730 * @cgrp: the cgroup we create the directory for. It must have a valid
2731 * ->parent field. And we are going to fill its ->dentry field.
2732 * @dentry: dentry of the new cgroup
2733 * @mode: mode to set on new directory.
2734 */
2735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736 umode_t mode)
2737{
2738 struct dentry *parent;
2739 int error = 0;
2740
2741 parent = cgrp->parent->dentry;
2742 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743 if (!error) {
2744 dentry->d_fsdata = cgrp;
2745 inc_nlink(parent->d_inode);
2746 rcu_assign_pointer(cgrp->dentry, dentry);
2747 dget(dentry);
2748 }
2749 dput(dentry);
2750
2751 return error;
2752}
2753
2754/** 2687/**
2755 * cgroup_file_mode - deduce file mode of a control file 2688 * cgroup_file_mode - deduce file mode of a control file
2756 * @cft: the control file in question 2689 * @cft: the control file in question
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 2724
2792 simple_xattrs_init(&cft->xattrs); 2725 simple_xattrs_init(&cft->xattrs);
2793 2726
2794 /* does @cft->flags tell us to skip creation on @cgrp? */
2795 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796 return 0;
2797 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798 return 0;
2799
2800 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2727 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801 strcpy(name, subsys->name); 2728 strcpy(name, subsys->name);
2802 strcat(name, "."); 2729 strcat(name, ".");
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2837 int err, ret = 0; 2764 int err, ret = 0;
2838 2765
2839 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2766 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2767 /* does cft->flags tell us to skip this file on @cgrp? */
2768 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2769 continue;
2770 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2771 continue;
2772
2840 if (is_add) 2773 if (is_add)
2841 err = cgroup_add_file(cgrp, subsys, cft); 2774 err = cgroup_add_file(cgrp, subsys, cft);
2842 else 2775 else
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
3044 write_unlock(&css_set_lock); 2977 write_unlock(&css_set_lock);
3045} 2978}
3046 2979
2980/**
2981 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2982 * @pos: the current position (%NULL to initiate traversal)
2983 * @cgroup: cgroup whose descendants to walk
2984 *
2985 * To be used by cgroup_for_each_descendant_pre(). Find the next
2986 * descendant to visit for pre-order traversal of @cgroup's descendants.
2987 */
2988struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2989 struct cgroup *cgroup)
2990{
2991 struct cgroup *next;
2992
2993 WARN_ON_ONCE(!rcu_read_lock_held());
2994
2995 /* if first iteration, pretend we just visited @cgroup */
2996 if (!pos) {
2997 if (list_empty(&cgroup->children))
2998 return NULL;
2999 pos = cgroup;
3000 }
3001
3002 /* visit the first child if exists */
3003 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3004 if (next)
3005 return next;
3006
3007 /* no child, visit my or the closest ancestor's next sibling */
3008 do {
3009 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3010 sibling);
3011 if (&next->sibling != &pos->parent->children)
3012 return next;
3013
3014 pos = pos->parent;
3015 } while (pos != cgroup);
3016
3017 return NULL;
3018}
3019EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3020
3021static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3022{
3023 struct cgroup *last;
3024
3025 do {
3026 last = pos;
3027 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3028 sibling);
3029 } while (pos);
3030
3031 return last;
3032}
3033
3034/**
3035 * cgroup_next_descendant_post - find the next descendant for post-order walk
3036 * @pos: the current position (%NULL to initiate traversal)
3037 * @cgroup: cgroup whose descendants to walk
3038 *
3039 * To be used by cgroup_for_each_descendant_post(). Find the next
3040 * descendant to visit for post-order traversal of @cgroup's descendants.
3041 */
3042struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3043 struct cgroup *cgroup)
3044{
3045 struct cgroup *next;
3046
3047 WARN_ON_ONCE(!rcu_read_lock_held());
3048
3049 /* if first iteration, visit the leftmost descendant */
3050 if (!pos) {
3051 next = cgroup_leftmost_descendant(cgroup);
3052 return next != cgroup ? next : NULL;
3053 }
3054
3055 /* if there's an unvisited sibling, visit its leftmost descendant */
3056 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3057 if (&next->sibling != &pos->parent->children)
3058 return cgroup_leftmost_descendant(next);
3059
3060 /* no sibling left, visit parent */
3061 next = pos->parent;
3062 return next != cgroup ? next : NULL;
3063}
3064EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3065
3047void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3066void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048 __acquires(css_set_lock) 3067 __acquires(css_set_lock)
3049{ 3068{
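cgroup_next_descendant_pre() gives callers a stateless pre-order iterator: visit the first child if there is one, otherwise the next sibling of the current node or of the closest ancestor that still has one, stopping once the walk climbs back to the starting cgroup. A minimal user-space sketch of the same stepping logic over a toy tree, without the RCU protection the kernel version requires; struct tnode and all names here are illustrative:

#include <stdio.h>

struct tnode {
        const char *name;
        struct tnode *parent;
        struct tnode *first_child;
        struct tnode *next_sibling;
};

/*
 * Pre-order step: return the first child if one exists, otherwise the
 * next sibling of the closest ancestor below @root that still has one.
 * A NULL @pos starts the walk, a NULL return ends it.
 */
static struct tnode *next_descendant_pre(struct tnode *pos, struct tnode *root)
{
        if (!pos)
                pos = root;

        if (pos->first_child)
                return pos->first_child;

        while (pos != root) {
                if (pos->next_sibling)
                        return pos->next_sibling;
                pos = pos->parent;
        }
        return NULL;
}

int main(void)
{
        struct tnode r = { "r", NULL, NULL, NULL };
        struct tnode a = { "a", &r, NULL, NULL };
        struct tnode b = { "b", &r, NULL, NULL };
        struct tnode c = { "c", &a, NULL, NULL };
        struct tnode *p;

        r.first_child = &a;
        a.next_sibling = &b;
        a.first_child = &c;

        for (p = NULL; (p = next_descendant_pre(p, &r)); )
                printf("%s ", p->name); /* prints: a c b */
        printf("\n");
        return 0;
}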
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3757 if (flags & POLLHUP) { 3776 if (flags & POLLHUP) {
3758 __remove_wait_queue(event->wqh, &event->wait); 3777 __remove_wait_queue(event->wqh, &event->wait);
3759 spin_lock(&cgrp->event_list_lock); 3778 spin_lock(&cgrp->event_list_lock);
3760 list_del(&event->list); 3779 list_del_init(&event->list);
3761 spin_unlock(&cgrp->event_list_lock); 3780 spin_unlock(&cgrp->event_list_lock);
3762 /* 3781 /*
3763 * We are in atomic context, but cgroup_event_remove() may 3782 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3913,7 @@ fail:
3894static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3913static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895 struct cftype *cft) 3914 struct cftype *cft)
3896{ 3915{
3897 return clone_children(cgrp); 3916 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3898} 3917}
3899 3918
3900static int cgroup_clone_children_write(struct cgroup *cgrp, 3919static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3902 u64 val) 3921 u64 val)
3903{ 3922{
3904 if (val) 3923 if (val)
3905 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3924 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3906 else 3925 else
3907 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3926 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3908 return 0; 3927 return 0;
3909} 3928}
3910 3929
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4017 css->flags = 0; 4036 css->flags = 0;
4018 css->id = NULL; 4037 css->id = NULL;
4019 if (cgrp == dummytop) 4038 if (cgrp == dummytop)
4020 set_bit(CSS_ROOT, &css->flags); 4039 css->flags |= CSS_ROOT;
4021 BUG_ON(cgrp->subsys[ss->subsys_id]); 4040 BUG_ON(cgrp->subsys[ss->subsys_id]);
4022 cgrp->subsys[ss->subsys_id] = css; 4041 cgrp->subsys[ss->subsys_id] = css;
4023 4042
4024 /* 4043 /*
4025 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4044 * css holds an extra ref to @cgrp->dentry which is put on the last
4026 * which is put on the last css_put(). dput() requires process 4045 * css_put(). dput() requires process context, which css_put() may
4027 * context, which css_put() may be called without. @css->dput_work 4046 * be called without. @css->dput_work will be used to invoke
4028 * will be used to invoke dput() asynchronously from css_put(). 4047 * dput() asynchronously from css_put().
4029 */ 4048 */
4030 INIT_WORK(&css->dput_work, css_dput_fn); 4049 INIT_WORK(&css->dput_work, css_dput_fn);
4031 if (ss->__DEPRECATED_clear_css_refs) 4050}
4032 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4051
4052/* invoke ->post_create() on a new CSS and mark it online if successful */
4053static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4054{
4055 int ret = 0;
4056
4057 lockdep_assert_held(&cgroup_mutex);
4058
4059 if (ss->css_online)
4060 ret = ss->css_online(cgrp);
4061 if (!ret)
4062 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4063 return ret;
4064}
4065
4066/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
4067static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4068 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4069{
4070 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4071
4072 lockdep_assert_held(&cgroup_mutex);
4073
4074 if (!(css->flags & CSS_ONLINE))
4075 return;
4076
4077 /*
4078 * css_offline() should be called with cgroup_mutex unlocked. See
4079 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4080 * details. This temporary unlocking should go away once
4081 * cgroup_mutex is unexported from controllers.
4082 */
4083 if (ss->css_offline) {
4084 mutex_unlock(&cgroup_mutex);
4085 ss->css_offline(cgrp);
4086 mutex_lock(&cgroup_mutex);
4087 }
4088
4089 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4033} 4090}
4034 4091
4035/* 4092/*
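online_css() and offline_css() split subsystem setup into an allocation step (css_alloc) and a separate go-live notification guarded by CSS_ONLINE, so css_offline() is only ever delivered to a css that actually came online. A bare user-space sketch of that guarded lifecycle, with hypothetical obj_* helpers standing in for the callbacks and no locking:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        bool online;
};

static struct obj *obj_alloc(void)              /* cf. ->css_alloc() */
{
        return calloc(1, sizeof(struct obj));
}

static int obj_online(struct obj *o)            /* cf. online_css() */
{
        /* subsystem-specific setup would go here and may fail */
        o->online = true;
        return 0;
}

static void obj_offline(struct obj *o)          /* cf. offline_css() */
{
        if (!o->online)         /* never came online: nothing to undo */
                return;
        /* subsystem-specific teardown */
        o->online = false;
}

static void obj_free(struct obj *o)             /* cf. ->css_free() */
{
        free(o);
}

int main(void)
{
        struct obj *o = obj_alloc();

        if (!o)
                return 1;
        if (obj_online(o)) {    /* creation failed after alloc ... */
                obj_free(o);    /* ... so free without ever offlining */
                return 1;
        }
        obj_offline(o);         /* normal teardown: offline first, then free */
        obj_free(o);
        return 0;
}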
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4049 struct cgroup_subsys *ss; 4106 struct cgroup_subsys *ss;
4050 struct super_block *sb = root->sb; 4107 struct super_block *sb = root->sb;
4051 4108
4109 /* allocate the cgroup and its ID, 0 is reserved for the root */
4052 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4110 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053 if (!cgrp) 4111 if (!cgrp)
4054 return -ENOMEM; 4112 return -ENOMEM;
4055 4113
4114 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4115 if (cgrp->id < 0)
4116 goto err_free_cgrp;
4117
4118 /*
4119 * Only live parents can have children. Note that the liveliness
4120 * check isn't strictly necessary because cgroup_mkdir() and
4121 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4122 * anyway so that locking is contained inside cgroup proper and we
4123 * don't get nasty surprises if we ever grow another caller.
4124 */
4125 if (!cgroup_lock_live_group(parent)) {
4126 err = -ENODEV;
4127 goto err_free_id;
4128 }
4129
4056 /* Grab a reference on the superblock so the hierarchy doesn't 4130 /* Grab a reference on the superblock so the hierarchy doesn't
4057 * get deleted on unmount if there are child cgroups. This 4131 * get deleted on unmount if there are child cgroups. This
4058 * can be done outside cgroup_mutex, since the sb can't 4132 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4060 * fs */ 4134 * fs */
4061 atomic_inc(&sb->s_active); 4135 atomic_inc(&sb->s_active);
4062 4136
4063 mutex_lock(&cgroup_mutex);
4064
4065 init_cgroup_housekeeping(cgrp); 4137 init_cgroup_housekeeping(cgrp);
4066 4138
4067 cgrp->parent = parent; 4139 cgrp->parent = parent;
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4071 if (notify_on_release(parent)) 4143 if (notify_on_release(parent))
4072 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4144 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 4145
4074 if (clone_children(parent)) 4146 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4075 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4147 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4076 4148
4077 for_each_subsys(root, ss) { 4149 for_each_subsys(root, ss) {
4078 struct cgroup_subsys_state *css; 4150 struct cgroup_subsys_state *css;
4079 4151
4080 css = ss->create(cgrp); 4152 css = ss->css_alloc(cgrp);
4081 if (IS_ERR(css)) { 4153 if (IS_ERR(css)) {
4082 err = PTR_ERR(css); 4154 err = PTR_ERR(css);
4083 goto err_destroy; 4155 goto err_free_all;
4084 } 4156 }
4085 init_cgroup_css(css, ss, cgrp); 4157 init_cgroup_css(css, ss, cgrp);
4086 if (ss->use_id) { 4158 if (ss->use_id) {
4087 err = alloc_css_id(ss, parent, cgrp); 4159 err = alloc_css_id(ss, parent, cgrp);
4088 if (err) 4160 if (err)
4089 goto err_destroy; 4161 goto err_free_all;
4090 } 4162 }
4091 /* At error, ->destroy() callback has to free assigned ID. */ 4163 }
4092 if (clone_children(parent) && ss->post_clone) 4164
4093 ss->post_clone(cgrp); 4165 /*
4166 * Create directory. cgroup_create_file() returns with the new
4167 * directory locked on success so that it can be populated without
4168 * dropping cgroup_mutex.
4169 */
4170 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4171 if (err < 0)
4172 goto err_free_all;
4173 lockdep_assert_held(&dentry->d_inode->i_mutex);
4174
4175 /* allocation complete, commit to creation */
4176 dentry->d_fsdata = cgrp;
4177 cgrp->dentry = dentry;
4178 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4179 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4180 root->number_of_cgroups++;
4181
4182 /* each css holds a ref to the cgroup's dentry */
4183 for_each_subsys(root, ss)
4184 dget(dentry);
4185
4186 /* creation succeeded, notify subsystems */
4187 for_each_subsys(root, ss) {
4188 err = online_css(ss, cgrp);
4189 if (err)
4190 goto err_destroy;
4094 4191
4095 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4192 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096 parent->parent) { 4193 parent->parent) {
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4102 } 4199 }
4103 } 4200 }
4104 4201
4105 list_add(&cgrp->sibling, &cgrp->parent->children);
4106 root->number_of_cgroups++;
4107
4108 err = cgroup_create_dir(cgrp, dentry, mode);
4109 if (err < 0)
4110 goto err_remove;
4111
4112 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113 for_each_subsys(root, ss)
4114 if (!ss->__DEPRECATED_clear_css_refs)
4115 dget(dentry);
4116
4117 /* The cgroup directory was pre-locked for us */
4118 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119
4120 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121
4122 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4202 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123 /* If err < 0, we have a half-filled directory - oh well ;) */ 4203 if (err)
4204 goto err_destroy;
4124 4205
4125 mutex_unlock(&cgroup_mutex); 4206 mutex_unlock(&cgroup_mutex);
4126 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4207 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 4208
4128 return 0; 4209 return 0;
4129 4210
4130 err_remove: 4211err_free_all:
4131
4132 list_del(&cgrp->sibling);
4133 root->number_of_cgroups--;
4134
4135 err_destroy:
4136
4137 for_each_subsys(root, ss) { 4212 for_each_subsys(root, ss) {
4138 if (cgrp->subsys[ss->subsys_id]) 4213 if (cgrp->subsys[ss->subsys_id])
4139 ss->destroy(cgrp); 4214 ss->css_free(cgrp);
4140 } 4215 }
4141
4142 mutex_unlock(&cgroup_mutex); 4216 mutex_unlock(&cgroup_mutex);
4143
4144 /* Release the reference count that we took on the superblock */ 4217 /* Release the reference count that we took on the superblock */
4145 deactivate_super(sb); 4218 deactivate_super(sb);
4146 4219err_free_id:
4220 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4221err_free_cgrp:
4147 kfree(cgrp); 4222 kfree(cgrp);
4148 return err; 4223 return err;
4224
4225err_destroy:
4226 cgroup_destroy_locked(cgrp);
4227 mutex_unlock(&cgroup_mutex);
4228 mutex_unlock(&dentry->d_inode->i_mutex);
4229 return err;
4149} 4230}
4150 4231
4151static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4232static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4197 return 0; 4278 return 0;
4198} 4279}
4199 4280
4200/* 4281static int cgroup_destroy_locked(struct cgroup *cgrp)
4201 * Atomically mark all (or else none) of the cgroup's CSS objects as 4282 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4202 * CSS_REMOVED. Return true on success, or false if the cgroup has
4203 * busy subsystems. Call with cgroup_mutex held
4204 *
4205 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206 * not, cgroup removal behaves differently.
4207 *
4208 * If clear is set, css refcnt for the subsystem should be zero before
4209 * cgroup removal can be committed. This is implemented by
4210 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211 * called multiple times until all css refcnts reach zero and is allowed to
4212 * veto removal on any invocation. This behavior is deprecated and will be
4213 * removed as soon as the existing user (memcg) is updated.
4214 *
4215 * If clear is not set, each css holds an extra reference to the cgroup's
4216 * dentry and cgroup removal proceeds regardless of css refs.
4217 * ->pre_destroy() will be called at least once and is not allowed to fail.
4218 * On the last put of each css, whenever that may be, the extra dentry ref
4219 * is put so that dentry destruction happens only after all css's are
4220 * released.
4221 */
4222static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223{ 4283{
4284 struct dentry *d = cgrp->dentry;
4285 struct cgroup *parent = cgrp->parent;
4286 DEFINE_WAIT(wait);
4287 struct cgroup_event *event, *tmp;
4224 struct cgroup_subsys *ss; 4288 struct cgroup_subsys *ss;
4225 unsigned long flags; 4289 LIST_HEAD(tmp_list);
4226 bool failed = false; 4290
4291 lockdep_assert_held(&d->d_inode->i_mutex);
4292 lockdep_assert_held(&cgroup_mutex);
4227 4293
4228 local_irq_save(flags); 4294 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4295 return -EBUSY;
4229 4296
4230 /* 4297 /*
4231 * Block new css_tryget() by deactivating refcnt. If all refcnts 4298 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4232 * for subsystems w/ clear_css_refs set were 1 at the moment of 4299 * removed. This makes future css_tryget() and child creation
4233 * deactivation, we succeeded. 4300 * attempts fail thus maintaining the removal conditions verified
4301 * above.
4234 */ 4302 */
4235 for_each_subsys(cgrp->root, ss) { 4303 for_each_subsys(cgrp->root, ss) {
4236 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4304 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 4305
4238 WARN_ON(atomic_read(&css->refcnt) < 0); 4306 WARN_ON(atomic_read(&css->refcnt) < 0);
4239 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4307 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240
4241 if (ss->__DEPRECATED_clear_css_refs)
4242 failed |= css_refcnt(css) != 1;
4243 }
4244
4245 /*
4246 * If succeeded, set REMOVED and put all the base refs; otherwise,
4247 * restore refcnts to positive values. Either way, all in-progress
4248 * css_tryget() will be released.
4249 */
4250 for_each_subsys(cgrp->root, ss) {
4251 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252
4253 if (!failed) {
4254 set_bit(CSS_REMOVED, &css->flags);
4255 css_put(css);
4256 } else {
4257 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258 }
4259 } 4308 }
4309 set_bit(CGRP_REMOVED, &cgrp->flags);
4260 4310
4261 local_irq_restore(flags); 4311 /* tell subsystems to initate destruction */
4262 return !failed; 4312 for_each_subsys(cgrp->root, ss)
4263} 4313 offline_css(ss, cgrp);
4264
4265static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266{
4267 struct cgroup *cgrp = dentry->d_fsdata;
4268 struct dentry *d;
4269 struct cgroup *parent;
4270 DEFINE_WAIT(wait);
4271 struct cgroup_event *event, *tmp;
4272 int ret;
4273
4274 /* the vfs holds both inode->i_mutex already */
4275again:
4276 mutex_lock(&cgroup_mutex);
4277 if (atomic_read(&cgrp->count) != 0) {
4278 mutex_unlock(&cgroup_mutex);
4279 return -EBUSY;
4280 }
4281 if (!list_empty(&cgrp->children)) {
4282 mutex_unlock(&cgroup_mutex);
4283 return -EBUSY;
4284 }
4285 mutex_unlock(&cgroup_mutex);
4286
4287 /*
4288 * In general, subsystem has no css->refcnt after pre_destroy(). But
4289 * in racy cases, subsystem may have to get css->refcnt after
4290 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4291 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4292 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4293 * and subsystem's reference count handling. Please see css_get/put
4294 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4295 */
4296 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 4314
4298 /* 4315 /*
4299 * Call pre_destroy handlers of subsys. Notify subsystems 4316 * Put all the base refs. Each css holds an extra reference to the
4300 * that rmdir() request comes. 4317 * cgroup's dentry and cgroup removal proceeds regardless of css
4318 * refs. On the last put of each css, whenever that may be, the
4319 * extra dentry ref is put so that dentry destruction happens only
4320 * after all css's are released.
4301 */ 4321 */
4302 ret = cgroup_call_pre_destroy(cgrp); 4322 for_each_subsys(cgrp->root, ss)
4303 if (ret) { 4323 css_put(cgrp->subsys[ss->subsys_id]);
4304 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305 return ret;
4306 }
4307
4308 mutex_lock(&cgroup_mutex);
4309 parent = cgrp->parent;
4310 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312 mutex_unlock(&cgroup_mutex);
4313 return -EBUSY;
4314 }
4315 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316 if (!cgroup_clear_css_refs(cgrp)) {
4317 mutex_unlock(&cgroup_mutex);
4318 /*
4319 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320 * prepare_to_wait(), we need to check this flag.
4321 */
4322 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323 schedule();
4324 finish_wait(&cgroup_rmdir_waitq, &wait);
4325 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326 if (signal_pending(current))
4327 return -EINTR;
4328 goto again;
4329 }
4330 /* NO css_tryget() can success after here. */
4331 finish_wait(&cgroup_rmdir_waitq, &wait);
4332 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 4324
4334 raw_spin_lock(&release_list_lock); 4325 raw_spin_lock(&release_list_lock);
4335 set_bit(CGRP_REMOVED, &cgrp->flags);
4336 if (!list_empty(&cgrp->release_list)) 4326 if (!list_empty(&cgrp->release_list))
4337 list_del_init(&cgrp->release_list); 4327 list_del_init(&cgrp->release_list);
4338 raw_spin_unlock(&release_list_lock); 4328 raw_spin_unlock(&release_list_lock);
4339 4329
4340 /* delete this cgroup from parent->children */ 4330 /* delete this cgroup from parent->children */
4341 list_del_init(&cgrp->sibling); 4331 list_del_rcu(&cgrp->sibling);
4342
4343 list_del_init(&cgrp->allcg_node); 4332 list_del_init(&cgrp->allcg_node);
4344 4333
4345 d = dget(cgrp->dentry); 4334 dget(d);
4346
4347 cgroup_d_remove_dir(d); 4335 cgroup_d_remove_dir(d);
4348 dput(d); 4336 dput(d);
4349 4337
@@ -4353,21 +4341,35 @@ again:
4353 /* 4341 /*
4354 * Unregister events and notify userspace. 4342 * Unregister events and notify userspace.
4355 * Notify userspace about cgroup removing only after rmdir of cgroup 4343 * Notify userspace about cgroup removing only after rmdir of cgroup
4356 * directory to avoid race between userspace and kernelspace 4344 * directory to avoid race between userspace and kernelspace. Use
4345 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4346 * cgroup_event_wake() is called with the wait queue head locked,
4347 * remove_wait_queue() cannot be called while holding event_list_lock.
4357 */ 4348 */
4358 spin_lock(&cgrp->event_list_lock); 4349 spin_lock(&cgrp->event_list_lock);
4359 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4350 list_splice_init(&cgrp->event_list, &tmp_list);
4360 list_del(&event->list); 4351 spin_unlock(&cgrp->event_list_lock);
4352 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4353 list_del_init(&event->list);
4361 remove_wait_queue(event->wqh, &event->wait); 4354 remove_wait_queue(event->wqh, &event->wait);
4362 eventfd_signal(event->eventfd, 1); 4355 eventfd_signal(event->eventfd, 1);
4363 schedule_work(&event->remove); 4356 schedule_work(&event->remove);
4364 } 4357 }
4365 spin_unlock(&cgrp->event_list_lock);
4366 4358
4367 mutex_unlock(&cgroup_mutex);
4368 return 0; 4359 return 0;
4369} 4360}
4370 4361
4362static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4363{
4364 int ret;
4365
4366 mutex_lock(&cgroup_mutex);
4367 ret = cgroup_destroy_locked(dentry->d_fsdata);
4368 mutex_unlock(&cgroup_mutex);
4369
4370 return ret;
4371}
4372
4371static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4373static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372{ 4374{
4373 INIT_LIST_HEAD(&ss->cftsets); 4375 INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4388 4390
4389 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4391 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 4392
4393 mutex_lock(&cgroup_mutex);
4394
4391 /* init base cftset */ 4395 /* init base cftset */
4392 cgroup_init_cftsets(ss); 4396 cgroup_init_cftsets(ss);
4393 4397
4394 /* Create the top cgroup state for this subsystem */ 4398 /* Create the top cgroup state for this subsystem */
4395 list_add(&ss->sibling, &rootnode.subsys_list); 4399 list_add(&ss->sibling, &rootnode.subsys_list);
4396 ss->root = &rootnode; 4400 ss->root = &rootnode;
4397 css = ss->create(dummytop); 4401 css = ss->css_alloc(dummytop);
4398 /* We don't handle early failures gracefully */ 4402 /* We don't handle early failures gracefully */
4399 BUG_ON(IS_ERR(css)); 4403 BUG_ON(IS_ERR(css));
4400 init_cgroup_css(css, ss, dummytop); 4404 init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4403 * pointer to this state - since the subsystem is 4407 * pointer to this state - since the subsystem is
4404 * newly registered, all tasks and hence the 4408 * newly registered, all tasks and hence the
4405 * init_css_set is in the subsystem's top cgroup. */ 4409 * init_css_set is in the subsystem's top cgroup. */
4406 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4410 init_css_set.subsys[ss->subsys_id] = css;
4407 4411
4408 need_forkexit_callback |= ss->fork || ss->exit; 4412 need_forkexit_callback |= ss->fork || ss->exit;
4409 4413
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4413 BUG_ON(!list_empty(&init_task.tasks)); 4417 BUG_ON(!list_empty(&init_task.tasks));
4414 4418
4415 ss->active = 1; 4419 ss->active = 1;
4420 BUG_ON(online_css(ss, dummytop));
4421
4422 mutex_unlock(&cgroup_mutex);
4416 4423
4417 /* this function shouldn't be used with modular subsystems, since they 4424 /* this function shouldn't be used with modular subsystems, since they
4418 * need to register a subsys_id, among other things */ 4425 * need to register a subsys_id, among other things */
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4430 */ 4437 */
4431int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4438int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432{ 4439{
4433 int i;
4434 struct cgroup_subsys_state *css; 4440 struct cgroup_subsys_state *css;
4441 int i, ret;
4435 4442
4436 /* check name and function validity */ 4443 /* check name and function validity */
4437 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4444 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438 ss->create == NULL || ss->destroy == NULL) 4445 ss->css_alloc == NULL || ss->css_free == NULL)
4439 return -EINVAL; 4446 return -EINVAL;
4440 4447
4441 /* 4448 /*
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4464 subsys[ss->subsys_id] = ss; 4471 subsys[ss->subsys_id] = ss;
4465 4472
4466 /* 4473 /*
4467 * no ss->create seems to need anything important in the ss struct, so 4474 * no ss->css_alloc seems to need anything important in the ss
4468 * this can happen first (i.e. before the rootnode attachment). 4475 * struct, so this can happen first (i.e. before the rootnode
4476 * attachment).
4469 */ 4477 */
4470 css = ss->create(dummytop); 4478 css = ss->css_alloc(dummytop);
4471 if (IS_ERR(css)) { 4479 if (IS_ERR(css)) {
4472 /* failure case - need to deassign the subsys[] slot. */ 4480 /* failure case - need to deassign the subsys[] slot. */
4473 subsys[ss->subsys_id] = NULL; 4481 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4482 init_cgroup_css(css, ss, dummytop); 4490 init_cgroup_css(css, ss, dummytop);
4483 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4491 /* init_idr must be after init_cgroup_css because it sets css->id. */
4484 if (ss->use_id) { 4492 if (ss->use_id) {
4485 int ret = cgroup_init_idr(ss, css); 4493 ret = cgroup_init_idr(ss, css);
4486 if (ret) { 4494 if (ret)
4487 dummytop->subsys[ss->subsys_id] = NULL; 4495 goto err_unload;
4488 ss->destroy(dummytop);
4489 subsys[ss->subsys_id] = NULL;
4490 mutex_unlock(&cgroup_mutex);
4491 return ret;
4492 }
4493 } 4496 }
4494 4497
4495 /* 4498 /*
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4522 write_unlock(&css_set_lock); 4525 write_unlock(&css_set_lock);
4523 4526
4524 ss->active = 1; 4527 ss->active = 1;
4528 ret = online_css(ss, dummytop);
4529 if (ret)
4530 goto err_unload;
4525 4531
4526 /* success! */ 4532 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4533 mutex_unlock(&cgroup_mutex);
4528 return 0; 4534 return 0;
4535
4536err_unload:
4537 mutex_unlock(&cgroup_mutex);
4538 /* @ss can't be mounted here as try_module_get() would fail */
4539 cgroup_unload_subsys(ss);
4540 return ret;
4529} 4541}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4542EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4543
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4564 BUG_ON(ss->root != &rootnode);
4553 4565
4554 mutex_lock(&cgroup_mutex); 4566 mutex_lock(&cgroup_mutex);
4567
4568 offline_css(ss, dummytop);
4569 ss->active = 0;
4570
4571 if (ss->use_id) {
4572 idr_remove_all(&ss->idr);
4573 idr_destroy(&ss->idr);
4574 }
4575
4555 /* deassign the subsys_id */ 4576 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4577 subsys[ss->subsys_id] = NULL;
4557 4578
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 struct css_set *cg = link->cg; 4588 struct css_set *cg = link->cg;
4568 4589
4569 hlist_del(&cg->hlist); 4590 hlist_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4591 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4592 hhead = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4593 hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4575 write_unlock(&css_set_lock); 4595 write_unlock(&css_set_lock);
4576 4596
4577 /* 4597 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4598 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4599 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4600 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4601 * takes care of freeing the css_id.
4582 */ 4602 */
4583 ss->destroy(dummytop); 4603 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4604 dummytop->subsys[ss->subsys_id] = NULL;
4585 4605
4586 mutex_unlock(&cgroup_mutex); 4606 mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void)
4624 4644
4625 BUG_ON(!ss->name); 4645 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4646 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4647 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4648 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4649 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4650 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4651 ss->name, ss->subsys_id);
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4852}
4833 4853
4834/** 4854/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4855 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4856 * @child: the task in question
4865 * 4857 *
4866 * Adds the task to the list running through its css_set if necessary. 4858 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4859 * call the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4860 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4861 * cgroup_iter_start() - to guarantee that the new task ends up on its
4862 * list.
4870 */ 4863 */
4871void cgroup_post_fork(struct task_struct *child) 4864void cgroup_post_fork(struct task_struct *child)
4872{ 4865{
4866 int i;
4867
4873 /* 4868 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4869 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4870 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4884 task_unlock(child);
4890 write_unlock(&css_set_lock); 4885 write_unlock(&css_set_lock);
4891 } 4886 }
4887
4888 /*
4889 * Call ss->fork(). This must happen after @child is linked on
4890 * css_set; otherwise, @child might change state between ->fork()
4891 * and addition to css_set.
4892 */
4893 if (need_forkexit_callback) {
4894 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4895 struct cgroup_subsys *ss = subsys[i];
4896
4897 /*
4898 * fork/exit callbacks are supported only for
4899 * builtin subsystems and we don't need further
4900 * synchronization as they never go away.
4901 */
4902 if (!ss || ss->module)
4903 continue;
4904
4905 if (ss->fork)
4906 ss->fork(child);
4907 }
4908 }
4892} 4909}
4910
4893/** 4911/**
4894 * cgroup_exit - detach cgroup from exiting task 4912 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4913 * @tsk: pointer to task_struct of exiting process
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5040/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5041bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5042{
5025 do { 5043 while (true) {
5026 int v = css_refcnt(css); 5044 int t, v;
5027 5045
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5046 v = css_refcnt(css);
5047 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5048 if (likely(t == v))
5029 return true; 5049 return true;
5050 else if (t < 0)
5051 return false;
5030 cpu_relax(); 5052 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5053 }
5032
5033 return false;
5034} 5054}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5055EXPORT_SYMBOL_GPL(__css_tryget);
5036 5056
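With CSS_REMOVED gone, __css_tryget() relies on the sign of the counter alone: cgroup_destroy_locked() pushes the refcount negative with CSS_DEACT_BIAS, so a concurrent tryget observes a negative value from the cmpxchg and bails out. A small user-space sketch of this biased-refcount pattern using C11 atomics; the ref_* names and the choice of INT_MIN as the bias are illustrative, not the kernel's definitions:

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEACT_BIAS      INT_MIN         /* large negative bias; illustrative value */

struct ref {
        atomic_int refcnt;              /* starts at 1: the owner's base reference */
};

/* take a reference unless the owner has already deactivated the count */
static bool ref_tryget(struct ref *r)
{
        int v = atomic_load(&r->refcnt);

        while (v > 0) {
                /* on failure the CAS reloads v, so the loop re-checks the sign */
                if (atomic_compare_exchange_weak(&r->refcnt, &v, v + 1))
                        return true;
        }
        return false;                   /* count <= 0: deactivated or gone */
}

/* owner side: make every future tryget fail, then drop the base reference */
static void ref_kill(struct ref *r)
{
        atomic_fetch_add(&r->refcnt, DEACT_BIAS);
        atomic_fetch_sub(&r->refcnt, 1);
}

int main(void)
{
        struct ref r = { 1 };

        printf("before kill: %d\n", ref_tryget(&r));    /* 1 */
        ref_kill(&r);
        printf("after kill:  %d\n", ref_tryget(&r));    /* 0 */
        return 0;
}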
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5069 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5070 check_for_release(cgrp);
5051 } 5071 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5072 break;
5054 case 0: 5073 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5074 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5075 break;
5058 } 5076 }
5059 rcu_read_unlock(); 5077 rcu_read_unlock();
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5457}
5440 5458
5441#ifdef CONFIG_CGROUP_DEBUG 5459#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5460static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5461{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5462 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5463
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5467 return css;
5450} 5468}
5451 5469
5452static void debug_destroy(struct cgroup *cont) 5470static void debug_css_free(struct cgroup *cont)
5453{ 5471{
5454 kfree(cont->subsys[debug_subsys_id]); 5472 kfree(cont->subsys[debug_subsys_id]);
5455} 5473}
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = {
5578 5596
5579struct cgroup_subsys debug_subsys = { 5597struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5598 .name = "debug",
5581 .create = debug_create, 5599 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5600 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5601 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5602 .base_cftypes = debug_files,
5585}; 5603};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
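Editorial note: the switch from a three-value enum to independent flag bits means a cgroup can be freezing for two reasons at once (itself and an ancestor), and the reported string is derived by precedence rather than stored. A standalone sketch of how the bits are meant to compose; the flag names mirror the enum above, everything else is illustrative:

	#include <stdio.h>

	enum {
		F_ONLINE	  = 1 << 0,
		F_FREEZING_SELF	  = 1 << 1,
		F_FREEZING_PARENT = 1 << 2,
		F_FROZEN	  = 1 << 3,
		F_FREEZING	  = F_FREEZING_SELF | F_FREEZING_PARENT,
	};

	static const char *state_str(unsigned int state)
	{
		if (state & F_FROZEN)
			return "FROZEN";
		if (state & F_FREEZING)		/* either self or an ancestor */
			return "FREEZING";
		return "THAWED";
	}

	int main(void)
	{
		/* freezing only via an ancestor: reported FREEZING until FROZEN is set */
		printf("%s\n", state_str(F_ONLINE | F_FREEZING_PARENT));
		/* fully frozen, regardless of which FREEZING bits are set */
		printf("%s\n", state_str(F_ONLINE | F_FREEZING_SELF | F_FROZEN));
		/* no bits at all reads as THAWED, matching freezer_state_strs(0) */
		printf("%s\n", state_str(0));
		return 0;
	}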
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
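Editorial note: freezer_attach() only ever clears FROZEN on the way up; it never re-freezes anything, so the worst outcome of racing with update_if_frozen() is that an ancestor loses its FROZEN bit slightly later. A hedged userspace sketch of that upward walk, using plain structs and pthread locks with an assumed bit layout:

	#include <pthread.h>
	#include <stdbool.h>

	enum { F_FREEZING = 0x6, F_FROZEN = 0x8 };	/* assumed bit layout */

	struct fz {
		pthread_mutex_t lock;
		unsigned int state;
		struct fz *parent;
	};

	/* @f itself already had FROZEN cleared under its own lock; propagate up. */
	static void clear_frozen_upwards(struct fz *f)
	{
		bool clear = true;

		while (clear && (f = f->parent)) {
			pthread_mutex_lock(&f->lock);
			f->state &= ~F_FROZEN;
			/* a non-freezing ancestor cannot have frozen ancestors above it */
			clear = f->state & F_FREEZING;
			pthread_mutex_unlock(&f->lock);
		}
	}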
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
260 * only partially frozen when we exited write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
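Editorial note: the read path now aggregates bottom-up; every descendant is settled post-order before the cgroup itself, so a cgroup can only report FROZEN once all of its live children and all of its tasks are frozen. A recursive userspace sketch that mirrors update_if_frozen() plus freezer_read()'s post-order walk (assumed tree structure, not kernel API):

	#include <stdbool.h>

	enum { F_ONLINE = 0x1, F_FREEZING = 0x6, F_FROZEN = 0x8 };

	struct node {
		unsigned int state;
		bool all_tasks_frozen;	/* stand-in for the cgroup task iteration */
		struct node **child;
		int nr_child;
	};

	static void settle_frozen(struct node *n)
	{
		/* post-order: settle the children before looking at them */
		for (int i = 0; i < n->nr_child; i++)
			settle_frozen(n->child[i]);

		if (!(n->state & F_FREEZING) || (n->state & F_FROZEN))
			return;

		for (int i = 0; i < n->nr_child; i++) {
			struct node *c = n->child[i];

			if ((c->state & F_ONLINE) && !(c->state & F_FROZEN))
				return;	/* a live child hasn't finished freezing */
		}

		if (n->all_tasks_frozen)
			n->state |= F_FROZEN;
	}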
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_css_online().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
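Editorial note: the write path is the mirror image of the read path: the target is updated first, then descendants are visited pre-order so each one inherits its already-updated parent's FREEZING state as FREEZING_PARENT. A recursive sketch of that top-down propagation, under the same assumed flag layout as the sketches above:

	enum { F_FREEZING_SELF = 0x2, F_FREEZING_PARENT = 0x4, F_FREEZING = 0x6 };

	struct fnode {
		unsigned int state;
		struct fnode **child;
		int nr_child;
	};

	static void apply(struct fnode *n, int freeze, unsigned int bit)
	{
		if (freeze)
			n->state |= bit;
		else
			n->state &= ~bit;
		/* the real freezer_apply_state() also freezes/thaws the tasks here */
	}

	static void propagate(struct fnode *n)
	{
		for (int i = 0; i < n->nr_child; i++) {
			struct fnode *c = n->child[i];

			/* pre-order: @n was updated before any of its descendants */
			apply(c, n->state & F_FREEZING, F_FREEZING_PARENT);
			propagate(c);
		}
	}

	static void change_state(struct fnode *n, int freeze)
	{
		apply(n, freeze, F_FREEZING_SELF);
		propagate(n);
	}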
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..b017887d632f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
1784}; 1784};
1785 1785
1786/* 1786/*
1787 * post_clone() is called during cgroup_create() when the 1787 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1788 * cont: control group that the new cpuset will be part of
1826 */ 1789 */
1827 1790
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1792{
1830 struct cpuset *cs; 1793 struct cgroup *parent_cg = cont->parent;
1831 struct cpuset *parent; 1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1832 1796
1833 if (!cont->parent) { 1797 if (!parent_cg)
1834 return &top_cpuset.css; 1798 return &top_cpuset.css;
1835 } 1799 parent = cgroup_cs(parent_cg);
1836 parent = cgroup_cs(cont->parent); 1800
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1802 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1803 return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1855 1819
1856 cs->parent = parent; 1820 cs->parent = parent;
1857 number_of_cpusets++; 1821 number_of_cpusets++;
1858 return &cs->css ; 1822
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1859} 1852}
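Editorial note: the clone-children handling folded into cpuset_css_alloc() boils down to one decision: copy the parent's masks only when the flag is set and no sibling is marked exclusive. A standalone sketch of that decision with stand-in types (not the kernel's cpumask/nodemask):

	#include <stdbool.h>

	struct cs {
		bool cpu_exclusive;
		bool mem_exclusive;
		unsigned long cpus;	/* stand-in for cpus_allowed */
		unsigned long mems;	/* stand-in for mems_allowed */
	};

	static bool may_clone(const struct cs *sibling, int nr, bool clone_children)
	{
		if (!clone_children)
			return false;
		for (int i = 0; i < nr; i++)
			if (sibling[i].cpu_exclusive || sibling[i].mem_exclusive)
				return false;	/* don't hand out an exclusive resource */
		return true;
	}

	static void maybe_clone_parent(struct cs *child, const struct cs *parent,
				       const struct cs *sibling, int nr, bool flag)
	{
		if (!may_clone(sibling, nr, flag))
			return;		/* child keeps its default (empty) masks */
		child->cpus = parent->cpus;
		child->mems = parent->mems;
	}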
1860 1853
1861/* 1854/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1864 * will call async_rebuild_sched_domains(). 1857 * will call async_rebuild_sched_domains().
1865 */ 1858 */
1866 1859
1867static void cpuset_destroy(struct cgroup *cont) 1860static void cpuset_css_free(struct cgroup *cont)
1868{ 1861{
1869 struct cpuset *cs = cgroup_cs(cont); 1862 struct cpuset *cs = cgroup_cs(cont);
1870 1863
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
1878 1871
1879struct cgroup_subsys cpuset_subsys = { 1872struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1873 .name = "cpuset",
1881 .create = cpuset_create, 1874 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1875 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1876 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach, 1877 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1878 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1879 .base_cftypes = files,
1888 .early_init = 1, 1880 .early_init = 1,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134d..f9ff5493171d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7434,7 +7434,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7434device_initcall(perf_event_sysfs_init);
7435 7435
7436#ifdef CONFIG_CGROUP_PERF 7436#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7438{
7439 struct perf_cgroup *jc; 7439 struct perf_cgroup *jc;
7440 7440
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7451 return &jc->css;
7452} 7452}
7453 7453
7454static void perf_cgroup_destroy(struct cgroup *cont) 7454static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7455{
7456 struct perf_cgroup *jc; 7456 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7492struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7493 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7494 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7495 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7496 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499 7499
diff --git a/kernel/fork.c b/kernel/fork.c
index 850dde1e0c84..79de9f99a48d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1137,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1137{ 1137{
1138 int retval; 1138 int retval;
1139 struct task_struct *p; 1139 struct task_struct *p;
1140 int cgroup_callbacks_done = 0;
1141 1140
1142 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1143 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
@@ -1395,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1395 INIT_LIST_HEAD(&p->thread_group); 1394 INIT_LIST_HEAD(&p->thread_group);
1396 p->task_works = NULL; 1395 p->task_works = NULL;
1397 1396
1398 /* Now that the task is set up, run cgroup callbacks if
1399 * necessary. We need to run them before the task is visible
1400 * on the tasklist. */
1401 cgroup_fork_callbacks(p);
1402 cgroup_callbacks_done = 1;
1403
1404 /* Need tasklist lock for parent etc handling! */ 1397 /* Need tasklist lock for parent etc handling! */
1405 write_lock_irq(&tasklist_lock); 1398 write_lock_irq(&tasklist_lock);
1406 1399
@@ -1505,7 +1498,7 @@ bad_fork_cleanup_cgroup:
1505#endif 1498#endif
1506 if (clone_flags & CLONE_THREAD) 1499 if (clone_flags & CLONE_THREAD)
1507 threadgroup_change_end(current); 1500 threadgroup_change_end(current);
1508 cgroup_exit(p, cgroup_callbacks_done); 1501 cgroup_exit(p, 0);
1509 delayacct_tsk_free(p); 1502 delayacct_tsk_free(p);
1510 module_put(task_thread_info(p)->exec_domain->module); 1503 module_put(task_thread_info(p)->exec_domain->module);
1511bad_fork_cleanup_count: 1504bad_fork_cleanup_count:
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5066a61f971..6271b89f87ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7484,7 +7484,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7484 struct task_group, css); 7484 struct task_group, css);
7485} 7485}
7486 7486
7487static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7487static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7488{ 7488{
7489 struct task_group *tg, *parent; 7489 struct task_group *tg, *parent;
7490 7490
@@ -7501,7 +7501,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7501 return &tg->css; 7501 return &tg->css;
7502} 7502}
7503 7503
7504static void cpu_cgroup_destroy(struct cgroup *cgrp) 7504static void cpu_cgroup_css_free(struct cgroup *cgrp)
7505{ 7505{
7506 struct task_group *tg = cgroup_tg(cgrp); 7506 struct task_group *tg = cgroup_tg(cgrp);
7507 7507
@@ -7861,8 +7861,8 @@ static struct cftype cpu_files[] = {
7861 7861
7862struct cgroup_subsys cpu_cgroup_subsys = { 7862struct cgroup_subsys cpu_cgroup_subsys = {
7863 .name = "cpu", 7863 .name = "cpu",
7864 .create = cpu_cgroup_create, 7864 .css_alloc = cpu_cgroup_css_alloc,
7865 .destroy = cpu_cgroup_destroy, 7865 .css_free = cpu_cgroup_css_free,
7866 .can_attach = cpu_cgroup_can_attach, 7866 .can_attach = cpu_cgroup_can_attach,
7867 .attach = cpu_cgroup_attach, 7867 .attach = cpu_cgroup_attach,
7868 .exit = cpu_cgroup_exit, 7868 .exit = cpu_cgroup_exit,
@@ -7885,7 +7885,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7885struct cpuacct root_cpuacct; 7885struct cpuacct root_cpuacct;
7886 7886
7887/* create a new cpu accounting group */ 7887/* create a new cpu accounting group */
7888static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7888static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7889{ 7889{
7890 struct cpuacct *ca; 7890 struct cpuacct *ca;
7891 7891
@@ -7915,7 +7915,7 @@ out:
7915} 7915}
7916 7916
7917/* destroy an existing cpu accounting group */ 7917/* destroy an existing cpu accounting group */
7918static void cpuacct_destroy(struct cgroup *cgrp) 7918static void cpuacct_css_free(struct cgroup *cgrp)
7919{ 7919{
7920 struct cpuacct *ca = cgroup_ca(cgrp); 7920 struct cpuacct *ca = cgroup_ca(cgrp);
7921 7921
@@ -8086,8 +8086,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8086 8086
8087struct cgroup_subsys cpuacct_subsys = { 8087struct cgroup_subsys cpuacct_subsys = {
8088 .name = "cpuacct", 8088 .name = "cpuacct",
8089 .create = cpuacct_create, 8089 .css_alloc = cpuacct_css_alloc,
8090 .destroy = cpuacct_destroy, 8090 .css_free = cpuacct_css_free,
8091 .subsys_id = cpuacct_subsys_id, 8091 .subsys_id = cpuacct_subsys_id,
8092 .base_cftypes = files, 8092 .base_cftypes = files,
8093}; 8093};
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..5ffb5626e072 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1908 preempt_disable();
1909 read_unlock(&tasklist_lock); 1909 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1910 preempt_enable_no_resched();
1911 schedule(); 1911 freezable_schedule();
1912 } else { 1912 } else {
1913 /* 1913 /*
1914 * By the time we got the lock, our tracer went away. 1914 * By the time we got the lock, our tracer went away.
@@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1930 }
1931 1931
1932 /* 1932 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1933 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1934 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1935 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr)
2092 } 2085 }
2093 2086
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2087 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2088 freezable_schedule();
2096 return true; 2089 return true;
2097 } else { 2090 } else {
2098 /* 2091 /*
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2193 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2194 return 0;
2202 2195
2203relock:
2204 /* 2196 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2197 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2198 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2199 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2200 */
2210 try_to_freeze(); 2201 try_to_freeze();
2211 2202
2203relock:
2212 spin_lock_irq(&sighand->siglock); 2204 spin_lock_irq(&sighand->siglock);
2213 /* 2205 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2206 * Every stopped thread goes here after wakeup. Check to see if