author     Linus Torvalds <torvalds@linux-foundation.org>  2012-12-12 11:18:24 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-12-12 11:18:24 -0500
commit     d206e09036d6201f90b2719484c8a59526c46125 (patch)
tree       84b9057919bcb8cfd1cff47baa5fc74457e77d6d /kernel
parent     fef3ff2eb777e76cfa5ae67591982d902c17139c (diff)
parent     15ef4ffaa797034d5ff82844daf8f595d7c6d53c (diff)
Merge branch 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup changes from Tejun Heo:
 "A lot of activities on cgroup side. The big changes are focused on
  making cgroup hierarchy handling saner.

  - cgroup_rmdir() had peculiar semantics - it allowed cgroup destruction
    to be vetoed by individual controllers and tried to drain refcnt
    synchronously. The vetoing never worked properly and caused a good
    deal of contortions in cgroup. memcg was the last remaining user.
    Michal Hocko removed the usage and the cgroup_rmdir() path has been
    simplified significantly. This was done in a separate branch so that
    the memcg people can base further memcg changes on top.

  - The above allowed cleaning up cgroup lifecycle management and
    implementation of generic cgroup iterators which are used to improve
    hierarchy support.

  - cgroup_freezer updated to allow migration in and out of a frozen
    cgroup and to handle hierarchy. If a cgroup is frozen, all descendant
    cgroups are frozen.

  - netcls_cgroup and netprio_cgroup updated to handle hierarchy properly.

  - Various fixes and cleanups.

  - Two merge commits. One to pull in memcg and rmdir cleanups (needed to
    build iterators). The other pulled in cgroup/for-3.7-fixes for
    device_cgroup fixes so that further device_cgroup patches can be
    stacked on top."

Fixed up a trivial conflict in mm/memcontrol.c as per Tejun (due to commit
bea8c150a7 ("memcg: fix hotplugged memory zone oops") in master touching
code close to commit 2ef37d3fe4 ("memcg: Simplify
mem_cgroup_force_empty_list error handling") in for-3.8)

* 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (65 commits)
  cgroup: update Documentation/cgroups/00-INDEX
  cgroup_rm_file: don't delete the uncreated files
  cgroup: remove subsystem files when remounting cgroup
  cgroup: use cgroup_addrm_files() in cgroup_clear_directory()
  cgroup: warn about broken hierarchies only after css_online
  cgroup: list_del_init() on removed events
  cgroup: fix lockdep warning for event_control
  cgroup: move list add after list head initilization
  netprio_cgroup: allow nesting and inherit config on cgroup creation
  netprio_cgroup: implement netprio[_set]_prio() helpers
  netprio_cgroup: use cgroup->id instead of cgroup_netprio_state->prioidx
  netprio_cgroup: reimplement priomap expansion
  netprio_cgroup: shorten variable names in extend_netdev_table()
  netprio_cgroup: simplify write_priomap()
  netcls_cgroup: move config inheritance to ->css_online() and remove .broken_hierarchy marking
  cgroup: remove obsolete guarantee from cgroup_task_migrate.
  cgroup: add cgroup->id
  cgroup, cpuset: remove cgroup_subsys->post_clone()
  cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/
  cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free()
  ...
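The headline rename in that series, ->create/post_create/pre_destroy/destroy() becoming ->css_alloc/online/offline/free(), changes the shape of every controller. Below is a rough, hedged sketch of what a controller looks like against the post-series callbacks; the "example" controller, struct example_state and example_subsys_id are invented for illustration, and only the callback names and signatures follow the kernel/cgroup.c diff further down.

```c
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

/*
 * Hypothetical controller skeleton.  Per-cgroup state embeds the
 * cgroup_subsys_state that the cgroup core tracks.
 */
struct example_state {
	struct cgroup_subsys_state css;
	/* controller-private per-cgroup state would live here */
};

static struct cgroup_subsys_state *example_css_alloc(struct cgroup *cgrp)
{
	struct example_state *es = kzalloc(sizeof(*es), GFP_KERNEL);

	if (!es)
		return ERR_PTR(-ENOMEM);
	return &es->css;		/* core keeps the embedded css */
}

static int example_css_online(struct cgroup *cgrp)
{
	/* @cgrp is set up and visible; inherit from cgrp->parent here */
	return 0;			/* non-zero aborts cgroup creation */
}

static void example_css_offline(struct cgroup *cgrp)
{
	/* removal is committed; stop accepting new users of @cgrp */
}

static void example_css_free(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = cgrp->subsys[example_subsys_id];

	kfree(container_of(css, struct example_state, css));
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.css_alloc	= example_css_alloc,
	.css_online	= example_css_online,
	.css_offline	= example_css_offline,
	.css_free	= example_css_free,
	.subsys_id	= example_subsys_id,	/* hypothetical id */
};
```

The split mirrors what the diff does internally: allocation and freeing stay unconditional, while online/offline bracket the window in which the cgroup is actually usable.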
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c           754
-rw-r--r--  kernel/cgroup_freezer.c   514
-rw-r--r--  kernel/cpuset.c            90
-rw-r--r--  kernel/events/core.c        8
-rw-r--r--  kernel/fork.c               9
-rw-r--r--  kernel/freezer.c           11
-rw-r--r--  kernel/power/process.c     13
-rw-r--r--  kernel/sched/core.c        16
-rw-r--r--  kernel/signal.c            20
9 files changed, 758 insertions, 677 deletions
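One recurring pattern in the kernel/cgroup.c diff below is the new per-hierarchy cgroup ID: each cgroupfs_root gains a struct ida, every cgroup takes an ID from it at creation (0 stays reserved for the root cgroup) and returns it at destruction. A condensed, hedged sketch of that pattern follows; my_root and my_cgroup are stand-in structures, not the kernel's types, and error paths are trimmed.

```c
#include <linux/idr.h>
#include <linux/gfp.h>

/* Stand-ins for cgroupfs_root and cgroup, trimmed to the ID machinery. */
struct my_root   { struct ida cgroup_ida; };
struct my_cgroup { int id; };

static void my_root_init(struct my_root *root)
{
	ida_init(&root->cgroup_ida);		/* once per hierarchy */
}

static int my_cgroup_assign_id(struct my_root *root, struct my_cgroup *cgrp)
{
	/* 0 is reserved for the root cgroup, so allocate starting at 1 */
	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
	return cgrp->id < 0 ? cgrp->id : 0;
}

static void my_cgroup_release_id(struct my_root *root, struct my_cgroup *cgrp)
{
	ida_simple_remove(&root->cgroup_ida, cgrp->id);
}

static void my_root_destroy(struct my_root *root)
{
	ida_destroy(&root->cgroup_ida);		/* after all IDs are back */
}
```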
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..f34c41bfaa37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
138 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
139 unsigned long flags; 139 unsigned long flags;
140 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
141 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
142 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
143 146
@@ -171,8 +174,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
176 */ 179 */
177 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
178 /* 181 /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
242 */ 245 */
243static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
244 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
245#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
246int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
247{ 254{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
294 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295} 302}
296 303
297static int clone_children(const struct cgroup *cgrp)
298{
299 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300}
301
302/* 304/*
303 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
304 * an active hierarchy 306 * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
782 * The task_lock() exception 784 * The task_lock() exception
783 * 785 *
784 * The need for this exception arises from the action of 786 * The need for this exception arises from the action of
785 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786 * another. It does so using cgroup_mutex, however there are 788 * another. It does so using cgroup_mutex, however there are
787 * several performance critical places that need to reference 789 * several performance critical places that need to reference
788 * task->cgroup without the expense of grabbing a system global 790 * task->cgroup without the expense of grabbing a system global
789 * mutex. Therefore except as noted below, when dereferencing or, as 791 * mutex. Therefore except as noted below, when dereferencing or, as
790 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792 * the task_struct routinely used for such matters. 794 * the task_struct routinely used for such matters.
793 * 795 *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 856 return inode;
855} 857}
856 858
857/*
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{
863 struct cgroup_subsys *ss;
864 int ret = 0;
865
866 for_each_subsys(cgrp->root, ss) {
867 if (!ss->pre_destroy)
868 continue;
869
870 ret = ss->pre_destroy(cgrp);
871 if (ret) {
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877
878 return ret;
879}
880
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 859static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882{ 860{
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 861 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
898 * Release the subsystem state objects. 876 * Release the subsystem state objects.
899 */ 877 */
900 for_each_subsys(cgrp->root, ss) 878 for_each_subsys(cgrp->root, ss)
901 ss->destroy(cgrp); 879 ss->css_free(cgrp);
902 880
903 cgrp->root->number_of_cgroups--; 881 cgrp->root->number_of_cgroups--;
904 mutex_unlock(&cgroup_mutex); 882 mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
917 895
918 simple_xattrs_free(&cgrp->xattrs); 896 simple_xattrs_free(&cgrp->xattrs);
919 897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
920 kfree_rcu(cgrp, rcu_head); 899 kfree_rcu(cgrp, rcu_head);
921 } else { 900 } else {
922 struct cfent *cfe = __d_cfe(dentry); 901 struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
987 if (!test_bit(ss->subsys_id, &subsys_mask)) 966 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue; 967 continue;
989 list_for_each_entry(set, &ss->cftsets, node) 968 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts); 969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
991 } 970 }
992 if (base_files) { 971 if (base_files) {
993 while (!list_empty(&cgrp->files)) 972 while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 994}
1016 995
1017/* 996/*
1018 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to goes down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 997 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 998 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 999 * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1150 seq_puts(seq, ",xattr"); 1102 seq_puts(seq, ",xattr");
1151 if (strlen(root->release_agent_path)) 1103 if (strlen(root->release_agent_path))
1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153 if (clone_children(&root->top_cgroup)) 1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1154 seq_puts(seq, ",clone_children"); 1106 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1107 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1108 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
1162 unsigned long subsys_mask; 1114 unsigned long subsys_mask;
1163 unsigned long flags; 1115 unsigned long flags;
1164 char *release_agent; 1116 char *release_agent;
1165 bool clone_children; 1117 bool cpuset_clone_children;
1166 char *name; 1118 char *name;
1167 /* User explicitly requested empty subsystem */ 1119 /* User explicitly requested empty subsystem */
1168 bool none; 1120 bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1213 continue; 1165 continue;
1214 } 1166 }
1215 if (!strcmp(token, "clone_children")) { 1167 if (!strcmp(token, "clone_children")) {
1216 opts->clone_children = true; 1168 opts->cpuset_clone_children = true;
1217 continue; 1169 continue;
1218 } 1170 }
1219 if (!strcmp(token, "xattr")) { 1171 if (!strcmp(token, "xattr")) {
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1397 goto out_unlock; 1349 goto out_unlock;
1398 } 1350 }
1399 1351
1352 /*
1353 * Clear out the files of subsystems that should be removed, do
1354 * this before rebind_subsystems, since rebind_subsystems may
1355 * change this hierarchy's subsys_list.
1356 */
1357 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1358
1400 ret = rebind_subsystems(root, opts.subsys_mask); 1359 ret = rebind_subsystems(root, opts.subsys_mask);
1401 if (ret) { 1360 if (ret) {
1361 /* rebind_subsystems failed, re-populate the removed files */
1362 cgroup_populate_dir(cgrp, false, removed_mask);
1402 drop_parsed_module_refcounts(opts.subsys_mask); 1363 drop_parsed_module_refcounts(opts.subsys_mask);
1403 goto out_unlock; 1364 goto out_unlock;
1404 } 1365 }
1405 1366
1406 /* clear out any existing files and repopulate subsystem files */
1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408 /* re-populate subsystem files */ 1367 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask); 1368 cgroup_populate_dir(cgrp, false, added_mask);
1410 1369
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1432 INIT_LIST_HEAD(&cgrp->children); 1391 INIT_LIST_HEAD(&cgrp->children);
1433 INIT_LIST_HEAD(&cgrp->files); 1392 INIT_LIST_HEAD(&cgrp->files);
1434 INIT_LIST_HEAD(&cgrp->css_sets); 1393 INIT_LIST_HEAD(&cgrp->css_sets);
1394 INIT_LIST_HEAD(&cgrp->allcg_node);
1435 INIT_LIST_HEAD(&cgrp->release_list); 1395 INIT_LIST_HEAD(&cgrp->release_list);
1436 INIT_LIST_HEAD(&cgrp->pidlists); 1396 INIT_LIST_HEAD(&cgrp->pidlists);
1437 mutex_init(&cgrp->pidlist_mutex); 1397 mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1450 root->number_of_cgroups = 1; 1410 root->number_of_cgroups = 1;
1451 cgrp->root = root; 1411 cgrp->root = root;
1452 cgrp->top_cgroup = cgrp; 1412 cgrp->top_cgroup = cgrp;
1453 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454 init_cgroup_housekeeping(cgrp); 1413 init_cgroup_housekeeping(cgrp);
1414 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1455} 1415}
1456 1416
1457static bool init_root_id(struct cgroupfs_root *root) 1417static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1518 1478
1519 root->subsys_mask = opts->subsys_mask; 1479 root->subsys_mask = opts->subsys_mask;
1520 root->flags = opts->flags; 1480 root->flags = opts->flags;
1481 ida_init(&root->cgroup_ida);
1521 if (opts->release_agent) 1482 if (opts->release_agent)
1522 strcpy(root->release_agent_path, opts->release_agent); 1483 strcpy(root->release_agent_path, opts->release_agent);
1523 if (opts->name) 1484 if (opts->name)
1524 strcpy(root->name, opts->name); 1485 strcpy(root->name, opts->name);
1525 if (opts->clone_children) 1486 if (opts->cpuset_clone_children)
1526 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1487 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1527 return root; 1488 return root;
1528} 1489}
1529 1490
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1536 spin_lock(&hierarchy_id_lock); 1497 spin_lock(&hierarchy_id_lock);
1537 ida_remove(&hierarchy_ida, root->hierarchy_id); 1498 ida_remove(&hierarchy_ida, root->hierarchy_id);
1538 spin_unlock(&hierarchy_id_lock); 1499 spin_unlock(&hierarchy_id_lock);
1500 ida_destroy(&root->cgroup_ida);
1539 kfree(root); 1501 kfree(root);
1540} 1502}
1541 1503
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1701 1663
1702 free_cg_links(&tmp_cg_links); 1664 free_cg_links(&tmp_cg_links);
1703 1665
1704 BUG_ON(!list_empty(&root_cgrp->sibling));
1705 BUG_ON(!list_empty(&root_cgrp->children)); 1666 BUG_ON(!list_empty(&root_cgrp->children));
1706 BUG_ON(root->number_of_cgroups != 1); 1667 BUG_ON(root->number_of_cgroups != 1);
1707 1668
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1750 1711
1751 BUG_ON(root->number_of_cgroups != 1); 1712 BUG_ON(root->number_of_cgroups != 1);
1752 BUG_ON(!list_empty(&cgrp->children)); 1713 BUG_ON(!list_empty(&cgrp->children));
1753 BUG_ON(!list_empty(&cgrp->sibling));
1754 1714
1755 mutex_lock(&cgroup_mutex); 1715 mutex_lock(&cgroup_mutex);
1756 mutex_lock(&cgroup_root_mutex); 1716 mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
1808 */ 1768 */
1809int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1769int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810{ 1770{
1771 struct dentry *dentry = cgrp->dentry;
1811 char *start; 1772 char *start;
1812 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1773
1813 cgroup_lock_is_held()); 1774 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1775 "cgroup_path() called without proper locking");
1814 1776
1815 if (!dentry || cgrp == dummytop) { 1777 if (!dentry || cgrp == dummytop) {
1816 /* 1778 /*
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1821 return 0; 1783 return 0;
1822 } 1784 }
1823 1785
1824 start = buf + buflen; 1786 start = buf + buflen - 1;
1825 1787
1826 *--start = '\0'; 1788 *start = '\0';
1827 for (;;) { 1789 for (;;) {
1828 int len = dentry->d_name.len; 1790 int len = dentry->d_name.len;
1829 1791
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1834 if (!cgrp) 1796 if (!cgrp)
1835 break; 1797 break;
1836 1798
1837 dentry = rcu_dereference_check(cgrp->dentry, 1799 dentry = cgrp->dentry;
1838 cgroup_lock_is_held());
1839 if (!cgrp->parent) 1800 if (!cgrp->parent)
1840 continue; 1801 continue;
1841 if (--start < buf) 1802 if (--start < buf)
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1930/* 1891/*
1931 * cgroup_task_migrate - move a task from one cgroup to another. 1892 * cgroup_task_migrate - move a task from one cgroup to another.
1932 * 1893 *
1933 * 'guarantee' is set if the caller promises that a new css_set for the task 1894 * Must be called with cgroup_mutex and threadgroup locked.
1934 * will already exist. If not set, this function might sleep, and can fail with
1935 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1936 */ 1895 */
1937static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1896static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938 struct task_struct *tsk, struct css_set *newcg) 1897 struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2025 } 1984 }
2026 1985
2027 synchronize_rcu(); 1986 synchronize_rcu();
2028
2029 /*
2030 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031 * is no longer empty.
2032 */
2033 cgroup_wakeup_rmdir_waiter(cgrp);
2034out: 1987out:
2035 if (retval) { 1988 if (retval) {
2036 for_each_subsys(root, ss) { 1989 for_each_subsys(root, ss) {
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 * step 5: success! and cleanup 2153 * step 5: success! and cleanup
2201 */ 2154 */
2202 synchronize_rcu(); 2155 synchronize_rcu();
2203 cgroup_wakeup_rmdir_waiter(cgrp);
2204 retval = 0; 2156 retval = 0;
2205out_put_css_set_refs: 2157out_put_css_set_refs:
2206 if (retval) { 2158 if (retval) {
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2711 2663
2712 /* start off with i_nlink == 2 (for "." entry) */ 2664 /* start off with i_nlink == 2 (for "." entry) */
2713 inc_nlink(inode); 2665 inc_nlink(inode);
2666 inc_nlink(dentry->d_parent->d_inode);
2714 2667
2715 /* start with the directory inode held, so that we can 2668 /*
2716 * populate it without racing with another mkdir */ 2669 * Control reaches here with cgroup_mutex held.
2717 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2670 * @inode->i_mutex should nest outside cgroup_mutex but we
2671 * want to populate it immediately without releasing
2672 * cgroup_mutex. As @inode isn't visible to anyone else
2673 * yet, trylock will always succeed without affecting
2674 * lockdep checks.
2675 */
2676 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2718 } else if (S_ISREG(mode)) { 2677 } else if (S_ISREG(mode)) {
2719 inode->i_size = 0; 2678 inode->i_size = 0;
2720 inode->i_fop = &cgroup_file_operations; 2679 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2725 return 0; 2684 return 0;
2726} 2685}
2727 2686
2728/*
2729 * cgroup_create_dir - create a directory for an object.
2730 * @cgrp: the cgroup we create the directory for. It must have a valid
2731 * ->parent field. And we are going to fill its ->dentry field.
2732 * @dentry: dentry of the new cgroup
2733 * @mode: mode to set on new directory.
2734 */
2735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736 umode_t mode)
2737{
2738 struct dentry *parent;
2739 int error = 0;
2740
2741 parent = cgrp->parent->dentry;
2742 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743 if (!error) {
2744 dentry->d_fsdata = cgrp;
2745 inc_nlink(parent->d_inode);
2746 rcu_assign_pointer(cgrp->dentry, dentry);
2747 dget(dentry);
2748 }
2749 dput(dentry);
2750
2751 return error;
2752}
2753
2754/** 2687/**
2755 * cgroup_file_mode - deduce file mode of a control file 2688 * cgroup_file_mode - deduce file mode of a control file
2756 * @cft: the control file in question 2689 * @cft: the control file in question
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 2724
2792 simple_xattrs_init(&cft->xattrs); 2725 simple_xattrs_init(&cft->xattrs);
2793 2726
2794 /* does @cft->flags tell us to skip creation on @cgrp? */
2795 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796 return 0;
2797 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798 return 0;
2799
2800 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2727 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801 strcpy(name, subsys->name); 2728 strcpy(name, subsys->name);
2802 strcat(name, "."); 2729 strcat(name, ".");
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2837 int err, ret = 0; 2764 int err, ret = 0;
2838 2765
2839 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2766 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2767 /* does cft->flags tell us to skip this file on @cgrp? */
2768 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2769 continue;
2770 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2771 continue;
2772
2840 if (is_add) 2773 if (is_add)
2841 err = cgroup_add_file(cgrp, subsys, cft); 2774 err = cgroup_add_file(cgrp, subsys, cft);
2842 else 2775 else
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
3044 write_unlock(&css_set_lock); 2977 write_unlock(&css_set_lock);
3045} 2978}
3046 2979
2980/**
2981 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2982 * @pos: the current position (%NULL to initiate traversal)
2983 * @cgroup: cgroup whose descendants to walk
2984 *
2985 * To be used by cgroup_for_each_descendant_pre(). Find the next
2986 * descendant to visit for pre-order traversal of @cgroup's descendants.
2987 */
2988struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2989 struct cgroup *cgroup)
2990{
2991 struct cgroup *next;
2992
2993 WARN_ON_ONCE(!rcu_read_lock_held());
2994
2995 /* if first iteration, pretend we just visited @cgroup */
2996 if (!pos) {
2997 if (list_empty(&cgroup->children))
2998 return NULL;
2999 pos = cgroup;
3000 }
3001
3002 /* visit the first child if exists */
3003 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3004 if (next)
3005 return next;
3006
3007 /* no child, visit my or the closest ancestor's next sibling */
3008 do {
3009 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3010 sibling);
3011 if (&next->sibling != &pos->parent->children)
3012 return next;
3013
3014 pos = pos->parent;
3015 } while (pos != cgroup);
3016
3017 return NULL;
3018}
3019EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3020
3021static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3022{
3023 struct cgroup *last;
3024
3025 do {
3026 last = pos;
3027 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3028 sibling);
3029 } while (pos);
3030
3031 return last;
3032}
3033
3034/**
3035 * cgroup_next_descendant_post - find the next descendant for post-order walk
3036 * @pos: the current position (%NULL to initiate traversal)
3037 * @cgroup: cgroup whose descendants to walk
3038 *
3039 * To be used by cgroup_for_each_descendant_post(). Find the next
3040 * descendant to visit for post-order traversal of @cgroup's descendants.
3041 */
3042struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3043 struct cgroup *cgroup)
3044{
3045 struct cgroup *next;
3046
3047 WARN_ON_ONCE(!rcu_read_lock_held());
3048
3049 /* if first iteration, visit the leftmost descendant */
3050 if (!pos) {
3051 next = cgroup_leftmost_descendant(cgroup);
3052 return next != cgroup ? next : NULL;
3053 }
3054
3055 /* if there's an unvisited sibling, visit its leftmost descendant */
3056 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3057 if (&next->sibling != &pos->parent->children)
3058 return cgroup_leftmost_descendant(next);
3059
3060 /* no sibling left, visit parent */
3061 next = pos->parent;
3062 return next != cgroup ? next : NULL;
3063}
3064EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3065
3047void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3066void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048 __acquires(css_set_lock) 3067 __acquires(css_set_lock)
3049{ 3068{
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3757 if (flags & POLLHUP) { 3776 if (flags & POLLHUP) {
3758 __remove_wait_queue(event->wqh, &event->wait); 3777 __remove_wait_queue(event->wqh, &event->wait);
3759 spin_lock(&cgrp->event_list_lock); 3778 spin_lock(&cgrp->event_list_lock);
3760 list_del(&event->list); 3779 list_del_init(&event->list);
3761 spin_unlock(&cgrp->event_list_lock); 3780 spin_unlock(&cgrp->event_list_lock);
3762 /* 3781 /*
3763 * We are in atomic context, but cgroup_event_remove() may 3782 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3913,7 @@ fail:
3894static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3913static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895 struct cftype *cft) 3914 struct cftype *cft)
3896{ 3915{
3897 return clone_children(cgrp); 3916 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3898} 3917}
3899 3918
3900static int cgroup_clone_children_write(struct cgroup *cgrp, 3919static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3902 u64 val) 3921 u64 val)
3903{ 3922{
3904 if (val) 3923 if (val)
3905 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3924 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3906 else 3925 else
3907 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3926 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3908 return 0; 3927 return 0;
3909} 3928}
3910 3929
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4017 css->flags = 0; 4036 css->flags = 0;
4018 css->id = NULL; 4037 css->id = NULL;
4019 if (cgrp == dummytop) 4038 if (cgrp == dummytop)
4020 set_bit(CSS_ROOT, &css->flags); 4039 css->flags |= CSS_ROOT;
4021 BUG_ON(cgrp->subsys[ss->subsys_id]); 4040 BUG_ON(cgrp->subsys[ss->subsys_id]);
4022 cgrp->subsys[ss->subsys_id] = css; 4041 cgrp->subsys[ss->subsys_id] = css;
4023 4042
4024 /* 4043 /*
4025 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4044 * css holds an extra ref to @cgrp->dentry which is put on the last
4026 * which is put on the last css_put(). dput() requires process 4045 * css_put(). dput() requires process context, which css_put() may
4027 * context, which css_put() may be called without. @css->dput_work 4046 * be called without. @css->dput_work will be used to invoke
4028 * will be used to invoke dput() asynchronously from css_put(). 4047 * dput() asynchronously from css_put().
4029 */ 4048 */
4030 INIT_WORK(&css->dput_work, css_dput_fn); 4049 INIT_WORK(&css->dput_work, css_dput_fn);
4031 if (ss->__DEPRECATED_clear_css_refs) 4050}
4032 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4051
4052/* invoke ->post_create() on a new CSS and mark it online if successful */
4053static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4054{
4055 int ret = 0;
4056
4057 lockdep_assert_held(&cgroup_mutex);
4058
4059 if (ss->css_online)
4060 ret = ss->css_online(cgrp);
4061 if (!ret)
4062 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4063 return ret;
4064}
4065
4066/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
4067static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4068 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4069{
4070 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4071
4072 lockdep_assert_held(&cgroup_mutex);
4073
4074 if (!(css->flags & CSS_ONLINE))
4075 return;
4076
4077 /*
4078 * css_offline() should be called with cgroup_mutex unlocked. See
4079 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4080 * details. This temporary unlocking should go away once
4081 * cgroup_mutex is unexported from controllers.
4082 */
4083 if (ss->css_offline) {
4084 mutex_unlock(&cgroup_mutex);
4085 ss->css_offline(cgrp);
4086 mutex_lock(&cgroup_mutex);
4087 }
4088
4089 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4033} 4090}
4034 4091
4035/* 4092/*
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4049 struct cgroup_subsys *ss; 4106 struct cgroup_subsys *ss;
4050 struct super_block *sb = root->sb; 4107 struct super_block *sb = root->sb;
4051 4108
4109 /* allocate the cgroup and its ID, 0 is reserved for the root */
4052 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4110 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053 if (!cgrp) 4111 if (!cgrp)
4054 return -ENOMEM; 4112 return -ENOMEM;
4055 4113
4114 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4115 if (cgrp->id < 0)
4116 goto err_free_cgrp;
4117
4118 /*
4119 * Only live parents can have children. Note that the liveliness
4120 * check isn't strictly necessary because cgroup_mkdir() and
4121 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4122 * anyway so that locking is contained inside cgroup proper and we
4123 * don't get nasty surprises if we ever grow another caller.
4124 */
4125 if (!cgroup_lock_live_group(parent)) {
4126 err = -ENODEV;
4127 goto err_free_id;
4128 }
4129
4056 /* Grab a reference on the superblock so the hierarchy doesn't 4130 /* Grab a reference on the superblock so the hierarchy doesn't
4057 * get deleted on unmount if there are child cgroups. This 4131 * get deleted on unmount if there are child cgroups. This
4058 * can be done outside cgroup_mutex, since the sb can't 4132 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4060 * fs */ 4134 * fs */
4061 atomic_inc(&sb->s_active); 4135 atomic_inc(&sb->s_active);
4062 4136
4063 mutex_lock(&cgroup_mutex);
4064
4065 init_cgroup_housekeeping(cgrp); 4137 init_cgroup_housekeeping(cgrp);
4066 4138
4067 cgrp->parent = parent; 4139 cgrp->parent = parent;
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4071 if (notify_on_release(parent)) 4143 if (notify_on_release(parent))
4072 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4144 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 4145
4074 if (clone_children(parent)) 4146 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4075 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4147 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4076 4148
4077 for_each_subsys(root, ss) { 4149 for_each_subsys(root, ss) {
4078 struct cgroup_subsys_state *css; 4150 struct cgroup_subsys_state *css;
4079 4151
4080 css = ss->create(cgrp); 4152 css = ss->css_alloc(cgrp);
4081 if (IS_ERR(css)) { 4153 if (IS_ERR(css)) {
4082 err = PTR_ERR(css); 4154 err = PTR_ERR(css);
4083 goto err_destroy; 4155 goto err_free_all;
4084 } 4156 }
4085 init_cgroup_css(css, ss, cgrp); 4157 init_cgroup_css(css, ss, cgrp);
4086 if (ss->use_id) { 4158 if (ss->use_id) {
4087 err = alloc_css_id(ss, parent, cgrp); 4159 err = alloc_css_id(ss, parent, cgrp);
4088 if (err) 4160 if (err)
4089 goto err_destroy; 4161 goto err_free_all;
4090 } 4162 }
4091 /* At error, ->destroy() callback has to free assigned ID. */ 4163 }
4092 if (clone_children(parent) && ss->post_clone) 4164
4093 ss->post_clone(cgrp); 4165 /*
4166 * Create directory. cgroup_create_file() returns with the new
4167 * directory locked on success so that it can be populated without
4168 * dropping cgroup_mutex.
4169 */
4170 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4171 if (err < 0)
4172 goto err_free_all;
4173 lockdep_assert_held(&dentry->d_inode->i_mutex);
4174
4175 /* allocation complete, commit to creation */
4176 dentry->d_fsdata = cgrp;
4177 cgrp->dentry = dentry;
4178 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4179 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4180 root->number_of_cgroups++;
4181
4182 /* each css holds a ref to the cgroup's dentry */
4183 for_each_subsys(root, ss)
4184 dget(dentry);
4185
4186 /* creation succeeded, notify subsystems */
4187 for_each_subsys(root, ss) {
4188 err = online_css(ss, cgrp);
4189 if (err)
4190 goto err_destroy;
4094 4191
4095 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4192 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096 parent->parent) { 4193 parent->parent) {
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4102 } 4199 }
4103 } 4200 }
4104 4201
4105 list_add(&cgrp->sibling, &cgrp->parent->children);
4106 root->number_of_cgroups++;
4107
4108 err = cgroup_create_dir(cgrp, dentry, mode);
4109 if (err < 0)
4110 goto err_remove;
4111
4112 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113 for_each_subsys(root, ss)
4114 if (!ss->__DEPRECATED_clear_css_refs)
4115 dget(dentry);
4116
4117 /* The cgroup directory was pre-locked for us */
4118 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119
4120 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121
4122 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4202 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123 /* If err < 0, we have a half-filled directory - oh well ;) */ 4203 if (err)
4204 goto err_destroy;
4124 4205
4125 mutex_unlock(&cgroup_mutex); 4206 mutex_unlock(&cgroup_mutex);
4126 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4207 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 4208
4128 return 0; 4209 return 0;
4129 4210
4130 err_remove: 4211err_free_all:
4131
4132 list_del(&cgrp->sibling);
4133 root->number_of_cgroups--;
4134
4135 err_destroy:
4136
4137 for_each_subsys(root, ss) { 4212 for_each_subsys(root, ss) {
4138 if (cgrp->subsys[ss->subsys_id]) 4213 if (cgrp->subsys[ss->subsys_id])
4139 ss->destroy(cgrp); 4214 ss->css_free(cgrp);
4140 } 4215 }
4141
4142 mutex_unlock(&cgroup_mutex); 4216 mutex_unlock(&cgroup_mutex);
4143
4144 /* Release the reference count that we took on the superblock */ 4217 /* Release the reference count that we took on the superblock */
4145 deactivate_super(sb); 4218 deactivate_super(sb);
4146 4219err_free_id:
4220 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4221err_free_cgrp:
4147 kfree(cgrp); 4222 kfree(cgrp);
4148 return err; 4223 return err;
4224
4225err_destroy:
4226 cgroup_destroy_locked(cgrp);
4227 mutex_unlock(&cgroup_mutex);
4228 mutex_unlock(&dentry->d_inode->i_mutex);
4229 return err;
4149} 4230}
4150 4231
4151static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4232static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4197 return 0; 4278 return 0;
4198} 4279}
4199 4280
4200/* 4281static int cgroup_destroy_locked(struct cgroup *cgrp)
4201 * Atomically mark all (or else none) of the cgroup's CSS objects as 4282 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4202 * CSS_REMOVED. Return true on success, or false if the cgroup has
4203 * busy subsystems. Call with cgroup_mutex held
4204 *
4205 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206 * not, cgroup removal behaves differently.
4207 *
4208 * If clear is set, css refcnt for the subsystem should be zero before
4209 * cgroup removal can be committed. This is implemented by
4210 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211 * called multiple times until all css refcnts reach zero and is allowed to
4212 * veto removal on any invocation. This behavior is deprecated and will be
4213 * removed as soon as the existing user (memcg) is updated.
4214 *
4215 * If clear is not set, each css holds an extra reference to the cgroup's
4216 * dentry and cgroup removal proceeds regardless of css refs.
4217 * ->pre_destroy() will be called at least once and is not allowed to fail.
4218 * On the last put of each css, whenever that may be, the extra dentry ref
4219 * is put so that dentry destruction happens only after all css's are
4220 * released.
4221 */
4222static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223{ 4283{
4284 struct dentry *d = cgrp->dentry;
4285 struct cgroup *parent = cgrp->parent;
4286 DEFINE_WAIT(wait);
4287 struct cgroup_event *event, *tmp;
4224 struct cgroup_subsys *ss; 4288 struct cgroup_subsys *ss;
4225 unsigned long flags; 4289 LIST_HEAD(tmp_list);
4226 bool failed = false; 4290
4291 lockdep_assert_held(&d->d_inode->i_mutex);
4292 lockdep_assert_held(&cgroup_mutex);
4227 4293
4228 local_irq_save(flags); 4294 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4295 return -EBUSY;
4229 4296
4230 /* 4297 /*
4231 * Block new css_tryget() by deactivating refcnt. If all refcnts 4298 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4232 * for subsystems w/ clear_css_refs set were 1 at the moment of 4299 * removed. This makes future css_tryget() and child creation
4233 * deactivation, we succeeded. 4300 * attempts fail thus maintaining the removal conditions verified
4301 * above.
4234 */ 4302 */
4235 for_each_subsys(cgrp->root, ss) { 4303 for_each_subsys(cgrp->root, ss) {
4236 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4304 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 4305
4238 WARN_ON(atomic_read(&css->refcnt) < 0); 4306 WARN_ON(atomic_read(&css->refcnt) < 0);
4239 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4307 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240
4241 if (ss->__DEPRECATED_clear_css_refs)
4242 failed |= css_refcnt(css) != 1;
4243 }
4244
4245 /*
4246 * If succeeded, set REMOVED and put all the base refs; otherwise,
4247 * restore refcnts to positive values. Either way, all in-progress
4248 * css_tryget() will be released.
4249 */
4250 for_each_subsys(cgrp->root, ss) {
4251 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252
4253 if (!failed) {
4254 set_bit(CSS_REMOVED, &css->flags);
4255 css_put(css);
4256 } else {
4257 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258 }
4259 } 4308 }
4309 set_bit(CGRP_REMOVED, &cgrp->flags);
4260 4310
4261 local_irq_restore(flags); 4311 /* tell subsystems to initate destruction */
4262 return !failed; 4312 for_each_subsys(cgrp->root, ss)
4263} 4313 offline_css(ss, cgrp);
4264
4265static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266{
4267 struct cgroup *cgrp = dentry->d_fsdata;
4268 struct dentry *d;
4269 struct cgroup *parent;
4270 DEFINE_WAIT(wait);
4271 struct cgroup_event *event, *tmp;
4272 int ret;
4273
4274 /* the vfs holds both inode->i_mutex already */
4275again:
4276 mutex_lock(&cgroup_mutex);
4277 if (atomic_read(&cgrp->count) != 0) {
4278 mutex_unlock(&cgroup_mutex);
4279 return -EBUSY;
4280 }
4281 if (!list_empty(&cgrp->children)) {
4282 mutex_unlock(&cgroup_mutex);
4283 return -EBUSY;
4284 }
4285 mutex_unlock(&cgroup_mutex);
4286
4287 /*
4288 * In general, subsystem has no css->refcnt after pre_destroy(). But
4289 * in racy cases, subsystem may have to get css->refcnt after
4290 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4291 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4292 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4293 * and subsystem's reference count handling. Please see css_get/put
4294 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4295 */
4296 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 4314
4298 /* 4315 /*
4299 * Call pre_destroy handlers of subsys. Notify subsystems 4316 * Put all the base refs. Each css holds an extra reference to the
4300 * that rmdir() request comes. 4317 * cgroup's dentry and cgroup removal proceeds regardless of css
4318 * refs. On the last put of each css, whenever that may be, the
4319 * extra dentry ref is put so that dentry destruction happens only
4320 * after all css's are released.
4301 */ 4321 */
4302 ret = cgroup_call_pre_destroy(cgrp); 4322 for_each_subsys(cgrp->root, ss)
4303 if (ret) { 4323 css_put(cgrp->subsys[ss->subsys_id]);
4304 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305 return ret;
4306 }
4307
4308 mutex_lock(&cgroup_mutex);
4309 parent = cgrp->parent;
4310 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312 mutex_unlock(&cgroup_mutex);
4313 return -EBUSY;
4314 }
4315 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316 if (!cgroup_clear_css_refs(cgrp)) {
4317 mutex_unlock(&cgroup_mutex);
4318 /*
4319 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320 * prepare_to_wait(), we need to check this flag.
4321 */
4322 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323 schedule();
4324 finish_wait(&cgroup_rmdir_waitq, &wait);
4325 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326 if (signal_pending(current))
4327 return -EINTR;
4328 goto again;
4329 }
4330 /* NO css_tryget() can success after here. */
4331 finish_wait(&cgroup_rmdir_waitq, &wait);
4332 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 4324
4334 raw_spin_lock(&release_list_lock); 4325 raw_spin_lock(&release_list_lock);
4335 set_bit(CGRP_REMOVED, &cgrp->flags);
4336 if (!list_empty(&cgrp->release_list)) 4326 if (!list_empty(&cgrp->release_list))
4337 list_del_init(&cgrp->release_list); 4327 list_del_init(&cgrp->release_list);
4338 raw_spin_unlock(&release_list_lock); 4328 raw_spin_unlock(&release_list_lock);
4339 4329
4340 /* delete this cgroup from parent->children */ 4330 /* delete this cgroup from parent->children */
4341 list_del_init(&cgrp->sibling); 4331 list_del_rcu(&cgrp->sibling);
4342
4343 list_del_init(&cgrp->allcg_node); 4332 list_del_init(&cgrp->allcg_node);
4344 4333
4345 d = dget(cgrp->dentry); 4334 dget(d);
4346
4347 cgroup_d_remove_dir(d); 4335 cgroup_d_remove_dir(d);
4348 dput(d); 4336 dput(d);
4349 4337
@@ -4353,21 +4341,35 @@ again:
4353 /* 4341 /*
4354 * Unregister events and notify userspace. 4342 * Unregister events and notify userspace.
4355 * Notify userspace about cgroup removing only after rmdir of cgroup 4343 * Notify userspace about cgroup removing only after rmdir of cgroup
4356 * directory to avoid race between userspace and kernelspace 4344 * directory to avoid race between userspace and kernelspace. Use
4345 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4346 * cgroup_event_wake() is called with the wait queue head locked,
4347 * remove_wait_queue() cannot be called while holding event_list_lock.
4357 */ 4348 */
4358 spin_lock(&cgrp->event_list_lock); 4349 spin_lock(&cgrp->event_list_lock);
4359 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4350 list_splice_init(&cgrp->event_list, &tmp_list);
4360 list_del(&event->list); 4351 spin_unlock(&cgrp->event_list_lock);
4352 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4353 list_del_init(&event->list);
4361 remove_wait_queue(event->wqh, &event->wait); 4354 remove_wait_queue(event->wqh, &event->wait);
4362 eventfd_signal(event->eventfd, 1); 4355 eventfd_signal(event->eventfd, 1);
4363 schedule_work(&event->remove); 4356 schedule_work(&event->remove);
4364 } 4357 }
4365 spin_unlock(&cgrp->event_list_lock);
4366 4358
4367 mutex_unlock(&cgroup_mutex);
4368 return 0; 4359 return 0;
4369} 4360}
4370 4361
4362static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4363{
4364 int ret;
4365
4366 mutex_lock(&cgroup_mutex);
4367 ret = cgroup_destroy_locked(dentry->d_fsdata);
4368 mutex_unlock(&cgroup_mutex);
4369
4370 return ret;
4371}
4372
4371static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4373static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372{ 4374{
4373 INIT_LIST_HEAD(&ss->cftsets); 4375 INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4388 4390
4389 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4391 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 4392
4393 mutex_lock(&cgroup_mutex);
4394
4391 /* init base cftset */ 4395 /* init base cftset */
4392 cgroup_init_cftsets(ss); 4396 cgroup_init_cftsets(ss);
4393 4397
4394 /* Create the top cgroup state for this subsystem */ 4398 /* Create the top cgroup state for this subsystem */
4395 list_add(&ss->sibling, &rootnode.subsys_list); 4399 list_add(&ss->sibling, &rootnode.subsys_list);
4396 ss->root = &rootnode; 4400 ss->root = &rootnode;
4397 css = ss->create(dummytop); 4401 css = ss->css_alloc(dummytop);
4398 /* We don't handle early failures gracefully */ 4402 /* We don't handle early failures gracefully */
4399 BUG_ON(IS_ERR(css)); 4403 BUG_ON(IS_ERR(css));
4400 init_cgroup_css(css, ss, dummytop); 4404 init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4403 * pointer to this state - since the subsystem is 4407 * pointer to this state - since the subsystem is
4404 * newly registered, all tasks and hence the 4408 * newly registered, all tasks and hence the
4405 * init_css_set is in the subsystem's top cgroup. */ 4409 * init_css_set is in the subsystem's top cgroup. */
4406 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4410 init_css_set.subsys[ss->subsys_id] = css;
4407 4411
4408 need_forkexit_callback |= ss->fork || ss->exit; 4412 need_forkexit_callback |= ss->fork || ss->exit;
4409 4413
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4413 BUG_ON(!list_empty(&init_task.tasks)); 4417 BUG_ON(!list_empty(&init_task.tasks));
4414 4418
4415 ss->active = 1; 4419 ss->active = 1;
4420 BUG_ON(online_css(ss, dummytop));
4421
4422 mutex_unlock(&cgroup_mutex);
4416 4423
4417 /* this function shouldn't be used with modular subsystems, since they 4424 /* this function shouldn't be used with modular subsystems, since they
4418 * need to register a subsys_id, among other things */ 4425 * need to register a subsys_id, among other things */
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4430 */ 4437 */
4431int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4438int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432{ 4439{
4433 int i;
4434 struct cgroup_subsys_state *css; 4440 struct cgroup_subsys_state *css;
4441 int i, ret;
4435 4442
4436 /* check name and function validity */ 4443 /* check name and function validity */
4437 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4444 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438 ss->create == NULL || ss->destroy == NULL) 4445 ss->css_alloc == NULL || ss->css_free == NULL)
4439 return -EINVAL; 4446 return -EINVAL;
4440 4447
4441 /* 4448 /*
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4464 subsys[ss->subsys_id] = ss; 4471 subsys[ss->subsys_id] = ss;
4465 4472
4466 /* 4473 /*
4467 * no ss->create seems to need anything important in the ss struct, so 4474 * no ss->css_alloc seems to need anything important in the ss
4468 * this can happen first (i.e. before the rootnode attachment). 4475 * struct, so this can happen first (i.e. before the rootnode
4476 * attachment).
4469 */ 4477 */
4470 css = ss->create(dummytop); 4478 css = ss->css_alloc(dummytop);
4471 if (IS_ERR(css)) { 4479 if (IS_ERR(css)) {
4472 /* failure case - need to deassign the subsys[] slot. */ 4480 /* failure case - need to deassign the subsys[] slot. */
4473 subsys[ss->subsys_id] = NULL; 4481 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4482 init_cgroup_css(css, ss, dummytop); 4490 init_cgroup_css(css, ss, dummytop);
4483 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4491 /* init_idr must be after init_cgroup_css because it sets css->id. */
4484 if (ss->use_id) { 4492 if (ss->use_id) {
4485 int ret = cgroup_init_idr(ss, css); 4493 ret = cgroup_init_idr(ss, css);
4486 if (ret) { 4494 if (ret)
4487 dummytop->subsys[ss->subsys_id] = NULL; 4495 goto err_unload;
4488 ss->destroy(dummytop);
4489 subsys[ss->subsys_id] = NULL;
4490 mutex_unlock(&cgroup_mutex);
4491 return ret;
4492 }
4493 } 4496 }
4494 4497
4495 /* 4498 /*
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4522 write_unlock(&css_set_lock); 4525 write_unlock(&css_set_lock);
4523 4526
4524 ss->active = 1; 4527 ss->active = 1;
4528 ret = online_css(ss, dummytop);
4529 if (ret)
4530 goto err_unload;
4525 4531
4526 /* success! */ 4532 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4533 mutex_unlock(&cgroup_mutex);
4528 return 0; 4534 return 0;
4535
4536err_unload:
4537 mutex_unlock(&cgroup_mutex);
4538 /* @ss can't be mounted here as try_module_get() would fail */
4539 cgroup_unload_subsys(ss);
4540 return ret;
4529} 4541}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4542EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4543
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4564 BUG_ON(ss->root != &rootnode);
4553 4565
4554 mutex_lock(&cgroup_mutex); 4566 mutex_lock(&cgroup_mutex);
4567
4568 offline_css(ss, dummytop);
4569 ss->active = 0;
4570
4571 if (ss->use_id) {
4572 idr_remove_all(&ss->idr);
4573 idr_destroy(&ss->idr);
4574 }
4575
4555 /* deassign the subsys_id */ 4576 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4577 subsys[ss->subsys_id] = NULL;
4557 4578
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 struct css_set *cg = link->cg; 4588 struct css_set *cg = link->cg;
4568 4589
4569 hlist_del(&cg->hlist); 4590 hlist_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4591 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4592 hhead = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4593 hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4575 write_unlock(&css_set_lock); 4595 write_unlock(&css_set_lock);
4576 4596
4577 /* 4597 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4598 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4599 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4600 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4601 * takes care of freeing the css_id.
4582 */ 4602 */
4583 ss->destroy(dummytop); 4603 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4604 dummytop->subsys[ss->subsys_id] = NULL;
4585 4605
4586 mutex_unlock(&cgroup_mutex); 4606 mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void)
4624 4644
4625 BUG_ON(!ss->name); 4645 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4646 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4647 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4648 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4649 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4650 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4651 ss->name, ss->subsys_id);
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4852}
4833 4853
4834/** 4854/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4855 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4856 * @child: the task in question
4865 * 4857 *
4866 * Adds the task to the list running through its css_set if necessary. 4858 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4859 * call the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4860 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4861 * cgroup_iter_start() - to guarantee that the new task ends up on its
4862 * list.
4870 */ 4863 */
4871void cgroup_post_fork(struct task_struct *child) 4864void cgroup_post_fork(struct task_struct *child)
4872{ 4865{
4866 int i;
4867
4873 /* 4868 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4869 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4870 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4884 task_unlock(child);
4890 write_unlock(&css_set_lock); 4885 write_unlock(&css_set_lock);
4891 } 4886 }
4887
4888 /*
4889 * Call ss->fork(). This must happen after @child is linked on
4890 * css_set; otherwise, @child might change state between ->fork()
4891 * and addition to css_set.
4892 */
4893 if (need_forkexit_callback) {
4894 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4895 struct cgroup_subsys *ss = subsys[i];
4896
4897 /*
4898 * fork/exit callbacks are supported only for
4899 * builtin subsystems and we don't need further
4900 * synchronization as they never go away.
4901 */
4902 if (!ss || ss->module)
4903 continue;
4904
4905 if (ss->fork)
4906 ss->fork(child);
4907 }
4908 }
4892} 4909}
4910
4893/** 4911/**
4894 * cgroup_exit - detach cgroup from exiting task 4912 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4913 * @tsk: pointer to task_struct of exiting process
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5040/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5041bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5042{
5025 do { 5043 while (true) {
5026 int v = css_refcnt(css); 5044 int t, v;
5027 5045
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5046 v = css_refcnt(css);
5047 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5048 if (likely(t == v))
5029 return true; 5049 return true;
5050 else if (t < 0)
5051 return false;
5030 cpu_relax(); 5052 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5053 }
5032
5033 return false;
5034} 5054}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5055EXPORT_SYMBOL_GPL(__css_tryget);
5036 5056
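The new __css_tryget() above is a standard compare-and-swap tryget loop: take a reference only if the count has not been biased negative to mark the css as being deactivated. A minimal user-space sketch of the same pattern, assuming a plain atomic counter and made-up names (struct obj, obj_tryget), only to illustrate the idiom:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcnt;		/* negative once teardown has started */
};

static bool obj_tryget(struct obj *o)
{
	while (true) {
		int v = atomic_load(&o->refcnt);

		if (v < 0)
			return false;	/* object is dying, refuse the reference */

		/* try to advance v -> v + 1; retry if another CPU raced with us */
		if (atomic_compare_exchange_weak(&o->refcnt, &v, v + 1))
			return true;
	}
}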
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5069 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5070 check_for_release(cgrp);
5051 } 5071 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5072 break;
5054 case 0: 5073 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5074 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5075 break;
5058 } 5076 }
5059 rcu_read_unlock(); 5077 rcu_read_unlock();
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5457}
5440 5458
5441#ifdef CONFIG_CGROUP_DEBUG 5459#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5460static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5461{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5462 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5463
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5467 return css;
5450} 5468}
5451 5469
5452static void debug_destroy(struct cgroup *cont) 5470static void debug_css_free(struct cgroup *cont)
5453{ 5471{
5454 kfree(cont->subsys[debug_subsys_id]); 5472 kfree(cont->subsys[debug_subsys_id]);
5455} 5473}
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = {
5578 5596
5579struct cgroup_subsys debug_subsys = { 5597struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5598 .name = "debug",
5581 .create = debug_create, 5599 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5600 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5601 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5602 .base_cftypes = debug_files,
5585}; 5603};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
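The flags above map onto the user-visible freezer.state value as follows: FROZEN dominates, any FREEZING bit (self or parent) reads as "FREEZING", and a zero state reads as "THAWED". A small stand-alone sketch of that mapping, with the flag values copied from the enum above; freezer_state_strs() later in this patch implements the read side in the kernel:

#include <stdio.h>

#define CGROUP_FREEZING_SELF	(1 << 1)
#define CGROUP_FREEZING_PARENT	(1 << 2)
#define CGROUP_FROZEN		(1 << 3)
#define CGROUP_FREEZING		(CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

static const char *state_str(unsigned int state)
{
	if (state & CGROUP_FROZEN)
		return "FROZEN";
	if (state & CGROUP_FREEZING)	/* freezing because of self or an ancestor */
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	/* freezing only because an ancestor is freezing */
	printf("%s\n", state_str(CGROUP_FREEZING_PARENT));			/* FREEZING */
	/* self-initiated freeze that has completed */
	printf("%s\n", state_str(CGROUP_FREEZING_SELF | CGROUP_FROZEN));	/* FROZEN */
	return 0;
}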
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_cnt
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
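Since the -EBUSY check in the old freezer_can_attach() is gone, user space can now move a task into a freezer cgroup in any state and freezer_attach() makes the task conform afterwards. A hedged user-space sketch of such a migration through the cgroup v1 interface; the mount point and use of the "tasks" file are assumptions about the local setup:

#include <stdio.h>
#include <sys/types.h>

/* e.g. cgrp_dir = "/sys/fs/cgroup/freezer/mygroup" (path is an assumption) */
static int move_task_to_freezer(const char *cgrp_dir, pid_t pid)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/tasks", cgrp_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* freezer_attach() will freeze or thaw the task to match the cgroup */
	fprintf(f, "%d\n", (int)pid);
	fclose(f);
	return 0;
}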
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
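A toy model of the same bottom-up rule: a node may gain FROZEN only if it is FREEZING, every online child is already FROZEN, and none of its own tasks are still thawed. The recursion and the nr_unfrozen_tasks counter are simplifications for illustration; in the kernel the walk is driven from freezer_read() via cgroup_for_each_descendant_post() and tasks are scanned with a cgroup iterator:

/* flag values copied from the enum earlier in this patch */
#define CGROUP_FREEZER_ONLINE	(1 << 0)
#define CGROUP_FREEZING_SELF	(1 << 1)
#define CGROUP_FREEZING_PARENT	(1 << 2)
#define CGROUP_FROZEN		(1 << 3)
#define CGROUP_FREEZING		(CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

struct toy_node {
	unsigned int state;
	int nr_unfrozen_tasks;		/* stand-in for the per-task scan */
	struct toy_node *children[8];
	int nr_children;
};

static void toy_update_if_frozen(struct toy_node *n)
{
	int i;

	/* decide the children first (post order) */
	for (i = 0; i < n->nr_children; i++)
		toy_update_if_frozen(n->children[i]);

	if (!(n->state & CGROUP_FREEZING) || (n->state & CGROUP_FROZEN))
		return;

	/* a single live, not-yet-frozen child keeps this node FREEZING */
	for (i = 0; i < n->nr_children; i++) {
		struct toy_node *c = n->children[i];

		if ((c->state & CGROUP_FREEZER_ONLINE) &&
		    !(c->state & CGROUP_FROZEN))
			return;
	}

	if (n->nr_unfrozen_tasks == 0)
		n->state |= CGROUP_FROZEN;
}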
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
260 * only partially frozen when we exitted write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @freezer according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_css_online().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
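The descendant update above is the top-down counterpart of update_if_frozen(): each cgroup inherits its parent's FREEZING state as CGROUP_FREEZING_PARENT. A toy sketch of that propagation on a made-up tree; in the kernel the walk is iterative via cgroup_for_each_descendant_pre() and each node is updated under its own freezer->lock:

/* flag values copied from the enum earlier in this patch */
#define CGROUP_FREEZING_SELF	(1 << 1)
#define CGROUP_FREEZING_PARENT	(1 << 2)
#define CGROUP_FREEZING		(CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

struct toy_freezer {
	unsigned int state;
	struct toy_freezer *children[8];
	int nr_children;
};

/* pre order: a node sees its parent's updated state before its own children do */
static void toy_propagate(struct toy_freezer *f, int parent_freezing)
{
	int i;

	if (parent_freezing)
		f->state |= CGROUP_FREEZING_PARENT;
	else
		f->state &= ~CGROUP_FREEZING_PARENT;

	for (i = 0; i < f->nr_children; i++)
		toy_propagate(f->children[i], f->state & CGROUP_FREEZING);
}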
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
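With these control files in place, the usual sequence from user space is to write "FROZEN" to freezer.state and then poll the same file until the lazy FREEZING to FROZEN transition completes; self_freezing and parent_freezing only report why a cgroup is freezing. A hedged usage sketch, where the v1 mount point is an assumption about the local setup:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* e.g. cgrp_dir = "/sys/fs/cgroup/freezer/mygroup" (path is an assumption) */
static int freeze_and_wait(const char *cgrp_dir)
{
	char path[256], state[32];
	FILE *f;

	snprintf(path, sizeof(path), "%s/freezer.state", cgrp_dir);

	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("FROZEN\n", f);
	fclose(f);

	/* FROZEN is reported lazily once update_if_frozen() sees everything frozen */
	do {
		usleep(10 * 1000);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (!fgets(state, sizeof(state), f))
			state[0] = '\0';
		fclose(f);
	} while (strncmp(state, "FROZEN", 6) != 0);

	return 0;
}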
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..b017887d632f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
1784}; 1784};
1785 1785
1786/* 1786/*
1787 * post_clone() is called during cgroup_create() when the 1787 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1788 * cont: control group that the new cpuset will be part of
1826 */ 1789 */
1827 1790
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1792{
1830 struct cpuset *cs; 1793 struct cgroup *parent_cg = cont->parent;
1831 struct cpuset *parent; 1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1832 1796
1833 if (!cont->parent) { 1797 if (!parent_cg)
1834 return &top_cpuset.css; 1798 return &top_cpuset.css;
1835 } 1799 parent = cgroup_cs(parent_cg);
1836 parent = cgroup_cs(cont->parent); 1800
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1802 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1803 return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1855 1819
1856 cs->parent = parent; 1820 cs->parent = parent;
1857 number_of_cpusets++; 1821 number_of_cpusets++;
1858 return &cs->css ; 1822
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1859} 1852}
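The cloning branch above only runs when CGRP_CPUSET_CLONE_CHILDREN is set on the new cgroup, which user space controls through the cgroup.clone_children file (or the clone_children mount option). A hedged sketch of exercising it; paths are assumptions about the local setup:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* e.g. parent = "/sys/fs/cgroup/cpuset/pool" (path is an assumption) */
static int make_cloned_child(const char *parent, const char *child)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.clone_children", parent);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("1\n", f);
	fclose(f);

	/* the new child starts with the parent's cpus/mems instead of empty sets */
	snprintf(path, sizeof(path), "%s/%s", parent, child);
	return mkdir(path, 0755);
}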
1860 1853
1861/* 1854/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1864 * will call async_rebuild_sched_domains(). 1857 * will call async_rebuild_sched_domains().
1865 */ 1858 */
1866 1859
1867static void cpuset_destroy(struct cgroup *cont) 1860static void cpuset_css_free(struct cgroup *cont)
1868{ 1861{
1869 struct cpuset *cs = cgroup_cs(cont); 1862 struct cpuset *cs = cgroup_cs(cont);
1870 1863
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
1878 1871
1879struct cgroup_subsys cpuset_subsys = { 1872struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1873 .name = "cpuset",
1881 .create = cpuset_create, 1874 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1875 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1876 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach, 1877 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1878 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1879 .base_cftypes = files,
1888 .early_init = 1, 1880 .early_init = 1,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134d..f9ff5493171d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7434,7 +7434,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7434device_initcall(perf_event_sysfs_init);
7435 7435
7436#ifdef CONFIG_CGROUP_PERF 7436#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7438{
7439 struct perf_cgroup *jc; 7439 struct perf_cgroup *jc;
7440 7440
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7451 return &jc->css;
7452} 7452}
7453 7453
7454static void perf_cgroup_destroy(struct cgroup *cont) 7454static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7455{
7456 struct perf_cgroup *jc; 7456 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7492struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7493 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7494 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7495 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7496 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499 7499
diff --git a/kernel/fork.c b/kernel/fork.c
index 850dde1e0c84..79de9f99a48d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1137,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1137{ 1137{
1138 int retval; 1138 int retval;
1139 struct task_struct *p; 1139 struct task_struct *p;
1140 int cgroup_callbacks_done = 0;
1141 1140
1142 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1143 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
@@ -1395,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1395 INIT_LIST_HEAD(&p->thread_group); 1394 INIT_LIST_HEAD(&p->thread_group);
1396 p->task_works = NULL; 1395 p->task_works = NULL;
1397 1396
1398 /* Now that the task is set up, run cgroup callbacks if
1399 * necessary. We need to run them before the task is visible
1400 * on the tasklist. */
1401 cgroup_fork_callbacks(p);
1402 cgroup_callbacks_done = 1;
1403
1404 /* Need tasklist lock for parent etc handling! */ 1397 /* Need tasklist lock for parent etc handling! */
1405 write_lock_irq(&tasklist_lock); 1398 write_lock_irq(&tasklist_lock);
1406 1399
@@ -1505,7 +1498,7 @@ bad_fork_cleanup_cgroup:
1505#endif 1498#endif
1506 if (clone_flags & CLONE_THREAD) 1499 if (clone_flags & CLONE_THREAD)
1507 threadgroup_change_end(current); 1500 threadgroup_change_end(current);
1508 cgroup_exit(p, cgroup_callbacks_done); 1501 cgroup_exit(p, 0);
1509 delayacct_tsk_free(p); 1502 delayacct_tsk_free(p);
1510 module_put(task_thread_info(p)->exec_domain->module); 1503 module_put(task_thread_info(p)->exec_domain->module);
1511bad_fork_cleanup_count: 1504bad_fork_cleanup_count:
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5066a61f971..6271b89f87ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7484,7 +7484,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7484 struct task_group, css); 7484 struct task_group, css);
7485} 7485}
7486 7486
7487static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7487static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7488{ 7488{
7489 struct task_group *tg, *parent; 7489 struct task_group *tg, *parent;
7490 7490
@@ -7501,7 +7501,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7501 return &tg->css; 7501 return &tg->css;
7502} 7502}
7503 7503
7504static void cpu_cgroup_destroy(struct cgroup *cgrp) 7504static void cpu_cgroup_css_free(struct cgroup *cgrp)
7505{ 7505{
7506 struct task_group *tg = cgroup_tg(cgrp); 7506 struct task_group *tg = cgroup_tg(cgrp);
7507 7507
@@ -7861,8 +7861,8 @@ static struct cftype cpu_files[] = {
7861 7861
7862struct cgroup_subsys cpu_cgroup_subsys = { 7862struct cgroup_subsys cpu_cgroup_subsys = {
7863 .name = "cpu", 7863 .name = "cpu",
7864 .create = cpu_cgroup_create, 7864 .css_alloc = cpu_cgroup_css_alloc,
7865 .destroy = cpu_cgroup_destroy, 7865 .css_free = cpu_cgroup_css_free,
7866 .can_attach = cpu_cgroup_can_attach, 7866 .can_attach = cpu_cgroup_can_attach,
7867 .attach = cpu_cgroup_attach, 7867 .attach = cpu_cgroup_attach,
7868 .exit = cpu_cgroup_exit, 7868 .exit = cpu_cgroup_exit,
@@ -7885,7 +7885,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7885struct cpuacct root_cpuacct; 7885struct cpuacct root_cpuacct;
7886 7886
7887/* create a new cpu accounting group */ 7887/* create a new cpu accounting group */
7888static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7888static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7889{ 7889{
7890 struct cpuacct *ca; 7890 struct cpuacct *ca;
7891 7891
@@ -7915,7 +7915,7 @@ out:
7915} 7915}
7916 7916
7917/* destroy an existing cpu accounting group */ 7917/* destroy an existing cpu accounting group */
7918static void cpuacct_destroy(struct cgroup *cgrp) 7918static void cpuacct_css_free(struct cgroup *cgrp)
7919{ 7919{
7920 struct cpuacct *ca = cgroup_ca(cgrp); 7920 struct cpuacct *ca = cgroup_ca(cgrp);
7921 7921
@@ -8086,8 +8086,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8086 8086
8087struct cgroup_subsys cpuacct_subsys = { 8087struct cgroup_subsys cpuacct_subsys = {
8088 .name = "cpuacct", 8088 .name = "cpuacct",
8089 .create = cpuacct_create, 8089 .css_alloc = cpuacct_css_alloc,
8090 .destroy = cpuacct_destroy, 8090 .css_free = cpuacct_css_free,
8091 .subsys_id = cpuacct_subsys_id, 8091 .subsys_id = cpuacct_subsys_id,
8092 .base_cftypes = files, 8092 .base_cftypes = files,
8093}; 8093};
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..5ffb5626e072 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1908 preempt_disable();
1909 read_unlock(&tasklist_lock); 1909 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1910 preempt_enable_no_resched();
1911 schedule(); 1911 freezable_schedule();
1912 } else { 1912 } else {
1913 /* 1913 /*
1914 * By the time we got the lock, our tracer went away. 1914 * By the time we got the lock, our tracer went away.
@@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1930 }
1931 1931
1932 /* 1932 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1933 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1934 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1935 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr)
2092 } 2085 }
2093 2086
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2087 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2088 freezable_schedule();
2096 return true; 2089 return true;
2097 } else { 2090 } else {
2098 /* 2091 /*
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2193 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2194 return 0;
2202 2195
2203relock:
2204 /* 2196 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2197 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2198 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2199 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2200 */
2210 try_to_freeze(); 2201 try_to_freeze();
2211 2202
2203relock:
2212 spin_lock_irq(&sighand->siglock); 2204 spin_lock_irq(&sighand->siglock);
2213 /* 2205 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2206 * Every stopped thread goes here after wakeup. Check to see if