Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c          |  754
-rw-r--r--  kernel/cgroup_freezer.c  |  514
-rw-r--r--  kernel/cpuset.c          |   90
-rw-r--r--  kernel/events/core.c     |    8
-rw-r--r--  kernel/fork.c            |    9
-rw-r--r--  kernel/freezer.c         |   11
-rw-r--r--  kernel/power/process.c   |   13
-rw-r--r--  kernel/sched/core.c      |   16
-rw-r--r--  kernel/signal.c          |   20
9 files changed, 758 insertions(+), 677 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..f34c41bfaa37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
+	/* IDs for cgroups in this hierarchy */
+	struct ida cgroup_ida;
+
 	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
 
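The first hunk gives each hierarchy its own IDA, so cgroup IDs are small, dense integers scoped to the hierarchy, with 0 implicitly reserved for the root. A minimal sketch of the allocator pattern this patch relies on (the demo_* names are invented for illustration; ida_init(), ida_simple_get() and ida_simple_remove() are the real kernel APIs):

	#include <linux/idr.h>
	#include <linux/gfp.h>

	struct demo_root {
		struct ida obj_ida;		/* one allocator per hierarchy */
	};

	static void demo_root_init(struct demo_root *root)
	{
		ida_init(&root->obj_ida);	/* no preallocation needed */
	}

	static int demo_obj_get_id(struct demo_root *root)
	{
		/* lowest free ID >= 1; 0 stays reserved for the root object */
		return ida_simple_get(&root->obj_ida, 1, 0, GFP_KERNEL);
	}

	static void demo_obj_put_id(struct demo_root *root, int id)
	{
		/* must pair with exactly one successful ida_simple_get() */
		ida_simple_remove(&root->obj_ida, id);
	}

ida_simple_get() returns the lowest free ID in the requested range or a negative errno, which is why cgroup_create() below checks cgrp->id < 0 before anything else.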
@@ -171,8 +174,8 @@ struct css_id {
 	 * The css to which this ID points. This pointer is set to valid value
 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
 	 * This pointer is expected to be RCU-safe because destroy()
-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
-	 * css_tryget() should be used for avoiding race.
+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
+	 * should be used for avoiding race.
 	 */
 	struct cgroup_subsys_state __rcu *css;
 	/*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  */
 static int need_forkexit_callback __read_mostly;
 
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      struct cftype cfts[], bool is_add);
+
 #ifdef CONFIG_PROVE_LOCKING
 int cgroup_lock_is_held(void)
 {
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-static int clone_children(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
 * another. It does so using cgroup_mutex, however there are
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
-	struct cgroup_subsys *ss;
-	int ret = 0;
-
-	for_each_subsys(cgrp->root, ss) {
-		if (!ss->pre_destroy)
-			continue;
-
-		ret = ss->pre_destroy(cgrp);
-		if (ret) {
-			/* ->pre_destroy() failure is being deprecated */
-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-			break;
-		}
-	}
-
-	return ret;
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 * Release the subsystem state objects.
 		 */
 		for_each_subsys(cgrp->root, ss)
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 
 		cgrp->root->number_of_cgroups--;
 		mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 
 		simple_xattrs_free(&cgrp->xattrs);
 
+		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
 		kfree_rcu(cgrp, rcu_head);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_rm_file(cgrp, set->cfts);
+			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
 	}
 	if (base_files) {
 		while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 }
 
 /*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-		wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-	css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-	cgroup_wakeup_rmdir_waiter(css->cgroup);
-	css_put(css);
-}
-
-/*
  * Call with cgroup_mutex held. Drops reference counts on modules, including
  * any duplicate ones that parse_cgroupfs_options took. If this function
  * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
-	if (clone_children(&root->top_cgroup))
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
 	unsigned long subsys_mask;
 	unsigned long flags;
 	char *release_agent;
-	bool clone_children;
+	bool cpuset_clone_children;
 	char *name;
 	/* User explicitly requested empty subsystem */
 	bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
-			opts->clone_children = true;
+			opts->cpuset_clone_children = true;
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
+	/*
+	 * Clear out the files of subsystems that should be removed, do
+	 * this before rebind_subsystems, since rebind_subsystems may
+	 * change this hierarchy's subsys_list.
+	 */
+	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+
 	ret = rebind_subsystems(root, opts.subsys_mask);
 	if (ret) {
+		/* rebind_subsystems failed, re-populate the removed files */
+		cgroup_populate_dir(cgrp, false, removed_mask);
 		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
-	/* clear out any existing files and repopulate subsystem files */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 	/* re-populate subsystem files */
 	cgroup_populate_dir(cgrp, false, added_mask);
 
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
 static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
+	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
-	if (opts->clone_children)
-		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+	if (opts->cpuset_clone_children)
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
 	return root;
 }
 
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
 	spin_lock(&hierarchy_id_lock);
 	ida_remove(&hierarchy_ida, root->hierarchy_id);
 	spin_unlock(&hierarchy_id_lock);
+	ida_destroy(&root->cgroup_ida);
 	kfree(root);
 }
 
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 		free_cg_links(&tmp_cg_links);
 
-		BUG_ON(!list_empty(&root_cgrp->sibling));
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
-	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
+	struct dentry *dentry = cgrp->dentry;
 	char *start;
-	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
-						      cgroup_lock_is_held());
+
+	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+			   "cgroup_path() called without proper locking");
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		return 0;
 	}
 
-	start = buf + buflen;
+	start = buf + buflen - 1;
 
-	*--start = '\0';
+	*start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
 
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		if (!cgrp)
 			break;
 
-		dentry = rcu_dereference_check(cgrp->dentry,
-					       cgroup_lock_is_held());
+		dentry = cgrp->dentry;
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
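The reworked cgroup_path() builds the path strictly backwards: the NUL terminator goes at buf + buflen - 1 and each dentry name is prepended while walking toward the root, so no second pass or string reversal is needed. A user-space sketch of the same backward construction (a hypothetical helper, not kernel code):

	#include <stdio.h>
	#include <string.h>

	/* Build "/a/b/c" into buf from a root-to-leaf name list. */
	static int build_path(const char *const *names, int depth,
			      char *buf, int buflen)
	{
		char *start = buf + buflen - 1;

		*start = '\0';
		while (depth-- > 0) {		/* leaf first, root last */
			int len = strlen(names[depth]);

			if (start - buf < len + 1)
				return -1;	/* -ENAMETOOLONG in the kernel */
			start -= len;
			memcpy(start, names[depth], len);
			*--start = '/';
		}
		memmove(buf, start, buf + buflen - start);
		return 0;
	}

	int main(void)
	{
		const char *names[] = { "sys", "fs", "cgroup" };
		char buf[64];

		if (!build_path(names, 3, buf, sizeof(buf)))
			printf("%s\n", buf);	/* prints /sys/fs/cgroup */
		return 0;
	}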
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex and threadgroup locked.
 */
 static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 				struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	}
 
 	synchronize_rcu();
-
-	/*
-	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
-	 * is no longer empty.
-	 */
-	cgroup_wakeup_rmdir_waiter(cgrp);
 out:
 	if (retval) {
 		for_each_subsys(root, ss) {
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 5: success! and cleanup
 	 */
 	synchronize_rcu();
-	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
 out_put_css_set_refs:
 	if (retval) {
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 
 		/* start off with i_nlink == 2 (for "." entry) */
 		inc_nlink(inode);
+		inc_nlink(dentry->d_parent->d_inode);
 
-		/* start with the directory inode held, so that we can
-		 * populate it without racing with another mkdir */
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		/*
+		 * Control reaches here with cgroup_mutex held.
+		 * @inode->i_mutex should nest outside cgroup_mutex but we
+		 * want to populate it immediately without releasing
+		 * cgroup_mutex. As @inode isn't visible to anyone else
+		 * yet, trylock will always succeed without affecting
+		 * lockdep checks.
+		 */
+		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 	return 0;
 }
 
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- *        ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-			     umode_t mode)
-{
-	struct dentry *parent;
-	int error = 0;
-
-	parent = cgrp->parent->dentry;
-	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
-	if (!error) {
-		dentry->d_fsdata = cgrp;
-		inc_nlink(parent->d_inode);
-		rcu_assign_pointer(cgrp->dentry, dentry);
-		dget(dentry);
-	}
-	dput(dentry);
-
-	return error;
-}
-
 /**
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
 	simple_xattrs_init(&cft->xattrs);
 
-	/* does @cft->flags tell us to skip creation on @cgrp? */
-	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
-		return 0;
-	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
-		return 0;
-
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	int err, ret = 0;
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+			continue;
+		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+			continue;
+
 		if (is_add)
 			err = cgroup_add_file(cgrp, subsys, cft);
 		else
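Moving the CFTYPE_NOT_ON_ROOT/CFTYPE_ONLY_ON_ROOT checks from cgroup_add_file() into cgroup_addrm_files() means the flags are now honoured symmetrically on both addition and removal. A hedged sketch of how a controller declares such files (the controller and file names here are invented):

	#include <linux/cgroup.h>

	static u64 demo_limit_read(struct cgroup *cgrp, struct cftype *cft)
	{
		return 0;	/* placeholder value */
	}

	static struct cftype demo_files[] = {
		{
			.name = "limit",
			.flags = CFTYPE_NOT_ON_ROOT,	/* hidden in the root cgroup */
			.read_u64 = demo_limit_read,
		},
		{
			.name = "global_stat",
			.flags = CFTYPE_ONLY_ON_ROOT,	/* only in the root cgroup */
			.read_u64 = demo_limit_read,
		},
		{ }	/* terminator: cgroup_addrm_files() stops at name[0] == '\0' */
	};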
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
 	write_unlock(&css_set_lock);
 }
 
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre(). Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+					  struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, pretend we just visited @cgroup */
+	if (!pos) {
+		if (list_empty(&cgroup->children))
+			return NULL;
+		pos = cgroup;
+	}
+
+	/* visit the first child if exists */
+	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+	if (next)
+		return next;
+
+	/* no child, visit my or the closest ancestor's next sibling */
+	do {
+		next = list_entry_rcu(pos->sibling.next, struct cgroup,
+				      sibling);
+		if (&next->sibling != &pos->parent->children)
+			return next;
+
+		pos = pos->parent;
+	} while (pos != cgroup);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+	struct cgroup *last;
+
+	do {
+		last = pos;
+		pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+					     sibling);
+	} while (pos);
+
+	return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post(). Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+					   struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, visit the leftmost descendant */
+	if (!pos) {
+		next = cgroup_leftmost_descendant(cgroup);
+		return next != cgroup ? next : NULL;
+	}
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+	if (&next->sibling != &pos->parent->children)
+		return cgroup_leftmost_descendant(next);
+
+	/* no sibling left, visit parent */
+	next = pos->parent;
+	return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	__acquires(css_set_lock)
 {
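Both iterators return the next position or NULL when the walk is exhausted, and both insist on rcu_read_lock() being held for the whole traversal, as the WARN_ON_ONCE() checks enforce. A minimal pre-order walk built directly on cgroup_next_descendant_pre() (visit() is a hypothetical callback; the cgroup_for_each_descendant_pre() macro added alongside these functions wraps the same loop):

	#include <linux/cgroup.h>
	#include <linux/rcupdate.h>

	static void visit(struct cgroup *cgrp);	/* hypothetical per-cgroup work */

	static void demo_walk_subtree(struct cgroup *root)
	{
		struct cgroup *pos = NULL;

		rcu_read_lock();
		/* pre-order: a parent is always visited before its children */
		while ((pos = cgroup_next_descendant_pre(pos, root)))
			visit(pos);
		rcu_read_unlock();
	}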
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	if (flags & POLLHUP) {
 		__remove_wait_queue(event->wqh, &event->wait);
 		spin_lock(&cgrp->event_list_lock);
-		list_del(&event->list);
+		list_del_init(&event->list);
 		spin_unlock(&cgrp->event_list_lock);
 		/*
 		 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3913,7 @@ fail:
 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
 				      struct cftype *cft)
 {
-	return clone_children(cgrp);
+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 }
 
 static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 				     u64 val)
 {
 	if (val)
-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	else
-		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	return 0;
 }
 
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
-		set_bit(CSS_ROOT, &css->flags);
+		css->flags |= CSS_ROOT;
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
 
 	/*
-	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
-	 * which is put on the last css_put(). dput() requires process
-	 * context, which css_put() may be called without. @css->dput_work
-	 * will be used to invoke dput() asynchronously from css_put().
+	 * css holds an extra ref to @cgrp->dentry which is put on the last
+	 * css_put(). dput() requires process context, which css_put() may
+	 * be called without. @css->dput_work will be used to invoke
+	 * dput() asynchronously from css_put().
 	 */
 	INIT_WORK(&css->dput_work, css_dput_fn);
-	if (ss->__DEPRECATED_clear_css_refs)
-		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/* invoke ->post_create() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	int ret = 0;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (ss->css_online)
+		ret = ss->css_online(cgrp);
+	if (!ret)
+		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+	return ret;
+}
+
+/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+	struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (!(css->flags & CSS_ONLINE))
+		return;
+
+	/*
+	 * css_offline() should be called with cgroup_mutex unlocked. See
+	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+	 * details. This temporary unlocking should go away once
+	 * cgroup_mutex is unexported from controllers.
+	 */
+	if (ss->css_offline) {
+		mutex_unlock(&cgroup_mutex);
+		ss->css_offline(cgrp);
+		mutex_lock(&cgroup_mutex);
+	}
+
+	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
 
 /*
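online_css() and offline_css() split a controller's life cycle into four stages: css_alloc (allocate, no side effects), css_online (go live, allowed to fail), css_offline (begin teardown) and css_free (release). A rough sketch of a controller using the renamed hooks (all demo_* names, including demo_subsys_id, are placeholders, not real identifiers):

	#include <linux/cgroup.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	struct demo_css {
		struct cgroup_subsys_state css;
		/* controller-private state */
	};

	static int demo_subsys_id;	/* placeholder for the real subsys_id */

	static struct cgroup_subsys_state *demo_css_alloc(struct cgroup *cgrp)
	{
		struct demo_css *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

		if (!dc)
			return ERR_PTR(-ENOMEM);
		return &dc->css;	/* allocation only, no side effects */
	}

	static int demo_css_online(struct cgroup *cgrp)
	{
		/* cgroup is live and visible; returning an error unwinds creation */
		return 0;
	}

	static void demo_css_offline(struct cgroup *cgrp)
	{
		/* invoked once from cgroup_destroy_locked(), may sleep */
	}

	static void demo_css_free(struct cgroup *cgrp)
	{
		kfree(container_of(cgrp->subsys[demo_subsys_id],
				   struct demo_css, css));
	}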
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
+	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;
 
+	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+	if (cgrp->id < 0)
+		goto err_free_cgrp;
+
+	/*
+	 * Only live parents can have children. Note that the liveliness
+	 * check isn't strictly necessary because cgroup_mkdir() and
+	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+	 * anyway so that locking is contained inside cgroup proper and we
+	 * don't get nasty surprises if we ever grow another caller.
+	 */
+	if (!cgroup_lock_live_group(parent)) {
+		err = -ENODEV;
+		goto err_free_id;
+	}
+
 	/* Grab a reference on the superblock so the hierarchy doesn't
 	 * get deleted on unmount if there are child cgroups. This
 	 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	 * fs */
 	atomic_inc(&sb->s_active);
 
-	mutex_lock(&cgroup_mutex);
-
 	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
-	if (clone_children(parent))
-		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css;
 
-		css = ss->create(cgrp);
+		css = ss->css_alloc(cgrp);
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
-			goto err_destroy;
+			goto err_free_all;
 		}
 		init_cgroup_css(css, ss, cgrp);
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
-				goto err_destroy;
+				goto err_free_all;
 		}
-		/* At error, ->destroy() callback has to free assigned ID. */
-		if (clone_children(parent) && ss->post_clone)
-			ss->post_clone(cgrp);
+	}
+
+	/*
+	 * Create directory. cgroup_create_file() returns with the new
+	 * directory locked on success so that it can be populated without
+	 * dropping cgroup_mutex.
+	 */
+	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+	if (err < 0)
+		goto err_free_all;
+	lockdep_assert_held(&dentry->d_inode->i_mutex);
+
+	/* allocation complete, commit to creation */
+	dentry->d_fsdata = cgrp;
+	cgrp->dentry = dentry;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+	root->number_of_cgroups++;
+
+	/* each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		dget(dentry);
+
+	/* creation succeeded, notify subsystems */
+	for_each_subsys(root, ss) {
+		err = online_css(ss, cgrp);
+		if (err)
+			goto err_destroy;
 
 		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
 		    parent->parent) {
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
-	list_add(&cgrp->sibling, &cgrp->parent->children);
-	root->number_of_cgroups++;
-
-	err = cgroup_create_dir(cgrp, dentry, mode);
-	if (err < 0)
-		goto err_remove;
-
-	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
-	for_each_subsys(root, ss)
-		if (!ss->__DEPRECATED_clear_css_refs)
-			dget(dentry);
-
-	/* The cgroup directory was pre-locked for us */
-	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
-
 	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
-	/* If err < 0, we have a half-filled directory - oh well ;) */
+	if (err)
+		goto err_destroy;
 
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	return 0;
 
-err_remove:
-
-	list_del(&cgrp->sibling);
-	root->number_of_cgroups--;
-
-err_destroy:
-
+err_free_all:
 	for_each_subsys(root, ss) {
 		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 	}
-
 	mutex_unlock(&cgroup_mutex);
-
 	/* Release the reference count that we took on the superblock */
 	deactivate_super(sb);
-
+err_free_id:
+	ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_cgrp:
 	kfree(cgrp);
 	return err;
+
+err_destroy:
+	cgroup_destroy_locked(cgrp);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	return err;
 }
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
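The rewritten cgroup_create() follows the usual kernel unwind idiom: perform every failable allocation first, pick one commit point after which the object is externally visible, and give each step its own error label so cleanup runs in reverse order. Before the commit point, plain frees suffice (err_free_all/err_free_id/err_free_cgrp); after it, full destruction (err_destroy) is required. The idiom in isolation (a sketch; the demo_* names and publish() are invented):

	#include <linux/slab.h>
	#include <linux/err.h>

	struct demo_obj { int id; void *priv; };

	static int publish(struct demo_obj *obj);	/* hypothetical commit point */

	static struct demo_obj *demo_create(void)
	{
		struct demo_obj *obj;
		int err;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj)
			return ERR_PTR(-ENOMEM);

		obj->priv = kzalloc(64, GFP_KERNEL);
		if (!obj->priv) {
			err = -ENOMEM;
			goto err_free_obj;	/* unwind in reverse order */
		}

		err = publish(obj);		/* object becomes visible here */
		if (err)
			goto err_free_priv;	/* not visible yet: plain frees */

		return obj;	/* past this point, use a destroy path instead */

	err_free_priv:
		kfree(obj->priv);
	err_free_obj:
		kfree(obj);
		return ERR_PTR(err);
	}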
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	return 0;
 }
 
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
+	struct dentry *d = cgrp->dentry;
+	struct cgroup *parent = cgrp->parent;
+	DEFINE_WAIT(wait);
+	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
-	unsigned long flags;
-	bool failed = false;
+	LIST_HEAD(tmp_list);
+
+	lockdep_assert_held(&d->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
 
-	local_irq_save(flags);
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt. If all refcnts
-	 * for subsystems w/ clear_css_refs set were 1 at the moment of
-	 * deactivation, we succeeded.
+	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
+	 * removed. This makes future css_tryget() and child creation
+	 * attempts fail thus maintaining the removal conditions verified
+	 * above.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
-		if (ss->__DEPRECATED_clear_css_refs)
-			failed |= css_refcnt(css) != 1;
-	}
-
-	/*
-	 * If succeeded, set REMOVED and put all the base refs; otherwise,
-	 * restore refcnts to positive values. Either way, all in-progress
-	 * css_tryget() will be released.
-	 */
-	for_each_subsys(cgrp->root, ss) {
-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
-		if (!failed) {
-			set_bit(CSS_REMOVED, &css->flags);
-			css_put(css);
-		} else {
-			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
-		}
 	}
+	set_bit(CGRP_REMOVED, &cgrp->flags);
 
-	local_irq_restore(flags);
-	return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
-	struct cgroup *cgrp = dentry->d_fsdata;
-	struct dentry *d;
-	struct cgroup *parent;
-	DEFINE_WAIT(wait);
-	struct cgroup_event *event, *tmp;
-	int ret;
-
-	/* the vfs holds both inode->i_mutex already */
-again:
-	mutex_lock(&cgroup_mutex);
-	if (atomic_read(&cgrp->count) != 0) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	if (!list_empty(&cgrp->children)) {
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	mutex_unlock(&cgroup_mutex);
-
-	/*
-	 * In general, subsystem has no css->refcnt after pre_destroy(). But
-	 * in racy cases, subsystem may have to get css->refcnt after
-	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
-	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
-	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
-	 * and subsystem's reference count handling. Please see css_get/put
-	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
-	 */
-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	/* tell subsystems to initate destruction */
+	for_each_subsys(cgrp->root, ss)
+		offline_css(ss, cgrp);
 
 	/*
-	 * Call pre_destroy handlers of subsys. Notify subsystems
-	 * that rmdir() request comes.
+	 * Put all the base refs. Each css holds an extra reference to the
+	 * cgroup's dentry and cgroup removal proceeds regardless of css
+	 * refs. On the last put of each css, whenever that may be, the
+	 * extra dentry ref is put so that dentry destruction happens only
+	 * after all css's are released.
 	 */
-	ret = cgroup_call_pre_destroy(cgrp);
-	if (ret) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		return ret;
-	}
-
-	mutex_lock(&cgroup_mutex);
-	parent = cgrp->parent;
-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-	if (!cgroup_clear_css_refs(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		/*
-		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
-		 * prepare_to_wait(), we need to check this flag.
-		 */
-		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
-			schedule();
-		finish_wait(&cgroup_rmdir_waitq, &wait);
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		if (signal_pending(current))
-			return -EINTR;
-		goto again;
-	}
-	/* NO css_tryget() can success after here. */
-	finish_wait(&cgroup_rmdir_waitq, &wait);
-	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	for_each_subsys(cgrp->root, ss)
+		css_put(cgrp->subsys[ss->subsys_id]);
 
 	raw_spin_lock(&release_list_lock);
-	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
 	/* delete this cgroup from parent->children */
-	list_del_init(&cgrp->sibling);
-
+	list_del_rcu(&cgrp->sibling);
 	list_del_init(&cgrp->allcg_node);
 
-	d = dget(cgrp->dentry);
-
+	dget(d);
 	cgroup_d_remove_dir(d);
 	dput(d);
 
@@ -4353,21 +4341,35 @@ again:
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace
+	 * directory to avoid race between userspace and kernelspace. Use
+	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
+	 * cgroup_event_wake() is called with the wait queue head locked,
+	 * remove_wait_queue() cannot be called while holding event_list_lock.
 	 */
 	spin_lock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-		list_del(&event->list);
+	list_splice_init(&cgrp->event_list, &tmp_list);
+	spin_unlock(&cgrp->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+		list_del_init(&event->list);
 		remove_wait_queue(event->wqh, &event->wait);
 		eventfd_signal(event->eventfd, 1);
 		schedule_work(&event->remove);
 	}
-	spin_unlock(&cgrp->event_list_lock);
 
-	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = cgroup_destroy_locked(dentry->d_fsdata);
+	mutex_unlock(&cgroup_mutex);
+
+	return ret;
+}
+
 static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
 {
 	INIT_LIST_HEAD(&ss->cftsets);
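cgroup_destroy_locked() drains the event list by splicing it onto a stack-local list under event_list_lock and only then calling remove_wait_queue(), because cgroup_event_wake() runs with the waitqueue head locked and takes event_list_lock itself; unregistering while holding event_list_lock could therefore deadlock. The drain pattern in isolation (the demo_* names are invented):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct demo_item {
		struct list_head node;
	};

	static void demo_process(struct demo_item *item);	/* hypothetical, may sleep */

	static void demo_drain(struct list_head *src, spinlock_t *lock)
	{
		LIST_HEAD(tmp_list);
		struct demo_item *item, *tmp;

		spin_lock(lock);
		list_splice_init(src, &tmp_list);	/* empties *src atomically */
		spin_unlock(lock);

		/* tmp_list is private now; no lock needed, sleeping is fine */
		list_for_each_entry_safe(item, tmp, &tmp_list, node) {
			list_del_init(&item->node);
			demo_process(item);
		}
	}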
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4388 | 4390 | ||
4389 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4391 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4390 | 4392 | ||
4393 | mutex_lock(&cgroup_mutex); | ||
4394 | |||
4391 | /* init base cftset */ | 4395 | /* init base cftset */ |
4392 | cgroup_init_cftsets(ss); | 4396 | cgroup_init_cftsets(ss); |
4393 | 4397 | ||
4394 | /* Create the top cgroup state for this subsystem */ | 4398 | /* Create the top cgroup state for this subsystem */ |
4395 | list_add(&ss->sibling, &rootnode.subsys_list); | 4399 | list_add(&ss->sibling, &rootnode.subsys_list); |
4396 | ss->root = &rootnode; | 4400 | ss->root = &rootnode; |
4397 | css = ss->create(dummytop); | 4401 | css = ss->css_alloc(dummytop); |
4398 | /* We don't handle early failures gracefully */ | 4402 | /* We don't handle early failures gracefully */ |
4399 | BUG_ON(IS_ERR(css)); | 4403 | BUG_ON(IS_ERR(css)); |
4400 | init_cgroup_css(css, ss, dummytop); | 4404 | init_cgroup_css(css, ss, dummytop); |
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4403 | * pointer to this state - since the subsystem is | 4407 | * pointer to this state - since the subsystem is |
4404 | * newly registered, all tasks and hence the | 4408 | * newly registered, all tasks and hence the |
4405 | * init_css_set is in the subsystem's top cgroup. */ | 4409 | * init_css_set is in the subsystem's top cgroup. */ |
4406 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 4410 | init_css_set.subsys[ss->subsys_id] = css; |
4407 | 4411 | ||
4408 | need_forkexit_callback |= ss->fork || ss->exit; | 4412 | need_forkexit_callback |= ss->fork || ss->exit; |
4409 | 4413 | ||
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4413 | BUG_ON(!list_empty(&init_task.tasks)); | 4417 | BUG_ON(!list_empty(&init_task.tasks)); |
4414 | 4418 | ||
4415 | ss->active = 1; | 4419 | ss->active = 1; |
4420 | BUG_ON(online_css(ss, dummytop)); | ||
4421 | |||
4422 | mutex_unlock(&cgroup_mutex); | ||
4416 | 4423 | ||
4417 | /* this function shouldn't be used with modular subsystems, since they | 4424 | /* this function shouldn't be used with modular subsystems, since they |
4418 | * need to register a subsys_id, among other things */ | 4425 | * need to register a subsys_id, among other things */ |
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4430 | */ | 4437 | */ |
4431 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | 4438 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) |
4432 | { | 4439 | { |
4433 | int i; | ||
4434 | struct cgroup_subsys_state *css; | 4440 | struct cgroup_subsys_state *css; |
4441 | int i, ret; | ||
4435 | 4442 | ||
4436 | /* check name and function validity */ | 4443 | /* check name and function validity */ |
4437 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4444 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
4438 | ss->create == NULL || ss->destroy == NULL) | 4445 | ss->css_alloc == NULL || ss->css_free == NULL) |
4439 | return -EINVAL; | 4446 | return -EINVAL; |
4440 | 4447 | ||
4441 | /* | 4448 | /* |
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4464 | subsys[ss->subsys_id] = ss; | 4471 | subsys[ss->subsys_id] = ss; |
4465 | 4472 | ||
4466 | /* | 4473 | /* |
4467 | * no ss->create seems to need anything important in the ss struct, so | 4474 | * no ss->css_alloc seems to need anything important in the ss |
4468 | * this can happen first (i.e. before the rootnode attachment). | 4475 | * struct, so this can happen first (i.e. before the rootnode |
4476 | * attachment). | ||
4469 | */ | 4477 | */ |
4470 | css = ss->create(dummytop); | 4478 | css = ss->css_alloc(dummytop); |
4471 | if (IS_ERR(css)) { | 4479 | if (IS_ERR(css)) { |
4472 | /* failure case - need to deassign the subsys[] slot. */ | 4480 | /* failure case - need to deassign the subsys[] slot. */ |
4473 | subsys[ss->subsys_id] = NULL; | 4481 | subsys[ss->subsys_id] = NULL; |
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4482 | init_cgroup_css(css, ss, dummytop); | 4490 | init_cgroup_css(css, ss, dummytop); |
4483 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4491 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4484 | if (ss->use_id) { | 4492 | if (ss->use_id) { |
4485 | int ret = cgroup_init_idr(ss, css); | 4493 | ret = cgroup_init_idr(ss, css); |
4486 | if (ret) { | 4494 | if (ret) |
4487 | dummytop->subsys[ss->subsys_id] = NULL; | 4495 | goto err_unload; |
4488 | ss->destroy(dummytop); | ||
4489 | subsys[ss->subsys_id] = NULL; | ||
4490 | mutex_unlock(&cgroup_mutex); | ||
4491 | return ret; | ||
4492 | } | ||
4493 | } | 4496 | } |
4494 | 4497 | ||
4495 | /* | 4498 | /* |
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4522 | write_unlock(&css_set_lock); | 4525 | write_unlock(&css_set_lock); |
4523 | 4526 | ||
4524 | ss->active = 1; | 4527 | ss->active = 1; |
4528 | ret = online_css(ss, dummytop); | ||
4529 | if (ret) | ||
4530 | goto err_unload; | ||
4525 | 4531 | ||
4526 | /* success! */ | 4532 | /* success! */ |
4527 | mutex_unlock(&cgroup_mutex); | 4533 | mutex_unlock(&cgroup_mutex); |
4528 | return 0; | 4534 | return 0; |
4535 | |||
4536 | err_unload: | ||
4537 | mutex_unlock(&cgroup_mutex); | ||
4538 | /* @ss can't be mounted here as try_module_get() would fail */ | ||
4539 | cgroup_unload_subsys(ss); | ||
4540 | return ret; | ||
4529 | } | 4541 | } |
4530 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 4542 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
4531 | 4543 | ||
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4552 | BUG_ON(ss->root != &rootnode); | 4564 | BUG_ON(ss->root != &rootnode); |
4553 | 4565 | ||
4554 | mutex_lock(&cgroup_mutex); | 4566 | mutex_lock(&cgroup_mutex); |
4567 | |||
4568 | offline_css(ss, dummytop); | ||
4569 | ss->active = 0; | ||
4570 | |||
4571 | if (ss->use_id) { | ||
4572 | idr_remove_all(&ss->idr); | ||
4573 | idr_destroy(&ss->idr); | ||
4574 | } | ||
4575 | |||
4555 | /* deassign the subsys_id */ | 4576 | /* deassign the subsys_id */ |
4556 | subsys[ss->subsys_id] = NULL; | 4577 | subsys[ss->subsys_id] = NULL; |
4557 | 4578 | ||
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4567 | struct css_set *cg = link->cg; | 4588 | struct css_set *cg = link->cg; |
4568 | 4589 | ||
4569 | hlist_del(&cg->hlist); | 4590 | hlist_del(&cg->hlist); |
4570 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
4571 | cg->subsys[ss->subsys_id] = NULL; | 4591 | cg->subsys[ss->subsys_id] = NULL; |
4572 | hhead = css_set_hash(cg->subsys); | 4592 | hhead = css_set_hash(cg->subsys); |
4573 | hlist_add_head(&cg->hlist, hhead); | 4593 | hlist_add_head(&cg->hlist, hhead); |
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4575 | write_unlock(&css_set_lock); | 4595 | write_unlock(&css_set_lock); |
4576 | 4596 | ||
4577 | /* | 4597 | /* |
4578 | * remove subsystem's css from the dummytop and free it - need to free | 4598 | * remove subsystem's css from the dummytop and free it - need to |
4579 | * before marking as null because ss->destroy needs the | 4599 | * cgrp->subsys pointer to find its state. note that this also |
4580 | * pointer to find their state. note that this also takes care of | 4600 | * cgrp->subsys pointer to find their state. note that this also |
4581 | * freeing the css_id. | 4601 | * takes care of freeing the css_id. |
4582 | */ | 4602 | */ |
4583 | ss->destroy(dummytop); | 4603 | ss->css_free(dummytop); |
4584 | dummytop->subsys[ss->subsys_id] = NULL; | 4604 | dummytop->subsys[ss->subsys_id] = NULL; |
4585 | 4605 | ||
4586 | mutex_unlock(&cgroup_mutex); | 4606 | mutex_unlock(&cgroup_mutex); |
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void) | |||
4624 | 4644 | ||
4625 | BUG_ON(!ss->name); | 4645 | BUG_ON(!ss->name); |
4626 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4646 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4627 | BUG_ON(!ss->create); | 4647 | BUG_ON(!ss->css_alloc); |
4628 | BUG_ON(!ss->destroy); | 4648 | BUG_ON(!ss->css_free); |
4629 | if (ss->subsys_id != i) { | 4649 | if (ss->subsys_id != i) { |
4630 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", | 4650 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
4631 | ss->name, ss->subsys_id); | 4651 | ss->name, ss->subsys_id); |
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child) | |||
4832 | } | 4852 | } |
4833 | 4853 | ||
4834 | /** | 4854 | /** |
4835 | * cgroup_fork_callbacks - run fork callbacks | ||
4836 | * @child: the new task | ||
4837 | * | ||
4838 | * Called on a new task very soon before adding it to the | ||
4839 | * tasklist. No need to take any locks since no-one can | ||
4840 | * be operating on this task. | ||
4841 | */ | ||
4842 | void cgroup_fork_callbacks(struct task_struct *child) | ||
4843 | { | ||
4844 | if (need_forkexit_callback) { | ||
4845 | int i; | ||
4846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4847 | struct cgroup_subsys *ss = subsys[i]; | ||
4848 | |||
4849 | /* | ||
4850 | * forkexit callbacks are only supported for | ||
4851 | * builtin subsystems. | ||
4852 | */ | ||
4853 | if (!ss || ss->module) | ||
4854 | continue; | ||
4855 | |||
4856 | if (ss->fork) | ||
4857 | ss->fork(child); | ||
4858 | } | ||
4859 | } | ||
4860 | } | ||
4861 | |||
4862 | /** | ||
4863 | * cgroup_post_fork - called on a new task after adding it to the task list | 4855 | * cgroup_post_fork - called on a new task after adding it to the task list |
4864 | * @child: the task in question | 4856 | * @child: the task in question |
4865 | * | 4857 | * |
4866 | * Adds the task to the list running through its css_set if necessary. | 4858 | * Adds the task to the list running through its css_set if necessary and |
4867 | * Has to be after the task is visible on the task list in case we race | 4859 | * calls the subsystem fork() callbacks. Has to be after the task is |
4868 | * with the first call to cgroup_iter_start() - to guarantee that the | 4860 | * visible on the task list in case we race with the first call to |
4869 | * new task ends up on its list. | 4861 | * cgroup_iter_start() - to guarantee that the new task ends up on its |
4862 | * list. | ||
4870 | */ | 4863 | */ |
4871 | void cgroup_post_fork(struct task_struct *child) | 4864 | void cgroup_post_fork(struct task_struct *child) |
4872 | { | 4865 | { |
4866 | int i; | ||
4867 | |||
4873 | /* | 4868 | /* |
4874 | * use_task_css_set_links is set to 1 before we walk the tasklist | 4869 | * use_task_css_set_links is set to 1 before we walk the tasklist |
4875 | * under the tasklist_lock and we read it here after we added the child | 4870 | * under the tasklist_lock and we read it here after we added the child |
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child) | |||
4889 | task_unlock(child); | 4884 | task_unlock(child); |
4890 | write_unlock(&css_set_lock); | 4885 | write_unlock(&css_set_lock); |
4891 | } | 4886 | } |
4887 | |||
4888 | /* | ||
4889 | * Call ss->fork(). This must happen after @child is linked on | ||
4890 | * css_set; otherwise, @child might change state between ->fork() | ||
4891 | * and addition to css_set. | ||
4892 | */ | ||
4893 | if (need_forkexit_callback) { | ||
4894 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4895 | struct cgroup_subsys *ss = subsys[i]; | ||
4896 | |||
4897 | /* | ||
4898 | * fork/exit callbacks are supported only for | ||
4899 | * builtin subsystems and we don't need further | ||
4900 | * synchronization as they never go away. | ||
4901 | */ | ||
4902 | if (!ss || ss->module) | ||
4903 | continue; | ||
4904 | |||
4905 | if (ss->fork) | ||
4906 | ss->fork(child); | ||
4907 | } | ||
4908 | } | ||
4892 | } | 4909 | } |
4910 | |||
4893 | /** | 4911 | /** |
4894 | * cgroup_exit - detach cgroup from exiting task | 4912 | * cgroup_exit - detach cgroup from exiting task |
4895 | * @tsk: pointer to task_struct of exiting process | 4913 | * @tsk: pointer to task_struct of exiting process |
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
5022 | /* Caller must verify that the css is not for root cgroup */ | 5040 | /* Caller must verify that the css is not for root cgroup */ |
5023 | bool __css_tryget(struct cgroup_subsys_state *css) | 5041 | bool __css_tryget(struct cgroup_subsys_state *css) |
5024 | { | 5042 | { |
5025 | do { | 5043 | while (true) { |
5026 | int v = css_refcnt(css); | 5044 | int t, v; |
5027 | 5045 | ||
5028 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 5046 | v = css_refcnt(css); |
5047 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
5048 | if (likely(t == v)) | ||
5029 | return true; | 5049 | return true; |
5050 | else if (t < 0) | ||
5051 | return false; | ||
5030 | cpu_relax(); | 5052 | cpu_relax(); |
5031 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 5053 | } |
5032 | |||
5033 | return false; | ||
5034 | } | 5054 | } |
5035 | EXPORT_SYMBOL_GPL(__css_tryget); | 5055 | EXPORT_SYMBOL_GPL(__css_tryget); |
5036 | 5056 | ||
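The rewritten __css_tryget() above no longer needs the CSS_REMOVED bit: deactivation drives the refcount negative with a large bias, so one cmpxchg loop both takes a reference and notices death when the observed value goes negative. A userspace sketch of the same pattern with C11 atomics, where DEACT_BIAS is an illustrative stand-in for the kernel's bias constant:

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* large negative bias added when the object starts dying */
    #define DEACT_BIAS (INT_MIN / 2)

    static atomic_int refcnt = 1;

    static bool tryget(void)
    {
            int v = atomic_load(&refcnt);

            while (true) {
                    if (v < 0)      /* biased negative: object is going away */
                            return false;
                    /* try v -> v + 1; on failure v is reloaded for the retry */
                    if (atomic_compare_exchange_weak(&refcnt, &v, v + 1))
                            return true;
            }
    }

    int main(void)
    {
            printf("before deactivation: %d\n", tryget());  /* 1 */
            atomic_fetch_add(&refcnt, DEACT_BIAS);          /* begin dying */
            printf("after deactivation:  %d\n", tryget());  /* 0 */
            return 0;
    }

Folding liveness into the counter is what lets the hunk delete the trailing test_bit() check: a failed cmpxchg against a negative value is already the "removed" answer.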
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
5049 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 5069 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
5050 | check_for_release(cgrp); | 5070 | check_for_release(cgrp); |
5051 | } | 5071 | } |
5052 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
5053 | break; | 5072 | break; |
5054 | case 0: | 5073 | case 0: |
5055 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 5074 | schedule_work(&css->dput_work); |
5056 | schedule_work(&css->dput_work); | ||
5057 | break; | 5075 | break; |
5058 | } | 5076 | } |
5059 | rcu_read_unlock(); | 5077 | rcu_read_unlock(); |
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5439 | } | 5457 | } |
5440 | 5458 | ||
5441 | #ifdef CONFIG_CGROUP_DEBUG | 5459 | #ifdef CONFIG_CGROUP_DEBUG |
5442 | static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | 5460 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) |
5443 | { | 5461 | { |
5444 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5462 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5445 | 5463 | ||
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | |||
5449 | return css; | 5467 | return css; |
5450 | } | 5468 | } |
5451 | 5469 | ||
5452 | static void debug_destroy(struct cgroup *cont) | 5470 | static void debug_css_free(struct cgroup *cont) |
5453 | { | 5471 | { |
5454 | kfree(cont->subsys[debug_subsys_id]); | 5472 | kfree(cont->subsys[debug_subsys_id]); |
5455 | } | 5473 | } |
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = { | |||
5578 | 5596 | ||
5579 | struct cgroup_subsys debug_subsys = { | 5597 | struct cgroup_subsys debug_subsys = { |
5580 | .name = "debug", | 5598 | .name = "debug", |
5581 | .create = debug_create, | 5599 | .css_alloc = debug_css_alloc, |
5582 | .destroy = debug_destroy, | 5600 | .css_free = debug_css_free, |
5583 | .subsys_id = debug_subsys_id, | 5601 | .subsys_id = debug_subsys_id, |
5584 | .base_cftypes = debug_files, | 5602 | .base_cftypes = debug_files, |
5585 | }; | 5603 | }; |
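Across this series every subsystem's .create/.destroy pair becomes .css_alloc/.css_free, and the freezer below additionally uses the .css_online/.css_offline hooks. A sketch of the resulting four-stage lifecycle; the hook names come from the diff, but the create()/destroy() drivers here are illustrative assumptions, not cgroup core code:

    #include <stdio.h>
    #include <stdlib.h>

    struct css {
            int dummy;
    };

    struct subsys_ops {
            struct css *(*css_alloc)(void);    /* allocate; may fail          */
            int (*css_online)(struct css *);   /* commit; css becomes visible */
            void (*css_offline)(struct css *); /* begin teardown              */
            void (*css_free)(struct css *);    /* final release               */
    };

    static struct css *demo_alloc(void)  { return calloc(1, sizeof(struct css)); }
    static int demo_online(struct css *css)   { (void)css; return 0; }
    static void demo_offline(struct css *css) { (void)css; }
    static void demo_free(struct css *css)    { free(css); }

    static struct css *create(const struct subsys_ops *ops)
    {
            struct css *css = ops->css_alloc();

            if (!css)
                    return NULL;
            if (ops->css_online && ops->css_online(css)) {
                    ops->css_free(css);     /* online failed: undo the alloc */
                    return NULL;
            }
            return css;
    }

    static void destroy(const struct subsys_ops *ops, struct css *css)
    {
            if (ops->css_offline)
                    ops->css_offline(css);  /* stop new users first */
            ops->css_free(css);             /* the kernel defers this via RCU */
    }

    int main(void)
    {
            const struct subsys_ops ops = {
                    .css_alloc   = demo_alloc,
                    .css_online  = demo_online,
                    .css_offline = demo_offline,
                    .css_free    = demo_free,
            };
            struct css *css = create(&ops);

            if (css)
                    destroy(&ops, css);
            return 0;
    }

Splitting allocation from onlining is what gives the freezer a safe point (css_online) to inherit the parent's state under locks, and a safe point (css_offline) to drop system_freezing_cnt before the final free.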
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b1724ce98981..75dda1ea5026 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -22,24 +22,33 @@ | |||
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | 24 | ||
25 | enum freezer_state { | 25 | /* |
26 | CGROUP_THAWED = 0, | 26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
27 | CGROUP_FREEZING, | 27 | * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared |
28 | CGROUP_FROZEN, | 28 | * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING |
29 | * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of | ||
30 | * its ancestors has FREEZING_SELF set. | ||
31 | */ | ||
32 | enum freezer_state_flags { | ||
33 | CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ | ||
34 | CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ | ||
35 | CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ | ||
36 | CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ | ||
37 | |||
38 | /* mask for all FREEZING flags */ | ||
39 | CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, | ||
29 | }; | 40 | }; |
30 | 41 | ||
31 | struct freezer { | 42 | struct freezer { |
32 | struct cgroup_subsys_state css; | 43 | struct cgroup_subsys_state css; |
33 | enum freezer_state state; | 44 | unsigned int state; |
34 | spinlock_t lock; /* protects _writes_ to state */ | 45 | spinlock_t lock; |
35 | }; | 46 | }; |
36 | 47 | ||
37 | static inline struct freezer *cgroup_freezer( | 48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) |
38 | struct cgroup *cgroup) | ||
39 | { | 49 | { |
40 | return container_of( | 50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), |
41 | cgroup_subsys_state(cgroup, freezer_subsys_id), | 51 | struct freezer, css); |
42 | struct freezer, css); | ||
43 | } | 52 | } |
44 | 53 | ||
45 | static inline struct freezer *task_freezer(struct task_struct *task) | 54 | static inline struct freezer *task_freezer(struct task_struct *task) |
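Since freezer->state is now a mask of flags rather than a three-value enum, the user-visible string is derived by precedence, as freezer_state_strs() below does: FROZEN wins over FREEZING, and anything else reads THAWED. A compilable restatement with a couple of checks, using the flag values copied from the hunk above:

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    enum {
            ONLINE          = 1 << 0,
            FREEZING_SELF   = 1 << 1,
            FREEZING_PARENT = 1 << 2,
            FROZEN          = 1 << 3,
            FREEZING        = FREEZING_SELF | FREEZING_PARENT,
    };

    static const char *state_str(unsigned int state)
    {
            if (state & FROZEN)
                    return "FROZEN";
            if (state & FREEZING)
                    return "FREEZING";
            return "THAWED";
    }

    int main(void)
    {
            assert(!strcmp(state_str(ONLINE), "THAWED"));
            /* freezing because an ancestor is; not finished yet */
            assert(!strcmp(state_str(ONLINE | FREEZING_PARENT), "FREEZING"));
            /* self-initiated freeze that has completed */
            assert(!strcmp(state_str(ONLINE | FREEZING_SELF | FROZEN), "FROZEN"));
            puts("ok");
            return 0;
    }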
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 57 | struct freezer, css); |
49 | } | 58 | } |
50 | 59 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | ||
61 | { | ||
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | ||
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | ||
68 | |||
51 | bool cgroup_freezing(struct task_struct *task) | 69 | bool cgroup_freezing(struct task_struct *task) |
52 | { | 70 | { |
53 | enum freezer_state state; | ||
54 | bool ret; | 71 | bool ret; |
55 | 72 | ||
56 | rcu_read_lock(); | 73 | rcu_read_lock(); |
57 | state = task_freezer(task)->state; | 74 | ret = task_freezer(task)->state & CGROUP_FREEZING; |
58 | ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; | ||
59 | rcu_read_unlock(); | 75 | rcu_read_unlock(); |
60 | 76 | ||
61 | return ret; | 77 | return ret; |
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task) | |||
65 | * cgroups_write_string() limits the size of freezer state strings to | 81 | * cgroups_write_string() limits the size of freezer state strings to |
66 | * CGROUP_LOCAL_BUFFER_SIZE | 82 | * CGROUP_LOCAL_BUFFER_SIZE |
67 | */ | 83 | */ |
68 | static const char *freezer_state_strs[] = { | 84 | static const char *freezer_state_strs(unsigned int state) |
69 | "THAWED", | 85 | { |
70 | "FREEZING", | 86 | if (state & CGROUP_FROZEN) |
71 | "FROZEN", | 87 | return "FROZEN"; |
88 | if (state & CGROUP_FREEZING) | ||
89 | return "FREEZING"; | ||
90 | return "THAWED"; | ||
72 | }; | 91 | }; |
73 | 92 | ||
74 | /* | ||
75 | * State diagram | ||
76 | * Transitions are caused by userspace writes to the freezer.state file. | ||
77 | * The values in parenthesis are state labels. The rest are edge labels. | ||
78 | * | ||
79 | * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) | ||
80 | * ^ ^ | | | ||
81 | * | \_______THAWED_______/ | | ||
82 | * \__________________________THAWED____________/ | ||
83 | */ | ||
84 | |||
85 | struct cgroup_subsys freezer_subsys; | 93 | struct cgroup_subsys freezer_subsys; |
86 | 94 | ||
87 | /* Locks taken and their ordering | 95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) |
88 | * ------------------------------ | ||
89 | * cgroup_mutex (AKA cgroup_lock) | ||
90 | * freezer->lock | ||
91 | * css_set_lock | ||
92 | * task->alloc_lock (AKA task_lock) | ||
93 | * task->sighand->siglock | ||
94 | * | ||
95 | * cgroup code forces css_set_lock to be taken before task->alloc_lock | ||
96 | * | ||
97 | * freezer_create(), freezer_destroy(): | ||
98 | * cgroup_mutex [ by cgroup core ] | ||
99 | * | ||
100 | * freezer_can_attach(): | ||
101 | * cgroup_mutex (held by caller of can_attach) | ||
102 | * | ||
103 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | ||
104 | * freezer->lock | ||
105 | * sighand->siglock (if the cgroup is freezing) | ||
106 | * | ||
107 | * freezer_read(): | ||
108 | * cgroup_mutex | ||
109 | * freezer->lock | ||
110 | * write_lock css_set_lock (cgroup iterator start) | ||
111 | * task->alloc_lock | ||
112 | * read_lock css_set_lock (cgroup iterator start) | ||
113 | * | ||
114 | * freezer_write() (freeze): | ||
115 | * cgroup_mutex | ||
116 | * freezer->lock | ||
117 | * write_lock css_set_lock (cgroup iterator start) | ||
118 | * task->alloc_lock | ||
119 | * read_lock css_set_lock (cgroup iterator start) | ||
120 | * sighand->siglock (fake signal delivery inside freeze_task()) | ||
121 | * | ||
122 | * freezer_write() (unfreeze): | ||
123 | * cgroup_mutex | ||
124 | * freezer->lock | ||
125 | * write_lock css_set_lock (cgroup iterator start) | ||
126 | * task->alloc_lock | ||
127 | * read_lock css_set_lock (cgroup iterator start) | ||
128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) | ||
129 | * sighand->siglock | ||
130 | */ | ||
131 | static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | ||
132 | { | 96 | { |
133 | struct freezer *freezer; | 97 | struct freezer *freezer; |
134 | 98 | ||
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | |||
137 | return ERR_PTR(-ENOMEM); | 101 | return ERR_PTR(-ENOMEM); |
138 | 102 | ||
139 | spin_lock_init(&freezer->lock); | 103 | spin_lock_init(&freezer->lock); |
140 | freezer->state = CGROUP_THAWED; | ||
141 | return &freezer->css; | 104 | return &freezer->css; |
142 | } | 105 | } |
143 | 106 | ||
144 | static void freezer_destroy(struct cgroup *cgroup) | 107 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | ||
109 | * @cgroup: cgroup being created | ||
110 | * | ||
111 | * We're committing to creation of @cgroup. Mark it online and inherit | ||
112 | * parent's freezing state while holding both parent's and our | ||
113 | * freezer->lock. | ||
114 | */ | ||
115 | static int freezer_css_online(struct cgroup *cgroup) | ||
116 | { | ||
117 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
118 | struct freezer *parent = parent_freezer(freezer); | ||
119 | |||
120 | /* | ||
121 | * The following double locking and freezing state inheritance | ||
122 | * guarantee that @cgroup can never escape ancestors' freezing | ||
123 | * states. See cgroup_for_each_descendant_pre() for details. | ||
124 | */ | ||
125 | if (parent) | ||
126 | spin_lock_irq(&parent->lock); | ||
127 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
128 | |||
129 | freezer->state |= CGROUP_FREEZER_ONLINE; | ||
130 | |||
131 | if (parent && (parent->state & CGROUP_FREEZING)) { | ||
132 | freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; | ||
133 | atomic_inc(&system_freezing_cnt); | ||
134 | } | ||
135 | |||
136 | spin_unlock(&freezer->lock); | ||
137 | if (parent) | ||
138 | spin_unlock_irq(&parent->lock); | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * freezer_css_offline - initiate destruction of @cgroup | ||
145 | * @cgroup: cgroup being destroyed | ||
146 | * | ||
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_cnt | ||
148 | * if it was holding one. | ||
149 | */ | ||
150 | static void freezer_css_offline(struct cgroup *cgroup) | ||
145 | { | 151 | { |
146 | struct freezer *freezer = cgroup_freezer(cgroup); | 152 | struct freezer *freezer = cgroup_freezer(cgroup); |
147 | 153 | ||
148 | if (freezer->state != CGROUP_THAWED) | 154 | spin_lock_irq(&freezer->lock); |
155 | |||
156 | if (freezer->state & CGROUP_FREEZING) | ||
149 | atomic_dec(&system_freezing_cnt); | 157 | atomic_dec(&system_freezing_cnt); |
150 | kfree(freezer); | 158 | |
159 | freezer->state = 0; | ||
160 | |||
161 | spin_unlock_irq(&freezer->lock); | ||
151 | } | 162 | } |
152 | 163 | ||
153 | /* task is frozen or will freeze immediately when next it gets woken */ | 164 | static void freezer_css_free(struct cgroup *cgroup) |
154 | static bool is_task_frozen_enough(struct task_struct *task) | ||
155 | { | 165 | { |
156 | return frozen(task) || | 166 | kfree(cgroup_freezer(cgroup)); |
157 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
158 | } | 167 | } |
159 | 168 | ||
160 | /* | 169 | /* |
161 | * The call to cgroup_lock() in the freezer.state write method prevents | 170 | * Tasks can be migrated into a different freezer anytime regardless of its |
162 | * a write to that file racing against an attach, and hence the | 171 | * current state. freezer_attach() is responsible for making new tasks |
163 | * can_attach() result will remain valid until the attach completes. | 172 | * conform to the current state. |
173 | * | ||
174 | * Freezer state changes and task migration are synchronized via | ||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | ||
176 | * current state and all following state changes can see the new tasks. | ||
164 | */ | 177 | */ |
165 | static int freezer_can_attach(struct cgroup *new_cgroup, | 178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) |
166 | struct cgroup_taskset *tset) | ||
167 | { | 179 | { |
168 | struct freezer *freezer; | 180 | struct freezer *freezer = cgroup_freezer(new_cgrp); |
169 | struct task_struct *task; | 181 | struct task_struct *task; |
182 | bool clear_frozen = false; | ||
183 | |||
184 | spin_lock_irq(&freezer->lock); | ||
170 | 185 | ||
171 | /* | 186 | /* |
172 | * Anything frozen can't move or be moved to/from. | 187 | * Make the new tasks conform to the current state of @new_cgrp. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | ||
189 | * revert it to FREEZING and let update_if_frozen() determine the | ||
190 | * correct state later. | ||
191 | * | ||
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | ||
193 | * current state before executing the following - !frozen tasks may | ||
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | ||
173 | */ | 195 | */ |
174 | cgroup_taskset_for_each(task, new_cgroup, tset) | 196 | cgroup_taskset_for_each(task, new_cgrp, tset) { |
175 | if (cgroup_freezing(task)) | 197 | if (!(freezer->state & CGROUP_FREEZING)) { |
176 | return -EBUSY; | 198 | __thaw_task(task); |
199 | } else { | ||
200 | freeze_task(task); | ||
201 | freezer->state &= ~CGROUP_FROZEN; | ||
202 | clear_frozen = true; | ||
203 | } | ||
204 | } | ||
177 | 205 | ||
178 | freezer = cgroup_freezer(new_cgroup); | 206 | spin_unlock_irq(&freezer->lock); |
179 | if (freezer->state != CGROUP_THAWED) | ||
180 | return -EBUSY; | ||
181 | 207 | ||
182 | return 0; | 208 | /* |
209 | * Propagate FROZEN clearing upwards. We may race with | ||
210 | * update_if_frozen(), but as long as both work bottom-up, either | ||
211 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
212 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
213 | * left FROZEN. | ||
214 | */ | ||
215 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | ||
216 | spin_lock_irq(&freezer->lock); | ||
217 | freezer->state &= ~CGROUP_FROZEN; | ||
218 | clear_frozen = freezer->state & CGROUP_FREEZING; | ||
219 | spin_unlock_irq(&freezer->lock); | ||
220 | } | ||
183 | } | 221 | } |
184 | 222 | ||
185 | static void freezer_fork(struct task_struct *task) | 223 | static void freezer_fork(struct task_struct *task) |
186 | { | 224 | { |
187 | struct freezer *freezer; | 225 | struct freezer *freezer; |
188 | 226 | ||
189 | /* | ||
190 | * No lock is needed, since the task isn't on tasklist yet, | ||
191 | * so it can't be moved to another cgroup, which means the | ||
192 | * freezer won't be removed and will be valid during this | ||
193 | * function call. Nevertheless, apply RCU read-side critical | ||
194 | * section to suppress RCU lockdep false positives. | ||
195 | */ | ||
196 | rcu_read_lock(); | 227 | rcu_read_lock(); |
197 | freezer = task_freezer(task); | 228 | freezer = task_freezer(task); |
198 | rcu_read_unlock(); | ||
199 | 229 | ||
200 | /* | 230 | /* |
201 | * The root cgroup is non-freezable, so we can skip the | 231 | * The root cgroup is non-freezable, so we can skip the |
202 | * following check. | 232 | * following check. |
203 | */ | 233 | */ |
204 | if (!freezer->css.cgroup->parent) | 234 | if (!freezer->css.cgroup->parent) |
205 | return; | 235 | goto out; |
206 | 236 | ||
207 | spin_lock_irq(&freezer->lock); | 237 | spin_lock_irq(&freezer->lock); |
208 | BUG_ON(freezer->state == CGROUP_FROZEN); | 238 | if (freezer->state & CGROUP_FREEZING) |
209 | |||
210 | /* Locking avoids race with FREEZING -> THAWED transitions. */ | ||
211 | if (freezer->state == CGROUP_FREEZING) | ||
212 | freeze_task(task); | 239 | freeze_task(task); |
213 | spin_unlock_irq(&freezer->lock); | 240 | spin_unlock_irq(&freezer->lock); |
241 | out: | ||
242 | rcu_read_unlock(); | ||
214 | } | 243 | } |
215 | 244 | ||
216 | /* | 245 | /** |
217 | * caller must hold freezer->lock | 246 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | ||
248 | * | ||
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | ||
250 | * calling this function. If the current state is FREEZING but not FROZEN, | ||
251 | * this function checks whether all tasks of this cgroup and the descendant | ||
252 | * cgroups finished freezing and, if so, sets FROZEN. | ||
253 | * | ||
254 | * The caller is responsible for grabbing RCU read lock and calling | ||
255 | * update_if_frozen() on all descendants prior to invoking this function. | ||
256 | * | ||
257 | * Task states and freezer state might disagree while tasks are being | ||
258 | * migrated into or out of @cgroup, so we can't verify task states against | ||
259 | * @freezer state here. See freezer_attach() for details. | ||
218 | */ | 260 | */ |
219 | static void update_if_frozen(struct cgroup *cgroup, | 261 | static void update_if_frozen(struct cgroup *cgroup) |
220 | struct freezer *freezer) | ||
221 | { | 262 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
264 | struct cgroup *pos; | ||
222 | struct cgroup_iter it; | 265 | struct cgroup_iter it; |
223 | struct task_struct *task; | 266 | struct task_struct *task; |
224 | unsigned int nfrozen = 0, ntotal = 0; | ||
225 | enum freezer_state old_state = freezer->state; | ||
226 | 267 | ||
227 | cgroup_iter_start(cgroup, &it); | 268 | WARN_ON_ONCE(!rcu_read_lock_held()); |
228 | while ((task = cgroup_iter_next(cgroup, &it))) { | 269 | |
229 | ntotal++; | 270 | spin_lock_irq(&freezer->lock); |
230 | if (freezing(task) && is_task_frozen_enough(task)) | 271 | |
231 | nfrozen++; | 272 | if (!(freezer->state & CGROUP_FREEZING) || |
273 | (freezer->state & CGROUP_FROZEN)) | ||
274 | goto out_unlock; | ||
275 | |||
276 | /* are all (live) children frozen? */ | ||
277 | cgroup_for_each_child(pos, cgroup) { | ||
278 | struct freezer *child = cgroup_freezer(pos); | ||
279 | |||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | ||
281 | !(child->state & CGROUP_FROZEN)) | ||
282 | goto out_unlock; | ||
232 | } | 283 | } |
233 | 284 | ||
234 | if (old_state == CGROUP_THAWED) { | 285 | /* are all tasks frozen? */ |
235 | BUG_ON(nfrozen > 0); | 286 | cgroup_iter_start(cgroup, &it); |
236 | } else if (old_state == CGROUP_FREEZING) { | 287 | |
237 | if (nfrozen == ntotal) | 288 | while ((task = cgroup_iter_next(cgroup, &it))) { |
238 | freezer->state = CGROUP_FROZEN; | 289 | if (freezing(task)) { |
239 | } else { /* old_state == CGROUP_FROZEN */ | 290 | /* |
240 | BUG_ON(nfrozen != ntotal); | 291 | * freezer_should_skip() indicates that the task |
292 | * should be skipped when determining freezing | ||
293 | * completion. Consider it frozen in addition to | ||
294 | * the usual frozen condition. | ||
295 | */ | ||
296 | if (!frozen(task) && !freezer_should_skip(task)) | ||
297 | goto out_iter_end; | ||
298 | } | ||
241 | } | 299 | } |
242 | 300 | ||
301 | freezer->state |= CGROUP_FROZEN; | ||
302 | out_iter_end: | ||
243 | cgroup_iter_end(cgroup, &it); | 303 | cgroup_iter_end(cgroup, &it); |
304 | out_unlock: | ||
305 | spin_unlock_irq(&freezer->lock); | ||
244 | } | 306 | } |
245 | 307 | ||
246 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, |
247 | struct seq_file *m) | 309 | struct seq_file *m) |
248 | { | 310 | { |
249 | struct freezer *freezer; | 311 | struct cgroup *pos; |
250 | enum freezer_state state; | ||
251 | 312 | ||
252 | if (!cgroup_lock_live_group(cgroup)) | 313 | rcu_read_lock(); |
253 | return -ENODEV; | ||
254 | 314 | ||
255 | freezer = cgroup_freezer(cgroup); | 315 | /* update states bottom-up */ |
256 | spin_lock_irq(&freezer->lock); | 316 | cgroup_for_each_descendant_post(pos, cgroup) |
257 | state = freezer->state; | 317 | update_if_frozen(pos); |
258 | if (state == CGROUP_FREEZING) { | 318 | update_if_frozen(cgroup); |
259 | /* We change from FREEZING to FROZEN lazily if the cgroup was | 319 | |
260 | * only partially frozen when we exitted write. */ | 320 | rcu_read_unlock(); |
261 | update_if_frozen(cgroup, freezer); | ||
262 | state = freezer->state; | ||
263 | } | ||
264 | spin_unlock_irq(&freezer->lock); | ||
265 | cgroup_unlock(); | ||
266 | 321 | ||
267 | seq_puts(m, freezer_state_strs[state]); | 322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); |
268 | seq_putc(m, '\n'); | 323 | seq_putc(m, '\n'); |
269 | return 0; | 324 | return 0; |
270 | } | 325 | } |
271 | 326 | ||
272 | static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 327 | static void freeze_cgroup(struct freezer *freezer) |
273 | { | 328 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | ||
274 | struct cgroup_iter it; | 330 | struct cgroup_iter it; |
275 | struct task_struct *task; | 331 | struct task_struct *task; |
276 | unsigned int num_cant_freeze_now = 0; | ||
277 | 332 | ||
278 | cgroup_iter_start(cgroup, &it); | 333 | cgroup_iter_start(cgroup, &it); |
279 | while ((task = cgroup_iter_next(cgroup, &it))) { | 334 | while ((task = cgroup_iter_next(cgroup, &it))) |
280 | if (!freeze_task(task)) | 335 | freeze_task(task); |
281 | continue; | ||
282 | if (is_task_frozen_enough(task)) | ||
283 | continue; | ||
284 | if (!freezing(task) && !freezer_should_skip(task)) | ||
285 | num_cant_freeze_now++; | ||
286 | } | ||
287 | cgroup_iter_end(cgroup, &it); | 336 | cgroup_iter_end(cgroup, &it); |
288 | |||
289 | return num_cant_freeze_now ? -EBUSY : 0; | ||
290 | } | 337 | } |
291 | 338 | ||
292 | static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 339 | static void unfreeze_cgroup(struct freezer *freezer) |
293 | { | 340 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | ||
294 | struct cgroup_iter it; | 342 | struct cgroup_iter it; |
295 | struct task_struct *task; | 343 | struct task_struct *task; |
296 | 344 | ||
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
300 | cgroup_iter_end(cgroup, &it); | 348 | cgroup_iter_end(cgroup, &it); |
301 | } | 349 | } |
302 | 350 | ||
303 | static int freezer_change_state(struct cgroup *cgroup, | 351 | /** |
304 | enum freezer_state goal_state) | 352 | * freezer_apply_state - apply state change to a single cgroup_freezer |
353 | * @freezer: freezer to apply state change to | ||
354 | * @freeze: whether to freeze or unfreeze | ||
355 | * @state: CGROUP_FREEZING_* flag to set or clear | ||
356 | * | ||
357 | * Set or clear @state on @freezer according to @freeze, and perform | ||
358 | * freezing or thawing as necessary. | ||
359 | */ | ||
360 | static void freezer_apply_state(struct freezer *freezer, bool freeze, | ||
361 | unsigned int state) | ||
305 | { | 362 | { |
306 | struct freezer *freezer; | 363 | /* also synchronizes against task migration, see freezer_attach() */ |
307 | int retval = 0; | 364 | lockdep_assert_held(&freezer->lock); |
308 | |||
309 | freezer = cgroup_freezer(cgroup); | ||
310 | 365 | ||
311 | spin_lock_irq(&freezer->lock); | 366 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
367 | return; | ||
312 | 368 | ||
313 | update_if_frozen(cgroup, freezer); | 369 | if (freeze) { |
314 | 370 | if (!(freezer->state & CGROUP_FREEZING)) | |
315 | switch (goal_state) { | ||
316 | case CGROUP_THAWED: | ||
317 | if (freezer->state != CGROUP_THAWED) | ||
318 | atomic_dec(&system_freezing_cnt); | ||
319 | freezer->state = CGROUP_THAWED; | ||
320 | unfreeze_cgroup(cgroup, freezer); | ||
321 | break; | ||
322 | case CGROUP_FROZEN: | ||
323 | if (freezer->state == CGROUP_THAWED) | ||
324 | atomic_inc(&system_freezing_cnt); | 371 | atomic_inc(&system_freezing_cnt); |
325 | freezer->state = CGROUP_FREEZING; | 372 | freezer->state |= state; |
326 | retval = try_to_freeze_cgroup(cgroup, freezer); | 373 | freeze_cgroup(freezer); |
327 | break; | 374 | } else { |
328 | default: | 375 | bool was_freezing = freezer->state & CGROUP_FREEZING; |
329 | BUG(); | 376 | |
377 | freezer->state &= ~state; | ||
378 | |||
379 | if (!(freezer->state & CGROUP_FREEZING)) { | ||
380 | if (was_freezing) | ||
381 | atomic_dec(&system_freezing_cnt); | ||
382 | freezer->state &= ~CGROUP_FROZEN; | ||
383 | unfreeze_cgroup(freezer); | ||
384 | } | ||
330 | } | 385 | } |
386 | } | ||
331 | 387 | ||
388 | /** | ||
389 | * freezer_change_state - change the freezing state of a cgroup_freezer | ||
390 | * @freezer: freezer of interest | ||
391 | * @freeze: whether to freeze or thaw | ||
392 | * | ||
393 | * Freeze or thaw @freezer according to @freeze. The operations are | ||
394 | * recursive - all descendants of @freezer will be affected. | ||
395 | */ | ||
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | ||
397 | { | ||
398 | struct cgroup *pos; | ||
399 | |||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
332 | spin_unlock_irq(&freezer->lock); | 403 | spin_unlock_irq(&freezer->lock); |
333 | 404 | ||
334 | return retval; | 405 | /* |
406 | * Update all its descendants in pre-order traversal. Each | ||
407 | * descendant will try to inherit its parent's FREEZING state as | ||
408 | * CGROUP_FREEZING_PARENT. | ||
409 | */ | ||
410 | rcu_read_lock(); | ||
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | ||
412 | struct freezer *pos_f = cgroup_freezer(pos); | ||
413 | struct freezer *parent = parent_freezer(pos_f); | ||
414 | |||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_css_online(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | ||
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | ||
422 | CGROUP_FREEZING_PARENT); | ||
423 | spin_unlock_irq(&pos_f->lock); | ||
424 | } | ||
425 | rcu_read_unlock(); | ||
335 | } | 426 | } |
336 | 427 | ||
337 | static int freezer_write(struct cgroup *cgroup, | 428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, |
338 | struct cftype *cft, | ||
339 | const char *buffer) | 429 | const char *buffer) |
340 | { | 430 | { |
341 | int retval; | 431 | bool freeze; |
342 | enum freezer_state goal_state; | ||
343 | 432 | ||
344 | if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) | 433 | if (strcmp(buffer, freezer_state_strs(0)) == 0) |
345 | goal_state = CGROUP_THAWED; | 434 | freeze = false; |
346 | else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) | 435 | else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) |
347 | goal_state = CGROUP_FROZEN; | 436 | freeze = true; |
348 | else | 437 | else |
349 | return -EINVAL; | 438 | return -EINVAL; |
350 | 439 | ||
351 | if (!cgroup_lock_live_group(cgroup)) | 440 | freezer_change_state(cgroup_freezer(cgroup), freeze); |
352 | return -ENODEV; | 441 | return 0; |
353 | retval = freezer_change_state(cgroup, goal_state); | 442 | } |
354 | cgroup_unlock(); | 443 | |
355 | return retval; | 444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) |
445 | { | ||
446 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
447 | |||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | ||
449 | } | ||
450 | |||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | ||
452 | { | ||
453 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
454 | |||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | ||
356 | } | 456 | } |
357 | 457 | ||
358 | static struct cftype files[] = { | 458 | static struct cftype files[] = { |
@@ -362,23 +462,27 @@ static struct cftype files[] = { | |||
362 | .read_seq_string = freezer_read, | 462 | .read_seq_string = freezer_read, |
363 | .write_string = freezer_write, | 463 | .write_string = freezer_write, |
364 | }, | 464 | }, |
465 | { | ||
466 | .name = "self_freezing", | ||
467 | .flags = CFTYPE_NOT_ON_ROOT, | ||
468 | .read_u64 = freezer_self_freezing_read, | ||
469 | }, | ||
470 | { | ||
471 | .name = "parent_freezing", | ||
472 | .flags = CFTYPE_NOT_ON_ROOT, | ||
473 | .read_u64 = freezer_parent_freezing_read, | ||
474 | }, | ||
365 | { } /* terminate */ | 475 | { } /* terminate */ |
366 | }; | 476 | }; |
367 | 477 | ||
368 | struct cgroup_subsys freezer_subsys = { | 478 | struct cgroup_subsys freezer_subsys = { |
369 | .name = "freezer", | 479 | .name = "freezer", |
370 | .create = freezer_create, | 480 | .css_alloc = freezer_css_alloc, |
371 | .destroy = freezer_destroy, | 481 | .css_online = freezer_css_online, |
482 | .css_offline = freezer_css_offline, | ||
483 | .css_free = freezer_css_free, | ||
372 | .subsys_id = freezer_subsys_id, | 484 | .subsys_id = freezer_subsys_id, |
373 | .can_attach = freezer_can_attach, | 485 | .attach = freezer_attach, |
374 | .fork = freezer_fork, | 486 | .fork = freezer_fork, |
375 | .base_cftypes = files, | 487 | .base_cftypes = files, |
376 | |||
377 | /* | ||
378 | * freezer subsys doesn't handle hierarchy at all. Frozen state | ||
379 | * should be inherited through the hierarchy - if a parent is | ||
380 | * frozen, all its children should be frozen. Fix it and remove | ||
381 | * the following. | ||
382 | */ | ||
383 | .broken_hierarchy = true, | ||
384 | }; | 488 | }; |
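The heart of the new hierarchy support is the pair of traversals above: freezer_change_state() pushes FREEZING down in pre-order so every descendant inherits FREEZING_PARENT, while update_if_frozen() lets FROZEN bubble up only once all children report it. A userspace model of the downward pass, assuming a toy tree with fixed fan-out (the shape is illustrative, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    enum {
            FREEZING_SELF   = 1 << 1,
            FREEZING_PARENT = 1 << 2,
            FROZEN          = 1 << 3,
            FREEZING        = FREEZING_SELF | FREEZING_PARENT,
    };

    struct node {
            unsigned int state;
            struct node *child[2];  /* tiny fixed fan-out, enough for a sketch */
    };

    /* pre-order walk: each node inherits whether its parent is FREEZING */
    static void propagate(struct node *n, bool parent_freezing)
    {
            if (parent_freezing)
                    n->state |= FREEZING_PARENT;    /* cf. freezer_apply_state() */
            else
                    n->state &= ~FREEZING_PARENT;   /* kernel also thaws here */

            for (int i = 0; i < 2; i++)
                    if (n->child[i])
                            propagate(n->child[i], n->state & FREEZING);
    }

    int main(void)
    {
            struct node c = { 0, { NULL, NULL } };
            struct node b = { 0, { &c, NULL } };
            struct node a = { FREEZING_SELF, { &b, NULL } };

            propagate(&a, false);
            printf("b: %s, c: %s\n",
                   (b.state & FREEZING) ? "freezing" : "thawed",
                   (c.state & FREEZING) ? "freezing" : "thawed");
            return 0;
    }

Because a child cgroup can never observe an ancestor un-frozen before the ancestor itself does, the freezer can finally drop .broken_hierarchy, as the last hunk shows.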
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d7..b017887d632f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1784,56 +1784,20 @@ static struct cftype files[] = { | |||
1784 | }; | 1784 | }; |
1785 | 1785 | ||
1786 | /* | 1786 | /* |
1787 | * post_clone() is called during cgroup_create() when the | 1787 | * cpuset_css_alloc - allocate a cpuset css |
1788 | * clone_children mount argument was specified. The cgroup | ||
1789 | * can not yet have any tasks. | ||
1790 | * | ||
1791 | * Currently we refuse to set up the cgroup - thereby | ||
1792 | * refusing the task to be entered, and as a result refusing | ||
1793 | * the sys_unshare() or clone() which initiated it - if any | ||
1794 | * sibling cpusets have exclusive cpus or mem. | ||
1795 | * | ||
1796 | * If this becomes a problem for some users who wish to | ||
1797 | * allow that scenario, then cpuset_post_clone() could be | ||
1798 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1799 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex | ||
1800 | * held. | ||
1801 | */ | ||
1802 | static void cpuset_post_clone(struct cgroup *cgroup) | ||
1803 | { | ||
1804 | struct cgroup *parent, *child; | ||
1805 | struct cpuset *cs, *parent_cs; | ||
1806 | |||
1807 | parent = cgroup->parent; | ||
1808 | list_for_each_entry(child, &parent->children, sibling) { | ||
1809 | cs = cgroup_cs(child); | ||
1810 | if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) | ||
1811 | return; | ||
1812 | } | ||
1813 | cs = cgroup_cs(cgroup); | ||
1814 | parent_cs = cgroup_cs(parent); | ||
1815 | |||
1816 | mutex_lock(&callback_mutex); | ||
1817 | cs->mems_allowed = parent_cs->mems_allowed; | ||
1818 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); | ||
1819 | mutex_unlock(&callback_mutex); | ||
1820 | return; | ||
1821 | } | ||
1822 | |||
1823 | /* | ||
1824 | * cpuset_create - create a cpuset | ||
1825 | * cont: control group that the new cpuset will be part of | 1788 | * cont: control group that the new cpuset will be part of |
1826 | */ | 1789 | */ |
1827 | 1790 | ||
1828 | static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | 1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
1829 | { | 1792 | { |
1830 | struct cpuset *cs; | 1793 | struct cgroup *parent_cg = cont->parent; |
1831 | struct cpuset *parent; | 1794 | struct cgroup *tmp_cg; |
1795 | struct cpuset *parent, *cs; | ||
1832 | 1796 | ||
1833 | if (!cont->parent) { | 1797 | if (!parent_cg) |
1834 | return &top_cpuset.css; | 1798 | return &top_cpuset.css; |
1835 | } | 1799 | parent = cgroup_cs(parent_cg); |
1836 | parent = cgroup_cs(cont->parent); | 1800 | |
1837 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); |
1838 | if (!cs) | 1802 | if (!cs) |
1839 | return ERR_PTR(-ENOMEM); | 1803 | return ERR_PTR(-ENOMEM); |
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1855 | 1819 | ||
1856 | cs->parent = parent; | 1820 | cs->parent = parent; |
1857 | number_of_cpusets++; | 1821 | number_of_cpusets++; |
1858 | return &cs->css ; | 1822 | |
1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | ||
1824 | goto skip_clone; | ||
1825 | |||
1826 | /* | ||
1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | ||
1828 | * set. This flag handling is implemented in cgroup core for | ||
1829 | * historical reasons - the flag may be specified during mount. | ||
1830 | * | ||
1831 | * Currently, if any sibling cpusets have exclusive cpus or mem, we | ||
1832 | * refuse to clone the configuration - thereby refusing the task to | ||
1833 | * be entered, and as a result refusing the sys_unshare() or | ||
1834 | * clone() which initiated it. If this becomes a problem for some | ||
1835 | * users who wish to allow that scenario, then this could be | ||
1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1837 | * (and likewise for mems) to the new cgroup. | ||
1838 | */ | ||
1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | ||
1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | ||
1841 | |||
1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | ||
1843 | goto skip_clone; | ||
1844 | } | ||
1845 | |||
1846 | mutex_lock(&callback_mutex); | ||
1847 | cs->mems_allowed = parent->mems_allowed; | ||
1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | ||
1849 | mutex_unlock(&callback_mutex); | ||
1850 | skip_clone: | ||
1851 | return &cs->css; | ||
1859 | } | 1852 | } |
1860 | 1853 | ||
1861 | /* | 1854 | /* |
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1864 | * will call async_rebuild_sched_domains(). | 1857 | * will call async_rebuild_sched_domains(). |
1865 | */ | 1858 | */ |
1866 | 1859 | ||
1867 | static void cpuset_destroy(struct cgroup *cont) | 1860 | static void cpuset_css_free(struct cgroup *cont) |
1868 | { | 1861 | { |
1869 | struct cpuset *cs = cgroup_cs(cont); | 1862 | struct cpuset *cs = cgroup_cs(cont); |
1870 | 1863 | ||
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont) | |||
1878 | 1871 | ||
1879 | struct cgroup_subsys cpuset_subsys = { | 1872 | struct cgroup_subsys cpuset_subsys = { |
1880 | .name = "cpuset", | 1873 | .name = "cpuset", |
1881 | .create = cpuset_create, | 1874 | .css_alloc = cpuset_css_alloc, |
1882 | .destroy = cpuset_destroy, | 1875 | .css_free = cpuset_css_free, |
1883 | .can_attach = cpuset_can_attach, | 1876 | .can_attach = cpuset_can_attach, |
1884 | .attach = cpuset_attach, | 1877 | .attach = cpuset_attach, |
1885 | .post_clone = cpuset_post_clone, | ||
1886 | .subsys_id = cpuset_subsys_id, | 1878 | .subsys_id = cpuset_subsys_id, |
1887 | .base_cftypes = files, | 1879 | .base_cftypes = files, |
1888 | .early_init = 1, | 1880 | .early_init = 1, |
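The relocated clone-children logic above copies the parent's masks into the new cpuset unless a sibling holds exclusive cpus or mems. A userspace sketch with flat bitmask stand-ins for cpumask/nodemask; the struct layout and sibling array are illustrative assumptions:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cs {
            uint64_t cpus_allowed;
            uint64_t mems_allowed;
            bool cpu_exclusive;
            bool mem_exclusive;
    };

    /* returns false when any sibling is exclusive, cf. the skip_clone path */
    static bool clone_config(struct cs *child, const struct cs *parent,
                             const struct cs *sibling, int nr_siblings)
    {
            for (int i = 0; i < nr_siblings; i++)
                    if (sibling[i].cpu_exclusive || sibling[i].mem_exclusive)
                            return false;

            child->cpus_allowed = parent->cpus_allowed;
            child->mems_allowed = parent->mems_allowed;
            return true;
    }

    int main(void)
    {
            struct cs parent = { 0xff, 0x3, false, false };
            struct cs child = { 0, 0, false, false };

            printf("cloned: %d, cpus=%#llx\n",
                   clone_config(&child, &parent, NULL, 0),
                   (unsigned long long)child.cpus_allowed);
            return 0;
    }

Folding this into cpuset_css_alloc() lets the series delete the subsystem-generic .post_clone hook: the behaviour was only ever meaningful for cpuset.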
diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134d..f9ff5493171d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -7434,7 +7434,7 @@ unlock: | |||
7434 | device_initcall(perf_event_sysfs_init); | 7434 | device_initcall(perf_event_sysfs_init); |
7435 | 7435 | ||
7436 | #ifdef CONFIG_CGROUP_PERF | 7436 | #ifdef CONFIG_CGROUP_PERF |
7437 | static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | 7437 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) |
7438 | { | 7438 | { |
7439 | struct perf_cgroup *jc; | 7439 | struct perf_cgroup *jc; |
7440 | 7440 | ||
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | |||
7451 | return &jc->css; | 7451 | return &jc->css; |
7452 | } | 7452 | } |
7453 | 7453 | ||
7454 | static void perf_cgroup_destroy(struct cgroup *cont) | 7454 | static void perf_cgroup_css_free(struct cgroup *cont) |
7455 | { | 7455 | { |
7456 | struct perf_cgroup *jc; | 7456 | struct perf_cgroup *jc; |
7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), |
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7492 | struct cgroup_subsys perf_subsys = { | 7492 | struct cgroup_subsys perf_subsys = { |
7493 | .name = "perf_event", | 7493 | .name = "perf_event", |
7494 | .subsys_id = perf_subsys_id, | 7494 | .subsys_id = perf_subsys_id, |
7495 | .create = perf_cgroup_create, | 7495 | .css_alloc = perf_cgroup_css_alloc, |
7496 | .destroy = perf_cgroup_destroy, | 7496 | .css_free = perf_cgroup_css_free, |
7497 | .exit = perf_cgroup_exit, | 7497 | .exit = perf_cgroup_exit, |
7498 | .attach = perf_cgroup_attach, | 7498 | .attach = perf_cgroup_attach, |
7499 | 7499 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 850dde1e0c84..79de9f99a48d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1137,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1137 | { | 1137 | { |
1138 | int retval; | 1138 | int retval; |
1139 | struct task_struct *p; | 1139 | struct task_struct *p; |
1140 | int cgroup_callbacks_done = 0; | ||
1141 | 1140 | ||
1142 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1141 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1143 | return ERR_PTR(-EINVAL); | 1142 | return ERR_PTR(-EINVAL); |
@@ -1395,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1395 | INIT_LIST_HEAD(&p->thread_group); | 1394 | INIT_LIST_HEAD(&p->thread_group); |
1396 | p->task_works = NULL; | 1395 | p->task_works = NULL; |
1397 | 1396 | ||
1398 | /* Now that the task is set up, run cgroup callbacks if | ||
1399 | * necessary. We need to run them before the task is visible | ||
1400 | * on the tasklist. */ | ||
1401 | cgroup_fork_callbacks(p); | ||
1402 | cgroup_callbacks_done = 1; | ||
1403 | |||
1404 | /* Need tasklist lock for parent etc handling! */ | 1397 | /* Need tasklist lock for parent etc handling! */ |
1405 | write_lock_irq(&tasklist_lock); | 1398 | write_lock_irq(&tasklist_lock); |
1406 | 1399 | ||
@@ -1505,7 +1498,7 @@ bad_fork_cleanup_cgroup: | |||
1505 | #endif | 1498 | #endif |
1506 | if (clone_flags & CLONE_THREAD) | 1499 | if (clone_flags & CLONE_THREAD) |
1507 | threadgroup_change_end(current); | 1500 | threadgroup_change_end(current); |
1508 | cgroup_exit(p, cgroup_callbacks_done); | 1501 | cgroup_exit(p, 0); |
1509 | delayacct_tsk_free(p); | 1502 | delayacct_tsk_free(p); |
1510 | module_put(task_thread_info(p)->exec_domain->module); | 1503 | module_put(task_thread_info(p)->exec_domain->module); |
1511 | bad_fork_cleanup_count: | 1504 | bad_fork_cleanup_count: |
diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) | |||
116 | return false; | 116 | return false; |
117 | } | 117 | } |
118 | 118 | ||
119 | if (!(p->flags & PF_KTHREAD)) { | 119 | if (!(p->flags & PF_KTHREAD)) |
120 | fake_signal_wake_up(p); | 120 | fake_signal_wake_up(p); |
121 | /* | 121 | else |
122 | * fake_signal_wake_up() goes through p's scheduler | ||
123 | * lock and guarantees that TASK_STOPPED/TRACED -> | ||
124 | * TASK_RUNNING transition can't race with task state | ||
125 | * testing in try_to_freeze_tasks(). | ||
126 | */ | ||
127 | } else { | ||
128 | wake_up_state(p, TASK_INTERRUPTIBLE); | 122 | wake_up_state(p, TASK_INTERRUPTIBLE); |
129 | } | ||
130 | 123 | ||
131 | spin_unlock_irqrestore(&freezer_lock, flags); | 124 | spin_unlock_irqrestore(&freezer_lock, flags); |
132 | return true; | 125 | return true; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 87da817f9e13..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
48 | if (p == current || !freeze_task(p)) | 48 | if (p == current || !freeze_task(p)) |
49 | continue; | 49 | continue; |
50 | 50 | ||
51 | /* | 51 | if (!freezer_should_skip(p)) |
52 | * Now that we've done set_freeze_flag, don't | ||
53 | * perturb a task in TASK_STOPPED or TASK_TRACED. | ||
54 | * It is "frozen enough". If the task does wake | ||
55 | * up, it will immediately call try_to_freeze. | ||
56 | * | ||
57 | * Because freeze_task() goes through p's scheduler lock, it's | ||
58 | * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING | ||
59 | * transition can't race with task state testing here. | ||
60 | */ | ||
61 | if (!task_is_stopped_or_traced(p) && | ||
62 | !freezer_should_skip(p)) | ||
63 | todo++; | 52 | todo++; |
64 | } while_each_thread(g, p); | 53 | } while_each_thread(g, p); |
65 | read_unlock(&tasklist_lock); | 54 | read_unlock(&tasklist_lock); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5066a61f971..6271b89f87ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -7484,7 +7484,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7484 | struct task_group, css); | 7484 | struct task_group, css); |
7485 | } | 7485 | } |
7486 | 7486 | ||
7487 | static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | 7487 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) |
7488 | { | 7488 | { |
7489 | struct task_group *tg, *parent; | 7489 | struct task_group *tg, *parent; |
7490 | 7490 | ||
@@ -7501,7 +7501,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | |||
7501 | return &tg->css; | 7501 | return &tg->css; |
7502 | } | 7502 | } |
7503 | 7503 | ||
7504 | static void cpu_cgroup_destroy(struct cgroup *cgrp) | 7504 | static void cpu_cgroup_css_free(struct cgroup *cgrp) |
7505 | { | 7505 | { |
7506 | struct task_group *tg = cgroup_tg(cgrp); | 7506 | struct task_group *tg = cgroup_tg(cgrp); |
7507 | 7507 | ||
@@ -7861,8 +7861,8 @@ static struct cftype cpu_files[] = { | |||
7861 | 7861 | ||
7862 | struct cgroup_subsys cpu_cgroup_subsys = { | 7862 | struct cgroup_subsys cpu_cgroup_subsys = { |
7863 | .name = "cpu", | 7863 | .name = "cpu", |
7864 | .create = cpu_cgroup_create, | 7864 | .css_alloc = cpu_cgroup_css_alloc, |
7865 | .destroy = cpu_cgroup_destroy, | 7865 | .css_free = cpu_cgroup_css_free, |
7866 | .can_attach = cpu_cgroup_can_attach, | 7866 | .can_attach = cpu_cgroup_can_attach, |
7867 | .attach = cpu_cgroup_attach, | 7867 | .attach = cpu_cgroup_attach, |
7868 | .exit = cpu_cgroup_exit, | 7868 | .exit = cpu_cgroup_exit, |
@@ -7885,7 +7885,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7885 | struct cpuacct root_cpuacct; | 7885 | struct cpuacct root_cpuacct; |
7886 | 7886 | ||
7887 | /* create a new cpu accounting group */ | 7887 | /* create a new cpu accounting group */ |
7888 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7888 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) |
7889 | { | 7889 | { |
7890 | struct cpuacct *ca; | 7890 | struct cpuacct *ca; |
7891 | 7891 | ||
@@ -7915,7 +7915,7 @@ out: | |||
7915 | } | 7915 | } |
7916 | 7916 | ||
7917 | /* destroy an existing cpu accounting group */ | 7917 | /* destroy an existing cpu accounting group */ |
7918 | static void cpuacct_destroy(struct cgroup *cgrp) | 7918 | static void cpuacct_css_free(struct cgroup *cgrp) |
7919 | { | 7919 | { |
7920 | struct cpuacct *ca = cgroup_ca(cgrp); | 7920 | struct cpuacct *ca = cgroup_ca(cgrp); |
7921 | 7921 | ||
@@ -8086,8 +8086,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
8086 | 8086 | ||
8087 | struct cgroup_subsys cpuacct_subsys = { | 8087 | struct cgroup_subsys cpuacct_subsys = { |
8088 | .name = "cpuacct", | 8088 | .name = "cpuacct", |
8089 | .create = cpuacct_create, | 8089 | .css_alloc = cpuacct_css_alloc, |
8090 | .destroy = cpuacct_destroy, | 8090 | .css_free = cpuacct_css_free, |
8091 | .subsys_id = cpuacct_subsys_id, | 8091 | .subsys_id = cpuacct_subsys_id, |
8092 | .base_cftypes = files, | 8092 | .base_cftypes = files, |
8093 | }; | 8093 | }; |
diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..5ffb5626e072 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1908 | preempt_disable(); | 1908 | preempt_disable(); |
1909 | read_unlock(&tasklist_lock); | 1909 | read_unlock(&tasklist_lock); |
1910 | preempt_enable_no_resched(); | 1910 | preempt_enable_no_resched(); |
1911 | schedule(); | 1911 | freezable_schedule(); |
1912 | } else { | 1912 | } else { |
1913 | /* | 1913 | /* |
1914 | * By the time we got the lock, our tracer went away. | 1914 | * By the time we got the lock, our tracer went away. |
@@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1930 | } | 1930 | } |
1931 | 1931 | ||
1932 | /* | 1932 | /* |
1933 | * While in TASK_TRACED, we were considered "frozen enough". | ||
1934 | * Now that we woke up, it's crucial if we're supposed to be | ||
1935 | * frozen that we freeze now before running anything substantial. | ||
1936 | */ | ||
1937 | try_to_freeze(); | ||
1938 | |||
1939 | /* | ||
1940 | * We are back. Now reacquire the siglock before touching | 1933 | * We are back. Now reacquire the siglock before touching |
1941 | * last_siginfo, so that we are sure to have synchronized with | 1934 | * last_siginfo, so that we are sure to have synchronized with |
1942 | * any signal-sending on another CPU that wants to examine it. | 1935 | * any signal-sending on another CPU that wants to examine it. |
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr) | |||
2092 | } | 2085 | } |
2093 | 2086 | ||
2094 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 2087 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ |
2095 | schedule(); | 2088 | freezable_schedule(); |
2096 | return true; | 2089 | return true; |
2097 | } else { | 2090 | } else { |
2098 | /* | 2091 | /* |
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2200 | if (unlikely(uprobe_deny_signal())) | 2193 | if (unlikely(uprobe_deny_signal())) |
2201 | return 0; | 2194 | return 0; |
2202 | 2195 | ||
2203 | relock: | ||
2204 | /* | 2196 | /* |
2205 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2197 | * Do this once; we can't return to user-mode if freezing() == T. |
2206 | * While in TASK_STOPPED, we were considered "frozen enough". | 2198 | * do_signal_stop() and ptrace_stop() do freezable_schedule() and |
2207 | * Now that we woke up, it's crucial if we're supposed to be | 2199 | * thus do not need another check after return. |
2208 | * frozen that we freeze now before running anything substantial. | ||
2209 | */ | 2200 | */ |
2210 | try_to_freeze(); | 2201 | try_to_freeze(); |
2211 | 2202 | ||
2203 | relock: | ||
2212 | spin_lock_irq(&sighand->siglock); | 2204 | spin_lock_irq(&sighand->siglock); |
2213 | /* | 2205 | /* |
2214 | * Every stopped thread goes here after wakeup. Check to see if | 2206 | * Every stopped thread goes here after wakeup. Check to see if |