diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 754 |
1 files changed, 386 insertions, 368 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..f34c41bfaa37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -138,6 +138,9 @@ struct cgroupfs_root { | |||
138 | /* Hierarchy-specific flags */ | 138 | /* Hierarchy-specific flags */ |
139 | unsigned long flags; | 139 | unsigned long flags; |
140 | 140 | ||
141 | /* IDs for cgroups in this hierarchy */ | ||
142 | struct ida cgroup_ida; | ||
143 | |||
141 | /* The path to use for release notifications. */ | 144 | /* The path to use for release notifications. */ |
142 | char release_agent_path[PATH_MAX]; | 145 | char release_agent_path[PATH_MAX]; |
143 | 146 | ||
@@ -171,8 +174,8 @@ struct css_id { | |||
171 | * The css to which this ID points. This pointer is set to valid value | 174 | * The css to which this ID points. This pointer is set to valid value |
172 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 175 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
173 | * This pointer is expected to be RCU-safe because destroy() | 176 | * This pointer is expected to be RCU-safe because destroy() |
174 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 177 | * is called after synchronize_rcu(). But for safe use, css_tryget() |
175 | * css_tryget() should be used for avoiding race. | 178 | * should be used for avoiding race. |
176 | */ | 179 | */ |
177 | struct cgroup_subsys_state __rcu *css; | 180 | struct cgroup_subsys_state __rcu *css; |
178 | /* | 181 | /* |
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
242 | */ | 245 | */ |
243 | static int need_forkexit_callback __read_mostly; | 246 | static int need_forkexit_callback __read_mostly; |
244 | 247 | ||
248 | static int cgroup_destroy_locked(struct cgroup *cgrp); | ||
249 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | ||
250 | struct cftype cfts[], bool is_add); | ||
251 | |||
245 | #ifdef CONFIG_PROVE_LOCKING | 252 | #ifdef CONFIG_PROVE_LOCKING |
246 | int cgroup_lock_is_held(void) | 253 | int cgroup_lock_is_held(void) |
247 | { | 254 | { |
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
294 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 301 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
295 | } | 302 | } |
296 | 303 | ||
297 | static int clone_children(const struct cgroup *cgrp) | ||
298 | { | ||
299 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
300 | } | ||
301 | |||
302 | /* | 304 | /* |
303 | * for_each_subsys() allows you to iterate on each subsystem attached to | 305 | * for_each_subsys() allows you to iterate on each subsystem attached to |
304 | * an active hierarchy | 306 | * an active hierarchy |
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
782 | * The task_lock() exception | 784 | * The task_lock() exception |
783 | * | 785 | * |
784 | * The need for this exception arises from the action of | 786 | * The need for this exception arises from the action of |
785 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with | 787 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with |
786 | * another. It does so using cgroup_mutex, however there are | 788 | * another. It does so using cgroup_mutex, however there are |
787 | * several performance critical places that need to reference | 789 | * several performance critical places that need to reference |
788 | * task->cgroup without the expense of grabbing a system global | 790 | * task->cgroup without the expense of grabbing a system global |
789 | * mutex. Therefore except as noted below, when dereferencing or, as | 791 | * mutex. Therefore except as noted below, when dereferencing or, as |
790 | * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use | 792 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
791 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 793 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
792 | * the task_struct routinely used for such matters. | 794 | * the task_struct routinely used for such matters. |
793 | * | 795 | * |
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
854 | return inode; | 856 | return inode; |
855 | } | 857 | } |
856 | 858 | ||
857 | /* | ||
858 | * Call subsys's pre_destroy handler. | ||
859 | * This is called before css refcnt check. | ||
860 | */ | ||
861 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
862 | { | ||
863 | struct cgroup_subsys *ss; | ||
864 | int ret = 0; | ||
865 | |||
866 | for_each_subsys(cgrp->root, ss) { | ||
867 | if (!ss->pre_destroy) | ||
868 | continue; | ||
869 | |||
870 | ret = ss->pre_destroy(cgrp); | ||
871 | if (ret) { | ||
872 | /* ->pre_destroy() failure is being deprecated */ | ||
873 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
874 | break; | ||
875 | } | ||
876 | } | ||
877 | |||
878 | return ret; | ||
879 | } | ||
880 | |||
881 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
882 | { | 860 | { |
883 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 861 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
898 | * Release the subsystem state objects. | 876 | * Release the subsystem state objects. |
899 | */ | 877 | */ |
900 | for_each_subsys(cgrp->root, ss) | 878 | for_each_subsys(cgrp->root, ss) |
901 | ss->destroy(cgrp); | 879 | ss->css_free(cgrp); |
902 | 880 | ||
903 | cgrp->root->number_of_cgroups--; | 881 | cgrp->root->number_of_cgroups--; |
904 | mutex_unlock(&cgroup_mutex); | 882 | mutex_unlock(&cgroup_mutex); |
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
917 | 895 | ||
918 | simple_xattrs_free(&cgrp->xattrs); | 896 | simple_xattrs_free(&cgrp->xattrs); |
919 | 897 | ||
898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
920 | kfree_rcu(cgrp, rcu_head); | 899 | kfree_rcu(cgrp, rcu_head); |
921 | } else { | 900 | } else { |
922 | struct cfent *cfe = __d_cfe(dentry); | 901 | struct cfent *cfe = __d_cfe(dentry); |
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, | |||
987 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 966 | if (!test_bit(ss->subsys_id, &subsys_mask)) |
988 | continue; | 967 | continue; |
989 | list_for_each_entry(set, &ss->cftsets, node) | 968 | list_for_each_entry(set, &ss->cftsets, node) |
990 | cgroup_rm_file(cgrp, set->cfts); | 969 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); |
991 | } | 970 | } |
992 | if (base_files) { | 971 | if (base_files) { |
993 | while (!list_empty(&cgrp->files)) | 972 | while (!list_empty(&cgrp->files)) |
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
1015 | } | 994 | } |
1016 | 995 | ||
1017 | /* | 996 | /* |
1018 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
1019 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
1020 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
1021 | * to zero, soon. | ||
1022 | * | ||
1023 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
1024 | */ | ||
1025 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
1026 | |||
1027 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
1028 | { | ||
1029 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
1030 | wake_up_all(&cgroup_rmdir_waitq); | ||
1031 | } | ||
1032 | |||
1033 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
1034 | { | ||
1035 | css_get(css); | ||
1036 | } | ||
1037 | |||
1038 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
1039 | { | ||
1040 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
1041 | css_put(css); | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 997 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
1046 | * any duplicate ones that parse_cgroupfs_options took. If this function | 998 | * any duplicate ones that parse_cgroupfs_options took. If this function |
1047 | * returns an error, no reference counts are touched. | 999 | * returns an error, no reference counts are touched. |
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1150 | seq_puts(seq, ",xattr"); | 1102 | seq_puts(seq, ",xattr"); |
1151 | if (strlen(root->release_agent_path)) | 1103 | if (strlen(root->release_agent_path)) |
1152 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1104 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1153 | if (clone_children(&root->top_cgroup)) | 1105 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) |
1154 | seq_puts(seq, ",clone_children"); | 1106 | seq_puts(seq, ",clone_children"); |
1155 | if (strlen(root->name)) | 1107 | if (strlen(root->name)) |
1156 | seq_printf(seq, ",name=%s", root->name); | 1108 | seq_printf(seq, ",name=%s", root->name); |
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts { | |||
1162 | unsigned long subsys_mask; | 1114 | unsigned long subsys_mask; |
1163 | unsigned long flags; | 1115 | unsigned long flags; |
1164 | char *release_agent; | 1116 | char *release_agent; |
1165 | bool clone_children; | 1117 | bool cpuset_clone_children; |
1166 | char *name; | 1118 | char *name; |
1167 | /* User explicitly requested empty subsystem */ | 1119 | /* User explicitly requested empty subsystem */ |
1168 | bool none; | 1120 | bool none; |
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1213 | continue; | 1165 | continue; |
1214 | } | 1166 | } |
1215 | if (!strcmp(token, "clone_children")) { | 1167 | if (!strcmp(token, "clone_children")) { |
1216 | opts->clone_children = true; | 1168 | opts->cpuset_clone_children = true; |
1217 | continue; | 1169 | continue; |
1218 | } | 1170 | } |
1219 | if (!strcmp(token, "xattr")) { | 1171 | if (!strcmp(token, "xattr")) { |
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1397 | goto out_unlock; | 1349 | goto out_unlock; |
1398 | } | 1350 | } |
1399 | 1351 | ||
1352 | /* | ||
1353 | * Clear out the files of subsystems that should be removed.  Do | ||
1354 | * this before rebind_subsystems(), since rebind_subsystems() may | ||
1355 | * change this hierarchy's subsys_list. | ||
1356 | */ | ||
1357 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1358 | |||
1400 | ret = rebind_subsystems(root, opts.subsys_mask); | 1359 | ret = rebind_subsystems(root, opts.subsys_mask); |
1401 | if (ret) { | 1360 | if (ret) { |
1361 | /* rebind_subsystems failed, re-populate the removed files */ | ||
1362 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1402 | drop_parsed_module_refcounts(opts.subsys_mask); | 1363 | drop_parsed_module_refcounts(opts.subsys_mask); |
1403 | goto out_unlock; | 1364 | goto out_unlock; |
1404 | } | 1365 | } |
1405 | 1366 | ||
1406 | /* clear out any existing files and repopulate subsystem files */ | ||
1407 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1408 | /* re-populate subsystem files */ | 1367 | /* re-populate subsystem files */ |
1409 | cgroup_populate_dir(cgrp, false, added_mask); | 1368 | cgroup_populate_dir(cgrp, false, added_mask); |
1410 | 1369 | ||
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1432 | INIT_LIST_HEAD(&cgrp->children); | 1391 | INIT_LIST_HEAD(&cgrp->children); |
1433 | INIT_LIST_HEAD(&cgrp->files); | 1392 | INIT_LIST_HEAD(&cgrp->files); |
1434 | INIT_LIST_HEAD(&cgrp->css_sets); | 1393 | INIT_LIST_HEAD(&cgrp->css_sets); |
1394 | INIT_LIST_HEAD(&cgrp->allcg_node); | ||
1435 | INIT_LIST_HEAD(&cgrp->release_list); | 1395 | INIT_LIST_HEAD(&cgrp->release_list); |
1436 | INIT_LIST_HEAD(&cgrp->pidlists); | 1396 | INIT_LIST_HEAD(&cgrp->pidlists); |
1437 | mutex_init(&cgrp->pidlist_mutex); | 1397 | mutex_init(&cgrp->pidlist_mutex); |
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1450 | root->number_of_cgroups = 1; | 1410 | root->number_of_cgroups = 1; |
1451 | cgrp->root = root; | 1411 | cgrp->root = root; |
1452 | cgrp->top_cgroup = cgrp; | 1412 | cgrp->top_cgroup = cgrp; |
1453 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1454 | init_cgroup_housekeeping(cgrp); | 1413 | init_cgroup_housekeeping(cgrp); |
1414 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1455 | } | 1415 | } |
1456 | 1416 | ||
1457 | static bool init_root_id(struct cgroupfs_root *root) | 1417 | static bool init_root_id(struct cgroupfs_root *root) |
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1518 | 1478 | ||
1519 | root->subsys_mask = opts->subsys_mask; | 1479 | root->subsys_mask = opts->subsys_mask; |
1520 | root->flags = opts->flags; | 1480 | root->flags = opts->flags; |
1481 | ida_init(&root->cgroup_ida); | ||
1521 | if (opts->release_agent) | 1482 | if (opts->release_agent) |
1522 | strcpy(root->release_agent_path, opts->release_agent); | 1483 | strcpy(root->release_agent_path, opts->release_agent); |
1523 | if (opts->name) | 1484 | if (opts->name) |
1524 | strcpy(root->name, opts->name); | 1485 | strcpy(root->name, opts->name); |
1525 | if (opts->clone_children) | 1486 | if (opts->cpuset_clone_children) |
1526 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | 1487 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); |
1527 | return root; | 1488 | return root; |
1528 | } | 1489 | } |
1529 | 1490 | ||
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) | |||
1536 | spin_lock(&hierarchy_id_lock); | 1497 | spin_lock(&hierarchy_id_lock); |
1537 | ida_remove(&hierarchy_ida, root->hierarchy_id); | 1498 | ida_remove(&hierarchy_ida, root->hierarchy_id); |
1538 | spin_unlock(&hierarchy_id_lock); | 1499 | spin_unlock(&hierarchy_id_lock); |
1500 | ida_destroy(&root->cgroup_ida); | ||
1539 | kfree(root); | 1501 | kfree(root); |
1540 | } | 1502 | } |
1541 | 1503 | ||
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1701 | 1663 | ||
1702 | free_cg_links(&tmp_cg_links); | 1664 | free_cg_links(&tmp_cg_links); |
1703 | 1665 | ||
1704 | BUG_ON(!list_empty(&root_cgrp->sibling)); | ||
1705 | BUG_ON(!list_empty(&root_cgrp->children)); | 1666 | BUG_ON(!list_empty(&root_cgrp->children)); |
1706 | BUG_ON(root->number_of_cgroups != 1); | 1667 | BUG_ON(root->number_of_cgroups != 1); |
1707 | 1668 | ||
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1750 | 1711 | ||
1751 | BUG_ON(root->number_of_cgroups != 1); | 1712 | BUG_ON(root->number_of_cgroups != 1); |
1752 | BUG_ON(!list_empty(&cgrp->children)); | 1713 | BUG_ON(!list_empty(&cgrp->children)); |
1753 | BUG_ON(!list_empty(&cgrp->sibling)); | ||
1754 | 1714 | ||
1755 | mutex_lock(&cgroup_mutex); | 1715 | mutex_lock(&cgroup_mutex); |
1756 | mutex_lock(&cgroup_root_mutex); | 1716 | mutex_lock(&cgroup_root_mutex); |
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj; | |||
1808 | */ | 1768 | */ |
1809 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1769 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1810 | { | 1770 | { |
1771 | struct dentry *dentry = cgrp->dentry; | ||
1811 | char *start; | 1772 | char *start; |
1812 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1773 | |
1813 | cgroup_lock_is_held()); | 1774 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
1775 | "cgroup_path() called without proper locking"); | ||
1814 | 1776 | ||
1815 | if (!dentry || cgrp == dummytop) { | 1777 | if (!dentry || cgrp == dummytop) { |
1816 | /* | 1778 | /* |
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1821 | return 0; | 1783 | return 0; |
1822 | } | 1784 | } |
1823 | 1785 | ||
1824 | start = buf + buflen; | 1786 | start = buf + buflen - 1; |
1825 | 1787 | ||
1826 | *--start = '\0'; | 1788 | *start = '\0'; |
1827 | for (;;) { | 1789 | for (;;) { |
1828 | int len = dentry->d_name.len; | 1790 | int len = dentry->d_name.len; |
1829 | 1791 | ||
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1834 | if (!cgrp) | 1796 | if (!cgrp) |
1835 | break; | 1797 | break; |
1836 | 1798 | ||
1837 | dentry = rcu_dereference_check(cgrp->dentry, | 1799 | dentry = cgrp->dentry; |
1838 | cgroup_lock_is_held()); | ||
1839 | if (!cgrp->parent) | 1800 | if (!cgrp->parent) |
1840 | continue; | 1801 | continue; |
1841 | if (--start < buf) | 1802 | if (--start < buf) |
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1930 | /* | 1891 | /* |
1931 | * cgroup_task_migrate - move a task from one cgroup to another. | 1892 | * cgroup_task_migrate - move a task from one cgroup to another. |
1932 | * | 1893 | * |
1933 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1894 | * Must be called with cgroup_mutex and threadgroup locked. |
1934 | * will already exist. If not set, this function might sleep, and can fail with | ||
1935 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. | ||
1936 | */ | 1895 | */ |
1937 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1896 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1938 | struct task_struct *tsk, struct css_set *newcg) | 1897 | struct task_struct *tsk, struct css_set *newcg) |
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
2025 | } | 1984 | } |
2026 | 1985 | ||
2027 | synchronize_rcu(); | 1986 | synchronize_rcu(); |
2028 | |||
2029 | /* | ||
2030 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | ||
2031 | * is no longer empty. | ||
2032 | */ | ||
2033 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2034 | out: | 1987 | out: |
2035 | if (retval) { | 1988 | if (retval) { |
2036 | for_each_subsys(root, ss) { | 1989 | for_each_subsys(root, ss) { |
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2200 | * step 5: success! and cleanup | 2153 | * step 5: success! and cleanup |
2201 | */ | 2154 | */ |
2202 | synchronize_rcu(); | 2155 | synchronize_rcu(); |
2203 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2204 | retval = 0; | 2156 | retval = 0; |
2205 | out_put_css_set_refs: | 2157 | out_put_css_set_refs: |
2206 | if (retval) { | 2158 | if (retval) { |
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2711 | 2663 | ||
2712 | /* start off with i_nlink == 2 (for "." entry) */ | 2664 | /* start off with i_nlink == 2 (for "." entry) */ |
2713 | inc_nlink(inode); | 2665 | inc_nlink(inode); |
2666 | inc_nlink(dentry->d_parent->d_inode); | ||
2714 | 2667 | ||
2715 | /* start with the directory inode held, so that we can | 2668 | /* |
2716 | * populate it without racing with another mkdir */ | 2669 | * Control reaches here with cgroup_mutex held. |
2717 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); | 2670 | * @inode->i_mutex should nest outside cgroup_mutex but we |
2671 | * want to populate it immediately without releasing | ||
2672 | * cgroup_mutex. As @inode isn't visible to anyone else | ||
2673 | * yet, trylock will always succeed without affecting | ||
2674 | * lockdep checks. | ||
2675 | */ | ||
2676 | WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); | ||
2718 | } else if (S_ISREG(mode)) { | 2677 | } else if (S_ISREG(mode)) { |
2719 | inode->i_size = 0; | 2678 | inode->i_size = 0; |
2720 | inode->i_fop = &cgroup_file_operations; | 2679 | inode->i_fop = &cgroup_file_operations; |
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2725 | return 0; | 2684 | return 0; |
2726 | } | 2685 | } |
2727 | 2686 | ||
2728 | /* | ||
2729 | * cgroup_create_dir - create a directory for an object. | ||
2730 | * @cgrp: the cgroup we create the directory for. It must have a valid | ||
2731 | * ->parent field. And we are going to fill its ->dentry field. | ||
2732 | * @dentry: dentry of the new cgroup | ||
2733 | * @mode: mode to set on new directory. | ||
2734 | */ | ||
2735 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | ||
2736 | umode_t mode) | ||
2737 | { | ||
2738 | struct dentry *parent; | ||
2739 | int error = 0; | ||
2740 | |||
2741 | parent = cgrp->parent->dentry; | ||
2742 | error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); | ||
2743 | if (!error) { | ||
2744 | dentry->d_fsdata = cgrp; | ||
2745 | inc_nlink(parent->d_inode); | ||
2746 | rcu_assign_pointer(cgrp->dentry, dentry); | ||
2747 | dget(dentry); | ||
2748 | } | ||
2749 | dput(dentry); | ||
2750 | |||
2751 | return error; | ||
2752 | } | ||
2753 | |||
2754 | /** | 2687 | /** |
2755 | * cgroup_file_mode - deduce file mode of a control file | 2688 | * cgroup_file_mode - deduce file mode of a control file |
2756 | * @cft: the control file in question | 2689 | * @cft: the control file in question |
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2791 | 2724 | ||
2792 | simple_xattrs_init(&cft->xattrs); | 2725 | simple_xattrs_init(&cft->xattrs); |
2793 | 2726 | ||
2794 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2795 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2796 | return 0; | ||
2797 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2798 | return 0; | ||
2799 | |||
2800 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2727 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2801 | strcpy(name, subsys->name); | 2728 | strcpy(name, subsys->name); |
2802 | strcat(name, "."); | 2729 | strcat(name, "."); |
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2837 | int err, ret = 0; | 2764 | int err, ret = 0; |
2838 | 2765 | ||
2839 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2766 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2767 | /* does cft->flags tell us to skip this file on @cgrp? */ | ||
2768 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2769 | continue; | ||
2770 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2771 | continue; | ||
2772 | |||
2840 | if (is_add) | 2773 | if (is_add) |
2841 | err = cgroup_add_file(cgrp, subsys, cft); | 2774 | err = cgroup_add_file(cgrp, subsys, cft); |
2842 | else | 2775 | else |
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void) | |||
3044 | write_unlock(&css_set_lock); | 2977 | write_unlock(&css_set_lock); |
3045 | } | 2978 | } |
3046 | 2979 | ||
2980 | /** | ||
2981 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | ||
2982 | * @pos: the current position (%NULL to initiate traversal) | ||
2983 | * @cgroup: cgroup whose descendants to walk | ||
2984 | * | ||
2985 | * To be used by cgroup_for_each_descendant_pre(). Find the next | ||
2986 | * descendant to visit for pre-order traversal of @cgroup's descendants. | ||
2987 | */ | ||
2988 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | ||
2989 | struct cgroup *cgroup) | ||
2990 | { | ||
2991 | struct cgroup *next; | ||
2992 | |||
2993 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
2994 | |||
2995 | /* if first iteration, pretend we just visited @cgroup */ | ||
2996 | if (!pos) { | ||
2997 | if (list_empty(&cgroup->children)) | ||
2998 | return NULL; | ||
2999 | pos = cgroup; | ||
3000 | } | ||
3001 | |||
3002 | /* visit the first child if exists */ | ||
3003 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | ||
3004 | if (next) | ||
3005 | return next; | ||
3006 | |||
3007 | /* no child, visit my or the closest ancestor's next sibling */ | ||
3008 | do { | ||
3009 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | ||
3010 | sibling); | ||
3011 | if (&next->sibling != &pos->parent->children) | ||
3012 | return next; | ||
3013 | |||
3014 | pos = pos->parent; | ||
3015 | } while (pos != cgroup); | ||
3016 | |||
3017 | return NULL; | ||
3018 | } | ||
3019 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | ||
3020 | |||
3021 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | ||
3022 | { | ||
3023 | struct cgroup *last; | ||
3024 | |||
3025 | do { | ||
3026 | last = pos; | ||
3027 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | ||
3028 | sibling); | ||
3029 | } while (pos); | ||
3030 | |||
3031 | return last; | ||
3032 | } | ||
3033 | |||
3034 | /** | ||
3035 | * cgroup_next_descendant_post - find the next descendant for post-order walk | ||
3036 | * @pos: the current position (%NULL to initiate traversal) | ||
3037 | * @cgroup: cgroup whose descendants to walk | ||
3038 | * | ||
3039 | * To be used by cgroup_for_each_descendant_post(). Find the next | ||
3040 | * descendant to visit for post-order traversal of @cgroup's descendants. | ||
3041 | */ | ||
3042 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | ||
3043 | struct cgroup *cgroup) | ||
3044 | { | ||
3045 | struct cgroup *next; | ||
3046 | |||
3047 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3048 | |||
3049 | /* if first iteration, visit the leftmost descendant */ | ||
3050 | if (!pos) { | ||
3051 | next = cgroup_leftmost_descendant(cgroup); | ||
3052 | return next != cgroup ? next : NULL; | ||
3053 | } | ||
3054 | |||
3055 | /* if there's an unvisited sibling, visit its leftmost descendant */ | ||
3056 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | ||
3057 | if (&next->sibling != &pos->parent->children) | ||
3058 | return cgroup_leftmost_descendant(next); | ||
3059 | |||
3060 | /* no sibling left, visit parent */ | ||
3061 | next = pos->parent; | ||
3062 | return next != cgroup ? next : NULL; | ||
3063 | } | ||
3064 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
3065 | |||
3047 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3066 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
3048 | __acquires(css_set_lock) | 3067 | __acquires(css_set_lock) |
3049 | { | 3068 | { |
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3757 | if (flags & POLLHUP) { | 3776 | if (flags & POLLHUP) { |
3758 | __remove_wait_queue(event->wqh, &event->wait); | 3777 | __remove_wait_queue(event->wqh, &event->wait); |
3759 | spin_lock(&cgrp->event_list_lock); | 3778 | spin_lock(&cgrp->event_list_lock); |
3760 | list_del(&event->list); | 3779 | list_del_init(&event->list); |
3761 | spin_unlock(&cgrp->event_list_lock); | 3780 | spin_unlock(&cgrp->event_list_lock); |
3762 | /* | 3781 | /* |
3763 | * We are in atomic context, but cgroup_event_remove() may | 3782 | * We are in atomic context, but cgroup_event_remove() may |
@@ -3894,7 +3913,7 @@ fail: | |||
3894 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 3913 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, |
3895 | struct cftype *cft) | 3914 | struct cftype *cft) |
3896 | { | 3915 | { |
3897 | return clone_children(cgrp); | 3916 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3898 | } | 3917 | } |
3899 | 3918 | ||
3900 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 3919 | static int cgroup_clone_children_write(struct cgroup *cgrp, |
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, | |||
3902 | u64 val) | 3921 | u64 val) |
3903 | { | 3922 | { |
3904 | if (val) | 3923 | if (val) |
3905 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3924 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3906 | else | 3925 | else |
3907 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3926 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3908 | return 0; | 3927 | return 0; |
3909 | } | 3928 | } |
3910 | 3929 | ||
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4017 | css->flags = 0; | 4036 | css->flags = 0; |
4018 | css->id = NULL; | 4037 | css->id = NULL; |
4019 | if (cgrp == dummytop) | 4038 | if (cgrp == dummytop) |
4020 | set_bit(CSS_ROOT, &css->flags); | 4039 | css->flags |= CSS_ROOT; |
4021 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 4040 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
4022 | cgrp->subsys[ss->subsys_id] = css; | 4041 | cgrp->subsys[ss->subsys_id] = css; |
4023 | 4042 | ||
4024 | /* | 4043 | /* |
4025 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | 4044 | * css holds an extra ref to @cgrp->dentry which is put on the last |
4026 | * which is put on the last css_put(). dput() requires process | 4045 | * css_put(). dput() requires process context, which css_put() may |
4027 | * context, which css_put() may be called without. @css->dput_work | 4046 | * be called without. @css->dput_work will be used to invoke |
4028 | * will be used to invoke dput() asynchronously from css_put(). | 4047 | * dput() asynchronously from css_put(). |
4029 | */ | 4048 | */ |
4030 | INIT_WORK(&css->dput_work, css_dput_fn); | 4049 | INIT_WORK(&css->dput_work, css_dput_fn); |
4031 | if (ss->__DEPRECATED_clear_css_refs) | 4050 | } |
4032 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | 4051 | |
4052 | /* invoke ->css_online() on a new CSS and mark it online if successful */ | ||
4053 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4054 | { | ||
4055 | int ret = 0; | ||
4056 | |||
4057 | lockdep_assert_held(&cgroup_mutex); | ||
4058 | |||
4059 | if (ss->css_online) | ||
4060 | ret = ss->css_online(cgrp); | ||
4061 | if (!ret) | ||
4062 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | ||
4063 | return ret; | ||
4064 | } | ||
4065 | |||
4066 | /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ | ||
4067 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4068 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4069 | { | ||
4070 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4071 | |||
4072 | lockdep_assert_held(&cgroup_mutex); | ||
4073 | |||
4074 | if (!(css->flags & CSS_ONLINE)) | ||
4075 | return; | ||
4076 | |||
4077 | /* | ||
4078 | * css_offline() should be called with cgroup_mutex unlocked. See | ||
4079 | * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for | ||
4080 | * details. This temporary unlocking should go away once | ||
4081 | * cgroup_mutex is unexported from controllers. | ||
4082 | */ | ||
4083 | if (ss->css_offline) { | ||
4084 | mutex_unlock(&cgroup_mutex); | ||
4085 | ss->css_offline(cgrp); | ||
4086 | mutex_lock(&cgroup_mutex); | ||
4087 | } | ||
4088 | |||
4089 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | ||
4033 | } | 4090 | } |
4034 | 4091 | ||
4035 | /* | 4092 | /* |
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4049 | struct cgroup_subsys *ss; | 4106 | struct cgroup_subsys *ss; |
4050 | struct super_block *sb = root->sb; | 4107 | struct super_block *sb = root->sb; |
4051 | 4108 | ||
4109 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | ||
4052 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4110 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
4053 | if (!cgrp) | 4111 | if (!cgrp) |
4054 | return -ENOMEM; | 4112 | return -ENOMEM; |
4055 | 4113 | ||
4114 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | ||
4115 | if (cgrp->id < 0) | ||
4116 | goto err_free_cgrp; | ||
4117 | |||
4118 | /* | ||
4119 | * Only live parents can have children. Note that the liveliness | ||
4120 | * check isn't strictly necessary because cgroup_mkdir() and | ||
4121 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
4122 | * anyway so that locking is contained inside cgroup proper and we | ||
4123 | * don't get nasty surprises if we ever grow another caller. | ||
4124 | */ | ||
4125 | if (!cgroup_lock_live_group(parent)) { | ||
4126 | err = -ENODEV; | ||
4127 | goto err_free_id; | ||
4128 | } | ||
4129 | |||
4056 | /* Grab a reference on the superblock so the hierarchy doesn't | 4130 | /* Grab a reference on the superblock so the hierarchy doesn't |
4057 | * get deleted on unmount if there are child cgroups. This | 4131 | * get deleted on unmount if there are child cgroups. This |
4058 | * can be done outside cgroup_mutex, since the sb can't | 4132 | * can be done outside cgroup_mutex, since the sb can't |
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4060 | * fs */ | 4134 | * fs */ |
4061 | atomic_inc(&sb->s_active); | 4135 | atomic_inc(&sb->s_active); |
4062 | 4136 | ||
4063 | mutex_lock(&cgroup_mutex); | ||
4064 | |||
4065 | init_cgroup_housekeeping(cgrp); | 4137 | init_cgroup_housekeeping(cgrp); |
4066 | 4138 | ||
4067 | cgrp->parent = parent; | 4139 | cgrp->parent = parent; |
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4071 | if (notify_on_release(parent)) | 4143 | if (notify_on_release(parent)) |
4072 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4144 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
4073 | 4145 | ||
4074 | if (clone_children(parent)) | 4146 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4075 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 4147 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4076 | 4148 | ||
4077 | for_each_subsys(root, ss) { | 4149 | for_each_subsys(root, ss) { |
4078 | struct cgroup_subsys_state *css; | 4150 | struct cgroup_subsys_state *css; |
4079 | 4151 | ||
4080 | css = ss->create(cgrp); | 4152 | css = ss->css_alloc(cgrp); |
4081 | if (IS_ERR(css)) { | 4153 | if (IS_ERR(css)) { |
4082 | err = PTR_ERR(css); | 4154 | err = PTR_ERR(css); |
4083 | goto err_destroy; | 4155 | goto err_free_all; |
4084 | } | 4156 | } |
4085 | init_cgroup_css(css, ss, cgrp); | 4157 | init_cgroup_css(css, ss, cgrp); |
4086 | if (ss->use_id) { | 4158 | if (ss->use_id) { |
4087 | err = alloc_css_id(ss, parent, cgrp); | 4159 | err = alloc_css_id(ss, parent, cgrp); |
4088 | if (err) | 4160 | if (err) |
4089 | goto err_destroy; | 4161 | goto err_free_all; |
4090 | } | 4162 | } |
4091 | /* At error, ->destroy() callback has to free assigned ID. */ | 4163 | } |
4092 | if (clone_children(parent) && ss->post_clone) | 4164 | |
4093 | ss->post_clone(cgrp); | 4165 | /* |
4166 | * Create directory. cgroup_create_file() returns with the new | ||
4167 | * directory locked on success so that it can be populated without | ||
4168 | * dropping cgroup_mutex. | ||
4169 | */ | ||
4170 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); | ||
4171 | if (err < 0) | ||
4172 | goto err_free_all; | ||
4173 | lockdep_assert_held(&dentry->d_inode->i_mutex); | ||
4174 | |||
4175 | /* allocation complete, commit to creation */ | ||
4176 | dentry->d_fsdata = cgrp; | ||
4177 | cgrp->dentry = dentry; | ||
4178 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4179 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | ||
4180 | root->number_of_cgroups++; | ||
4181 | |||
4182 | /* each css holds a ref to the cgroup's dentry */ | ||
4183 | for_each_subsys(root, ss) | ||
4184 | dget(dentry); | ||
4185 | |||
4186 | /* creation succeeded, notify subsystems */ | ||
4187 | for_each_subsys(root, ss) { | ||
4188 | err = online_css(ss, cgrp); | ||
4189 | if (err) | ||
4190 | goto err_destroy; | ||
4094 | 4191 | ||
4095 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4192 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
4096 | parent->parent) { | 4193 | parent->parent) { |
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4102 | } | 4199 | } |
4103 | } | 4200 | } |
4104 | 4201 | ||
4105 | list_add(&cgrp->sibling, &cgrp->parent->children); | ||
4106 | root->number_of_cgroups++; | ||
4107 | |||
4108 | err = cgroup_create_dir(cgrp, dentry, mode); | ||
4109 | if (err < 0) | ||
4110 | goto err_remove; | ||
4111 | |||
4112 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4113 | for_each_subsys(root, ss) | ||
4114 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4115 | dget(dentry); | ||
4116 | |||
4117 | /* The cgroup directory was pre-locked for us */ | ||
4118 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | ||
4119 | |||
4120 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4121 | |||
4122 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4202 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); |
4123 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4203 | if (err) |
4204 | goto err_destroy; | ||
4124 | 4205 | ||
4125 | mutex_unlock(&cgroup_mutex); | 4206 | mutex_unlock(&cgroup_mutex); |
4126 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 4207 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
4127 | 4208 | ||
4128 | return 0; | 4209 | return 0; |
4129 | 4210 | ||
4130 | err_remove: | 4211 | err_free_all: |
4131 | |||
4132 | list_del(&cgrp->sibling); | ||
4133 | root->number_of_cgroups--; | ||
4134 | |||
4135 | err_destroy: | ||
4136 | |||
4137 | for_each_subsys(root, ss) { | 4212 | for_each_subsys(root, ss) { |
4138 | if (cgrp->subsys[ss->subsys_id]) | 4213 | if (cgrp->subsys[ss->subsys_id]) |
4139 | ss->destroy(cgrp); | 4214 | ss->css_free(cgrp); |
4140 | } | 4215 | } |
4141 | |||
4142 | mutex_unlock(&cgroup_mutex); | 4216 | mutex_unlock(&cgroup_mutex); |
4143 | |||
4144 | /* Release the reference count that we took on the superblock */ | 4217 | /* Release the reference count that we took on the superblock */ |
4145 | deactivate_super(sb); | 4218 | deactivate_super(sb); |
4146 | 4219 | err_free_id: | |
4220 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | ||
4221 | err_free_cgrp: | ||
4147 | kfree(cgrp); | 4222 | kfree(cgrp); |
4148 | return err; | 4223 | return err; |
4224 | |||
4225 | err_destroy: | ||
4226 | cgroup_destroy_locked(cgrp); | ||
4227 | mutex_unlock(&cgroup_mutex); | ||
4228 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
4229 | return err; | ||
4149 | } | 4230 | } |
4150 | 4231 | ||
4151 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 4232 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
4197 | return 0; | 4278 | return 0; |
4198 | } | 4279 | } |
4199 | 4280 | ||
4200 | /* | 4281 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4201 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4282 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4202 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
4203 | * busy subsystems. Call with cgroup_mutex held | ||
4204 | * | ||
4205 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4206 | * not, cgroup removal behaves differently. | ||
4207 | * | ||
4208 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4209 | * cgroup removal can be committed. This is implemented by | ||
4210 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4211 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4212 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4213 | * removed as soon as the existing user (memcg) is updated. | ||
4214 | * | ||
4215 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4216 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4217 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4218 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4219 | * is put so that dentry destruction happens only after all css's are | ||
4220 | * released. | ||
4221 | */ | ||
4222 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
4223 | { | 4283 | { |
4284 | struct dentry *d = cgrp->dentry; | ||
4285 | struct cgroup *parent = cgrp->parent; | ||
4286 | DEFINE_WAIT(wait); | ||
4287 | struct cgroup_event *event, *tmp; | ||
4224 | struct cgroup_subsys *ss; | 4288 | struct cgroup_subsys *ss; |
4225 | unsigned long flags; | 4289 | LIST_HEAD(tmp_list); |
4226 | bool failed = false; | 4290 | |
4291 | lockdep_assert_held(&d->d_inode->i_mutex); | ||
4292 | lockdep_assert_held(&cgroup_mutex); | ||
4227 | 4293 | ||
4228 | local_irq_save(flags); | 4294 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) |
4295 | return -EBUSY; | ||
4229 | 4296 | ||
4230 | /* | 4297 | /* |
4231 | * Block new css_tryget() by deactivating refcnt. If all refcnts | 4298 | * Block new css_tryget() by deactivating refcnt and mark @cgrp |
4232 | * for subsystems w/ clear_css_refs set were 1 at the moment of | 4299 | * removed. This makes future css_tryget() and child creation |
4233 | * deactivation, we succeeded. | 4300 | * attempts fail thus maintaining the removal conditions verified |
4301 | * above. | ||
4234 | */ | 4302 | */ |
4235 | for_each_subsys(cgrp->root, ss) { | 4303 | for_each_subsys(cgrp->root, ss) { |
4236 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4304 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4237 | 4305 | ||
4238 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4306 | WARN_ON(atomic_read(&css->refcnt) < 0); |
4239 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4307 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
4240 | |||
4241 | if (ss->__DEPRECATED_clear_css_refs) | ||
4242 | failed |= css_refcnt(css) != 1; | ||
4243 | } | ||
4244 | |||
4245 | /* | ||
4246 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4247 | * restore refcnts to positive values. Either way, all in-progress | ||
4248 | * css_tryget() will be released. | ||
4249 | */ | ||
4250 | for_each_subsys(cgrp->root, ss) { | ||
4251 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4252 | |||
4253 | if (!failed) { | ||
4254 | set_bit(CSS_REMOVED, &css->flags); | ||
4255 | css_put(css); | ||
4256 | } else { | ||
4257 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
4258 | } | ||
4259 | } | 4308 | } |
4309 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4260 | 4310 | ||
4261 | local_irq_restore(flags); | 4311 | /* tell subsystems to initiate destruction */ |
4262 | return !failed; | 4312 | for_each_subsys(cgrp->root, ss) |
4263 | } | 4313 | offline_css(ss, cgrp); |
4264 | |||
4265 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4266 | { | ||
4267 | struct cgroup *cgrp = dentry->d_fsdata; | ||
4268 | struct dentry *d; | ||
4269 | struct cgroup *parent; | ||
4270 | DEFINE_WAIT(wait); | ||
4271 | struct cgroup_event *event, *tmp; | ||
4272 | int ret; | ||
4273 | |||
4274 | /* the vfs holds both inode->i_mutex already */ | ||
4275 | again: | ||
4276 | mutex_lock(&cgroup_mutex); | ||
4277 | if (atomic_read(&cgrp->count) != 0) { | ||
4278 | mutex_unlock(&cgroup_mutex); | ||
4279 | return -EBUSY; | ||
4280 | } | ||
4281 | if (!list_empty(&cgrp->children)) { | ||
4282 | mutex_unlock(&cgroup_mutex); | ||
4283 | return -EBUSY; | ||
4284 | } | ||
4285 | mutex_unlock(&cgroup_mutex); | ||
4286 | |||
4287 | /* | ||
4288 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
4289 | * in racy cases, subsystem may have to get css->refcnt after | ||
4290 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
4291 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | ||
4292 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
4293 | * and subsystem's reference count handling. Please see css_get/put | ||
4294 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
4295 | */ | ||
4296 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4297 | 4314 | ||
4298 | /* | 4315 | /* |
4299 | * Call pre_destroy handlers of subsys. Notify subsystems | 4316 | * Put all the base refs. Each css holds an extra reference to the |
4300 | * that rmdir() request comes. | 4317 | * cgroup's dentry and cgroup removal proceeds regardless of css |
4318 | * refs. On the last put of each css, whenever that may be, the | ||
4319 | * extra dentry ref is put so that dentry destruction happens only | ||
4320 | * after all css's are released. | ||
4301 | */ | 4321 | */ |
4302 | ret = cgroup_call_pre_destroy(cgrp); | 4322 | for_each_subsys(cgrp->root, ss) |
4303 | if (ret) { | 4323 | css_put(cgrp->subsys[ss->subsys_id]); |
4304 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4305 | return ret; | ||
4306 | } | ||
4307 | |||
4308 | mutex_lock(&cgroup_mutex); | ||
4309 | parent = cgrp->parent; | ||
4310 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | ||
4311 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4312 | mutex_unlock(&cgroup_mutex); | ||
4313 | return -EBUSY; | ||
4314 | } | ||
4315 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | ||
4316 | if (!cgroup_clear_css_refs(cgrp)) { | ||
4317 | mutex_unlock(&cgroup_mutex); | ||
4318 | /* | ||
4319 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
4320 | * prepare_to_wait(), we need to check this flag. | ||
4321 | */ | ||
4322 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
4323 | schedule(); | ||
4324 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4325 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4326 | if (signal_pending(current)) | ||
4327 | return -EINTR; | ||
4328 | goto again; | ||
4329 | } | ||
4330 | /* NO css_tryget() can success after here. */ | ||
4331 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4332 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4333 | 4324 | ||
4334 | raw_spin_lock(&release_list_lock); | 4325 | raw_spin_lock(&release_list_lock); |
4335 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4336 | if (!list_empty(&cgrp->release_list)) | 4326 | if (!list_empty(&cgrp->release_list)) |
4337 | list_del_init(&cgrp->release_list); | 4327 | list_del_init(&cgrp->release_list); |
4338 | raw_spin_unlock(&release_list_lock); | 4328 | raw_spin_unlock(&release_list_lock); |
4339 | 4329 | ||
4340 | /* delete this cgroup from parent->children */ | 4330 | /* delete this cgroup from parent->children */ |
4341 | list_del_init(&cgrp->sibling); | 4331 | list_del_rcu(&cgrp->sibling); |
4342 | |||
4343 | list_del_init(&cgrp->allcg_node); | 4332 | list_del_init(&cgrp->allcg_node); |
4344 | 4333 | ||
4345 | d = dget(cgrp->dentry); | 4334 | dget(d); |
4346 | |||
4347 | cgroup_d_remove_dir(d); | 4335 | cgroup_d_remove_dir(d); |
4348 | dput(d); | 4336 | dput(d); |
4349 | 4337 | ||
@@ -4353,21 +4341,35 @@ again: | |||
4353 | /* | 4341 | /* |
4354 | * Unregister events and notify userspace. | 4342 | * Unregister events and notify userspace. |
4355 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4343 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4356 | * directory to avoid race between userspace and kernelspace | 4344 | * directory to avoid race between userspace and kernelspace. Use |
4345 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4346 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4347 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4357 | */ | 4348 | */ |
4358 | spin_lock(&cgrp->event_list_lock); | 4349 | spin_lock(&cgrp->event_list_lock); |
4359 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | 4350 | list_splice_init(&cgrp->event_list, &tmp_list); |
4360 | list_del(&event->list); | 4351 | spin_unlock(&cgrp->event_list_lock); |
4352 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4353 | list_del_init(&event->list); | ||
4361 | remove_wait_queue(event->wqh, &event->wait); | 4354 | remove_wait_queue(event->wqh, &event->wait); |
4362 | eventfd_signal(event->eventfd, 1); | 4355 | eventfd_signal(event->eventfd, 1); |
4363 | schedule_work(&event->remove); | 4356 | schedule_work(&event->remove); |
4364 | } | 4357 | } |
4365 | spin_unlock(&cgrp->event_list_lock); | ||
4366 | 4358 | ||
4367 | mutex_unlock(&cgroup_mutex); | ||
4368 | return 0; | 4359 | return 0; |
4369 | } | 4360 | } |
4370 | 4361 | ||
4362 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4363 | { | ||
4364 | int ret; | ||
4365 | |||
4366 | mutex_lock(&cgroup_mutex); | ||
4367 | ret = cgroup_destroy_locked(dentry->d_fsdata); | ||
4368 | mutex_unlock(&cgroup_mutex); | ||
4369 | |||
4370 | return ret; | ||
4371 | } | ||
4372 | |||
4371 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | 4373 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) |
4372 | { | 4374 | { |
4373 | INIT_LIST_HEAD(&ss->cftsets); | 4375 | INIT_LIST_HEAD(&ss->cftsets); |
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4388 | 4390 | ||
4389 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4391 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4390 | 4392 | ||
4393 | mutex_lock(&cgroup_mutex); | ||
4394 | |||
4391 | /* init base cftset */ | 4395 | /* init base cftset */ |
4392 | cgroup_init_cftsets(ss); | 4396 | cgroup_init_cftsets(ss); |
4393 | 4397 | ||
4394 | /* Create the top cgroup state for this subsystem */ | 4398 | /* Create the top cgroup state for this subsystem */ |
4395 | list_add(&ss->sibling, &rootnode.subsys_list); | 4399 | list_add(&ss->sibling, &rootnode.subsys_list); |
4396 | ss->root = &rootnode; | 4400 | ss->root = &rootnode; |
4397 | css = ss->create(dummytop); | 4401 | css = ss->css_alloc(dummytop); |
4398 | /* We don't handle early failures gracefully */ | 4402 | /* We don't handle early failures gracefully */ |
4399 | BUG_ON(IS_ERR(css)); | 4403 | BUG_ON(IS_ERR(css)); |
4400 | init_cgroup_css(css, ss, dummytop); | 4404 | init_cgroup_css(css, ss, dummytop); |
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4403 | * pointer to this state - since the subsystem is | 4407 | * pointer to this state - since the subsystem is |
4404 | * newly registered, all tasks and hence the | 4408 | * newly registered, all tasks and hence the |
4405 | * init_css_set is in the subsystem's top cgroup. */ | 4409 | * init_css_set is in the subsystem's top cgroup. */ |
4406 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 4410 | init_css_set.subsys[ss->subsys_id] = css; |
4407 | 4411 | ||
4408 | need_forkexit_callback |= ss->fork || ss->exit; | 4412 | need_forkexit_callback |= ss->fork || ss->exit; |
4409 | 4413 | ||
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4413 | BUG_ON(!list_empty(&init_task.tasks)); | 4417 | BUG_ON(!list_empty(&init_task.tasks)); |
4414 | 4418 | ||
4415 | ss->active = 1; | 4419 | ss->active = 1; |
4420 | BUG_ON(online_css(ss, dummytop)); | ||
4421 | |||
4422 | mutex_unlock(&cgroup_mutex); | ||
4416 | 4423 | ||
4417 | /* this function shouldn't be used with modular subsystems, since they | 4424 | /* this function shouldn't be used with modular subsystems, since they |
4418 | * need to register a subsys_id, among other things */ | 4425 | * need to register a subsys_id, among other things */ |
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4430 | */ | 4437 | */ |
4431 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | 4438 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) |
4432 | { | 4439 | { |
4433 | int i; | ||
4434 | struct cgroup_subsys_state *css; | 4440 | struct cgroup_subsys_state *css; |
4441 | int i, ret; | ||
4435 | 4442 | ||
4436 | /* check name and function validity */ | 4443 | /* check name and function validity */ |
4437 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4444 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
4438 | ss->create == NULL || ss->destroy == NULL) | 4445 | ss->css_alloc == NULL || ss->css_free == NULL) |
4439 | return -EINVAL; | 4446 | return -EINVAL; |
4440 | 4447 | ||
4441 | /* | 4448 | /* |
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4464 | subsys[ss->subsys_id] = ss; | 4471 | subsys[ss->subsys_id] = ss; |
4465 | 4472 | ||
4466 | /* | 4473 | /* |
4467 | * no ss->create seems to need anything important in the ss struct, so | 4474 | * no ss->css_alloc seems to need anything important in the ss |
4468 | * this can happen first (i.e. before the rootnode attachment). | 4475 | * struct, so this can happen first (i.e. before the rootnode |
4476 | * attachment). | ||
4469 | */ | 4477 | */ |
4470 | css = ss->create(dummytop); | 4478 | css = ss->css_alloc(dummytop); |
4471 | if (IS_ERR(css)) { | 4479 | if (IS_ERR(css)) { |
4472 | /* failure case - need to deassign the subsys[] slot. */ | 4480 | /* failure case - need to deassign the subsys[] slot. */ |
4473 | subsys[ss->subsys_id] = NULL; | 4481 | subsys[ss->subsys_id] = NULL; |
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4482 | init_cgroup_css(css, ss, dummytop); | 4490 | init_cgroup_css(css, ss, dummytop); |
4483 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4491 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4484 | if (ss->use_id) { | 4492 | if (ss->use_id) { |
4485 | int ret = cgroup_init_idr(ss, css); | 4493 | ret = cgroup_init_idr(ss, css); |
4486 | if (ret) { | 4494 | if (ret) |
4487 | dummytop->subsys[ss->subsys_id] = NULL; | 4495 | goto err_unload; |
4488 | ss->destroy(dummytop); | ||
4489 | subsys[ss->subsys_id] = NULL; | ||
4490 | mutex_unlock(&cgroup_mutex); | ||
4491 | return ret; | ||
4492 | } | ||
4493 | } | 4496 | } |
4494 | 4497 | ||
4495 | /* | 4498 | /* |
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4522 | write_unlock(&css_set_lock); | 4525 | write_unlock(&css_set_lock); |
4523 | 4526 | ||
4524 | ss->active = 1; | 4527 | ss->active = 1; |
4528 | ret = online_css(ss, dummytop); | ||
4529 | if (ret) | ||
4530 | goto err_unload; | ||
4525 | 4531 | ||
4526 | /* success! */ | 4532 | /* success! */ |
4527 | mutex_unlock(&cgroup_mutex); | 4533 | mutex_unlock(&cgroup_mutex); |
4528 | return 0; | 4534 | return 0; |
4535 | |||
4536 | err_unload: | ||
4537 | mutex_unlock(&cgroup_mutex); | ||
4538 | /* @ss can't be mounted here as try_module_get() would fail */ | ||
4539 | cgroup_unload_subsys(ss); | ||
4540 | return ret; | ||
4529 | } | 4541 | } |
4530 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 4542 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
4531 | 4543 | ||
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4552 | BUG_ON(ss->root != &rootnode); | 4564 | BUG_ON(ss->root != &rootnode); |
4553 | 4565 | ||
4554 | mutex_lock(&cgroup_mutex); | 4566 | mutex_lock(&cgroup_mutex); |
4567 | |||
4568 | offline_css(ss, dummytop); | ||
4569 | ss->active = 0; | ||
4570 | |||
4571 | if (ss->use_id) { | ||
4572 | idr_remove_all(&ss->idr); | ||
4573 | idr_destroy(&ss->idr); | ||
4574 | } | ||
4575 | |||
4555 | /* deassign the subsys_id */ | 4576 | /* deassign the subsys_id */ |
4556 | subsys[ss->subsys_id] = NULL; | 4577 | subsys[ss->subsys_id] = NULL; |
4557 | 4578 | ||
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4567 | struct css_set *cg = link->cg; | 4588 | struct css_set *cg = link->cg; |
4568 | 4589 | ||
4569 | hlist_del(&cg->hlist); | 4590 | hlist_del(&cg->hlist); |
4570 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
4571 | cg->subsys[ss->subsys_id] = NULL; | 4591 | cg->subsys[ss->subsys_id] = NULL; |
4572 | hhead = css_set_hash(cg->subsys); | 4592 | hhead = css_set_hash(cg->subsys); |
4573 | hlist_add_head(&cg->hlist, hhead); | 4593 | hlist_add_head(&cg->hlist, hhead); |
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4575 | write_unlock(&css_set_lock); | 4595 | write_unlock(&css_set_lock); |
4576 | 4596 | ||
4577 | /* | 4597 | /* |
4578 | * remove subsystem's css from the dummytop and free it - need to free | 4598 | * remove subsystem's css from the dummytop and free it - need to |
4579 | * before marking as null because ss->destroy needs the cgrp->subsys | 4599 | * free before marking as null because ss->css_free needs the |
4580 | * pointer to find their state. note that this also takes care of | 4600 | * cgrp->subsys pointer to find their state. note that this also |
4581 | * freeing the css_id. | 4601 | * takes care of freeing the css_id. |
4582 | */ | 4602 | */ |
4583 | ss->destroy(dummytop); | 4603 | ss->css_free(dummytop); |
4584 | dummytop->subsys[ss->subsys_id] = NULL; | 4604 | dummytop->subsys[ss->subsys_id] = NULL; |
4585 | 4605 | ||
4586 | mutex_unlock(&cgroup_mutex); | 4606 | mutex_unlock(&cgroup_mutex); |
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void) | |||
4624 | 4644 | ||
4625 | BUG_ON(!ss->name); | 4645 | BUG_ON(!ss->name); |
4626 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4646 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4627 | BUG_ON(!ss->create); | 4647 | BUG_ON(!ss->css_alloc); |
4628 | BUG_ON(!ss->destroy); | 4648 | BUG_ON(!ss->css_free); |
4629 | if (ss->subsys_id != i) { | 4649 | if (ss->subsys_id != i) { |
4630 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", | 4650 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
4631 | ss->name, ss->subsys_id); | 4651 | ss->name, ss->subsys_id); |
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child) | |||
4832 | } | 4852 | } |
4833 | 4853 | ||
4834 | /** | 4854 | /** |
4835 | * cgroup_fork_callbacks - run fork callbacks | ||
4836 | * @child: the new task | ||
4837 | * | ||
4838 | * Called on a new task very soon before adding it to the | ||
4839 | * tasklist. No need to take any locks since no-one can | ||
4840 | * be operating on this task. | ||
4841 | */ | ||
4842 | void cgroup_fork_callbacks(struct task_struct *child) | ||
4843 | { | ||
4844 | if (need_forkexit_callback) { | ||
4845 | int i; | ||
4846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4847 | struct cgroup_subsys *ss = subsys[i]; | ||
4848 | |||
4849 | /* | ||
4850 | * forkexit callbacks are only supported for | ||
4851 | * builtin subsystems. | ||
4852 | */ | ||
4853 | if (!ss || ss->module) | ||
4854 | continue; | ||
4855 | |||
4856 | if (ss->fork) | ||
4857 | ss->fork(child); | ||
4858 | } | ||
4859 | } | ||
4860 | } | ||
4861 | |||
4862 | /** | ||
4863 | * cgroup_post_fork - called on a new task after adding it to the task list | 4855 | * cgroup_post_fork - called on a new task after adding it to the task list |
4864 | * @child: the task in question | 4856 | * @child: the task in question |
4865 | * | 4857 | * |
4866 | * Adds the task to the list running through its css_set if necessary. | 4858 | * Adds the task to the list running through its css_set if necessary and |
4867 | * Has to be after the task is visible on the task list in case we race | 4859 | * calls the subsystem fork() callbacks. Has to be after the task is |
4868 | * with the first call to cgroup_iter_start() - to guarantee that the | 4860 | * visible on the task list in case we race with the first call to |
4869 | * new task ends up on its list. | 4861 | * cgroup_iter_start() - to guarantee that the new task ends up on its |
4862 | * list. | ||
4870 | */ | 4863 | */ |
4871 | void cgroup_post_fork(struct task_struct *child) | 4864 | void cgroup_post_fork(struct task_struct *child) |
4872 | { | 4865 | { |
4866 | int i; | ||
4867 | |||
4873 | /* | 4868 | /* |
4874 | * use_task_css_set_links is set to 1 before we walk the tasklist | 4869 | * use_task_css_set_links is set to 1 before we walk the tasklist |
4875 | * under the tasklist_lock and we read it here after we added the child | 4870 | * under the tasklist_lock and we read it here after we added the child |
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child) | |||
4889 | task_unlock(child); | 4884 | task_unlock(child); |
4890 | write_unlock(&css_set_lock); | 4885 | write_unlock(&css_set_lock); |
4891 | } | 4886 | } |
4887 | |||
4888 | /* | ||
4889 | * Call ss->fork(). This must happen after @child is linked on | ||
4890 | * css_set; otherwise, @child might change state between ->fork() | ||
4891 | * and addition to css_set. | ||
4892 | */ | ||
4893 | if (need_forkexit_callback) { | ||
4894 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4895 | struct cgroup_subsys *ss = subsys[i]; | ||
4896 | |||
4897 | /* | ||
4898 | * fork/exit callbacks are supported only for | ||
4899 | * builtin subsystems and we don't need further | ||
4900 | * synchronization as they never go away. | ||
4901 | */ | ||
4902 | if (!ss || ss->module) | ||
4903 | continue; | ||
4904 | |||
4905 | if (ss->fork) | ||
4906 | ss->fork(child); | ||
4907 | } | ||
4908 | } | ||
4892 | } | 4909 | } |
4910 | |||
4893 | /** | 4911 | /** |
4894 | * cgroup_exit - detach cgroup from exiting task | 4912 | * cgroup_exit - detach cgroup from exiting task |
4895 | * @tsk: pointer to task_struct of exiting process | 4913 | * @tsk: pointer to task_struct of exiting process |
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
5022 | /* Caller must verify that the css is not for root cgroup */ | 5040 | /* Caller must verify that the css is not for root cgroup */ |
5023 | bool __css_tryget(struct cgroup_subsys_state *css) | 5041 | bool __css_tryget(struct cgroup_subsys_state *css) |
5024 | { | 5042 | { |
5025 | do { | 5043 | while (true) { |
5026 | int v = css_refcnt(css); | 5044 | int t, v; |
5027 | 5045 | ||
5028 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 5046 | v = css_refcnt(css); |
5047 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
5048 | if (likely(t == v)) | ||
5029 | return true; | 5049 | return true; |
5050 | else if (t < 0) | ||
5051 | return false; | ||
5030 | cpu_relax(); | 5052 | cpu_relax(); |
5031 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 5053 | } |
5032 | |||
5033 | return false; | ||
5034 | } | 5054 | } |
5035 | EXPORT_SYMBOL_GPL(__css_tryget); | 5055 | EXPORT_SYMBOL_GPL(__css_tryget); |
5036 | 5056 | ||
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
5049 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 5069 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
5050 | check_for_release(cgrp); | 5070 | check_for_release(cgrp); |
5051 | } | 5071 | } |
5052 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
5053 | break; | 5072 | break; |
5054 | case 0: | 5073 | case 0: |
5055 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 5074 | schedule_work(&css->dput_work); |
5056 | schedule_work(&css->dput_work); | ||
5057 | break; | 5075 | break; |
5058 | } | 5076 | } |
5059 | rcu_read_unlock(); | 5077 | rcu_read_unlock(); |
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5439 | } | 5457 | } |
5440 | 5458 | ||
5441 | #ifdef CONFIG_CGROUP_DEBUG | 5459 | #ifdef CONFIG_CGROUP_DEBUG |
5442 | static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | 5460 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) |
5443 | { | 5461 | { |
5444 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5462 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5445 | 5463 | ||
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | |||
5449 | return css; | 5467 | return css; |
5450 | } | 5468 | } |
5451 | 5469 | ||
5452 | static void debug_destroy(struct cgroup *cont) | 5470 | static void debug_css_free(struct cgroup *cont) |
5453 | { | 5471 | { |
5454 | kfree(cont->subsys[debug_subsys_id]); | 5472 | kfree(cont->subsys[debug_subsys_id]); |
5455 | } | 5473 | } |
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = { | |||
5578 | 5596 | ||
5579 | struct cgroup_subsys debug_subsys = { | 5597 | struct cgroup_subsys debug_subsys = { |
5580 | .name = "debug", | 5598 | .name = "debug", |
5581 | .create = debug_create, | 5599 | .css_alloc = debug_css_alloc, |
5582 | .destroy = debug_destroy, | 5600 | .css_free = debug_css_free, |
5583 | .subsys_id = debug_subsys_id, | 5601 | .subsys_id = debug_subsys_id, |
5584 | .base_cftypes = debug_files, | 5602 | .base_cftypes = debug_files, |
5585 | }; | 5603 | }; |