aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c601
1 files changed, 448 insertions, 153 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
60#include <linux/eventfd.h> 60#include <linux/eventfd.h>
61#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
63#include <linux/kthread.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
67/* css deactivation bias, makes css->refcnt negative to deny new trygets */
68#define CSS_DEACT_BIAS INT_MIN
69
66/* 70/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its 71 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it. 72 * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 131 /* A list running through the active hierarchies */
128 struct list_head root_list; 132 struct list_head root_list;
129 133
134 /* All cgroups on this root, cgroup_mutex protected */
135 struct list_head allcg_list;
136
130 /* Hierarchy-specific flags */ 137 /* Hierarchy-specific flags */
131 unsigned long flags; 138 unsigned long flags;
132 139
@@ -145,6 +152,15 @@ struct cgroupfs_root {
145static struct cgroupfs_root rootnode; 152static struct cgroupfs_root rootnode;
146 153
147/* 154/*
155 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
156 */
157struct cfent {
158 struct list_head node;
159 struct dentry *dentry;
160 struct cftype *type;
161};
162
163/*
148 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 164 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
149 * cgroup_subsys->use_id != 0. 165 * cgroup_subsys->use_id != 0.
150 */ 166 */
@@ -239,6 +255,19 @@ int cgroup_lock_is_held(void)
239 255
240EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
241 257
258static int css_unbias_refcnt(int refcnt)
259{
260 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
261}
262
263/* the current nr of refs, always >= 0 whether @css is deactivated or not */
264static int css_refcnt(struct cgroup_subsys_state *css)
265{
266 int v = atomic_read(&css->refcnt);
267
268 return css_unbias_refcnt(v);
269}
270
242/* convenient tests for these bits */ 271/* convenient tests for these bits */
243inline int cgroup_is_removed(const struct cgroup *cgrp) 272inline int cgroup_is_removed(const struct cgroup *cgrp)
244{ 273{
@@ -279,6 +308,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
279#define for_each_active_root(_root) \ 308#define for_each_active_root(_root) \
280list_for_each_entry(_root, &roots, root_list) 309list_for_each_entry(_root, &roots, root_list)
281 310
311static inline struct cgroup *__d_cgrp(struct dentry *dentry)
312{
313 return dentry->d_fsdata;
314}
315
316static inline struct cfent *__d_cfe(struct dentry *dentry)
317{
318 return dentry->d_fsdata;
319}
320
321static inline struct cftype *__d_cft(struct dentry *dentry)
322{
323 return __d_cfe(dentry)->type;
324}
325
282/* the list of cgroups eligible for automatic release. Protected by 326/* the list of cgroups eligible for automatic release. Protected by
283 * release_list_lock */ 327 * release_list_lock */
284static LIST_HEAD(release_list); 328static LIST_HEAD(release_list);
@@ -816,12 +860,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
816 struct cgroup_subsys *ss; 860 struct cgroup_subsys *ss;
817 int ret = 0; 861 int ret = 0;
818 862
819 for_each_subsys(cgrp->root, ss) 863 for_each_subsys(cgrp->root, ss) {
820 if (ss->pre_destroy) { 864 if (!ss->pre_destroy)
821 ret = ss->pre_destroy(cgrp); 865 continue;
822 if (ret) 866
823 break; 867 ret = ss->pre_destroy(cgrp);
868 if (ret) {
869 /* ->pre_destroy() failure is being deprecated */
870 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
871 break;
824 } 872 }
873 }
825 874
826 return ret; 875 return ret;
827} 876}
@@ -864,6 +913,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
864 BUG_ON(!list_empty(&cgrp->pidlists)); 913 BUG_ON(!list_empty(&cgrp->pidlists));
865 914
866 kfree_rcu(cgrp, rcu_head); 915 kfree_rcu(cgrp, rcu_head);
916 } else {
917 struct cfent *cfe = __d_cfe(dentry);
918 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
919
920 WARN_ONCE(!list_empty(&cfe->node) &&
921 cgrp != &cgrp->root->top_cgroup,
922 "cfe still linked for %s\n", cfe->type->name);
923 kfree(cfe);
867 } 924 }
868 iput(inode); 925 iput(inode);
869} 926}
@@ -882,34 +939,36 @@ static void remove_dir(struct dentry *d)
882 dput(parent); 939 dput(parent);
883} 940}
884 941
885static void cgroup_clear_directory(struct dentry *dentry) 942static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
886{ 943{
887 struct list_head *node; 944 struct cfent *cfe;
888 945
889 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 946 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
890 spin_lock(&dentry->d_lock); 947 lockdep_assert_held(&cgroup_mutex);
891 node = dentry->d_subdirs.next; 948
892 while (node != &dentry->d_subdirs) { 949 list_for_each_entry(cfe, &cgrp->files, node) {
893 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 950 struct dentry *d = cfe->dentry;
894 951
895 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 952 if (cft && cfe->type != cft)
896 list_del_init(node); 953 continue;
897 if (d->d_inode) { 954
898 /* This should never be called on a cgroup 955 dget(d);
899 * directory with child cgroups */ 956 d_delete(d);
900 BUG_ON(d->d_inode->i_mode & S_IFDIR); 957 simple_unlink(d->d_inode, d);
901 dget_dlock(d); 958 list_del_init(&cfe->node);
902 spin_unlock(&d->d_lock); 959 dput(d);
903 spin_unlock(&dentry->d_lock); 960
904 d_delete(d); 961 return 0;
905 simple_unlink(dentry->d_inode, d);
906 dput(d);
907 spin_lock(&dentry->d_lock);
908 } else
909 spin_unlock(&d->d_lock);
910 node = dentry->d_subdirs.next;
911 } 962 }
912 spin_unlock(&dentry->d_lock); 963 return -ENOENT;
964}
965
966static void cgroup_clear_directory(struct dentry *dir)
967{
968 struct cgroup *cgrp = __d_cgrp(dir);
969
970 while (!list_empty(&cgrp->files))
971 cgroup_rm_file(cgrp, NULL);
913} 972}
914 973
915/* 974/*
@@ -1294,6 +1353,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1294 if (ret) 1353 if (ret)
1295 goto out_unlock; 1354 goto out_unlock;
1296 1355
1356 /* See feature-removal-schedule.txt */
1357 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1358 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1359 task_tgid_nr(current), current->comm);
1360
1297 /* Don't allow flags or name to change at remount */ 1361 /* Don't allow flags or name to change at remount */
1298 if (opts.flags != root->flags || 1362 if (opts.flags != root->flags ||
1299 (opts.name && strcmp(opts.name, root->name))) { 1363 (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1372,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1308 goto out_unlock; 1372 goto out_unlock;
1309 } 1373 }
1310 1374
1311 /* (re)populate subsystem files */ 1375 /* clear out any existing files and repopulate subsystem files */
1376 cgroup_clear_directory(cgrp->dentry);
1312 cgroup_populate_dir(cgrp); 1377 cgroup_populate_dir(cgrp);
1313 1378
1314 if (opts.release_agent) 1379 if (opts.release_agent)
@@ -1333,6 +1398,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1333{ 1398{
1334 INIT_LIST_HEAD(&cgrp->sibling); 1399 INIT_LIST_HEAD(&cgrp->sibling);
1335 INIT_LIST_HEAD(&cgrp->children); 1400 INIT_LIST_HEAD(&cgrp->children);
1401 INIT_LIST_HEAD(&cgrp->files);
1336 INIT_LIST_HEAD(&cgrp->css_sets); 1402 INIT_LIST_HEAD(&cgrp->css_sets);
1337 INIT_LIST_HEAD(&cgrp->release_list); 1403 INIT_LIST_HEAD(&cgrp->release_list);
1338 INIT_LIST_HEAD(&cgrp->pidlists); 1404 INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1410,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1344static void init_cgroup_root(struct cgroupfs_root *root) 1410static void init_cgroup_root(struct cgroupfs_root *root)
1345{ 1411{
1346 struct cgroup *cgrp = &root->top_cgroup; 1412 struct cgroup *cgrp = &root->top_cgroup;
1413
1347 INIT_LIST_HEAD(&root->subsys_list); 1414 INIT_LIST_HEAD(&root->subsys_list);
1348 INIT_LIST_HEAD(&root->root_list); 1415 INIT_LIST_HEAD(&root->root_list);
1416 INIT_LIST_HEAD(&root->allcg_list);
1349 root->number_of_cgroups = 1; 1417 root->number_of_cgroups = 1;
1350 cgrp->root = root; 1418 cgrp->root = root;
1351 cgrp->top_cgroup = cgrp; 1419 cgrp->top_cgroup = cgrp;
1420 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1352 init_cgroup_housekeeping(cgrp); 1421 init_cgroup_housekeeping(cgrp);
1353} 1422}
1354 1423
@@ -1692,16 +1761,6 @@ static struct file_system_type cgroup_fs_type = {
1692 1761
1693static struct kobject *cgroup_kobj; 1762static struct kobject *cgroup_kobj;
1694 1763
1695static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1696{
1697 return dentry->d_fsdata;
1698}
1699
1700static inline struct cftype *__d_cft(struct dentry *dentry)
1701{
1702 return dentry->d_fsdata;
1703}
1704
1705/** 1764/**
1706 * cgroup_path - generate the path of a cgroup 1765 * cgroup_path - generate the path of a cgroup
1707 * @cgrp: the cgroup in question 1766 * @cgrp: the cgroup in question
@@ -2160,9 +2219,9 @@ retry_find_task:
2160 * only need to check permissions on one of them. 2219 * only need to check permissions on one of them.
2161 */ 2220 */
2162 tcred = __task_cred(tsk); 2221 tcred = __task_cred(tsk);
2163 if (cred->euid && 2222 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2164 cred->euid != tcred->uid && 2223 !uid_eq(cred->euid, tcred->uid) &&
2165 cred->euid != tcred->suid) { 2224 !uid_eq(cred->euid, tcred->suid)) {
2166 rcu_read_unlock(); 2225 rcu_read_unlock();
2167 ret = -EACCES; 2226 ret = -EACCES;
2168 goto out_unlock_cgroup; 2227 goto out_unlock_cgroup;
@@ -2172,6 +2231,18 @@ retry_find_task:
2172 2231
2173 if (threadgroup) 2232 if (threadgroup)
2174 tsk = tsk->group_leader; 2233 tsk = tsk->group_leader;
2234
2235 /*
2236 * Workqueue threads may acquire PF_THREAD_BOUND and become
2237 * trapped in a cpuset, or RT worker may be born in a cgroup
2238 * with no rt_runtime allocated. Just say no.
2239 */
2240 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2241 ret = -EINVAL;
2242 rcu_read_unlock();
2243 goto out_unlock_cgroup;
2244 }
2245
2175 get_task_struct(tsk); 2246 get_task_struct(tsk);
2176 rcu_read_unlock(); 2247 rcu_read_unlock();
2177 2248
@@ -2603,50 +2674,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2603 return mode; 2674 return mode;
2604} 2675}
2605 2676
2606int cgroup_add_file(struct cgroup *cgrp, 2677static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2607 struct cgroup_subsys *subsys, 2678 const struct cftype *cft)
2608 const struct cftype *cft)
2609{ 2679{
2610 struct dentry *dir = cgrp->dentry; 2680 struct dentry *dir = cgrp->dentry;
2681 struct cgroup *parent = __d_cgrp(dir);
2611 struct dentry *dentry; 2682 struct dentry *dentry;
2683 struct cfent *cfe;
2612 int error; 2684 int error;
2613 umode_t mode; 2685 umode_t mode;
2614
2615 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2686 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2687
2688 /* does @cft->flags tell us to skip creation on @cgrp? */
2689 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2690 return 0;
2691 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2692 return 0;
2693
2616 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2694 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2617 strcpy(name, subsys->name); 2695 strcpy(name, subsys->name);
2618 strcat(name, "."); 2696 strcat(name, ".");
2619 } 2697 }
2620 strcat(name, cft->name); 2698 strcat(name, cft->name);
2699
2621 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2700 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2701
2702 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2703 if (!cfe)
2704 return -ENOMEM;
2705
2622 dentry = lookup_one_len(name, dir, strlen(name)); 2706 dentry = lookup_one_len(name, dir, strlen(name));
2623 if (!IS_ERR(dentry)) { 2707 if (IS_ERR(dentry)) {
2624 mode = cgroup_file_mode(cft);
2625 error = cgroup_create_file(dentry, mode | S_IFREG,
2626 cgrp->root->sb);
2627 if (!error)
2628 dentry->d_fsdata = (void *)cft;
2629 dput(dentry);
2630 } else
2631 error = PTR_ERR(dentry); 2708 error = PTR_ERR(dentry);
2709 goto out;
2710 }
2711
2712 mode = cgroup_file_mode(cft);
2713 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2714 if (!error) {
2715 cfe->type = (void *)cft;
2716 cfe->dentry = dentry;
2717 dentry->d_fsdata = cfe;
2718 list_add_tail(&cfe->node, &parent->files);
2719 cfe = NULL;
2720 }
2721 dput(dentry);
2722out:
2723 kfree(cfe);
2632 return error; 2724 return error;
2633} 2725}
2634EXPORT_SYMBOL_GPL(cgroup_add_file);
2635 2726
2636int cgroup_add_files(struct cgroup *cgrp, 2727static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2637 struct cgroup_subsys *subsys, 2728 const struct cftype cfts[], bool is_add)
2638 const struct cftype cft[],
2639 int count)
2640{ 2729{
2641 int i, err; 2730 const struct cftype *cft;
2642 for (i = 0; i < count; i++) { 2731 int err, ret = 0;
2643 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2732
2644 if (err) 2733 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2645 return err; 2734 if (is_add)
2735 err = cgroup_add_file(cgrp, subsys, cft);
2736 else
2737 err = cgroup_rm_file(cgrp, cft);
2738 if (err) {
2739 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2740 is_add ? "add" : "remove", cft->name, err);
2741 ret = err;
2742 }
2743 }
2744 return ret;
2745}
2746
2747static DEFINE_MUTEX(cgroup_cft_mutex);
2748
2749static void cgroup_cfts_prepare(void)
2750 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2751{
2752 /*
2753 * Thanks to the entanglement with vfs inode locking, we can't walk
2754 * the existing cgroups under cgroup_mutex and create files.
2755 * Instead, we increment reference on all cgroups and build list of
2756 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2757 * exclusive access to the field.
2758 */
2759 mutex_lock(&cgroup_cft_mutex);
2760 mutex_lock(&cgroup_mutex);
2761}
2762
2763static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2764 const struct cftype *cfts, bool is_add)
2765 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2766{
2767 LIST_HEAD(pending);
2768 struct cgroup *cgrp, *n;
2769
2770 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2771 if (cfts && ss->root != &rootnode) {
2772 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2773 dget(cgrp->dentry);
2774 list_add_tail(&cgrp->cft_q_node, &pending);
2775 }
2646 } 2776 }
2777
2778 mutex_unlock(&cgroup_mutex);
2779
2780 /*
2781 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2782 * files for all cgroups which were created before.
2783 */
2784 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2785 struct inode *inode = cgrp->dentry->d_inode;
2786
2787 mutex_lock(&inode->i_mutex);
2788 mutex_lock(&cgroup_mutex);
2789 if (!cgroup_is_removed(cgrp))
2790 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2791 mutex_unlock(&cgroup_mutex);
2792 mutex_unlock(&inode->i_mutex);
2793
2794 list_del_init(&cgrp->cft_q_node);
2795 dput(cgrp->dentry);
2796 }
2797
2798 mutex_unlock(&cgroup_cft_mutex);
2799}
2800
2801/**
2802 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2803 * @ss: target cgroup subsystem
2804 * @cfts: zero-length name terminated array of cftypes
2805 *
2806 * Register @cfts to @ss. Files described by @cfts are created for all
2807 * existing cgroups to which @ss is attached and all future cgroups will
2808 * have them too. This function can be called anytime whether @ss is
2809 * attached or not.
2810 *
2811 * Returns 0 on successful registration, -errno on failure. Note that this
2812 * function currently returns 0 as long as @cfts registration is successful
2813 * even if some file creation attempts on existing cgroups fail.
2814 */
2815int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2816{
2817 struct cftype_set *set;
2818
2819 set = kzalloc(sizeof(*set), GFP_KERNEL);
2820 if (!set)
2821 return -ENOMEM;
2822
2823 cgroup_cfts_prepare();
2824 set->cfts = cfts;
2825 list_add_tail(&set->node, &ss->cftsets);
2826 cgroup_cfts_commit(ss, cfts, true);
2827
2647 return 0; 2828 return 0;
2648} 2829}
2649EXPORT_SYMBOL_GPL(cgroup_add_files); 2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2831
2832/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2834 * @ss: target cgroup subsystem
2835 * @cfts: zero-length name terminated array of cftypes
2836 *
2837 * Unregister @cfts from @ss. Files described by @cfts are removed from
2838 * all existing cgroups to which @ss is attached and all future cgroups
2839 * won't have them either. This function can be called anytime whether @ss
2840 * is attached or not.
2841 *
2842 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2843 * registered with @ss.
2844 */
2845int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2846{
2847 struct cftype_set *set;
2848
2849 cgroup_cfts_prepare();
2850
2851 list_for_each_entry(set, &ss->cftsets, node) {
2852 if (set->cfts == cfts) {
2853 list_del_init(&set->node);
2854 cgroup_cfts_commit(ss, cfts, false);
2855 return 0;
2856 }
2857 }
2858
2859 cgroup_cfts_commit(ss, NULL, false);
2860 return -ENOENT;
2861}
2650 2862
2651/** 2863/**
2652 * cgroup_task_count - count the number of tasks in a cgroup. 2864 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3837,14 @@ static struct cftype files[] = {
3625 .read_u64 = cgroup_clone_children_read, 3837 .read_u64 = cgroup_clone_children_read,
3626 .write_u64 = cgroup_clone_children_write, 3838 .write_u64 = cgroup_clone_children_write,
3627 }, 3839 },
3628}; 3840 {
3629 3841 .name = "release_agent",
3630static struct cftype cft_release_agent = { 3842 .flags = CFTYPE_ONLY_ON_ROOT,
3631 .name = "release_agent", 3843 .read_seq_string = cgroup_release_agent_show,
3632 .read_seq_string = cgroup_release_agent_show, 3844 .write_string = cgroup_release_agent_write,
3633 .write_string = cgroup_release_agent_write, 3845 .max_write_len = PATH_MAX,
3634 .max_write_len = PATH_MAX, 3846 },
3847 { } /* terminate */
3635}; 3848};
3636 3849
3637static int cgroup_populate_dir(struct cgroup *cgrp) 3850static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3852,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3639 int err; 3852 int err;
3640 struct cgroup_subsys *ss; 3853 struct cgroup_subsys *ss;
3641 3854
3642 /* First clear out any existing files */ 3855 err = cgroup_addrm_files(cgrp, NULL, files, true);
3643 cgroup_clear_directory(cgrp->dentry);
3644
3645 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3646 if (err < 0) 3856 if (err < 0)
3647 return err; 3857 return err;
3648 3858
3649 if (cgrp == cgrp->top_cgroup) { 3859 /* process cftsets of each subsystem */
3650 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3651 return err;
3652 }
3653
3654 for_each_subsys(cgrp->root, ss) { 3860 for_each_subsys(cgrp->root, ss) {
3655 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3861 struct cftype_set *set;
3656 return err; 3862
3863 list_for_each_entry(set, &ss->cftsets, node)
3864 cgroup_addrm_files(cgrp, ss, set->cfts, true);
3657 } 3865 }
3866
3658 /* This cgroup is ready now */ 3867 /* This cgroup is ready now */
3659 for_each_subsys(cgrp->root, ss) { 3868 for_each_subsys(cgrp->root, ss) {
3660 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3869 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3879,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3670 return 0; 3879 return 0;
3671} 3880}
3672 3881
3882static void css_dput_fn(struct work_struct *work)
3883{
3884 struct cgroup_subsys_state *css =
3885 container_of(work, struct cgroup_subsys_state, dput_work);
3886 struct dentry *dentry = css->cgroup->dentry;
3887 struct super_block *sb = dentry->d_sb;
3888
3889 atomic_inc(&sb->s_active);
3890 dput(dentry);
3891 deactivate_super(sb);
3892}
3893
3673static void init_cgroup_css(struct cgroup_subsys_state *css, 3894static void init_cgroup_css(struct cgroup_subsys_state *css,
3674 struct cgroup_subsys *ss, 3895 struct cgroup_subsys *ss,
3675 struct cgroup *cgrp) 3896 struct cgroup *cgrp)
@@ -3682,6 +3903,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3682 set_bit(CSS_ROOT, &css->flags); 3903 set_bit(CSS_ROOT, &css->flags);
3683 BUG_ON(cgrp->subsys[ss->subsys_id]); 3904 BUG_ON(cgrp->subsys[ss->subsys_id]);
3684 cgrp->subsys[ss->subsys_id] = css; 3905 cgrp->subsys[ss->subsys_id] = css;
3906
3907 /*
3908 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3909 * which is put on the last css_put(). dput() requires process
3910 * context, which css_put() may be called without. @css->dput_work
3911 * will be used to invoke dput() asynchronously from css_put().
3912 */
3913 INIT_WORK(&css->dput_work, css_dput_fn);
3914 if (ss->__DEPRECATED_clear_css_refs)
3915 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3685} 3916}
3686 3917
3687static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3918static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4015,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3784 if (err < 0) 4015 if (err < 0)
3785 goto err_remove; 4016 goto err_remove;
3786 4017
4018 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4019 for_each_subsys(root, ss)
4020 if (!ss->__DEPRECATED_clear_css_refs)
4021 dget(dentry);
4022
3787 /* The cgroup directory was pre-locked for us */ 4023 /* The cgroup directory was pre-locked for us */
3788 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4024 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3789 4025
4026 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4027
3790 err = cgroup_populate_dir(cgrp); 4028 err = cgroup_populate_dir(cgrp);
3791 /* If err < 0, we have a half-filled directory - oh well ;) */ 4029 /* If err < 0, we have a half-filled directory - oh well ;) */
3792 4030
@@ -3826,18 +4064,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3826 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4064 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3827} 4065}
3828 4066
4067/*
4068 * Check the reference count on each subsystem. Since we already
4069 * established that there are no tasks in the cgroup, if the css refcount
4070 * is also 1, then there should be no outstanding references, so the
4071 * subsystem is safe to destroy. We scan across all subsystems rather than
4072 * using the per-hierarchy linked list of mounted subsystems since we can
4073 * be called via check_for_release() with no synchronization other than
4074 * RCU, and the subsystem linked list isn't RCU-safe.
4075 */
3829static int cgroup_has_css_refs(struct cgroup *cgrp) 4076static int cgroup_has_css_refs(struct cgroup *cgrp)
3830{ 4077{
3831 /* Check the reference count on each subsystem. Since we
3832 * already established that there are no tasks in the
3833 * cgroup, if the css refcount is also 1, then there should
3834 * be no outstanding references, so the subsystem is safe to
3835 * destroy. We scan across all subsystems rather than using
3836 * the per-hierarchy linked list of mounted subsystems since
3837 * we can be called via check_for_release() with no
3838 * synchronization other than RCU, and the subsystem linked
3839 * list isn't RCU-safe */
3840 int i; 4078 int i;
4079
3841 /* 4080 /*
3842 * We won't need to lock the subsys array, because the subsystems 4081 * We won't need to lock the subsys array, because the subsystems
3843 * we're concerned about aren't going anywhere since our cgroup root 4082 * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4085,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4085 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3847 struct cgroup_subsys *ss = subsys[i]; 4086 struct cgroup_subsys *ss = subsys[i];
3848 struct cgroup_subsys_state *css; 4087 struct cgroup_subsys_state *css;
4088
3849 /* Skip subsystems not present or not in this hierarchy */ 4089 /* Skip subsystems not present or not in this hierarchy */
3850 if (ss == NULL || ss->root != cgrp->root) 4090 if (ss == NULL || ss->root != cgrp->root)
3851 continue; 4091 continue;
4092
3852 css = cgrp->subsys[ss->subsys_id]; 4093 css = cgrp->subsys[ss->subsys_id];
3853 /* When called from check_for_release() it's possible 4094 /*
4095 * When called from check_for_release() it's possible
3854 * that by this point the cgroup has been removed 4096 * that by this point the cgroup has been removed
3855 * and the css deleted. But a false-positive doesn't 4097 * and the css deleted. But a false-positive doesn't
3856 * matter, since it can only happen if the cgroup 4098 * matter, since it can only happen if the cgroup
3857 * has been deleted and hence no longer needs the 4099 * has been deleted and hence no longer needs the
3858 * release agent to be called anyway. */ 4100 * release agent to be called anyway.
3859 if (css && (atomic_read(&css->refcnt) > 1)) 4101 */
4102 if (css && css_refcnt(css) > 1)
3860 return 1; 4103 return 1;
3861 } 4104 }
3862 return 0; 4105 return 0;
@@ -3866,51 +4109,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3866 * Atomically mark all (or else none) of the cgroup's CSS objects as 4109 * Atomically mark all (or else none) of the cgroup's CSS objects as
3867 * CSS_REMOVED. Return true on success, or false if the cgroup has 4110 * CSS_REMOVED. Return true on success, or false if the cgroup has
3868 * busy subsystems. Call with cgroup_mutex held 4111 * busy subsystems. Call with cgroup_mutex held
4112 *
4113 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4114 * not, cgroup removal behaves differently.
4115 *
4116 * If clear is set, css refcnt for the subsystem should be zero before
4117 * cgroup removal can be committed. This is implemented by
4118 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4119 * called multiple times until all css refcnts reach zero and is allowed to
4120 * veto removal on any invocation. This behavior is deprecated and will be
4121 * removed as soon as the existing user (memcg) is updated.
4122 *
4123 * If clear is not set, each css holds an extra reference to the cgroup's
4124 * dentry and cgroup removal proceeds regardless of css refs.
4125 * ->pre_destroy() will be called at least once and is not allowed to fail.
4126 * On the last put of each css, whenever that may be, the extra dentry ref
4127 * is put so that dentry destruction happens only after all css's are
4128 * released.
3869 */ 4129 */
3870
3871static int cgroup_clear_css_refs(struct cgroup *cgrp) 4130static int cgroup_clear_css_refs(struct cgroup *cgrp)
3872{ 4131{
3873 struct cgroup_subsys *ss; 4132 struct cgroup_subsys *ss;
3874 unsigned long flags; 4133 unsigned long flags;
3875 bool failed = false; 4134 bool failed = false;
4135
3876 local_irq_save(flags); 4136 local_irq_save(flags);
4137
4138 /*
4139 * Block new css_tryget() by deactivating refcnt. If all refcnts
4140 * for subsystems w/ clear_css_refs set were 1 at the moment of
4141 * deactivation, we succeeded.
4142 */
3877 for_each_subsys(cgrp->root, ss) { 4143 for_each_subsys(cgrp->root, ss) {
3878 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4144 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3879 int refcnt; 4145
3880 while (1) { 4146 WARN_ON(atomic_read(&css->refcnt) < 0);
3881 /* We can only remove a CSS with a refcnt==1 */ 4147 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
3882 refcnt = atomic_read(&css->refcnt); 4148
3883 if (refcnt > 1) { 4149 if (ss->__DEPRECATED_clear_css_refs)
3884 failed = true; 4150 failed |= css_refcnt(css) != 1;
3885 goto done;
3886 }
3887 BUG_ON(!refcnt);
3888 /*
3889 * Drop the refcnt to 0 while we check other
3890 * subsystems. This will cause any racing
3891 * css_tryget() to spin until we set the
3892 * CSS_REMOVED bits or abort
3893 */
3894 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3895 break;
3896 cpu_relax();
3897 }
3898 } 4151 }
3899 done: 4152
4153 /*
4154 * If succeeded, set REMOVED and put all the base refs; otherwise,
4155 * restore refcnts to positive values. Either way, all in-progress
4156 * css_tryget() will be released.
4157 */
3900 for_each_subsys(cgrp->root, ss) { 4158 for_each_subsys(cgrp->root, ss) {
3901 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4159 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3902 if (failed) { 4160
3903 /* 4161 if (!failed) {
3904 * Restore old refcnt if we previously managed
3905 * to clear it from 1 to 0
3906 */
3907 if (!atomic_read(&css->refcnt))
3908 atomic_set(&css->refcnt, 1);
3909 } else {
3910 /* Commit the fact that the CSS is removed */
3911 set_bit(CSS_REMOVED, &css->flags); 4162 set_bit(CSS_REMOVED, &css->flags);
4163 css_put(css);
4164 } else {
4165 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
3912 } 4166 }
3913 } 4167 }
4168
3914 local_irq_restore(flags); 4169 local_irq_restore(flags);
3915 return !failed; 4170 return !failed;
3916} 4171}
@@ -3995,6 +4250,8 @@ again:
3995 list_del_init(&cgrp->sibling); 4250 list_del_init(&cgrp->sibling);
3996 cgroup_unlock_hierarchy(cgrp->root); 4251 cgroup_unlock_hierarchy(cgrp->root);
3997 4252
4253 list_del_init(&cgrp->allcg_node);
4254
3998 d = dget(cgrp->dentry); 4255 d = dget(cgrp->dentry);
3999 4256
4000 cgroup_d_remove_dir(d); 4257 cgroup_d_remove_dir(d);
@@ -4021,12 +4278,29 @@ again:
4021 return 0; 4278 return 0;
4022} 4279}
4023 4280
4281static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4282{
4283 INIT_LIST_HEAD(&ss->cftsets);
4284
4285 /*
4286 * base_cftset is embedded in subsys itself, no need to worry about
4287 * deregistration.
4288 */
4289 if (ss->base_cftypes) {
4290 ss->base_cftset.cfts = ss->base_cftypes;
4291 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4292 }
4293}
4294
4024static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4295static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4025{ 4296{
4026 struct cgroup_subsys_state *css; 4297 struct cgroup_subsys_state *css;
4027 4298
4028 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4299 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4029 4300
4301 /* init base cftset */
4302 cgroup_init_cftsets(ss);
4303
4030 /* Create the top cgroup state for this subsystem */ 4304 /* Create the top cgroup state for this subsystem */
4031 list_add(&ss->sibling, &rootnode.subsys_list); 4305 list_add(&ss->sibling, &rootnode.subsys_list);
4032 ss->root = &rootnode; 4306 ss->root = &rootnode;
@@ -4096,6 +4370,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4096 return 0; 4370 return 0;
4097 } 4371 }
4098 4372
4373 /* init base cftset */
4374 cgroup_init_cftsets(ss);
4375
4099 /* 4376 /*
4100 * need to register a subsys id before anything else - for example, 4377 * need to register a subsys id before anything else - for example,
4101 * init_cgroup_css needs it. 4378 * init_cgroup_css needs it.
@@ -4685,21 +4962,43 @@ static void check_for_release(struct cgroup *cgrp)
4685} 4962}
4686 4963
4687/* Caller must verify that the css is not for root cgroup */ 4964/* Caller must verify that the css is not for root cgroup */
4688void __css_put(struct cgroup_subsys_state *css, int count) 4965bool __css_tryget(struct cgroup_subsys_state *css)
4966{
4967 do {
4968 int v = css_refcnt(css);
4969
4970 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
4971 return true;
4972 cpu_relax();
4973 } while (!test_bit(CSS_REMOVED, &css->flags));
4974
4975 return false;
4976}
4977EXPORT_SYMBOL_GPL(__css_tryget);
4978
4979/* Caller must verify that the css is not for root cgroup */
4980void __css_put(struct cgroup_subsys_state *css)
4689{ 4981{
4690 struct cgroup *cgrp = css->cgroup; 4982 struct cgroup *cgrp = css->cgroup;
4691 int val; 4983 int v;
4984
4692 rcu_read_lock(); 4985 rcu_read_lock();
4693 val = atomic_sub_return(count, &css->refcnt); 4986 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4694 if (val == 1) { 4987
4988 switch (v) {
4989 case 1:
4695 if (notify_on_release(cgrp)) { 4990 if (notify_on_release(cgrp)) {
4696 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4991 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4697 check_for_release(cgrp); 4992 check_for_release(cgrp);
4698 } 4993 }
4699 cgroup_wakeup_rmdir_waiter(cgrp); 4994 cgroup_wakeup_rmdir_waiter(cgrp);
4995 break;
4996 case 0:
4997 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4998 schedule_work(&css->dput_work);
4999 break;
4700 } 5000 }
4701 rcu_read_unlock(); 5001 rcu_read_unlock();
4702 WARN_ON_ONCE(val < 1);
4703} 5002}
4704EXPORT_SYMBOL_GPL(__css_put); 5003EXPORT_SYMBOL_GPL(__css_put);
4705 5004
@@ -4818,7 +5117,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4818 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5117 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4819 * it's unchanged until freed. 5118 * it's unchanged until freed.
4820 */ 5119 */
4821 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5120 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4822 5121
4823 if (cssid) 5122 if (cssid)
4824 return cssid->id; 5123 return cssid->id;
@@ -4830,7 +5129,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4830{ 5129{
4831 struct css_id *cssid; 5130 struct css_id *cssid;
4832 5131
4833 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5132 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4834 5133
4835 if (cssid) 5134 if (cssid)
4836 return cssid->depth; 5135 return cssid->depth;
@@ -4844,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth);
4844 * @root: the css supporsed to be an ancestor of the child. 5143 * @root: the css supporsed to be an ancestor of the child.
4845 * 5144 *
4846 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 5145 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4847 * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). 5146 * this function reads css->id, the caller must hold rcu_read_lock().
4848 * But, considering usual usage, the csses should be valid objects after test. 5147 * But, considering usual usage, the csses should be valid objects after test.
4849 * Assuming that the caller will do some action to the child if this returns 5148 * Assuming that the caller will do some action to the child if this returns
4850 * returns true, the caller must take "child";s reference count. 5149 * returns true, the caller must take "child";s reference count.
@@ -4856,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4856{ 5155{
4857 struct css_id *child_id; 5156 struct css_id *child_id;
4858 struct css_id *root_id; 5157 struct css_id *root_id;
4859 bool ret = true;
4860 5158
4861 rcu_read_lock();
4862 child_id = rcu_dereference(child->id); 5159 child_id = rcu_dereference(child->id);
5160 if (!child_id)
5161 return false;
4863 root_id = rcu_dereference(root->id); 5162 root_id = rcu_dereference(root->id);
4864 if (!child_id 5163 if (!root_id)
4865 || !root_id 5164 return false;
4866 || (child_id->depth < root_id->depth) 5165 if (child_id->depth < root_id->depth)
4867 || (child_id->stack[root_id->depth] != root_id->id)) 5166 return false;
4868 ret = false; 5167 if (child_id->stack[root_id->depth] != root_id->id)
4869 rcu_read_unlock(); 5168 return false;
4870 return ret; 5169 return true;
4871} 5170}
4872 5171
4873void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5172void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
@@ -5211,19 +5510,15 @@ static struct cftype debug_files[] = {
5211 .name = "releasable", 5510 .name = "releasable",
5212 .read_u64 = releasable_read, 5511 .read_u64 = releasable_read,
5213 }, 5512 },
5214};
5215 5513
5216static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5514 { } /* terminate */
5217{ 5515};
5218 return cgroup_add_files(cont, ss, debug_files,
5219 ARRAY_SIZE(debug_files));
5220}
5221 5516
5222struct cgroup_subsys debug_subsys = { 5517struct cgroup_subsys debug_subsys = {
5223 .name = "debug", 5518 .name = "debug",
5224 .create = debug_create, 5519 .create = debug_create,
5225 .destroy = debug_destroy, 5520 .destroy = debug_destroy,
5226 .populate = debug_populate,
5227 .subsys_id = debug_subsys_id, 5521 .subsys_id = debug_subsys_id,
5522 .base_cftypes = debug_files,
5228}; 5523};
5229#endif /* CONFIG_CGROUP_DEBUG */ 5524#endif /* CONFIG_CGROUP_DEBUG */