Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  564
 1 file changed, 424 insertions(+), 140 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c8329b0c257..a0c6af34d50 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
 
 #include <linux/atomic.h>
 
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS          INT_MIN
+
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 131 /* A list running through the active hierarchies */
128 struct list_head root_list; 132 struct list_head root_list;
129 133
134 /* All cgroups on this root, cgroup_mutex protected */
135 struct list_head allcg_list;
136
130 /* Hierarchy-specific flags */ 137 /* Hierarchy-specific flags */
131 unsigned long flags; 138 unsigned long flags;
132 139
@@ -145,6 +152,15 @@ struct cgroupfs_root {
 static struct cgroupfs_root rootnode;
 
 /*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+        struct list_head node;
+        struct dentry *dentry;
+        struct cftype *type;
+};
+
+/*
  * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
  * cgroup_subsys->use_id != 0.
  */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+        int v = atomic_read(&css->refcnt);
+
+        return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+        return dentry->d_fsdata;
+}
+
+static inline struct cfent *__d_cfe(struct dentry *dentry)
+{
+        return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+        return __d_cfe(dentry)->type;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
         struct cgroup_subsys *ss;
         int ret = 0;
 
-        for_each_subsys(cgrp->root, ss)
-                if (ss->pre_destroy) {
-                        ret = ss->pre_destroy(cgrp);
-                        if (ret)
-                                break;
+        for_each_subsys(cgrp->root, ss) {
+                if (!ss->pre_destroy)
+                        continue;
+
+                ret = ss->pre_destroy(cgrp);
+                if (ret) {
+                        /* ->pre_destroy() failure is being deprecated */
+                        WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+                        break;
                 }
+        }
 
         return ret;
 }
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 BUG_ON(!list_empty(&cgrp->pidlists));
 
                 kfree_rcu(cgrp, rcu_head);
+        } else {
+                struct cfent *cfe = __d_cfe(dentry);
+                struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+                WARN_ONCE(!list_empty(&cfe->node) &&
+                          cgrp != &cgrp->root->top_cgroup,
+                          "cfe still linked for %s\n", cfe->type->name);
+                kfree(cfe);
         }
         iput(inode);
 }
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)
         dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
-        struct list_head *node;
+        struct cfent *cfe;
 
-        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-        spin_lock(&dentry->d_lock);
-        node = dentry->d_subdirs.next;
-        while (node != &dentry->d_subdirs) {
-                struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+        lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+        lockdep_assert_held(&cgroup_mutex);
+
+        list_for_each_entry(cfe, &cgrp->files, node) {
+                struct dentry *d = cfe->dentry;
 
-                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-                list_del_init(node);
-                if (d->d_inode) {
-                        /* This should never be called on a cgroup
-                         * directory with child cgroups */
-                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
-                        dget_dlock(d);
-                        spin_unlock(&d->d_lock);
-                        spin_unlock(&dentry->d_lock);
-                        d_delete(d);
-                        simple_unlink(dentry->d_inode, d);
-                        dput(d);
-                        spin_lock(&dentry->d_lock);
-                } else
-                        spin_unlock(&d->d_lock);
-                node = dentry->d_subdirs.next;
+                if (cft && cfe->type != cft)
+                        continue;
+
+                dget(d);
+                d_delete(d);
+                simple_unlink(d->d_inode, d);
+                list_del_init(&cfe->node);
+                dput(d);
+
+                return 0;
         }
-        spin_unlock(&dentry->d_lock);
+        return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+        struct cgroup *cgrp = __d_cgrp(dir);
+
+        while (!list_empty(&cgrp->files))
+                cgroup_rm_file(cgrp, NULL);
 }
 
 /*
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         if (ret)
                 goto out_unlock;
 
+        /* See feature-removal-schedule.txt */
+        if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+                pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+                           task_tgid_nr(current), current->comm);
+
         /* Don't allow flags or name to change at remount */
         if (opts.flags != root->flags ||
             (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
                 goto out_unlock;
         }
 
-        /* (re)populate subsystem files */
+        /* clear out any existing files and repopulate subsystem files */
+        cgroup_clear_directory(cgrp->dentry);
         cgroup_populate_dir(cgrp);
 
         if (opts.release_agent)
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
         INIT_LIST_HEAD(&cgrp->sibling);
         INIT_LIST_HEAD(&cgrp->children);
+        INIT_LIST_HEAD(&cgrp->files);
         INIT_LIST_HEAD(&cgrp->css_sets);
         INIT_LIST_HEAD(&cgrp->release_list);
         INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
         struct cgroup *cgrp = &root->top_cgroup;
+
         INIT_LIST_HEAD(&root->subsys_list);
         INIT_LIST_HEAD(&root->root_list);
+        INIT_LIST_HEAD(&root->allcg_list);
         root->number_of_cgroups = 1;
         cgrp->root = root;
         cgrp->top_cgroup = cgrp;
+        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
         init_cgroup_housekeeping(cgrp);
 }
 
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {
 
 static struct kobject *cgroup_kobj;
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-        return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-        return dentry->d_fsdata;
-}
-
 /**
  * cgroup_path - generate the path of a cgroup
  * @cgrp: the cgroup in question
@@ -2172,6 +2226,18 @@ retry_find_task:
 
         if (threadgroup)
                 tsk = tsk->group_leader;
+
+        /*
+         * Workqueue threads may acquire PF_THREAD_BOUND and become
+         * trapped in a cpuset, or RT worker may be born in a cgroup
+         * with no rt_runtime allocated. Just say no.
+         */
+        if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+                ret = -EINVAL;
+                rcu_read_unlock();
+                goto out_unlock_cgroup;
+        }
+
         get_task_struct(tsk);
         rcu_read_unlock();
 
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
         return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-                    struct cgroup_subsys *subsys,
-                    const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+                           const struct cftype *cft)
 {
         struct dentry *dir = cgrp->dentry;
+        struct cgroup *parent = __d_cgrp(dir);
         struct dentry *dentry;
+        struct cfent *cfe;
         int error;
         umode_t mode;
-
         char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+        /* does @cft->flags tell us to skip creation on @cgrp? */
+        if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+                return 0;
+        if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+                return 0;
+
         if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
                 strcpy(name, subsys->name);
                 strcat(name, ".");
         }
         strcat(name, cft->name);
+
         BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+        cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+        if (!cfe)
+                return -ENOMEM;
+
         dentry = lookup_one_len(name, dir, strlen(name));
-        if (!IS_ERR(dentry)) {
-                mode = cgroup_file_mode(cft);
-                error = cgroup_create_file(dentry, mode | S_IFREG,
-                                           cgrp->root->sb);
-                if (!error)
-                        dentry->d_fsdata = (void *)cft;
-                dput(dentry);
-        } else
+        if (IS_ERR(dentry)) {
                 error = PTR_ERR(dentry);
+                goto out;
+        }
+
+        mode = cgroup_file_mode(cft);
+        error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+        if (!error) {
+                cfe->type = (void *)cft;
+                cfe->dentry = dentry;
+                dentry->d_fsdata = cfe;
+                list_add_tail(&cfe->node, &parent->files);
+                cfe = NULL;
+        }
+        dput(dentry);
+out:
+        kfree(cfe);
         return error;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-                     struct cgroup_subsys *subsys,
-                     const struct cftype cft[],
-                     int count)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+                              const struct cftype cfts[], bool is_add)
 {
-        int i, err;
-        for (i = 0; i < count; i++) {
-                err = cgroup_add_file(cgrp, subsys, &cft[i]);
-                if (err)
-                        return err;
+        const struct cftype *cft;
+        int err, ret = 0;
+
+        for (cft = cfts; cft->name[0] != '\0'; cft++) {
+                if (is_add)
+                        err = cgroup_add_file(cgrp, subsys, cft);
+                else
+                        err = cgroup_rm_file(cgrp, cft);
+                if (err) {
+                        pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+                                   is_add ? "add" : "remove", cft->name, err);
+                        ret = err;
+                }
+        }
+        return ret;
+}
+
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+        __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+        /*
+         * Thanks to the entanglement with vfs inode locking, we can't walk
+         * the existing cgroups under cgroup_mutex and create files.
+         * Instead, we increment reference on all cgroups and build list of
+         * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
+         * exclusive access to the field.
+         */
+        mutex_lock(&cgroup_cft_mutex);
+        mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+                               const struct cftype *cfts, bool is_add)
+        __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+        LIST_HEAD(pending);
+        struct cgroup *cgrp, *n;
+
+        /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+        if (cfts && ss->root != &rootnode) {
+                list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+                        dget(cgrp->dentry);
+                        list_add_tail(&cgrp->cft_q_node, &pending);
+                }
+        }
+
+        mutex_unlock(&cgroup_mutex);
+
+        /*
+         * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
+         * files for all cgroups which were created before.
+         */
+        list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+                struct inode *inode = cgrp->dentry->d_inode;
+
+                mutex_lock(&inode->i_mutex);
+                mutex_lock(&cgroup_mutex);
+                if (!cgroup_is_removed(cgrp))
+                        cgroup_addrm_files(cgrp, ss, cfts, is_add);
+                mutex_unlock(&cgroup_mutex);
+                mutex_unlock(&inode->i_mutex);
+
+                list_del_init(&cgrp->cft_q_node);
+                dput(cgrp->dentry);
         }
+
+        mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+        struct cftype_set *set;
+
+        set = kzalloc(sizeof(*set), GFP_KERNEL);
+        if (!set)
+                return -ENOMEM;
+
+        cgroup_cfts_prepare();
+        set->cfts = cfts;
+        list_add_tail(&set->node, &ss->cftsets);
+        cgroup_cfts_commit(ss, cfts, true);
+
         return 0;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_files);
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss. Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either. This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+        struct cftype_set *set;
+
+        cgroup_cfts_prepare();
+
+        list_for_each_entry(set, &ss->cftsets, node) {
+                if (set->cfts == cfts) {
+                        list_del_init(&set->node);
+                        cgroup_cfts_commit(ss, cfts, false);
+                        return 0;
+                }
+        }
+
+        cgroup_cfts_commit(ss, NULL, false);
+        return -ENOENT;
+}
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {
                 .read_u64 = cgroup_clone_children_read,
                 .write_u64 = cgroup_clone_children_write,
         },
-};
-
-static struct cftype cft_release_agent = {
-        .name = "release_agent",
-        .read_seq_string = cgroup_release_agent_show,
-        .write_string = cgroup_release_agent_write,
-        .max_write_len = PATH_MAX,
+        {
+                .name = "release_agent",
+                .flags = CFTYPE_ONLY_ON_ROOT,
+                .read_seq_string = cgroup_release_agent_show,
+                .write_string = cgroup_release_agent_write,
+                .max_write_len = PATH_MAX,
+        },
+        { }     /* terminate */
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
         int err;
         struct cgroup_subsys *ss;
 
-        /* First clear out any existing files */
-        cgroup_clear_directory(cgrp->dentry);
-
-        err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+        err = cgroup_addrm_files(cgrp, NULL, files, true);
         if (err < 0)
                 return err;
 
-        if (cgrp == cgrp->top_cgroup) {
-                if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
-                        return err;
-        }
-
+        /* process cftsets of each subsystem */
         for_each_subsys(cgrp->root, ss) {
-                if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-                        return err;
+                struct cftype_set *set;
+
+                list_for_each_entry(set, &ss->cftsets, node)
+                        cgroup_addrm_files(cgrp, ss, set->cfts, true);
         }
+
         /* This cgroup is ready now */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
         return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+        struct cgroup_subsys_state *css =
+                container_of(work, struct cgroup_subsys_state, dput_work);
+
+        dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
                             struct cgroup_subsys *ss,
                             struct cgroup *cgrp)
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
         set_bit(CSS_ROOT, &css->flags);
         BUG_ON(cgrp->subsys[ss->subsys_id]);
         cgrp->subsys[ss->subsys_id] = css;
+
+        /*
+         * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+         * which is put on the last css_put(). dput() requires process
+         * context, which css_put() may be called without. @css->dput_work
+         * will be used to invoke dput() asynchronously from css_put().
+         */
+        INIT_WORK(&css->dput_work, css_dput_fn);
+        if (ss->__DEPRECATED_clear_css_refs)
+                set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         if (err < 0)
                 goto err_remove;
 
+        /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+        for_each_subsys(root, ss)
+                if (!ss->__DEPRECATED_clear_css_refs)
+                        dget(dentry);
+
         /* The cgroup directory was pre-locked for us */
         BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
+        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
         err = cgroup_populate_dir(cgrp);
         /* If err < 0, we have a half-filled directory - oh well ;) */
 
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
 static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
-        /* Check the reference count on each subsystem. Since we
-         * already established that there are no tasks in the
-         * cgroup, if the css refcount is also 1, then there should
-         * be no outstanding references, so the subsystem is safe to
-         * destroy. We scan across all subsystems rather than using
-         * the per-hierarchy linked list of mounted subsystems since
-         * we can be called via check_for_release() with no
-         * synchronization other than RCU, and the subsystem linked
-         * list isn't RCU-safe */
         int i;
+
         /*
          * We won't need to lock the subsys array, because the subsystems
          * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
                 struct cgroup_subsys_state *css;
+
                 /* Skip subsystems not present or not in this hierarchy */
                 if (ss == NULL || ss->root != cgrp->root)
                         continue;
+
                 css = cgrp->subsys[ss->subsys_id];
-                /* When called from check_for_release() it's possible
+                /*
+                 * When called from check_for_release() it's possible
                  * that by this point the cgroup has been removed
                  * and the css deleted. But a false-positive doesn't
                  * matter, since it can only happen if the cgroup
                  * has been deleted and hence no longer needs the
-                 * release agent to be called anyway. */
-                if (css && (atomic_read(&css->refcnt) > 1))
+                 * release agent to be called anyway.
+                 */
+                if (css && css_refcnt(css) > 1)
                         return 1;
         }
         return 0;
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed. This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation. This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
         struct cgroup_subsys *ss;
         unsigned long flags;
         bool failed = false;
+
         local_irq_save(flags);
+
+        /*
+         * Block new css_tryget() by deactivating refcnt. If all refcnts
+         * for subsystems w/ clear_css_refs set were 1 at the moment of
+         * deactivation, we succeeded.
+         */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-                int refcnt;
-                while (1) {
-                        /* We can only remove a CSS with a refcnt==1 */
-                        refcnt = atomic_read(&css->refcnt);
-                        if (refcnt > 1) {
-                                failed = true;
-                                goto done;
-                        }
-                        BUG_ON(!refcnt);
-                        /*
-                         * Drop the refcnt to 0 while we check other
-                         * subsystems. This will cause any racing
-                         * css_tryget() to spin until we set the
-                         * CSS_REMOVED bits or abort
-                         */
-                        if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
-                                break;
-                        cpu_relax();
-                }
+
+                WARN_ON(atomic_read(&css->refcnt) < 0);
+                atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+
+                if (ss->__DEPRECATED_clear_css_refs)
+                        failed |= css_refcnt(css) != 1;
         }
- done:
+
+        /*
+         * If succeeded, set REMOVED and put all the base refs; otherwise,
+         * restore refcnts to positive values. Either way, all in-progress
+         * css_tryget() will be released.
+         */
         for_each_subsys(cgrp->root, ss) {
                 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-                if (failed) {
-                        /*
-                         * Restore old refcnt if we previously managed
-                         * to clear it from 1 to 0
-                         */
-                        if (!atomic_read(&css->refcnt))
-                                atomic_set(&css->refcnt, 1);
-                } else {
-                        /* Commit the fact that the CSS is removed */
+
+                if (!failed) {
                         set_bit(CSS_REMOVED, &css->flags);
+                        css_put(css);
+                } else {
+                        atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
                 }
         }
+
         local_irq_restore(flags);
         return !failed;
 }
@@ -3995,6 +4241,8 @@ again:
         list_del_init(&cgrp->sibling);
         cgroup_unlock_hierarchy(cgrp->root);
 
+        list_del_init(&cgrp->allcg_node);
+
         d = dget(cgrp->dentry);
 
         cgroup_d_remove_dir(d);
@@ -4021,12 +4269,29 @@ again:
         return 0;
 }
 
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+        INIT_LIST_HEAD(&ss->cftsets);
+
+        /*
+         * base_cftset is embedded in subsys itself, no need to worry about
+         * deregistration.
+         */
+        if (ss->base_cftypes) {
+                ss->base_cftset.cfts = ss->base_cftypes;
+                list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+        }
+}
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
         struct cgroup_subsys_state *css;
 
         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+        /* init base cftset */
+        cgroup_init_cftsets(ss);
+
         /* Create the top cgroup state for this subsystem */
         list_add(&ss->sibling, &rootnode.subsys_list);
         ss->root = &rootnode;
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
                 return 0;
         }
 
+        /* init base cftset */
+        cgroup_init_cftsets(ss);
+
         /*
         * need to register a subsys id before anything else - for example,
         * init_cgroup_css needs it.
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)
 }
 
 /* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+        do {
+                int v = css_refcnt(css);
+
+                if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+                        return true;
+                cpu_relax();
+        } while (!test_bit(CSS_REMOVED, &css->flags));
+
+        return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
 {
         struct cgroup *cgrp = css->cgroup;
-        int val;
+
         rcu_read_lock();
-        val = atomic_sub_return(count, &css->refcnt);
-        if (val == 1) {
+        atomic_dec(&css->refcnt);
+        switch (css_refcnt(css)) {
+        case 1:
                 if (notify_on_release(cgrp)) {
                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
                         check_for_release(cgrp);
                 }
                 cgroup_wakeup_rmdir_waiter(cgrp);
+                break;
+        case 0:
+                if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+                        schedule_work(&css->dput_work);
+                break;
         }
         rcu_read_unlock();
-        WARN_ON_ONCE(val < 1);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
          * on this or this is under rcu_read_lock(). Once css->id is allocated,
          * it's unchanged until freed.
          */
-        cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+        cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
         if (cssid)
                 return cssid->id;
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
         struct css_id *cssid;
 
-        cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+        cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
         if (cssid)
                 return cssid->depth;
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = {
                 .name = "releasable",
                 .read_u64 = releasable_read,
         },
-};
 
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-        return cgroup_add_files(cont, ss, debug_files,
-                                ARRAY_SIZE(debug_files));
-}
+        { }     /* terminate */
+};
 
 struct cgroup_subsys debug_subsys = {
         .name = "debug",
         .create = debug_create,
         .destroy = debug_destroy,
-        .populate = debug_populate,
         .subsys_id = debug_subsys_id,
+        .base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
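
Usage note (editorial addition, not part of the commit above): the sketch below shows how a controller could plug into the cftype interface this diff introduces. The names my_value_read, my_files, my_subsys and my_subsys_id are hypothetical placeholders, not symbols from the patch; the pattern simply mirrors the debug_subsys conversion at the end of the diff, either embedding the array through .base_cftypes at subsystem definition time or registering it later with cgroup_add_cftypes().

/* hypothetical read handler; signature matches struct cftype's read_u64 */
static u64 my_value_read(struct cgroup *cgrp, struct cftype *cft)
{
        return 0;       /* placeholder value */
}

/* array is terminated by an entry with an empty name, as cgroup_addrm_files() expects */
static struct cftype my_files[] = {
        {
                .name = "my.value",
                .read_u64 = my_value_read,
        },
        { }     /* terminate */
};

/* option 1: built-in files, picked up automatically via cgroup_init_cftsets() */
struct cgroup_subsys my_subsys = {
        .name = "my",
        .subsys_id = my_subsys_id,      /* hypothetical id */
        .base_cftypes = my_files,
};

/* option 2: add (and later remove) a cftype array at any time */
static int __init my_extra_files_init(void)
{
        return cgroup_add_cftypes(&my_subsys, my_files);
}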