author	Linus Torvalds <torvalds@linux-foundation.org>	2013-04-29 22:14:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-04-29 22:14:20 -0400
commit	191a712090bb8a10e6f129360eeed2d68f3d4c9a (patch)
tree	17e2d6c27fb8a7c3a61828fbcc7c343a4966a0a9 /kernel/cgroup.c
parent	46d9be3e5eb01f71fc02653755d970247174b400 (diff)
parent	2a0010af17b1739ef8ea8cf02647a127241ee674 (diff)
Merge branch 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - Fixes and a lot of cleanups. Locking cleanup is finally complete.
   cgroup_mutex is no longer exposed to individual controllers, which
   used to cause nasty deadlock issues. Li fixed and cleaned up quite a
   bit, including long-standing issues like the racy cgroup_path().

 - device cgroup now supports proper hierarchy thanks to Aristeu.

 - perf_event cgroup now supports proper hierarchy.

 - A new mount option "__DEVEL__sane_behavior" is added. As indicated
   by the name, this option is to be used for development only at this
   point and generates a warning message when used. Unfortunately, the
   cgroup interface currently has too many breakages and
   inconsistencies to implement a consistent and unified hierarchy on
   top. The new flag is used to collect the behavior changes which are
   necessary to implement a consistent unified hierarchy. It's likely
   that this flag won't be used verbatim when it becomes ready but will
   be enabled implicitly along with the unified hierarchy.

   The option currently disables some of the broken behaviors in the
   cgroup core and also the .use_hierarchy switch in memcg (will be
   routed through -mm), which can be used to create very unusual
   hierarchies where nesting is partially honored. It will also be used
   to implement hierarchy support for blk-throttle, which would
   otherwise be impossible without introducing a fully separate set of
   control knobs.

   This is essentially versioning of the interface, which isn't very
   nice, but at this point I can't see any other option that would
   allow keeping the interface the same while moving towards hierarchy
   behavior that is at least somewhat sane. The planned unified
   hierarchy is likely to require some level of adaptation from
   userland anyway, so I think it'd be best to take the chance and
   update the interface such that it's supportable in the long term.

   Maintaining the existing interface does complicate the cgroup core,
   but it shouldn't put too much strain on individual controllers and I
   think it'd be manageable for the foreseeable future. Maybe we'll be
   able to drop it in a decade.

Fix up conflicts (including a semantic one adding a new #include to ppc
that was uncovered by the header file changes) as per Tejun.

* 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (45 commits)
  cpuset: fix compile warning when CONFIG_SMP=n
  cpuset: fix cpu hotplug vs rebuild_sched_domains() race
  cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn()
  cgroup: restore the call to eventfd->poll()
  cgroup: fix use-after-free when umounting cgroupfs
  cgroup: fix broken file xattrs
  devcg: remove parent_cgroup.
  memcg: force use_hierarchy if sane_behavior
  cgroup: remove cgrp->top_cgroup
  cgroup: introduce sane_behavior mount option
  move cgroupfs_root to include/linux/cgroup.h
  cgroup: convert cgroupfs_root flag bits to masks and add CGRP_ prefix
  cgroup: make cgroup_path() not print double slashes
  Revert "cgroup: remove bind() method from cgroup_subsys."
  perf: make perf_event cgroup hierarchical
  cgroup: implement cgroup_is_descendant()
  cgroup: make sure parent won't be destroyed before its children
  cgroup: remove bind() method from cgroup_subsys.
  devcg: remove broken_hierarchy tag
  cgroup: remove cgroup_lock_is_held()
  ...
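For anyone who wants to poke at the new flag, a minimal sketch of mounting a hierarchy with it through mount(2); the mount point and the choice of the cpuset controller are illustrative assumptions, not part of this merge:

	#include <stdio.h>
	#include <sys/mount.h>

	/* roughly: mount -t cgroup -o __DEVEL__sane_behavior,cpuset none /sys/fs/cgroup/test */
	int main(void)
	{
		if (mount("none", "/sys/fs/cgroup/test", "cgroup", 0,
			  "__DEVEL__sane_behavior,cpuset") < 0) {
			perror("mount");	/* e.g. EINVAL when combined with noprefix */
			return 1;
		}
		return 0;
	}

The kernel logs a "proceed at your own risk" warning on such a mount, and remounting with different options is refused, as the hunks below show.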
Diffstat (limited to 'kernel/cgroup.c')
 kernel/cgroup.c | 724
 1 file changed, 299 insertions(+), 425 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1f628bc039f4..eeb7e49946b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
 #include <linux/cred.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
-#include <linux/fs.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
@@ -59,7 +58,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>

 #include <linux/atomic.h>
@@ -83,7 +82,13 @@
  * B happens only through cgroup_show_options() and using cgroup_root_mutex
  * breaks it.
  */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for task_subsys_state_check() */
+#else
 static DEFINE_MUTEX(cgroup_mutex);
+#endif
+
 static DEFINE_MUTEX(cgroup_root_mutex);

 /*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };

-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
- */
-struct cgroupfs_root {
-	struct super_block *sb;
-
-	/*
-	 * The bitmask of subsystems intended to be attached to this
-	 * hierarchy
-	 */
-	unsigned long subsys_mask;
-
-	/* Unique id for this hierarchy. */
-	int hierarchy_id;
-
-	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_mask;
-
-	/* A list running through the attached subsystems */
-	struct list_head subsys_list;
-
-	/* The root cgroup for this hierarchy */
-	struct cgroup top_cgroup;
-
-	/* Tracks how many cgroups are currently defined in hierarchy.*/
-	int number_of_cgroups;
-
-	/* A list running through the active hierarchies */
-	struct list_head root_list;
-
-	/* All cgroups on this root, cgroup_mutex protected */
-	struct list_head allcg_list;
-
-	/* Hierarchy-specific flags */
-	unsigned long flags;
-
-	/* IDs for cgroups in this hierarchy */
-	struct ida cgroup_ida;
-
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
-	/* The name for this hierarchy - may be empty */
-	char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
 	struct list_head node;
 	struct dentry *dentry;
 	struct cftype *type;
+
+	/* file xattrs */
+	struct simple_xattrs xattrs;
 };

 /*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
 #define dummytop (&rootnode.top_cgroup)

+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);

-#ifdef CONFIG_PROVE_LOCKING
-int cgroup_lock_is_held(void)
-{
-	return lockdep_is_held(&cgroup_mutex);
-}
-#else /* #ifdef CONFIG_PROVE_LOCKING */
-int cgroup_lock_is_held(void)
-{
-	return mutex_is_locked(&cgroup_mutex);
-}
-#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
-
-EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
-
 static int css_unbias_refcnt(int refcnt)
 {
 	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
 	return test_bit(CGRP_REMOVED, &cgrp->flags);
 }

-/* bits in struct cgroupfs_root flags field */
-enum {
-	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */
-	ROOT_XATTR,	/* supports extended attributes */
-};
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+	while (cgrp) {
+		if (cgrp == ancestor)
+			return true;
+		cgrp = cgrp->parent;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(cgroup_is_descendant);

 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 	return __d_cfe(dentry)->type;
 }

+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the mutex should be later unlocked. On
+ * failure returns false with no lock held.
+ */
+static bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_mutex);
+	if (cgroup_is_removed(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		return false;
+	}
+	return true;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */

-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
-{
-	mutex_lock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_lock);
-
-/**
- * cgroup_unlock - release lock on cgroup changes
- *
- * Undo the lock taken in a previous cgroup_lock() call.
- */
-void cgroup_unlock(void)
-{
-	mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unlock);
-
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
  * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }

+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+{
+	struct cgroup_name *name;
+
+	name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name)
+		return NULL;
+	strcpy(name->name, dentry->d_name.name);
+	return name;
+}
+
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)
 	mutex_unlock(&cgroup_mutex);

 	/*
+	 * We get a ref to the parent's dentry, and put the ref when
+	 * this cgroup is being freed, so it's guaranteed that the
+	 * parent won't be destroyed before its children.
+	 */
+	dput(cgrp->parent->dentry);
+
+	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+
+	/*
 	 * Drop the active superblock reference that we took when we
-	 * created the cgroup
+	 * created the cgroup. This will free cgrp->root, if we are
+	 * holding the last reference to @sb.
 	 */
 	deactivate_super(cgrp->root->sb);

@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)

 	simple_xattrs_free(&cgrp->xattrs);

-	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+	kfree(rcu_dereference_raw(cgrp->name));
 	kfree(cgrp);
 }

@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
 		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
-		struct cftype *cft = cfe->type;

 		WARN_ONCE(!list_empty(&cfe->node) &&
 			  cgrp != &cgrp->root->top_cgroup,
 			  "cfe still linked for %s\n", cfe->type->name);
+		simple_xattrs_free(&cfe->xattrs);
 		kfree(cfe);
-		simple_xattrs_free(&cft->xattrs);
 	}
 	iput(inode);
 }
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
-	if (test_bit(ROOT_NOPREFIX, &root->flags))
+	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+		seq_puts(seq, ",sane_behavior");
+	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
-	if (test_bit(ROOT_XATTR, &root->flags))
+	if (root->flags & CGRP_ROOT_XATTR)
 		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			all_ss = true;
 			continue;
 		}
+		if (!strcmp(token, "__DEVEL__sane_behavior")) {
+			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+			continue;
+		}
 		if (!strcmp(token, "noprefix")) {
-			set_bit(ROOT_NOPREFIX, &opts->flags);
+			opts->flags |= CGRP_ROOT_NOPREFIX;
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
-			set_bit(ROOT_XATTR, &opts->flags);
+			opts->flags |= CGRP_ROOT_XATTR;
 			continue;
 		}
 		if (!strncmp(token, "release_agent=", 14)) {
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)

 	/* Consistency checks */

+	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+		if (opts->flags & CGRP_ROOT_NOPREFIX) {
+			pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+			return -EINVAL;
+		}
+
+		if (opts->cpuset_clone_children) {
+			pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+			return -EINVAL;
+		}
+	}
+
 	/*
 	 * Option noprefix was introduced just for backward compatibility
 	 * with the old cpuset, so we allow noprefix only if mounting just
 	 * the cpuset subsystem.
 	 */
-	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
-	    (opts->subsys_mask & mask))
+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
 		return -EINVAL;


@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup_sb_opts opts;
 	unsigned long added_mask, removed_mask;

+	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+		pr_err("cgroup: sane_behavior: remount is not allowed\n");
+		return -EINVAL;
+	}
+
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
-	cgrp->top_cgroup = cgrp;
+	cgrp->name = &root_cgroup_name;
 	init_cgroup_housekeeping(cgrp);
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
@@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+
+		if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
+		    root->flags != opts.flags) {
+			pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+			ret = -EINVAL;
+			goto drop_new_super;
+		}
+
 		/* no subsys rebinding, so refcounts don't change */
 		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
@@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj;
  * @buf: the buffer to write the path into
  * @buflen: the length of the buffer
  *
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
-	struct dentry *dentry = cgrp->dentry;
+	int ret = -ENAMETOOLONG;
 	char *start;

-	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
-			   "cgroup_path() called without proper locking");
-
-	if (cgrp == dummytop) {
-		/*
-		 * Inactive subsystems have no dentry for their root
-		 * cgroup
-		 */
-		strcpy(buf, "/");
+	if (!cgrp->parent) {
+		if (strlcpy(buf, "/", buflen) >= buflen)
+			return -ENAMETOOLONG;
 		return 0;
 	}

 	start = buf + buflen - 1;
-
 	*start = '\0';
-	for (;;) {
-		int len = dentry->d_name.len;

+	rcu_read_lock();
+	do {
+		const char *name = cgroup_name(cgrp);
+		int len;
+
+		len = strlen(name);
 		if ((start -= len) < buf)
-			return -ENAMETOOLONG;
-		memcpy(start, dentry->d_name.name, len);
-		cgrp = cgrp->parent;
-		if (!cgrp)
-			break;
+			goto out;
+		memcpy(start, name, len);

-		dentry = cgrp->dentry;
-		if (!cgrp->parent)
-			continue;
 		if (--start < buf)
-			return -ENAMETOOLONG;
+			goto out;
 		*start = '/';
-	}
+
+		cgrp = cgrp->parent;
+	} while (cgrp->parent);
+	ret = 0;
 	memmove(buf, start, buf + buflen - start);
-	return 0;
+out:
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_path);

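With the rewrite above, cgroup_path() no longer requires the caller to hold cgroup_mutex or an RCU read lock; it takes rcu_read_lock() internally and walks the RCU-protected cgrp->name chain. A minimal sketch of a caller under the new rules (the function and buffer size here are hypothetical):

	static void example_log_cgroup(struct cgroup *cgrp)
	{
		char buf[256];

		/* only requires @cgrp to stay accessible for the duration */
		if (cgroup_path(cgrp, buf, sizeof(buf)))
			strlcpy(buf, "(path too long)", sizeof(buf));	/* -ENAMETOOLONG */
		pr_info("cgroup path: %s\n", buf);
	}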
@@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
  *
  * Must be called with cgroup_mutex and threadgroup locked.
  */
-static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+static void cgroup_task_migrate(struct cgroup *oldcgrp,
 				struct task_struct *tsk, struct css_set *newcg)
 {
 	struct css_set *oldcg;
@@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 }

 /**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
- *
- * Call with cgroup_mutex and threadgroup locked. May take task_lock of
- * @tsk during call.
- */
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
-	int retval = 0;
-	struct cgroup_subsys *ss, *failed_ss = NULL;
-	struct cgroup *oldcgrp;
-	struct cgroupfs_root *root = cgrp->root;
-	struct cgroup_taskset tset = { };
-	struct css_set *newcg;
-
-	/* @tsk either already exited or can't exit until the end */
-	if (tsk->flags & PF_EXITING)
-		return -ESRCH;
-
-	/* Nothing to do if the task is already in that cgroup */
-	oldcgrp = task_cgroup_from_root(tsk, root);
-	if (cgrp == oldcgrp)
-		return 0;
-
-	tset.single.task = tsk;
-	tset.single.cgrp = oldcgrp;
-
-	for_each_subsys(root, ss) {
-		if (ss->can_attach) {
-			retval = ss->can_attach(cgrp, &tset);
-			if (retval) {
-				/*
-				 * Remember on which subsystem the can_attach()
-				 * failed, so that we only call cancel_attach()
-				 * against the subsystems whose can_attach()
-				 * succeeded. (See below)
-				 */
-				failed_ss = ss;
-				goto out;
-			}
-		}
-	}
-
-	newcg = find_css_set(tsk->cgroups, cgrp);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
-
-	for_each_subsys(root, ss) {
-		if (ss->attach)
-			ss->attach(cgrp, &tset);
-	}
-
-out:
-	if (retval) {
-		for_each_subsys(root, ss) {
-			if (ss == failed_ss)
-				/*
-				 * This subsystem was the one that failed the
-				 * can_attach() check earlier, so we don't need
-				 * to call cancel_attach() against it or any
-				 * remaining subsystems.
-				 */
-				break;
-			if (ss->cancel_attach)
-				ss->cancel_attach(cgrp, &tset);
-		}
-	}
-	return retval;
-}
-
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
-	struct cgroupfs_root *root;
-	int retval = 0;
-
-	cgroup_lock();
-	for_each_active_root(root) {
-		struct cgroup *from_cg = task_cgroup_from_root(from, root);
-
-		retval = cgroup_attach_task(from_cg, tsk);
-		if (retval)
-			break;
-	}
-	cgroup_unlock();
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-/**
- * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
- * @leader: the threadgroup leader task_struct of the group to be attached
+ * @tsk: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
  *
  * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of each thread in leader's threadgroup individually in turn.
+ * task_lock of @tsk or each thread in the threadgroup individually in turn.
  */
-static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+			      bool threadgroup)
 {
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
-	/* guaranteed to be initialized later, but the compiler needs this */
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
-	struct task_struct *tsk;
+	struct task_struct *leader = tsk;
 	struct task_and_cgroup *tc;
 	struct flex_array *group;
 	struct cgroup_taskset tset = { };
@@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * group - group_rwsem prevents new threads from appearing, and if
 	 * threads exit, this will just be an over-estimate.
 	 */
-	group_size = get_nr_threads(leader);
+	if (threadgroup)
+		group_size = get_nr_threads(tsk);
+	else
+		group_size = 1;
 	/* flex_array supports very large thread-groups better than kmalloc. */
 	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
 	if (!group)
 		return -ENOMEM;
 	/* pre-allocate to guarantee space while iterating in rcu read-side. */
-	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
 	if (retval)
 		goto out_free_group_list;

-	tsk = leader;
 	i = 0;
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
+
+		if (!threadgroup)
+			break;
 	} while_each_thread(leader, tsk);
 	rcu_read_unlock();
 	/* remember the number of threads in the array for later. */
@@ -2143,7 +2056,7 @@
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
+		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
 	}
 	/* nothing is sensitive to fork() after this point. */

@@ -2251,17 +2164,42 @@ retry_find_task:
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
-		ret = cgroup_attach_proc(cgrp, tsk);
-	} else
-		ret = cgroup_attach_task(cgrp, tsk);
+	}
+
+	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
 	threadgroup_unlock(tsk);

 	put_task_struct(tsk);
 out_unlock_cgroup:
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 	return ret;
 }

+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	int retval = 0;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+
+		retval = cgroup_attach_task(from_cg, tsk, false);
+		if (retval)
+			break;
+	}
+	mutex_unlock(&cgroup_mutex);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
 	return attach_task_by_pid(cgrp, pid, false);
@@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 	return attach_task_by_pid(cgrp, tgid, true);
 }

-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
- */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
-{
-	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		return false;
-	}
-	return true;
-}
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
-
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
@@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 	return 0;
 }

@@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
 	seq_putc(seq, '\n');
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *seq)
+{
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
 	return 0;
 }

@@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 			 struct inode *new_dir, struct dentry *new_dentry)
 {
+	int ret;
+	struct cgroup_name *name, *old_name;
+	struct cgroup *cgrp;
+
+	/*
+	 * It's convinient to use parent dir's i_mutex to protected
+	 * cgrp->name.
+	 */
+	lockdep_assert_held(&old_dir->i_mutex);
+
 	if (!S_ISDIR(old_dentry->d_inode->i_mode))
 		return -ENOTDIR;
 	if (new_dentry->d_inode)
 		return -EEXIST;
 	if (old_dir != new_dir)
 		return -EIO;
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+	cgrp = __d_cgrp(old_dentry);
+
+	name = cgroup_alloc_name(new_dentry);
+	if (!name)
+		return -ENOMEM;
+
+	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (ret) {
+		kfree(name);
+		return ret;
+	}
+
+	old_name = cgrp->name;
+	rcu_assign_pointer(cgrp->name, name);
+
+	kfree_rcu(old_name, rcu_head);
+	return 0;
 }

 static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
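The rename path above is the usual RCU publish/retire pattern applied to cgrp->name: the new name is allocated and published with rcu_assign_pointer(), and the old one is retired with kfree_rcu() so that lockless readers such as cgroup_path() never see freed memory. Condensed, the core of the hunk is (error handling elided; this restates the code above rather than adding behavior):

	old_name = cgrp->name;			/* stable: parent dir's i_mutex held */
	rcu_assign_pointer(cgrp->name, name);	/* publish to lockless readers */
	kfree_rcu(old_name, rcu_head);		/* free only after a grace period */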
@@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
 	if (S_ISDIR(dentry->d_inode->i_mode))
 		return &__d_cgrp(dentry)->xattrs;
 	else
-		return &__d_cft(dentry)->xattrs;
+		return &__d_cfe(dentry)->xattrs;
 }

 static inline int xattr_enabled(struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-	return test_bit(ROOT_XATTR, &root->flags);
+	return root->flags & CGRP_ROOT_XATTR;
 }

 static bool is_valid_xattr(const char *name)
@@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };

-	simple_xattrs_init(&cft->xattrs);
-
-	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
 	}
@@ -2753,6 +2705,7 @@
 		cfe->type = (void *)cft;
 		cfe->dentry = dentry;
 		dentry->d_fsdata = cfe;
+		simple_xattrs_init(&cfe->xattrs);
 		list_add_tail(&cfe->node, &parent->files);
 		cfe = NULL;
 	}
@@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,

 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+			continue;
 		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 			continue;
 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 	return 0;
 }

+static void cgroup_transfer_one_task(struct task_struct *task,
+				     struct cgroup_scanner *scan)
+{
+	struct cgroup *new_cgroup = scan->data;
+
+	mutex_lock(&cgroup_mutex);
+	cgroup_attach_task(new_cgroup, task, false);
+	mutex_unlock(&cgroup_mutex);
+}
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+	struct cgroup_scanner scan;
+
+	scan.cg = from;
+	scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.process_task = cgroup_transfer_one_task;
+	scan.heap = NULL;
+	scan.data = to;
+
+	return cgroup_scan_tasks(&scan);
+}
+
 /*
  * Stuff for reading the 'tasks'/'procs' files.
  *
@@ -3362,35 +3345,14 @@ static void pidlist_free(void *p)
 	else
 		kfree(p);
 }
-static void *pidlist_resize(void *p, int newcount)
-{
-	void *newlist;
-	/* note: if new alloc fails, old p will still be valid either way */
-	if (is_vmalloc_addr(p)) {
-		newlist = vmalloc(newcount * sizeof(pid_t));
-		if (!newlist)
-			return NULL;
-		memcpy(newlist, p, newcount * sizeof(pid_t));
-		vfree(p);
-	} else {
-		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
-	}
-	return newlist;
-}

 /*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
  */
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
 {
 	int src, dest = 1;
-	pid_t *list = *p;
-	pid_t *newlist;

 	/*
 	 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)
 		dest++;
 	}
 after:
-	/*
-	 * if the length difference is large enough, we want to allocate a
-	 * smaller buffer to save memory. if this fails due to out of memory,
-	 * we'll just stay with what we've got.
-	 */
-	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
-		newlist = pidlist_resize(list, dest);
-		if (newlist)
-			*p = newlist;
-	}
 	return dest;
 }

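For reference, the dedup logic that survives in pidlist_uniq(), in isolation: a single in-place pass over a sorted array that returns the number of unique elements. This is a self-contained restatement of the same algorithm, not code from the patch:

	/* in-place uniq over a sorted array; returns the new length */
	static int uniq_sorted(pid_t *list, int length)
	{
		int src, dest = 1;

		if (length < 2)
			return length;
		for (src = 1; src < length; src++) {
			if (list[src] == list[src - 1])
				continue;		/* duplicate of the previous element */
			if (src != dest)
				list[dest] = list[src];	/* compact leftwards */
			dest++;
		}
		return dest;
	}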
@@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	/* now sort & (if procs) strip out duplicates */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(&array, length);
+		length = pidlist_uniq(array, length);
 	l = cgroup_pidlist_find(cgrp, type);
 	if (!l) {
 		pidlist_free(array);
@@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 	if (ret)
 		goto fail;

-	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
-		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
-		ret = 0;
-		goto fail;
-	}
+	efile->f_op->poll(efile, &event->pt);

 	/*
 	 * Events should be removed after rmdir of cgroup directory, but before
@@ -4016,10 +3964,16 @@ static struct cftype files[] = {
 	},
 	{
 		.name = "cgroup.clone_children",
+		.flags = CFTYPE_INSANE,
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
 	{
+		.name = "cgroup.sane_behavior",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cgroup_sane_behavior_show,
+	},
+	{
 		.name = "release_agent",
 		.flags = CFTYPE_ONLY_ON_ROOT,
 		.read_seq_string = cgroup_release_agent_show,
@@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (!(css->flags & CSS_ONLINE))
 		return;

-	/*
-	 * css_offline() should be called with cgroup_mutex unlocked. See
-	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
-	 * details. This temporary unlocking should go away once
-	 * cgroup_mutex is unexported from controllers.
-	 */
-	if (ss->css_offline) {
-		mutex_unlock(&cgroup_mutex);
+	if (ss->css_offline)
 		ss->css_offline(cgrp);
-		mutex_lock(&cgroup_mutex);
-	}

 	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
@@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			  umode_t mode)
 {
 	struct cgroup *cgrp;
+	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
 	int err = 0;
 	struct cgroup_subsys *ss;
@@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (!cgrp)
 		return -ENOMEM;

+	name = cgroup_alloc_name(dentry);
+	if (!name)
+		goto err_free_cgrp;
+	rcu_assign_pointer(cgrp->name, name);
+
 	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
 	if (cgrp->id < 0)
-		goto err_free_cgrp;
+		goto err_free_name;

 	/*
 	 * Only live parents can have children. Note that the liveliness
@@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,

 	cgrp->parent = parent;
 	cgrp->root = parent->root;
-	cgrp->top_cgroup = parent->top_cgroup;

 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	for_each_subsys(root, ss)
 		dget(dentry);

+	/* hold a ref to the parent's dentry */
+	dget(parent->dentry);
+
 	/* creation succeeded, notify subsystems */
 	for_each_subsys(root, ss) {
 		err = online_css(ss, cgrp);
@@ -4276,6 +4229,8 @@ err_free_all:
 	deactivate_super(sb);
 err_free_id:
 	ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+	kfree(rcu_dereference_raw(cgrp->name));
 err_free_cgrp:
 	kfree(cgrp);
 	return err;
@@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }

-/*
- * Check the reference count on each subsystem. Since we already
- * established that there are no tasks in the cgroup, if the css refcount
- * is also 1, then there should be no outstanding references, so the
- * subsystem is safe to destroy. We scan across all subsystems rather than
- * using the per-hierarchy linked list of mounted subsystems since we can
- * be called via check_for_release() with no synchronization other than
- * RCU, and the subsystem linked list isn't RCU-safe.
- */
-static int cgroup_has_css_refs(struct cgroup *cgrp)
-{
-	int i;
-
-	/*
-	 * We won't need to lock the subsys array, because the subsystems
-	 * we're concerned about aren't going anywhere since our cgroup root
-	 * has a reference on them.
-	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		struct cgroup_subsys_state *css;
-
-		/* Skip subsystems not present or not in this hierarchy */
-		if (ss == NULL || ss->root != cgrp->root)
-			continue;
-
-		css = cgrp->subsys[ss->subsys_id];
-		/*
-		 * When called from check_for_release() it's possible
-		 * that by this point the cgroup has been removed
-		 * and the css deleted. But a false-positive doesn't
-		 * matter, since it can only happen if the cgroup
-		 * has been deleted and hence no longer needs the
-		 * release agent to be called anyway.
-		 */
-		if (css && css_refcnt(css) > 1)
-			return 1;
-	}
-	return 0;
-}
-
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
 	struct cgroup *parent = cgrp->parent;
-	DEFINE_WAIT(wait);
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
-	LIST_HEAD(tmp_list);

 	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
@@ -4935,17 +4847,17 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	if (need_forkexit_callback) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * fork/exit callbacks are supported only for builtin
+		 * subsystems, and the builtin section of the subsys
+		 * array is immutable, so we don't need to lock the
+		 * subsys array here. On the other hand, modular section
+		 * of the array can be freed at module unload, so we
+		 * can't touch that.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];

-			/*
-			 * fork/exit callbacks are supported only for
-			 * builtin subsystems and we don't need further
-			 * synchronization as they never go away.
-			 */
-			if (!ss || ss->module)
-				continue;
-
 			if (ss->fork)
 				ss->fork(child);
 		}
@@ -5010,13 +4922,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	tsk->cgroups = &init_css_set;

 	if (run_callbacks && need_forkexit_callback) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * fork/exit callbacks are supported only for builtin
+		 * subsystems, see cgroup_post_fork() for details.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];

-			/* modular subsystems can't use callbacks */
-			if (!ss || ss->module)
-				continue;
-
 			if (ss->exit) {
 				struct cgroup *old_cgrp =
 					rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5030,44 +4942,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	put_css_set_taskexit(cg);
 }

-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
-	int ret;
-	struct cgroup *target;
-
-	if (cgrp == dummytop)
-		return 1;
-
-	target = task_cgroup_from_root(task, cgrp->root);
-	while (cgrp != target && cgrp!= cgrp->top_cgroup)
-		cgrp = cgrp->parent;
-	ret = (cgrp == target);
-	return ret;
-}
-
 static void check_for_release(struct cgroup *cgrp)
 {
 	/* All of these checks rely on RCU to keep the cgroup
 	 * structure alive */
-	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
-	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
-		/* Control Group is currently removeable. If it's not
+	if (cgroup_is_releasable(cgrp) &&
+	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+		/*
+		 * Control Group is currently removeable. If it's not
 		 * already queued for a userspace notification, queue
-		 * it now */
+		 * it now
+		 */
 		int need_schedule_work = 0;
+
 		raw_spin_lock(&release_list_lock);
 		if (!cgroup_is_removed(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
@@ -5100,24 +4987,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
 /* Caller must verify that the css is not for root cgroup */
 void __css_put(struct cgroup_subsys_state *css)
 {
-	struct cgroup *cgrp = css->cgroup;
 	int v;

-	rcu_read_lock();
 	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-
-	switch (v) {
-	case 1:
-		if (notify_on_release(cgrp)) {
-			set_bit(CGRP_RELEASABLE, &cgrp->flags);
-			check_for_release(cgrp);
-		}
-		break;
-	case 0:
+	if (v == 0)
 		schedule_work(&css->dput_work);
-		break;
-	}
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(__css_put);
