author	Paul Menage <menage@google.com>	2007-10-19 02:39:38 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:36 -0400
commit	81a6a5cdd2c5cd70874b88afe524ab09e9e869af (patch)
tree	ba46c47a0692b687a96e52e61bfda4f14457017f
parent	817929ec274bcfe771586d338bb31d1659615686 (diff)
Task Control Groups: automatic userspace notification of idle cgroups
Add the following files to the cgroup filesystem:

notify_on_release - configures/reports whether the cgroup subsystem should
attempt to run a release script when this cgroup becomes unused

release_agent - configures/reports the release agent to be used for this
hierarchy (top level in each hierarchy only)

releasable - reports whether this cgroup would have been auto-released if
notify_on_release was true and a release agent was configured (mainly
useful for debugging)

To avoid locking issues, invoking the userspace release agent is done via
a workqueue task; cgroups that need to have their release agents invoked
by the workqueue task are linked on to a list.

[pj@sgi.com: Need to include kmod.h]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
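As a concrete illustration of the interface described above (not part of the
patch): a minimal userspace sketch. The /cgroup mount point and the "mygroup"
child cgroup are assumptions made for illustration; only the control file
names (release_agent, notify_on_release) come from this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Top level of the hierarchy only: configure the release agent. */
	if (write_str("/cgroup/release_agent", "/sbin/cgroup_release_agent\n"))
		perror("release_agent");

	/* Per cgroup: request a callback once the group becomes unused. */
	if (write_str("/cgroup/mygroup/notify_on_release", "1\n"))
		perror("notify_on_release");

	return 0;
}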
-rw-r--r--	include/linux/cgroup.h	11
-rw-r--r--	kernel/cgroup.c	428
2 files changed, 395 insertions, 44 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 836b3557bb76..9e9b7efa180b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -77,10 +77,11 @@ static inline void css_get(struct cgroup_subsys_state *css)
  * css_get()
  */

+extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!test_bit(CSS_ROOT, &css->flags))
-		atomic_dec(&css->refcnt);
+		__css_put(css);
 }

 struct cgroup {
@@ -112,6 +113,13 @@ struct cgroup {
 	 * tasks in this cgroup. Protected by css_set_lock
 	 */
 	struct list_head css_sets;
+
+	/*
+	 * Linked list running through all cgroups that can
+	 * potentially be reaped by the release agent. Protected by
+	 * release_list_lock
+	 */
+	struct list_head release_list;
 };

 /* A css_set is a structure holding pointers to a set of
@@ -293,7 +301,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont,
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);

-
 #else /* !CONFIG_CGROUPS */

 static inline int cgroup_init_early(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 883928c0e147..d65a1246829f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -43,8 +43,11 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/sort.h>
+#include <linux/kmod.h>
 #include <asm/atomic.h>

+static DEFINE_MUTEX(cgroup_mutex);
+
 /* Generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,

@@ -83,6 +86,13 @@ struct cgroupfs_root {
 
 	/* Hierarchy-specific flags */
 	unsigned long flags;
+
+	/* The path to use for release notifications. No locking
+	 * between setting and use - so if userspace updates this
+	 * while child cgroups exist, you could miss a
+	 * notification. We ensure that it's always a valid
+	 * NUL-terminated string */
+	char release_agent_path[PATH_MAX];
 };

@@ -110,7 +120,13 @@ static int need_forkexit_callback;
 
 /* bits in struct cgroup flags field */
 enum {
+	/* Control Group is dead */
 	CONT_REMOVED,
+	/* Control Group has previously had a child cgroup or a task,
+	 * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+	CONT_RELEASABLE,
+	/* Control Group requires release notifications to userspace */
+	CONT_NOTIFY_ON_RELEASE,
 };

 /* convenient tests for these bits */
@@ -124,6 +140,19 @@ enum {
 	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };

+inline int cgroup_is_releasable(const struct cgroup *cont)
+{
+	const int bits =
+		(1 << CONT_RELEASABLE) |
+		(1 << CONT_NOTIFY_ON_RELEASE);
+	return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct cgroup *cont)
+{
+	return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -135,6 +164,14 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)

+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cont);
+
 /* Link structure for associating css_set objects with cgroups */
 struct cg_cgroup_link {
 	/*
@@ -189,11 +226,8 @@ static int use_task_css_set_links;
 /*
  * unlink a css_set from the list and free it
  */
-static void release_css_set(struct kref *k)
+static void unlink_css_set(struct css_set *cg)
 {
-	struct css_set *cg = container_of(k, struct css_set, ref);
-	int i;
-
 	write_lock(&css_set_lock);
 	list_del(&cg->list);
 	css_set_count--;
@@ -206,11 +240,39 @@ static void release_css_set(struct kref *k)
 		kfree(link);
 	}
 	write_unlock(&css_set_lock);
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
+static void __release_css_set(struct kref *k, int taskexit)
+{
+	int i;
+	struct css_set *cg = container_of(k, struct css_set, ref);
+
+	unlink_css_set(cg);
+
+	rcu_read_lock();
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup *cont = cg->subsys[i]->cgroup;
+		if (atomic_dec_and_test(&cont->count) &&
+		    notify_on_release(cont)) {
+			if (taskexit)
+				set_bit(CONT_RELEASABLE, &cont->flags);
+			check_for_release(cont);
+		}
+	}
+	rcu_read_unlock();
 	kfree(cg);
 }

+static void release_css_set(struct kref *k)
+{
+	__release_css_set(k, 0);
+}
+
+static void release_css_set_taskexit(struct kref *k)
+{
+	__release_css_set(k, 1);
+}
+
 /*
  * refcounted get/put for css_set objects
  */
@@ -224,6 +286,11 @@ static inline void put_css_set(struct css_set *cg)
 	kref_put(&cg->ref, release_css_set);
 }

+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+	kref_put(&cg->ref, release_css_set_taskexit);
+}
+
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
@@ -465,8 +532,6 @@ static struct css_set *find_css_set(
 	 * update of a tasks cgroup pointer by attach_task()
 	 */

-static DEFINE_MUTEX(cgroup_mutex);
-
 /**
  * cgroup_lock - lock out any changes to cgroup structures
  *
@@ -526,6 +591,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cont = dentry->d_fsdata;
 		BUG_ON(!(cgroup_is_removed(cont)));
+		/* It's possible for external users to be holding css
+		 * reference counts on a cgroup; css_put() needs to
+		 * be able to access the cgroup after decrementing
+		 * the reference count in order to know if it needs to
+		 * queue the cgroup to be handled by the release
+		 * agent */
+		synchronize_rcu();
 		kfree(cont);
 	}
 	iput(inode);
@@ -657,6 +729,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
 		seq_puts(seq, ",noprefix");
+	if (strlen(root->release_agent_path))
+		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
@@ -664,6 +738,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 struct cgroup_sb_opts {
 	unsigned long subsys_bits;
 	unsigned long flags;
+	char *release_agent;
 };

 /* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -675,6 +750,7 @@ static int parse_cgroupfs_options(char *data,
 
 	opts->subsys_bits = 0;
 	opts->flags = 0;
+	opts->release_agent = NULL;

 	while ((token = strsep(&o, ",")) != NULL) {
 		if (!*token)
@@ -683,6 +759,15 @@ static int parse_cgroupfs_options(char *data,
 			opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
 		} else if (!strcmp(token, "noprefix")) {
 			set_bit(ROOT_NOPREFIX, &opts->flags);
+		} else if (!strncmp(token, "release_agent=", 14)) {
+			/* Specifying two release agents is forbidden */
+			if (opts->release_agent)
+				return -EINVAL;
+			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+			if (!opts->release_agent)
+				return -ENOMEM;
+			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+			opts->release_agent[PATH_MAX - 1] = 0;
 		} else {
 			struct cgroup_subsys *ss;
 			int i;
@@ -732,7 +817,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (!ret)
 		cgroup_populate_dir(cont);

+	if (opts.release_agent)
+		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
+	if (opts.release_agent)
+		kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cont->dentry->d_inode->i_mutex);
 	return ret;
@@ -756,6 +845,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	INIT_LIST_HEAD(&cont->sibling);
 	INIT_LIST_HEAD(&cont->children);
 	INIT_LIST_HEAD(&cont->css_sets);
+	INIT_LIST_HEAD(&cont->release_list);
 }

 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -830,8 +920,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
+	if (ret) {
+		if (opts.release_agent)
+			kfree(opts.release_agent);
 		return ret;
+	}

 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root)
@@ -840,6 +933,10 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	init_cgroup_root(root);
 	root->subsys_bits = opts.subsys_bits;
 	root->flags = opts.flags;
+	if (opts.release_agent) {
+		strcpy(root->release_agent_path, opts.release_agent);
+		kfree(opts.release_agent);
+	}

 	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);

@@ -1120,7 +1217,7 @@ static int attach_task(struct cgroup *cont, struct task_struct *tsk)
 			ss->attach(ss, cont, oldcont, tsk);
 		}
 	}
-
+	set_bit(CONT_RELEASABLE, &oldcont->flags);
 	synchronize_rcu();
 	put_css_set(cg);
 	return 0;
@@ -1170,6 +1267,9 @@ enum cgroup_filetype {
 	FILE_ROOT,
 	FILE_DIR,
 	FILE_TASKLIST,
+	FILE_NOTIFY_ON_RELEASE,
+	FILE_RELEASABLE,
+	FILE_RELEASE_AGENT,
 };

 static ssize_t cgroup_write_uint(struct cgroup *cont, struct cftype *cft,
@@ -1240,6 +1340,32 @@ static ssize_t cgroup_common_file_write(struct cgroup *cont,
 	case FILE_TASKLIST:
 		retval = attach_task_by_pid(cont, buffer);
 		break;
+	case FILE_NOTIFY_ON_RELEASE:
+		clear_bit(CONT_RELEASABLE, &cont->flags);
+		if (simple_strtoul(buffer, NULL, 10) != 0)
+			set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+		else
+			clear_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+		break;
+	case FILE_RELEASE_AGENT:
+	{
+		struct cgroupfs_root *root = cont->root;
+		/* Strip trailing newline */
+		if (nbytes && (buffer[nbytes-1] == '\n')) {
+			buffer[nbytes-1] = 0;
+		}
+		if (nbytes < sizeof(root->release_agent_path)) {
+			/* We never write anything other than '\0'
+			 * into the last char of release_agent_path,
+			 * so it always remains a NUL-terminated
+			 * string */
+			strncpy(root->release_agent_path, buffer, nbytes);
+			root->release_agent_path[nbytes] = 0;
+		} else {
+			retval = -ENOSPC;
+		}
+		break;
+	}
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -1281,6 +1407,49 @@ static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }

+static ssize_t cgroup_common_file_read(struct cgroup *cont,
+				       struct cftype *cft,
+				       struct file *file,
+				       char __user *buf,
+				       size_t nbytes, loff_t *ppos)
+{
+	enum cgroup_filetype type = cft->private;
+	char *page;
+	ssize_t retval = 0;
+	char *s;
+
+	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	s = page;
+
+	switch (type) {
+	case FILE_RELEASE_AGENT:
+	{
+		struct cgroupfs_root *root;
+		size_t n;
+		mutex_lock(&cgroup_mutex);
+		root = cont->root;
+		n = strnlen(root->release_agent_path,
+			    sizeof(root->release_agent_path));
+		n = min(n, (size_t) PAGE_SIZE);
+		strncpy(s, root->release_agent_path, n);
+		mutex_unlock(&cgroup_mutex);
+		s += n;
+		break;
+	}
+	default:
+		retval = -EINVAL;
+		goto out;
+	}
+	*s++ = '\n';
+
+	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
+out:
+	free_page((unsigned long)page);
+	return retval;
+}
+
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
@@ -1699,16 +1868,49 @@ static int cgroup_tasks_release(struct inode *unused_inode,
 	return 0;
 }

+static u64 cgroup_read_notify_on_release(struct cgroup *cont,
+					 struct cftype *cft)
+{
+	return notify_on_release(cont);
+}
+
+static u64 cgroup_read_releasable(struct cgroup *cont, struct cftype *cft)
+{
+	return test_bit(CONT_RELEASABLE, &cont->flags);
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
-static struct cftype cft_tasks = {
-	.name = "tasks",
-	.open = cgroup_tasks_open,
-	.read = cgroup_tasks_read,
+static struct cftype files[] = {
+	{
+		.name = "tasks",
+		.open = cgroup_tasks_open,
+		.read = cgroup_tasks_read,
+		.write = cgroup_common_file_write,
+		.release = cgroup_tasks_release,
+		.private = FILE_TASKLIST,
+	},
+
+	{
+		.name = "notify_on_release",
+		.read_uint = cgroup_read_notify_on_release,
+		.write = cgroup_common_file_write,
+		.private = FILE_NOTIFY_ON_RELEASE,
+	},
+
+	{
+		.name = "releasable",
+		.read_uint = cgroup_read_releasable,
+		.private = FILE_RELEASABLE,
+	}
+};
+
+static struct cftype cft_release_agent = {
+	.name = "release_agent",
+	.read = cgroup_common_file_read,
 	.write = cgroup_common_file_write,
-	.release = cgroup_tasks_release,
-	.private = FILE_TASKLIST,
+	.private = FILE_RELEASE_AGENT,
 };

 static int cgroup_populate_dir(struct cgroup *cont)
@@ -1719,10 +1921,15 @@ static int cgroup_populate_dir(struct cgroup *cont)
 	/* First clear out any existing files */
 	cgroup_clear_directory(cont->dentry);

-	err = cgroup_add_file(cont, NULL, &cft_tasks);
+	err = cgroup_add_files(cont, NULL, files, ARRAY_SIZE(files));
 	if (err < 0)
 		return err;

+	if (cont == cont->top_cgroup) {
+		if ((err = cgroup_add_file(cont, NULL, &cft_release_agent)) < 0)
+			return err;
+	}
+
 	for_each_subsys(cont->root, ss) {
 		if (ss->populate && (err = ss->populate(ss, cont)) < 0)
 			return err;
@@ -1779,6 +1986,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	INIT_LIST_HEAD(&cont->sibling);
 	INIT_LIST_HEAD(&cont->children);
 	INIT_LIST_HEAD(&cont->css_sets);
+	INIT_LIST_HEAD(&cont->release_list);

 	cont->parent = parent;
 	cont->root = parent->root;
@@ -1840,6 +2048,38 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }

+static inline int cgroup_has_css_refs(struct cgroup *cont)
+{
+	/* Check the reference count on each subsystem. Since we
+	 * already established that there are no tasks in the
+	 * cgroup, if the css refcount is also 0, then there should
+	 * be no outstanding references, so the subsystem is safe to
+	 * destroy. We scan across all subsystems rather than using
+	 * the per-hierarchy linked list of mounted subsystems since
+	 * we can be called via check_for_release() with no
+	 * synchronization other than RCU, and the subsystem linked
+	 * list isn't RCU-safe */
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup_subsys *ss = subsys[i];
+		struct cgroup_subsys_state *css;
+		/* Skip subsystems not in this hierarchy */
+		if (ss->root != cont->root)
+			continue;
+		css = cont->subsys[ss->subsys_id];
+		/* When called from check_for_release() it's possible
+		 * that by this point the cgroup has been removed
+		 * and the css deleted. But a false-positive doesn't
+		 * matter, since it can only happen if the cgroup
+		 * has been deleted and hence no longer needs the
+		 * release agent to be called anyway. */
+		if (css && atomic_read(&css->refcnt)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cgroup *cont = dentry->d_fsdata;
@@ -1848,7 +2088,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 	struct super_block *sb;
 	struct cgroupfs_root *root;
-	int css_busy = 0;

 	/* the vfs holds both inode->i_mutex already */

@@ -1866,20 +2105,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	root = cont->root;
 	sb = root->sb;

-	/* Check the reference count on each subsystem. Since we
-	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 0, then there should
-	 * be no outstanding references, so the subsystem is safe to
-	 * destroy */
-	for_each_subsys(root, ss) {
-		struct cgroup_subsys_state *css;
-		css = cont->subsys[ss->subsys_id];
-		if (atomic_read(&css->refcnt)) {
-			css_busy = 1;
-			break;
-		}
-	}
-	if (css_busy) {
+	if (cgroup_has_css_refs(cont)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
@@ -1889,7 +2115,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		ss->destroy(ss, cont);
 	}

+	spin_lock(&release_list_lock);
 	set_bit(CONT_REMOVED, &cont->flags);
+	if (!list_empty(&cont->release_list))
+		list_del(&cont->release_list);
+	spin_unlock(&release_list_lock);
 	/* delete my sibling from parent->children */
 	list_del(&cont->sibling);
 	spin_lock(&cont->dentry->d_lock);
@@ -1901,6 +2131,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	dput(d);
 	root->number_of_cgroups--;

+	set_bit(CONT_RELEASABLE, &parent->flags);
+	check_for_release(parent);
+
 	mutex_unlock(&cgroup_mutex);
 	/* Drop the active superblock reference that we took when we
 	 * created the cgroup */
@@ -1938,15 +2171,15 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
 	/* If this subsystem requested that it be notified with fork
 	 * events, we should send it one now for every process in the
 	 * system */
- 	if (ss->fork) {
- 		struct task_struct *g, *p;
-
- 		read_lock(&tasklist_lock);
- 		do_each_thread(g, p) {
- 			ss->fork(ss, p);
- 		} while_each_thread(g, p);
- 		read_unlock(&tasklist_lock);
- 	}
+	if (ss->fork) {
+		struct task_struct *g, *p;
+
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			ss->fork(ss, p);
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+	}

 	need_forkexit_callback |= ss->fork || ss->exit;

@@ -2263,7 +2496,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	tsk->cgroups = &init_css_set;
 	task_unlock(tsk);
 	if (cg)
-		put_css_set(cg);
+		put_css_set_taskexit(cg);
 }

 /**
@@ -2374,7 +2607,10 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 
  out_release:
 	mutex_unlock(&inode->i_mutex);
+
+	mutex_lock(&cgroup_mutex);
 	put_css_set(cg);
+	mutex_unlock(&cgroup_mutex);
 	deactivate_super(parent->root->sb);
 	return ret;
 }
@@ -2404,3 +2640,111 @@ int cgroup_is_descendant(const struct cgroup *cont)
 	ret = (cont == target);
 	return ret;
 }
+
+static void check_for_release(struct cgroup *cont)
+{
+	/* All of these checks rely on RCU to keep the cgroup
+	 * structure alive */
+	if (cgroup_is_releasable(cont) && !atomic_read(&cont->count)
+	    && list_empty(&cont->children) && !cgroup_has_css_refs(cont)) {
+		/* Control Group is currently removeable. If it's not
+		 * already queued for a userspace notification, queue
+		 * it now */
+		int need_schedule_work = 0;
+		spin_lock(&release_list_lock);
+		if (!cgroup_is_removed(cont) &&
+		    list_empty(&cont->release_list)) {
+			list_add(&cont->release_list, &release_list);
+			need_schedule_work = 1;
+		}
+		spin_unlock(&release_list_lock);
+		if (need_schedule_work)
+			schedule_work(&release_agent_work);
+	}
+}
+
+void __css_put(struct cgroup_subsys_state *css)
+{
+	struct cgroup *cont = css->cgroup;
+	rcu_read_lock();
+	if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cont)) {
+		set_bit(CONT_RELEASABLE, &cont->flags);
+		check_for_release(cont);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ *
+ */
+
+static void cgroup_release_agent(struct work_struct *work)
+{
+	BUG_ON(work != &release_agent_work);
+	mutex_lock(&cgroup_mutex);
+	spin_lock(&release_list_lock);
+	while (!list_empty(&release_list)) {
+		char *argv[3], *envp[3];
+		int i;
+		char *pathbuf;
+		struct cgroup *cont = list_entry(release_list.next,
+						 struct cgroup,
+						 release_list);
+		list_del_init(&cont->release_list);
+		spin_unlock(&release_list_lock);
+		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!pathbuf) {
+			spin_lock(&release_list_lock);
+			continue;
+		}
+
+		if (cgroup_path(cont, pathbuf, PAGE_SIZE) < 0) {
+			kfree(pathbuf);
+			spin_lock(&release_list_lock);
+			continue;
+		}
+
+		i = 0;
+		argv[i++] = cont->root->release_agent_path;
+		argv[i++] = (char *)pathbuf;
+		argv[i] = NULL;
+
+		i = 0;
+		/* minimal command environment */
+		envp[i++] = "HOME=/";
+		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+		envp[i] = NULL;
+
+		/* Drop the lock while we invoke the usermode helper,
+		 * since the exec could involve hitting disk and hence
+		 * be a slow process */
+		mutex_unlock(&cgroup_mutex);
+		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		kfree(pathbuf);
+		mutex_lock(&cgroup_mutex);
+		spin_lock(&release_list_lock);
+	}
+	spin_unlock(&release_list_lock);
+	mutex_unlock(&cgroup_mutex);
+}
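
For context, a hypothetical sketch of the release agent that cgroup_release_agent()
invokes above - not part of the patch. The kernel passes the released cgroup's
path, relative to the hierarchy root, as argv[1]; the agent attempts the rmdir
that the comment block anticipates. The /cgroup mount point is an assumption.

#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[4096];

	if (argc < 2)
		return 1;
	/* cgroup_path() produces a path beginning with '/', so the
	 * assumed mount point can simply be prepended. */
	snprintf(path, sizeof(path), "/cgroup%s", argv[1]);
	/* May fail if tasks or child cgroups reappeared in the
	 * meantime; per the comment above, that race is expected
	 * and harmless. */
	if (rmdir(path))
		perror("rmdir");
	return 0;
}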