author | Paul Menage <menage@google.com> | 2007-10-19 02:39:38 -0400
---|---|---
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-19 14:53:36 -0400
commit | 81a6a5cdd2c5cd70874b88afe524ab09e9e869af (patch) |
tree | ba46c47a0692b687a96e52e61bfda4f14457017f /kernel/cgroup.c |
parent | 817929ec274bcfe771586d338bb31d1659615686 (diff) |
Task Control Groups: automatic userspace notification of idle cgroups
Add the following files to the cgroup filesystem:

notify_on_release - configures/reports whether the cgroup subsystem
  should attempt to run a release script when this cgroup becomes unused

release_agent - configures/reports the release agent to be used for this
  hierarchy (top level in each hierarchy only)

releasable - reports whether this cgroup would have been auto-released if
  notify_on_release was true and a release agent was configured (mainly
  useful for debugging)
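
For orientation, a hedged userspace sketch of configuring these files
follows; the /dev/cgroup mount point, the cpuset subsystem, the child
cgroup "foo" and the agent path are illustrative assumptions, not part
of this patch:

/* Illustrative only: mount a hierarchy with a release agent set via
 * the new release_agent= mount option, then enable notify_on_release
 * for an existing child cgroup "foo" (any nonzero value enables it). */
#include <fcntl.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	int fd;

	if (mount("cgroup", "/dev/cgroup", "cgroup", 0,
		  "cpuset,release_agent=/sbin/cgroup_release_agent"))
		return 1;
	fd = open("/dev/cgroup/foo/notify_on_release", O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "1\n", 2);
	close(fd);
	return 0;
}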
To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; cgroups that need to have their release agents invoked by
the workqueue task are linked on to a list.
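
The release agent is invoked with the cgroup's path, relative to the
hierarchy root, as its only argument. A minimal agent sketch, mirroring
the "presumed rmdir" behaviour described in the comments below (the
/dev/cgroup mount point is again an illustrative assumption):

/* Minimal release-agent sketch. The kernel runs it as:
 *   <agent-path> /relative/cgroup/path
 * The rmdir fails quietly if the cgroup has gained new tasks or
 * children in the meantime, which is the intended behaviour. */
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[4096];

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path), "/dev/cgroup%s", argv[1]);
	return rmdir(path) ? 1 : 0;
}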
[pj@sgi.com: Need to include kmod.h]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- kernel/cgroup.c | 428
1 file changed, 386 insertions(+), 42 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 883928c0e147..d65a1246829f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -43,8 +43,11 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/sort.h>
+#include <linux/kmod.h>
 #include <asm/atomic.h>
 
+static DEFINE_MUTEX(cgroup_mutex);
+
 /* Generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,
 
@@ -83,6 +86,13 @@ struct cgroupfs_root {
 
 	/* Hierarchy-specific flags */
 	unsigned long flags;
+
+	/* The path to use for release notifications. No locking
+	 * between setting and use - so if userspace updates this
+	 * while child cgroups exist, you could miss a
+	 * notification. We ensure that it's always a valid
+	 * NUL-terminated string */
+	char release_agent_path[PATH_MAX];
 };
 
 
@@ -110,7 +120,13 @@ static int need_forkexit_callback;
 
 /* bits in struct cgroup flags field */
 enum {
+	/* Control Group is dead */
 	CONT_REMOVED,
+	/* Control Group has previously had a child cgroup or a task,
+	 * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+	CONT_RELEASABLE,
+	/* Control Group requires release notifications to userspace */
+	CONT_NOTIFY_ON_RELEASE,
 };
 
 /* convenient tests for these bits */
@@ -124,6 +140,19 @@ enum {
 	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
+inline int cgroup_is_releasable(const struct cgroup *cont)
+{
+	const int bits =
+		(1 << CONT_RELEASABLE) |
+		(1 << CONT_NOTIFY_ON_RELEASE);
+	return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct cgroup *cont)
+{
+	return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -135,6 +164,14 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cont);
+
 /* Link structure for associating css_set objects with cgroups */
 struct cg_cgroup_link {
 	/*
@@ -189,11 +226,8 @@ static int use_task_css_set_links;
 /*
  * unlink a css_set from the list and free it
  */
-static void release_css_set(struct kref *k)
+static void unlink_css_set(struct css_set *cg)
 {
-	struct css_set *cg = container_of(k, struct css_set, ref);
-	int i;
-
 	write_lock(&css_set_lock);
 	list_del(&cg->list);
 	css_set_count--;
@@ -206,11 +240,39 @@ static void release_css_set(struct kref *k)
 		kfree(link);
 	}
 	write_unlock(&css_set_lock);
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-		atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
+static void __release_css_set(struct kref *k, int taskexit)
+{
+	int i;
+	struct css_set *cg = container_of(k, struct css_set, ref);
+
+	unlink_css_set(cg);
+
+	rcu_read_lock();
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup *cont = cg->subsys[i]->cgroup;
+		if (atomic_dec_and_test(&cont->count) &&
+		    notify_on_release(cont)) {
+			if (taskexit)
+				set_bit(CONT_RELEASABLE, &cont->flags);
+			check_for_release(cont);
+		}
+	}
+	rcu_read_unlock();
 	kfree(cg);
 }
 
+static void release_css_set(struct kref *k)
+{
+	__release_css_set(k, 0);
+}
+
+static void release_css_set_taskexit(struct kref *k)
+{
+	__release_css_set(k, 1);
+}
+
 /*
  * refcounted get/put for css_set objects
  */
@@ -224,6 +286,11 @@ static inline void put_css_set(struct css_set *cg)
 	kref_put(&cg->ref, release_css_set);
 }
 
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+	kref_put(&cg->ref, release_css_set_taskexit);
+}
+
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
@@ -465,8 +532,6 @@ static struct css_set *find_css_set(
  * update of a tasks cgroup pointer by attach_task()
  */
 
-static DEFINE_MUTEX(cgroup_mutex);
-
 /**
  * cgroup_lock - lock out any changes to cgroup structures
  *
@@ -526,6 +591,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cont = dentry->d_fsdata;
 		BUG_ON(!(cgroup_is_removed(cont)));
+		/* It's possible for external users to be holding css
+		 * reference counts on a cgroup; css_put() needs to
+		 * be able to access the cgroup after decrementing
+		 * the reference count in order to know if it needs to
+		 * queue the cgroup to be handled by the release
+		 * agent */
+		synchronize_rcu();
 		kfree(cont);
 	}
 	iput(inode);
@@ -657,6 +729,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
 		seq_puts(seq, ",noprefix");
+	if (strlen(root->release_agent_path))
+		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
@@ -664,6 +738,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 struct cgroup_sb_opts {
 	unsigned long subsys_bits;
 	unsigned long flags;
+	char *release_agent;
 };
 
 /* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -675,6 +750,7 @@ static int parse_cgroupfs_options(char *data,
 
 	opts->subsys_bits = 0;
 	opts->flags = 0;
+	opts->release_agent = NULL;
 
 	while ((token = strsep(&o, ",")) != NULL) {
 		if (!*token)
@@ -683,6 +759,15 @@ static int parse_cgroupfs_options(char *data,
 			opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
 		} else if (!strcmp(token, "noprefix")) {
 			set_bit(ROOT_NOPREFIX, &opts->flags);
+		} else if (!strncmp(token, "release_agent=", 14)) {
+			/* Specifying two release agents is forbidden */
+			if (opts->release_agent)
+				return -EINVAL;
+			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+			if (!opts->release_agent)
+				return -ENOMEM;
+			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+			opts->release_agent[PATH_MAX - 1] = 0;
 		} else {
 			struct cgroup_subsys *ss;
 			int i;
@@ -732,7 +817,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (!ret)
 		cgroup_populate_dir(cont);
 
+	if (opts.release_agent)
+		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
+	if (opts.release_agent)
+		kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cont->dentry->d_inode->i_mutex);
 	return ret;
@@ -756,6 +845,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	INIT_LIST_HEAD(&cont->sibling);
 	INIT_LIST_HEAD(&cont->children);
 	INIT_LIST_HEAD(&cont->css_sets);
+	INIT_LIST_HEAD(&cont->release_list);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -830,8 +920,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
+	if (ret) {
+		if (opts.release_agent)
+			kfree(opts.release_agent);
 		return ret;
+	}
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root)
@@ -840,6 +933,10 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	init_cgroup_root(root);
 	root->subsys_bits = opts.subsys_bits;
 	root->flags = opts.flags;
+	if (opts.release_agent) {
+		strcpy(root->release_agent_path, opts.release_agent);
+		kfree(opts.release_agent);
+	}
 
 	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
 
@@ -1120,7 +1217,7 @@ static int attach_task(struct cgroup *cont, struct task_struct *tsk)
 			ss->attach(ss, cont, oldcont, tsk);
 		}
 	}
-
+	set_bit(CONT_RELEASABLE, &oldcont->flags);
 	synchronize_rcu();
 	put_css_set(cg);
 	return 0;
@@ -1170,6 +1267,9 @@ enum cgroup_filetype {
 	FILE_ROOT,
 	FILE_DIR,
 	FILE_TASKLIST,
+	FILE_NOTIFY_ON_RELEASE,
+	FILE_RELEASABLE,
+	FILE_RELEASE_AGENT,
 };
 
 static ssize_t cgroup_write_uint(struct cgroup *cont, struct cftype *cft,
@@ -1240,6 +1340,32 @@ static ssize_t cgroup_common_file_write(struct cgroup *cont,
 	case FILE_TASKLIST:
 		retval = attach_task_by_pid(cont, buffer);
 		break;
+	case FILE_NOTIFY_ON_RELEASE:
+		clear_bit(CONT_RELEASABLE, &cont->flags);
+		if (simple_strtoul(buffer, NULL, 10) != 0)
+			set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+		else
+			clear_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+		break;
+	case FILE_RELEASE_AGENT:
+	{
+		struct cgroupfs_root *root = cont->root;
+		/* Strip trailing newline */
+		if (nbytes && (buffer[nbytes-1] == '\n')) {
+			buffer[nbytes-1] = 0;
+		}
+		if (nbytes < sizeof(root->release_agent_path)) {
+			/* We never write anything other than '\0'
+			 * into the last char of release_agent_path,
+			 * so it always remains a NUL-terminated
+			 * string */
+			strncpy(root->release_agent_path, buffer, nbytes);
+			root->release_agent_path[nbytes] = 0;
+		} else {
+			retval = -ENOSPC;
+		}
+		break;
+	}
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -1281,6 +1407,49 @@ static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
+static ssize_t cgroup_common_file_read(struct cgroup *cont,
+					  struct cftype *cft,
+					  struct file *file,
+					  char __user *buf,
+					  size_t nbytes, loff_t *ppos)
+{
+	enum cgroup_filetype type = cft->private;
+	char *page;
+	ssize_t retval = 0;
+	char *s;
+
+	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	s = page;
+
+	switch (type) {
+	case FILE_RELEASE_AGENT:
+	{
+		struct cgroupfs_root *root;
+		size_t n;
+		mutex_lock(&cgroup_mutex);
+		root = cont->root;
+		n = strnlen(root->release_agent_path,
+			    sizeof(root->release_agent_path));
+		n = min(n, (size_t) PAGE_SIZE);
+		strncpy(s, root->release_agent_path, n);
+		mutex_unlock(&cgroup_mutex);
+		s += n;
+		break;
+	}
+	default:
+		retval = -EINVAL;
+		goto out;
+	}
+	*s++ = '\n';
+
+	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
+out:
+	free_page((unsigned long)page);
+	return retval;
+}
+
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
@@ -1699,16 +1868,49 @@ static int cgroup_tasks_release(struct inode *unused_inode,
 	return 0;
 }
 
+static u64 cgroup_read_notify_on_release(struct cgroup *cont,
+					    struct cftype *cft)
+{
+	return notify_on_release(cont);
+}
+
+static u64 cgroup_read_releasable(struct cgroup *cont, struct cftype *cft)
+{
+	return test_bit(CONT_RELEASABLE, &cont->flags);
+}
+
 /*
  * for the common functions, 'private' gives the type of file
 */
-static struct cftype cft_tasks = {
-	.name = "tasks",
-	.open = cgroup_tasks_open,
-	.read = cgroup_tasks_read,
+static struct cftype files[] = {
+	{
+		.name = "tasks",
+		.open = cgroup_tasks_open,
+		.read = cgroup_tasks_read,
+		.write = cgroup_common_file_write,
+		.release = cgroup_tasks_release,
+		.private = FILE_TASKLIST,
+	},
+
+	{
+		.name = "notify_on_release",
+		.read_uint = cgroup_read_notify_on_release,
+		.write = cgroup_common_file_write,
+		.private = FILE_NOTIFY_ON_RELEASE,
+	},
+
+	{
+		.name = "releasable",
+		.read_uint = cgroup_read_releasable,
+		.private = FILE_RELEASABLE,
+	}
+};
+
+static struct cftype cft_release_agent = {
+	.name = "release_agent",
+	.read = cgroup_common_file_read,
 	.write = cgroup_common_file_write,
-	.release = cgroup_tasks_release,
-	.private = FILE_TASKLIST,
+	.private = FILE_RELEASE_AGENT,
 };
 
 static int cgroup_populate_dir(struct cgroup *cont)
@@ -1719,10 +1921,15 @@ static int cgroup_populate_dir(struct cgroup *cont)
 	/* First clear out any existing files */
 	cgroup_clear_directory(cont->dentry);
 
-	err = cgroup_add_file(cont, NULL, &cft_tasks);
+	err = cgroup_add_files(cont, NULL, files, ARRAY_SIZE(files));
 	if (err < 0)
 		return err;
 
+	if (cont == cont->top_cgroup) {
+		if ((err = cgroup_add_file(cont, NULL, &cft_release_agent)) < 0)
+			return err;
+	}
+
 	for_each_subsys(cont->root, ss) {
 		if (ss->populate && (err = ss->populate(ss, cont)) < 0)
 			return err;
@@ -1779,6 +1986,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	INIT_LIST_HEAD(&cont->sibling);
 	INIT_LIST_HEAD(&cont->children);
 	INIT_LIST_HEAD(&cont->css_sets);
+	INIT_LIST_HEAD(&cont->release_list);
 
 	cont->parent = parent;
 	cont->root = parent->root;
@@ -1840,6 +2048,38 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static inline int cgroup_has_css_refs(struct cgroup *cont)
+{
+	/* Check the reference count on each subsystem. Since we
+	 * already established that there are no tasks in the
+	 * cgroup, if the css refcount is also 0, then there should
+	 * be no outstanding references, so the subsystem is safe to
+	 * destroy. We scan across all subsystems rather than using
+	 * the per-hierarchy linked list of mounted subsystems since
+	 * we can be called via check_for_release() with no
+	 * synchronization other than RCU, and the subsystem linked
+	 * list isn't RCU-safe */
+	int i;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup_subsys *ss = subsys[i];
+		struct cgroup_subsys_state *css;
+		/* Skip subsystems not in this hierarchy */
+		if (ss->root != cont->root)
+			continue;
+		css = cont->subsys[ss->subsys_id];
+		/* When called from check_for_release() it's possible
+		 * that by this point the cgroup has been removed
+		 * and the css deleted. But a false-positive doesn't
+		 * matter, since it can only happen if the cgroup
+		 * has been deleted and hence no longer needs the
+		 * release agent to be called anyway. */
+		if (css && atomic_read(&css->refcnt)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cgroup *cont = dentry->d_fsdata;
@@ -1848,7 +2088,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 	struct super_block *sb;
 	struct cgroupfs_root *root;
-	int css_busy = 0;
 
 	/* the vfs holds both inode->i_mutex already */
 
@@ -1866,20 +2105,7 @@
 	root = cont->root;
 	sb = root->sb;
 
-	/* Check the reference count on each subsystem. Since we
-	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 0, then there should
-	 * be no outstanding references, so the subsystem is safe to
-	 * destroy */
-	for_each_subsys(root, ss) {
-		struct cgroup_subsys_state *css;
-		css = cont->subsys[ss->subsys_id];
-		if (atomic_read(&css->refcnt)) {
-			css_busy = 1;
-			break;
-		}
-	}
-	if (css_busy) {
+	if (cgroup_has_css_refs(cont)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
@@ -1889,7 +2115,11 @@
 		ss->destroy(ss, cont);
 	}
 
+	spin_lock(&release_list_lock);
 	set_bit(CONT_REMOVED, &cont->flags);
+	if (!list_empty(&cont->release_list))
+		list_del(&cont->release_list);
+	spin_unlock(&release_list_lock);
 	/* delete my sibling from parent->children */
 	list_del(&cont->sibling);
 	spin_lock(&cont->dentry->d_lock);
@@ -1901,6 +2131,9 @@
 	dput(d);
 	root->number_of_cgroups--;
 
+	set_bit(CONT_RELEASABLE, &parent->flags);
+	check_for_release(parent);
+
 	mutex_unlock(&cgroup_mutex);
 	/* Drop the active superblock reference that we took when we
 	 * created the cgroup */
@@ -1938,15 +2171,15 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
 	/* If this subsystem requested that it be notified with fork
 	 * events, we should send it one now for every process in the
 	 * system */
-        if (ss->fork) {
-                struct task_struct *g, *p;
-
-                read_lock(&tasklist_lock);
-                do_each_thread(g, p) {
-                        ss->fork(ss, p);
-                } while_each_thread(g, p);
-                read_unlock(&tasklist_lock);
-        }
+	if (ss->fork) {
+		struct task_struct *g, *p;
+
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			ss->fork(ss, p);
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+	}
 
 	need_forkexit_callback |= ss->fork || ss->exit;
 
@@ -2263,7 +2496,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	tsk->cgroups = &init_css_set;
 	task_unlock(tsk);
 	if (cg)
-		put_css_set(cg);
+		put_css_set_taskexit(cg);
 }
 
 /**
@@ -2374,7 +2607,10 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 
  out_release:
 	mutex_unlock(&inode->i_mutex);
+
+	mutex_lock(&cgroup_mutex);
 	put_css_set(cg);
+	mutex_unlock(&cgroup_mutex);
 	deactivate_super(parent->root->sb);
 	return ret;
 }
@@ -2404,3 +2640,111 @@ int cgroup_is_descendant(const struct cgroup *cont)
 	ret = (cont == target);
 	return ret;
 }
+
+static void check_for_release(struct cgroup *cont)
+{
+	/* All of these checks rely on RCU to keep the cgroup
+	 * structure alive */
+	if (cgroup_is_releasable(cont) && !atomic_read(&cont->count)
+	    && list_empty(&cont->children) && !cgroup_has_css_refs(cont)) {
+		/* Control Group is currently removeable. If it's not
+		 * already queued for a userspace notification, queue
+		 * it now */
+		int need_schedule_work = 0;
+		spin_lock(&release_list_lock);
+		if (!cgroup_is_removed(cont) &&
+		    list_empty(&cont->release_list)) {
+			list_add(&cont->release_list, &release_list);
+			need_schedule_work = 1;
+		}
+		spin_unlock(&release_list_lock);
+		if (need_schedule_work)
+			schedule_work(&release_agent_work);
+	}
+}
+
+void __css_put(struct cgroup_subsys_state *css)
+{
+	struct cgroup *cont = css->cgroup;
+	rcu_read_lock();
+	if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cont)) {
+		set_bit(CONT_RELEASABLE, &cont->flags);
+		check_for_release(cont);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ *
+ */
+
+static void cgroup_release_agent(struct work_struct *work)
+{
+	BUG_ON(work != &release_agent_work);
+	mutex_lock(&cgroup_mutex);
+	spin_lock(&release_list_lock);
+	while (!list_empty(&release_list)) {
+		char *argv[3], *envp[3];
+		int i;
+		char *pathbuf;
+		struct cgroup *cont = list_entry(release_list.next,
+						    struct cgroup,
+						    release_list);
+		list_del_init(&cont->release_list);
+		spin_unlock(&release_list_lock);
+		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!pathbuf) {
+			spin_lock(&release_list_lock);
+			continue;
+		}
+
+		if (cgroup_path(cont, pathbuf, PAGE_SIZE) < 0) {
+			kfree(pathbuf);
+			spin_lock(&release_list_lock);
+			continue;
+		}
+
+		i = 0;
+		argv[i++] = cont->root->release_agent_path;
+		argv[i++] = (char *)pathbuf;
+		argv[i] = NULL;
+
+		i = 0;
+		/* minimal command environment */
+		envp[i++] = "HOME=/";
+		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+		envp[i] = NULL;
+
+		/* Drop the lock while we invoke the usermode helper,
+		 * since the exec could involve hitting disk and hence
+		 * be a slow process */
+		mutex_unlock(&cgroup_mutex);
+		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		kfree(pathbuf);
+		mutex_lock(&cgroup_mutex);
+		spin_lock(&release_list_lock);
+	}
+	spin_unlock(&release_list_lock);
+	mutex_unlock(&cgroup_mutex);
+}