Diffstat (limited to 'kernel/cgroup.c')
 -rw-r--r--  kernel/cgroup.c | 880
 1 file changed, 616 insertions(+), 264 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,12 +52,12 @@
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
 #include <linux/namei.h>
-#include <linux/smp_lock.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
 
 #include <asm/atomic.h>
 
@@ -138,7 +138,7 @@ struct css_id {
  * is called after synchronize_rcu(). But for safe use, css_is_removed()
  * css_tryget() should be used for avoiding race.
  */
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state __rcu *css;
 	/*
 	 * ID of this css.
 	 */
@@ -158,7 +158,7 @@ struct css_id {
 };
 
 /*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace want to receive.
  */
 struct cgroup_event {
 	/*
@@ -244,6 +244,11 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+static int clone_children(const struct cgroup *cgrp)
+{
+	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -322,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 	return &css_set_table[index];
 }
 
-static void free_css_set_rcu(struct rcu_head *obj)
-{
-	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
-	kfree(cg);
-}
-
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -371,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	write_unlock(&css_set_lock);
-	call_rcu(&cg->rcu_head, free_css_set_rcu);
+	kfree_rcu(cg, rcu_head);
 }
 
 /*
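The hunk above (and the matching conversions for struct cgroup and struct css_id later in this diff) follows one mechanical pattern: when an RCU callback does nothing but kfree() the enclosing object, kfree_rcu() replaces both the callback and the call_rcu() invocation. A minimal before/after sketch (the struct and field names here are illustrative, not from cgroup.c):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {				/* hypothetical RCU-freed object */
		int payload;
		struct rcu_head rcu_head;
	};

	/* before: a dedicated callback whose only job is kfree() */
	static void free_foo_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu_head));
	}

	static void put_foo_old(struct foo *f)
	{
		call_rcu(&f->rcu_head, free_foo_rcu);
	}

	/* after: kfree_rcu() takes the object and the rcu_head member name,
	 * so the boilerplate callback above can be deleted entirely */
	static void put_foo_new(struct foo *f)
	{
		kfree_rcu(f, rcu_head);
	}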
@@ -760,6 +759,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -778,6 +778,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
@@ -806,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	return ret;
 }
 
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
-	struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
-	kfree(cgrp);
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -850,11 +844,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
-		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+		kfree_rcu(cgrp, rcu_head);
 	}
 	iput(inode);
 }
 
+static int cgroup_delete(const struct dentry *d)
+{
+	return 1;
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -869,25 +868,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
 	struct list_head *node;
 
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
 		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 		list_del_init(node);
 		if (d->d_inode) {
 			/* This should never be called on a cgroup
 			 * directory with child cgroups */
 			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			d = dget_locked(d);
-			spin_unlock(&dcache_lock);
+			dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			spin_unlock(&dentry->d_lock);
 			d_delete(d);
 			simple_unlink(dentry->d_inode, d);
 			dput(d);
-			spin_lock(&dcache_lock);
-		}
+			spin_lock(&dentry->d_lock);
+		} else
+			spin_unlock(&d->d_lock);
 		node = dentry->d_subdirs.next;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -895,11 +898,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
  */
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
+	struct dentry *parent;
+
 	cgroup_clear_directory(dentry);
 
-	spin_lock(&dcache_lock);
+	parent = dentry->d_parent;
+	spin_lock(&parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	list_del_init(&dentry->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 	remove_dir(dentry);
 }
 
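Both hunks above are part of the dcache_lock removal: the global lock becomes per-dentry d_lock, which imposes a parent-before-child acquisition order, and because both locks belong to the same lockdep class the inner one must be taken with spin_lock_nested(). The locking shape, reduced to a sketch (assuming the parent dentry is already pinned):

	/* sketch: unlink a child from its parent's d_subdirs under d_lock */
	spin_lock(&parent->d_lock);
	spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
	list_del_init(&child->d_u.d_child);	/* both locks held for the list op */
	spin_unlock(&child->d_lock);
	spin_unlock(&parent->d_lock);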
@@ -1040,6 +1048,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noprefix");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+	if (clone_children(&root->top_cgroup))
+		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
 	mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1060,7 @@ struct cgroup_sb_opts {
 	unsigned long subsys_bits;
 	unsigned long flags;
 	char *release_agent;
+	bool clone_children;
 	char *name;
 	/* User explicitly requested empty subsystem */
 	bool none;
@@ -1066,7 +1077,8 @@ struct cgroup_sb_opts {
  */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
-	char *token, *o = data ?: "all";
+	char *token, *o = data;
+	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
 	int i;
 	bool module_pin_failed = false;
@@ -1082,22 +1094,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	while ((token = strsep(&o, ",")) != NULL) {
 		if (!*token)
 			return -EINVAL;
-		if (!strcmp(token, "all")) {
-			/* Add all non-disabled subsystems */
-			opts->subsys_bits = 0;
-			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-				struct cgroup_subsys *ss = subsys[i];
-				if (ss == NULL)
-					continue;
-				if (!ss->disabled)
-					opts->subsys_bits |= 1ul << i;
-			}
-		} else if (!strcmp(token, "none")) {
+		if (!strcmp(token, "none")) {
 			/* Explicitly have no subsystems */
 			opts->none = true;
-		} else if (!strcmp(token, "noprefix")) {
+			continue;
+		}
+		if (!strcmp(token, "all")) {
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (one_ss)
+				return -EINVAL;
+			all_ss = true;
+			continue;
+		}
+		if (!strcmp(token, "noprefix")) {
 			set_bit(ROOT_NOPREFIX, &opts->flags);
-		} else if (!strncmp(token, "release_agent=", 14)) {
+			continue;
+		}
+		if (!strcmp(token, "clone_children")) {
+			opts->clone_children = true;
+			continue;
+		}
+		if (!strncmp(token, "release_agent=", 14)) {
 			/* Specifying two release agents is forbidden */
 			if (opts->release_agent)
 				return -EINVAL;
@@ -1105,7 +1122,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
 			if (!opts->release_agent)
 				return -ENOMEM;
-		} else if (!strncmp(token, "name=", 5)) {
+			continue;
+		}
+		if (!strncmp(token, "name=", 5)) {
 			const char *name = token + 5;
 			/* Can't specify an empty name */
 			if (!strlen(name))
@@ -1127,20 +1146,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 					GFP_KERNEL);
 			if (!opts->name)
 				return -ENOMEM;
-		} else {
-			struct cgroup_subsys *ss;
-			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-				ss = subsys[i];
-				if (ss == NULL)
-					continue;
-				if (!strcmp(token, ss->name)) {
-					if (!ss->disabled)
-						set_bit(i, &opts->subsys_bits);
-					break;
-				}
-			}
-			if (i == CGROUP_SUBSYS_COUNT)
-				return -ENOENT;
+
+			continue;
+		}
+
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss == NULL)
+				continue;
+			if (strcmp(token, ss->name))
+				continue;
+			if (ss->disabled)
+				continue;
+
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (all_ss)
+				return -EINVAL;
+			set_bit(i, &opts->subsys_bits);
+			one_ss = true;
+
+			break;
+		}
+		if (i == CGROUP_SUBSYS_COUNT)
+			return -ENOENT;
+	}
+
+	/*
+	 * If the 'all' option was specified select all the subsystems,
+	 * otherwise 'all, 'none' and a subsystem name options were not
+	 * specified, let's default to 'all'
+	 */
+	if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss == NULL)
+				continue;
+			if (ss->disabled)
+				continue;
+			set_bit(i, &opts->subsys_bits);
 		}
 	}
 
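The restructured parser replaces one long else-if chain with a flat sequence of token tests that each end in continue, leaving only subsystem names to fall through to the scan loop; the 'all'/default selection then happens once, after the loop. The control flow, reduced to a userspace sketch with invented option names (not the kernel's actual option set):

	#include <stdbool.h>
	#include <string.h>

	/* sketch of the strsep()-based token loop in parse_cgroupfs_options() */
	static int parse_opts_sketch(char *data, bool *noprefix, bool *clone_children)
	{
		char *token, *o = data;

		while ((token = strsep(&o, ",")) != NULL) {
			if (!*token)
				return -1;	/* reject empty tokens such as "a,,b" */
			if (!strcmp(token, "noprefix")) {
				*noprefix = true;
				continue;	/* option consumed, next token */
			}
			if (!strcmp(token, "clone_children")) {
				*clone_children = true;
				continue;
			}
			return -1;		/* anything else is unknown here */
		}
		return 0;
	}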
@@ -1222,7 +1265,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
 
-	lock_kernel();
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 
@@ -1255,7 +1297,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	kfree(opts.name);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
-	unlock_kernel();
 	return ret;
 }
 
@@ -1357,6 +1398,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
+	if (opts->clone_children)
+		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
 	return root;
 }
 
@@ -1400,6 +1443,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+	static const struct dentry_operations cgroup_dops = {
+		.d_iput = cgroup_diput,
+		.d_delete = cgroup_delete,
+	};
+
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 	struct dentry *dentry;
@@ -1417,12 +1465,14 @@ static int cgroup_get_rootdir(struct super_block *sb)
 		return -ENOMEM;
 	}
 	sb->s_root = dentry;
+	/* for everything else we want ->d_op set */
+	sb->s_d_op = &cgroup_dops;
 	return 0;
 }
 
-static int cgroup_get_sb(struct file_system_type *fs_type,
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
-			 void *data, struct vfsmount *mnt)
+			 void *data)
 {
 	struct cgroup_sb_opts opts;
 	struct cgroupfs_root *root;
@@ -1556,10 +1606,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		drop_parsed_module_refcounts(opts.subsys_bits);
 	}
 
-	simple_set_mnt(mnt, sb);
 	kfree(opts.release_agent);
 	kfree(opts.name);
-	return 0;
+	return dget(sb->s_root);
 
  drop_new_super:
 	deactivate_locked_super(sb);
@@ -1568,8 +1617,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
-
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1667,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 static struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
-	.get_sb = cgroup_get_sb,
+	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
 };
 
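cgroup_mount() reflects the VFS-wide change from .get_sb (fill a passed-in vfsmount via simple_set_mnt()) to .mount (return the root dentry, or an ERR_PTR on failure). cgroup needs the open-coded version above because it shares superblocks between mounts, but for a simple filesystem the conversion usually reduces to a helper. A hedged sketch using mount_nodev(), with example_mount(), examplefs and example_fill_super() standing in as invented names around an unchanged fill_super callback:

	static struct dentry *example_mount(struct file_system_type *fs_type,
					    int flags, const char *dev_name,
					    void *data)
	{
		/* fill_super keeps the same signature it had under .get_sb */
		return mount_nodev(fs_type, flags, data, example_fill_super);
	}

	static struct file_system_type example_fs_type = {
		.name		= "examplefs",
		.mount		= example_mount,
		.kill_sb	= kill_anon_super,
	};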
@@ -1688,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1698,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1712,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk, false);
+			retval = ss->can_attach(ss, cgrp, tsk);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1724,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
+		if (ss->can_attach_task) {
+			retval = ss->can_attach_task(cgrp, tsk);
+			if (retval) {
+				failed_ss = ss;
+				goto out;
+			}
+		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list)) {
-		list_del(&tsk->cg_list);
-		list_add(&tsk->cg_list, &newcg->tasks);
-	}
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+		if (ss->attach_task)
+			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk, false);
+			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1784,7 +1881,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk, false);
+				ss->cancel_attach(ss, cgrp, tsk);
 		}
 	}
 	return retval;
@@ -1815,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct flex_array *group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	/* flex_array supports very large thread-groups better than kmalloc. */
+	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+				 GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+	/* pre-allocate to guarantee space while iterating in rcu read-side. */
+	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	if (retval)
+		goto out_free_group_list;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
+		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		BUG_ON(retval != 0);
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				tsk = flex_array_get_ptr(group, i);
+				retval = ss->can_attach_task(cgrp, tsk);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		put_task_struct(tsk);
+	}
+out_free_group_list:
+	flex_array_free(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
+			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
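cgroup_attach_proc() leans on the flex_array API: the array parts are preallocated with GFP_KERNEL while sleeping is still allowed, so the later stores inside rcu_read_lock() cannot fail. A condensed sketch of that allocate/prealloc/put/get lifecycle (flex_array_prealloc() is shown with the (start, end) form this kernel generation used, later kernels changed the third argument to a count; use_task() is a hypothetical consumer):

	#include <linux/flex_array.h>
	#include <linux/sched.h>

	static int snapshot_sketch(struct task_struct **src, int n)
	{
		struct flex_array *fa;
		int i, ret;

		fa = flex_array_alloc(sizeof(struct task_struct *), n, GFP_KERNEL);
		if (!fa)
			return -ENOMEM;
		/* preallocate every part now, while GFP_KERNEL is still legal */
		ret = flex_array_prealloc(fa, 0, n - 1, GFP_KERNEL);
		if (ret)
			goto out;
		for (i = 0; i < n; i++) {
			/* cannot fail after prealloc; GFP_ATOMIC documents intent */
			ret = flex_array_put_ptr(fa, i, src[i], GFP_ATOMIC);
			BUG_ON(ret);
		}
		for (i = 0; i < n; i++)
			use_task(flex_array_get_ptr(fa, i));	/* hypothetical */
	out:
		flex_array_free(fa);
		return ret;
	}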
@@ -1883,6 +2301,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	if (strlen(buffer) >= PATH_MAX)
+		return -EINVAL;
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 	strcpy(cgrp->root->release_agent_path, buffer);
@@ -2140,12 +2560,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = simple_lookup,
+	.lookup = cgroup_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
 };
 
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
 /*
  * Check if a file is a control file
  */
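cgroup_lookup() exists because simple_lookup() installs its own dentry_operations on every dentry it instantiates, which would fight the superblock-wide cgroup_dops installed through sb->s_d_op earlier in this diff. In this kernel generation simple_lookup() had roughly the following shape (a rough reconstruction, not a verbatim quote), and the cgroup variant is the same minus the d_set_d_op() call:

	/* approximate shape of fs/libfs.c:simple_lookup() at the time */
	struct dentry *simple_lookup_sketch(struct inode *dir, struct dentry *dentry,
					    struct nameidata *nd)
	{
		static const struct dentry_operations simple_dentry_operations = {
			.d_delete = simple_delete_dentry,
		};

		if (dentry->d_name.len > NAME_MAX)
			return ERR_PTR(-ENAMETOOLONG);
		d_set_d_op(dentry, &simple_dentry_operations);	/* cgroup must skip this */
		d_add(dentry, NULL);		/* cache a negative dentry */
		return NULL;			/* NULL: the passed-in dentry was used */
	}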
@@ -2159,10 +2587,6 @@ static inline struct cftype *__file_cft(struct file *file)
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 			      struct super_block *sb)
 {
-	static const struct dentry_operations cgroup_dops = {
-		.d_iput = cgroup_diput,
-	};
-
 	struct inode *inode;
 
 	if (!dentry)
@@ -2188,7 +2612,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
 	}
-	dentry->d_op = &cgroup_dops;
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 	return 0;
@@ -3176,6 +3599,23 @@ fail:
 	return ret;
 }
 
+static u64 cgroup_clone_children_read(struct cgroup *cgrp,
+				      struct cftype *cft)
+{
+	return clone_children(cgrp);
+}
+
+static int cgroup_clone_children_write(struct cgroup *cgrp,
+				       struct cftype *cft,
+				       u64 val)
+{
+	if (val)
+		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+	else
+		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+	return 0;
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -3192,9 +3632,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
@@ -3206,6 +3646,11 @@ static struct cftype files[] = {
 		.write_string = cgroup_write_event_control,
 		.mode = S_IWUGO,
 	},
+	{
+		.name = "cgroup.clone_children",
+		.read_u64 = cgroup_clone_children_read,
+		.write_u64 = cgroup_clone_children_write,
+	},
 };
 
 static struct cftype cft_release_agent = {
@@ -3335,6 +3780,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
+	if (clone_children(parent))
+		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
 
@@ -3349,6 +3797,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			goto err_destroy;
 		}
 		/* At error, ->destroy() callback has to free assigned ID. */
+		if (clone_children(parent) && ss->post_clone)
+			ss->post_clone(ss, cgrp);
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -3563,17 +4013,15 @@ again:
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
-		list_del(&cgrp->release_list);
+		list_del_init(&cgrp->release_list);
 	spin_unlock(&release_list_lock);
 
 	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
-	list_del(&cgrp->sibling);
+	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
-	spin_lock(&cgrp->dentry->d_lock);
 	d = dget(cgrp->dentry);
-	spin_unlock(&d->d_lock);
 
 	cgroup_d_remove_dir(d);
 	dput(d);
@@ -3789,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	subsys[ss->subsys_id] = NULL;
 
 	/* remove subsystem from rootnode's list of subsystems */
-	list_del(&ss->sibling);
+	list_del_init(&ss->sibling);
 
 	/*
 	 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4140,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child)
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-	int i;
 	struct css_set *cg;
-
-	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * modular subsystems can't use callbacks, so no need to lock
-		 * the subsys array
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss->exit)
-				ss->exit(ss, tsk);
-		}
-	}
+	int i;
 
 	/*
 	 * Unlink from the css_set task list if necessary.
@@ -4163,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	if (!list_empty(&tsk->cg_list)) {
 		write_lock(&css_set_lock);
 		if (!list_empty(&tsk->cg_list))
-			list_del(&tsk->cg_list);
+			list_del_init(&tsk->cg_list);
 		write_unlock(&css_set_lock);
 	}
 
@@ -4171,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	task_lock(tsk);
 	cg = tsk->cgroups;
 	tsk->cgroups = &init_css_set;
-	task_unlock(tsk);
-	if (cg)
-		put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
-		 char *nodename)
-{
-	struct dentry *dentry;
-	int ret = 0;
-	struct cgroup *parent, *child;
-	struct inode *inode;
-	struct css_set *cg;
-	struct cgroupfs_root *root;
-	struct cgroup_subsys *ss;
-
-	/* We shouldn't be called by an unregistered subsystem */
-	BUG_ON(!subsys->active);
-
-	/* First figure out what hierarchy and cgroup we're dealing
-	 * with, and pin them so we can drop cgroup_mutex */
-	mutex_lock(&cgroup_mutex);
- again:
-	root = subsys->root;
-	if (root == &rootnode) {
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
 
-	/* Pin the hierarchy */
-	if (!atomic_inc_not_zero(&root->sb->s_active)) {
-		/* We race with the final deactivate_super() */
-		mutex_unlock(&cgroup_mutex);
-		return 0;
+	if (run_callbacks && need_forkexit_callback) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit) {
+				struct cgroup *old_cgrp =
+					rcu_dereference_raw(cg->subsys[i])->cgroup;
+				struct cgroup *cgrp = task_cgroup(tsk, i);
+				ss->exit(ss, cgrp, old_cgrp, tsk);
+			}
+		}
 	}
-
-	/* Keep the cgroup alive */
-	task_lock(tsk);
-	parent = task_cgroup(tsk, subsys->subsys_id);
-	cg = tsk->cgroups;
-	get_css_set(cg);
 	task_unlock(tsk);
 
-	mutex_unlock(&cgroup_mutex);
-
-	/* Now do the VFS work to create a cgroup */
-	inode = parent->dentry->d_inode;
-
-	/* Hold the parent directory mutex across this operation to
-	 * stop anyone else deleting the new cgroup */
-	mutex_lock(&inode->i_mutex);
-	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
-	if (IS_ERR(dentry)) {
-		printk(KERN_INFO
-		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
-		       PTR_ERR(dentry));
-		ret = PTR_ERR(dentry);
-		goto out_release;
-	}
-
-	/* Create the cgroup directory, which also creates the cgroup */
-	ret = vfs_mkdir(inode, dentry, 0755);
-	child = __d_cgrp(dentry);
-	dput(dentry);
-	if (ret) {
-		printk(KERN_INFO
-		       "Failed to create cgroup %s: %d\n", nodename,
-		       ret);
-		goto out_release;
-	}
-
-	/* The cgroup now exists. Retake cgroup_mutex and check
-	 * that we're still in the same state that we thought we
-	 * were. */
-	mutex_lock(&cgroup_mutex);
-	if ((root != subsys->root) ||
-	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
-		/* Aargh, we raced ... */
-		mutex_unlock(&inode->i_mutex);
-		put_css_set(cg);
-
-		deactivate_super(root->sb);
-		/* The cgroup is still accessible in the VFS, but
-		 * we're not going to try to rmdir() it at this
-		 * point. */
-		printk(KERN_INFO
-		       "Race in cgroup_clone() - leaking cgroup %s\n",
-		       nodename);
-		goto again;
-	}
-
-	/* do any required auto-setup */
-	for_each_subsys(root, ss) {
-		if (ss->post_clone)
-			ss->post_clone(ss, child);
-	}
-
-	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = cgroup_attach_task(child, tsk);
-	mutex_unlock(&cgroup_mutex);
-
- out_release:
-	mutex_unlock(&inode->i_mutex);
-
-	mutex_lock(&cgroup_mutex);
-	put_css_set(cg);
-	mutex_unlock(&cgroup_mutex);
-	deactivate_super(root->sb);
-	return ret;
+	if (cg)
+		put_css_set_taskexit(cg);
 }
 
 /**
@@ -4530,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 	return ret;
 }
 
-static void __free_css_id_cb(struct rcu_head *head)
-{
-	struct css_id *id;
-
-	id = container_of(head, struct css_id, rcu_head);
-	kfree(id);
-}
-
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 {
 	struct css_id *id = css->id;
@@ -4552,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
 	spin_unlock(&ss->id_lock);
-	call_rcu(&id->rcu_head, __free_css_id_cb);
+	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
 
@@ -4723,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
 	return ret;
 }
 
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+	struct cgroup *cgrp;
+	struct inode *inode;
+	struct cgroup_subsys_state *css;
+
+	inode = f->f_dentry->d_inode;
+	/* check in cgroup filesystem dir */
+	if (inode->i_op != &cgroup_dir_inode_operations)
+		return ERR_PTR(-EBADF);
+
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* get cgroup */
+	cgrp = __d_cgrp(f->f_dentry);
+	css = cgrp->subsys[id];
+	return css ? css : ERR_PTR(-ENOENT);
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 						struct cgroup *cont)
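cgroup_css_from_dir() lets code holding an open file descriptor on a cgroup directory recover the subsystem state behind it (perf's cgroup-event support is the kind of caller this serves). A hedged sketch of a caller, with an invented wrapper name and css_tryget() as the existing refcounting helper:

	/* illustrative caller: resolve an fd on a cgroupfs directory to a css */
	static struct cgroup_subsys_state *css_of_file_sketch(struct file *file, int id)
	{
		struct cgroup_subsys_state *css = cgroup_css_from_dir(file, id);

		if (IS_ERR(css))
			return css;		/* -EBADF, -EINVAL or -ENOENT */
		if (!css_tryget(css))		/* pin it before use */
			return ERR_PTR(-ENOENT);
		return css;
	}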