-rw-r--r--  arch/sparc/kernel/leon_pci_grpci2.c       1
-rw-r--r--  arch/sparc/kernel/sun4m_irq.c             2
-rw-r--r--  block/blk-cgroup.c                       11
-rw-r--r--  block/blk-cgroup.h                       14
-rw-r--r--  block/blk-throttle.c                      8
-rw-r--r--  block/cfq-iosched.c                       7
-rw-r--r--  fs/bio.c                                  2
-rw-r--r--  fs/kernfs/dir.c                           1
-rw-r--r--  include/linux/cgroup.h                  275
-rw-r--r--  include/linux/cgroup_subsys.h            30
-rw-r--r--  include/linux/hugetlb_cgroup.h            2
-rw-r--r--  include/linux/memcontrol.h                2
-rw-r--r--  include/net/cls_cgroup.h                  2
-rw-r--r--  include/net/netprio_cgroup.h             17
-rw-r--r--  init/Kconfig                              1
-rw-r--r--  kernel/cgroup.c                        3711
-rw-r--r--  kernel/cgroup_freezer.c                  40
-rw-r--r--  kernel/cpuset.c                         262
-rw-r--r--  kernel/events/core.c                     25
-rw-r--r--  kernel/exit.c                             2
-rw-r--r--  kernel/fork.c                             5
-rw-r--r--  kernel/sched/core.c                      10
-rw-r--r--  kernel/sched/cpuacct.c                    6
-rw-r--r--  kernel/sched/debug.c                      3
-rw-r--r--  mm/hugetlb_cgroup.c                      11
-rw-r--r--  mm/memcontrol.c                         110
-rw-r--r--  mm/memory-failure.c                       8
-rw-r--r--  net/Kconfig                               2
-rw-r--r--  net/core/netclassid_cgroup.c             15
-rw-r--r--  net/core/netprio_cgroup.c                41
-rw-r--r--  net/ipv4/tcp_memcontrol.c                 4
-rw-r--r--  security/device_cgroup.c                 12
32 files changed, 1908 insertions, 2734 deletions
diff --git a/arch/sparc/kernel/leon_pci_grpci2.c b/arch/sparc/kernel/leon_pci_grpci2.c
index 5f0402aab7fb..24d6a4446349 100644
--- a/arch/sparc/kernel/leon_pci_grpci2.c
+++ b/arch/sparc/kernel/leon_pci_grpci2.c
@@ -8,6 +8,7 @@
 #include <linux/of_device.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <asm/io.h>
diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c
index c5ade9d27a1d..8bb3b3fddea7 100644
--- a/arch/sparc/kernel/sun4m_irq.c
+++ b/arch/sparc/kernel/sun4m_irq.c
@@ -9,6 +9,8 @@
  * Copyright (C) 1996 Dave Redman (djhr@tadpole.co.uk)
  */
 
+#include <linux/slab.h>
+
 #include <asm/timer.h>
 #include <asm/traps.h>
 #include <asm/pgalloc.h>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b6e95b5e262f..e4a4145926f6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -894,7 +894,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 	int ret = 0;
 
 	/* task_lock() is needed to avoid races with exit_io_context() */
-	cgroup_taskset_for_each(task, css, tset) {
+	cgroup_taskset_for_each(task, tset) {
 		task_lock(task);
 		ioc = task->io_context;
 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -906,17 +906,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 	return ret;
 }
 
-struct cgroup_subsys blkio_subsys = {
-	.name = "blkio",
+struct cgroup_subsys blkio_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
-	.subsys_id = blkio_subsys_id,
 	.base_cftypes = blkcg_files,
-	.module = THIS_MODULE,
 };
-EXPORT_SYMBOL_GPL(blkio_subsys);
+EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1106,7 +1103,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 
 	/* everything is in place, add intf files for the new policy */
 	if (pol->cftypes)
-		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
+		WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
 	ret = 0;
 out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 604f6d99ab92..371fe8e92ab5 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -186,7 +186,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	return css_to_blkcg(task_css(tsk, blkio_subsys_id));
+	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -241,12 +241,16 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
  */
 static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 {
-	int ret;
+	char *p;
 
-	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-	if (ret)
+	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+	if (!p) {
 		strncpy(buf, "<unavailable>", buflen);
-	return ret;
+		return -ENAMETOOLONG;
+	}
+
+	memmove(buf, p, buf + buflen - p);
+	return 0;
 }
 
 /**
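Illustration only (not part of this patch): the rewritten blkg_path() above relies on the kernfs-style convention that cgroup_path()/kernfs_path() compose the path at the tail end of the caller's buffer and return a pointer to its first byte, or NULL when it does not fit. A hedged sketch of a caller under that assumption, with a hypothetical helper name, including the memmove() needed to land the string at buf[0]:

#include <linux/cgroup.h>
#include <linux/string.h>

/* hypothetical helper, not part of the kernel tree */
static int example_path_to_front(struct cgroup *cgrp, char *buf, int buflen)
{
	char *p;

	p = cgroup_path(cgrp, buf, buflen);	/* p points somewhere inside buf */
	if (!p)
		return -ENAMETOOLONG;		/* buffer too small for the path */

	memmove(buf, p, buf + buflen - p);	/* move path (and its NUL) to the front */
	return 0;
}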
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1474c3ab7e72..033745cd7fba 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1408,13 +1408,13 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
 }
 
 static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
-			   const char *buf)
+			   char *buf)
 {
 	return tg_set_conf(css, cft, buf, true);
 }
 
 static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
-			    const char *buf)
+			    char *buf)
 {
 	return tg_set_conf(css, cft, buf, false);
 }
@@ -1425,28 +1425,24 @@ static struct cftype throtl_files[] = {
 		.private = offsetof(struct throtl_grp, bps[READ]),
 		.seq_show = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_bps_device",
 		.private = offsetof(struct throtl_grp, bps[WRITE]),
 		.seq_show = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.read_iops_device",
 		.private = offsetof(struct throtl_grp, iops[READ]),
 		.seq_show = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_iops_device",
 		.private = offsetof(struct throtl_grp, iops[WRITE]),
 		.seq_show = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.io_service_bytes",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5873e4ada9eb..e0985f1955e7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1701,13 +1701,13 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
 }
 
 static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
-				  struct cftype *cft, const char *buf)
+				  struct cftype *cft, char *buf)
 {
 	return __cfqg_set_weight_device(css, cft, buf, false);
 }
 
 static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
-					struct cftype *cft, const char *buf)
+					struct cftype *cft, char *buf)
 {
 	return __cfqg_set_weight_device(css, cft, buf, true);
 }
@@ -1838,7 +1838,6 @@ static struct cftype cfq_blkcg_files[] = {
 		.flags = CFTYPE_ONLY_ON_ROOT,
 		.seq_show = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
-		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
@@ -1853,7 +1852,6 @@ static struct cftype cfq_blkcg_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cfqg_print_weight_device,
 		.write_string = cfqg_set_weight_device,
-		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
@@ -1866,7 +1864,6 @@ static struct cftype cfq_blkcg_files[] = {
 		.name = "leaf_weight_device",
 		.seq_show = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
-		.max_write_len = 256,
 	},
 	{
 		.name = "leaf_weight",
diff --git a/fs/bio.c b/fs/bio.c
index b2dd42ed9edd..b1bc722b89aa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1969,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
 
 	/* associate blkcg if exists */
 	rcu_read_lock();
-	css = task_css(current, blkio_subsys_id);
+	css = task_css(current, blkio_cgrp_id);
 	if (css && css_tryget(css))
 		bio->bi_css = css;
 	rcu_read_unlock();
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 0bd05ab26003..78f3403300af 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -112,6 +112,7 @@ char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
 	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
 	return p;
 }
+EXPORT_SYMBOL_GPL(kernfs_path);
 
 /**
  * pr_cont_kernfs_name - pr_cont name of a kernfs_node
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9450f025fe0c..c2515851c1aa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,18 +14,17 @@
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/cgroupstats.h>
-#include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
 #include <linux/workqueue.h>
-#include <linux/xattr.h>
 #include <linux/fs.h>
 #include <linux/percpu-refcount.h>
 #include <linux/seq_file.h>
+#include <linux/kernfs.h>
 
 #ifdef CONFIG_CGROUPS
 
-struct cgroupfs_root;
+struct cgroup_root;
 struct cgroup_subsys;
 struct inode;
 struct cgroup;
@@ -34,31 +33,16 @@ extern int cgroup_init_early(void);
 extern int cgroup_init(void);
 extern void cgroup_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_exit(struct task_struct *p);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
-extern int cgroup_load_subsys(struct cgroup_subsys *ss);
-extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
 
 extern int proc_cgroup_show(struct seq_file *, void *);
 
-/*
- * Define the enumeration of all cgroup subsystems.
- *
- * We define ids for builtin subsystems and then modular ones.
- */
-#define SUBSYS(_x) _x ## _subsys_id,
+/* define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _cgrp_id,
 enum cgroup_subsys_id {
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-#include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
-	CGROUP_BUILTIN_SUBSYS_COUNT,
-
-	__CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1,
-
-#define IS_SUBSYS_ENABLED(option) IS_MODULE(option)
 #include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
 	CGROUP_SUBSYS_COUNT,
 };
 #undef SUBSYS
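For orientation only (not part of the diff): with the single-pass definition above, every SUBSYS(name) line in cgroup_subsys.h now expands to one name_cgrp_id enumerator; which entries exist depends on the kernel configuration. A hedged sketch of the expansion for a config with cpuset, memory and blkio enabled:

enum cgroup_subsys_id {
	cpuset_cgrp_id,		/* from SUBSYS(cpuset) */
	memory_cgrp_id,		/* from SUBSYS(memory) */
	blkio_cgrp_id,		/* from SUBSYS(blkio)  */
	CGROUP_SUBSYS_COUNT,
};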
@@ -153,11 +137,6 @@ enum {
 	CGRP_SANE_BEHAVIOR,
 };
 
-struct cgroup_name {
-	struct rcu_head rcu_head;
-	char name[];
-};
-
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
@@ -174,16 +153,17 @@ struct cgroup {
 	/* the number of attached css's */
 	int nr_css;
 
+	atomic_t refcnt;
+
 	/*
 	 * We link our 'sibling' struct into our parent's 'children'.
 	 * Our children link their 'sibling' into our 'children'.
 	 */
 	struct list_head sibling;	/* my parent's children */
 	struct list_head children;	/* my children */
-	struct list_head files;		/* my files */
 
 	struct cgroup *parent;		/* my parent */
-	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
+	struct kernfs_node *kn;		/* cgroup kernfs entry */
 
 	/*
 	 * Monotonically increasing unique serial number which defines a
@@ -193,23 +173,13 @@ struct cgroup {
 	 */
 	u64 serial_nr;
 
-	/*
-	 * This is a copy of dentry->d_name, and it's needed because
-	 * we can't use dentry->d_name in cgroup_path().
-	 *
-	 * You must acquire rcu_read_lock() to access cgrp->name, and
-	 * the only place that can change it is rename(), which is
-	 * protected by parent dir's i_mutex.
-	 *
-	 * Normally you should use cgroup_name() wrapper rather than
-	 * access it directly.
-	 */
-	struct cgroup_name __rcu *name;
+	/* The bitmask of subsystems attached to this cgroup */
+	unsigned long subsys_mask;
 
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
 
-	struct cgroupfs_root *root;
+	struct cgroup_root *root;
 
 	/*
 	 * List of cgrp_cset_links pointing at css_sets with tasks in this
@@ -237,14 +207,11 @@ struct cgroup {
 	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
-
-	/* directory xattrs */
-	struct simple_xattrs xattrs;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
 
-/* cgroupfs_root->flags */
+/* cgroup_root->flags */
 enum {
 	/*
 	 * Unfortunately, cgroup core and various controllers are riddled
@@ -262,8 +229,8 @@ enum {
 	 *
 	 * The followings are the behaviors currently affected this flag.
 	 *
-	 * - Mount options "noprefix" and "clone_children" are disallowed.
-	 *   Also, cgroupfs file cgroup.clone_children is not created.
+	 * - Mount options "noprefix", "xattr", "clone_children",
+	 *   "release_agent" and "name" are disallowed.
 	 *
 	 * - When mounting an existing superblock, mount options should
 	 *   match.
@@ -281,6 +248,11 @@ enum {
 	 * - "release_agent" and "notify_on_release" are removed.
 	 *   Replacement notification mechanism will be implemented.
 	 *
+	 * - "cgroup.clone_children" is removed.
+	 *
+	 * - If mount is requested with sane_behavior but without any
+	 *   subsystem, the default unified hierarchy is mounted.
+	 *
 	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
 	 *   and take masks of ancestors with non-empty cpus/mems, instead of
 	 *   being moved to an ancestor.
@@ -300,29 +272,24 @@ enum {
 
 	/* mount options live below bit 16 */
 	CGRP_ROOT_OPTION_MASK	= (1 << 16) - 1,
-
-	CGRP_ROOT_SUBSYS_BOUND	= (1 << 16), /* subsystems finished binding */
 };
 
 /*
- * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
- * associated with a superblock to form an active hierarchy.  This is
+ * A cgroup_root represents the root of a cgroup hierarchy, and may be
+ * associated with a kernfs_root to form an active hierarchy.  This is
  * internal to cgroup core.  Don't access directly from controllers.
  */
-struct cgroupfs_root {
-	struct super_block *sb;
-
-	/* The bitmask of subsystems attached to this hierarchy */
-	unsigned long subsys_mask;
+struct cgroup_root {
+	struct kernfs_root *kf_root;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-	/* The root cgroup for this hierarchy */
-	struct cgroup top_cgroup;
+	/* The root cgroup.  Root is destroyed on its release. */
+	struct cgroup cgrp;
 
-	/* Tracks how many cgroups are currently defined in hierarchy.*/
-	int number_of_cgroups;
+	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
+	atomic_t nr_cgrps;
 
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
@@ -360,10 +327,14 @@ struct css_set {
 	struct hlist_node hlist;
 
 	/*
-	 * List running through all tasks using this cgroup
-	 * group. Protected by css_set_lock
+	 * Lists running through all tasks using this cgroup group.
+	 * mg_tasks lists tasks which belong to this cset but are in the
+	 * process of being migrated out or in.  Protected by
+	 * css_set_rwsem, but, during migration, once tasks are moved to
+	 * mg_tasks, it can be read safely while holding cgroup_mutex.
 	 */
 	struct list_head tasks;
+	struct list_head mg_tasks;
 
 	/*
 	 * List of cgrp_cset_links pointing at cgroups referenced from this
@@ -372,13 +343,29 @@ struct css_set {
 	struct list_head cgrp_links;
 
 	/*
-	 * Set of subsystem states, one for each subsystem. This array
-	 * is immutable after creation apart from the init_css_set
-	 * during subsystem registration (at boot time) and modular subsystem
-	 * loading/unloading.
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
+	/*
+	 * List of csets participating in the on-going migration either as
+	 * source or destination.  Protected by cgroup_mutex.
+	 */
+	struct list_head mg_preload_node;
+	struct list_head mg_node;
+
+	/*
+	 * If this cset is acting as the source of migration the following
+	 * two fields are set.  mg_src_cgrp is the source cgroup of the
+	 * on-going migration and mg_dst_cset is the destination cset the
+	 * target tasks on this cset should be migrated to.  Protected by
+	 * cgroup_mutex.
+	 */
+	struct cgroup *mg_src_cgrp;
+	struct css_set *mg_dst_cset;
+
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
 };
@@ -397,6 +384,7 @@ enum {
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
 	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
+	CFTYPE_ONLY_ON_DFL	= (1 << 4),	/* only on default hierarchy */
 };
 
 #define MAX_CFTYPE_NAME		64
@@ -416,8 +404,9 @@ struct cftype {
 	umode_t mode;
 
 	/*
-	 * If non-zero, defines the maximum length of string that can
-	 * be passed to write_string; defaults to 64
+	 * The maximum length of string, excluding trailing nul, that can
+	 * be passed to write_string.  If < PAGE_SIZE-1, PAGE_SIZE-1 is
+	 * assumed.
 	 */
 	size_t max_write_len;
 
@@ -425,10 +414,12 @@ struct cftype {
 	unsigned int flags;
 
 	/*
-	 * The subsys this file belongs to. Initialized automatically
-	 * during registration. NULL for cgroup core files.
+	 * Fields used for internal bookkeeping.  Initialized automatically
+	 * during registration.
 	 */
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
+	struct list_head node;		/* anchored at ss->cfts */
+	struct kernfs_ops *kf_ops;
 
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
@@ -467,7 +458,7 @@ struct cftype {
 	 * Returns 0 or -ve error code.
 	 */
 	int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
-			    const char *buffer);
+			    char *buffer);
 	/*
 	 * trigger() callback can be used to get some kick from the
 	 * userspace, when the actual string written is not important
@@ -475,37 +466,18 @@ struct cftype {
 	 * kick type for multiplexing.
 	 */
 	int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
-};
 
-/*
- * cftype_sets describe cftypes belonging to a subsystem and are chained at
- * cgroup_subsys->cftsets.  Each cftset points to an array of cftypes
- * terminated by zero length name.
- */
-struct cftype_set {
-	struct list_head		node;	/* chained at subsys->cftsets */
-	struct cftype			*cfts;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key	lockdep_key;
+#endif
 };
 
-/*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.  Don't
- * access directly.
- */
-struct cfent {
-	struct list_head		node;
-	struct dentry			*dentry;
-	struct cftype			*type;
-	struct cgroup_subsys_state	*css;
-
-	/* file xattrs */
-	struct simple_xattrs		xattrs;
-};
+extern struct cgroup_root cgrp_dfl_root;
 
-/* seq_file->private points to the following, only ->priv is public */
-struct cgroup_open_file {
-	struct cfent			*cfe;
-	void				*priv;
-};
+static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	return cgrp->root == &cgrp_dfl_root;
+}
 
 /*
  * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
@@ -516,34 +488,63 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
 	return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
 }
 
-/* Caller should hold rcu_read_lock() */
-static inline const char *cgroup_name(const struct cgroup *cgrp)
+/* no synchronization, the result can only be used as a hint */
+static inline bool cgroup_has_tasks(struct cgroup *cgrp)
 {
-	return rcu_dereference(cgrp->name)->name;
+	return !list_empty(&cgrp->cset_links);
 }
 
-static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+/* returns ino associated with a cgroup, 0 indicates unmounted root */
+static inline ino_t cgroup_ino(struct cgroup *cgrp)
 {
-	struct cgroup_open_file *of = seq->private;
-	return of->cfe->css;
+	if (cgrp->kn)
+		return cgrp->kn->ino;
+	else
+		return 0;
 }
 
 static inline struct cftype *seq_cft(struct seq_file *seq)
 {
-	struct cgroup_open_file *of = seq->private;
-	return of->cfe->type;
+	struct kernfs_open_file *of = seq->private;
+
+	return of->kn->priv;
+}
+
+struct cgroup_subsys_state *seq_css(struct seq_file *seq);
+
+/*
+ * Name / path handling functions.  All are thin wrappers around the kernfs
+ * counterparts and can be called under any context.
+ */
+
+static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
+{
+	return kernfs_name(cgrp->kn, buf, buflen);
 }
 
+static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
+					      size_t buflen)
+{
+	return kernfs_path(cgrp->kn, buf, buflen);
+}
+
+static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
+{
+	pr_cont_kernfs_name(cgrp->kn);
+}
+
+static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
+{
+	pr_cont_kernfs_path(cgrp->kn);
+}
+
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cftype *cfts);
 
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-
-int cgroup_task_count(const struct cgroup *cgrp);
-
 /*
  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
  * methods.
@@ -551,22 +552,15 @@ int cgroup_task_count(const struct cgroup *cgrp);
 struct cgroup_taskset;
 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
-struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
-						   int subsys_id);
-int cgroup_taskset_size(struct cgroup_taskset *tset);
 
 /**
  * cgroup_taskset_for_each - iterate cgroup_taskset
  * @task: the loop cursor
- * @skip_css: skip if task's css matches this, %NULL to iterate through all
  * @tset: taskset to iterate
  */
-#define cgroup_taskset_for_each(task, skip_css, tset)			\
+#define cgroup_taskset_for_each(task, tset)				\
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
-	     (task) = cgroup_taskset_next((tset)))			\
-		if (!(skip_css) ||					\
-		    cgroup_taskset_cur_css((tset),			\
-			(skip_css)->ss->subsys_id) != (skip_css))
+	     (task) = cgroup_taskset_next((tset)))
 
 /*
  * Control Group subsystem type.
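A usage sketch (not from this patch): a controller's can_attach() callback now walks every task in the set unconditionally instead of filtering on a css argument. The callback name and the kthread check are hypothetical:

static int example_can_attach(struct cgroup_subsys_state *css,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, tset) {
		/* reject kernel threads, purely as an illustration */
		if (task->flags & PF_KTHREAD)
			return -EINVAL;
	}
	return 0;
}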
@@ -591,7 +585,6 @@ struct cgroup_subsys {
 		     struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
-	int subsys_id;
 	int disabled;
 	int early_init;
 
@@ -610,27 +603,26 @@ struct cgroup_subsys {
 	bool broken_hierarchy;
 	bool warned_broken_hierarchy;
 
+	/* the following two fields are initialized automtically during boot */
+	int id;
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
 	/* link to parent, protected by cgroup_lock() */
-	struct cgroupfs_root *root;
+	struct cgroup_root *root;
 
-	/* list of cftype_sets */
-	struct list_head cftsets;
+	/*
+	 * List of cftypes.  Each entry is the first entry of an array
+	 * terminated by zero length name.
+	 */
+	struct list_head cfts;
 
-	/* base cftypes, automatically [de]registered with subsys itself */
+	/* base cftypes, automatically registered with subsys itself */
 	struct cftype *base_cftypes;
-	struct cftype_set base_cftset;
-
-	/* should be defined only by modular subsystems */
-	struct module *module;
 };
 
-#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
-#undef IS_SUBSYS_ENABLED
 #undef SUBSYS
 
 /**
@@ -661,10 +653,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
+extern struct rw_semaphore css_set_rwsem;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
-				lockdep_is_held(&(task)->alloc_lock) ||	\
-				lockdep_is_held(&cgroup_mutex) || (__c))
+				lockdep_is_held(&cgroup_mutex) ||	\
+				lockdep_is_held(&css_set_rwsem) ||	\
+				((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)					\
 	rcu_dereference((task)->cgroups)
@@ -837,16 +831,11 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 struct task_struct *css_task_iter_next(struct css_task_iter *it);
 void css_task_iter_end(struct css_task_iter *it);
 
-int css_scan_tasks(struct cgroup_subsys_state *css,
-		   bool (*test)(struct task_struct *, void *),
-		   void (*process)(struct task_struct *, void *),
-		   void *data, struct ptr_heap *heap);
-
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
-					 struct cgroup_subsys *ss);
+struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
+						struct cgroup_subsys *ss);
 
 #else /* !CONFIG_CGROUPS */
 
@@ -854,7 +843,7 @@ static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
-static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_exit(struct task_struct *p) {}
 
 static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 7b99d717411d..768fe44e19f0 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,51 +3,51 @@
  *
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
-#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS)
+#if IS_ENABLED(CONFIG_CPUSETS)
 SUBSYS(cpuset)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG)
+#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
 SUBSYS(debug)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED)
-SUBSYS(cpu_cgroup)
+#if IS_ENABLED(CONFIG_CGROUP_SCHED)
+SUBSYS(cpu)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT)
+#if IS_ENABLED(CONFIG_CGROUP_CPUACCT)
 SUBSYS(cpuacct)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_MEMCG)
-SUBSYS(mem_cgroup)
+#if IS_ENABLED(CONFIG_MEMCG)
+SUBSYS(memory)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
+#if IS_ENABLED(CONFIG_CGROUP_DEVICE)
 SUBSYS(devices)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER)
+#if IS_ENABLED(CONFIG_CGROUP_FREEZER)
 SUBSYS(freezer)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
 SUBSYS(net_cls)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP)
+#if IS_ENABLED(CONFIG_BLK_CGROUP)
 SUBSYS(blkio)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
-SUBSYS(perf)
+#if IS_ENABLED(CONFIG_CGROUP_PERF)
+SUBSYS(perf_event)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 SUBSYS(net_prio)
 #endif
 
-#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB)
+#if IS_ENABLED(CONFIG_CGROUP_HUGETLB)
 SUBSYS(hugetlb)
 #endif
 /*
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 787bba3bf552..0129f89cf98d 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -49,7 +49,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
 
 static inline bool hugetlb_cgroup_disabled(void)
 {
-	if (hugetlb_subsys.disabled)
+	if (hugetlb_cgrp_subsys.disabled)
 		return true;
 	return false;
 }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index abd0113b6620..eccfb4a4b379 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -162,7 +162,7 @@ extern int do_swap_account;
 
 static inline bool mem_cgroup_disabled(void)
 {
-	if (mem_cgroup_subsys.disabled)
+	if (memory_cgrp_subsys.disabled)
 		return true;
 	return false;
 }
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 9cf2d5ef38d9..c15d39456e14 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -34,7 +34,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
 		return 0;
 
 	rcu_read_lock();
-	classid = container_of(task_css(p, net_cls_subsys_id),
+	classid = container_of(task_css(p, net_cls_cgrp_id),
 			       struct cgroup_cls_state, css)->classid;
 	rcu_read_unlock();
 
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index dafc09f0fdbc..f2a9597ff53c 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,32 +27,17 @@ struct netprio_map {
 
 void sock_update_netprioidx(struct sock *sk);
 
-#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO)
 static inline u32 task_netprioidx(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
 	u32 idx;
 
 	rcu_read_lock();
-	css = task_css(p, net_prio_subsys_id);
+	css = task_css(p, net_prio_cgrp_id);
 	idx = css->cgroup->id;
 	rcu_read_unlock();
 	return idx;
 }
-#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO)
-static inline u32 task_netprioidx(struct task_struct *p)
-{
-	struct cgroup_subsys_state *css;
-	u32 idx = 0;
-
-	rcu_read_lock();
-	css = task_css(p, net_prio_subsys_id);
-	if (css)
-		idx = css->cgroup->id;
-	rcu_read_unlock();
-	return idx;
-}
-#endif
 #else /* !CONFIG_CGROUP_NET_PRIO */
 static inline u32 task_netprioidx(struct task_struct *p)
 {
diff --git a/init/Kconfig b/init/Kconfig
index d56cb03c1b49..62b66acfdb30 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -854,6 +854,7 @@ config NUMA_BALANCING
 
 menuconfig CGROUPS
 	boolean "Control Group support"
+	select KERNFS
 	help
 	  This option adds support for grouping sets of processes together, for
 	  use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0c753ddd223b..fede3d3f28ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,20 @@
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
-#include <linux/backing-dev.h>
 #include <linux/slab.h>
-#include <linux/magic.h>
 #include <linux/spinlock.h>
+#include <linux/rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
-#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
-#include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 #include <linux/atomic.h>
 
@@ -68,43 +65,49 @@
  */
 #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
 
+#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
+					 MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
+ * creation/removal and hierarchy changing operations including cgroup
+ * creation, removal, css association and controller rebinding.  This outer
+ * lock is needed mainly to resolve the circular dependency between kernfs
+ * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
+ */
+static DEFINE_MUTEX(cgroup_tree_mutex);
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on.  Modifying requires both cgroup_mutex and
- * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
- * break the following locking order cycle.
- *
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
  *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
 #endif
 
-static DEFINE_MUTEX(cgroup_root_mutex);
+/*
+ * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
 
-#define cgroup_assert_mutex_or_rcu_locked()				\
+#define cgroup_assert_mutexes_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
+			   lockdep_is_held(&cgroup_tree_mutex) ||	\
 			   lockdep_is_held(&cgroup_mutex),		\
-			   "cgroup_mutex or RCU read lock required");
-
-#ifdef CONFIG_LOCKDEP
-#define cgroup_assert_mutex_or_root_locked()				\
-	WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) &&	\
-				     !lockdep_is_held(&cgroup_root_mutex)))
-#else
-#define cgroup_assert_mutex_or_root_locked()	do { } while (0)
-#endif
+			   "cgroup_[tree_]mutex or RCU read lock required");
 
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,42 +123,41 @@ static struct workqueue_struct *cgroup_destroy_wq;
  */
 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 
-/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated with the built in subsystems, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
- */
-#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
 #include <linux/cgroup_subsys.h>
 };
+#undef SUBSYS
 
 /*
- * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-static struct cgroupfs_root cgroup_dummy_root;
+struct cgroup_root cgrp_dfl_root;
 
-/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
-static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time.  This is for backward compatibility.
+ */
+static bool cgrp_dfl_root_visible;
 
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 
-/*
- * Hierarchy ID allocation and mapping.  It follows the same exclusion
- * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
- * writes, either for reads.
- */
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 static DEFINE_IDR(cgroup_hierarchy_idr);
 
-static struct cgroup_name root_cgroup_name = { .name = "/" };
-
 /*
  * Assign a monotonically increasing serial number to cgroups.  It
  * guarantees cgroups with bigger numbers are newer than those with smaller
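For reference (not part of the patch): the two back-to-back includes above expand, per enabled subsystem, into parallel pointer and name tables indexed by the same *_cgrp_id. A hedged sketch of what the preprocessor roughly produces for a config with cpuset and memory enabled:

static struct cgroup_subsys *cgroup_subsys[] = {
	[cpuset_cgrp_id] = &cpuset_cgrp_subsys,
	[memory_cgrp_id] = &memory_cgrp_subsys,
};
static const char *cgroup_subsys_name[] = {
	[cpuset_cgrp_id] = "cpuset",
	[memory_cgrp_id] = "memory",
};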
@@ -175,11 +177,13 @@ static int need_forkexit_callback __read_mostly;
 
 static struct cftype cgroup_base_files[];
 
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+			     unsigned long ss_mask);
 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
-static int cgroup_file_release(struct inode *inode, struct file *file);
 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /**
@@ -197,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss)
 {
 	if (ss)
-		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
-					     lockdep_is_held(&cgroup_mutex));
+		return rcu_dereference_check(cgrp->subsys[ss->id],
+					lockdep_is_held(&cgroup_tree_mutex) ||
+					lockdep_is_held(&cgroup_mutex));
 	else
 		return &cgrp->dummy_css;
 }
@@ -209,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
+struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+{
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp = of->kn->parent->priv;
+	struct cftype *cft = seq_cft(seq);
+
+	/*
+	 * This is open and unprotected implementation of cgroup_css().
+	 * seq_css() is only called from a kernfs file operation which has
+	 * an active reference on the file.  Because all the subsystem
+	 * files are drained before a css is disassociated with a cgroup,
+	 * the matching css from the cgroup's subsys table is guaranteed to
+	 * be and stay valid until the enclosing operation is complete.
+	 */
+	if (cft->ss)
+		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+	else
+		return &cgrp->dummy_css;
+}
+EXPORT_SYMBOL_GPL(seq_css);
+
 /**
  * cgroup_is_descendant - test ancestry
  * @cgrp: the cgroup to be tested
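A consumer-side sketch (not in the patch, names hypothetical): a cftype ->seq_show() handler recovers its css through seq_css(), which is exactly the lookup the new function above performs under the kernfs active reference:

static int example_seq_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);

	seq_printf(sf, "cgroup id %d\n", css->cgroup->id);
	return 0;
}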
@@ -227,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 	}
 	return false;
 }
-EXPORT_SYMBOL_GPL(cgroup_is_descendant);
 
 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
@@ -254,54 +279,23 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
 		if (!((css) = rcu_dereference_check(			\
 				(cgrp)->subsys[(ssid)],			\
+				lockdep_is_held(&cgroup_tree_mutex) ||	\
 				lockdep_is_held(&cgroup_mutex)))) { }	\
 		else
 
 /**
- * for_each_subsys - iterate all loaded cgroup subsystems
+ * for_each_subsys - iterate all enabled cgroup subsystems
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- *
- * Iterates through all loaded subsystems.  Should be called under
- * cgroup_mutex or cgroup_root_mutex.
  */
 #define for_each_subsys(ss, ssid)					\
-	for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; });	\
-	     (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)			\
-		if (!((ss) = cgroup_subsys[(ssid)])) { }		\
-		else
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
-/**
- * for_each_builtin_subsys - iterate all built-in cgroup subsystems
- * @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
- *
- * Bulit-in subsystems are always present and iteration itself doesn't
- * require any synchronization.
- */
-#define for_each_builtin_subsys(ss, i)					\
-	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
-	     (((ss) = cgroup_subsys[i]) || true); (i)++)
-
-/* iterate across the active hierarchies */
-#define for_each_active_root(root)					\
+/* iterate across the hierarchies */
+#define for_each_root(root)						\
 	list_for_each_entry((root), &cgroup_roots, root_list)
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cfent *__d_cfe(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-	return __d_cfe(dentry)->type;
-}
-
 /**
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
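A small sketch (not part of the diff, function name hypothetical) of the rewritten iterator in use; because the subsystem array is now fixed at build time, walking it needs no lock assertion:

static void example_list_subsystems(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		pr_info("subsys %s has id %d (%p)\n",
			cgroup_subsys_name[ssid], ssid, ss);
}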
@@ -347,23 +341,23 @@ struct cgrp_cset_link {
 	struct list_head cgrp_link;
 };
 
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
  * for each subsystem. Also used to anchor the list of css_sets. Not
  * reference-counted, to improve performance when child cgroups
  * haven't been created.
  */
+static struct css_set init_css_set = {
+	.refcount		= ATOMIC_INIT(1),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
+	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
+	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
+	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+};
 
-static struct css_set init_css_set;
-static struct cgrp_cset_link init_cgrp_cset_link;
-
-/*
- * css_set_lock protects the list of css_set objects, and the chain of
- * tasks off each css_set.  Nests outside task->alloc_lock due to
- * css_task_iter_start().
- */
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+static int css_set_count	= 1;	/* 1 for init_css_set */
 
 /*
  * hash table for cgroup groups. This improves the performance to find
@@ -386,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-/*
- * We don't maintain the lists running through each css_set to its task
- * until after the first call to css_task_iter_start(). This reduces the
- * fork()/exit() overhead for people who have cgroups compiled into their
- * kernel but not actually in use.
- */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cset, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 
-	/*
-	 * Ensure that the refcount doesn't hit zero while any readers
-	 * can see it. Similar to atomic_dec_and_lock(), but for an
-	 * rwlock
-	 */
-	if (atomic_add_unless(&cset->refcount, -1, 1))
-		return;
-	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cset->refcount)) {
-		write_unlock(&css_set_lock);
+	lockdep_assert_held(&css_set_rwsem);
+
+	if (!atomic_dec_and_test(&cset->refcount))
 		return;
-	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
 	hash_del(&cset->hlist);
@@ -421,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
421 list_del(&link->cset_link); 399 list_del(&link->cset_link);
422 list_del(&link->cgrp_link); 400 list_del(&link->cgrp_link);
423 401
424 /* @cgrp can't go away while we're holding css_set_lock */ 402 /* @cgrp can't go away while we're holding css_set_rwsem */
425 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
426 if (taskexit) 404 if (taskexit)
427 set_bit(CGRP_RELEASABLE, &cgrp->flags); 405 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
431 kfree(link); 409 kfree(link);
432 } 410 }
433 411
434 write_unlock(&css_set_lock);
435 kfree_rcu(cset, rcu_head); 412 kfree_rcu(cset, rcu_head);
436} 413}
437 414
415static void put_css_set(struct css_set *cset, bool taskexit)
416{
417 /*
418 * Ensure that the refcount doesn't hit zero while any readers
419 * can see it. Similar to atomic_dec_and_lock(), but for an
420 * rwlock
421 */
422 if (atomic_add_unless(&cset->refcount, -1, 1))
423 return;
424
425 down_write(&css_set_rwsem);
426 put_css_set_locked(cset, taskexit);
427 up_write(&css_set_rwsem);
428}
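To illustrate the split above (hypothetical callers, not part of the patch): put_css_set_locked() is for paths that already hold css_set_rwsem for writing, while put_css_set() only takes the lock itself when the reference might actually drop to zero:

        /* caller already holds css_set_rwsem for writing */
        put_css_set_locked(cset, false);

        /* caller holds no locks; the fast path avoids css_set_rwsem */
        put_css_set(cset, false);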
429
438/* 430/*
439 * refcounted get/put for css_set objects 431 * refcounted get/put for css_set objects
440 */ 432 */
@@ -443,16 +435,6 @@ static inline void get_css_set(struct css_set *cset)
443 atomic_inc(&cset->refcount); 435 atomic_inc(&cset->refcount);
444} 436}
445 437
446static inline void put_css_set(struct css_set *cset)
447{
448 __put_css_set(cset, 0);
449}
450
451static inline void put_css_set_taskexit(struct css_set *cset)
452{
453 __put_css_set(cset, 1);
454}
455
456/** 438/**
457 * compare_css_sets - helper function for find_existing_css_set(). 439 * compare_css_sets - helper function for find_existing_css_set().
458 * @cset: candidate css_set being tested 440 * @cset: candidate css_set being tested
@@ -535,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
535 struct cgroup *cgrp, 517 struct cgroup *cgrp,
536 struct cgroup_subsys_state *template[]) 518 struct cgroup_subsys_state *template[])
537{ 519{
538 struct cgroupfs_root *root = cgrp->root; 520 struct cgroup_root *root = cgrp->root;
539 struct cgroup_subsys *ss; 521 struct cgroup_subsys *ss;
540 struct css_set *cset; 522 struct css_set *cset;
541 unsigned long key; 523 unsigned long key;
@@ -547,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
547 * won't change, so no need for locking. 529 * won't change, so no need for locking.
548 */ 530 */
549 for_each_subsys(ss, i) { 531 for_each_subsys(ss, i) {
550 if (root->subsys_mask & (1UL << i)) { 532 if (root->cgrp.subsys_mask & (1UL << i)) {
551 /* Subsystem is in this hierarchy. So we want 533 /* Subsystem is in this hierarchy. So we want
552 * the subsystem state from the new 534 * the subsystem state from the new
553 * cgroup */ 535 * cgroup */
@@ -652,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
652 634
653 /* First see if we already have a cgroup group that matches 635 /* First see if we already have a cgroup group that matches
654 * the desired set */ 636 * the desired set */
655 read_lock(&css_set_lock); 637 down_read(&css_set_rwsem);
656 cset = find_existing_css_set(old_cset, cgrp, template); 638 cset = find_existing_css_set(old_cset, cgrp, template);
657 if (cset) 639 if (cset)
658 get_css_set(cset); 640 get_css_set(cset);
659 read_unlock(&css_set_lock); 641 up_read(&css_set_rwsem);
660 642
661 if (cset) 643 if (cset)
662 return cset; 644 return cset;
@@ -674,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
674 atomic_set(&cset->refcount, 1); 656 atomic_set(&cset->refcount, 1);
675 INIT_LIST_HEAD(&cset->cgrp_links); 657 INIT_LIST_HEAD(&cset->cgrp_links);
676 INIT_LIST_HEAD(&cset->tasks); 658 INIT_LIST_HEAD(&cset->tasks);
659 INIT_LIST_HEAD(&cset->mg_tasks);
660 INIT_LIST_HEAD(&cset->mg_preload_node);
661 INIT_LIST_HEAD(&cset->mg_node);
677 INIT_HLIST_NODE(&cset->hlist); 662 INIT_HLIST_NODE(&cset->hlist);
678 663
679 /* Copy the set of subsystem state objects generated in 664 /* Copy the set of subsystem state objects generated in
680 * find_existing_css_set() */ 665 * find_existing_css_set() */
681 memcpy(cset->subsys, template, sizeof(cset->subsys)); 666 memcpy(cset->subsys, template, sizeof(cset->subsys));
682 667
683 write_lock(&css_set_lock); 668 down_write(&css_set_rwsem);
684 /* Add reference counts and links from the new css_set. */ 669 /* Add reference counts and links from the new css_set. */
685 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 670 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
686 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
@@ -698,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset,
698 key = css_set_hash(cset->subsys); 683 key = css_set_hash(cset->subsys);
699 hash_add(css_set_table, &cset->hlist, key); 684 hash_add(css_set_table, &cset->hlist, key);
700 685
701 write_unlock(&css_set_lock); 686 up_write(&css_set_rwsem);
702 687
703 return cset; 688 return cset;
704} 689}
705 690
706/* 691static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
707 * Return the cgroup for "task" from the given hierarchy. Must be
708 * called with cgroup_mutex held.
709 */
710static struct cgroup *task_cgroup_from_root(struct task_struct *task,
711 struct cgroupfs_root *root)
712{ 692{
713 struct css_set *cset; 693 struct cgroup *root_cgrp = kf_root->kn->priv;
714 struct cgroup *res = NULL; 694
695 return root_cgrp->root;
696}
697
698static int cgroup_init_root_id(struct cgroup_root *root)
699{
700 int id;
701
702 lockdep_assert_held(&cgroup_mutex);
703
704 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
705 if (id < 0)
706 return id;
707
708 root->hierarchy_id = id;
709 return 0;
710}
711
712static void cgroup_exit_root_id(struct cgroup_root *root)
713{
714 lockdep_assert_held(&cgroup_mutex);
715
716 if (root->hierarchy_id) {
717 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
718 root->hierarchy_id = 0;
719 }
720}
721
722static void cgroup_free_root(struct cgroup_root *root)
723{
724 if (root) {
725 /* hierarchy ID should already have been released */
726 WARN_ON_ONCE(root->hierarchy_id);
727
728 idr_destroy(&root->cgroup_idr);
729 kfree(root);
730 }
731}
732
733static void cgroup_destroy_root(struct cgroup_root *root)
734{
735 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link;
737
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex);
740
741 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children));
743
744 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
715 746
716 BUG_ON(!mutex_is_locked(&cgroup_mutex));
717 read_lock(&css_set_lock);
718 /* 747 /*
719 * No need to lock the task - since we hold cgroup_mutex the 748 * Release all the links from cset_links to this hierarchy's
720 * task can't change groups, so the only thing that can happen 749 * root cgroup
721 * is that it exits and its css is set back to init_css_set.
722 */ 750 */
723 cset = task_css_set(task); 751 down_write(&css_set_rwsem);
752
753 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
754 list_del(&link->cset_link);
755 list_del(&link->cgrp_link);
756 kfree(link);
757 }
758 up_write(&css_set_rwsem);
759
760 if (!list_empty(&root->root_list)) {
761 list_del(&root->root_list);
762 cgroup_root_count--;
763 }
764
765 cgroup_exit_root_id(root);
766
767 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769
770 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root);
772}
773
774/* look up cgroup associated with given css_set on the specified hierarchy */
775static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
776 struct cgroup_root *root)
777{
778 struct cgroup *res = NULL;
779
780 lockdep_assert_held(&cgroup_mutex);
781 lockdep_assert_held(&css_set_rwsem);
782
724 if (cset == &init_css_set) { 783 if (cset == &init_css_set) {
725 res = &root->top_cgroup; 784 res = &root->cgrp;
726 } else { 785 } else {
727 struct cgrp_cset_link *link; 786 struct cgrp_cset_link *link;
728 787
@@ -735,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
735 } 794 }
736 } 795 }
737 } 796 }
738 read_unlock(&css_set_lock); 797
739 BUG_ON(!res); 798 BUG_ON(!res);
740 return res; 799 return res;
741} 800}
742 801
743/* 802/*
744 * There is one global cgroup mutex. We also require taking 803 * Return the cgroup for "task" from the given hierarchy. Must be
745 * task_lock() when dereferencing a task's cgroup subsys pointers. 804 * called with cgroup_mutex and css_set_rwsem held.
746 * See "The task_lock() exception", at the end of this comment. 805 */
747 * 806static struct cgroup *task_cgroup_from_root(struct task_struct *task,
807 struct cgroup_root *root)
808{
809 /*
810 * No need to lock the task - since we hold cgroup_mutex the
811 * task can't change groups, so the only thing that can happen
812 * is that it exits and its css is set back to init_css_set.
813 */
814 return cset_cgroup_from_root(task_css_set(task), root);
815}
816
817/*
748 * A task must hold cgroup_mutex to modify cgroups. 818 * A task must hold cgroup_mutex to modify cgroups.
749 * 819 *
750 * Any task can increment and decrement the count field without lock. 820 * Any task can increment and decrement the count field without lock.
@@ -770,98 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
770 * A cgroup can only be deleted if both its 'count' of using tasks 840 * A cgroup can only be deleted if both its 'count' of using tasks
771 * is zero, and its list of 'children' cgroups is empty. Since all 841 * is zero, and its list of 'children' cgroups is empty. Since all
772 * tasks in the system use _some_ cgroup, and since there is always at 842 * tasks in the system use _some_ cgroup, and since there is always at
773 * least one task in the system (init, pid == 1), therefore, top_cgroup 843 * least one task in the system (init, pid == 1), therefore, root cgroup
774 * always has either children cgroups and/or using tasks. So we don't 844 * always has either children cgroups and/or using tasks. So we don't
775 * need a special hack to ensure that top_cgroup cannot be deleted. 845 * need a special hack to ensure that root cgroup cannot be deleted.
776 *
777 * The task_lock() exception
778 *
779 * The need for this exception arises from the action of
780 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
781 * another. It does so using cgroup_mutex, however there are
782 * several performance critical places that need to reference
783 * task->cgroup without the expense of grabbing a system global
784 * mutex. Therefore except as noted below, when dereferencing or, as
785 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
786 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
787 * the task_struct routinely used for such matters.
788 * 846 *
789 * P.S. One more locking exception. RCU is used to guard the 847 * P.S. One more locking exception. RCU is used to guard the
790 * update of a task's cgroup pointer by cgroup_attach_task() 848 * update of a task's cgroup pointer by cgroup_attach_task()
791 */ 849 */
792 850
793/*
794 * A couple of forward declarations required, due to cyclic reference loop:
795 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
796 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
797 * -> cgroup_mkdir.
798 */
799
800static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
801static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
802static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
803static const struct inode_operations cgroup_dir_inode_operations; 852static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
804static const struct file_operations proc_cgroupstats_operations; 853static const struct file_operations proc_cgroupstats_operations;
805 854
806static struct backing_dev_info cgroup_backing_dev_info = { 855static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
807 .name = "cgroup", 856 char *buf)
808 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
809};
810
811static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
812{ 857{
813 struct inode *inode = new_inode(sb); 858 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
814 859 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
815 if (inode) { 860 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
816 inode->i_ino = get_next_ino(); 861 cft->ss->name, cft->name);
817 inode->i_mode = mode; 862 else
818 inode->i_uid = current_fsuid(); 863 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
819 inode->i_gid = current_fsgid(); 864 return buf;
820 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
821 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
822 }
823 return inode;
824} 865}
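For example (hypothetical values, not from the patch), a cftype named "usage_in_bytes" belonging to the memory controller normally yields a prefixed file name, while CFTYPE_NO_PREFIX or the noprefix mount option yields the bare name:

        char name[CGROUP_FILE_NAME_MAX];

        cgroup_file_name(cgrp, cft, name);
        /* name is "memory.usage_in_bytes", or just "usage_in_bytes" when
         * CFTYPE_NO_PREFIX is set or the hierarchy is mounted noprefix */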
825 866
826static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) 867/**
868 * cgroup_file_mode - deduce file mode of a control file
869 * @cft: the control file in question
870 *
871 * returns cft->mode if ->mode is not 0
872 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
873 * returns S_IRUGO if it has only a read handler
874 * returns S_IWUSR if it has only a write handler
875 */
876static umode_t cgroup_file_mode(const struct cftype *cft)
827{ 877{
828 struct cgroup_name *name; 878 umode_t mode = 0;
829 879
830 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); 880 if (cft->mode)
831 if (!name) 881 return cft->mode;
832 return NULL; 882
833 strcpy(name->name, dentry->d_name.name); 883 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
834 return name; 884 mode |= S_IRUGO;
885
886 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
887 cft->trigger)
888 mode |= S_IWUSR;
889
890 return mode;
835} 891}
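As a quick illustration (not part of the patch), a cftype with .mode == 0 that defines both .read_u64 and .write_u64 ends up world-readable and owner-writable:

        umode_t mode = cgroup_file_mode(cft);   /* S_IRUGO | S_IWUSR == 0644 */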
836 892
837static void cgroup_free_fn(struct work_struct *work) 893static void cgroup_free_fn(struct work_struct *work)
838{ 894{
839 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
840 896
841 mutex_lock(&cgroup_mutex); 897 atomic_dec(&cgrp->root->nr_cgrps);
842 cgrp->root->number_of_cgroups--;
843 mutex_unlock(&cgroup_mutex);
844
845 /*
846 * We get a ref to the parent's dentry, and put the ref when
847 * this cgroup is being freed, so it's guaranteed that the
848 * parent won't be destroyed before its children.
849 */
850 dput(cgrp->parent->dentry);
851
852 /*
853 * Drop the active superblock reference that we took when we
854 * created the cgroup. This will free cgrp->root, if we are
855 * holding the last reference to @sb.
856 */
857 deactivate_super(cgrp->root->sb);
858
859 cgroup_pidlist_destroy_all(cgrp); 898 cgroup_pidlist_destroy_all(cgrp);
860 899
861 simple_xattrs_free(&cgrp->xattrs); 900 if (cgrp->parent) {
862 901 /*
863 kfree(rcu_dereference_raw(cgrp->name)); 902 * We get a ref to the parent, and put the ref when this
864 kfree(cgrp); 903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
865} 916}
866 917
867static void cgroup_free_rcu(struct rcu_head *head) 918static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
873} 924}
874 925
875static void cgroup_diput(struct dentry *dentry, struct inode *inode) 926static void cgroup_get(struct cgroup *cgrp)
876{
877 /* is dentry a directory ? if so, kfree() associated cgroup */
878 if (S_ISDIR(inode->i_mode)) {
879 struct cgroup *cgrp = dentry->d_fsdata;
880
881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 mutex_lock(&cgroup_mutex);
890 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 mutex_unlock(&cgroup_mutex);
892 cgrp->id = -1;
893
894 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
895 } else {
896 struct cfent *cfe = __d_cfe(dentry);
897 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
898
899 WARN_ONCE(!list_empty(&cfe->node) &&
900 cgrp != &cgrp->root->top_cgroup,
901 "cfe still linked for %s\n", cfe->type->name);
902 simple_xattrs_free(&cfe->xattrs);
903 kfree(cfe);
904 }
905 iput(inode);
906}
907
908static void remove_dir(struct dentry *d)
909{ 927{
910 struct dentry *parent = dget(d->d_parent); 928 WARN_ON_ONCE(cgroup_is_dead(cgrp));
911 929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
912 d_delete(d); 930 atomic_inc(&cgrp->refcnt);
913 simple_rmdir(parent->d_inode, d);
914 dput(parent);
915} 931}
916 932
917static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 933static void cgroup_put(struct cgroup *cgrp)
918{ 934{
919 struct cfent *cfe; 935 if (!atomic_dec_and_test(&cgrp->refcnt))
920 936 return;
921 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
922 lockdep_assert_held(&cgroup_mutex); 938 return;
923 939
924 /* 940 /*
925 * If we're doing cleanup due to failure of cgroup_create(), 941 * XXX: cgrp->id is only used to look up css's. As cgroup and
926 * the corresponding @cfe may not exist. 942 * css's lifetimes will be decoupled, it should be made
943 * per-subsystem and moved to css->id so that lookups are
944 * successful until the target css is released.
927 */ 945 */
928 list_for_each_entry(cfe, &cgrp->files, node) { 946 mutex_lock(&cgroup_mutex);
929 struct dentry *d = cfe->dentry; 947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
930 950
931 if (cft && cfe->type != cft) 951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
932 continue; 952}
933 953
934 dget(d); 954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
935 d_delete(d); 955{
936 simple_unlink(cgrp->dentry->d_inode, d); 956 char name[CGROUP_FILE_NAME_MAX];
937 list_del_init(&cfe->node);
938 dput(d);
939 957
940 break; 958 lockdep_assert_held(&cgroup_tree_mutex);
941 } 959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
942} 960}
943 961
944/** 962/**
@@ -952,144 +970,106 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
952 int i; 970 int i;
953 971
954 for_each_subsys(ss, i) { 972 for_each_subsys(ss, i) {
955 struct cftype_set *set; 973 struct cftype *cfts;
956 974
957 if (!test_bit(i, &subsys_mask)) 975 if (!test_bit(i, &subsys_mask))
958 continue; 976 continue;
959 list_for_each_entry(set, &ss->cftsets, node) 977 list_for_each_entry(cfts, &ss->cfts, node)
960 cgroup_addrm_files(cgrp, set->cfts, false); 978 cgroup_addrm_files(cgrp, cfts, false);
961 } 979 }
962} 980}
963 981
964/* 982static int rebind_subsystems(struct cgroup_root *dst_root,
965 * NOTE : the dentry must have been dget()'ed 983 unsigned long ss_mask)
966 */
967static void cgroup_d_remove_dir(struct dentry *dentry)
968{
969 struct dentry *parent;
970
971 parent = dentry->d_parent;
972 spin_lock(&parent->d_lock);
973 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
974 list_del_init(&dentry->d_u.d_child);
975 spin_unlock(&dentry->d_lock);
976 spin_unlock(&parent->d_lock);
977 remove_dir(dentry);
978}
979
980/*
981 * Call with cgroup_mutex held. Drops reference counts on modules, including
982 * any duplicate ones that parse_cgroupfs_options took. If this function
983 * returns an error, no reference counts are touched.
984 */
985static int rebind_subsystems(struct cgroupfs_root *root,
986 unsigned long added_mask, unsigned removed_mask)
987{ 984{
988 struct cgroup *cgrp = &root->top_cgroup;
989 struct cgroup_subsys *ss; 985 struct cgroup_subsys *ss;
990 unsigned long pinned = 0; 986 int ssid, ret;
991 int i, ret;
992 987
993 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 lockdep_assert_held(&cgroup_tree_mutex);
994 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 lockdep_assert_held(&cgroup_mutex);
995 990
996 /* Check that any added subsystems are currently free */ 991 for_each_subsys(ss, ssid) {
997 for_each_subsys(ss, i) { 992 if (!(ss_mask & (1 << ssid)))
998 if (!(added_mask & (1 << i)))
999 continue; 993 continue;
1000 994
1001 /* is the subsystem mounted elsewhere? */ 995 /* if @ss is on the dummy_root, we can always move it */
1002 if (ss->root != &cgroup_dummy_root) { 996 if (ss->root == &cgrp_dfl_root)
1003 ret = -EBUSY; 997 continue;
1004 goto out_put;
1005 }
1006 998
1007 /* pin the module */ 999 /* if @ss has non-root cgroups attached to it, can't move */
1008 if (!try_module_get(ss->module)) { 1000 if (!list_empty(&ss->root->cgrp.children))
1009 ret = -ENOENT; 1001 return -EBUSY;
1010 goto out_put;
1011 }
1012 pinned |= 1 << i;
1013 }
1014 1002
1015 /* subsys could be missing if unloaded between parsing and here */ 1003 /* can't move between two non-dummy roots either */
1016 if (added_mask != pinned) { 1004 if (dst_root != &cgrp_dfl_root)
1017 ret = -ENOENT; 1005 return -EBUSY;
1018 goto out_put;
1019 } 1006 }
1020 1007
1021 ret = cgroup_populate_dir(cgrp, added_mask); 1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1022 if (ret) 1009 if (ret) {
1023 goto out_put; 1010 if (dst_root != &cgrp_dfl_root)
1011 return ret;
1012
1013 /*
1014 * Rebinding back to the default root is not allowed to
1015 * fail. Using both default and non-default roots should
1016 * be rare. Moving subsystems back and forth even more so.
1017 * Just warn about it and continue.
1018 */
1019 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1021 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1023 }
1024 }
1024 1025
1025 /* 1026 /*
1026 * Nothing can fail from this point on. Remove files for the 1027 * Nothing can fail from this point on. Remove files for the
1027 * removed subsystems and rebind each subsystem. 1028 * removed subsystems and rebind each subsystem.
1028 */ 1029 */
1029 cgroup_clear_dir(cgrp, removed_mask); 1030 mutex_unlock(&cgroup_mutex);
1030 1031 for_each_subsys(ss, ssid)
1031 for_each_subsys(ss, i) { 1032 if (ss_mask & (1 << ssid))
1032 unsigned long bit = 1UL << i; 1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1033 1034 mutex_lock(&cgroup_mutex);
1034 if (bit & added_mask) {
1035 /* We're binding this subsystem to this hierarchy */
1036 BUG_ON(cgroup_css(cgrp, ss));
1037 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1038 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1039 1035
1040 rcu_assign_pointer(cgrp->subsys[i], 1036 for_each_subsys(ss, ssid) {
1041 cgroup_css(cgroup_dummy_top, ss)); 1037 struct cgroup_root *src_root;
1042 cgroup_css(cgrp, ss)->cgroup = cgrp; 1038 struct cgroup_subsys_state *css;
1043 1039
1044 ss->root = root; 1040 if (!(ss_mask & (1 << ssid)))
1045 if (ss->bind) 1041 continue;
1046 ss->bind(cgroup_css(cgrp, ss));
1047 1042
1048 /* refcount was already taken, and we're keeping it */ 1043 src_root = ss->root;
1049 root->subsys_mask |= bit; 1044 css = cgroup_css(&src_root->cgrp, ss);
1050 } else if (bit & removed_mask) {
1051 /* We're removing this subsystem */
1052 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1053 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1054 1045
1055 if (ss->bind) 1046 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1056 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1057 1047
1058 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; 1048 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1059 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1049 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1050 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp;
1060 1052
1061 cgroup_subsys[i]->root = &cgroup_dummy_root; 1053 src_root->cgrp.subsys_mask &= ~(1 << ssid);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid;
1062 1055
1063 /* subsystem is now free - drop reference on module */ 1056 if (ss->bind)
1064 module_put(ss->module); 1057 ss->bind(css);
1065 root->subsys_mask &= ~bit;
1066 }
1067 } 1058 }
1068 1059
1069 /* 1060 kernfs_activate(dst_root->cgrp.kn);
1070 * Mark @root has finished binding subsystems. @root->subsys_mask
1071 * now matches the bound subsystems.
1072 */
1073 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1074
1075 return 0; 1061 return 0;
1076
1077out_put:
1078 for_each_subsys(ss, i)
1079 if (pinned & (1 << i))
1080 module_put(ss->module);
1081 return ret;
1082} 1062}
1083 1063
1084static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1064static int cgroup_show_options(struct seq_file *seq,
1065 struct kernfs_root *kf_root)
1085{ 1066{
1086 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1067 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1087 struct cgroup_subsys *ss; 1068 struct cgroup_subsys *ss;
1088 int ssid; 1069 int ssid;
1089 1070
1090 mutex_lock(&cgroup_root_mutex);
1091 for_each_subsys(ss, ssid) 1071 for_each_subsys(ss, ssid)
1092 if (root->subsys_mask & (1 << ssid)) 1072 if (root->cgrp.subsys_mask & (1 << ssid))
1093 seq_printf(seq, ",%s", ss->name); 1073 seq_printf(seq, ",%s", ss->name);
1094 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1095 seq_puts(seq, ",sane_behavior"); 1075 seq_puts(seq, ",sane_behavior");
@@ -1097,13 +1077,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1097 seq_puts(seq, ",noprefix"); 1077 seq_puts(seq, ",noprefix");
1098 if (root->flags & CGRP_ROOT_XATTR) 1078 if (root->flags & CGRP_ROOT_XATTR)
1099 seq_puts(seq, ",xattr"); 1079 seq_puts(seq, ",xattr");
1080
1081 spin_lock(&release_agent_path_lock);
1100 if (strlen(root->release_agent_path)) 1082 if (strlen(root->release_agent_path))
1101 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1083 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1102 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1084 spin_unlock(&release_agent_path_lock);
1085
1086 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1103 seq_puts(seq, ",clone_children"); 1087 seq_puts(seq, ",clone_children");
1104 if (strlen(root->name)) 1088 if (strlen(root->name))
1105 seq_printf(seq, ",name=%s", root->name); 1089 seq_printf(seq, ",name=%s", root->name);
1106 mutex_unlock(&cgroup_root_mutex);
1107 return 0; 1090 return 0;
1108} 1091}
1109 1092
@@ -1115,9 +1098,6 @@ struct cgroup_sb_opts {
1115 char *name; 1098 char *name;
1116 /* User explicitly requested empty subsystem */ 1099 /* User explicitly requested empty subsystem */
1117 bool none; 1100 bool none;
1118
1119 struct cgroupfs_root *new_root;
1120
1121}; 1101};
1122 1102
1123/* 1103/*
@@ -1137,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1137 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1138 1118
1139#ifdef CONFIG_CPUSETS 1119#ifdef CONFIG_CPUSETS
1140 mask = ~(1UL << cpuset_subsys_id); 1120 mask = ~(1UL << cpuset_cgrp_id);
1141#endif 1121#endif
1142 1122
1143 memset(opts, 0, sizeof(*opts)); 1123 memset(opts, 0, sizeof(*opts));
@@ -1227,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 return -ENOENT; 1207 return -ENOENT;
1228 } 1208 }
1229 1209
1230 /*
1231 * If the 'all' option was specified select all the subsystems,
1232 * otherwise if 'none', 'name=' and a subsystem name options
1233 * were not specified, let's default to 'all'
1234 */
1235 if (all_ss || (!one_ss && !opts->none && !opts->name))
1236 for_each_subsys(ss, i)
1237 if (!ss->disabled)
1238 set_bit(i, &opts->subsys_mask);
1239
1240 /* Consistency checks */ 1210 /* Consistency checks */
1241 1211
1242 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1243 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1244 1214
1245 if (opts->flags & CGRP_ROOT_NOPREFIX) { 1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1246 pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); 1216 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1247 return -EINVAL; 1219 return -EINVAL;
1248 } 1220 }
1221 } else {
1222 /*
1223 * If the 'all' option was specified select all the
1224 * subsystems, otherwise if 'none', 'name=' and a subsystem
1225 * name options were not specified, let's default to 'all'
1226 */
1227 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i)
1229 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask);
1249 1231
1250 if (opts->cpuset_clone_children) { 1232 /*
1251 pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); 1233 * We either have to specify by name or by subsystems. (So
1234 * all empty hierarchies must have a name).
1235 */
1236 if (!opts->subsys_mask && !opts->name)
1252 return -EINVAL; 1237 return -EINVAL;
1253 }
1254 } 1238 }
1255 1239
1256 /* 1240 /*
@@ -1266,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1266 if (opts->subsys_mask && opts->none) 1250 if (opts->subsys_mask && opts->none)
1267 return -EINVAL; 1251 return -EINVAL;
1268 1252
1269 /*
1270 * We either have to specify by name or by subsystems. (So all
1271 * empty hierarchies must have a name).
1272 */
1273 if (!opts->subsys_mask && !opts->name)
1274 return -EINVAL;
1275
1276 return 0; 1253 return 0;
1277} 1254}
1278 1255
1279static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1256static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1280{ 1257{
1281 int ret = 0; 1258 int ret = 0;
1282 struct cgroupfs_root *root = sb->s_fs_info; 1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1283 struct cgroup *cgrp = &root->top_cgroup;
1284 struct cgroup_sb_opts opts; 1260 struct cgroup_sb_opts opts;
1285 unsigned long added_mask, removed_mask; 1261 unsigned long added_mask, removed_mask;
1286 1262
@@ -1289,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1289 return -EINVAL; 1265 return -EINVAL;
1290 } 1266 }
1291 1267
1292 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgroup_tree_mutex);
1293 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1294 mutex_lock(&cgroup_root_mutex);
1295 1270
1296 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1297 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1298 if (ret) 1273 if (ret)
1299 goto out_unlock; 1274 goto out_unlock;
1300 1275
1301 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
1302 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1303 task_tgid_nr(current), current->comm); 1278 task_tgid_nr(current), current->comm);
1304 1279
1305 added_mask = opts.subsys_mask & ~root->subsys_mask; 1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
1306 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
1307 1282
1308 /* Don't allow flags or name to change at remount */ 1283 /* Don't allow flags or name to change at remount */
1309 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
@@ -1316,422 +1291,331 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1316 } 1291 }
1317 1292
1318 /* remounting is not allowed for populated hierarchies */ 1293 /* remounting is not allowed for populated hierarchies */
1319 if (root->number_of_cgroups > 1) { 1294 if (!list_empty(&root->cgrp.children)) {
1320 ret = -EBUSY; 1295 ret = -EBUSY;
1321 goto out_unlock; 1296 goto out_unlock;
1322 } 1297 }
1323 1298
1324 ret = rebind_subsystems(root, added_mask, removed_mask); 1299 ret = rebind_subsystems(root, added_mask);
1325 if (ret) 1300 if (ret)
1326 goto out_unlock; 1301 goto out_unlock;
1327 1302
1328 if (opts.release_agent) 1303 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1304
1305 if (opts.release_agent) {
1306 spin_lock(&release_agent_path_lock);
1329 strcpy(root->release_agent_path, opts.release_agent); 1307 strcpy(root->release_agent_path, opts.release_agent);
1308 spin_unlock(&release_agent_path_lock);
1309 }
1330 out_unlock: 1310 out_unlock:
1331 kfree(opts.release_agent); 1311 kfree(opts.release_agent);
1332 kfree(opts.name); 1312 kfree(opts.name);
1333 mutex_unlock(&cgroup_root_mutex);
1334 mutex_unlock(&cgroup_mutex); 1313 mutex_unlock(&cgroup_mutex);
1335 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1314 mutex_unlock(&cgroup_tree_mutex);
1336 return ret; 1315 return ret;
1337} 1316}
1338 1317
1339static const struct super_operations cgroup_ops = { 1318/*
1340 .statfs = simple_statfs, 1319 * To reduce the fork() overhead for systems that are not actually using
1341 .drop_inode = generic_delete_inode, 1320 * their cgroups capability, we don't maintain the lists running through
1342 .show_options = cgroup_show_options, 1321 * each css_set to its tasks until we see the list actually used - in other
1343 .remount_fs = cgroup_remount, 1322 * words after the first mount.
1344}; 1323 */
1324static bool use_task_css_set_links __read_mostly;
1325
1326static void cgroup_enable_task_cg_lists(void)
1327{
1328 struct task_struct *p, *g;
1329
1330 down_write(&css_set_rwsem);
1331
1332 if (use_task_css_set_links)
1333 goto out_unlock;
1334
1335 use_task_css_set_links = true;
1336
1337 /*
1338 * We need tasklist_lock because RCU is not safe against
1339 * while_each_thread(). Besides, a forking task that has passed
1340 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1341 * is not guaranteed to have its child immediately visible in the
1342 * tasklist if we walk through it with RCU.
1343 */
1344 read_lock(&tasklist_lock);
1345 do_each_thread(g, p) {
1346 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1347 task_css_set(p) != &init_css_set);
1348
1349 /*
1350 * We should check if the process is exiting, otherwise
1351 * it will race with cgroup_exit() in that the list
1352 * entry won't be deleted though the process has exited.
1353 * Do it while holding siglock so that we don't end up
1354 * racing against cgroup_exit().
1355 */
1356 spin_lock_irq(&p->sighand->siglock);
1357 if (!(p->flags & PF_EXITING)) {
1358 struct css_set *cset = task_css_set(p);
1359
1360 list_add(&p->cg_list, &cset->tasks);
1361 get_css_set(cset);
1362 }
1363 spin_unlock_irq(&p->sighand->siglock);
1364 } while_each_thread(g, p);
1365 read_unlock(&tasklist_lock);
1366out_unlock:
1367 up_write(&css_set_rwsem);
1368}
1345 1369
1346static void init_cgroup_housekeeping(struct cgroup *cgrp) 1370static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347{ 1371{
1372 atomic_set(&cgrp->refcnt, 1);
1348 INIT_LIST_HEAD(&cgrp->sibling); 1373 INIT_LIST_HEAD(&cgrp->sibling);
1349 INIT_LIST_HEAD(&cgrp->children); 1374 INIT_LIST_HEAD(&cgrp->children);
1350 INIT_LIST_HEAD(&cgrp->files);
1351 INIT_LIST_HEAD(&cgrp->cset_links); 1375 INIT_LIST_HEAD(&cgrp->cset_links);
1352 INIT_LIST_HEAD(&cgrp->release_list); 1376 INIT_LIST_HEAD(&cgrp->release_list);
1353 INIT_LIST_HEAD(&cgrp->pidlists); 1377 INIT_LIST_HEAD(&cgrp->pidlists);
1354 mutex_init(&cgrp->pidlist_mutex); 1378 mutex_init(&cgrp->pidlist_mutex);
1355 cgrp->dummy_css.cgroup = cgrp; 1379 cgrp->dummy_css.cgroup = cgrp;
1356 simple_xattrs_init(&cgrp->xattrs);
1357} 1380}
1358 1381
1359static void init_cgroup_root(struct cgroupfs_root *root) 1382static void init_cgroup_root(struct cgroup_root *root,
1383 struct cgroup_sb_opts *opts)
1360{ 1384{
1361 struct cgroup *cgrp = &root->top_cgroup; 1385 struct cgroup *cgrp = &root->cgrp;
1362 1386
1363 INIT_LIST_HEAD(&root->root_list); 1387 INIT_LIST_HEAD(&root->root_list);
1364 root->number_of_cgroups = 1; 1388 atomic_set(&root->nr_cgrps, 1);
1365 cgrp->root = root; 1389 cgrp->root = root;
1366 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 init_cgroup_housekeeping(cgrp); 1390 init_cgroup_housekeeping(cgrp);
1368 idr_init(&root->cgroup_idr); 1391 idr_init(&root->cgroup_idr);
1369}
1370
1371static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372{
1373 int id;
1374
1375 lockdep_assert_held(&cgroup_mutex);
1376 lockdep_assert_held(&cgroup_root_mutex);
1377
1378 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 GFP_KERNEL);
1380 if (id < 0)
1381 return id;
1382
1383 root->hierarchy_id = id;
1384 return 0;
1385}
1386
1387static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388{
1389 lockdep_assert_held(&cgroup_mutex);
1390 lockdep_assert_held(&cgroup_root_mutex);
1391
1392 if (root->hierarchy_id) {
1393 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 root->hierarchy_id = 0;
1395 }
1396}
1397
1398static int cgroup_test_super(struct super_block *sb, void *data)
1399{
1400 struct cgroup_sb_opts *opts = data;
1401 struct cgroupfs_root *root = sb->s_fs_info;
1402
1403 /* If we asked for a name then it must match */
1404 if (opts->name && strcmp(opts->name, root->name))
1405 return 0;
1406
1407 /*
1408 * If we asked for subsystems (or explicitly for no
1409 * subsystems) then they must match
1410 */
1411 if ((opts->subsys_mask || opts->none)
1412 && (opts->subsys_mask != root->subsys_mask))
1413 return 0;
1414
1415 return 1;
1416}
1417
1418static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419{
1420 struct cgroupfs_root *root;
1421 1392
1422 if (!opts->subsys_mask && !opts->none)
1423 return NULL;
1424
1425 root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 if (!root)
1427 return ERR_PTR(-ENOMEM);
1428
1429 init_cgroup_root(root);
1430
1431 /*
1432 * We need to set @root->subsys_mask now so that @root can be
1433 * matched by cgroup_test_super() before it finishes
1434 * initialization; otherwise, competing mounts with the same
1435 * options may try to bind the same subsystems instead of waiting
1436 * for the first one leading to unexpected mount errors.
1437 * SUBSYS_BOUND will be set once actual binding is complete.
1438 */
1439 root->subsys_mask = opts->subsys_mask;
1440 root->flags = opts->flags; 1393 root->flags = opts->flags;
1441 if (opts->release_agent) 1394 if (opts->release_agent)
1442 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1443 if (opts->name) 1396 if (opts->name)
1444 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1445 if (opts->cpuset_clone_children) 1398 if (opts->cpuset_clone_children)
1446 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1447 return root;
1448} 1400}
1449 1401
1450static void cgroup_free_root(struct cgroupfs_root *root) 1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1451{ 1403{
1452 if (root) { 1404 LIST_HEAD(tmp_links);
1453 /* hierarchy ID should already have been released */ 1405 struct cgroup *root_cgrp = &root->cgrp;
1454 WARN_ON_ONCE(root->hierarchy_id); 1406 struct css_set *cset;
1455 1407 int i, ret;
1456 idr_destroy(&root->cgroup_idr);
1457 kfree(root);
1458 }
1459}
1460 1408
1461static int cgroup_set_super(struct super_block *sb, void *data) 1409 lockdep_assert_held(&cgroup_tree_mutex);
1462{ 1410 lockdep_assert_held(&cgroup_mutex);
1463 int ret;
1464 struct cgroup_sb_opts *opts = data;
1465 1411
1466 /* If we don't have a new root, we can't set up a new sb */ 1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1467 if (!opts->new_root) 1413 if (ret < 0)
1468 return -EINVAL; 1414 goto out;
1415 root_cgrp->id = ret;
1469 1416
1470 BUG_ON(!opts->subsys_mask && !opts->none); 1417 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding
1420 * cgroup_lock, and that's us. The worst that can happen is that we
1421 * have some link structures left over
1422 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret)
1425 goto out;
1471 1426
1472 ret = set_anon_super(sb, NULL); 1427 ret = cgroup_init_root_id(root);
1473 if (ret) 1428 if (ret)
1474 return ret; 1429 goto out;
1475 1430
1476 sb->s_fs_info = opts->new_root; 1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1477 opts->new_root->sb = sb; 1432 KERNFS_ROOT_CREATE_DEACTIVATED,
1433 root_cgrp);
1434 if (IS_ERR(root->kf_root)) {
1435 ret = PTR_ERR(root->kf_root);
1436 goto exit_root_id;
1437 }
1438 root_cgrp->kn = root->kf_root->kn;
1478 1439
1479 sb->s_blocksize = PAGE_CACHE_SIZE; 1440 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1480 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1441 if (ret)
1481 sb->s_magic = CGROUP_SUPER_MAGIC; 1442 goto destroy_root;
1482 sb->s_op = &cgroup_ops;
1483 1443
1484 return 0; 1444 ret = rebind_subsystems(root, ss_mask);
1485} 1445 if (ret)
1446 goto destroy_root;
1486 1447
1487static int cgroup_get_rootdir(struct super_block *sb) 1448 /*
1488{ 1449 * There must be no failure case after here, since rebinding takes
1489 static const struct dentry_operations cgroup_dops = { 1450 * care of subsystems' refcounts, which are explicitly dropped in
1490 .d_iput = cgroup_diput, 1451 * the failure exit path.
1491 .d_delete = always_delete_dentry, 1452 */
1492 }; 1453 list_add(&root->root_list, &cgroup_roots);
1454 cgroup_root_count++;
1493 1455
1494 struct inode *inode = 1456 /*
1495 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 * Link the root cgroup in this hierarchy into all the css_set
1458 * objects.
1459 */
1460 down_write(&css_set_rwsem);
1461 hash_for_each(css_set_table, i, cset, hlist)
1462 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem);
1496 1464
1497 if (!inode) 1465 BUG_ON(!list_empty(&root_cgrp->children));
1498 return -ENOMEM; 1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1499 1467
1500 inode->i_fop = &simple_dir_operations; 1468 kernfs_activate(root_cgrp->kn);
1501 inode->i_op = &cgroup_dir_inode_operations; 1469 ret = 0;
1502 /* directories start off with i_nlink == 2 (for "." entry) */ 1470 goto out;
1503 inc_nlink(inode); 1471
1504 sb->s_root = d_make_root(inode); 1472destroy_root:
1505 if (!sb->s_root) 1473 kernfs_destroy_root(root->kf_root);
1506 return -ENOMEM; 1474 root->kf_root = NULL;
1507 /* for everything else we want ->d_op set */ 1475exit_root_id:
1508 sb->s_d_op = &cgroup_dops; 1476 cgroup_exit_root_id(root);
1509 return 0; 1477out:
1478 free_cgrp_cset_links(&tmp_links);
1479 return ret;
1510} 1480}
1511 1481
1512static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1482static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 int flags, const char *unused_dev_name, 1483 int flags, const char *unused_dev_name,
1514 void *data) 1484 void *data)
1515{ 1485{
1486 struct cgroup_root *root;
1516 struct cgroup_sb_opts opts; 1487 struct cgroup_sb_opts opts;
1517 struct cgroupfs_root *root; 1488 struct dentry *dentry;
1518 int ret = 0; 1489 int ret;
1519 struct super_block *sb;
1520 struct cgroupfs_root *new_root;
1521 struct list_head tmp_links;
1522 struct inode *inode;
1523 const struct cred *cred;
1524 1490
1525 /* First find the desired set of subsystems */ 1491 /*
1492 * The first time anyone tries to mount a cgroup, enable the list
1493 * linking each css_set to its tasks and fix up all existing tasks.
1494 */
1495 if (!use_task_css_set_links)
1496 cgroup_enable_task_cg_lists();
1497retry:
1498 mutex_lock(&cgroup_tree_mutex);
1526 mutex_lock(&cgroup_mutex); 1499 mutex_lock(&cgroup_mutex);
1500
1501 /* First find the desired set of subsystems */
1527 ret = parse_cgroupfs_options(data, &opts); 1502 ret = parse_cgroupfs_options(data, &opts);
1528 mutex_unlock(&cgroup_mutex);
1529 if (ret) 1503 if (ret)
1530 goto out_err; 1504 goto out_unlock;
1531
1532 /*
1533 * Allocate a new cgroup root. We may not need it if we're
1534 * reusing an existing hierarchy.
1535 */
1536 new_root = cgroup_root_from_opts(&opts);
1537 if (IS_ERR(new_root)) {
1538 ret = PTR_ERR(new_root);
1539 goto out_err;
1540 }
1541 opts.new_root = new_root;
1542 1505
1543 /* Locate an existing or new sb for this hierarchy */ 1506 /* look for a matching existing root */
1544 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 if (!opts.subsys_mask && !opts.none && !opts.name) {
1545 if (IS_ERR(sb)) { 1508 cgrp_dfl_root_visible = true;
1546 ret = PTR_ERR(sb); 1509 root = &cgrp_dfl_root;
1547 cgroup_free_root(opts.new_root); 1510 cgroup_get(&root->cgrp);
1548 goto out_err; 1511 ret = 0;
1512 goto out_unlock;
1549 } 1513 }
1550 1514
1551 root = sb->s_fs_info; 1515 for_each_root(root) {
1552 BUG_ON(!root); 1516 bool name_match = false;
1553 if (root == opts.new_root) {
1554 /* We used the new root structure, so this is a new hierarchy */
1555 struct cgroup *root_cgrp = &root->top_cgroup;
1556 struct cgroupfs_root *existing_root;
1557 int i;
1558 struct css_set *cset;
1559
1560 BUG_ON(sb->s_root != NULL);
1561
1562 ret = cgroup_get_rootdir(sb);
1563 if (ret)
1564 goto drop_new_super;
1565 inode = sb->s_root->d_inode;
1566
1567 mutex_lock(&inode->i_mutex);
1568 mutex_lock(&cgroup_mutex);
1569 mutex_lock(&cgroup_root_mutex);
1570
1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 if (ret < 0)
1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1575
1576 /* Check for name clashes with existing mounts */
1577 ret = -EBUSY;
1578 if (strlen(root->name))
1579 for_each_active_root(existing_root)
1580 if (!strcmp(existing_root->name, root->name))
1581 goto unlock_drop;
1582
1583 /*
1584 * We're accessing css_set_count without locking
1585 * css_set_lock here, but that's OK - it can only be
1586 * increased by someone holding cgroup_lock, and
1587 * that's us. The worst that can happen is that we
1588 * have some link structures left over
1589 */
1590 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 if (ret)
1592 goto unlock_drop;
1593 1517
1594 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ 1518 if (root == &cgrp_dfl_root)
1595 ret = cgroup_init_root_id(root, 2, 0); 1519 continue;
1596 if (ret)
1597 goto unlock_drop;
1598
1599 sb->s_root->d_fsdata = root_cgrp;
1600 root_cgrp->dentry = sb->s_root;
1601
1602 /*
1603 * We're inside get_sb() and will call lookup_one_len() to
1604 * create the root files, which doesn't work if SELinux is
1605 * in use. The following cred dancing somehow works around
1606 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 * populating new cgroupfs mount") for more details.
1608 */
1609 cred = override_creds(&init_cred);
1610
1611 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 if (ret)
1613 goto rm_base_files;
1614
1615 ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 if (ret)
1617 goto rm_base_files;
1618
1619 revert_creds(cred);
1620 1520
1621 /* 1521 /*
1622 * There must be no failure case after here, since rebinding 1522 * If we asked for a name then it must match. Also, if
1623 * takes care of subsystems' refcounts, which are explicitly 1523 * name matches but subsys_mask doesn't, we should fail.
1624 * dropped in the failure exit path. 1524 * Remember whether name matched.
1625 */ 1525 */
1526 if (opts.name) {
1527 if (strcmp(opts.name, root->name))
1528 continue;
1529 name_match = true;
1530 }
1626 1531
1627 list_add(&root->root_list, &cgroup_roots);
1628 cgroup_root_count++;
1629
1630 /* Link the top cgroup in this hierarchy into all
1631 * the css_set objects */
1632 write_lock(&css_set_lock);
1633 hash_for_each(css_set_table, i, cset, hlist)
1634 link_css_set(&tmp_links, cset, root_cgrp);
1635 write_unlock(&css_set_lock);
1636
1637 free_cgrp_cset_links(&tmp_links);
1638
1639 BUG_ON(!list_empty(&root_cgrp->children));
1640 BUG_ON(root->number_of_cgroups != 1);
1641
1642 mutex_unlock(&cgroup_root_mutex);
1643 mutex_unlock(&cgroup_mutex);
1644 mutex_unlock(&inode->i_mutex);
1645 } else {
1646 /* 1532 /*
1647 * We re-used an existing hierarchy - the new root (if 1533 * If we asked for subsystems (or explicitly for no
1648 * any) is not needed 1534 * subsystems) then they must match.
1649 */ 1535 */
1650 cgroup_free_root(opts.new_root); 1536 if ((opts.subsys_mask || opts.none) &&
1537 (opts.subsys_mask != root->cgrp.subsys_mask)) {
1538 if (!name_match)
1539 continue;
1540 ret = -EBUSY;
1541 goto out_unlock;
1542 }
1651 1543
1652 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1544 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1545 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1546 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 ret = -EINVAL; 1547 ret = -EINVAL;
1656 goto drop_new_super; 1548 goto out_unlock;
1657 } else { 1549 } else {
1658 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1550 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 } 1551 }
1660 } 1552 }
1661 }
1662
1663 kfree(opts.release_agent);
1664 kfree(opts.name);
1665 return dget(sb->s_root);
1666
1667 rm_base_files:
1668 free_cgrp_cset_links(&tmp_links);
1669 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1670 revert_creds(cred);
1671 unlock_drop:
1672 cgroup_exit_root_id(root);
1673 mutex_unlock(&cgroup_root_mutex);
1674 mutex_unlock(&cgroup_mutex);
1675 mutex_unlock(&inode->i_mutex);
1676 drop_new_super:
1677 deactivate_locked_super(sb);
1678 out_err:
1679 kfree(opts.release_agent);
1680 kfree(opts.name);
1681 return ERR_PTR(ret);
1682}
1683
1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1686 struct cgroupfs_root *root = sb->s_fs_info;
1687 struct cgroup *cgrp = &root->top_cgroup;
1688 struct cgrp_cset_link *link, *tmp_link;
1689 int ret;
1690
1691 BUG_ON(!root);
1692
1693 BUG_ON(root->number_of_cgroups != 1);
1694 BUG_ON(!list_empty(&cgrp->children));
1695 1553
1696 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1554 /*
1697 mutex_lock(&cgroup_mutex); 1555 * A root's lifetime is governed by its root cgroup. Zero
1698 mutex_lock(&cgroup_root_mutex); 1556 * ref indicates that the root is being destroyed. Wait for
1557 * destruction to complete so that the subsystems are free.
1558 * We can use wait_queue for the wait but this path is
1559 * super cold. Let's just sleep for a bit and retry.
1560 */
1561 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1562 mutex_unlock(&cgroup_mutex);
1563 mutex_unlock(&cgroup_tree_mutex);
1564 kfree(opts.release_agent);
1565 kfree(opts.name);
1566 msleep(10);
1567 goto retry;
1568 }
1699 1569
1700 /* Rebind all subsystems back to the default hierarchy */ 1570 ret = 0;
1701 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 1571 goto out_unlock;
1702 ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 /* Shouldn't be able to fail ... */
1704 BUG_ON(ret);
1705 } 1572 }
1706 1573
1707 /* 1574 /*
1708 * Release all the links from cset_links to this hierarchy's 1575 * No such thing, create a new one. name= matching without subsys
1709 * root cgroup 1576 * specification is allowed for already existing hierarchies but we
1577 * can't create a new one without subsys specification.
1710 */ 1578 */
1711 write_lock(&css_set_lock); 1579 if (!opts.subsys_mask && !opts.none) {
1712 1580 ret = -EINVAL;
1713 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1581 goto out_unlock;
1714 list_del(&link->cset_link);
1715 list_del(&link->cgrp_link);
1716 kfree(link);
1717 } 1582 }
1718 write_unlock(&css_set_lock);
1719 1583
1720 if (!list_empty(&root->root_list)) { 1584 root = kzalloc(sizeof(*root), GFP_KERNEL);
1721 list_del(&root->root_list); 1585 if (!root) {
1722 cgroup_root_count--; 1586 ret = -ENOMEM;
1587 goto out_unlock;
1723 } 1588 }
1724 1589
1725 cgroup_exit_root_id(root); 1590 init_cgroup_root(root, &opts);
1726 1591
1727 mutex_unlock(&cgroup_root_mutex); 1592 ret = cgroup_setup_root(root, opts.subsys_mask);
1593 if (ret)
1594 cgroup_free_root(root);
1595
1596out_unlock:
1728 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1729 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1598 mutex_unlock(&cgroup_tree_mutex);
1730 1599
1731 simple_xattrs_free(&cgrp->xattrs); 1600 kfree(opts.release_agent);
1601 kfree(opts.name);
1732 1602
1733 kill_litter_super(sb); 1603 if (ret)
1734 cgroup_free_root(root); 1604 return ERR_PTR(ret);
1605
1606 dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
1607 if (IS_ERR(dentry))
1608 cgroup_put(&root->cgrp);
1609 return dentry;
1610}
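The data string parsed by parse_cgroupfs_options() is whatever userspace passes to mount(2); a hypothetical example (not part of the patch):

        /* from userspace: mount a hierarchy with only the cpuset controller */
        mount("none", "/sys/fs/cgroup/cpuset", "cgroup", 0, "cpuset");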
1611
1612static void cgroup_kill_sb(struct super_block *sb)
1613{
1614 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1615 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1616
1617 cgroup_put(&root->cgrp);
1618 kernfs_kill_sb(sb);
1735} 1619}
1736 1620
1737static struct file_system_type cgroup_fs_type = { 1621static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1627,6 @@ static struct file_system_type cgroup_fs_type = {
1743static struct kobject *cgroup_kobj; 1627static struct kobject *cgroup_kobj;
1744 1628
1745/** 1629/**
1746 * cgroup_path - generate the path of a cgroup
1747 * @cgrp: the cgroup in question
1748 * @buf: the buffer to write the path into
1749 * @buflen: the length of the buffer
1750 *
1751 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 *
1753 * We can't generate cgroup path using dentry->d_name, as accessing
1754 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 * with some irq-safe spinlocks held.
1757 */
1758int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759{
1760 int ret = -ENAMETOOLONG;
1761 char *start;
1762
1763 if (!cgrp->parent) {
1764 if (strlcpy(buf, "/", buflen) >= buflen)
1765 return -ENAMETOOLONG;
1766 return 0;
1767 }
1768
1769 start = buf + buflen - 1;
1770 *start = '\0';
1771
1772 rcu_read_lock();
1773 do {
1774 const char *name = cgroup_name(cgrp);
1775 int len;
1776
1777 len = strlen(name);
1778 if ((start -= len) < buf)
1779 goto out;
1780 memcpy(start, name, len);
1781
1782 if (--start < buf)
1783 goto out;
1784 *start = '/';
1785
1786 cgrp = cgrp->parent;
1787 } while (cgrp->parent);
1788 ret = 0;
1789 memmove(buf, start, buf + buflen - start);
1790out:
1791 rcu_read_unlock();
1792 return ret;
1793}
1794EXPORT_SYMBOL_GPL(cgroup_path);
1795
1796/**
1797 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1630 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 * @task: target task 1631 * @task: target task
1799 * @buf: the buffer to write the path into 1632 * @buf: the buffer to write the path into
@@ -1804,49 +1637,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1804 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1637 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 * cgroup controller callbacks. 1638 * cgroup controller callbacks.
1806 * 1639 *
1807 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 1640 * Return value is the same as kernfs_path().
1808 */ 1641 */
1809int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1642char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810{ 1643{
1811 struct cgroupfs_root *root; 1644 struct cgroup_root *root;
1812 struct cgroup *cgrp; 1645 struct cgroup *cgrp;
1813 int hierarchy_id = 1, ret = 0; 1646 int hierarchy_id = 1;
1814 1647 char *path = NULL;
1815 if (buflen < 2)
1816 return -ENAMETOOLONG;
1817 1648
1818 mutex_lock(&cgroup_mutex); 1649 mutex_lock(&cgroup_mutex);
1650 down_read(&css_set_rwsem);
1819 1651
1820 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1652 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 1653
1822 if (root) { 1654 if (root) {
1823 cgrp = task_cgroup_from_root(task, root); 1655 cgrp = task_cgroup_from_root(task, root);
1824 ret = cgroup_path(cgrp, buf, buflen); 1656 path = cgroup_path(cgrp, buf, buflen);
1825 } else { 1657 } else {
1826 /* if no hierarchy exists, everyone is in "/" */ 1658 /* if no hierarchy exists, everyone is in "/" */
1827 memcpy(buf, "/", 2); 1659 if (strlcpy(buf, "/", buflen) < buflen)
1660 path = buf;
1828 } 1661 }
1829 1662
1663 up_read(&css_set_rwsem);
1830 mutex_unlock(&cgroup_mutex); 1664 mutex_unlock(&cgroup_mutex);
1831 return ret; 1665 return path;
1832} 1666}
1833EXPORT_SYMBOL_GPL(task_cgroup_path); 1667EXPORT_SYMBOL_GPL(task_cgroup_path);
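A minimal caller sketch for the reworked interface (hypothetical; the function name and log message below are invented, and <linux/cgroup.h> plus <linux/slab.h> are assumed): task_cgroup_path() now hands back a pointer into the caller's buffer, or NULL on failure, mirroring kernfs_path() instead of returning an -errno.

static void example_log_task_cgroup(struct task_struct *tsk)
{
	char *buf, *path;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return;

	/* returns a pointer into @buf on success, NULL otherwise */
	path = task_cgroup_path(tsk, buf, PATH_MAX);
	if (path)
		pr_info("%s is in cgroup %s\n", tsk->comm, path);

	kfree(buf);
}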
1834 1668
1835/* 1669/* used to track tasks and other necessary states during migration */
1836 * Control Group taskset
1837 */
1838struct task_and_cgroup {
1839 struct task_struct *task;
1840 struct cgroup *cgrp;
1841 struct css_set *cset;
1842};
1843
1844struct cgroup_taskset { 1670struct cgroup_taskset {
1845 struct task_and_cgroup single; 1671 /* the src and dst cset list running through cset->mg_node */
1846 struct flex_array *tc_array; 1672 struct list_head src_csets;
1847 int tc_array_len; 1673 struct list_head dst_csets;
1848 int idx; 1674
1849 struct cgroup *cur_cgrp; 1675 /*
1676 * Fields for cgroup_taskset_*() iteration.
1677 *
1678 * Before migration is committed, the target migration tasks are on
1679 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1680 * the csets on ->dst_csets. ->csets point to either ->src_csets
1681 * or ->dst_csets depending on whether migration is committed.
1682 *
1683 * ->cur_csets and ->cur_task point to the current task position
1684 * during iteration.
1685 */
1686 struct list_head *csets;
1687 struct css_set *cur_cset;
1688 struct task_struct *cur_task;
1850}; 1689};
1851 1690
1852/** 1691/**
@@ -1857,15 +1696,11 @@ struct cgroup_taskset {
1857 */ 1696 */
1858struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1697struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859{ 1698{
1860 if (tset->tc_array) { 1699 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1861 tset->idx = 0; 1700 tset->cur_task = NULL;
1862 return cgroup_taskset_next(tset); 1701
1863 } else { 1702 return cgroup_taskset_next(tset);
1864 tset->cur_cgrp = tset->single.cgrp;
1865 return tset->single.task;
1866 }
1867} 1703}
1868EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 1704
1870/** 1705/**
1871 * cgroup_taskset_next - iterate to the next task in taskset 1706 * cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1711,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1876 */ 1711 */
1877struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1712struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878{ 1713{
1879 struct task_and_cgroup *tc; 1714 struct css_set *cset = tset->cur_cset;
1715 struct task_struct *task = tset->cur_task;
1880 1716
1881 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1717 while (&cset->mg_node != tset->csets) {
1882 return NULL; 1718 if (!task)
1719 task = list_first_entry(&cset->mg_tasks,
1720 struct task_struct, cg_list);
1721 else
1722 task = list_next_entry(task, cg_list);
1883 1723
1884 tc = flex_array_get(tset->tc_array, tset->idx++); 1724 if (&task->cg_list != &cset->mg_tasks) {
1885 tset->cur_cgrp = tc->cgrp; 1725 tset->cur_cset = cset;
1886 return tc->task; 1726 tset->cur_task = task;
1887} 1727 return task;
1888EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1728 }
1889 1729
1890/** 1730 cset = list_next_entry(cset, mg_node);
1891 * cgroup_taskset_cur_css - return the matching css for the current task 1731 task = NULL;
1892 * @tset: taskset of interest 1732 }
1893 * @subsys_id: the ID of the target subsystem
1894 *
1895 * Return the css for the current (last returned) task of @tset for
1896 * subsystem specified by @subsys_id. This function must be preceded by
1897 * either cgroup_taskset_first() or cgroup_taskset_next().
1898 */
1899struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 int subsys_id)
1901{
1902 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903}
1904EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 1733
1906/** 1734 return NULL;
1907 * cgroup_taskset_size - return the number of tasks in taskset
1908 * @tset: taskset of interest
1909 */
1910int cgroup_taskset_size(struct cgroup_taskset *tset)
1911{
1912 return tset->tc_array ? tset->tc_array_len : 1;
1913} 1735}
1914EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1915 1736
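An illustrative ->can_attach() body (hypothetical controller, not part of this patch) showing how the first/next pair above is consumed; the cgroup_taskset_for_each() convenience macro is assumed to expand to essentially this loop.

static int example_can_attach(struct cgroup_subsys_state *css,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		/* e.g. refuse to take kernel threads into this group */
		if (task->flags & PF_KTHREAD)
			return -EINVAL;
	}
	return 0;
}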
1916 1737/**
1917/*
1918 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
1739 * @old_cgrp: the cgroup @tsk is being migrated from
1740 * @tsk: the task being migrated
1741 * @new_cset: the new css_set @tsk is being attached to
1919 * 1742 *
1920 * Must be called with cgroup_mutex and threadgroup locked. 1743 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1921 */ 1744 */
1922static void cgroup_task_migrate(struct cgroup *old_cgrp, 1745static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 struct task_struct *tsk, 1746 struct task_struct *tsk,
@@ -1925,6 +1748,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1925{ 1748{
1926 struct css_set *old_cset; 1749 struct css_set *old_cset;
1927 1750
1751 lockdep_assert_held(&cgroup_mutex);
1752 lockdep_assert_held(&css_set_rwsem);
1753
1928 /* 1754 /*
1929 * We are synchronized through threadgroup_lock() against PF_EXITING 1755 * We are synchronized through threadgroup_lock() against PF_EXITING
1930 * setting such that we can't race against cgroup_exit() changing the 1756 * setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +1759,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1933 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1759 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1934 old_cset = task_css_set(tsk); 1760 old_cset = task_css_set(tsk);
1935 1761
1936 task_lock(tsk); 1762 get_css_set(new_cset);
1937 rcu_assign_pointer(tsk->cgroups, new_cset); 1763 rcu_assign_pointer(tsk->cgroups, new_cset);
1938 task_unlock(tsk);
1939 1764
1940 /* Update the css_set linked lists if we're using them */ 1765 /*
1941 write_lock(&css_set_lock); 1766 * Use move_tail so that cgroup_taskset_first() still returns the
1942 if (!list_empty(&tsk->cg_list)) 1767 * leader after migration. This works because cgroup_migrate()
1943 list_move(&tsk->cg_list, &new_cset->tasks); 1768 * ensures that the dst_cset of the leader is the first on the
1944 write_unlock(&css_set_lock); 1769 * tset's dst_csets list.
1770 */
1771 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1945 1772
1946 /* 1773 /*
1947 * We just gained a reference on old_cset by taking it from the 1774 * We just gained a reference on old_cset by taking it from the
@@ -1949,100 +1776,199 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1949 * we're safe to drop it here; it will be freed under RCU. 1776 * we're safe to drop it here; it will be freed under RCU.
1950 */ 1777 */
1951 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1778 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 put_css_set(old_cset); 1779 put_css_set_locked(old_cset, false);
1953} 1780}
1954 1781
1955/** 1782/**
1956 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 1783 * cgroup_migrate_finish - cleanup after attach
1957 * @cgrp: the cgroup to attach to 1784 * @preloaded_csets: list of preloaded css_sets
1958 * @tsk: the task or the leader of the threadgroup to be attached
1959 * @threadgroup: attach the whole threadgroup?
1960 * 1785 *
1961 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1786 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1962 * task_lock of @tsk or each thread in the threadgroup individually in turn. 1787 * those functions for details.
1963 */ 1788 */
1964static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, 1789static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1965 bool threadgroup)
1966{ 1790{
1967 int retval, i, group_size; 1791 struct css_set *cset, *tmp_cset;
1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1970 /* threadgroup list cursor and array */
1971 struct task_struct *leader = tsk;
1972 struct task_and_cgroup *tc;
1973 struct flex_array *group;
1974 struct cgroup_taskset tset = { };
1975 1792
1976 /* 1793 lockdep_assert_held(&cgroup_mutex);
1977 * step 0: in order to do expensive, possibly blocking operations for 1794
1978 * every thread, we cannot iterate the thread group list, since it needs 1795 down_write(&css_set_rwsem);
1979 * rcu or tasklist locked. instead, build an array of all threads in the 1796 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1980 * group - group_rwsem prevents new threads from appearing, and if 1797 cset->mg_src_cgrp = NULL;
1981 * threads exit, this will just be an over-estimate. 1798 cset->mg_dst_cset = NULL;
1982 */ 1799 list_del_init(&cset->mg_preload_node);
1983 if (threadgroup) 1800 put_css_set_locked(cset, false);
1984 group_size = get_nr_threads(tsk); 1801 }
1985 else 1802 up_write(&css_set_rwsem);
1986 group_size = 1; 1803}
1987 /* flex_array supports very large thread-groups better than kmalloc. */ 1804
1988 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1805/**
1989 if (!group) 1806 * cgroup_migrate_add_src - add a migration source css_set
1990 return -ENOMEM; 1807 * @src_cset: the source css_set to add
1991 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1808 * @dst_cgrp: the destination cgroup
1992 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 1809 * @preloaded_csets: list of preloaded css_sets
1993 if (retval) 1810 *
1994 goto out_free_group_list; 1811 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1812 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1813 * up by cgroup_migrate_finish().
1814 *
1815 * This function may be called without holding threadgroup_lock even if the
1816 * target is a process. Threads may be created and destroyed but as long
1817 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1818 * the preloaded css_sets are guaranteed to cover all migrations.
1819 */
1820static void cgroup_migrate_add_src(struct css_set *src_cset,
1821 struct cgroup *dst_cgrp,
1822 struct list_head *preloaded_csets)
1823{
1824 struct cgroup *src_cgrp;
1825
1826 lockdep_assert_held(&cgroup_mutex);
1827 lockdep_assert_held(&css_set_rwsem);
1828
1829 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1830
1831 /* nothing to do if this cset already belongs to the cgroup */
1832 if (src_cgrp == dst_cgrp)
1833 return;
1834
1835 if (!list_empty(&src_cset->mg_preload_node))
1836 return;
1837
1838 WARN_ON(src_cset->mg_src_cgrp);
1839 WARN_ON(!list_empty(&src_cset->mg_tasks));
1840 WARN_ON(!list_empty(&src_cset->mg_node));
1841
1842 src_cset->mg_src_cgrp = src_cgrp;
1843 get_css_set(src_cset);
1844 list_add(&src_cset->mg_preload_node, preloaded_csets);
1845}
1846
1847/**
1848 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1849 * @dst_cgrp: the destination cgroup
1850 * @preloaded_csets: list of preloaded source css_sets
1851 *
1852 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1853 * have been preloaded to @preloaded_csets. This function looks up and
1854 * pins all destination css_sets, links each to its source, and puts them on
1855 * @preloaded_csets.
1856 *
1857 * This function must be called after cgroup_migrate_add_src() has been
1858 * called on each migration source css_set. After migration is performed
1859 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1860 * @preloaded_csets.
1861 */
1862static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1863 struct list_head *preloaded_csets)
1864{
1865 LIST_HEAD(csets);
1866 struct css_set *src_cset;
1867
1868 lockdep_assert_held(&cgroup_mutex);
1869
1870 /* look up the dst cset for each src cset and link it to src */
1871 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1872 struct css_set *dst_cset;
1873
1874 dst_cset = find_css_set(src_cset, dst_cgrp);
1875 if (!dst_cset)
1876 goto err;
1877
1878 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1879 src_cset->mg_dst_cset = dst_cset;
1880
1881 if (list_empty(&dst_cset->mg_preload_node))
1882 list_add(&dst_cset->mg_preload_node, &csets);
1883 else
1884 put_css_set(dst_cset, false);
1885 }
1886
1887 list_splice(&csets, preloaded_csets);
1888 return 0;
1889err:
1890 cgroup_migrate_finish(&csets);
1891 return -ENOMEM;
1892}
1893
1894/**
1895 * cgroup_migrate - migrate a process or task to a cgroup
1896 * @cgrp: the destination cgroup
1897 * @leader: the leader of the process or the task to migrate
1898 * @threadgroup: whether @leader points to the whole process or a single task
1899 *
1900 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1901 * process, the caller must be holding threadgroup_lock of @leader. The
1902 * caller is also responsible for invoking cgroup_migrate_add_src() and
1903 * cgroup_migrate_prepare_dst() on the targets before invoking this
1904 * function and following up with cgroup_migrate_finish().
1905 *
1906 * As long as a controller's ->can_attach() doesn't fail, this function is
1907 * guaranteed to succeed. This means that, excluding ->can_attach()
1908 * failure, when migrating multiple targets, the success or failure can be
1909 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1910 * actually starting to migrate.
1911 */
1912static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1913 bool threadgroup)
1914{
1915 struct cgroup_taskset tset = {
1916 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1917 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1918 .csets = &tset.src_csets,
1919 };
1920 struct cgroup_subsys_state *css, *failed_css = NULL;
1921 struct css_set *cset, *tmp_cset;
1922 struct task_struct *task, *tmp_task;
1923 int i, ret;
1995 1924
1996 i = 0;
1997 /* 1925 /*
1998 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1926 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 * already PF_EXITING could be freed from underneath us unless we 1927 * already PF_EXITING could be freed from underneath us unless we
2000 * take an rcu_read_lock. 1928 * take an rcu_read_lock.
2001 */ 1929 */
1930 down_write(&css_set_rwsem);
2002 rcu_read_lock(); 1931 rcu_read_lock();
1932 task = leader;
2003 do { 1933 do {
2004 struct task_and_cgroup ent; 1934 /* @task either already exited or can't exit until the end */
1935 if (task->flags & PF_EXITING)
1936 goto next;
2005 1937
2006 /* @tsk either already exited or can't exit until the end */ 1938 /* leave @task alone if post_fork() hasn't linked it yet */
2007 if (tsk->flags & PF_EXITING) 1939 if (list_empty(&task->cg_list))
2008 goto next; 1940 goto next;
2009 1941
2010 /* as per above, nr_threads may decrease, but not increase. */ 1942 cset = task_css_set(task);
2011 BUG_ON(i >= group_size); 1943 if (!cset->mg_src_cgrp)
2012 ent.task = tsk;
2013 ent.cgrp = task_cgroup_from_root(tsk, root);
2014 /* nothing to do if this task is already in the cgroup */
2015 if (ent.cgrp == cgrp)
2016 goto next; 1944 goto next;
1945
2017 /* 1946 /*
2018 * saying GFP_ATOMIC has no effect here because we did prealloc 1947 * cgroup_taskset_first() must always return the leader.
2019 * earlier, but it's good form to communicate our expectations. 1948 * Take care to avoid disturbing the ordering.
2020 */ 1949 */
2021 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 1950 list_move_tail(&task->cg_list, &cset->mg_tasks);
2022 BUG_ON(retval != 0); 1951 if (list_empty(&cset->mg_node))
2023 i++; 1952 list_add_tail(&cset->mg_node, &tset.src_csets);
1953 if (list_empty(&cset->mg_dst_cset->mg_node))
1954 list_move_tail(&cset->mg_dst_cset->mg_node,
1955 &tset.dst_csets);
2024 next: 1956 next:
2025 if (!threadgroup) 1957 if (!threadgroup)
2026 break; 1958 break;
2027 } while_each_thread(leader, tsk); 1959 } while_each_thread(leader, task);
2028 rcu_read_unlock(); 1960 rcu_read_unlock();
2029 /* remember the number of threads in the array for later. */ 1961 up_write(&css_set_rwsem);
2030 group_size = i;
2031 tset.tc_array = group;
2032 tset.tc_array_len = group_size;
2033 1962
2034 /* methods shouldn't be called if no task is actually migrating */ 1963 /* methods shouldn't be called if no task is actually migrating */
2035 retval = 0; 1964 if (list_empty(&tset.src_csets))
2036 if (!group_size) 1965 return 0;
2037 goto out_free_group_list;
2038 1966
2039 /* 1967 /* check that we can legitimately attach to the cgroup */
2040 * step 1: check that we can legitimately attach to the cgroup.
2041 */
2042 for_each_css(css, i, cgrp) { 1968 for_each_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 1969 if (css->ss->can_attach) {
2044 retval = css->ss->can_attach(css, &tset); 1970 ret = css->ss->can_attach(css, &tset);
2045 if (retval) { 1971 if (ret) {
2046 failed_css = css; 1972 failed_css = css;
2047 goto out_cancel_attach; 1973 goto out_cancel_attach;
2048 } 1974 }
@@ -2050,70 +1976,91 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 } 1976 }
2051 1977
2052 /* 1978 /*
2053 * step 2: make sure css_sets exist for all threads to be migrated. 1979 * Now that we're guaranteed success, proceed to move all tasks to
2054 * we use find_css_set, which allocates a new one if necessary. 1980 * the new cgroup. There are no failure cases after here, so this
1981 * is the commit point.
2055 */ 1982 */
2056 for (i = 0; i < group_size; i++) { 1983 down_write(&css_set_rwsem);
2057 struct css_set *old_cset; 1984 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2058 1985 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2059 tc = flex_array_get(group, i); 1986 cgroup_task_migrate(cset->mg_src_cgrp, task,
2060 old_cset = task_css_set(tc->task); 1987 cset->mg_dst_cset);
2061 tc->cset = find_css_set(old_cset, cgrp);
2062 if (!tc->cset) {
2063 retval = -ENOMEM;
2064 goto out_put_css_set_refs;
2065 }
2066 } 1988 }
1989 up_write(&css_set_rwsem);
2067 1990
2068 /* 1991 /*
2069 * step 3: now that we're guaranteed success wrt the css_sets, 1992 * Migration is committed, all target tasks are now on dst_csets.
2070 * proceed to move all tasks to the new cgroup. There are no 1993 * Nothing is sensitive to fork() after this point. Notify
2071 * failure cases after here, so this is the commit point. 1994 * controllers that migration is complete.
2072 */ 1995 */
2073 for (i = 0; i < group_size; i++) { 1996 tset.csets = &tset.dst_csets;
2074 tc = flex_array_get(group, i);
2075 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 }
2077 /* nothing is sensitive to fork() after this point. */
2078 1997
2079 /*
2080 * step 4: do subsystem attach callbacks.
2081 */
2082 for_each_css(css, i, cgrp) 1998 for_each_css(css, i, cgrp)
2083 if (css->ss->attach) 1999 if (css->ss->attach)
2084 css->ss->attach(css, &tset); 2000 css->ss->attach(css, &tset);
2085 2001
2086 /* 2002 ret = 0;
2087 * step 5: success! and cleanup 2003 goto out_release_tset;
2088 */ 2004
2089 retval = 0;
2090out_put_css_set_refs:
2091 if (retval) {
2092 for (i = 0; i < group_size; i++) {
2093 tc = flex_array_get(group, i);
2094 if (!tc->cset)
2095 break;
2096 put_css_set(tc->cset);
2097 }
2098 }
2099out_cancel_attach: 2005out_cancel_attach:
2100 if (retval) { 2006 for_each_css(css, i, cgrp) {
2101 for_each_css(css, i, cgrp) { 2007 if (css == failed_css)
2102 if (css == failed_css) 2008 break;
2103 break; 2009 if (css->ss->cancel_attach)
2104 if (css->ss->cancel_attach) 2010 css->ss->cancel_attach(css, &tset);
2105 css->ss->cancel_attach(css, &tset);
2106 }
2107 } 2011 }
2108out_free_group_list: 2012out_release_tset:
2109 flex_array_free(group); 2013 down_write(&css_set_rwsem);
2110 return retval; 2014 list_splice_init(&tset.dst_csets, &tset.src_csets);
2015 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2016 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2017 list_del_init(&cset->mg_node);
2018 }
2019 up_write(&css_set_rwsem);
2020 return ret;
2021}
2022
2023/**
2024 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2025 * @dst_cgrp: the cgroup to attach to
2026 * @leader: the task or the leader of the threadgroup to be attached
2027 * @threadgroup: attach the whole threadgroup?
2028 *
2029 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2030 */
2031static int cgroup_attach_task(struct cgroup *dst_cgrp,
2032 struct task_struct *leader, bool threadgroup)
2033{
2034 LIST_HEAD(preloaded_csets);
2035 struct task_struct *task;
2036 int ret;
2037
2038 /* look up all src csets */
2039 down_read(&css_set_rwsem);
2040 rcu_read_lock();
2041 task = leader;
2042 do {
2043 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2044 &preloaded_csets);
2045 if (!threadgroup)
2046 break;
2047 } while_each_thread(leader, task);
2048 rcu_read_unlock();
2049 up_read(&css_set_rwsem);
2050
2051 /* prepare dst csets and commit */
2052 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2053 if (!ret)
2054 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2055
2056 cgroup_migrate_finish(&preloaded_csets);
2057 return ret;
2111} 2058}
2112 2059
2113/* 2060/*
2114 * Find the task_struct of the task to attach by vpid and pass it along to the 2061 * Find the task_struct of the task to attach by vpid and pass it along to the
2115 * function to attach either it or all tasks in its threadgroup. Will lock 2062 * function to attach either it or all tasks in its threadgroup. Will lock
2116 * cgroup_mutex and threadgroup; may take task_lock of task. 2063 * cgroup_mutex and threadgroup.
2117 */ 2064 */
2118static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2065static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2119{ 2066{
@@ -2198,12 +2145,19 @@ out_unlock_cgroup:
2198 */ 2145 */
2199int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2146int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2200{ 2147{
2201 struct cgroupfs_root *root; 2148 struct cgroup_root *root;
2202 int retval = 0; 2149 int retval = 0;
2203 2150
2204 mutex_lock(&cgroup_mutex); 2151 mutex_lock(&cgroup_mutex);
2205 for_each_active_root(root) { 2152 for_each_root(root) {
2206 struct cgroup *from_cgrp = task_cgroup_from_root(from, root); 2153 struct cgroup *from_cgrp;
2154
2155 if (root == &cgrp_dfl_root)
2156 continue;
2157
2158 down_read(&css_set_rwsem);
2159 from_cgrp = task_cgroup_from_root(from, root);
2160 up_read(&css_set_rwsem);
2207 2161
2208 retval = cgroup_attach_task(from_cgrp, tsk, false); 2162 retval = cgroup_attach_task(from_cgrp, tsk, false);
2209 if (retval) 2163 if (retval)
@@ -2228,16 +2182,17 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
2228} 2182}
2229 2183
2230static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2184static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2231 struct cftype *cft, const char *buffer) 2185 struct cftype *cft, char *buffer)
2232{ 2186{
2233 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); 2187 struct cgroup_root *root = css->cgroup->root;
2234 if (strlen(buffer) >= PATH_MAX) 2188
2235 return -EINVAL; 2189 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2236 if (!cgroup_lock_live_group(css->cgroup)) 2190 if (!cgroup_lock_live_group(css->cgroup))
2237 return -ENODEV; 2191 return -ENODEV;
2238 mutex_lock(&cgroup_root_mutex); 2192 spin_lock(&release_agent_path_lock);
2239 strcpy(css->cgroup->root->release_agent_path, buffer); 2193 strlcpy(root->release_agent_path, buffer,
2240 mutex_unlock(&cgroup_root_mutex); 2194 sizeof(root->release_agent_path));
2195 spin_unlock(&release_agent_path_lock);
2241 mutex_unlock(&cgroup_mutex); 2196 mutex_unlock(&cgroup_mutex);
2242 return 0; 2197 return 0;
2243} 2198}
@@ -2262,32 +2217,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2262 return 0; 2217 return 0;
2263} 2218}
2264 2219
2265/* A buffer size big enough for numbers or short strings */ 2220static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2266#define CGROUP_LOCAL_BUFFER_SIZE 64 2221 size_t nbytes, loff_t off)
2267
2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2269 size_t nbytes, loff_t *ppos)
2270{ 2222{
2271 struct cfent *cfe = __d_cfe(file->f_dentry); 2223 struct cgroup *cgrp = of->kn->parent->priv;
2272 struct cftype *cft = __d_cft(file->f_dentry); 2224 struct cftype *cft = of->kn->priv;
2273 struct cgroup_subsys_state *css = cfe->css; 2225 struct cgroup_subsys_state *css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret; 2226 int ret;
2277 2227
2278 if (nbytes >= max_bytes) 2228 /*
2279 return -E2BIG; 2229 * kernfs guarantees that a file isn't deleted with operations in
2280 2230 * flight, which means that the matching css is and stays alive and
2281 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2231 * doesn't need to be pinned. The RCU locking is not necessary
2282 if (!buf) 2232 * either. It's just for the convenience of using cgroup_css().
2283 return -ENOMEM; 2233 */
2284 2234 rcu_read_lock();
2285 if (copy_from_user(buf, userbuf, nbytes)) { 2235 css = cgroup_css(cgrp, cft->ss);
2286 ret = -EFAULT; 2236 rcu_read_unlock();
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291 2237
2292 if (cft->write_string) { 2238 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf)); 2239 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2252,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2306 } else { 2252 } else {
2307 ret = -EINVAL; 2253 ret = -EINVAL;
2308 } 2254 }
2309out_free: 2255
2310 kfree(buf);
2311 return ret ?: nbytes; 2256 return ret ?: nbytes;
2312} 2257}
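One of the handlers the dispatch above can land in, sketched with invented names: for a cftype with .write_u64 set, cgroup_file_write() strips and parses the buffer into a u64 before invoking the method.

static int example_limit_write(struct cgroup_subsys_state *css,
			       struct cftype *cft, u64 val)
{
	if (!val)
		return -EINVAL;	/* reject a zero limit */
	/* record @val in the controller's per-css state here */
	return 0;
}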
2313 2258
2314/*
2315 * seqfile ops/methods for returning structured data. Currently just
2316 * supports string->u64 maps, but can be extended in future.
2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2259static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2320{ 2260{
2321 struct cftype *cft = seq_cft(seq); 2261 return seq_cft(seq)->seq_start(seq, ppos);
2322
2323 if (cft->seq_start) {
2324 return cft->seq_start(seq, ppos);
2325 } else {
2326 /*
2327 * The same behavior and code as single_open(). Returns
2328 * !NULL if pos is at the beginning; otherwise, NULL.
2329 */
2330 return NULL + !*ppos;
2331 }
2332} 2262}
2333 2263
2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2264static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2335{ 2265{
2336 struct cftype *cft = seq_cft(seq); 2266 return seq_cft(seq)->seq_next(seq, v, ppos);
2337
2338 if (cft->seq_next) {
2339 return cft->seq_next(seq, v, ppos);
2340 } else {
2341 /*
2342 * The same behavior and code as single_open(), always
2343 * terminate after the initial read.
2344 */
2345 ++*ppos;
2346 return NULL;
2347 }
2348} 2267}
2349 2268
2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2269static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2351{ 2270{
2352 struct cftype *cft = seq_cft(seq); 2271 seq_cft(seq)->seq_stop(seq, v);
2353
2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2356} 2272}
2357 2273
2358static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2274static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2288,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2372 return 0; 2288 return 0;
2373} 2289}
2374 2290
2375static struct seq_operations cgroup_seq_operations = { 2291static struct kernfs_ops cgroup_kf_single_ops = {
2376 .start = cgroup_seqfile_start, 2292 .atomic_write_len = PAGE_SIZE,
2377 .next = cgroup_seqfile_next, 2293 .write = cgroup_file_write,
2378 .stop = cgroup_seqfile_stop, 2294 .seq_show = cgroup_seqfile_show,
2379 .show = cgroup_seqfile_show,
2380}; 2295};
2381 2296
2382static int cgroup_file_open(struct inode *inode, struct file *file) 2297static struct kernfs_ops cgroup_kf_ops = {
2383{ 2298 .atomic_write_len = PAGE_SIZE,
2384 struct cfent *cfe = __d_cfe(file->f_dentry); 2299 .write = cgroup_file_write,
2385 struct cftype *cft = __d_cft(file->f_dentry); 2300 .seq_start = cgroup_seqfile_start,
2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2301 .seq_next = cgroup_seqfile_next,
2387 struct cgroup_subsys_state *css; 2302 .seq_stop = cgroup_seqfile_stop,
2388 struct cgroup_open_file *of; 2303 .seq_show = cgroup_seqfile_show,
2389 int err; 2304};
2390
2391 err = generic_file_open(inode, file);
2392 if (err)
2393 return err;
2394
2395 /*
2396 * If the file belongs to a subsystem, pin the css. Will be
2397 * unpinned either on open failure or release. This ensures that
2398 * @css stays alive for all file operations.
2399 */
2400 rcu_read_lock();
2401 css = cgroup_css(cgrp, cft->ss);
2402 if (cft->ss && !css_tryget(css))
2403 css = NULL;
2404 rcu_read_unlock();
2405
2406 if (!css)
2407 return -ENODEV;
2408
2409 /*
2410 * @cfe->css is used by read/write/close to determine the
2411 * associated css. @file->private_data would be a better place but
2412 * that's already used by seqfile. Multiple accessors may use it
2413 * simultaneously which is okay as the association never changes.
2414 */
2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2416 cfe->css = css;
2417
2418 of = __seq_open_private(file, &cgroup_seq_operations,
2419 sizeof(struct cgroup_open_file));
2420 if (of) {
2421 of->cfe = cfe;
2422 return 0;
2423 }
2424
2425 if (css->ss)
2426 css_put(css);
2427 return -ENOMEM;
2428}
2429
2430static int cgroup_file_release(struct inode *inode, struct file *file)
2431{
2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2433 struct cgroup_subsys_state *css = cfe->css;
2434
2435 if (css->ss)
2436 css_put(css);
2437 return seq_release_private(inode, file);
2438}
2439 2305
2440/* 2306/*
2441 * cgroup_rename - Only allow simple rename of directories in place. 2307 * cgroup_rename - Only allow simple rename of directories in place.
2442 */ 2308 */
2443static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2309static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2444 struct inode *new_dir, struct dentry *new_dentry) 2310 const char *new_name_str)
2445{ 2311{
2312 struct cgroup *cgrp = kn->priv;
2446 int ret; 2313 int ret;
2447 struct cgroup_name *name, *old_name;
2448 struct cgroup *cgrp;
2449
2450 /*
2451 * It's convenient to use the parent dir's i_mutex to protect
2452 * cgrp->name.
2453 */
2454 lockdep_assert_held(&old_dir->i_mutex);
2455 2314
2456 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2315 if (kernfs_type(kn) != KERNFS_DIR)
2457 return -ENOTDIR; 2316 return -ENOTDIR;
2458 if (new_dentry->d_inode) 2317 if (kn->parent != new_parent)
2459 return -EEXIST;
2460 if (old_dir != new_dir)
2461 return -EIO; 2318 return -EIO;
2462 2319
2463 cgrp = __d_cgrp(old_dentry);
2464
2465 /* 2320 /*
2466 * This isn't a proper migration and its usefulness is very 2321 * This isn't a proper migration and its usefulness is very
2467 * limited. Disallow if sane_behavior. 2322 * limited. Disallow if sane_behavior.
@@ -2469,218 +2324,40 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2469 if (cgroup_sane_behavior(cgrp)) 2324 if (cgroup_sane_behavior(cgrp))
2470 return -EPERM; 2325 return -EPERM;
2471 2326
2472 name = cgroup_alloc_name(new_dentry); 2327 /*
2473 if (!name) 2328 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2474 return -ENOMEM; 2329 * active_ref. kernfs_rename() doesn't require active_ref
2475 2330 * protection. Break them before grabbing cgroup_tree_mutex.
2476 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2331 */
2477 if (ret) { 2332 kernfs_break_active_protection(new_parent);
2478 kfree(name); 2333 kernfs_break_active_protection(kn);
2479 return ret;
2480 }
2481
2482 old_name = rcu_dereference_protected(cgrp->name, true);
2483 rcu_assign_pointer(cgrp->name, name);
2484
2485 kfree_rcu(old_name, rcu_head);
2486 return 0;
2487}
2488
2489static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2490{
2491 if (S_ISDIR(dentry->d_inode->i_mode))
2492 return &__d_cgrp(dentry)->xattrs;
2493 else
2494 return &__d_cfe(dentry)->xattrs;
2495}
2496
2497static inline int xattr_enabled(struct dentry *dentry)
2498{
2499 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2500 return root->flags & CGRP_ROOT_XATTR;
2501}
2502
2503static bool is_valid_xattr(const char *name)
2504{
2505 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2506 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2507 return true;
2508 return false;
2509}
2510
2511static int cgroup_setxattr(struct dentry *dentry, const char *name,
2512 const void *val, size_t size, int flags)
2513{
2514 if (!xattr_enabled(dentry))
2515 return -EOPNOTSUPP;
2516 if (!is_valid_xattr(name))
2517 return -EINVAL;
2518 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2519}
2520
2521static int cgroup_removexattr(struct dentry *dentry, const char *name)
2522{
2523 if (!xattr_enabled(dentry))
2524 return -EOPNOTSUPP;
2525 if (!is_valid_xattr(name))
2526 return -EINVAL;
2527 return simple_xattr_remove(__d_xattrs(dentry), name);
2528}
2529
2530static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2531 void *buf, size_t size)
2532{
2533 if (!xattr_enabled(dentry))
2534 return -EOPNOTSUPP;
2535 if (!is_valid_xattr(name))
2536 return -EINVAL;
2537 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2538}
2539
2540static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2541{
2542 if (!xattr_enabled(dentry))
2543 return -EOPNOTSUPP;
2544 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2545}
2546
2547static const struct file_operations cgroup_file_operations = {
2548 .read = seq_read,
2549 .write = cgroup_file_write,
2550 .llseek = generic_file_llseek,
2551 .open = cgroup_file_open,
2552 .release = cgroup_file_release,
2553};
2554
2555static const struct inode_operations cgroup_file_inode_operations = {
2556 .setxattr = cgroup_setxattr,
2557 .getxattr = cgroup_getxattr,
2558 .listxattr = cgroup_listxattr,
2559 .removexattr = cgroup_removexattr,
2560};
2561
2562static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = simple_lookup,
2564 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename,
2567 .setxattr = cgroup_setxattr,
2568 .getxattr = cgroup_getxattr,
2569 .listxattr = cgroup_listxattr,
2570 .removexattr = cgroup_removexattr,
2571};
2572
2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2574 struct super_block *sb)
2575{
2576 struct inode *inode;
2577
2578 if (!dentry)
2579 return -ENOENT;
2580 if (dentry->d_inode)
2581 return -EEXIST;
2582
2583 inode = cgroup_new_inode(mode, sb);
2584 if (!inode)
2585 return -ENOMEM;
2586
2587 if (S_ISDIR(mode)) {
2588 inode->i_op = &cgroup_dir_inode_operations;
2589 inode->i_fop = &simple_dir_operations;
2590
2591 /* start off with i_nlink == 2 (for "." entry) */
2592 inc_nlink(inode);
2593 inc_nlink(dentry->d_parent->d_inode);
2594
2595 /*
2596 * Control reaches here with cgroup_mutex held.
2597 * @inode->i_mutex should nest outside cgroup_mutex but we
2598 * want to populate it immediately without releasing
2599 * cgroup_mutex. As @inode isn't visible to anyone else
2600 * yet, trylock will always succeed without affecting
2601 * lockdep checks.
2602 */
2603 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2604 } else if (S_ISREG(mode)) {
2605 inode->i_size = 0;
2606 inode->i_fop = &cgroup_file_operations;
2607 inode->i_op = &cgroup_file_inode_operations;
2608 }
2609 d_instantiate(dentry, inode);
2610 dget(dentry); /* Extra count - pin the dentry in core */
2611 return 0;
2612}
2613
2614/**
2615 * cgroup_file_mode - deduce file mode of a control file
2616 * @cft: the control file in question
2617 *
2618 * returns cft->mode if ->mode is not 0
2619 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2620 * returns S_IRUGO if it has only a read handler
2621 * returns S_IWUSR if it has only a write handler
2622 */
2623static umode_t cgroup_file_mode(const struct cftype *cft)
2624{
2625 umode_t mode = 0;
2626 2334
2627 if (cft->mode) 2335 mutex_lock(&cgroup_tree_mutex);
2628 return cft->mode; 2336 mutex_lock(&cgroup_mutex);
2629 2337
2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 2338 ret = kernfs_rename(kn, new_parent, new_name_str);
2631 mode |= S_IRUGO;
2632 2339
2633 if (cft->write_u64 || cft->write_s64 || cft->write_string || 2340 mutex_unlock(&cgroup_mutex);
2634 cft->trigger) 2341 mutex_unlock(&cgroup_tree_mutex);
2635 mode |= S_IWUSR;
2636 2342
2637 return mode; 2343 kernfs_unbreak_active_protection(kn);
2344 kernfs_unbreak_active_protection(new_parent);
2345 return ret;
2638} 2346}
2639 2347
2640static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2348static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2641{ 2349{
2642 struct dentry *dir = cgrp->dentry; 2350 char name[CGROUP_FILE_NAME_MAX];
2643 struct cgroup *parent = __d_cgrp(dir); 2351 struct kernfs_node *kn;
2644 struct dentry *dentry; 2352 struct lock_class_key *key = NULL;
2645 struct cfent *cfe;
2646 int error;
2647 umode_t mode;
2648 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2649
2650 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2651 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2652 strcpy(name, cft->ss->name);
2653 strcat(name, ".");
2654 }
2655 strcat(name, cft->name);
2656
2657 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2658
2659 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2660 if (!cfe)
2661 return -ENOMEM;
2662 2353
2663 dentry = lookup_one_len(name, dir, strlen(name)); 2354#ifdef CONFIG_DEBUG_LOCK_ALLOC
2664 if (IS_ERR(dentry)) { 2355 key = &cft->lockdep_key;
2665 error = PTR_ERR(dentry); 2356#endif
2666 goto out; 2357 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2667 } 2358 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2668 2359 NULL, false, key);
2669 cfe->type = (void *)cft; 2360 return PTR_ERR_OR_ZERO(kn);
2670 cfe->dentry = dentry;
2671 dentry->d_fsdata = cfe;
2672 simple_xattrs_init(&cfe->xattrs);
2673
2674 mode = cgroup_file_mode(cft);
2675 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2676 if (!error) {
2677 list_add_tail(&cfe->node, &parent->files);
2678 cfe = NULL;
2679 }
2680 dput(dentry);
2681out:
2682 kfree(cfe);
2683 return error;
2684} 2361}
2685 2362
2686/** 2363/**
@@ -2700,11 +2377,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2700 struct cftype *cft; 2377 struct cftype *cft;
2701 int ret; 2378 int ret;
2702 2379
2703 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 2380 lockdep_assert_held(&cgroup_tree_mutex);
2704 lockdep_assert_held(&cgroup_mutex);
2705 2381
2706 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2382 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2707 /* does cft->flags tell us to skip this file on @cgrp? */ 2383 /* does cft->flags tell us to skip this file on @cgrp? */
2384 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2385 continue;
2708 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2386 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2709 continue; 2387 continue;
2710 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2388 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
@@ -2726,44 +2404,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 return 0; 2404 return 0;
2727} 2405}
2728 2406
2729static void cgroup_cfts_prepare(void) 2407static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2730 __acquires(&cgroup_mutex)
2731{
2732 /*
2733 * Thanks to the entanglement with vfs inode locking, we can't walk
2734 * the existing cgroups under cgroup_mutex and create files.
2735 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2736 * lock before calling cgroup_addrm_files().
2737 */
2738 mutex_lock(&cgroup_mutex);
2739}
2740
2741static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2742 __releases(&cgroup_mutex)
2743{ 2408{
2744 LIST_HEAD(pending); 2409 LIST_HEAD(pending);
2745 struct cgroup_subsys *ss = cfts[0].ss; 2410 struct cgroup_subsys *ss = cfts[0].ss;
2746 struct cgroup *root = &ss->root->top_cgroup; 2411 struct cgroup *root = &ss->root->cgrp;
2747 struct super_block *sb = ss->root->sb;
2748 struct dentry *prev = NULL;
2749 struct inode *inode;
2750 struct cgroup_subsys_state *css; 2412 struct cgroup_subsys_state *css;
2751 u64 update_before;
2752 int ret = 0; 2413 int ret = 0;
2753 2414
2754 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2415 lockdep_assert_held(&cgroup_tree_mutex);
2755 if (!cfts || ss->root == &cgroup_dummy_root ||
2756 !atomic_inc_not_zero(&sb->s_active)) {
2757 mutex_unlock(&cgroup_mutex);
2758 return 0;
2759 }
2760 2416
2761 /* 2417 /* don't bother if @ss isn't attached */
2762 * All cgroups which are created after we drop cgroup_mutex will 2418 if (ss->root == &cgrp_dfl_root)
2763 * have the updated set of files, so we only need to update the 2419 return 0;
2764 * cgroups created before the current @cgroup_serial_nr_next.
2765 */
2766 update_before = cgroup_serial_nr_next;
2767 2420
2768 /* add/rm files for all cgroups created before */ 2421 /* add/rm files for all cgroups created before */
2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2422 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2425,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2772 if (cgroup_is_dead(cgrp)) 2425 if (cgroup_is_dead(cgrp))
2773 continue; 2426 continue;
2774 2427
2775 inode = cgrp->dentry->d_inode; 2428 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2776 dget(cgrp->dentry);
2777 dput(prev);
2778 prev = cgrp->dentry;
2779
2780 mutex_unlock(&cgroup_mutex);
2781 mutex_lock(&inode->i_mutex);
2782 mutex_lock(&cgroup_mutex);
2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2785 mutex_unlock(&inode->i_mutex);
2786 if (ret) 2429 if (ret)
2787 break; 2430 break;
2788 } 2431 }
2789 mutex_unlock(&cgroup_mutex); 2432
2790 dput(prev); 2433 if (is_add && !ret)
2791 deactivate_super(sb); 2434 kernfs_activate(root->kn);
2792 return ret; 2435 return ret;
2793} 2436}
2794 2437
2795/** 2438static void cgroup_exit_cftypes(struct cftype *cfts)
2796 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2797 * @ss: target cgroup subsystem
2798 * @cfts: zero-length name terminated array of cftypes
2799 *
2800 * Register @cfts to @ss. Files described by @cfts are created for all
2801 * existing cgroups to which @ss is attached and all future cgroups will
2802 * have them too. This function can be called anytime whether @ss is
2803 * attached or not.
2804 *
2805 * Returns 0 on successful registration, -errno on failure. Note that this
2806 * function currently returns 0 as long as @cfts registration is successful
2807 * even if some file creation attempts on existing cgroups fail.
2808 */
2809int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2810{ 2439{
2811 struct cftype_set *set;
2812 struct cftype *cft; 2440 struct cftype *cft;
2813 int ret;
2814 2441
2815 set = kzalloc(sizeof(*set), GFP_KERNEL); 2442 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2816 if (!set) 2443 /* free copy for custom atomic_write_len, see init_cftypes() */
2817 return -ENOMEM; 2444 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2445 kfree(cft->kf_ops);
2446 cft->kf_ops = NULL;
2447 cft->ss = NULL;
2448 }
2449}
2818 2450
2819 for (cft = cfts; cft->name[0] != '\0'; cft++) 2451static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2452{
2453 struct cftype *cft;
2454
2455 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2456 struct kernfs_ops *kf_ops;
2457
2458 WARN_ON(cft->ss || cft->kf_ops);
2459
2460 if (cft->seq_start)
2461 kf_ops = &cgroup_kf_ops;
2462 else
2463 kf_ops = &cgroup_kf_single_ops;
2464
2465 /*
2466 * Ugh... if @cft wants a custom max_write_len, we need to
2467 * make a copy of kf_ops to set its atomic_write_len.
2468 */
2469 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2470 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2471 if (!kf_ops) {
2472 cgroup_exit_cftypes(cfts);
2473 return -ENOMEM;
2474 }
2475 kf_ops->atomic_write_len = cft->max_write_len;
2476 }
2477
2478 cft->kf_ops = kf_ops;
2820 cft->ss = ss; 2479 cft->ss = ss;
2480 }
2821 2481
2822 cgroup_cfts_prepare(); 2482 return 0;
2823 set->cfts = cfts; 2483}
2824 list_add_tail(&set->node, &ss->cftsets); 2484
2825 ret = cgroup_cfts_commit(cfts, true); 2485static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2826 if (ret) 2486{
2827 cgroup_rm_cftypes(cfts); 2487 lockdep_assert_held(&cgroup_tree_mutex);
2828 return ret; 2488
2489 if (!cfts || !cfts[0].ss)
2490 return -ENOENT;
2491
2492 list_del(&cfts->node);
2493 cgroup_apply_cftypes(cfts, false);
2494 cgroup_exit_cftypes(cfts);
2495 return 0;
2829} 2496}
2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2831 2497
2832/** 2498/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2499 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2508,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2842 */ 2508 */
2843int cgroup_rm_cftypes(struct cftype *cfts) 2509int cgroup_rm_cftypes(struct cftype *cfts)
2844{ 2510{
2845 struct cftype_set *set; 2511 int ret;
2846 2512
2847 if (!cfts || !cfts[0].ss) 2513 mutex_lock(&cgroup_tree_mutex);
2848 return -ENOENT; 2514 ret = cgroup_rm_cftypes_locked(cfts);
2515 mutex_unlock(&cgroup_tree_mutex);
2516 return ret;
2517}
2849 2518
2850 cgroup_cfts_prepare(); 2519/**
2520 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2521 * @ss: target cgroup subsystem
2522 * @cfts: zero-length name terminated array of cftypes
2523 *
2524 * Register @cfts to @ss. Files described by @cfts are created for all
2525 * existing cgroups to which @ss is attached and all future cgroups will
2526 * have them too. This function can be called anytime whether @ss is
2527 * attached or not.
2528 *
2529 * Returns 0 on successful registration, -errno on failure. Note that this
2530 * function currently returns 0 as long as @cfts registration is successful
2531 * even if some file creation attempts on existing cgroups fail.
2532 */
2533int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2534{
2535 int ret;
2851 2536
2852 list_for_each_entry(set, &cfts[0].ss->cftsets, node) { 2537 if (!cfts || cfts[0].name[0] == '\0')
2853 if (set->cfts == cfts) { 2538 return 0;
2854 list_del(&set->node); 2539
2855 kfree(set); 2540 ret = cgroup_init_cftypes(ss, cfts);
2856 cgroup_cfts_commit(cfts, false); 2541 if (ret)
2857 return 0; 2542 return ret;
2858 } 2543
2859 } 2544 mutex_lock(&cgroup_tree_mutex);
2860 2545
2861 cgroup_cfts_commit(NULL, false); 2546 list_add_tail(&cfts->node, &ss->cfts);
2862 return -ENOENT; 2547 ret = cgroup_apply_cftypes(cfts, true);
2548 if (ret)
2549 cgroup_rm_cftypes_locked(cfts);
2550
2551 mutex_unlock(&cgroup_tree_mutex);
2552 return ret;
2863} 2553}
2864 2554
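A registration sketch for the interface above (controller name, file name, and handlers are all invented): a zero-length-name entry terminates the array, and the files appear in existing and future cgroups of whatever hierarchy the subsystem is attached to.

/* hypothetical controller, assumed to be declared elsewhere */
extern struct cgroup_subsys example_cgrp_subsys;

static u64 example_weight_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 100;	/* placeholder value */
}

static struct cftype example_files[] = {
	{
		.name = "weight",
		.read_u64 = example_weight_read,
	},
	{ }	/* terminator */
};

static int __init example_files_init(void)
{
	return cgroup_add_cftypes(&example_cgrp_subsys, example_files);
}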
2865/** 2555/**
@@ -2868,57 +2558,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2868 * 2558 *
2869 * Return the number of tasks in the cgroup. 2559 * Return the number of tasks in the cgroup.
2870 */ 2560 */
2871int cgroup_task_count(const struct cgroup *cgrp) 2561static int cgroup_task_count(const struct cgroup *cgrp)
2872{ 2562{
2873 int count = 0; 2563 int count = 0;
2874 struct cgrp_cset_link *link; 2564 struct cgrp_cset_link *link;
2875 2565
2876 read_lock(&css_set_lock); 2566 down_read(&css_set_rwsem);
2877 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2878 count += atomic_read(&link->cset->refcount); 2568 count += atomic_read(&link->cset->refcount);
2879 read_unlock(&css_set_lock); 2569 up_read(&css_set_rwsem);
2880 return count; 2570 return count;
2881} 2571}
2882 2572
2883/*
2884 * To reduce the fork() overhead for systems that are not actually using
2885 * their cgroups capability, we don't maintain the lists running through
2886 * each css_set to its tasks until we see the list actually used - in other
2887 * words after the first call to css_task_iter_start().
2888 */
2889static void cgroup_enable_task_cg_lists(void)
2890{
2891 struct task_struct *p, *g;
2892 write_lock(&css_set_lock);
2893 use_task_css_set_links = 1;
2894 /*
2895 * We need tasklist_lock because RCU is not safe against
2896 * while_each_thread(). Besides, a forking task that has passed
2897 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2898 * is not guaranteed to have its child immediately visible in the
2899 * tasklist if we walk through it with RCU.
2900 */
2901 read_lock(&tasklist_lock);
2902 do_each_thread(g, p) {
2903 task_lock(p);
2904 /*
2905 * We should check if the process is exiting, otherwise
2906 * it will race with cgroup_exit() in that the list
2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2916 task_unlock(p);
2917 } while_each_thread(g, p);
2918 read_unlock(&tasklist_lock);
2919 write_unlock(&css_set_lock);
2920}
2921
2922/** 2573/**
2923 * css_next_child - find the next child of a given css 2574 * css_next_child - find the next child of a given css
2924 * @pos_css: the current position (%NULL to initiate traversal) 2575 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2588,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2937 struct cgroup *cgrp = parent_css->cgroup; 2588 struct cgroup *cgrp = parent_css->cgroup;
2938 struct cgroup *next; 2589 struct cgroup *next;
2939 2590
2940 cgroup_assert_mutex_or_rcu_locked(); 2591 cgroup_assert_mutexes_or_rcu_locked();
2941 2592
2942 /* 2593 /*
2943 * @pos could already have been removed. Once a cgroup is removed, 2594 * @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2624,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2973 2624
2974 return cgroup_css(next, parent_css->ss); 2625 return cgroup_css(next, parent_css->ss);
2975} 2626}
2976EXPORT_SYMBOL_GPL(css_next_child);
2977 2627
2978/** 2628/**
2979 * css_next_descendant_pre - find the next descendant for pre-order walk 2629 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2645,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2995{ 2645{
2996 struct cgroup_subsys_state *next; 2646 struct cgroup_subsys_state *next;
2997 2647
2998 cgroup_assert_mutex_or_rcu_locked(); 2648 cgroup_assert_mutexes_or_rcu_locked();
2999 2649
3000 /* if first iteration, visit @root */ 2650 /* if first iteration, visit @root */
3001 if (!pos) 2651 if (!pos)
@@ -3016,7 +2666,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3016 2666
3017 return NULL; 2667 return NULL;
3018} 2668}
3019EXPORT_SYMBOL_GPL(css_next_descendant_pre);
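A walk built on the iterator above (hypothetical function): css_for_each_descendant_pre() wraps css_next_descendant_pre(), and per the locking rule it only needs RCU as long as each visited css is verified (e.g. with css_tryget()) before any use outside the read-side section.

static void example_walk_descendants(struct cgroup_subsys_state *root_css)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		/* @root_css is visited first, then descendants top-down */
	}
	rcu_read_unlock();
}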
3020 2669
3021/** 2670/**
3022 * css_rightmost_descendant - return the rightmost descendant of a css 2671 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2685,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3036{ 2685{
3037 struct cgroup_subsys_state *last, *tmp; 2686 struct cgroup_subsys_state *last, *tmp;
3038 2687
3039 cgroup_assert_mutex_or_rcu_locked(); 2688 cgroup_assert_mutexes_or_rcu_locked();
3040 2689
3041 do { 2690 do {
3042 last = pos; 2691 last = pos;
@@ -3048,7 +2697,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3048 2697
3049 return last; 2698 return last;
3050} 2699}
3051EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3052 2700
3053static struct cgroup_subsys_state * 2701static struct cgroup_subsys_state *
3054css_leftmost_descendant(struct cgroup_subsys_state *pos) 2702css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2732,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3084{ 2732{
3085 struct cgroup_subsys_state *next; 2733 struct cgroup_subsys_state *next;
3086 2734
3087 cgroup_assert_mutex_or_rcu_locked(); 2735 cgroup_assert_mutexes_or_rcu_locked();
3088 2736
3089 /* if first iteration, visit leftmost descendant which may be @root */ 2737 /* if first iteration, visit leftmost descendant which may be @root */
3090 if (!pos) 2738 if (!pos)
@@ -3102,7 +2750,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3102 /* no sibling left, visit parent */ 2750 /* no sibling left, visit parent */
3103 return css_parent(pos); 2751 return css_parent(pos);
3104} 2752}
3105EXPORT_SYMBOL_GPL(css_next_descendant_post);
3106 2753
3107/** 2754/**
3108 * css_advance_task_iter - advance a task iterator to the next css_set 2755
@@ -3125,9 +2772,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
3125 } 2772 }
3126 link = list_entry(l, struct cgrp_cset_link, cset_link); 2773 link = list_entry(l, struct cgrp_cset_link, cset_link);
3127 cset = link->cset; 2774 cset = link->cset;
3128 } while (list_empty(&cset->tasks)); 2775 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2776
3129 it->cset_link = l; 2777 it->cset_link = l;
3130 it->task = cset->tasks.next; 2778
2779 if (!list_empty(&cset->tasks))
2780 it->task = cset->tasks.next;
2781 else
2782 it->task = cset->mg_tasks.next;
3131} 2783}
3132 2784
3133/** 2785/**
@@ -3146,17 +2798,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
3146 */ 2798 */
3147void css_task_iter_start(struct cgroup_subsys_state *css, 2799void css_task_iter_start(struct cgroup_subsys_state *css,
3148 struct css_task_iter *it) 2800 struct css_task_iter *it)
3149 __acquires(css_set_lock) 2801 __acquires(css_set_rwsem)
3150{ 2802{
3151 /* 2803 /* no one should try to iterate before mounting cgroups */
3152 * The first time anyone tries to iterate across a css, we need to 2804 WARN_ON_ONCE(!use_task_css_set_links);
3153 * enable the list linking each css_set to its tasks, and fix up
3154 * all existing tasks.
3155 */
3156 if (!use_task_css_set_links)
3157 cgroup_enable_task_cg_lists();
3158 2805
3159 read_lock(&css_set_lock); 2806 down_read(&css_set_rwsem);
3160 2807
3161 it->origin_css = css; 2808 it->origin_css = css;
3162 it->cset_link = &css->cgroup->cset_links; 2809 it->cset_link = &css->cgroup->cset_links;
@@ -3176,24 +2823,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3176{ 2823{
3177 struct task_struct *res; 2824 struct task_struct *res;
3178 struct list_head *l = it->task; 2825 struct list_head *l = it->task;
3179 struct cgrp_cset_link *link; 2826 struct cgrp_cset_link *link = list_entry(it->cset_link,
2827 struct cgrp_cset_link, cset_link);
3180 2828
3181 /* If the iterator cg is NULL, we have no tasks */ 2829 /* If the iterator cg is NULL, we have no tasks */
3182 if (!it->cset_link) 2830 if (!it->cset_link)
3183 return NULL; 2831 return NULL;
3184 res = list_entry(l, struct task_struct, cg_list); 2832 res = list_entry(l, struct task_struct, cg_list);
3185 /* Advance iterator to find next entry */ 2833
2834 /*
2835 * Advance iterator to find next entry. cset->tasks is consumed
2836 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2837 * next cset.
2838 */
3186 l = l->next; 2839 l = l->next;
3187 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2840
3188 if (l == &link->cset->tasks) { 2841 if (l == &link->cset->tasks)
3189 /* 2842 l = link->cset->mg_tasks.next;
3190 * We reached the end of this task list - move on to the 2843
3191 * next cgrp_cset_link. 2844 if (l == &link->cset->mg_tasks)
3192 */
3193 css_advance_task_iter(it); 2845 css_advance_task_iter(it);
3194 } else { 2846 else
3195 it->task = l; 2847 it->task = l;
3196 } 2848
3197 return res; 2849 return res;
3198} 2850}
3199 2851
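For orientation, a minimal sketch of how a controller consumes this iterator; css_task_iter_start() now takes css_set_rwsem for reading, so the loop body must not sleep and must not try to write-acquire that lock. The logging helper is hypothetical, not part of this patch.

/* Illustrative only: log every task currently associated with @css. */
static void my_dump_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);		/* down_read(&css_set_rwsem) */
	while ((task = css_task_iter_next(&it)))
		pr_info("task %d on css %p\n", task_pid_nr(task), css);
	css_task_iter_end(&it);			/* up_read(&css_set_rwsem) */
}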
@@ -3204,191 +2856,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 * Finish task iteration started by css_task_iter_start(). 2856 * Finish task iteration started by css_task_iter_start().
3205 */ 2857 */
3206void css_task_iter_end(struct css_task_iter *it) 2858void css_task_iter_end(struct css_task_iter *it)
3207 __releases(css_set_lock) 2859 __releases(css_set_rwsem)
3208{
3209 read_unlock(&css_set_lock);
3210}
3211
3212static inline int started_after_time(struct task_struct *t1,
3213 struct timespec *time,
3214 struct task_struct *t2)
3215{
3216 int start_diff = timespec_compare(&t1->start_time, time);
3217 if (start_diff > 0) {
3218 return 1;
3219 } else if (start_diff < 0) {
3220 return 0;
3221 } else {
3222 /*
3223 * Arbitrarily, if two processes started at the same
3224 * time, we'll say that the lower pointer value
3225 * started first. Note that t2 may have exited by now
3226 * so this may not be a valid pointer any longer, but
3227 * that's fine - it still serves to distinguish
3228 * between two tasks started (effectively) simultaneously.
3229 */
3230 return t1 > t2;
3231 }
3232}
3233
3234/*
3235 * This function is a callback from heap_insert() and is used to order
3236 * the heap.
3237 * In this case we order the heap in descending task start time.
3238 */
3239static inline int started_after(void *p1, void *p2)
3240{ 2860{
3241 struct task_struct *t1 = p1; 2861 up_read(&css_set_rwsem);
3242 struct task_struct *t2 = p2;
3243 return started_after_time(t1, &t2->start_time, t2);
3244} 2862}
3245 2863
3246/** 2864/**
3247 * css_scan_tasks - iterate through all the tasks in a css 2865 * cgroup_transfer_tasks - move tasks from one cgroup to another
3248 * @css: the css to iterate tasks of 2866 * @to: cgroup to which the tasks will be moved
3249 * @test: optional test callback 2867 * @from: cgroup in which the tasks currently reside
3250 * @process: process callback
3251 * @data: data passed to @test and @process
3252 * @heap: optional pre-allocated heap used for task iteration
3253 *
3254 * Iterate through all the tasks in @css, calling @test for each, and if it
3255 * returns %true, call @process for it also.
3256 *
3257 * @test may be NULL, meaning always true (select all tasks), which
3258 * effectively duplicates css_task_iter_{start,next,end}() but does not
3259 * lock css_set_lock for the call to @process.
3260 *
3261 * It is guaranteed that @process will act on every task that is a member
3262 * of @css for the duration of this call. This function may or may not
3263 * call @process for tasks that exit or move to a different css during the
3264 * call, or are forked or move into the css during the call.
3265 *
3266 * Note that @test may be called with locks held, and may in some
3267 * situations be called multiple times for the same task, so it should be
3268 * cheap.
3269 * 2868 *
3270 * If @heap is non-NULL, a heap has been pre-allocated and will be used for 2869 * Locking rules between cgroup_post_fork() and the migration path
3271 * heap operations (and its "gt" member will be overwritten), else a 2870 * guarantee that, if a task is forking while being migrated, the new child
3272 * temporary heap will be used (allocation of which may cause this function 2871 * is either visible in the source cgroup after the
3273 * to fail). 2872 * parent's migration is complete or put into the target cgroup. No task
2873 * can slip out of migration through forking.
3274 */ 2874 */
3275int css_scan_tasks(struct cgroup_subsys_state *css, 2875int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3276 bool (*test)(struct task_struct *, void *),
3277 void (*process)(struct task_struct *, void *),
3278 void *data, struct ptr_heap *heap)
3279{ 2876{
3280 int retval, i; 2877 LIST_HEAD(preloaded_csets);
2878 struct cgrp_cset_link *link;
3281 struct css_task_iter it; 2879 struct css_task_iter it;
3282 struct task_struct *p, *dropped; 2880 struct task_struct *task;
3283 /* Never dereference latest_task, since it's not refcounted */ 2881 int ret;
3284 struct task_struct *latest_task = NULL;
3285 struct ptr_heap tmp_heap;
3286 struct timespec latest_time = { 0, 0 };
3287
3288 if (heap) {
3289 /* The caller supplied our heap and pre-allocated its memory */
3290 heap->gt = &started_after;
3291 } else {
3292 /* We need to allocate our own heap memory */
3293 heap = &tmp_heap;
3294 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3295 if (retval)
3296 /* cannot allocate the heap */
3297 return retval;
3298 }
3299 2882
3300 again: 2883 mutex_lock(&cgroup_mutex);
3301 /*
3302 * Scan tasks in the css, using the @test callback to determine
3303 * which are of interest, and invoking @process callback on the
3304 * ones which need an update. Since we don't want to hold any
3305 * locks during the task updates, gather tasks to be processed in a
3306 * heap structure. The heap is sorted by descending task start
3307 * time. If the statically-sized heap fills up, we overflow tasks
3308 * that started later, and in future iterations only consider tasks
3309 * that started after the latest task in the previous pass. This
3310 * guarantees forward progress and that we don't miss any tasks.
3311 */
3312 heap->size = 0;
3313 css_task_iter_start(css, &it);
3314 while ((p = css_task_iter_next(&it))) {
3315 /*
3316 * Only affect tasks that qualify per the caller's callback,
3317 * if one was provided
3318 */
3319 if (test && !test(p, data))
3320 continue;
3321 /*
3322 * Only process tasks that started after the last task
3323 * we processed
3324 */
3325 if (!started_after_time(p, &latest_time, latest_task))
3326 continue;
3327 dropped = heap_insert(heap, p);
3328 if (dropped == NULL) {
3329 /*
3330 * The new task was inserted; the heap wasn't
3331 * previously full
3332 */
3333 get_task_struct(p);
3334 } else if (dropped != p) {
3335 /*
3336 * The new task was inserted, and pushed out a
3337 * different task
3338 */
3339 get_task_struct(p);
3340 put_task_struct(dropped);
3341 }
3342 /*
3343 * Else the new task was newer than anything already in
3344 * the heap and wasn't inserted
3345 */
3346 }
3347 css_task_iter_end(&it);
3348 2884
3349 if (heap->size) { 2885 /* all tasks in @from are being moved, all csets are source */
3350 for (i = 0; i < heap->size; i++) { 2886 down_read(&css_set_rwsem);
3351 struct task_struct *q = heap->ptrs[i]; 2887 list_for_each_entry(link, &from->cset_links, cset_link)
3352 if (i == 0) { 2888 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3353 latest_time = q->start_time; 2889 up_read(&css_set_rwsem);
3354 latest_task = q;
3355 }
3356 /* Process the task per the caller's callback */
3357 process(q, data);
3358 put_task_struct(q);
3359 }
3360 /*
3361 * If we had to process any tasks at all, scan again
3362 * in case some of them were in the middle of forking
3363 * children that didn't get processed.
3364 * Not the most efficient way to do it, but it avoids
3365 * having to take callback_mutex in the fork path
3366 */
3367 goto again;
3368 }
3369 if (heap == &tmp_heap)
3370 heap_free(&tmp_heap);
3371 return 0;
3372}
3373 2890
3374static void cgroup_transfer_one_task(struct task_struct *task, void *data) 2891 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3375{ 2892 if (ret)
3376 struct cgroup *new_cgroup = data; 2893 goto out_err;
3377 2894
3378 mutex_lock(&cgroup_mutex); 2895 /*
3379 cgroup_attach_task(new_cgroup, task, false); 2896 * Migrate tasks one-by-one until @from is empty. This fails iff
2897 * ->can_attach() fails.
2898 */
2899 do {
2900 css_task_iter_start(&from->dummy_css, &it);
2901 task = css_task_iter_next(&it);
2902 if (task)
2903 get_task_struct(task);
2904 css_task_iter_end(&it);
2905
2906 if (task) {
2907 ret = cgroup_migrate(to, task, false);
2908 put_task_struct(task);
2909 }
2910 } while (task && !ret);
2911out_err:
2912 cgroup_migrate_finish(&preloaded_csets);
3380 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
3381} 2914 return ret;
3382
3383/**
3384 * cgroup_transfer_tasks - move tasks from one cgroup to another
3385 * @to: cgroup to which the tasks will be moved
3386 * @from: cgroup in which the tasks currently reside
3387 */
3388int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3389{
3390 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3391 to, NULL);
3392} 2915}
3393 2916
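The kernel-internal helper above has a simple userspace counterpart: every pid read from the source group's tasks file is written, one at a time, into the destination's. A rough sketch, with the v1 mount point and group names as assumptions:

/* Illustrative only: move every task from one v1 cgroup to another via cgroupfs. */
#include <stdio.h>

static int transfer_tasks(const char *from_dir, const char *to_dir)
{
	char src[512], dst[512], pid[32];
	FILE *in, *out;

	snprintf(src, sizeof(src), "%s/tasks", from_dir);
	snprintf(dst, sizeof(dst), "%s/tasks", to_dir);

	in = fopen(src, "r");
	out = fopen(dst, "w");
	if (!in || !out) {
		if (in)
			fclose(in);
		if (out)
			fclose(out);
		return -1;
	}

	while (fgets(pid, sizeof(pid), in)) {
		fputs(pid, out);	/* the kernel attaches one task per write */
		fflush(out);		/* writes for tasks that already exited simply fail */
	}

	fclose(in);
	fclose(out);
	return 0;
}

For example, transfer_tasks("/sys/fs/cgroup/cpu/old", "/sys/fs/cgroup/cpu/new"); both paths are assumptions and must exist on a mounted cpu hierarchy.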
3394/* 2917/*
@@ -3687,21 +3210,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3687 */ 3210 */
3688int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3211int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3689{ 3212{
3690 int ret = -EINVAL; 3213 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3691 struct cgroup *cgrp; 3214 struct cgroup *cgrp;
3692 struct css_task_iter it; 3215 struct css_task_iter it;
3693 struct task_struct *tsk; 3216 struct task_struct *tsk;
3694 3217
3218 /* it should be a kernfs_node that belongs to cgroupfs and is a directory */
3219 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3220 kernfs_type(kn) != KERNFS_DIR)
3221 return -EINVAL;
3222
3223 mutex_lock(&cgroup_mutex);
3224
3695 /* 3225 /*
3696 * Validate dentry by checking the superblock operations, 3226 * We aren't being called from kernfs and there's no guarantee on
3697 * and make sure it's a directory. 3227 * @kn->priv's validity. For this and css_tryget_from_dir(),
3228 * @kn->priv is RCU safe. Let's do the RCU dancing.
3698 */ 3229 */
3699 if (dentry->d_sb->s_op != &cgroup_ops || 3230 rcu_read_lock();
3700 !S_ISDIR(dentry->d_inode->i_mode)) 3231 cgrp = rcu_dereference(kn->priv);
3701 goto err; 3232 if (!cgrp || cgroup_is_dead(cgrp)) {
3702 3233 rcu_read_unlock();
3703 ret = 0; 3234 mutex_unlock(&cgroup_mutex);
3704 cgrp = dentry->d_fsdata; 3235 return -ENOENT;
3236 }
3237 rcu_read_unlock();
3705 3238
3706 css_task_iter_start(&cgrp->dummy_css, &it); 3239 css_task_iter_start(&cgrp->dummy_css, &it);
3707 while ((tsk = css_task_iter_next(&it))) { 3240 while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3259,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3726 } 3259 }
3727 css_task_iter_end(&it); 3260 css_task_iter_end(&it);
3728 3261
3729err: 3262 mutex_unlock(&cgroup_mutex);
3730 return ret; 3263 return 0;
3731} 3264}
3732 3265
3733 3266
@@ -3745,7 +3278,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3745 * after a seek to the start). Use a binary-search to find the 3278 * after a seek to the start). Use a binary-search to find the
3746 * next pid to display, if any 3279 * next pid to display, if any
3747 */ 3280 */
3748 struct cgroup_open_file *of = s->private; 3281 struct kernfs_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup; 3282 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l; 3283 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private; 3284 enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3333,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3800 3333
3801static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3334static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3802{ 3335{
3803 struct cgroup_open_file *of = s->private; 3336 struct kernfs_open_file *of = s->private;
3804 struct cgroup_pidlist *l = of->priv; 3337 struct cgroup_pidlist *l = of->priv;
3805 3338
3806 if (l) 3339 if (l)
@@ -3811,7 +3344,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3811 3344
3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3345static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3813{ 3346{
3814 struct cgroup_open_file *of = s->private; 3347 struct kernfs_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv; 3348 struct cgroup_pidlist *l = of->priv;
3816 pid_t *p = v; 3349 pid_t *p = v;
3817 pid_t *end = l->list + l->length; 3350 pid_t *end = l->list + l->length;
@@ -3861,23 +3394,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 return 0; 3394 return 0;
3862} 3395}
3863 3396
3864/*
3865 * When dput() is called asynchronously, if umount has been done and
3866 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3867 * there's a small window that vfs will see the root dentry with non-zero
3868 * refcnt and trigger BUG().
3869 *
3870 * That's why we hold a reference before dput() and drop it right after.
3871 */
3872static void cgroup_dput(struct cgroup *cgrp)
3873{
3874 struct super_block *sb = cgrp->root->sb;
3875
3876 atomic_inc(&sb->s_active);
3877 dput(cgrp->dentry);
3878 deactivate_super(sb);
3879}
3880
3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3397static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3882 struct cftype *cft) 3398 struct cftype *cft)
3883{ 3399{
@@ -3944,7 +3460,7 @@ static struct cftype cgroup_base_files[] = {
3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3460 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3945 .seq_show = cgroup_release_agent_show, 3461 .seq_show = cgroup_release_agent_show,
3946 .write_string = cgroup_release_agent_write, 3462 .write_string = cgroup_release_agent_write,
3947 .max_write_len = PATH_MAX, 3463 .max_write_len = PATH_MAX - 1,
3948 }, 3464 },
3949 { } /* terminate */ 3465 { } /* terminate */
3950}; 3466};
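A controller's own files are declared the same way as the base files above and end up on ->cfts via cgroup_add_cftypes() or ->base_cftypes. A minimal, hypothetical example; the file name and handler are illustrative, not from this patch:

/* Illustrative only: one read-only u64 file exported by a hypothetical controller. */
static u64 my_usage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return 0;	/* a real controller would report its per-cgroup counter here */
}

static struct cftype my_files[] = {
	{
		.name = "usage",
		.read_u64 = my_usage_read,
	},
	{ }	/* terminate */
};

Such an array would typically be pointed to by the controller's ->base_cftypes so that cgroup_init(), further down, registers it with cgroup_add_cftypes().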
@@ -3963,13 +3479,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3963 3479
3964 /* process cftsets of each subsystem */ 3480 /* process cftsets of each subsystem */
3965 for_each_subsys(ss, i) { 3481 for_each_subsys(ss, i) {
3966 struct cftype_set *set; 3482 struct cftype *cfts;
3967 3483
3968 if (!test_bit(i, &subsys_mask)) 3484 if (!test_bit(i, &subsys_mask))
3969 continue; 3485 continue;
3970 3486
3971 list_for_each_entry(set, &ss->cftsets, node) { 3487 list_for_each_entry(cfts, &ss->cfts, node) {
3972 ret = cgroup_addrm_files(cgrp, set->cfts, true); 3488 ret = cgroup_addrm_files(cgrp, cfts, true);
3973 if (ret < 0) 3489 if (ret < 0)
3974 goto err; 3490 goto err;
3975 } 3491 }
@@ -4012,7 +3528,7 @@ static void css_free_work_fn(struct work_struct *work)
4012 css_put(css->parent); 3528 css_put(css->parent);
4013 3529
4014 css->ss->css_free(css); 3530 css->ss->css_free(css);
4015 cgroup_dput(cgrp); 3531 cgroup_put(cgrp);
4016} 3532}
4017 3533
4018static void css_free_rcu_fn(struct rcu_head *rcu_head) 3534static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3536,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4020 struct cgroup_subsys_state *css = 3536 struct cgroup_subsys_state *css =
4021 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3537 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4022 3538
4023 /*
4024 * css holds an extra ref to @cgrp->dentry which is put on the last
4025 * css_put(). dput() requires process context which we don't have.
4026 */
4027 INIT_WORK(&css->destroy_work, css_free_work_fn); 3539 INIT_WORK(&css->destroy_work, css_free_work_fn);
4028 queue_work(cgroup_destroy_wq, &css->destroy_work); 3540 queue_work(cgroup_destroy_wq, &css->destroy_work);
4029} 3541}
@@ -4033,7 +3545,7 @@ static void css_release(struct percpu_ref *ref)
4033 struct cgroup_subsys_state *css = 3545 struct cgroup_subsys_state *css =
4034 container_of(ref, struct cgroup_subsys_state, refcnt); 3546 container_of(ref, struct cgroup_subsys_state, refcnt);
4035 3547
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); 3548 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
4037 call_rcu(&css->rcu_head, css_free_rcu_fn); 3549 call_rcu(&css->rcu_head, css_free_rcu_fn);
4038} 3550}
4039 3551
@@ -4058,6 +3570,7 @@ static int online_css(struct cgroup_subsys_state *css)
4058 struct cgroup_subsys *ss = css->ss; 3570 struct cgroup_subsys *ss = css->ss;
4059 int ret = 0; 3571 int ret = 0;
4060 3572
3573 lockdep_assert_held(&cgroup_tree_mutex);
4061 lockdep_assert_held(&cgroup_mutex); 3574 lockdep_assert_held(&cgroup_mutex);
4062 3575
4063 if (ss->css_online) 3576 if (ss->css_online)
@@ -4065,7 +3578,7 @@ static int online_css(struct cgroup_subsys_state *css)
4065 if (!ret) { 3578 if (!ret) {
4066 css->flags |= CSS_ONLINE; 3579 css->flags |= CSS_ONLINE;
4067 css->cgroup->nr_css++; 3580 css->cgroup->nr_css++;
4068 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); 3581 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4069 } 3582 }
4070 return ret; 3583 return ret;
4071} 3584}
@@ -4075,6 +3588,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4075{ 3588{
4076 struct cgroup_subsys *ss = css->ss; 3589 struct cgroup_subsys *ss = css->ss;
4077 3590
3591 lockdep_assert_held(&cgroup_tree_mutex);
4078 lockdep_assert_held(&cgroup_mutex); 3592 lockdep_assert_held(&cgroup_mutex);
4079 3593
4080 if (!(css->flags & CSS_ONLINE)) 3594 if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3599,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4085 3599
4086 css->flags &= ~CSS_ONLINE; 3600 css->flags &= ~CSS_ONLINE;
4087 css->cgroup->nr_css--; 3601 css->cgroup->nr_css--;
4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 3602 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
4089} 3603}
4090 3604
4091/** 3605/**
@@ -4103,7 +3617,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4103 struct cgroup_subsys_state *css; 3617 struct cgroup_subsys_state *css;
4104 int err; 3618 int err;
4105 3619
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex); 3620 lockdep_assert_held(&cgroup_mutex);
4108 3621
4109 css = ss->css_alloc(cgroup_css(parent, ss)); 3622 css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3629,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4116 3629
4117 init_css(css, ss, cgrp); 3630 init_css(css, ss, cgrp);
4118 3631
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 3632 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4120 if (err) 3633 if (err)
4121 goto err_free_percpu_ref; 3634 goto err_free_percpu_ref;
4122 3635
@@ -4124,9 +3637,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4124 if (err) 3637 if (err)
4125 goto err_clear_dir; 3638 goto err_clear_dir;
4126 3639
4127 dget(cgrp->dentry); 3640 cgroup_get(cgrp);
4128 css_get(css->parent); 3641 css_get(css->parent);
4129 3642
3643 cgrp->subsys_mask |= 1 << ss->id;
3644
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3645 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) { 3646 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3647 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4139,7 +3654,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4139 return 0; 3654 return 0;
4140 3655
4141err_clear_dir: 3656err_clear_dir:
4142 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3657 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4143err_free_percpu_ref: 3658err_free_percpu_ref:
4144 percpu_ref_cancel_init(&css->refcnt); 3659 percpu_ref_cancel_init(&css->refcnt);
4145err_free_css: 3660err_free_css:
@@ -4147,35 +3662,34 @@ err_free_css:
4147 return err; 3662 return err;
4148} 3663}
4149 3664
4150/* 3665/**
4151 * cgroup_create - create a cgroup 3666 * cgroup_create - create a cgroup
4152 * @parent: cgroup that will be parent of the new cgroup 3667 * @parent: cgroup that will be parent of the new cgroup
4153 * @dentry: dentry of the new cgroup 3668 * @name: name of the new cgroup
4154 * @mode: mode to set on new inode 3669 * @mode: mode to set on new cgroup
4155 *
4156 * Must be called with the mutex on the parent inode held
4157 */ 3670 */
4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3671static long cgroup_create(struct cgroup *parent, const char *name,
4159 umode_t mode) 3672 umode_t mode)
4160{ 3673{
4161 struct cgroup *cgrp; 3674 struct cgroup *cgrp;
4162 struct cgroup_name *name; 3675 struct cgroup_root *root = parent->root;
4163 struct cgroupfs_root *root = parent->root;
4164 int ssid, err; 3676 int ssid, err;
4165 struct cgroup_subsys *ss; 3677 struct cgroup_subsys *ss;
4166 struct super_block *sb = root->sb; 3678 struct kernfs_node *kn;
3679
3680 /*
3681 * XXX: The default hierarchy isn't fully implemented yet. Block
3682 * !root cgroup creation on it for now.
3683 */
3684 if (root == &cgrp_dfl_root)
3685 return -EINVAL;
4167 3686
4168 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3687 /* allocate the cgroup and its ID, 0 is reserved for the root */
4169 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3688 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4170 if (!cgrp) 3689 if (!cgrp)
4171 return -ENOMEM; 3690 return -ENOMEM;
4172 3691
4173 name = cgroup_alloc_name(dentry); 3692 mutex_lock(&cgroup_tree_mutex);
4174 if (!name) {
4175 err = -ENOMEM;
4176 goto err_free_cgrp;
4177 }
4178 rcu_assign_pointer(cgrp->name, name);
4179 3693
4180 /* 3694 /*
4181 * Only live parents can have children. Note that the liveliness 3695 * Only live parents can have children. Note that the liveliness
@@ -4186,7 +3700,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4186 */ 3700 */
4187 if (!cgroup_lock_live_group(parent)) { 3701 if (!cgroup_lock_live_group(parent)) {
4188 err = -ENODEV; 3702 err = -ENODEV;
4189 goto err_free_name; 3703 goto err_unlock_tree;
4190 } 3704 }
4191 3705
4192 /* 3706 /*
@@ -4199,18 +3713,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4199 goto err_unlock; 3713 goto err_unlock;
4200 } 3714 }
4201 3715
4202 /* Grab a reference on the superblock so the hierarchy doesn't
4203 * get deleted on unmount if there are child cgroups. This
4204 * can be done outside cgroup_mutex, since the sb can't
4205 * disappear while someone has an open control file on the
4206 * fs */
4207 atomic_inc(&sb->s_active);
4208
4209 init_cgroup_housekeeping(cgrp); 3716 init_cgroup_housekeeping(cgrp);
4210 3717
4211 dentry->d_fsdata = cgrp;
4212 cgrp->dentry = dentry;
4213
4214 cgrp->parent = parent; 3718 cgrp->parent = parent;
4215 cgrp->dummy_css.parent = &parent->dummy_css; 3719 cgrp->dummy_css.parent = &parent->dummy_css;
4216 cgrp->root = parent->root; 3720 cgrp->root = parent->root;
@@ -4221,24 +3725,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3725 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3726 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223 3727
3728 /* create the directory */
3729 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3730 if (IS_ERR(kn)) {
3731 err = PTR_ERR(kn);
3732 goto err_free_id;
3733 }
3734 cgrp->kn = kn;
3735
4224 /* 3736 /*
4225 * Create directory. cgroup_create_file() returns with the new 3737 * This extra ref will be put in cgroup_free_fn() and guarantees
4226 * directory locked on success so that it can be populated without 3738 * that @cgrp->kn is always accessible.
4227 * dropping cgroup_mutex.
4228 */ 3739 */
4229 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 3740 kernfs_get(kn);
4230 if (err < 0)
4231 goto err_free_id;
4232 lockdep_assert_held(&dentry->d_inode->i_mutex);
4233 3741
4234 cgrp->serial_nr = cgroup_serial_nr_next++; 3742 cgrp->serial_nr = cgroup_serial_nr_next++;
4235 3743
4236 /* allocation complete, commit to creation */ 3744 /* allocation complete, commit to creation */
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3745 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++; 3746 atomic_inc(&root->nr_cgrps);
4239 3747 cgroup_get(parent);
4240 /* hold a ref to the parent's dentry */
4241 dget(parent->dentry);
4242 3748
4243 /* 3749 /*
4244 * @cgrp is now fully operational. If something fails after this 3750 * @cgrp is now fully operational. If something fails after this
@@ -4252,43 +3758,56 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4252 3758
4253 /* let's create and online css's */ 3759 /* let's create and online css's */
4254 for_each_subsys(ss, ssid) { 3760 for_each_subsys(ss, ssid) {
4255 if (root->subsys_mask & (1 << ssid)) { 3761 if (root->cgrp.subsys_mask & (1 << ssid)) {
4256 err = create_css(cgrp, ss); 3762 err = create_css(cgrp, ss);
4257 if (err) 3763 if (err)
4258 goto err_destroy; 3764 goto err_destroy;
4259 } 3765 }
4260 } 3766 }
4261 3767
3768 kernfs_activate(kn);
3769
4262 mutex_unlock(&cgroup_mutex); 3770 mutex_unlock(&cgroup_mutex);
4263 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3771 mutex_unlock(&cgroup_tree_mutex);
4264 3772
4265 return 0; 3773 return 0;
4266 3774
4267err_free_id: 3775err_free_id:
4268 idr_remove(&root->cgroup_idr, cgrp->id); 3776 idr_remove(&root->cgroup_idr, cgrp->id);
4269 /* Release the reference count that we took on the superblock */
4270 deactivate_super(sb);
4271err_unlock: 3777err_unlock:
4272 mutex_unlock(&cgroup_mutex); 3778 mutex_unlock(&cgroup_mutex);
4273err_free_name: 3779err_unlock_tree:
4274 kfree(rcu_dereference_raw(cgrp->name)); 3780 mutex_unlock(&cgroup_tree_mutex);
4275err_free_cgrp:
4276 kfree(cgrp); 3781 kfree(cgrp);
4277 return err; 3782 return err;
4278 3783
4279err_destroy: 3784err_destroy:
4280 cgroup_destroy_locked(cgrp); 3785 cgroup_destroy_locked(cgrp);
4281 mutex_unlock(&cgroup_mutex); 3786 mutex_unlock(&cgroup_mutex);
4282 mutex_unlock(&dentry->d_inode->i_mutex); 3787 mutex_unlock(&cgroup_tree_mutex);
4283 return err; 3788 return err;
4284} 3789}
4285 3790
4286static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3791static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3792 umode_t mode)
4287{ 3793{
4288 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3794 struct cgroup *parent = parent_kn->priv;
3795 int ret;
3796
3797 /*
3798 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3799 * kernfs active_ref and cgroup_create() already synchronizes
3800 * properly against removal through cgroup_lock_live_group().
3801 * Break it before calling cgroup_create().
3802 */
3803 cgroup_get(parent);
3804 kernfs_break_active_protection(parent_kn);
3805
3806 ret = cgroup_create(parent, name, mode);
4289 3807
4290 /* the vfs holds inode->i_mutex already */ 3808 kernfs_unbreak_active_protection(parent_kn);
4291 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3809 cgroup_put(parent);
3810 return ret;
4292} 3811}
4293 3812
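From userspace these two entry points are reached through ordinary mkdir(2) and rmdir(2) on a mounted hierarchy; a minimal sketch, with the mount point as an assumption:

/* Illustrative only: create and remove a child cgroup from userspace. */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *dir = "/sys/fs/cgroup/cpu/demo";	/* assumed v1 mount point */

	if (mkdir(dir, 0755))		/* dispatched to cgroup_mkdir() via kernfs */
		perror("mkdir");

	/* ... populate and use the group ... */

	if (rmdir(dir))			/* dispatched to cgroup_rmdir(); fails with EBUSY while tasks or children remain */
		perror("rmdir");

	return 0;
}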
4294/* 3813/*
@@ -4301,6 +3820,7 @@ static void css_killed_work_fn(struct work_struct *work)
4301 container_of(work, struct cgroup_subsys_state, destroy_work); 3820 container_of(work, struct cgroup_subsys_state, destroy_work);
4302 struct cgroup *cgrp = css->cgroup; 3821 struct cgroup *cgrp = css->cgroup;
4303 3822
3823 mutex_lock(&cgroup_tree_mutex);
4304 mutex_lock(&cgroup_mutex); 3824 mutex_lock(&cgroup_mutex);
4305 3825
4306 /* 3826 /*
@@ -4318,6 +3838,7 @@ static void css_killed_work_fn(struct work_struct *work)
4318 cgroup_destroy_css_killed(cgrp); 3838 cgroup_destroy_css_killed(cgrp);
4319 3839
4320 mutex_unlock(&cgroup_mutex); 3840 mutex_unlock(&cgroup_mutex);
3841 mutex_unlock(&cgroup_tree_mutex);
4321 3842
4322 /* 3843 /*
4323 * Put the css refs from kill_css(). Each css holds an extra 3844 * Put the css refs from kill_css(). Each css holds an extra
@@ -4339,18 +3860,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4339 queue_work(cgroup_destroy_wq, &css->destroy_work); 3860 queue_work(cgroup_destroy_wq, &css->destroy_work);
4340} 3861}
4341 3862
4342/** 3863static void __kill_css(struct cgroup_subsys_state *css)
4343 * kill_css - destroy a css
4344 * @css: css to destroy
4345 *
4346 * This function initiates destruction of @css by removing cgroup interface
4347 * files and putting its base reference. ->css_offline() will be invoked
4348 * asynchronously once css_tryget() is guaranteed to fail and when the
4349 * reference count reaches zero, @css will be released.
4350 */
4351static void kill_css(struct cgroup_subsys_state *css)
4352{ 3864{
4353 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3865 lockdep_assert_held(&cgroup_tree_mutex);
3866
3867 /*
3868 * This must happen before css is disassociated with its cgroup.
3869 * See seq_css() for details.
3870 */
3871 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4354 3872
4355 /* 3873 /*
4356 * Killing would put the base ref, but we need to keep it alive 3874 * Killing would put the base ref, but we need to keep it alive
@@ -4372,6 +3890,28 @@ static void kill_css(struct cgroup_subsys_state *css)
4372} 3890}
4373 3891
4374/** 3892/**
3893 * kill_css - destroy a css
3894 * @css: css to destroy
3895 *
3896 * This function initiates destruction of @css by removing cgroup interface
3897 * files and putting its base reference. ->css_offline() will be invoked
3898 * asynchronously once css_tryget() is guaranteed to fail and when the
3899 * reference count reaches zero, @css will be released.
3900 */
3901static void kill_css(struct cgroup_subsys_state *css)
3902{
3903 struct cgroup *cgrp = css->cgroup;
3904
3905 lockdep_assert_held(&cgroup_tree_mutex);
3906
3907 /* if already killed, noop */
3908 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3909 cgrp->subsys_mask &= ~(1 << css->ss->id);
3910 __kill_css(css);
3911 }
3912}
3913
3914/**
4375 * cgroup_destroy_locked - the first stage of cgroup destruction 3915 * cgroup_destroy_locked - the first stage of cgroup destruction
4376 * @cgrp: cgroup to be destroyed 3916 * @cgrp: cgroup to be destroyed
4377 * 3917 *
@@ -4398,22 +3938,21 @@ static void kill_css(struct cgroup_subsys_state *css)
4398static int cgroup_destroy_locked(struct cgroup *cgrp) 3938static int cgroup_destroy_locked(struct cgroup *cgrp)
4399 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3939 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4400{ 3940{
4401 struct dentry *d = cgrp->dentry;
4402 struct cgroup_subsys_state *css;
4403 struct cgroup *child; 3941 struct cgroup *child;
3942 struct cgroup_subsys_state *css;
4404 bool empty; 3943 bool empty;
4405 int ssid; 3944 int ssid;
4406 3945
4407 lockdep_assert_held(&d->d_inode->i_mutex); 3946 lockdep_assert_held(&cgroup_tree_mutex);
4408 lockdep_assert_held(&cgroup_mutex); 3947 lockdep_assert_held(&cgroup_mutex);
4409 3948
4410 /* 3949 /*
4411 * css_set_lock synchronizes access to ->cset_links and prevents 3950 * css_set_rwsem synchronizes access to ->cset_links and prevents
4412 * @cgrp from being removed while __put_css_set() is in progress. 3951 * @cgrp from being removed while put_css_set() is in progress.
4413 */ 3952 */
4414 read_lock(&css_set_lock); 3953 down_read(&css_set_rwsem);
4415 empty = list_empty(&cgrp->cset_links); 3954 empty = list_empty(&cgrp->cset_links);
4416 read_unlock(&css_set_lock); 3955 up_read(&css_set_rwsem);
4417 if (!empty) 3956 if (!empty)
4418 return -EBUSY; 3957 return -EBUSY;
4419 3958
@@ -4434,14 +3973,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4434 return -EBUSY; 3973 return -EBUSY;
4435 3974
4436 /* 3975 /*
4437 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4438 * will be invoked to perform the rest of destruction once the
4439 * percpu refs of all css's are confirmed to be killed.
4440 */
4441 for_each_css(css, ssid, cgrp)
4442 kill_css(css);
4443
4444 /*
4445 * Mark @cgrp dead. This prevents further task migration and child 3976 * Mark @cgrp dead. This prevents further task migration and child
4446 * creation by disabling cgroup_lock_live_group(). Note that 3977 * creation by disabling cgroup_lock_live_group(). Note that
4447 * CGRP_DEAD assertion is depended upon by css_next_child() to 3978 * CGRP_DEAD assertion is depended upon by css_next_child() to
@@ -4450,6 +3981,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4450 */ 3981 */
4451 set_bit(CGRP_DEAD, &cgrp->flags); 3982 set_bit(CGRP_DEAD, &cgrp->flags);
4452 3983
3984 /*
3985 * Initiate massacre of all css's. cgroup_destroy_css_killed()
3986 * will be invoked to perform the rest of destruction once the
3987 * percpu refs of all css's are confirmed to be killed. This
3988 * involves removing the subsystem's files, which requires dropping cgroup_mutex.
3989 */
3990 mutex_unlock(&cgroup_mutex);
3991 for_each_css(css, ssid, cgrp)
3992 kill_css(css);
3993 mutex_lock(&cgroup_mutex);
3994
4453 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 3995 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4454 raw_spin_lock(&release_list_lock); 3996 raw_spin_lock(&release_list_lock);
4455 if (!list_empty(&cgrp->release_list)) 3997 if (!list_empty(&cgrp->release_list))
@@ -4465,14 +4007,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4465 if (!cgrp->nr_css) 4007 if (!cgrp->nr_css)
4466 cgroup_destroy_css_killed(cgrp); 4008 cgroup_destroy_css_killed(cgrp);
4467 4009
4010 /* remove @cgrp directory along with the base files */
4011 mutex_unlock(&cgroup_mutex);
4012
4468 /* 4013 /*
4469 * Clear the base files and remove @cgrp directory. The removal 4014 * There are two control paths which try to determine cgroup from
4470 * puts the base ref but we aren't quite done with @cgrp yet, so 4015 * dentry without going through kernfs - cgroupstats_build() and
4471 * hold onto it. 4016 * css_tryget_from_dir(). Those are supported by RCU protecting
4017 * clearing of cgrp->kn->priv backpointer, which should happen
4018 * after all files under it have been removed.
4472 */ 4019 */
4473 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4020 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4474 dget(d); 4021 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4475 cgroup_d_remove_dir(d); 4022
4023 mutex_lock(&cgroup_mutex);
4476 4024
4477 return 0; 4025 return 0;
4478}; 4026};
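In outline, the reader side this ordering protects (the pattern used by cgroupstats_build() above and by css_tryget_from_dir()) looks roughly like the sketch below; it is a paraphrase for illustration, not new code from this patch:

/* Illustrative only: resolve a cgroup from a kernfs node under RCU. */
static struct cgroup *my_cgroup_from_kn(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	rcu_read_lock();
	cgrp = rcu_dereference(kn->priv);	/* cleared by cgroup_destroy_locked() */
	if (cgrp && cgroup_is_dead(cgrp))
		cgrp = NULL;
	rcu_read_unlock();

	/* caller must hold cgroup_mutex or take its own reference before using @cgrp */
	return cgrp;
}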
@@ -4489,72 +4037,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4489static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4037static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4490{ 4038{
4491 struct cgroup *parent = cgrp->parent; 4039 struct cgroup *parent = cgrp->parent;
4492 struct dentry *d = cgrp->dentry;
4493 4040
4041 lockdep_assert_held(&cgroup_tree_mutex);
4494 lockdep_assert_held(&cgroup_mutex); 4042 lockdep_assert_held(&cgroup_mutex);
4495 4043
4496 /* delete this cgroup from parent->children */ 4044 /* delete this cgroup from parent->children */
4497 list_del_rcu(&cgrp->sibling); 4045 list_del_rcu(&cgrp->sibling);
4498 4046
4499 dput(d); 4047 cgroup_put(cgrp);
4500 4048
4501 set_bit(CGRP_RELEASABLE, &parent->flags); 4049 set_bit(CGRP_RELEASABLE, &parent->flags);
4502 check_for_release(parent); 4050 check_for_release(parent);
4503} 4051}
4504 4052
4505static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4053static int cgroup_rmdir(struct kernfs_node *kn)
4506{ 4054{
4507 int ret; 4055 struct cgroup *cgrp = kn->priv;
4508 4056 int ret = 0;
4509 mutex_lock(&cgroup_mutex);
4510 ret = cgroup_destroy_locked(dentry->d_fsdata);
4511 mutex_unlock(&cgroup_mutex);
4512 4057
4513 return ret; 4058 /*
4514} 4059 * This is self-destruction but @kn can't be removed while this
4060 * callback is in progress. Let's break active protection. Once
4061 * the protection is broken, @cgrp can be destroyed at any point.
4062 * Pin it so that it stays accessible.
4063 */
4064 cgroup_get(cgrp);
4065 kernfs_break_active_protection(kn);
4515 4066
4516static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4067 mutex_lock(&cgroup_tree_mutex);
4517{ 4068 mutex_lock(&cgroup_mutex);
4518 INIT_LIST_HEAD(&ss->cftsets);
4519 4069
4520 /* 4070 /*
4521 * base_cftset is embedded in subsys itself, no need to worry about 4071 * @cgrp might already have been destroyed while we're trying to
4522 * deregistration. 4072 * grab the mutexes.
4523 */ 4073 */
4524 if (ss->base_cftypes) { 4074 if (!cgroup_is_dead(cgrp))
4525 struct cftype *cft; 4075 ret = cgroup_destroy_locked(cgrp);
4526 4076
4527 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) 4077 mutex_unlock(&cgroup_mutex);
4528 cft->ss = ss; 4078 mutex_unlock(&cgroup_tree_mutex);
4529 4079
4530 ss->base_cftset.cfts = ss->base_cftypes; 4080 kernfs_unbreak_active_protection(kn);
4531 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4081 cgroup_put(cgrp);
4532 } 4082 return ret;
4533} 4083}
4534 4084
4085static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4086 .remount_fs = cgroup_remount,
4087 .show_options = cgroup_show_options,
4088 .mkdir = cgroup_mkdir,
4089 .rmdir = cgroup_rmdir,
4090 .rename = cgroup_rename,
4091};
4092
4535static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4093static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4536{ 4094{
4537 struct cgroup_subsys_state *css; 4095 struct cgroup_subsys_state *css;
4538 4096
4539 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4097 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4540 4098
4099 mutex_lock(&cgroup_tree_mutex);
4541 mutex_lock(&cgroup_mutex); 4100 mutex_lock(&cgroup_mutex);
4542 4101
4543 /* init base cftset */ 4102 INIT_LIST_HEAD(&ss->cfts);
4544 cgroup_init_cftsets(ss);
4545 4103
4546 /* Create the top cgroup state for this subsystem */ 4104 /* Create the root cgroup state for this subsystem */
4547 ss->root = &cgroup_dummy_root; 4105 ss->root = &cgrp_dfl_root;
4548 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4106 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4549 /* We don't handle early failures gracefully */ 4107 /* We don't handle early failures gracefully */
4550 BUG_ON(IS_ERR(css)); 4108 BUG_ON(IS_ERR(css));
4551 init_css(css, ss, cgroup_dummy_top); 4109 init_css(css, ss, &cgrp_dfl_root.cgrp);
4552 4110
4553 /* Update the init_css_set to contain a subsys 4111 /* Update the init_css_set to contain a subsys
4554 * pointer to this state - since the subsystem is 4112 * pointer to this state - since the subsystem is
4555 * newly registered, all tasks and hence the 4113 * newly registered, all tasks and hence the
4556 * init_css_set is in the subsystem's top cgroup. */ 4114 * init_css_set is in the subsystem's root cgroup. */
4557 init_css_set.subsys[ss->subsys_id] = css; 4115 init_css_set.subsys[ss->id] = css;
4558 4116
4559 need_forkexit_callback |= ss->fork || ss->exit; 4117 need_forkexit_callback |= ss->fork || ss->exit;
4560 4118
@@ -4565,185 +4123,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4565 4123
4566 BUG_ON(online_css(css)); 4124 BUG_ON(online_css(css));
4567 4125
4568 mutex_unlock(&cgroup_mutex); 4126 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4569
4570 /* this function shouldn't be used with modular subsystems, since they
4571 * need to register a subsys_id, among other things */
4572 BUG_ON(ss->module);
4573}
4574
4575/**
4576 * cgroup_load_subsys: load and register a modular subsystem at runtime
4577 * @ss: the subsystem to load
4578 *
4579 * This function should be called in a modular subsystem's initcall. If the
4580 * subsystem is built as a module, it will be assigned a new subsys_id and set
4581 * up for use. If the subsystem is built-in anyway, work is delegated to the
4582 * simpler cgroup_init_subsys.
4583 */
4584int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4585{
4586 struct cgroup_subsys_state *css;
4587 int i, ret;
4588 struct hlist_node *tmp;
4589 struct css_set *cset;
4590 unsigned long key;
4591
4592 /* check name and function validity */
4593 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4594 ss->css_alloc == NULL || ss->css_free == NULL)
4595 return -EINVAL;
4596
4597 /*
4598 * we don't support callbacks in modular subsystems. this check is
4599 * before the ss->module check for consistency; a subsystem that could
4600 * be a module should still have no callbacks even if the user isn't
4601 * compiling it as one.
4602 */
4603 if (ss->fork || ss->exit)
4604 return -EINVAL;
4605
4606 /*
4607 * an optionally modular subsystem is built-in: we want to do nothing,
4608 * since cgroup_init_subsys will have already taken care of it.
4609 */
4610 if (ss->module == NULL) {
4611 /* a sanity check */
4612 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4613 return 0;
4614 }
4615
4616 /* init base cftset */
4617 cgroup_init_cftsets(ss);
4618
4619 mutex_lock(&cgroup_mutex);
4620 mutex_lock(&cgroup_root_mutex);
4621 cgroup_subsys[ss->subsys_id] = ss;
4622
4623 /*
4624 * no ss->css_alloc seems to need anything important in the ss
4625 * struct, so this can happen first (i.e. before the dummy root
4626 * attachment).
4627 */
4628 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4629 if (IS_ERR(css)) {
4630 /* failure case - need to deassign the cgroup_subsys[] slot. */
4631 cgroup_subsys[ss->subsys_id] = NULL;
4632 mutex_unlock(&cgroup_root_mutex);
4633 mutex_unlock(&cgroup_mutex);
4634 return PTR_ERR(css);
4635 }
4636
4637 ss->root = &cgroup_dummy_root;
4638
4639 /* our new subsystem will be attached to the dummy hierarchy. */
4640 init_css(css, ss, cgroup_dummy_top);
4641
4642 /*
4643 * Now we need to entangle the css into the existing css_sets. unlike
4644 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4645 * will need a new pointer to it; done by iterating the css_set_table.
4646 * furthermore, modifying the existing css_sets will corrupt the hash
4647 * table state, so each changed css_set will need its hash recomputed.
4648 * this is all done under the css_set_lock.
4649 */
4650 write_lock(&css_set_lock);
4651 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4652 /* skip entries that we already rehashed */
4653 if (cset->subsys[ss->subsys_id])
4654 continue;
4655 /* remove existing entry */
4656 hash_del(&cset->hlist);
4657 /* set new value */
4658 cset->subsys[ss->subsys_id] = css;
4659 /* recompute hash and restore entry */
4660 key = css_set_hash(cset->subsys);
4661 hash_add(css_set_table, &cset->hlist, key);
4662 }
4663 write_unlock(&css_set_lock);
4664
4665 ret = online_css(css);
4666 if (ret) {
4667 ss->css_free(css);
4668 goto err_unload;
4669 }
4670
4671 /* success! */
4672 mutex_unlock(&cgroup_root_mutex);
4673 mutex_unlock(&cgroup_mutex);
4674 return 0;
4675
4676err_unload:
4677 mutex_unlock(&cgroup_root_mutex);
4678 mutex_unlock(&cgroup_mutex);
4679 /* @ss can't be mounted here as try_module_get() would fail */
4680 cgroup_unload_subsys(ss);
4681 return ret;
4682}
4683EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4684
4685/**
4686 * cgroup_unload_subsys: unload a modular subsystem
4687 * @ss: the subsystem to unload
4688 *
4689 * This function should be called in a modular subsystem's exitcall. When this
4690 * function is invoked, the refcount on the subsystem's module will be 0, so
4691 * the subsystem will not be attached to any hierarchy.
4692 */
4693void cgroup_unload_subsys(struct cgroup_subsys *ss)
4694{
4695 struct cgrp_cset_link *link;
4696 struct cgroup_subsys_state *css;
4697
4698 BUG_ON(ss->module == NULL);
4699
4700 /*
4701 * we shouldn't be called if the subsystem is in use, and the use of
4702 * try_module_get() in rebind_subsystems() should ensure that it
4703 * doesn't start being used while we're killing it off.
4704 */
4705 BUG_ON(ss->root != &cgroup_dummy_root);
4706
4707 mutex_lock(&cgroup_mutex);
4708 mutex_lock(&cgroup_root_mutex);
4709
4710 css = cgroup_css(cgroup_dummy_top, ss);
4711 if (css)
4712 offline_css(css);
4713 4127
4714 /* deassign the subsys_id */
4715 cgroup_subsys[ss->subsys_id] = NULL;
4716
4717 /*
4718 * disentangle the css from all css_sets attached to the dummy
4719 * top. as in loading, we need to pay our respects to the hashtable
4720 * gods.
4721 */
4722 write_lock(&css_set_lock);
4723 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4724 struct css_set *cset = link->cset;
4725 unsigned long key;
4726
4727 hash_del(&cset->hlist);
4728 cset->subsys[ss->subsys_id] = NULL;
4729 key = css_set_hash(cset->subsys);
4730 hash_add(css_set_table, &cset->hlist, key);
4731 }
4732 write_unlock(&css_set_lock);
4733
4734 /*
4735 * remove subsystem's css from the cgroup_dummy_top and free it -
4736 * need to free before marking as null because ss->css_free needs
4737 * the cgrp->subsys pointer to find their state.
4738 */
4739 if (css)
4740 ss->css_free(css);
4741 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4742
4743 mutex_unlock(&cgroup_root_mutex);
4744 mutex_unlock(&cgroup_mutex); 4128 mutex_unlock(&cgroup_mutex);
4129 mutex_unlock(&cgroup_tree_mutex);
4745} 4130}
4746EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4747 4131
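For orientation, the fields cgroup_init_subsys() touches are the ones a built-in controller fills in when it declares itself; ->name and ->id are now assigned from cgroup_subsys_name[] in cgroup_init_early() below rather than by the controller. A minimal, hypothetical declaration with stub callbacks, not part of this patch:

/* Illustrative only: the smallest plausible controller declaration. */
static struct cgroup_subsys_state *
my_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css;

	css = kzalloc(sizeof(*css), GFP_KERNEL);
	return css ? css : ERR_PTR(-ENOMEM);
}

static void my_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

struct cgroup_subsys my_cgrp_subsys = {
	.css_alloc	= my_css_alloc,
	.css_free	= my_css_free,
	.base_cftypes	= my_files,	/* the hypothetical cftype array sketched earlier */
};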
4748/** 4132/**
4749 * cgroup_init_early - cgroup initialization at system boot 4133 * cgroup_init_early - cgroup initialization at system boot
@@ -4753,34 +4137,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4753 */ 4137 */
4754int __init cgroup_init_early(void) 4138int __init cgroup_init_early(void)
4755{ 4139{
4140 static struct cgroup_sb_opts __initdata opts =
4141 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4756 struct cgroup_subsys *ss; 4142 struct cgroup_subsys *ss;
4757 int i; 4143 int i;
4758 4144
4759 atomic_set(&init_css_set.refcount, 1); 4145 init_cgroup_root(&cgrp_dfl_root, &opts);
4760 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4761 INIT_LIST_HEAD(&init_css_set.tasks);
4762 INIT_HLIST_NODE(&init_css_set.hlist);
4763 css_set_count = 1;
4764 init_cgroup_root(&cgroup_dummy_root);
4765 cgroup_root_count = 1;
4766 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4146 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4767 4147
4768 init_cgrp_cset_link.cset = &init_css_set; 4148 for_each_subsys(ss, i) {
4769 init_cgrp_cset_link.cgrp = cgroup_dummy_top; 4149 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4770 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); 4150 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4771 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); 4151 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4772 4152 ss->id, ss->name);
4773 /* at bootup time, we don't worry about modular subsystems */ 4153 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4774 for_each_builtin_subsys(ss, i) { 4154 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4775 BUG_ON(!ss->name); 4155
4776 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4156 ss->id = i;
4777 BUG_ON(!ss->css_alloc); 4157 ss->name = cgroup_subsys_name[i];
4778 BUG_ON(!ss->css_free);
4779 if (ss->subsys_id != i) {
4780 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4781 ss->name, ss->subsys_id);
4782 BUG();
4783 }
4784 4158
4785 if (ss->early_init) 4159 if (ss->early_init)
4786 cgroup_init_subsys(ss); 4160 cgroup_init_subsys(ss);
@@ -4798,53 +4172,46 @@ int __init cgroup_init(void)
4798{ 4172{
4799 struct cgroup_subsys *ss; 4173 struct cgroup_subsys *ss;
4800 unsigned long key; 4174 unsigned long key;
4801 int i, err; 4175 int ssid, err;
4802 4176
4803 err = bdi_init(&cgroup_backing_dev_info); 4177 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4804 if (err)
4805 return err;
4806 4178
4807 for_each_builtin_subsys(ss, i) { 4179 mutex_lock(&cgroup_tree_mutex);
4808 if (!ss->early_init)
4809 cgroup_init_subsys(ss);
4810 }
4811
4812 /* allocate id for the dummy hierarchy */
4813 mutex_lock(&cgroup_mutex); 4180 mutex_lock(&cgroup_mutex);
4814 mutex_lock(&cgroup_root_mutex);
4815 4181
4816 /* Add init_css_set to the hash table */ 4182 /* Add init_css_set to the hash table */
4817 key = css_set_hash(init_css_set.subsys); 4183 key = css_set_hash(init_css_set.subsys);
4818 hash_add(css_set_table, &init_css_set.hlist, key); 4184 hash_add(css_set_table, &init_css_set.hlist, key);
4819 4185
4820 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 4186 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4821 4187
4822 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
4823 0, 1, GFP_KERNEL);
4824 BUG_ON(err < 0);
4825
4826 mutex_unlock(&cgroup_root_mutex);
4827 mutex_unlock(&cgroup_mutex); 4188 mutex_unlock(&cgroup_mutex);
4189 mutex_unlock(&cgroup_tree_mutex);
4828 4190
4829 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4191 for_each_subsys(ss, ssid) {
4830 if (!cgroup_kobj) { 4192 if (!ss->early_init)
4831 err = -ENOMEM; 4193 cgroup_init_subsys(ss);
4832 goto out; 4194
4195 /*
4196 * cftype registration needs kmalloc and can't be done
4197 * during early_init. Register base cftypes separately.
4198 */
4199 if (ss->base_cftypes)
4200 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4833 } 4201 }
4834 4202
4203 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4204 if (!cgroup_kobj)
4205 return -ENOMEM;
4206
4835 err = register_filesystem(&cgroup_fs_type); 4207 err = register_filesystem(&cgroup_fs_type);
4836 if (err < 0) { 4208 if (err < 0) {
4837 kobject_put(cgroup_kobj); 4209 kobject_put(cgroup_kobj);
4838 goto out; 4210 return err;
4839 } 4211 }
4840 4212
4841 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4213 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4842 4214 return 0;
4843out:
4844 if (err)
4845 bdi_destroy(&cgroup_backing_dev_info);
4846
4847 return err;
4848} 4215}
4849 4216
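Once register_filesystem() succeeds, userspace sets up a hierarchy with an ordinary mount(2); a sketch, with the mount point and controller list as assumptions:

/* Illustrative only: mount a v1 hierarchy with the cpu controller attached. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* the target directory is assumed to exist already */
	if (mount("cgroup", "/sys/fs/cgroup/cpu", "cgroup", 0, "cpu"))
		perror("mount");
	return 0;
}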
4850static int __init cgroup_wq_init(void) 4217static int __init cgroup_wq_init(void)
@@ -4876,12 +4243,6 @@ core_initcall(cgroup_wq_init);
4876 * proc_cgroup_show() 4243 * proc_cgroup_show()
4877 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4244 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4878 * - Used for /proc/<pid>/cgroup. 4245 * - Used for /proc/<pid>/cgroup.
4879 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4880 * doesn't really matter if tsk->cgroup changes after we read it,
4881 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4882 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4883 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4884 * cgroup to top_cgroup.
4885 */ 4246 */
4886 4247
4887/* TODO: Use a proper seq_file iterator */ 4248/* TODO: Use a proper seq_file iterator */
@@ -4889,12 +4250,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4889{ 4250{
4890 struct pid *pid; 4251 struct pid *pid;
4891 struct task_struct *tsk; 4252 struct task_struct *tsk;
4892 char *buf; 4253 char *buf, *path;
4893 int retval; 4254 int retval;
4894 struct cgroupfs_root *root; 4255 struct cgroup_root *root;
4895 4256
4896 retval = -ENOMEM; 4257 retval = -ENOMEM;
4897 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4258 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4898 if (!buf) 4259 if (!buf)
4899 goto out; 4260 goto out;
4900 4261
@@ -4907,29 +4268,36 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4907 retval = 0; 4268 retval = 0;
4908 4269
4909 mutex_lock(&cgroup_mutex); 4270 mutex_lock(&cgroup_mutex);
4271 down_read(&css_set_rwsem);
4910 4272
4911 for_each_active_root(root) { 4273 for_each_root(root) {
4912 struct cgroup_subsys *ss; 4274 struct cgroup_subsys *ss;
4913 struct cgroup *cgrp; 4275 struct cgroup *cgrp;
4914 int ssid, count = 0; 4276 int ssid, count = 0;
4915 4277
4278 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4279 continue;
4280
4916 seq_printf(m, "%d:", root->hierarchy_id); 4281 seq_printf(m, "%d:", root->hierarchy_id);
4917 for_each_subsys(ss, ssid) 4282 for_each_subsys(ss, ssid)
4918 if (root->subsys_mask & (1 << ssid)) 4283 if (root->cgrp.subsys_mask & (1 << ssid))
4919 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4284 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4920 if (strlen(root->name)) 4285 if (strlen(root->name))
4921 seq_printf(m, "%sname=%s", count ? "," : "", 4286 seq_printf(m, "%sname=%s", count ? "," : "",
4922 root->name); 4287 root->name);
4923 seq_putc(m, ':'); 4288 seq_putc(m, ':');
4924 cgrp = task_cgroup_from_root(tsk, root); 4289 cgrp = task_cgroup_from_root(tsk, root);
4925 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4290 path = cgroup_path(cgrp, buf, PATH_MAX);
4926 if (retval < 0) 4291 if (!path) {
4292 retval = -ENAMETOOLONG;
4927 goto out_unlock; 4293 goto out_unlock;
4928 seq_puts(m, buf); 4294 }
4295 seq_puts(m, path);
4929 seq_putc(m, '\n'); 4296 seq_putc(m, '\n');
4930 } 4297 }
4931 4298
4932out_unlock: 4299out_unlock:
4300 up_read(&css_set_rwsem);
4933 mutex_unlock(&cgroup_mutex); 4301 mutex_unlock(&cgroup_mutex);
4934 put_task_struct(tsk); 4302 put_task_struct(tsk);
4935out_free: 4303out_free:
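The output assembled above is one line per visible hierarchy, "hierarchy-id:comma,separated,controllers:/path". A small userspace sketch that reads it back (it assumes the controller field is non-empty, which holds for v1 hierarchies with controllers attached):

/* Illustrative only: print each hierarchy id, controller list and cgroup path of the caller. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/cgroup", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		char *id = strtok(line, ":");
		char *controllers = strtok(NULL, ":");
		char *path = strtok(NULL, "\n");

		if (id && controllers && path)
			printf("hierarchy %s: controllers=%s path=%s\n",
			       id, controllers, path);
	}

	fclose(f);
	return 0;
}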
@@ -4955,7 +4323,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4955 for_each_subsys(ss, i) 4323 for_each_subsys(ss, i)
4956 seq_printf(m, "%s\t%d\t%d\t%d\n", 4324 seq_printf(m, "%s\t%d\t%d\t%d\n",
4957 ss->name, ss->root->hierarchy_id, 4325 ss->name, ss->root->hierarchy_id,
4958 ss->root->number_of_cgroups, !ss->disabled); 4326 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4959 4327
4960 mutex_unlock(&cgroup_mutex); 4328 mutex_unlock(&cgroup_mutex);
4961 return 0; 4329 return 0;
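/proc/cgroups, generated here, is a header line followed by one tab-separated row per controller: name, hierarchy id, cgroup count (now read from ->nr_cgrps) and the enabled flag. A short userspace sketch that reads it back:

/* Illustrative only: list every controller with its hierarchy id, cgroup count and enabled flag. */
#include <stdio.h>

int main(void)
{
	char header[256], name[64];
	int hier, ncgrps, enabled;
	FILE *f = fopen("/proc/cgroups", "r");

	if (!f)
		return 1;

	if (!fgets(header, sizeof(header), f)) {	/* skip the column header line */
		fclose(f);
		return 1;
	}
	while (fscanf(f, "%63s %d %d %d", name, &hier, &ncgrps, &enabled) == 4)
		printf("%s: hierarchy=%d cgroups=%d enabled=%d\n",
		       name, hier, ncgrps, enabled);

	fclose(f);
	return 0;
}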
@@ -4974,27 +4342,16 @@ static const struct file_operations proc_cgroupstats_operations = {
4974}; 4342};
4975 4343
4976/** 4344/**
4977 * cgroup_fork - attach newly forked task to its parents cgroup. 4345 * cgroup_fork - initialize cgroup related fields during copy_process()
4978 * @child: pointer to task_struct of forking parent process. 4346 * @child: pointer to task_struct of forking parent process.
4979 * 4347 *
4980 * Description: A task inherits its parent's cgroup at fork(). 4348 * A task is associated with the init_css_set until cgroup_post_fork()
4981 * 4349 * attaches it to the parent's css_set. Empty cg_list indicates that
4982 * A pointer to the shared css_set was automatically copied in 4350 * @child isn't holding a reference to its css_set.
4983 * fork.c by dup_task_struct(). However, we ignore that copy, since
4984 * it was not made under the protection of RCU or cgroup_mutex, so
4985 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4986 * have already changed current->cgroups, allowing the previously
4987 * referenced cgroup group to be removed and freed.
4988 *
4989 * At the point that cgroup_fork() is called, 'current' is the parent
4990 * task, and the passed argument 'child' points to the child task.
4991 */ 4351 */
4992void cgroup_fork(struct task_struct *child) 4352void cgroup_fork(struct task_struct *child)
4993{ 4353{
4994 task_lock(current); 4354 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4995 get_css_set(task_css_set(current));
4996 child->cgroups = current->cgroups;
4997 task_unlock(current);
4998 INIT_LIST_HEAD(&child->cg_list); 4355 INIT_LIST_HEAD(&child->cg_list);
4999} 4356}
5000 4357
@@ -5014,23 +4371,37 @@ void cgroup_post_fork(struct task_struct *child)
5014 int i; 4371 int i;
5015 4372
5016 /* 4373 /*
5017 * use_task_css_set_links is set to 1 before we walk the tasklist 4374 * This may race against cgroup_enable_task_cg_lists(). As that
5018 * under the tasklist_lock and we read it here after we added the child 4375 * function sets use_task_css_set_links before grabbing
5019 * to the tasklist under the tasklist_lock as well. If the child wasn't 4376 * tasklist_lock and we just went through tasklist_lock to add
5020 * yet in the tasklist when we walked through it from 4377 * @child, it's guaranteed that either we see the set
5021 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value 4378 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5022 * should be visible now due to the paired locking and barriers implied 4379 * @child during its iteration.
5023 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock 4380 *
5024 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock 4381 * If we won the race, @child is associated with %current's
5025 * lock on fork. 4382 * css_set. Grabbing css_set_rwsem guarantees both that the
4383 * association is stable, and, on completion of the parent's
4384 * migration, @child is visible in the source of migration or
4385 * already in the destination cgroup. This guarantee is necessary
4386 * when implementing operations which need to migrate all tasks of
4387 * a cgroup to another.
4388 *
4389 * Note that if we lose to cgroup_enable_task_cg_links(), @child
4390 * will remain in init_css_set. This is safe because all tasks are
4391 * in the init_css_set before cg_links is enabled and there's no
4392 * operation which transfers all tasks out of init_css_set.
5026 */ 4393 */
5027 if (use_task_css_set_links) { 4394 if (use_task_css_set_links) {
5028 write_lock(&css_set_lock); 4395 struct css_set *cset;
5029 task_lock(child); 4396
5030 if (list_empty(&child->cg_list)) 4397 down_write(&css_set_rwsem);
5031 list_add(&child->cg_list, &task_css_set(child)->tasks); 4398 cset = task_css_set(current);
5032 task_unlock(child); 4399 if (list_empty(&child->cg_list)) {
5033 write_unlock(&css_set_lock); 4400 rcu_assign_pointer(child->cgroups, cset);
4401 list_add(&child->cg_list, &cset->tasks);
4402 get_css_set(cset);
4403 }
4404 up_write(&css_set_rwsem);
5034 } 4405 }
5035 4406
5036 /* 4407 /*
@@ -5039,15 +4410,7 @@ void cgroup_post_fork(struct task_struct *child)
5039 * and addition to css_set. 4410 * and addition to css_set.
5040 */ 4411 */
5041 if (need_forkexit_callback) { 4412 if (need_forkexit_callback) {
5042 /* 4413 for_each_subsys(ss, i)
5043 * fork/exit callbacks are supported only for builtin
5044 * subsystems, and the builtin section of the subsys
5045 * array is immutable, so we don't need to lock the
5046 * subsys array here. On the other hand, modular section
5047 * of the array can be freed at module unload, so we
5048 * can't touch that.
5049 */
5050 for_each_builtin_subsys(ss, i)
5051 if (ss->fork) 4414 if (ss->fork)
5052 ss->fork(child); 4415 ss->fork(child);
5053 } 4416 }
@@ -5056,7 +4419,6 @@ void cgroup_post_fork(struct task_struct *child)
5056/** 4419/**
5057 * cgroup_exit - detach cgroup from exiting task 4420 * cgroup_exit - detach cgroup from exiting task
5058 * @tsk: pointer to task_struct of exiting process 4421 * @tsk: pointer to task_struct of exiting process
5059 * @run_callback: run exit callbacks?
5060 * 4422 *
5061 * Description: Detach cgroup from @tsk and release it. 4423 * Description: Detach cgroup from @tsk and release it.
5062 * 4424 *
@@ -5066,57 +4428,38 @@ void cgroup_post_fork(struct task_struct *child)
5066 * use notify_on_release cgroups where very high task exit scaling 4428 * use notify_on_release cgroups where very high task exit scaling
5067 * is required on large systems. 4429 * is required on large systems.
5068 * 4430 *
5069 * the_top_cgroup_hack: 4431 * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
5070 * 4432 * call cgroup_exit() while the task is still competent to handle
5071 * Set the exiting tasks cgroup to the root cgroup (top_cgroup). 4433 * notify_on_release(), then leave the task attached to the root cgroup in
5072 * 4434 * each hierarchy for the remainder of its exit. No need to bother with
5073 * We call cgroup_exit() while the task is still competent to 4435 * init_css_set refcnting. init_css_set never goes away and we can't race
5074 * handle notify_on_release(), then leave the task attached to the 4436 * with migration path - PF_EXITING is visible to migration path.
5075 * root cgroup in each hierarchy for the remainder of its exit.
5076 *
5077 * To do this properly, we would increment the reference count on
5078 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5079 * code we would add a second cgroup function call, to drop that
5080 * reference. This would just create an unnecessary hot spot on
5081 * the top_cgroup reference count, to no avail.
5082 *
5083 * Normally, holding a reference to a cgroup without bumping its
5084 * count is unsafe. The cgroup could go away, or someone could
5085 * attach us to a different cgroup, decrementing the count on
5086 * the first cgroup that we never incremented. But in this case,
5087 * top_cgroup isn't going away, and either task has PF_EXITING set,
5088 * which wards off any cgroup_attach_task() attempts, or task is a failed
5089 * fork, never visible to cgroup_attach_task.
5090 */ 4437 */
5091void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4438void cgroup_exit(struct task_struct *tsk)
5092{ 4439{
5093 struct cgroup_subsys *ss; 4440 struct cgroup_subsys *ss;
5094 struct css_set *cset; 4441 struct css_set *cset;
4442 bool put_cset = false;
5095 int i; 4443 int i;
5096 4444
5097 /* 4445 /*
5098 * Unlink from the css_set task list if necessary. 4446 * Unlink from @tsk from its css_set. As migration path can't race
5099 * Optimistically check cg_list before taking 4447 * with us, we can check cg_list without grabbing css_set_rwsem.
5100 * css_set_lock
5101 */ 4448 */
5102 if (!list_empty(&tsk->cg_list)) { 4449 if (!list_empty(&tsk->cg_list)) {
5103 write_lock(&css_set_lock); 4450 down_write(&css_set_rwsem);
5104 if (!list_empty(&tsk->cg_list)) 4451 list_del_init(&tsk->cg_list);
5105 list_del_init(&tsk->cg_list); 4452 up_write(&css_set_rwsem);
5106 write_unlock(&css_set_lock); 4453 put_cset = true;
5107 } 4454 }
5108 4455
5109 /* Reassign the task to the init_css_set. */ 4456 /* Reassign the task to the init_css_set. */
5110 task_lock(tsk);
5111 cset = task_css_set(tsk); 4457 cset = task_css_set(tsk);
5112 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4458 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5113 4459
5114 if (run_callbacks && need_forkexit_callback) { 4460 if (need_forkexit_callback) {
5115 /* 4461 /* see cgroup_post_fork() for details */
5116 * fork/exit callbacks are supported only for builtin 4462 for_each_subsys(ss, i) {
5117 * subsystems, see cgroup_post_fork() for details.
5118 */
5119 for_each_builtin_subsys(ss, i) {
5120 if (ss->exit) { 4463 if (ss->exit) {
5121 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4464 struct cgroup_subsys_state *old_css = cset->subsys[i];
5122 struct cgroup_subsys_state *css = task_css(tsk, i); 4465 struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5125,9 +4468,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5125 } 4468 }
5126 } 4469 }
5127 } 4470 }
5128 task_unlock(tsk);
5129 4471
5130 put_css_set_taskexit(cset); 4472 if (put_cset)
4473 put_css_set(cset, true);
5131} 4474}
5132 4475
5133static void check_for_release(struct cgroup *cgrp) 4476static void check_for_release(struct cgroup *cgrp)
@@ -5184,16 +4527,17 @@ static void cgroup_release_agent(struct work_struct *work)
5184 while (!list_empty(&release_list)) { 4527 while (!list_empty(&release_list)) {
5185 char *argv[3], *envp[3]; 4528 char *argv[3], *envp[3];
5186 int i; 4529 int i;
5187 char *pathbuf = NULL, *agentbuf = NULL; 4530 char *pathbuf = NULL, *agentbuf = NULL, *path;
5188 struct cgroup *cgrp = list_entry(release_list.next, 4531 struct cgroup *cgrp = list_entry(release_list.next,
5189 struct cgroup, 4532 struct cgroup,
5190 release_list); 4533 release_list);
5191 list_del_init(&cgrp->release_list); 4534 list_del_init(&cgrp->release_list);
5192 raw_spin_unlock(&release_list_lock); 4535 raw_spin_unlock(&release_list_lock);
5193 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4536 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5194 if (!pathbuf) 4537 if (!pathbuf)
5195 goto continue_free; 4538 goto continue_free;
5196 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4539 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4540 if (!path)
5197 goto continue_free; 4541 goto continue_free;
5198 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4542 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5199 if (!agentbuf) 4543 if (!agentbuf)
@@ -5201,7 +4545,7 @@ static void cgroup_release_agent(struct work_struct *work)
5201 4545
5202 i = 0; 4546 i = 0;
5203 argv[i++] = agentbuf; 4547 argv[i++] = agentbuf;
5204 argv[i++] = pathbuf; 4548 argv[i++] = path;
5205 argv[i] = NULL; 4549 argv[i] = NULL;
5206 4550
5207 i = 0; 4551 i = 0;
@@ -5235,11 +4579,7 @@ static int __init cgroup_disable(char *str)
5235 if (!*token) 4579 if (!*token)
5236 continue; 4580 continue;
5237 4581
5238 /* 4582 for_each_subsys(ss, i) {
5239 * cgroup_disable, being at boot time, can't know about
5240 * module subsystems, so we don't worry about them.
5241 */
5242 for_each_builtin_subsys(ss, i) {
5243 if (!strcmp(token, ss->name)) { 4583 if (!strcmp(token, ss->name)) {
5244 ss->disabled = 1; 4584 ss->disabled = 1;
5245 printk(KERN_INFO "Disabling %s control group" 4585 printk(KERN_INFO "Disabling %s control group"
@@ -5253,28 +4593,42 @@ static int __init cgroup_disable(char *str)
5253__setup("cgroup_disable=", cgroup_disable); 4593__setup("cgroup_disable=", cgroup_disable);
5254 4594
5255/** 4595/**
5256 * css_from_dir - get corresponding css from the dentry of a cgroup dir 4596 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
5257 * @dentry: directory dentry of interest 4597 * @dentry: directory dentry of interest
5258 * @ss: subsystem of interest 4598 * @ss: subsystem of interest
5259 * 4599 *
5260 * Must be called under cgroup_mutex or RCU read lock. The caller is 4600 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5261 * responsible for pinning the returned css if it needs to be accessed 4601 * to get the corresponding css and return it. If such css doesn't exist
5262 * outside the critical section. 4602 * or can't be pinned, an ERR_PTR value is returned.
5263 */ 4603 */
5264struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 4604struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
5265 struct cgroup_subsys *ss) 4605 struct cgroup_subsys *ss)
5266{ 4606{
4607 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4608 struct cgroup_subsys_state *css = NULL;
5267 struct cgroup *cgrp; 4609 struct cgroup *cgrp;
5268 4610
5269 cgroup_assert_mutex_or_rcu_locked();
5270
5271 /* is @dentry a cgroup dir? */ 4611 /* is @dentry a cgroup dir? */
5272 if (!dentry->d_inode || 4612 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5273 dentry->d_inode->i_op != &cgroup_dir_inode_operations) 4613 kernfs_type(kn) != KERNFS_DIR)
5274 return ERR_PTR(-EBADF); 4614 return ERR_PTR(-EBADF);
5275 4615
5276 cgrp = __d_cgrp(dentry); 4616 rcu_read_lock();
5277 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); 4617
4618 /*
4619 * This path doesn't originate from kernfs and @kn could already
4620 * have been or be removed at any point. @kn->priv is RCU
4621 * protected for this access. See destroy_locked() for details.
4622 */
4623 cgrp = rcu_dereference(kn->priv);
4624 if (cgrp)
4625 css = cgroup_css(cgrp, ss);
4626
4627 if (!css || !css_tryget(css))
4628 css = ERR_PTR(-ENOENT);
4629
4630 rcu_read_unlock();
4631 return css;
5278} 4632}
5279 4633
5280/** 4634/**
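
The rewritten css_tryget_from_dir() above returns an already-pinned css (or an ERR_PTR), so callers no longer need the rcu_read_lock()/css_tryget() pair that the perf_cgroup_connect() hunk further down deletes. A sketch of a caller resolving a css from a user-supplied cgroup-directory fd, roughly the shape perf takes after this series; get_css_from_fd() itself is invented for illustration:

/* Illustrative helper only -- not from this patch. */
static struct cgroup_subsys_state *get_css_from_fd(int fd,
                                                   struct cgroup_subsys *ss)
{
        struct fd f = fdget(fd);
        struct cgroup_subsys_state *css;

        if (!f.file)
                return ERR_PTR(-EBADF);

        /* returns a pinned css or ERR_PTR(); no rcu_read_lock() needed */
        css = css_tryget_from_dir(f.file->f_dentry, ss);
        fdput(f);
        return css;     /* caller eventually drops the reference with css_put() */
}
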
@@ -5289,7 +4643,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5289{ 4643{
5290 struct cgroup *cgrp; 4644 struct cgroup *cgrp;
5291 4645
5292 cgroup_assert_mutex_or_rcu_locked(); 4646 cgroup_assert_mutexes_or_rcu_locked();
5293 4647
5294 cgrp = idr_find(&ss->root->cgroup_idr, id); 4648 cgrp = idr_find(&ss->root->cgroup_idr, id);
5295 if (cgrp) 4649 if (cgrp)
@@ -5341,23 +4695,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5341{ 4695{
5342 struct cgrp_cset_link *link; 4696 struct cgrp_cset_link *link;
5343 struct css_set *cset; 4697 struct css_set *cset;
4698 char *name_buf;
5344 4699
5345 read_lock(&css_set_lock); 4700 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4701 if (!name_buf)
4702 return -ENOMEM;
4703
4704 down_read(&css_set_rwsem);
5346 rcu_read_lock(); 4705 rcu_read_lock();
5347 cset = rcu_dereference(current->cgroups); 4706 cset = rcu_dereference(current->cgroups);
5348 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4707 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5349 struct cgroup *c = link->cgrp; 4708 struct cgroup *c = link->cgrp;
5350 const char *name;
5351 4709
5352 if (c->dentry) 4710 cgroup_name(c, name_buf, NAME_MAX + 1);
5353 name = c->dentry->d_name.name;
5354 else
5355 name = "?";
5356 seq_printf(seq, "Root %d group %s\n", 4711 seq_printf(seq, "Root %d group %s\n",
5357 c->root->hierarchy_id, name); 4712 c->root->hierarchy_id, name_buf);
5358 } 4713 }
5359 rcu_read_unlock(); 4714 rcu_read_unlock();
5360 read_unlock(&css_set_lock); 4715 up_read(&css_set_rwsem);
4716 kfree(name_buf);
5361 return 0; 4717 return 0;
5362} 4718}
5363 4719
@@ -5367,23 +4723,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5367 struct cgroup_subsys_state *css = seq_css(seq); 4723 struct cgroup_subsys_state *css = seq_css(seq);
5368 struct cgrp_cset_link *link; 4724 struct cgrp_cset_link *link;
5369 4725
5370 read_lock(&css_set_lock); 4726 down_read(&css_set_rwsem);
5371 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4727 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5372 struct css_set *cset = link->cset; 4728 struct css_set *cset = link->cset;
5373 struct task_struct *task; 4729 struct task_struct *task;
5374 int count = 0; 4730 int count = 0;
4731
5375 seq_printf(seq, "css_set %p\n", cset); 4732 seq_printf(seq, "css_set %p\n", cset);
4733
5376 list_for_each_entry(task, &cset->tasks, cg_list) { 4734 list_for_each_entry(task, &cset->tasks, cg_list) {
5377 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 4735 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5378 seq_puts(seq, " ...\n"); 4736 goto overflow;
5379 break; 4737 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5380 } else { 4738 }
5381 seq_printf(seq, " task %d\n", 4739
5382 task_pid_vnr(task)); 4740 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5383 } 4741 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4742 goto overflow;
4743 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5384 } 4744 }
4745 continue;
4746 overflow:
4747 seq_puts(seq, " ...\n");
5385 } 4748 }
5386 read_unlock(&css_set_lock); 4749 up_read(&css_set_rwsem);
5387 return 0; 4750 return 0;
5388} 4751}
5389 4752
@@ -5426,11 +4789,9 @@ static struct cftype debug_files[] = {
5426 { } /* terminate */ 4789 { } /* terminate */
5427}; 4790};
5428 4791
5429struct cgroup_subsys debug_subsys = { 4792struct cgroup_subsys debug_cgrp_subsys = {
5430 .name = "debug",
5431 .css_alloc = debug_css_alloc, 4793 .css_alloc = debug_css_alloc,
5432 .css_free = debug_css_free, 4794 .css_free = debug_css_free,
5433 .subsys_id = debug_subsys_id,
5434 .base_cftypes = debug_files, 4795 .base_cftypes = debug_files,
5435}; 4796};
5436#endif /* CONFIG_CGROUP_DEBUG */ 4797#endif /* CONFIG_CGROUP_DEBUG */
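
A recurring change in this file and in every controller below (blkio, freezer, cpuset, perf_event, cpu, cpuacct, hugetlb, memory): the subsystem struct is renamed to <name>_cgrp_subsys and drops its .name and .subsys_id fields, which are now generated from the SUBSYS() entries in include/linux/cgroup_subsys.h together with the <name>_cgrp_id constants used by task_css(). Sketch of a converted controller definition; the foo names are placeholders:

/* Placeholder controller "foo" -- illustrative only. */
struct cgroup_subsys foo_cgrp_subsys = {
        .css_alloc      = foo_css_alloc,
        .css_free       = foo_css_free,
        .base_cftypes   = foo_files,
        /* no .name / .subsys_id: both come from SUBSYS(foo) in cgroup_subsys.h */
};
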
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..2bc4a2256444 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
52 52
53static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
54{ 54{
55 return css_freezer(task_css(task, freezer_subsys_id)); 55 return css_freezer(task_css(task, freezer_cgrp_id));
56} 56}
57 57
58static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
@@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state)
84 return "THAWED"; 84 return "THAWED";
85}; 85};
86 86
87struct cgroup_subsys freezer_subsys;
88
89static struct cgroup_subsys_state * 87static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css) 88freezer_css_alloc(struct cgroup_subsys_state *parent_css)
91{ 89{
@@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
189 * current state before executing the following - !frozen tasks may 187 * current state before executing the following - !frozen tasks may
190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 188 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
191 */ 189 */
192 cgroup_taskset_for_each(task, new_css, tset) { 190 cgroup_taskset_for_each(task, tset) {
193 if (!(freezer->state & CGROUP_FREEZING)) { 191 if (!(freezer->state & CGROUP_FREEZING)) {
194 __thaw_task(task); 192 __thaw_task(task);
195 } else { 193 } else {
@@ -216,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
216 } 214 }
217} 215}
218 216
217/**
218 * freezer_fork - cgroup post fork callback
219 * @task: a task which has just been forked
220 *
221 * @task has just been created and should conform to the current state of
222 * the cgroup_freezer it belongs to. This function may race against
223 * freezer_attach(). Losing to freezer_attach() means that we don't have
224 * to do anything as freezer_attach() will put @task into the appropriate
225 * state.
226 */
219static void freezer_fork(struct task_struct *task) 227static void freezer_fork(struct task_struct *task)
220{ 228{
221 struct freezer *freezer; 229 struct freezer *freezer;
@@ -224,14 +232,26 @@ static void freezer_fork(struct task_struct *task)
224 freezer = task_freezer(task); 232 freezer = task_freezer(task);
225 233
226 /* 234 /*
227 * The root cgroup is non-freezable, so we can skip the 235 * The root cgroup is non-freezable, so we can skip locking the
228 * following check. 236 * freezer. This is safe regardless of race with task migration.
237 * If we didn't race or won, skipping is obviously the right thing
238 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do.
229 */ 240 */
230 if (!parent_freezer(freezer)) 241 if (!parent_freezer(freezer))
231 goto out; 242 goto out;
232 243
244 /*
245 * Grab @freezer->lock and freeze @task after verifying @task still
246 * belongs to @freezer and it's freezing. The former is for the
247 * case where we have raced against task migration and lost and
248 * @task is already in a different cgroup which may not be frozen.
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
233 spin_lock_irq(&freezer->lock); 253 spin_lock_irq(&freezer->lock);
234 if (freezer->state & CGROUP_FREEZING) 254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
235 freeze_task(task); 255 freeze_task(task);
236 spin_unlock_irq(&freezer->lock); 256 spin_unlock_irq(&freezer->lock);
237out: 257out:
@@ -422,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
422} 442}
423 443
424static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
425 const char *buffer) 445 char *buffer)
426{ 446{
427 bool freeze; 447 bool freeze;
428 448
@@ -473,13 +493,11 @@ static struct cftype files[] = {
473 { } /* terminate */ 493 { } /* terminate */
474}; 494};
475 495
476struct cgroup_subsys freezer_subsys = { 496struct cgroup_subsys freezer_cgrp_subsys = {
477 .name = "freezer",
478 .css_alloc = freezer_css_alloc, 497 .css_alloc = freezer_css_alloc,
479 .css_online = freezer_css_online, 498 .css_online = freezer_css_online,
480 .css_offline = freezer_css_offline, 499 .css_offline = freezer_css_offline,
481 .css_free = freezer_css_free, 500 .css_free = freezer_css_free,
482 .subsys_id = freezer_subsys_id,
483 .attach = freezer_attach, 501 .attach = freezer_attach,
484 .fork = freezer_fork, 502 .fork = freezer_fork,
485 .base_cftypes = files, 503 .base_cftypes = files,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e6b1b66afe52..e2dbb60004d4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
120static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
121{ 121{
122 return css_cs(task_css(task, cpuset_subsys_id)); 122 return css_cs(task_css(task, cpuset_cgrp_id));
123} 123}
124 124
125static inline struct cpuset *parent_cs(struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
@@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
467 * be changed to have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
468 */ 468 */
469 ret = -ENOSPC; 469 ret = -ENOSPC;
470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { 470 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
471 if (!cpumask_empty(cur->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
472 cpumask_empty(trial->cpus_allowed)) 472 cpumask_empty(trial->cpus_allowed))
473 goto out; 473 goto out;
@@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
829} 829}
830 830
831/** 831/**
832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
833 * @tsk: task to test
834 * @data: cpuset to @tsk belongs to
835 *
836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
837 * mask needs to be changed.
838 *
839 * We don't need to re-check for the cgroup/cpuset membership, since we're
840 * holding cpuset_mutex at this point.
841 */
842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
843{
844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
846
847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
848}
849
850/**
851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 832 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 833 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
854 *
855 * Called with cpuset_mutex held
856 * 834 *
857 * The css_scan_tasks() function will scan all the tasks in a cgroup, 835 * Iterate through each task of @cs updating its cpus_allowed to the
858 * calling callback functions for each. 836 * effective cpuset's. As this function is called with cpuset_mutex held,
859 * 837 * cpuset membership stays stable.
860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
861 * if @heap != NULL.
862 */ 838 */
863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 839static void update_tasks_cpumask(struct cpuset *cs)
864{ 840{
865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); 841 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
842 struct css_task_iter it;
843 struct task_struct *task;
844
845 css_task_iter_start(&cs->css, &it);
846 while ((task = css_task_iter_next(&it)))
847 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
848 css_task_iter_end(&it);
866} 849}
867 850
868/* 851/*
869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 852 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
870 * @root_cs: the root cpuset of the hierarchy 853 * @root_cs: the root cpuset of the hierarchy
871 * @update_root: update root cpuset or not? 854 * @update_root: update root cpuset or not?
872 * @heap: the heap used by css_scan_tasks()
873 * 855 *
874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 856 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
875 * which take on cpumask of @root_cs. 857 * which take on cpumask of @root_cs.
876 * 858 *
877 * Called with cpuset_mutex held 859 * Called with cpuset_mutex held
878 */ 860 */
879static void update_tasks_cpumask_hier(struct cpuset *root_cs, 861static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
880 bool update_root, struct ptr_heap *heap)
881{ 862{
882 struct cpuset *cp; 863 struct cpuset *cp;
883 struct cgroup_subsys_state *pos_css; 864 struct cgroup_subsys_state *pos_css;
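
The update_tasks_cpumask() hunk above shows the pattern that replaces css_scan_tasks() and its ptr_heap throughout this series: an open-coded css_task_iter walk. The same start/next/end shape reappears below for the nodemask and spread-flag updates. A generic sketch, where foo_update_one() stands in for the per-task work and is not part of the patch:

/* Illustrative only -- foo_update_one() is a stand-in for per-task work. */
static void foo_update_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                foo_update_one(task);
        css_task_iter_end(&it);
}
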
@@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
898 continue; 879 continue;
899 rcu_read_unlock(); 880 rcu_read_unlock();
900 881
901 update_tasks_cpumask(cp, heap); 882 update_tasks_cpumask(cp);
902 883
903 rcu_read_lock(); 884 rcu_read_lock();
904 css_put(&cp->css); 885 css_put(&cp->css);
@@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf) 896 const char *buf)
916{ 897{
917 struct ptr_heap heap;
918 int retval; 898 int retval;
919 int is_load_balanced; 899 int is_load_balanced;
920 900
@@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
947 if (retval < 0) 927 if (retval < 0)
948 return retval; 928 return retval;
949 929
950 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
951 if (retval)
952 return retval;
953
954 is_load_balanced = is_sched_load_balance(trialcs); 930 is_load_balanced = is_sched_load_balance(trialcs);
955 931
956 mutex_lock(&callback_mutex); 932 mutex_lock(&callback_mutex);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 933 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 934 mutex_unlock(&callback_mutex);
959 935
960 update_tasks_cpumask_hier(cs, true, &heap); 936 update_tasks_cpumask_hier(cs, true);
961
962 heap_free(&heap);
963 937
964 if (is_load_balanced) 938 if (is_load_balanced)
965 rebuild_sched_domains_locked(); 939 rebuild_sched_domains_locked();
@@ -1048,53 +1022,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1048 task_unlock(tsk); 1022 task_unlock(tsk);
1049} 1023}
1050 1024
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1056/*
1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1060 */
1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1062{
1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1065 struct mm_struct *mm;
1066 int migrate;
1067
1068 cpuset_change_task_nodemask(p, arg->newmems);
1069
1070 mm = get_task_mm(p);
1071 if (!mm)
1072 return;
1073
1074 migrate = is_memory_migrate(cs);
1075
1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1077 if (migrate)
1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1079 mmput(mm);
1080}
1081
1082static void *cpuset_being_rebound; 1025static void *cpuset_being_rebound;
1083 1026
1084/** 1027/**
1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1028 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1029 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1088 * 1030 *
1089 * Called with cpuset_mutex held. No return value. It's guaranteed that 1031 * Iterate through each task of @cs updating its mems_allowed to the
1090 * css_scan_tasks() always returns 0 if @heap != NULL. 1032 * effective cpuset's. As this function is called with cpuset_mutex held,
1033 * cpuset membership stays stable.
1091 */ 1034 */
1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1035static void update_tasks_nodemask(struct cpuset *cs)
1093{ 1036{
1094 static nodemask_t newmems; /* protected by cpuset_mutex */ 1037 static nodemask_t newmems; /* protected by cpuset_mutex */
1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1038 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs, 1039 struct css_task_iter it;
1097 .newmems = &newmems }; 1040 struct task_struct *task;
1098 1041
1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1042 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1100 1043
@@ -1110,7 +1053,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1053 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1111 * is idempotent. Also migrate pages in each mm to new nodes. 1054 * is idempotent. Also migrate pages in each mm to new nodes.
1112 */ 1055 */
1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); 1056 css_task_iter_start(&cs->css, &it);
1057 while ((task = css_task_iter_next(&it))) {
1058 struct mm_struct *mm;
1059 bool migrate;
1060
1061 cpuset_change_task_nodemask(task, &newmems);
1062
1063 mm = get_task_mm(task);
1064 if (!mm)
1065 continue;
1066
1067 migrate = is_memory_migrate(cs);
1068
1069 mpol_rebind_mm(mm, &cs->mems_allowed);
1070 if (migrate)
1071 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1072 mmput(mm);
1073 }
1074 css_task_iter_end(&it);
1114 1075
1115 /* 1076 /*
1116 * All the tasks' nodemasks have been updated, update 1077 * All the tasks' nodemasks have been updated, update
@@ -1126,15 +1087,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1087 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1127 * @cs: the root cpuset of the hierarchy 1088 * @cs: the root cpuset of the hierarchy
1128 * @update_root: update the root cpuset or not? 1089 * @update_root: update the root cpuset or not?
1129 * @heap: the heap used by css_scan_tasks()
1130 * 1090 *
1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1091 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1132 * which take on nodemask of @root_cs. 1092 * which take on nodemask of @root_cs.
1133 * 1093 *
1134 * Called with cpuset_mutex held 1094 * Called with cpuset_mutex held
1135 */ 1095 */
1136static void update_tasks_nodemask_hier(struct cpuset *root_cs, 1096static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1137 bool update_root, struct ptr_heap *heap)
1138{ 1097{
1139 struct cpuset *cp; 1098 struct cpuset *cp;
1140 struct cgroup_subsys_state *pos_css; 1099 struct cgroup_subsys_state *pos_css;
@@ -1155,7 +1114,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1155 continue; 1114 continue;
1156 rcu_read_unlock(); 1115 rcu_read_unlock();
1157 1116
1158 update_tasks_nodemask(cp, heap); 1117 update_tasks_nodemask(cp);
1159 1118
1160 rcu_read_lock(); 1119 rcu_read_lock();
1161 css_put(&cp->css); 1120 css_put(&cp->css);
@@ -1180,7 +1139,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1180 const char *buf) 1139 const char *buf)
1181{ 1140{
1182 int retval; 1141 int retval;
1183 struct ptr_heap heap;
1184 1142
1185 /* 1143 /*
1186 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1144 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1219,17 +1177,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1219 if (retval < 0) 1177 if (retval < 0)
1220 goto done; 1178 goto done;
1221 1179
1222 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1223 if (retval < 0)
1224 goto done;
1225
1226 mutex_lock(&callback_mutex); 1180 mutex_lock(&callback_mutex);
1227 cs->mems_allowed = trialcs->mems_allowed; 1181 cs->mems_allowed = trialcs->mems_allowed;
1228 mutex_unlock(&callback_mutex); 1182 mutex_unlock(&callback_mutex);
1229 1183
1230 update_tasks_nodemask_hier(cs, true, &heap); 1184 update_tasks_nodemask_hier(cs, true);
1231
1232 heap_free(&heap);
1233done: 1185done:
1234 return retval; 1186 return retval;
1235} 1187}
@@ -1257,38 +1209,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1257} 1209}
1258 1210
1259/** 1211/**
1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1261 * @tsk: task to be updated
1262 * @data: cpuset to @tsk belongs to
1263 *
1264 * Called by css_scan_tasks() for each task in a cgroup.
1265 *
1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1267 * holding cpuset_mutex at this point.
1268 */
1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1270{
1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1274}
1275
1276/**
1277 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1212 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1278 * @cs: the cpuset in which each task's spread flags needs to be changed 1213 * @cs: the cpuset in which each task's spread flags needs to be changed
1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1280 *
1281 * Called with cpuset_mutex held
1282 * 1214 *
1283 * The css_scan_tasks() function will scan all the tasks in a cgroup, 1215 * Iterate through each task of @cs updating its spread flags. As this
1284 * calling callback functions for each. 1216 * function is called with cpuset_mutex held, cpuset membership stays
1285 * 1217 * stable.
1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1287 * if @heap != NULL.
1288 */ 1218 */
1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1219static void update_tasks_flags(struct cpuset *cs)
1290{ 1220{
1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); 1221 struct css_task_iter it;
1222 struct task_struct *task;
1223
1224 css_task_iter_start(&cs->css, &it);
1225 while ((task = css_task_iter_next(&it)))
1226 cpuset_update_task_spread_flag(cs, task);
1227 css_task_iter_end(&it);
1292} 1228}
1293 1229
1294/* 1230/*
@@ -1306,7 +1242,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1306 struct cpuset *trialcs; 1242 struct cpuset *trialcs;
1307 int balance_flag_changed; 1243 int balance_flag_changed;
1308 int spread_flag_changed; 1244 int spread_flag_changed;
1309 struct ptr_heap heap;
1310 int err; 1245 int err;
1311 1246
1312 trialcs = alloc_trial_cpuset(cs); 1247 trialcs = alloc_trial_cpuset(cs);
@@ -1322,10 +1257,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1322 if (err < 0) 1257 if (err < 0)
1323 goto out; 1258 goto out;
1324 1259
1325 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1326 if (err < 0)
1327 goto out;
1328
1329 balance_flag_changed = (is_sched_load_balance(cs) != 1260 balance_flag_changed = (is_sched_load_balance(cs) !=
1330 is_sched_load_balance(trialcs)); 1261 is_sched_load_balance(trialcs));
1331 1262
@@ -1340,8 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1340 rebuild_sched_domains_locked(); 1271 rebuild_sched_domains_locked();
1341 1272
1342 if (spread_flag_changed) 1273 if (spread_flag_changed)
1343 update_tasks_flags(cs, &heap); 1274 update_tasks_flags(cs);
1344 heap_free(&heap);
1345out: 1275out:
1346 free_trial_cpuset(trialcs); 1276 free_trial_cpuset(trialcs);
1347 return err; 1277 return err;
@@ -1445,6 +1375,8 @@ static int fmeter_getrate(struct fmeter *fmp)
1445 return val; 1375 return val;
1446} 1376}
1447 1377
1378static struct cpuset *cpuset_attach_old_cs;
1379
1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1380/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1449static int cpuset_can_attach(struct cgroup_subsys_state *css, 1381static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset) 1382 struct cgroup_taskset *tset)
@@ -1453,6 +1385,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1453 struct task_struct *task; 1385 struct task_struct *task;
1454 int ret; 1386 int ret;
1455 1387
1388 /* used later by cpuset_attach() */
1389 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1390
1456 mutex_lock(&cpuset_mutex); 1391 mutex_lock(&cpuset_mutex);
1457 1392
1458 /* 1393 /*
@@ -1464,7 +1399,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1399 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1465 goto out_unlock; 1400 goto out_unlock;
1466 1401
1467 cgroup_taskset_for_each(task, css, tset) { 1402 cgroup_taskset_for_each(task, tset) {
1468 /* 1403 /*
1469 * Kthreads which disallow setaffinity shouldn't be moved 1404 * Kthreads which disallow setaffinity shouldn't be moved
1470 * to a new cpuset; we don't want to change their cpu 1405 * to a new cpuset; we don't want to change their cpu
@@ -1516,10 +1451,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1516 struct mm_struct *mm; 1451 struct mm_struct *mm;
1517 struct task_struct *task; 1452 struct task_struct *task;
1518 struct task_struct *leader = cgroup_taskset_first(tset); 1453 struct task_struct *leader = cgroup_taskset_first(tset);
1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1520 cpuset_subsys_id);
1521 struct cpuset *cs = css_cs(css); 1454 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss); 1455 struct cpuset *oldcs = cpuset_attach_old_cs;
1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1456 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1457 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1525 1458
@@ -1533,7 +1466,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1533 1466
1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1467 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1535 1468
1536 cgroup_taskset_for_each(task, css, tset) { 1469 cgroup_taskset_for_each(task, tset) {
1537 /* 1470 /*
1538 * can_attach beforehand should guarantee that this doesn't 1471 * can_attach beforehand should guarantee that this doesn't
1539 * fail. TODO: have a better way to handle failure here 1472 * fail. TODO: have a better way to handle failure here
@@ -1673,7 +1606,7 @@ out_unlock:
1673 * Common handling for a write to a "cpus" or "mems" file. 1606 * Common handling for a write to a "cpus" or "mems" file.
1674 */ 1607 */
1675static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1608static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1676 struct cftype *cft, const char *buf) 1609 struct cftype *cft, char *buf)
1677{ 1610{
1678 struct cpuset *cs = css_cs(css); 1611 struct cpuset *cs = css_cs(css);
1679 struct cpuset *trialcs; 1612 struct cpuset *trialcs;
@@ -2020,8 +1953,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2020 kfree(cs); 1953 kfree(cs);
2021} 1954}
2022 1955
2023struct cgroup_subsys cpuset_subsys = { 1956struct cgroup_subsys cpuset_cgrp_subsys = {
2024 .name = "cpuset",
2025 .css_alloc = cpuset_css_alloc, 1957 .css_alloc = cpuset_css_alloc,
2026 .css_online = cpuset_css_online, 1958 .css_online = cpuset_css_online,
2027 .css_offline = cpuset_css_offline, 1959 .css_offline = cpuset_css_offline,
@@ -2029,7 +1961,6 @@ struct cgroup_subsys cpuset_subsys = {
2029 .can_attach = cpuset_can_attach, 1961 .can_attach = cpuset_can_attach,
2030 .cancel_attach = cpuset_cancel_attach, 1962 .cancel_attach = cpuset_cancel_attach,
2031 .attach = cpuset_attach, 1963 .attach = cpuset_attach,
2032 .subsys_id = cpuset_subsys_id,
2033 .base_cftypes = files, 1964 .base_cftypes = files,
2034 .early_init = 1, 1965 .early_init = 1,
2035}; 1966};
@@ -2086,10 +2017,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2086 parent = parent_cs(parent); 2017 parent = parent_cs(parent);
2087 2018
2088 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2089 rcu_read_lock(); 2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
2090 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", 2021 pr_cont_cgroup_name(cs->css.cgroup);
2091 cgroup_name(cs->css.cgroup)); 2022 pr_cont("\n");
2092 rcu_read_unlock();
2093 } 2023 }
2094} 2024}
2095 2025
@@ -2137,7 +2067,7 @@ retry:
2137 */ 2067 */
2138 if ((sane && cpumask_empty(cs->cpus_allowed)) || 2068 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2139 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) 2069 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2140 update_tasks_cpumask(cs, NULL); 2070 update_tasks_cpumask(cs);
2141 2071
2142 mutex_lock(&callback_mutex); 2072 mutex_lock(&callback_mutex);
2143 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2073 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
@@ -2151,7 +2081,7 @@ retry:
2151 */ 2081 */
2152 if ((sane && nodes_empty(cs->mems_allowed)) || 2082 if ((sane && nodes_empty(cs->mems_allowed)) ||
2153 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) 2083 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2154 update_tasks_nodemask(cs, NULL); 2084 update_tasks_nodemask(cs);
2155 2085
2156 is_empty = cpumask_empty(cs->cpus_allowed) || 2086 is_empty = cpumask_empty(cs->cpus_allowed) ||
2157 nodes_empty(cs->mems_allowed); 2087 nodes_empty(cs->mems_allowed);
@@ -2213,7 +2143,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2213 mutex_lock(&callback_mutex); 2143 mutex_lock(&callback_mutex);
2214 top_cpuset.mems_allowed = new_mems; 2144 top_cpuset.mems_allowed = new_mems;
2215 mutex_unlock(&callback_mutex); 2145 mutex_unlock(&callback_mutex);
2216 update_tasks_nodemask(&top_cpuset, NULL); 2146 update_tasks_nodemask(&top_cpuset);
2217 } 2147 }
2218 2148
2219 mutex_unlock(&cpuset_mutex); 2149 mutex_unlock(&cpuset_mutex);
@@ -2305,10 +2235,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2305 struct cpuset *cpus_cs; 2235 struct cpuset *cpus_cs;
2306 2236
2307 mutex_lock(&callback_mutex); 2237 mutex_lock(&callback_mutex);
2308 task_lock(tsk); 2238 rcu_read_lock();
2309 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2239 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2310 guarantee_online_cpus(cpus_cs, pmask); 2240 guarantee_online_cpus(cpus_cs, pmask);
2311 task_unlock(tsk); 2241 rcu_read_unlock();
2312 mutex_unlock(&callback_mutex); 2242 mutex_unlock(&callback_mutex);
2313} 2243}
2314 2244
@@ -2361,10 +2291,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2361 nodemask_t mask; 2291 nodemask_t mask;
2362 2292
2363 mutex_lock(&callback_mutex); 2293 mutex_lock(&callback_mutex);
2364 task_lock(tsk); 2294 rcu_read_lock();
2365 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2295 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2366 guarantee_online_mems(mems_cs, &mask); 2296 guarantee_online_mems(mems_cs, &mask);
2367 task_unlock(tsk); 2297 rcu_read_unlock();
2368 mutex_unlock(&callback_mutex); 2298 mutex_unlock(&callback_mutex);
2369 2299
2370 return mask; 2300 return mask;
@@ -2480,10 +2410,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2480 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2410 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2481 mutex_lock(&callback_mutex); 2411 mutex_lock(&callback_mutex);
2482 2412
2483 task_lock(current); 2413 rcu_read_lock();
2484 cs = nearest_hardwall_ancestor(task_cs(current)); 2414 cs = nearest_hardwall_ancestor(task_cs(current));
2485 allowed = node_isset(node, cs->mems_allowed); 2415 allowed = node_isset(node, cs->mems_allowed);
2486 task_unlock(current); 2416 rcu_read_unlock();
2487 2417
2488 mutex_unlock(&callback_mutex); 2418 mutex_unlock(&callback_mutex);
2489 return allowed; 2419 return allowed;
@@ -2609,27 +2539,27 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2609 * @task: pointer to task_struct of some task. 2539 * @task: pointer to task_struct of some task.
2610 * 2540 *
2611 * Description: Prints @task's name, cpuset name, and cached copy of its 2541 * Description: Prints @task's name, cpuset name, and cached copy of its
2612 * mems_allowed to the kernel log. Must hold task_lock(task) to allow 2542 * mems_allowed to the kernel log.
2613 * dereferencing task_cs(task).
2614 */ 2543 */
2615void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2544void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2616{ 2545{
2617 /* Statically allocated to prevent using excess stack. */ 2546 /* Statically allocated to prevent using excess stack. */
2618 static char cpuset_nodelist[CPUSET_NODELIST_LEN]; 2547 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2619 static DEFINE_SPINLOCK(cpuset_buffer_lock); 2548 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2549 struct cgroup *cgrp;
2620 2550
2621 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2622
2623 rcu_read_lock();
2624 spin_lock(&cpuset_buffer_lock); 2551 spin_lock(&cpuset_buffer_lock);
2552 rcu_read_lock();
2625 2553
2554 cgrp = task_cs(tsk)->css.cgroup;
2626 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2627 tsk->mems_allowed); 2556 tsk->mems_allowed);
2628 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2557 printk(KERN_INFO "%s cpuset=", tsk->comm);
2629 tsk->comm, cgroup_name(cgrp), cpuset_nodelist); 2558 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2630 2560
2631 spin_unlock(&cpuset_buffer_lock);
2632 rcu_read_unlock(); 2561 rcu_read_unlock();
2562 spin_unlock(&cpuset_buffer_lock);
2633} 2563}
2634 2564
2635/* 2565/*
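
The hunk above (and the remove_tasks_in_empty_cpuset() change earlier) switches from cgroup_name() + printk("%s") to the new pr_cont_cgroup_name() helper, which prints the name piecewise and needs no caller-side rcu_read_lock() for the name itself. A minimal sketch; foo_report() is invented:

/* Illustrative only -- foo_report() is not part of this patch. */
static void foo_report(struct cgroup *cgrp, const char *why)
{
        pr_err("cpuset: %s in cgroup ", why);
        pr_cont_cgroup_name(cgrp);
        pr_cont("\n");
}
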
@@ -2660,9 +2590,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
2660 2590
2661void __cpuset_memory_pressure_bump(void) 2591void __cpuset_memory_pressure_bump(void)
2662{ 2592{
2663 task_lock(current); 2593 rcu_read_lock();
2664 fmeter_markevent(&task_cs(current)->fmeter); 2594 fmeter_markevent(&task_cs(current)->fmeter);
2665 task_unlock(current); 2595 rcu_read_unlock();
2666} 2596}
2667 2597
2668#ifdef CONFIG_PROC_PID_CPUSET 2598#ifdef CONFIG_PROC_PID_CPUSET
@@ -2679,12 +2609,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2679{ 2609{
2680 struct pid *pid; 2610 struct pid *pid;
2681 struct task_struct *tsk; 2611 struct task_struct *tsk;
2682 char *buf; 2612 char *buf, *p;
2683 struct cgroup_subsys_state *css; 2613 struct cgroup_subsys_state *css;
2684 int retval; 2614 int retval;
2685 2615
2686 retval = -ENOMEM; 2616 retval = -ENOMEM;
2687 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2617 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2688 if (!buf) 2618 if (!buf)
2689 goto out; 2619 goto out;
2690 2620
@@ -2694,14 +2624,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2694 if (!tsk) 2624 if (!tsk)
2695 goto out_free; 2625 goto out_free;
2696 2626
2627 retval = -ENAMETOOLONG;
2697 rcu_read_lock(); 2628 rcu_read_lock();
2698 css = task_css(tsk, cpuset_subsys_id); 2629 css = task_css(tsk, cpuset_cgrp_id);
2699 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2630 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2700 rcu_read_unlock(); 2631 rcu_read_unlock();
2701 if (retval < 0) 2632 if (!p)
2702 goto out_put_task; 2633 goto out_put_task;
2703 seq_puts(m, buf); 2634 seq_puts(m, p);
2704 seq_putc(m, '\n'); 2635 seq_putc(m, '\n');
2636 retval = 0;
2705out_put_task: 2637out_put_task:
2706 put_task_struct(tsk); 2638 put_task_struct(tsk);
2707out_free: 2639out_free:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 661951ab8ae7..f83a71a3e46d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -361,7 +361,7 @@ struct perf_cgroup {
361static inline struct perf_cgroup * 361static inline struct perf_cgroup *
362perf_cgroup_from_task(struct task_struct *task) 362perf_cgroup_from_task(struct task_struct *task)
363{ 363{
364 return container_of(task_css(task, perf_subsys_id), 364 return container_of(task_css(task, perf_event_cgrp_id),
365 struct perf_cgroup, css); 365 struct perf_cgroup, css);
366} 366}
367 367
@@ -389,11 +389,6 @@ perf_cgroup_match(struct perf_event *event)
389 event->cgrp->css.cgroup); 389 event->cgrp->css.cgroup);
390} 390}
391 391
392static inline bool perf_tryget_cgroup(struct perf_event *event)
393{
394 return css_tryget(&event->cgrp->css);
395}
396
397static inline void perf_put_cgroup(struct perf_event *event) 392static inline void perf_put_cgroup(struct perf_event *event)
398{ 393{
399 css_put(&event->cgrp->css); 394 css_put(&event->cgrp->css);
@@ -612,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
612 if (!f.file) 607 if (!f.file)
613 return -EBADF; 608 return -EBADF;
614 609
615 rcu_read_lock(); 610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
616
617 css = css_from_dir(f.file->f_dentry, &perf_subsys);
618 if (IS_ERR(css)) { 611 if (IS_ERR(css)) {
619 ret = PTR_ERR(css); 612 ret = PTR_ERR(css);
620 goto out; 613 goto out;
@@ -623,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
623 cgrp = container_of(css, struct perf_cgroup, css); 616 cgrp = container_of(css, struct perf_cgroup, css);
624 event->cgrp = cgrp; 617 event->cgrp = cgrp;
625 618
626 /* must be done before we fput() the file */
627 if (!perf_tryget_cgroup(event)) {
628 event->cgrp = NULL;
629 ret = -ENOENT;
630 goto out;
631 }
632
633 /* 619 /*
634 * all events in a group must monitor 620 * all events in a group must monitor
635 * the same cgroup because a task belongs 621 * the same cgroup because a task belongs
@@ -640,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
640 ret = -EINVAL; 626 ret = -EINVAL;
641 } 627 }
642out: 628out:
643 rcu_read_unlock();
644 fdput(f); 629 fdput(f);
645 return ret; 630 return ret;
646} 631}
@@ -8053,7 +8038,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8053{ 8038{
8054 struct task_struct *task; 8039 struct task_struct *task;
8055 8040
8056 cgroup_taskset_for_each(task, css, tset) 8041 cgroup_taskset_for_each(task, tset)
8057 task_function_call(task, __perf_cgroup_move, task); 8042 task_function_call(task, __perf_cgroup_move, task);
8058} 8043}
8059 8044
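
perf_cgroup_attach() above shows the other conversion repeated across blkcg, freezer, cpuset and the cpu controller in this series: cgroup_taskset_for_each() loses its css argument and now takes only the task cursor and the taskset. Sketch of an ->attach() callback in the new form; foo_attach() and foo_move_task() are placeholders, not functions from this patch:

/* Illustrative only -- foo_* are placeholders, not from this patch. */
static void foo_attach(struct cgroup_subsys_state *css,
                       struct cgroup_taskset *tset)
{
        struct task_struct *task;

        /* the css argument is gone; the macro walks every task in @tset */
        cgroup_taskset_for_each(task, tset)
                foo_move_task(css, task);
}
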
@@ -8072,9 +8057,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8072 task_function_call(task, __perf_cgroup_move, task); 8057 task_function_call(task, __perf_cgroup_move, task);
8073} 8058}
8074 8059
8075struct cgroup_subsys perf_subsys = { 8060struct cgroup_subsys perf_event_cgrp_subsys = {
8076 .name = "perf_event",
8077 .subsys_id = perf_subsys_id,
8078 .css_alloc = perf_cgroup_css_alloc, 8061 .css_alloc = perf_cgroup_css_alloc,
8079 .css_free = perf_cgroup_css_free, 8062 .css_free = perf_cgroup_css_free,
8080 .exit = perf_cgroup_exit, 8063 .exit = perf_cgroup_exit,
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc645317..6480d1c85d7a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -797,7 +797,7 @@ void do_exit(long code)
797 */ 797 */
798 perf_event_exit_task(tsk); 798 perf_event_exit_task(tsk);
799 799
800 cgroup_exit(tsk, 1); 800 cgroup_exit(tsk);
801 801
802 if (group_dead) 802 if (group_dead)
803 disassociate_ctty(1); 803 disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 332688e5e7b4..abc45890f0a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1272,7 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1272 if (IS_ERR(p->mempolicy)) { 1272 if (IS_ERR(p->mempolicy)) {
1273 retval = PTR_ERR(p->mempolicy); 1273 retval = PTR_ERR(p->mempolicy);
1274 p->mempolicy = NULL; 1274 p->mempolicy = NULL;
1275 goto bad_fork_cleanup_cgroup; 1275 goto bad_fork_cleanup_threadgroup_lock;
1276 } 1276 }
1277 mpol_fix_fork_child_flag(p); 1277 mpol_fix_fork_child_flag(p);
1278#endif 1278#endif
@@ -1525,11 +1525,10 @@ bad_fork_cleanup_policy:
1525 perf_event_free_task(p); 1525 perf_event_free_task(p);
1526#ifdef CONFIG_NUMA 1526#ifdef CONFIG_NUMA
1527 mpol_put(p->mempolicy); 1527 mpol_put(p->mempolicy);
1528bad_fork_cleanup_cgroup: 1528bad_fork_cleanup_threadgroup_lock:
1529#endif 1529#endif
1530 if (clone_flags & CLONE_THREAD) 1530 if (clone_flags & CLONE_THREAD)
1531 threadgroup_change_end(current); 1531 threadgroup_change_end(current);
1532 cgroup_exit(p, 0);
1533 delayacct_tsk_free(p); 1532 delayacct_tsk_free(p);
1534 module_put(task_thread_info(p)->exec_domain->module); 1533 module_put(task_thread_info(p)->exec_domain->module);
1535bad_fork_cleanup_count: 1534bad_fork_cleanup_count:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9cae286824bb..1d1b87b36778 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7230,7 +7230,7 @@ void sched_move_task(struct task_struct *tsk)
7230 if (unlikely(running)) 7230 if (unlikely(running))
7231 tsk->sched_class->put_prev_task(rq, tsk); 7231 tsk->sched_class->put_prev_task(rq, tsk);
7232 7232
7233 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, 7233 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7234 lockdep_is_held(&tsk->sighand->siglock)), 7234 lockdep_is_held(&tsk->sighand->siglock)),
7235 struct task_group, css); 7235 struct task_group, css);
7236 tg = autogroup_task_group(tsk, tg); 7236 tg = autogroup_task_group(tsk, tg);
@@ -7657,7 +7657,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7657{ 7657{
7658 struct task_struct *task; 7658 struct task_struct *task;
7659 7659
7660 cgroup_taskset_for_each(task, css, tset) { 7660 cgroup_taskset_for_each(task, tset) {
7661#ifdef CONFIG_RT_GROUP_SCHED 7661#ifdef CONFIG_RT_GROUP_SCHED
7662 if (!sched_rt_can_attach(css_tg(css), task)) 7662 if (!sched_rt_can_attach(css_tg(css), task))
7663 return -EINVAL; 7663 return -EINVAL;
@@ -7675,7 +7675,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7675{ 7675{
7676 struct task_struct *task; 7676 struct task_struct *task;
7677 7677
7678 cgroup_taskset_for_each(task, css, tset) 7678 cgroup_taskset_for_each(task, tset)
7679 sched_move_task(task); 7679 sched_move_task(task);
7680} 7680}
7681 7681
@@ -8014,8 +8014,7 @@ static struct cftype cpu_files[] = {
8014 { } /* terminate */ 8014 { } /* terminate */
8015}; 8015};
8016 8016
8017struct cgroup_subsys cpu_cgroup_subsys = { 8017struct cgroup_subsys cpu_cgrp_subsys = {
8018 .name = "cpu",
8019 .css_alloc = cpu_cgroup_css_alloc, 8018 .css_alloc = cpu_cgroup_css_alloc,
8020 .css_free = cpu_cgroup_css_free, 8019 .css_free = cpu_cgroup_css_free,
8021 .css_online = cpu_cgroup_css_online, 8020 .css_online = cpu_cgroup_css_online,
@@ -8023,7 +8022,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8023 .can_attach = cpu_cgroup_can_attach, 8022 .can_attach = cpu_cgroup_can_attach,
8024 .attach = cpu_cgroup_attach, 8023 .attach = cpu_cgroup_attach,
8025 .exit = cpu_cgroup_exit, 8024 .exit = cpu_cgroup_exit,
8026 .subsys_id = cpu_cgroup_subsys_id,
8027 .base_cftypes = cpu_files, 8025 .base_cftypes = cpu_files,
8028 .early_init = 1, 8026 .early_init = 1,
8029}; 8027};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
41/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
42static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
43{ 43{
44 return css_ca(task_css(tsk, cpuacct_subsys_id)); 44 return css_ca(task_css(tsk, cpuacct_cgrp_id));
45} 45}
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275 rcu_read_unlock(); 275 rcu_read_unlock();
276} 276}
277 277
278struct cgroup_subsys cpuacct_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .name = "cpuacct",
280 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
281 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
282 .subsys_id = cpuacct_subsys_id,
283 .base_cftypes = files, 281 .base_cftypes = files,
284 .early_init = 1, 282 .early_init = 1,
285}; 283};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
111 if (autogroup_path(tg, group_path, PATH_MAX)) 111 if (autogroup_path(tg, group_path, PATH_MAX))
112 return group_path; 112 return group_path;
113 113
114 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 114 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
115 return group_path;
116} 115}
117#endif 116#endif
118 117
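The task_group_path() change above returns the result of cgroup_path() directly, which implies the function now hands back the formatted buffer rather than an integer length or error. Assumed prototype, inferred from this call site only:

	/*
	 * Inferred, not quoted from the header: cgroup_path() returns the
	 * buffer on success (or NULL if the path does not fit), so callers
	 * such as task_group_path() can simply return it.
	 */
	char *cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen);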
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index cb00829bb466..595d7fd795e1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -30,7 +30,6 @@ struct hugetlb_cgroup {
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) 30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff) 31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32 32
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly; 33static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35 34
36static inline 35static inline
@@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
42static inline 41static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 42struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
44{ 43{
45 return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); 44 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
46} 45}
47 46
48static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 47static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -255,7 +254,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
255} 254}
256 255
257static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 256static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
258 struct cftype *cft, const char *buffer) 257 struct cftype *cft, char *buffer)
259{ 258{
260 int idx, name, ret; 259 int idx, name, ret;
261 unsigned long long val; 260 unsigned long long val;
@@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx)
358 cft = &h->cgroup_files[4]; 357 cft = &h->cgroup_files[4];
359 memset(cft, 0, sizeof(*cft)); 358 memset(cft, 0, sizeof(*cft));
360 359
361 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 360 WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
362 361
363 return; 362 return;
364} 363}
@@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
402 return; 401 return;
403} 402}
404 403
405struct cgroup_subsys hugetlb_subsys = { 404struct cgroup_subsys hugetlb_cgrp_subsys = {
406 .name = "hugetlb",
407 .css_alloc = hugetlb_cgroup_css_alloc, 405 .css_alloc = hugetlb_cgroup_css_alloc,
408 .css_offline = hugetlb_cgroup_css_offline, 406 .css_offline = hugetlb_cgroup_css_offline,
409 .css_free = hugetlb_cgroup_css_free, 407 .css_free = hugetlb_cgroup_css_free,
410 .subsys_id = hugetlb_subsys_id,
411}; 408};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b6b0039f725..dcc8153a1681 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,8 +66,8 @@
66 66
67#include <trace/events/vmscan.h> 67#include <trace/events/vmscan.h>
68 68
69struct cgroup_subsys mem_cgroup_subsys __read_mostly; 69struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70EXPORT_SYMBOL(mem_cgroup_subsys); 70EXPORT_SYMBOL(memory_cgrp_subsys);
71 71
72#define MEM_CGROUP_RECLAIM_RETRIES 5 72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly; 73static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
538{ 538{
539 struct cgroup_subsys_state *css; 539 struct cgroup_subsys_state *css;
540 540
541 css = css_from_id(id - 1, &mem_cgroup_subsys); 541 css = css_from_id(id - 1, &memory_cgrp_subsys);
542 return mem_cgroup_from_css(css); 542 return mem_cgroup_from_css(css);
543} 543}
544 544
@@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1072 if (unlikely(!p)) 1072 if (unlikely(!p))
1073 return NULL; 1073 return NULL;
1074 1074
1075 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); 1075 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1076} 1076}
1077 1077
1078struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1078struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1683,15 +1683,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1683 */ 1683 */
1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1685{ 1685{
1686 /* 1686 /* oom_info_lock ensures that parallel ooms do not interleave */
1687 * protects memcg_name and makes sure that parallel ooms do not
1688 * interleave
1689 */
1690 static DEFINE_MUTEX(oom_info_lock); 1687 static DEFINE_MUTEX(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1693 static char memcg_name[PATH_MAX];
1694 int ret;
1695 struct mem_cgroup *iter; 1688 struct mem_cgroup *iter;
1696 unsigned int i; 1689 unsigned int i;
1697 1690
@@ -1701,36 +1694,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1701 mutex_lock(&oom_info_lock); 1694 mutex_lock(&oom_info_lock);
1702 rcu_read_lock(); 1695 rcu_read_lock();
1703 1696
1704 mem_cgrp = memcg->css.cgroup; 1697 pr_info("Task in ");
1705 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1698 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1706 1699 pr_info(" killed as a result of limit of ");
1707 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1700 pr_cont_cgroup_path(memcg->css.cgroup);
1708 if (ret < 0) { 1701 pr_info("\n");
1709 /*
1710 * Unfortunately, we are unable to convert to a useful name
1711 * But we'll still print out the usage information
1712 */
1713 rcu_read_unlock();
1714 goto done;
1715 }
1716 rcu_read_unlock();
1717
1718 pr_info("Task in %s killed", memcg_name);
1719 1702
1720 rcu_read_lock();
1721 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1722 if (ret < 0) {
1723 rcu_read_unlock();
1724 goto done;
1725 }
1726 rcu_read_unlock(); 1703 rcu_read_unlock();
1727 1704
1728 /*
1729 * Continues from above, so we don't need an KERN_ level
1730 */
1731 pr_cont(" as a result of limit of %s\n", memcg_name);
1732done:
1733
1734 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1705 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1735 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1706 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1736 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1707 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
@@ -1745,13 +1716,8 @@ done:
1745 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1716 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1746 1717
1747 for_each_mem_cgroup_tree(iter, memcg) { 1718 for_each_mem_cgroup_tree(iter, memcg) {
1748 pr_info("Memory cgroup stats"); 1719 pr_info("Memory cgroup stats for ");
1749 1720 pr_cont_cgroup_path(iter->css.cgroup);
1750 rcu_read_lock();
1751 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1752 if (!ret)
1753 pr_cont(" for %s", memcg_name);
1754 rcu_read_unlock();
1755 pr_cont(":"); 1721 pr_cont(":");
1756 1722
1757 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1723 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3401 struct kmem_cache *s) 3367 struct kmem_cache *s)
3402{ 3368{
3403 struct kmem_cache *new = NULL; 3369 struct kmem_cache *new = NULL;
3404 static char *tmp_name = NULL; 3370 static char *tmp_path = NULL, *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex); /* protects tmp_name */ 3371 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3406 3372
3407 BUG_ON(!memcg_can_account_kmem(memcg)); 3373 BUG_ON(!memcg_can_account_kmem(memcg));
@@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3413 * This static temporary buffer is used to prevent from 3379 * This static temporary buffer is used to prevent from
3414 * pointless shortliving allocation. 3380 * pointless shortliving allocation.
3415 */ 3381 */
3416 if (!tmp_name) { 3382 if (!tmp_path || !tmp_name) {
3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3383 if (!tmp_path)
3384 tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
3418 if (!tmp_name) 3385 if (!tmp_name)
3386 tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3387 if (!tmp_path || !tmp_name)
3419 goto out; 3388 goto out;
3420 } 3389 }
3421 3390
3422 rcu_read_lock(); 3391 cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
3423 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, 3392 snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
3424 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); 3393 memcg_cache_id(memcg), tmp_name);
3425 rcu_read_unlock();
3426 3394
3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3395 new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
3428 (s->flags & ~SLAB_PANIC), s->ctor, s); 3396 (s->flags & ~SLAB_PANIC), s->ctor, s);
3429 if (new) 3397 if (new)
3430 new->allocflags |= __GFP_KMEMCG; 3398 new->allocflags |= __GFP_KMEMCG;
@@ -4990,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4990 struct cgroup *cgrp = memcg->css.cgroup; 4958 struct cgroup *cgrp = memcg->css.cgroup;
4991 4959
4992 /* returns EBUSY if there is a task or if we come here twice. */ 4960 /* returns EBUSY if there is a task or if we come here twice. */
4993 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 4961 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
4994 return -EBUSY; 4962 return -EBUSY;
4995 4963
4996 /* we call try-to-free pages for make this cgroup empty */ 4964 /* we call try-to-free pages for make this cgroup empty */
@@ -5172,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5172 * of course permitted. 5140 * of course permitted.
5173 */ 5141 */
5174 mutex_lock(&memcg_create_mutex); 5142 mutex_lock(&memcg_create_mutex);
5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) 5143 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
5176 err = -EBUSY; 5144 err = -EBUSY;
5177 mutex_unlock(&memcg_create_mutex); 5145 mutex_unlock(&memcg_create_mutex);
5178 if (err) 5146 if (err)
@@ -5274,7 +5242,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5274 * RES_LIMIT. 5242 * RES_LIMIT.
5275 */ 5243 */
5276static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5244static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5277 const char *buffer) 5245 char *buffer)
5278{ 5246{
5279 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5247 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5280 enum res_type type; 5248 enum res_type type;
@@ -6095,7 +6063,7 @@ static void memcg_event_ptable_queue_proc(struct file *file,
6095 * Interpretation of args is defined by control file implementation. 6063 * Interpretation of args is defined by control file implementation.
6096 */ 6064 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css, 6065static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer) 6066 struct cftype *cft, char *buffer)
6099{ 6067{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6068 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event; 6069 struct mem_cgroup_event *event;
@@ -6183,17 +6151,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
6183 * automatically removed on cgroup destruction but the removal is 6151 * automatically removed on cgroup destruction but the removal is
6184 * asynchronous, so take an extra ref on @css. 6152 * asynchronous, so take an extra ref on @css.
6185 */ 6153 */
6186 rcu_read_lock(); 6154 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
6187 6155 &memory_cgrp_subsys);
6188 ret = -EINVAL; 6156 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, 6157 if (IS_ERR(cfile_css))
6190 &mem_cgroup_subsys); 6158 goto out_put_cfile;
6191 if (cfile_css == css && css_tryget(css)) 6159 if (cfile_css != css) {
6192 ret = 0; 6160 css_put(cfile_css);
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile; 6161 goto out_put_cfile;
6162 }
6197 6163
6198 ret = event->register_event(memcg, event->eventfd, buffer); 6164 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret) 6165 if (ret)
@@ -6566,11 +6532,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6566 * unfortunate state in our controller. 6532 * unfortunate state in our controller.
6567 */ 6533 */
6568 if (parent != root_mem_cgroup) 6534 if (parent != root_mem_cgroup)
6569 mem_cgroup_subsys.broken_hierarchy = true; 6535 memory_cgrp_subsys.broken_hierarchy = true;
6570 } 6536 }
6571 mutex_unlock(&memcg_create_mutex); 6537 mutex_unlock(&memcg_create_mutex);
6572 6538
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys); 6539 return memcg_init_kmem(memcg, &memory_cgrp_subsys);
6574} 6540}
6575 6541
6576/* 6542/*
@@ -7272,9 +7238,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7272 mem_cgroup_from_css(root_css)->use_hierarchy = true; 7238 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7273} 7239}
7274 7240
7275struct cgroup_subsys mem_cgroup_subsys = { 7241struct cgroup_subsys memory_cgrp_subsys = {
7276 .name = "memory",
7277 .subsys_id = mem_cgroup_subsys_id,
7278 .css_alloc = mem_cgroup_css_alloc, 7242 .css_alloc = mem_cgroup_css_alloc,
7279 .css_online = mem_cgroup_css_online, 7243 .css_online = mem_cgroup_css_online,
7280 .css_offline = mem_cgroup_css_offline, 7244 .css_offline = mem_cgroup_css_offline,
@@ -7300,7 +7264,7 @@ __setup("swapaccount=", enable_swap_account);
7300 7264
7301static void __init memsw_file_init(void) 7265static void __init memsw_file_init(void)
7302{ 7266{
7303 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); 7267 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
7304} 7268}
7305 7269
7306static void __init enable_swap_cgroup(void) 7270static void __init enable_swap_cgroup(void)
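The OOM report and the per-memcg stats dump above now print cgroup paths through pr_cont_cgroup_path() instead of copying them into the static memcg_name buffer under RCU. A minimal sketch of such a helper, assuming it simply formats the path into a temporary buffer and continues the current console line (the buffering strategy here is an assumption, not the real implementation):

	/*
	 * Illustrative only: format the cgroup path into a local buffer and
	 * continue the current printk line. Buffer size is arbitrary for
	 * the sketch.
	 */
	static void pr_cont_cgroup_path_sketch(struct cgroup *cgrp)
	{
		char buf[256];

		if (cgroup_path(cgrp, buf, sizeof(buf)))
			pr_cont("%s", buf);
	}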
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 90002ea43638..35ef28acf137 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
145 return -EINVAL; 145 return -EINVAL;
146 146
147 css = mem_cgroup_css(mem); 147 css = mem_cgroup_css(mem);
148 /* root_mem_cgroup has NULL dentries */ 148 ino = cgroup_ino(css->cgroup);
149 if (!css->cgroup->dentry)
150 return -EINVAL;
151
152 ino = css->cgroup->dentry->d_inode->i_ino;
153 css_put(css); 149 css_put(css);
154 150
155 if (ino != hwpoison_filter_memcg) 151 if (!ino || ino != hwpoison_filter_memcg)
156 return -EINVAL; 152 return -EINVAL;
157 153
158 return 0; 154 return 0;
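With cgroup_ino() in place, the hwpoison memcg filter compares against the memory cgroup directory's inode number instead of dereferencing a dentry. A hedged userspace illustration of how that number can be obtained; the cgroup mount point and group name are examples only, not fixed interfaces:

	/* Userspace example (hypothetical path): stat the memcg directory to
	 * get the inode number that hwpoison_filter_memcg is compared against. */
	#include <sys/stat.h>
	#include <stdio.h>

	int main(void)
	{
		struct stat st;

		if (stat("/sys/fs/cgroup/memory/test", &st))	/* hypothetical cgroup */
			return 1;
		printf("memcg ino: %lu\n", (unsigned long)st.st_ino);
		return 0;
	}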
diff --git a/net/Kconfig b/net/Kconfig
index d1f6f968fc09..d92afe4204d9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,7 +243,7 @@ config XPS
243 default y 243 default y
244 244
245config CGROUP_NET_PRIO 245config CGROUP_NET_PRIO
246 tristate "Network priority cgroup" 246 bool "Network priority cgroup"
247 depends on CGROUPS 247 depends on CGROUPS
248 ---help--- 248 ---help---
249 Cgroup subsystem for use in assigning processes to network priorities on 249 Cgroup subsystem for use in assigning processes to network priorities on
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd541668..22931e1b99b4 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
23 23
24struct cgroup_cls_state *task_cls_state(struct task_struct *p) 24struct cgroup_cls_state *task_cls_state(struct task_struct *p)
25{ 25{
26 return css_cls_state(task_css(p, net_cls_subsys_id)); 26 return css_cls_state(task_css(p, net_cls_cgrp_id));
27} 27}
28EXPORT_SYMBOL_GPL(task_cls_state); 28EXPORT_SYMBOL_GPL(task_cls_state);
29 29
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
73 void *v = (void *)(unsigned long)cs->classid; 73 void *v = (void *)(unsigned long)cs->classid;
74 struct task_struct *p; 74 struct task_struct *p;
75 75
76 cgroup_taskset_for_each(p, css, tset) { 76 cgroup_taskset_for_each(p, tset) {
77 task_lock(p); 77 task_lock(p);
78 iterate_fd(p->files, 0, update_classid, v); 78 iterate_fd(p->files, 0, update_classid, v);
79 task_unlock(p); 79 task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
102 { } /* terminate */ 102 { } /* terminate */
103}; 103};
104 104
105struct cgroup_subsys net_cls_subsys = { 105struct cgroup_subsys net_cls_cgrp_subsys = {
106 .name = "net_cls",
107 .css_alloc = cgrp_css_alloc, 106 .css_alloc = cgrp_css_alloc,
108 .css_online = cgrp_css_online, 107 .css_online = cgrp_css_online,
109 .css_free = cgrp_css_free, 108 .css_free = cgrp_css_free,
110 .attach = cgrp_attach, 109 .attach = cgrp_attach,
111 .subsys_id = net_cls_subsys_id,
112 .base_cftypes = ss_files, 110 .base_cftypes = ss_files,
113 .module = THIS_MODULE,
114}; 111};
115
116static int __init init_netclassid_cgroup(void)
117{
118 return cgroup_load_subsys(&net_cls_subsys);
119}
120__initcall(init_netclassid_cgroup);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd08..3825f669147b 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -186,7 +186,7 @@ static int read_priomap(struct seq_file *sf, void *v)
186} 186}
187 187
188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
189 const char *buffer) 189 char *buffer)
190{ 190{
191 char devname[IFNAMSIZ + 1]; 191 char devname[IFNAMSIZ + 1];
192 struct net_device *dev; 192 struct net_device *dev;
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
224 struct task_struct *p; 224 struct task_struct *p;
225 void *v = (void *)(unsigned long)css->cgroup->id; 225 void *v = (void *)(unsigned long)css->cgroup->id;
226 226
227 cgroup_taskset_for_each(p, css, tset) { 227 cgroup_taskset_for_each(p, tset) {
228 task_lock(p); 228 task_lock(p);
229 iterate_fd(p->files, 0, update_netprio, v); 229 iterate_fd(p->files, 0, update_netprio, v);
230 task_unlock(p); 230 task_unlock(p);
@@ -244,15 +244,12 @@ static struct cftype ss_files[] = {
244 { } /* terminate */ 244 { } /* terminate */
245}; 245};
246 246
247struct cgroup_subsys net_prio_subsys = { 247struct cgroup_subsys net_prio_cgrp_subsys = {
248 .name = "net_prio",
249 .css_alloc = cgrp_css_alloc, 248 .css_alloc = cgrp_css_alloc,
250 .css_online = cgrp_css_online, 249 .css_online = cgrp_css_online,
251 .css_free = cgrp_css_free, 250 .css_free = cgrp_css_free,
252 .attach = net_prio_attach, 251 .attach = net_prio_attach,
253 .subsys_id = net_prio_subsys_id,
254 .base_cftypes = ss_files, 252 .base_cftypes = ss_files,
255 .module = THIS_MODULE,
256}; 253};
257 254
258static int netprio_device_event(struct notifier_block *unused, 255static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
283 280
284static int __init init_cgroup_netprio(void) 281static int __init init_cgroup_netprio(void)
285{ 282{
286 int ret;
287
288 ret = cgroup_load_subsys(&net_prio_subsys);
289 if (ret)
290 goto out;
291
292 register_netdevice_notifier(&netprio_device_notifier); 283 register_netdevice_notifier(&netprio_device_notifier);
293 284 return 0;
294out:
295 return ret;
296}
297
298static void __exit exit_cgroup_netprio(void)
299{
300 struct netprio_map *old;
301 struct net_device *dev;
302
303 unregister_netdevice_notifier(&netprio_device_notifier);
304
305 cgroup_unload_subsys(&net_prio_subsys);
306
307 rtnl_lock();
308 for_each_netdev(&init_net, dev) {
309 old = rtnl_dereference(dev->priomap);
310 RCU_INIT_POINTER(dev->priomap, NULL);
311 if (old)
312 kfree_rcu(old, rcu);
313 }
314 rtnl_unlock();
315} 285}
316 286
317module_init(init_cgroup_netprio); 287subsys_initcall(init_cgroup_netprio);
318module_exit(exit_cgroup_netprio);
319MODULE_LICENSE("GPL v2"); 288MODULE_LICENSE("GPL v2");
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558ba..d4f015ad6c84 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
103} 103}
104 104
105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
106 const char *buffer) 106 char *buffer)
107{ 107{
108 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 108 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
109 unsigned long long val; 109 unsigned long long val;
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
219 219
220static int __init tcp_memcontrol_init(void) 220static int __init tcp_memcontrol_init(void)
221{ 221{
222 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); 222 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
223 return 0; 223 return 0;
224} 224}
225__initcall(tcp_memcontrol_init); 225__initcall(tcp_memcontrol_init);
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d3b6d2cd3a06..8365909f5f8c 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -58,11 +58,9 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
58 58
59static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 59static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
60{ 60{
61 return css_to_devcgroup(task_css(task, devices_subsys_id)); 61 return css_to_devcgroup(task_css(task, devices_cgrp_id));
62} 62}
63 63
64struct cgroup_subsys devices_subsys;
65
66/* 64/*
67 * called under devcgroup_mutex 65 * called under devcgroup_mutex
68 */ 66 */
@@ -498,7 +496,7 @@ static inline bool has_children(struct dev_cgroup *devcgroup)
498 * parent cgroup has the access you're asking for. 496 * parent cgroup has the access you're asking for.
499 */ 497 */
500static int devcgroup_update_access(struct dev_cgroup *devcgroup, 498static int devcgroup_update_access(struct dev_cgroup *devcgroup,
501 int filetype, const char *buffer) 499 int filetype, char *buffer)
502{ 500{
503 const char *b; 501 const char *b;
504 char temp[12]; /* 11 + 1 characters needed for a u32 */ 502 char temp[12]; /* 11 + 1 characters needed for a u32 */
@@ -654,7 +652,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
654} 652}
655 653
656static int devcgroup_access_write(struct cgroup_subsys_state *css, 654static int devcgroup_access_write(struct cgroup_subsys_state *css,
657 struct cftype *cft, const char *buffer) 655 struct cftype *cft, char *buffer)
658{ 656{
659 int retval; 657 int retval;
660 658
@@ -684,13 +682,11 @@ static struct cftype dev_cgroup_files[] = {
684 { } /* terminate */ 682 { } /* terminate */
685}; 683};
686 684
687struct cgroup_subsys devices_subsys = { 685struct cgroup_subsys devices_cgrp_subsys = {
688 .name = "devices",
689 .css_alloc = devcgroup_css_alloc, 686 .css_alloc = devcgroup_css_alloc,
690 .css_free = devcgroup_css_free, 687 .css_free = devcgroup_css_free,
691 .css_online = devcgroup_online, 688 .css_online = devcgroup_online,
692 .css_offline = devcgroup_offline, 689 .css_offline = devcgroup_offline,
693 .subsys_id = devices_subsys_id,
694 .base_cftypes = dev_cgroup_files, 690 .base_cftypes = dev_cgroup_files,
695}; 691};
696 692
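Several write handlers in this patch (hugetlb_cgroup_write, mem_cgroup_write, memcg_write_event_control, write_priomap, tcp_cgroup_write, devcgroup_access_write) switch their buffer argument from const char * to char *. A minimal sketch of what the writable buffer enables, assuming a handler that trims and parses in place; the function name and the use of the parsed value are illustrative only:

	/*
	 * Sketch of a write handler with the non-const buffer: whitespace
	 * can be stripped in place before parsing.
	 */
	static int example_cgroup_write(struct cgroup_subsys_state *css,
					struct cftype *cft, char *buffer)
	{
		u64 val;
		int ret;

		buffer = strstrip(buffer);	/* legal now that the buffer is writable */
		ret = kstrtou64(buffer, 0, &val);
		if (ret)
			return ret;
		/* ... apply 'val' to the controller owning @css ... */
		return 0;
	}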