aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroup-v2.txt60
-rw-r--r--include/linux/cgroup-defs.h12
-rw-r--r--init/Kconfig7
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c155
-rw-r--r--kernel/cgroup/cgroup.c155
-rw-r--r--kernel/cgroup/debug.c357
8 files changed, 548 insertions, 201 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dc5e2dcdbef4..558c3a739baf 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing
149and experimenting easier, the kernel parameter cgroup_no_v1= allows 149and experimenting easier, the kernel parameter cgroup_no_v1= allows
150disabling controllers in v1 and make them always available in v2. 150disabling controllers in v1 and make them always available in v2.
151 151
152cgroup v2 currently supports the following mount options.
153
154 nsdelegate
155
156 Consider cgroup namespaces as delegation boundaries. This
157 option is system wide and can only be set on mount or modified
158 through remount from the init namespace. The mount option is
159 ignored on non-init namespace mounts. Please refer to the
160 Delegation section for details.
161
152 162
1532-2. Organizing Processes 1632-2. Organizing Processes
154 164
@@ -308,18 +318,27 @@ file.
308 318
3092-5-1. Model of Delegation 3192-5-1. Model of Delegation
310 320
311A cgroup can be delegated to a less privileged user by granting write 321A cgroup can be delegated in two ways. First, to a less privileged
312access of the directory and its "cgroup.procs" file to the user. Note 322user by granting write access of the directory and its "cgroup.procs"
313that resource control interface files in a given directory control the 323and "cgroup.subtree_control" files to the user. Second, if the
314distribution of the parent's resources and thus must not be delegated 324"nsdelegate" mount option is set, automatically to a cgroup namespace
315along with the directory. 325on namespace creation.
316 326
317Once delegated, the user can build sub-hierarchy under the directory, 327Because the resource control interface files in a given directory
318organize processes as it sees fit and further distribute the resources 328control the distribution of the parent's resources, the delegatee
319it received from the parent. The limits and other settings of all 329shouldn't be allowed to write to them. For the first method, this is
320resource controllers are hierarchical and regardless of what happens 330achieved by not granting access to these files. For the second, the
321in the delegated sub-hierarchy, nothing can escape the resource 331kernel rejects writes to all files other than "cgroup.procs" and
322restrictions imposed by the parent. 332"cgroup.subtree_control" on a namespace root from inside the
333namespace.
334
335The end results are equivalent for both delegation types. Once
336delegated, the user can build sub-hierarchy under the directory,
337organize processes inside it as it sees fit and further distribute the
338resources it received from the parent. The limits and other settings
339of all resource controllers are hierarchical and regardless of what
340happens in the delegated sub-hierarchy, nothing can escape the
341resource restrictions imposed by the parent.
323 342
324Currently, cgroup doesn't impose any restrictions on the number of 343Currently, cgroup doesn't impose any restrictions on the number of
325cgroups in or nesting depth of a delegated sub-hierarchy; however, 344cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -329,10 +348,12 @@ this may be limited explicitly in the future.
3292-5-2. Delegation Containment 3482-5-2. Delegation Containment
330 349
331A delegated sub-hierarchy is contained in the sense that processes 350A delegated sub-hierarchy is contained in the sense that processes
332can't be moved into or out of the sub-hierarchy by the delegatee. For 351can't be moved into or out of the sub-hierarchy by the delegatee.
333a process with a non-root euid to migrate a target process into a 352
334cgroup by writing its PID to the "cgroup.procs" file, the following 353For delegations to a less privileged user, this is achieved by
335conditions must be met. 354requiring the following conditions for a process with a non-root euid
355to migrate a target process into a cgroup by writing its PID to the
356"cgroup.procs" file.
336 357
337- The writer must have write access to the "cgroup.procs" file. 358- The writer must have write access to the "cgroup.procs" file.
338 359
@@ -359,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would
359not have write access to its "cgroup.procs" files and thus the write 380not have write access to its "cgroup.procs" files and thus the write
360will be denied with -EACCES. 381will be denied with -EACCES.
361 382
383For delegations to namespaces, containment is achieved by requiring
384that both the source and destination cgroups are reachable from the
385namespace of the process which is attempting the migration. If either
386is not reachable, the migration is rejected with -ENOENT.
387
362 388
3632-6. Guidelines 3892-6. Guidelines
364 390
@@ -1413,7 +1439,7 @@ D. Deprecated v1 Core Features
1413 1439
1414- Multiple hierarchies including named ones are not supported. 1440- Multiple hierarchies including named ones are not supported.
1415 1441
1416- All mount options and remounting are not supported. 1442- All v1 mount options are not supported.
1417 1443
1418- The "tasks" file is removed and "cgroup.procs" is not sorted. 1444- The "tasks" file is removed and "cgroup.procs" is not sorted.
1419 1445
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ec47101cb1bf..09f4c7df1478 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -67,12 +67,21 @@ enum {
67enum { 67enum {
68 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 68 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
69 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 69 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
70
71 /*
72 * Consider namespaces as delegation boundaries. If this flag is
73 * set, controller specific interface files in a namespace root
74 * aren't writeable from inside the namespace.
75 */
76 CGRP_ROOT_NS_DELEGATE = (1 << 3),
70}; 77};
71 78
72/* cftype->flags */ 79/* cftype->flags */
73enum { 80enum {
74 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 81 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
75 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 82 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
83 CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */
84
76 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 85 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
77 CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ 86 CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
78 87
@@ -166,6 +175,9 @@ struct css_set {
166 /* the default cgroup associated with this css_set */ 175 /* the default cgroup associated with this css_set */
167 struct cgroup *dfl_cgrp; 176 struct cgroup *dfl_cgrp;
168 177
178 /* internal task count, protected by css_set_lock */
179 int nr_tasks;
180
169 /* 181 /*
170 * Lists running through all tasks using this cgroup group. 182 * Lists running through all tasks using this cgroup group.
171 * mg_tasks lists tasks which belong to this cset but are in the 183 * mg_tasks lists tasks which belong to this cset but are in the
diff --git a/init/Kconfig b/init/Kconfig
index ee0f03b69d11..b0fcbb2c6f56 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -859,11 +859,14 @@ config CGROUP_BPF
859 inet sockets. 859 inet sockets.
860 860
861config CGROUP_DEBUG 861config CGROUP_DEBUG
862 bool "Example controller" 862 bool "Debug controller"
863 default n 863 default n
864 depends on DEBUG_KERNEL
864 help 865 help
865 This option enables a simple controller that exports 866 This option enables a simple controller that exports
866 debugging information about the cgroups framework. 867 debugging information about the cgroups framework. This
868 controller is for control cgroup debugging only. Its
869 interfaces are not stable.
867 870
868 Say N. 871 Say N.
869 872
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
4obj-$(CONFIG_CGROUP_PIDS) += pids.o 4obj-$(CONFIG_CGROUP_PIDS) += pids.o
5obj-$(CONFIG_CGROUP_RDMA) += rdma.o 5obj-$(CONFIG_CGROUP_RDMA) += rdma.o
6obj-$(CONFIG_CPUSETS) += cpuset.o 6obj-$(CONFIG_CPUSETS) += cpuset.o
7obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..793565c05742 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
192int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, 192int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
193 struct kernfs_root *kf_root); 193 struct kernfs_root *kf_root);
194 194
195int cgroup_task_count(const struct cgroup *cgrp);
196
195/* 197/*
196 * namespace.c 198 * namespace.c
197 */ 199 */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
334/** 334/**
335 * cgroup_task_count - count the number of tasks in a cgroup. 335 * cgroup_task_count - count the number of tasks in a cgroup.
336 * @cgrp: the cgroup in question 336 * @cgrp: the cgroup in question
337 *
338 * Return the number of tasks in the cgroup. The returned number can be
339 * higher than the actual number of tasks due to css_set references from
340 * namespace roots and temporary usages.
341 */ 337 */
342static int cgroup_task_count(const struct cgroup *cgrp) 338int cgroup_task_count(const struct cgroup *cgrp)
343{ 339{
344 int count = 0; 340 int count = 0;
345 struct cgrp_cset_link *link; 341 struct cgrp_cset_link *link;
346 342
347 spin_lock_irq(&css_set_lock); 343 spin_lock_irq(&css_set_lock);
348 list_for_each_entry(link, &cgrp->cset_links, cset_link) 344 list_for_each_entry(link, &cgrp->cset_links, cset_link)
349 count += refcount_read(&link->cset->refcount); 345 count += link->cset->nr_tasks;
350 spin_unlock_irq(&css_set_lock); 346 spin_unlock_irq(&css_set_lock);
351 return count; 347 return count;
352} 348}
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
1263 return 1; 1259 return 1;
1264} 1260}
1265__setup("cgroup_no_v1=", cgroup_no_v1); 1261__setup("cgroup_no_v1=", cgroup_no_v1);
1266
1267
1268#ifdef CONFIG_CGROUP_DEBUG
1269static struct cgroup_subsys_state *
1270debug_css_alloc(struct cgroup_subsys_state *parent_css)
1271{
1272 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
1273
1274 if (!css)
1275 return ERR_PTR(-ENOMEM);
1276
1277 return css;
1278}
1279
1280static void debug_css_free(struct cgroup_subsys_state *css)
1281{
1282 kfree(css);
1283}
1284
1285static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
1286 struct cftype *cft)
1287{
1288 return cgroup_task_count(css->cgroup);
1289}
1290
1291static u64 current_css_set_read(struct cgroup_subsys_state *css,
1292 struct cftype *cft)
1293{
1294 return (u64)(unsigned long)current->cgroups;
1295}
1296
1297static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
1298 struct cftype *cft)
1299{
1300 u64 count;
1301
1302 rcu_read_lock();
1303 count = refcount_read(&task_css_set(current)->refcount);
1304 rcu_read_unlock();
1305 return count;
1306}
1307
1308static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
1309{
1310 struct cgrp_cset_link *link;
1311 struct css_set *cset;
1312 char *name_buf;
1313
1314 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
1315 if (!name_buf)
1316 return -ENOMEM;
1317
1318 spin_lock_irq(&css_set_lock);
1319 rcu_read_lock();
1320 cset = rcu_dereference(current->cgroups);
1321 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1322 struct cgroup *c = link->cgrp;
1323
1324 cgroup_name(c, name_buf, NAME_MAX + 1);
1325 seq_printf(seq, "Root %d group %s\n",
1326 c->root->hierarchy_id, name_buf);
1327 }
1328 rcu_read_unlock();
1329 spin_unlock_irq(&css_set_lock);
1330 kfree(name_buf);
1331 return 0;
1332}
1333
1334#define MAX_TASKS_SHOWN_PER_CSS 25
1335static int cgroup_css_links_read(struct seq_file *seq, void *v)
1336{
1337 struct cgroup_subsys_state *css = seq_css(seq);
1338 struct cgrp_cset_link *link;
1339
1340 spin_lock_irq(&css_set_lock);
1341 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
1342 struct css_set *cset = link->cset;
1343 struct task_struct *task;
1344 int count = 0;
1345
1346 seq_printf(seq, "css_set %pK\n", cset);
1347
1348 list_for_each_entry(task, &cset->tasks, cg_list) {
1349 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1350 goto overflow;
1351 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1352 }
1353
1354 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
1355 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1356 goto overflow;
1357 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1358 }
1359 continue;
1360 overflow:
1361 seq_puts(seq, " ...\n");
1362 }
1363 spin_unlock_irq(&css_set_lock);
1364 return 0;
1365}
1366
1367static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
1368{
1369 return (!cgroup_is_populated(css->cgroup) &&
1370 !css_has_online_children(&css->cgroup->self));
1371}
1372
1373static struct cftype debug_files[] = {
1374 {
1375 .name = "taskcount",
1376 .read_u64 = debug_taskcount_read,
1377 },
1378
1379 {
1380 .name = "current_css_set",
1381 .read_u64 = current_css_set_read,
1382 },
1383
1384 {
1385 .name = "current_css_set_refcount",
1386 .read_u64 = current_css_set_refcount_read,
1387 },
1388
1389 {
1390 .name = "current_css_set_cg_links",
1391 .seq_show = current_css_set_cg_links_read,
1392 },
1393
1394 {
1395 .name = "cgroup_css_links",
1396 .seq_show = cgroup_css_links_read,
1397 },
1398
1399 {
1400 .name = "releasable",
1401 .read_u64 = releasable_read,
1402 },
1403
1404 { } /* terminate */
1405};
1406
1407struct cgroup_subsys debug_cgrp_subsys = {
1408 .css_alloc = debug_css_alloc,
1409 .css_free = debug_css_free,
1410 .legacy_cftypes = debug_files,
1411};
1412#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..620794a20a33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */
573/** 573/**
574 * css_set_populated - does a css_set contain any tasks? 574 * css_set_populated - does a css_set contain any tasks?
575 * @cset: target css_set 575 * @cset: target css_set
576 *
577 * css_set_populated() should be the same as !!cset->nr_tasks at steady
578 * state. However, css_set_populated() can be called while a task is being
579 * added to or removed from the linked list before the nr_tasks is
580 * properly updated. Hence, we can't just look at ->nr_tasks here.
576 */ 581 */
577static bool css_set_populated(struct css_set *cset) 582static bool css_set_populated(struct css_set *cset)
578{ 583{
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1542 return len; 1547 return len;
1543} 1548}
1544 1549
1550static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1551{
1552 char *token;
1553
1554 *root_flags = 0;
1555
1556 if (!data)
1557 return 0;
1558
1559 while ((token = strsep(&data, ",")) != NULL) {
1560 if (!strcmp(token, "nsdelegate")) {
1561 *root_flags |= CGRP_ROOT_NS_DELEGATE;
1562 continue;
1563 }
1564
1565 pr_err("cgroup2: unknown option \"%s\"\n", token);
1566 return -EINVAL;
1567 }
1568
1569 return 0;
1570}
1571
1572static void apply_cgroup_root_flags(unsigned int root_flags)
1573{
1574 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1575 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1576 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1577 else
1578 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1579 }
1580}
1581
1582static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1583{
1584 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1585 seq_puts(seq, ",nsdelegate");
1586 return 0;
1587}
1588
1545static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1589static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1546{ 1590{
1547 pr_err("remount is not allowed\n"); 1591 unsigned int root_flags;
1548 return -EINVAL; 1592 int ret;
1593
1594 ret = parse_cgroup_root_flags(data, &root_flags);
1595 if (ret)
1596 return ret;
1597
1598 apply_cgroup_root_flags(root_flags);
1599 return 0;
1549} 1600}
1550 1601
1551/* 1602/*
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
1598 css_set_update_populated(cset, true); 1649 css_set_update_populated(cset, true);
1599 list_add_tail(&p->cg_list, &cset->tasks); 1650 list_add_tail(&p->cg_list, &cset->tasks);
1600 get_css_set(cset); 1651 get_css_set(cset);
1652 cset->nr_tasks++;
1601 } 1653 }
1602 spin_unlock(&p->sighand->siglock); 1654 spin_unlock(&p->sighand->siglock);
1603 } while_each_thread(g, p); 1655 } while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1784{ 1836{
1785 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; 1837 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1786 struct dentry *dentry; 1838 struct dentry *dentry;
1839 int ret;
1787 1840
1788 get_cgroup_ns(ns); 1841 get_cgroup_ns(ns);
1789 1842
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1801 cgroup_enable_task_cg_lists(); 1854 cgroup_enable_task_cg_lists();
1802 1855
1803 if (fs_type == &cgroup2_fs_type) { 1856 if (fs_type == &cgroup2_fs_type) {
1804 if (data) { 1857 unsigned int root_flags;
1805 pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); 1858
1859 ret = parse_cgroup_root_flags(data, &root_flags);
1860 if (ret) {
1806 put_cgroup_ns(ns); 1861 put_cgroup_ns(ns);
1807 return ERR_PTR(-EINVAL); 1862 return ERR_PTR(ret);
1808 } 1863 }
1864
1809 cgrp_dfl_visible = true; 1865 cgrp_dfl_visible = true;
1810 cgroup_get_live(&cgrp_dfl_root.cgrp); 1866 cgroup_get_live(&cgrp_dfl_root.cgrp);
1811 1867
1812 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, 1868 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1813 CGROUP2_SUPER_MAGIC, ns); 1869 CGROUP2_SUPER_MAGIC, ns);
1870 if (!IS_ERR(dentry))
1871 apply_cgroup_root_flags(root_flags);
1814 } else { 1872 } else {
1815 dentry = cgroup1_mount(&cgroup_fs_type, flags, data, 1873 dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
1816 CGROUP_SUPER_MAGIC, ns); 1874 CGROUP_SUPER_MAGIC, ns);
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2064 struct css_set *to_cset = cset->mg_dst_cset; 2122 struct css_set *to_cset = cset->mg_dst_cset;
2065 2123
2066 get_css_set(to_cset); 2124 get_css_set(to_cset);
2125 to_cset->nr_tasks++;
2067 css_set_move_task(task, from_cset, to_cset, true); 2126 css_set_move_task(task, from_cset, to_cset, true);
2068 put_css_set_locked(from_cset); 2127 put_css_set_locked(from_cset);
2128 from_cset->nr_tasks--;
2069 } 2129 }
2070 } 2130 }
2071 spin_unlock_irq(&css_set_lock); 2131 spin_unlock_irq(&css_set_lock);
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2355 struct cgroup *dst_cgrp, 2415 struct cgroup *dst_cgrp,
2356 struct kernfs_open_file *of) 2416 struct kernfs_open_file *of)
2357{ 2417{
2358 int ret = 0; 2418 struct super_block *sb = of->file->f_path.dentry->d_sb;
2359 2419 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2360 if (cgroup_on_dfl(dst_cgrp)) { 2420 struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
2361 struct super_block *sb = of->file->f_path.dentry->d_sb; 2421 struct cgroup *src_cgrp, *com_cgrp;
2362 struct cgroup *cgrp; 2422 struct inode *inode;
2363 struct inode *inode; 2423 int ret;
2364
2365 spin_lock_irq(&css_set_lock);
2366 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2367 spin_unlock_irq(&css_set_lock);
2368
2369 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2370 cgrp = cgroup_parent(cgrp);
2371 2424
2372 ret = -ENOMEM; 2425 if (!cgroup_on_dfl(dst_cgrp)) {
2373 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2374 if (inode) {
2375 ret = inode_permission(inode, MAY_WRITE);
2376 iput(inode);
2377 }
2378 } else {
2379 const struct cred *cred = current_cred(); 2426 const struct cred *cred = current_cred();
2380 const struct cred *tcred = get_task_cred(task); 2427 const struct cred *tcred = get_task_cred(task);
2381 2428
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2383 * even if we're attaching all tasks in the thread group, 2430 * even if we're attaching all tasks in the thread group,
2384 * we only need to check permissions on one of them. 2431 * we only need to check permissions on one of them.
2385 */ 2432 */
2386 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2433 if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
2387 !uid_eq(cred->euid, tcred->uid) && 2434 uid_eq(cred->euid, tcred->uid) ||
2388 !uid_eq(cred->euid, tcred->suid)) 2435 uid_eq(cred->euid, tcred->suid))
2436 ret = 0;
2437 else
2389 ret = -EACCES; 2438 ret = -EACCES;
2439
2390 put_cred(tcred); 2440 put_cred(tcred);
2441 return ret;
2391 } 2442 }
2392 2443
2393 return ret; 2444 /* find the source cgroup */
2445 spin_lock_irq(&css_set_lock);
2446 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2447 spin_unlock_irq(&css_set_lock);
2448
2449 /* and the common ancestor */
2450 com_cgrp = src_cgrp;
2451 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
2452 com_cgrp = cgroup_parent(com_cgrp);
2453
2454 /* %current should be authorized to migrate to the common ancestor */
2455 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
2456 if (!inode)
2457 return -ENOMEM;
2458
2459 ret = inode_permission(inode, MAY_WRITE);
2460 iput(inode);
2461 if (ret)
2462 return ret;
2463
2464 /*
2465 * If namespaces are delegation boundaries, %current must be able
2466 * to see both source and destination cgroups from its namespace.
2467 */
2468 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
2469 (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
2470 !cgroup_is_descendant(dst_cgrp, root_cgrp)))
2471 return -ENOENT;
2472
2473 return 0;
2394} 2474}
2395 2475
2396/* 2476/*
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
2954static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 3034static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2955 size_t nbytes, loff_t off) 3035 size_t nbytes, loff_t off)
2956{ 3036{
3037 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2957 struct cgroup *cgrp = of->kn->parent->priv; 3038 struct cgroup *cgrp = of->kn->parent->priv;
2958 struct cftype *cft = of->kn->priv; 3039 struct cftype *cft = of->kn->priv;
2959 struct cgroup_subsys_state *css; 3040 struct cgroup_subsys_state *css;
2960 int ret; 3041 int ret;
2961 3042
3043 /*
3044 * If namespaces are delegation boundaries, disallow writes to
3045 * files in a non-init namespace root from inside the namespace
3046 * except for the files explicitly marked delegatable -
3047 * cgroup.procs and cgroup.subtree_control.
3048 */
3049 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3050 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3051 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3052 return -EPERM;
3053
2962 if (cft->write) 3054 if (cft->write)
2963 return cft->write(of, buf, nbytes, off); 3055 return cft->write(of, buf, nbytes, off);
2964 3056
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
3792static struct cftype cgroup_base_files[] = { 3884static struct cftype cgroup_base_files[] = {
3793 { 3885 {
3794 .name = "cgroup.procs", 3886 .name = "cgroup.procs",
3887 .flags = CFTYPE_NS_DELEGATABLE,
3795 .file_offset = offsetof(struct cgroup, procs_file), 3888 .file_offset = offsetof(struct cgroup, procs_file),
3796 .release = cgroup_procs_release, 3889 .release = cgroup_procs_release,
3797 .seq_start = cgroup_procs_start, 3890 .seq_start = cgroup_procs_start,
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
3805 }, 3898 },
3806 { 3899 {
3807 .name = "cgroup.subtree_control", 3900 .name = "cgroup.subtree_control",
3901 .flags = CFTYPE_NS_DELEGATABLE,
3808 .seq_show = cgroup_subtree_control_show, 3902 .seq_show = cgroup_subtree_control_show,
3809 .write = cgroup_subtree_control_write, 3903 .write = cgroup_subtree_control_write,
3810 }, 3904 },
@@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
4393} 4487}
4394 4488
4395static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4489static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4490 .show_options = cgroup_show_options,
4396 .remount_fs = cgroup_remount, 4491 .remount_fs = cgroup_remount,
4397 .mkdir = cgroup_mkdir, 4492 .mkdir = cgroup_mkdir,
4398 .rmdir = cgroup_rmdir, 4493 .rmdir = cgroup_rmdir,
@@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
4789 cset = task_css_set(current); 4884 cset = task_css_set(current);
4790 if (list_empty(&child->cg_list)) { 4885 if (list_empty(&child->cg_list)) {
4791 get_css_set(cset); 4886 get_css_set(cset);
4887 cset->nr_tasks++;
4792 css_set_move_task(child, NULL, cset, false); 4888 css_set_move_task(child, NULL, cset, false);
4793 } 4889 }
4794 spin_unlock_irq(&css_set_lock); 4890 spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
4838 if (!list_empty(&tsk->cg_list)) { 4934 if (!list_empty(&tsk->cg_list)) {
4839 spin_lock_irq(&css_set_lock); 4935 spin_lock_irq(&css_set_lock);
4840 css_set_move_task(tsk, cset, NULL, false); 4936 css_set_move_task(tsk, cset, NULL, false);
4937 cset->nr_tasks--;
4841 spin_unlock_irq(&css_set_lock); 4938 spin_unlock_irq(&css_set_lock);
4842 } else { 4939 } else {
4843 get_css_set(cset); 4940 get_css_set(cset);
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..dac46af22782
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,357 @@
1/*
2 * Debug controller
3 *
4 * WARNING: This controller is for cgroup core debugging only.
5 * Its interfaces are unstable and subject to changes at any time.
6 */
7#include <linux/ctype.h>
8#include <linux/mm.h>
9#include <linux/slab.h>
10
11#include "cgroup-internal.h"
12
13static struct cgroup_subsys_state *
14debug_css_alloc(struct cgroup_subsys_state *parent_css)
15{
16 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
17
18 if (!css)
19 return ERR_PTR(-ENOMEM);
20
21 return css;
22}
23
/* Release a css that was handed out by debug_css_alloc(). */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
28
29/*
30 * debug_taskcount_read - return the number of tasks in a cgroup.
31 * @cgrp: the cgroup in question
32 */
33static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
34 struct cftype *cft)
35{
36 return cgroup_task_count(css->cgroup);
37}
38
39static int current_css_set_read(struct seq_file *seq, void *v)
40{
41 struct kernfs_open_file *of = seq->private;
42 struct css_set *cset;
43 struct cgroup_subsys *ss;
44 struct cgroup_subsys_state *css;
45 int i, refcnt;
46
47 if (!cgroup_kn_lock_live(of->kn, false))
48 return -ENODEV;
49
50 spin_lock_irq(&css_set_lock);
51 rcu_read_lock();
52 cset = rcu_dereference(current->cgroups);
53 refcnt = refcount_read(&cset->refcount);
54 seq_printf(seq, "css_set %pK %d", cset, refcnt);
55 if (refcnt > cset->nr_tasks)
56 seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
57 seq_puts(seq, "\n");
58
59 /*
60 * Print the css'es stored in the current css_set.
61 */
62 for_each_subsys(ss, i) {
63 css = cset->subsys[ss->id];
64 if (!css)
65 continue;
66 seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
67 (unsigned long)css, css->id);
68 }
69 rcu_read_unlock();
70 spin_unlock_irq(&css_set_lock);
71 cgroup_kn_unlock(of->kn);
72 return 0;
73}
74
75static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
76 struct cftype *cft)
77{
78 u64 count;
79
80 rcu_read_lock();
81 count = refcount_read(&task_css_set(current)->refcount);
82 rcu_read_unlock();
83 return count;
84}
85
86static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
87{
88 struct cgrp_cset_link *link;
89 struct css_set *cset;
90 char *name_buf;
91
92 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
93 if (!name_buf)
94 return -ENOMEM;
95
96 spin_lock_irq(&css_set_lock);
97 rcu_read_lock();
98 cset = rcu_dereference(current->cgroups);
99 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
100 struct cgroup *c = link->cgrp;
101
102 cgroup_name(c, name_buf, NAME_MAX + 1);
103 seq_printf(seq, "Root %d group %s\n",
104 c->root->hierarchy_id, name_buf);
105 }
106 rcu_read_unlock();
107 spin_unlock_irq(&css_set_lock);
108 kfree(name_buf);
109 return 0;
110}
111
112#define MAX_TASKS_SHOWN_PER_CSS 25
113static int cgroup_css_links_read(struct seq_file *seq, void *v)
114{
115 struct cgroup_subsys_state *css = seq_css(seq);
116 struct cgrp_cset_link *link;
117 int dead_cnt = 0, extra_refs = 0;
118
119 spin_lock_irq(&css_set_lock);
120 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
121 struct css_set *cset = link->cset;
122 struct task_struct *task;
123 int count = 0;
124 int refcnt = refcount_read(&cset->refcount);
125
126 seq_printf(seq, " %d", refcnt);
127 if (refcnt - cset->nr_tasks > 0) {
128 int extra = refcnt - cset->nr_tasks;
129
130 seq_printf(seq, " +%d", extra);
131 /*
132 * Take out the one additional reference in
133 * init_css_set.
134 */
135 if (cset == &init_css_set)
136 extra--;
137 extra_refs += extra;
138 }
139 seq_puts(seq, "\n");
140
141 list_for_each_entry(task, &cset->tasks, cg_list) {
142 if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
143 seq_printf(seq, " task %d\n",
144 task_pid_vnr(task));
145 }
146
147 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
148 if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
149 seq_printf(seq, " task %d\n",
150 task_pid_vnr(task));
151 }
152 /* show # of overflowed tasks */
153 if (count > MAX_TASKS_SHOWN_PER_CSS)
154 seq_printf(seq, " ... (%d)\n",
155 count - MAX_TASKS_SHOWN_PER_CSS);
156
157 if (cset->dead) {
158 seq_puts(seq, " [dead]\n");
159 dead_cnt++;
160 }
161
162 WARN_ON(count != cset->nr_tasks);
163 }
164 spin_unlock_irq(&css_set_lock);
165
166 if (!dead_cnt && !extra_refs)
167 return 0;
168
169 seq_puts(seq, "\n");
170 if (extra_refs)
171 seq_printf(seq, "extra references = %d\n", extra_refs);
172 if (dead_cnt)
173 seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
174
175 return 0;
176}
177
178static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
179{
180 struct kernfs_open_file *of = seq->private;
181 struct cgroup *cgrp;
182 struct cgroup_subsys *ss;
183 struct cgroup_subsys_state *css;
184 char pbuf[16];
185 int i;
186
187 cgrp = cgroup_kn_lock_live(of->kn, false);
188 if (!cgrp)
189 return -ENODEV;
190
191 for_each_subsys(ss, i) {
192 css = rcu_dereference_check(cgrp->subsys[ss->id], true);
193 if (!css)
194 continue;
195
196 pbuf[0] = '\0';
197
198 /* Show the parent CSS if applicable*/
199 if (css->parent)
200 snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
201 css->parent->id);
202 seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
203 (unsigned long)css, css->id,
204 atomic_read(&css->online_cnt), pbuf);
205 }
206
207 cgroup_kn_unlock(of->kn);
208 return 0;
209}
210
211static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
212 u16 mask)
213{
214 struct cgroup_subsys *ss;
215 int ssid;
216 bool first = true;
217
218 seq_printf(seq, "%-17s: ", name);
219 for_each_subsys(ss, ssid) {
220 if (!(mask & (1 << ssid)))
221 continue;
222 if (!first)
223 seq_puts(seq, ", ");
224 seq_puts(seq, ss->name);
225 first = false;
226 }
227 seq_putc(seq, '\n');
228}
229
230static int cgroup_masks_read(struct seq_file *seq, void *v)
231{
232 struct kernfs_open_file *of = seq->private;
233 struct cgroup *cgrp;
234
235 cgrp = cgroup_kn_lock_live(of->kn, false);
236 if (!cgrp)
237 return -ENODEV;
238
239 cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
240 cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
241
242 cgroup_kn_unlock(of->kn);
243 return 0;
244}
245
246static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
247{
248 return (!cgroup_is_populated(css->cgroup) &&
249 !css_has_online_children(&css->cgroup->self));
250}
251
252static struct cftype debug_legacy_files[] = {
253 {
254 .name = "taskcount",
255 .read_u64 = debug_taskcount_read,
256 },
257
258 {
259 .name = "current_css_set",
260 .seq_show = current_css_set_read,
261 .flags = CFTYPE_ONLY_ON_ROOT,
262 },
263
264 {
265 .name = "current_css_set_refcount",
266 .read_u64 = current_css_set_refcount_read,
267 .flags = CFTYPE_ONLY_ON_ROOT,
268 },
269
270 {
271 .name = "current_css_set_cg_links",
272 .seq_show = current_css_set_cg_links_read,
273 .flags = CFTYPE_ONLY_ON_ROOT,
274 },
275
276 {
277 .name = "cgroup_css_links",
278 .seq_show = cgroup_css_links_read,
279 },
280
281 {
282 .name = "cgroup_subsys_states",
283 .seq_show = cgroup_subsys_states_read,
284 },
285
286 {
287 .name = "cgroup_masks",
288 .seq_show = cgroup_masks_read,
289 },
290
291 {
292 .name = "releasable",
293 .read_u64 = releasable_read,
294 },
295
296 { } /* terminate */
297};
298
299static struct cftype debug_files[] = {
300 {
301 .name = "taskcount",
302 .read_u64 = debug_taskcount_read,
303 },
304
305 {
306 .name = "current_css_set",
307 .seq_show = current_css_set_read,
308 .flags = CFTYPE_ONLY_ON_ROOT,
309 },
310
311 {
312 .name = "current_css_set_refcount",
313 .read_u64 = current_css_set_refcount_read,
314 .flags = CFTYPE_ONLY_ON_ROOT,
315 },
316
317 {
318 .name = "current_css_set_cg_links",
319 .seq_show = current_css_set_cg_links_read,
320 .flags = CFTYPE_ONLY_ON_ROOT,
321 },
322
323 {
324 .name = "css_links",
325 .seq_show = cgroup_css_links_read,
326 },
327
328 {
329 .name = "csses",
330 .seq_show = cgroup_subsys_states_read,
331 },
332
333 {
334 .name = "masks",
335 .seq_show = cgroup_masks_read,
336 },
337
338 { } /* terminate */
339};
340
341struct cgroup_subsys debug_cgrp_subsys = {
342 .css_alloc = debug_css_alloc,
343 .css_free = debug_css_free,
344 .legacy_cftypes = debug_legacy_files,
345};
346
347/*
348 * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
349 * parameter.
350 */
351static int __init enable_cgroup_debug(char *str)
352{
353 debug_cgrp_subsys.dfl_cftypes = debug_files;
354 debug_cgrp_subsys.implicit_on_dfl = true;
355 return 1;
356}
357__setup("cgroup_debug", enable_cgroup_debug);