aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-28 00:41:08 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-28 00:41:08 -0500
commitf7878dc3a9d3d900c86a66d9742f7e06681b06cd (patch)
treecaf8dc1b1b668309200159519f0dc5c25c515acd
parentfb15a78210f169cf39a42df208cff09cdac86574 (diff)
parentf83f3c515654474e19c7fc86e3b06564bb5cb4d4 (diff)
Merge branch 'for-4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: "Several noteworthy changes. - Parav's rdma controller is finally merged. It is very straightforward and can limit the absolute numbers of common rdma constructs used by different cgroups. - kernel/cgroup.c got too chubby and disorganized. Created kernel/cgroup/ subdirectory and moved all cgroup related files under kernel/ there and reorganized the core code. This hurts for backporting patches but was long overdue. - cgroup v2 process listing reimplemented so that it no longer depends on allocating a buffer large enough to cache the entire result to sort and uniq the output. v2 has always mangled the sort order to ensure that users don't depend on the sorted output, so this shouldn't surprise anybody. This makes the pid listing functions use the same iterators that are used internally, which have to have the same iterating capabilities anyway. - perf cgroup filtering now works automatically on cgroup v2. This patch was posted a long time ago but somehow fell through the cracks. - misc fixes and documentation updates" * 'for-4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (27 commits) kernfs: fix locking around kernfs_ops->release() callback cgroup: drop the matching uid requirement on migration for cgroup v2 cgroup, perf_event: make perf_event controller work on cgroup2 hierarchy cgroup: misc cleanups cgroup: call subsys->*attach() only for subsystems which are actually affected by migration cgroup: track migration context in cgroup_mgctx cgroup: cosmetic update to cgroup_taskset_add() rdmacg: Fixed uninitialized current resource usage cgroup: Add missing cgroup-v2 PID controller documentation. 
rdmacg: Added documentation for rdmacg IB/core: added support to use rdma cgroup controller rdmacg: Added rdma cgroup controller cgroup: fix a comment typo cgroup: fix RCU related sparse warnings cgroup: move namespace code to kernel/cgroup/namespace.c cgroup: rename functions for consistency cgroup: move v1 mount functions to kernel/cgroup/cgroup-v1.c cgroup: separate out cgroup1_kf_syscall_ops cgroup: refactor mount path and clearly distinguish v1 and v2 paths cgroup: move cgroup v1 specific code to kernel/cgroup/cgroup-v1.c ...
-rw-r--r--Documentation/cgroup-v1/rdma.txt109
-rw-r--r--Documentation/cgroup-v2.txt103
-rw-r--r--drivers/infiniband/core/Makefile1
-rw-r--r--drivers/infiniband/core/cgroup.c62
-rw-r--r--drivers/infiniband/core/core_priv.h30
-rw-r--r--drivers/infiniband/core/device.c10
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c102
-rw-r--r--drivers/infiniband/core/uverbs_main.c20
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c62
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--include/linux/cgroup-defs.h57
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--include/linux/cgroup_rdma.h53
-rw-r--r--include/linux/cgroup_subsys.h4
-rw-r--r--include/linux/kernfs.h12
-rw-r--r--include/rdma/ib_verbs.h14
-rw-r--r--init/Kconfig10
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/cgroup/Makefile6
-rw-r--r--kernel/cgroup/cgroup-internal.h214
-rw-r--r--kernel/cgroup/cgroup-v1.c1395
-rw-r--r--kernel/cgroup/cgroup.c (renamed from kernel/cgroup.c)2081
-rw-r--r--kernel/cgroup/cpuset.c (renamed from kernel/cpuset.c)0
-rw-r--r--kernel/cgroup/freezer.c (renamed from kernel/cgroup_freezer.c)0
-rw-r--r--kernel/cgroup/namespace.c155
-rw-r--r--kernel/cgroup/pids.c (renamed from kernel/cgroup_pids.c)0
-rw-r--r--kernel/cgroup/rdma.c619
-rw-r--r--kernel/events/core.c6
-rw-r--r--tools/perf/util/cgroup.c26
30 files changed, 3264 insertions, 1898 deletions
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644
index 000000000000..af618171e0eb
--- /dev/null
+++ b/Documentation/cgroup-v1/rdma.txt
@@ -0,0 +1,109 @@
1 RDMA Controller
2 ----------------
3
4Contents
5--------
6
71. Overview
8 1-1. What is RDMA controller?
9 1-2. Why RDMA controller needed?
10 1-3. How is RDMA controller implemented?
112. Usage Examples
12
131. Overview
14
151-1. What is RDMA controller?
16-----------------------------
17
18RDMA controller allows user to limit RDMA/IB specific resources that a given
19set of processes can use. These processes are grouped using RDMA controller.
20
21RDMA controller defines two resources which can be limited for processes of a
22cgroup.
23
241-2. Why RDMA controller needed?
25--------------------------------
26
27Currently user space applications can easily take away all the rdma verb
28specific resources such as AH, CQ, QP, MR etc. Due to which other applications
29in other cgroup or kernel space ULPs may not even get a chance to allocate any
30rdma resources. This can lead to service unavailability.
31
32Therefore RDMA controller is needed through which resource consumption
33of processes can be limited. Through this controller different rdma
34resources can be accounted.
35
361-3. How is RDMA controller implemented?
37----------------------------------------
38
39RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
40resource accounting per cgroup, per device using resource pool structure.
41Each such resource pool is limited up to 64 resources in given resource pool
42by rdma cgroup, which can be extended later if required.
43
44This resource pool object is linked to the cgroup css. Typically there
45are 0 to 4 resource pool instances per cgroup, per device in most use cases.
46But nothing limits to have it more. At present hundreds of RDMA devices per
47single cgroup may not be handled optimally, however there is no
48known use case or requirement for such configuration either.
49
50Since RDMA resources can be allocated from any process and can be freed by any
51of the child processes which shares the address space, rdma resources are
52always owned by the creator cgroup css. This allows process migration from one
53to other cgroup without major complexity of transferring resource ownership;
54because such ownership is not really present due to shared nature of
55rdma resources. Linking resources around css also ensures that cgroups can be
56deleted after processes migrated. This allows process migration as well with
57active resources, even though that is not a primary use case.
58
59Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
60the caller. Same rdma cgroup should be passed while uncharging the resource.
61This also allows process migrated with active RDMA resource to charge
62to new owner cgroup for new resource. It also allows to uncharge resource of
63a process from previously charged cgroup which is migrated to new cgroup,
64even though that is not a primary use case.
65
66Resource pool object is created in following situations.
67(a) User sets the limit and no previous resource pool exist for the device
68of interest for the cgroup.
69(b) No resource limits were configured, but the IB/RDMA stack tries to
70charge the resource. This ensures resources are correctly uncharged when
71applications run without limits and limits are later enforced during
72uncharging; otherwise the usage count would drop to negative.
73
74Resource pool is destroyed if all the resource limits are set to max and
75it is the last resource getting deallocated.
76
77User should set all the limits to max value if it intends to remove/unconfigure
78the resource pool for a particular device.
79
80IB stack honors limits enforced by the rdma controller. When an application
81queries the maximum resource limits of the IB device, it returns the minimum
82of what is configured by the user for a given cgroup and what is supported
83by the IB device.
84
85Following resources can be accounted by rdma controller.
86 hca_handle Maximum number of HCA Handles
87 hca_object Maximum number of HCA Objects
88
892. Usage Examples
90-----------------
91
92(a) Configure resource limit:
93echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
94echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
95
96(b) Query resource limit:
97cat /sys/fs/cgroup/rdma/2/rdma.max
98#Output:
99mlx4_0 hca_handle=2 hca_object=2000
100ocrdma1 hca_handle=3 hca_object=max
101
102(c) Query current usage:
103cat /sys/fs/cgroup/rdma/2/rdma.current
104#Output:
105mlx4_0 hca_handle=1 hca_object=20
106ocrdma1 hca_handle=1 hca_object=23
107
108(d) Delete resource limit:
109echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 4cc07ce3b8dd..3b8449f8ac7e 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -47,6 +47,12 @@ CONTENTS
47 5-3. IO 47 5-3. IO
48 5-3-1. IO Interface Files 48 5-3-1. IO Interface Files
49 5-3-2. Writeback 49 5-3-2. Writeback
50 5-4. PID
51 5-4-1. PID Interface Files
52 5-5. RDMA
53 5-5-1. RDMA Interface Files
54 5-6. Misc
55 5-6-1. perf_event
506. Namespace 566. Namespace
51 6-1. Basics 57 6-1. Basics
52 6-2. The Root and Views 58 6-2. The Root and Views
@@ -328,14 +334,12 @@ a process with a non-root euid to migrate a target process into a
328cgroup by writing its PID to the "cgroup.procs" file, the following 334cgroup by writing its PID to the "cgroup.procs" file, the following
329conditions must be met. 335conditions must be met.
330 336
331- The writer's euid must match either uid or suid of the target process.
332
333- The writer must have write access to the "cgroup.procs" file. 337- The writer must have write access to the "cgroup.procs" file.
334 338
335- The writer must have write access to the "cgroup.procs" file of the 339- The writer must have write access to the "cgroup.procs" file of the
336 common ancestor of the source and destination cgroups. 340 common ancestor of the source and destination cgroups.
337 341
338The above three constraints ensure that while a delegatee may migrate 342The above two constraints ensure that while a delegatee may migrate
339processes around freely in the delegated sub-hierarchy it can't pull 343processes around freely in the delegated sub-hierarchy it can't pull
340in from or push out to outside the sub-hierarchy. 344in from or push out to outside the sub-hierarchy.
341 345
@@ -350,10 +354,10 @@ all processes under C0 and C1 belong to U0.
350 354
351Let's also say U0 wants to write the PID of a process which is 355Let's also say U0 wants to write the PID of a process which is
352currently in C10 into "C00/cgroup.procs". U0 has write access to the 356currently in C10 into "C00/cgroup.procs". U0 has write access to the
353file and uid match on the process; however, the common ancestor of the 357file; however, the common ancestor of the source cgroup C10 and the
354source cgroup C10 and the destination cgroup C00 is above the points 358destination cgroup C00 is above the points of delegation and U0 would
355of delegation and U0 would not have write access to its "cgroup.procs" 359not have write access to its "cgroup.procs" files and thus the write
356files and thus the write will be denied with -EACCES. 360will be denied with -EACCES.
357 361
358 362
3592-6. Guidelines 3632-6. Guidelines
@@ -1119,6 +1123,91 @@ writeback as follows.
1119 vm.dirty[_background]_ratio. 1123 vm.dirty[_background]_ratio.
1120 1124
1121 1125
11265-4. PID
1127
1128The process number controller is used to allow a cgroup to stop any
1129new tasks from being fork()'d or clone()'d after a specified limit is
1130reached.
1131
1132The number of tasks in a cgroup can be exhausted in ways which other
1133controllers cannot prevent, thus warranting its own controller. For
1134example, a fork bomb is likely to exhaust the number of tasks before
1135hitting memory restrictions.
1136
1137Note that PIDs used in this controller refer to TIDs, process IDs as
1138used by the kernel.
1139
1140
11415-4-1. PID Interface Files
1142
1143 pids.max
1144
1145 A read-write single value file which exists on non-root cgroups. The
1146 default is "max".
1147
1148 Hard limit of number of processes.
1149
1150 pids.current
1151
1152 A read-only single value file which exists on all cgroups.
1153
1154 The number of processes currently in the cgroup and its descendants.
1155
1156Organisational operations are not blocked by cgroup policies, so it is
1157possible to have pids.current > pids.max. This can be done by either
1158setting the limit to be smaller than pids.current, or attaching enough
1159processes to the cgroup such that pids.current is larger than
1160pids.max. However, it is not possible to violate a cgroup PID policy
1161through fork() or clone(). These will return -EAGAIN if the creation
1162of a new process would cause a cgroup policy to be violated.
1163
1164
11655-5. RDMA
1166
1167The "rdma" controller regulates the distribution and accounting
1168of RDMA resources.
1169
11705-5-1. RDMA Interface Files
1171
1172 rdma.max
1173	A read-write nested-keyed file that exists for all the cgroups
1174 except root that describes current configured resource limit
1175 for a RDMA/IB device.
1176
1177 Lines are keyed by device name and are not ordered.
1178 Each line contains space separated resource name and its configured
1179 limit that can be distributed.
1180
1181 The following nested keys are defined.
1182
1183 hca_handle Maximum number of HCA Handles
1184 hca_object Maximum number of HCA Objects
1185
1186 An example for mlx4 and ocrdma device follows.
1187
1188 mlx4_0 hca_handle=2 hca_object=2000
1189 ocrdma1 hca_handle=3 hca_object=max
1190
1191 rdma.current
1192 A read-only file that describes current resource usage.
1193	It exists for all the cgroups except root.
1194
1195 An example for mlx4 and ocrdma device follows.
1196
1197 mlx4_0 hca_handle=1 hca_object=20
1198 ocrdma1 hca_handle=1 hca_object=23
1199
1200
12015-6. Misc
1202
12035-6-1. perf_event
1204
1205perf_event controller, if not mounted on a legacy hierarchy, is
1206automatically enabled on the v2 hierarchy so that perf events can
1207always be filtered by cgroup v2 path. The controller can still be
1208moved to a legacy hierarchy after v2 hierarchy is populated.
1209
1210
11226. Namespace 12116. Namespace
1123 1212
11246-1. Basics 12136-1. Basics
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index edaae9f9853c..e426ac877d19 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -13,6 +13,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
13 multicast.o mad.o smi.o agent.o mad_rmpp.o 13 multicast.o mad.o smi.o agent.o mad_rmpp.o
14ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o 14ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
15ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o 15ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
16ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
16 17
17ib_cm-y := cm.o 18ib_cm-y := cm.o
18 19
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 000000000000..126ac5f99db7
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include "core_priv.h"
15
16/**
17 * ib_device_register_rdmacg - register with rdma cgroup.
18 * @device: device to register to participate in resource
19 * accounting by rdma cgroup.
20 *
21 * Register with the rdma cgroup. Should be called before
22 * exposing rdma device to user space applications to avoid
23 * resource accounting leak.
24 * Returns 0 on success or otherwise failure code.
25 */
26int ib_device_register_rdmacg(struct ib_device *device)
27{
28 device->cg_device.name = device->name;
29 return rdmacg_register_device(&device->cg_device);
30}
31
32/**
33 * ib_device_unregister_rdmacg - unregister with rdma cgroup.
34 * @device: device to unregister.
35 *
36 * Unregister with the rdma cgroup. Should be called after
37 * all the resources are deallocated, and after a stage when any
38 * other resource allocation by user application cannot be done
39 * for this device to avoid any leak in accounting.
40 */
41void ib_device_unregister_rdmacg(struct ib_device *device)
42{
43 rdmacg_unregister_device(&device->cg_device);
44}
45
46int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
47 struct ib_device *device,
48 enum rdmacg_resource_type resource_index)
49{
50 return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
51 resource_index);
52}
53EXPORT_SYMBOL(ib_rdmacg_try_charge);
54
55void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
56 struct ib_device *device,
57 enum rdmacg_resource_type resource_index)
58{
59 rdmacg_uncharge(cg_obj->cg, &device->cg_device,
60 resource_index);
61}
62EXPORT_SYMBOL(ib_rdmacg_uncharge);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 912ab4cd6eae..cb7d372e4bdf 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/spinlock.h> 37#include <linux/spinlock.h>
38#include <linux/cgroup_rdma.h>
38 39
39#include <rdma/ib_verbs.h> 40#include <rdma/ib_verbs.h>
40 41
@@ -124,6 +125,35 @@ int ib_cache_setup_one(struct ib_device *device);
124void ib_cache_cleanup_one(struct ib_device *device); 125void ib_cache_cleanup_one(struct ib_device *device);
125void ib_cache_release_one(struct ib_device *device); 126void ib_cache_release_one(struct ib_device *device);
126 127
128#ifdef CONFIG_CGROUP_RDMA
129int ib_device_register_rdmacg(struct ib_device *device);
130void ib_device_unregister_rdmacg(struct ib_device *device);
131
132int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
133 struct ib_device *device,
134 enum rdmacg_resource_type resource_index);
135
136void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
137 struct ib_device *device,
138 enum rdmacg_resource_type resource_index);
139#else
140static inline int ib_device_register_rdmacg(struct ib_device *device)
141{ return 0; }
142
143static inline void ib_device_unregister_rdmacg(struct ib_device *device)
144{ }
145
146static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
147 struct ib_device *device,
148 enum rdmacg_resource_type resource_index)
149{ return 0; }
150
151static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
152 struct ib_device *device,
153 enum rdmacg_resource_type resource_index)
154{ }
155#endif
156
127static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, 157static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
128 struct net_device *upper) 158 struct net_device *upper)
129{ 159{
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a63e8400ea3b..593d2ce6ec7c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -369,10 +369,18 @@ int ib_register_device(struct ib_device *device,
369 goto out; 369 goto out;
370 } 370 }
371 371
372 ret = ib_device_register_rdmacg(device);
373 if (ret) {
374 pr_warn("Couldn't register device with rdma cgroup\n");
375 ib_cache_cleanup_one(device);
376 goto out;
377 }
378
372 memset(&device->attrs, 0, sizeof(device->attrs)); 379 memset(&device->attrs, 0, sizeof(device->attrs));
373 ret = device->query_device(device, &device->attrs, &uhw); 380 ret = device->query_device(device, &device->attrs, &uhw);
374 if (ret) { 381 if (ret) {
375 pr_warn("Couldn't query the device attributes\n"); 382 pr_warn("Couldn't query the device attributes\n");
383 ib_device_unregister_rdmacg(device);
376 ib_cache_cleanup_one(device); 384 ib_cache_cleanup_one(device);
377 goto out; 385 goto out;
378 } 386 }
@@ -381,6 +389,7 @@ int ib_register_device(struct ib_device *device,
381 if (ret) { 389 if (ret) {
382 pr_warn("Couldn't register device %s with driver model\n", 390 pr_warn("Couldn't register device %s with driver model\n",
383 device->name); 391 device->name);
392 ib_device_unregister_rdmacg(device);
384 ib_cache_cleanup_one(device); 393 ib_cache_cleanup_one(device);
385 goto out; 394 goto out;
386 } 395 }
@@ -430,6 +439,7 @@ void ib_unregister_device(struct ib_device *device)
430 439
431 mutex_unlock(&device_mutex); 440 mutex_unlock(&device_mutex);
432 441
442 ib_device_unregister_rdmacg(device);
433 ib_device_unregister_sysfs(device); 443 ib_device_unregister_sysfs(device);
434 ib_cache_cleanup_one(device); 444 ib_cache_cleanup_one(device);
435 445
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index b4b395a054ac..7b7a76e1279a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
316 struct ib_udata udata; 316 struct ib_udata udata;
317 struct ib_ucontext *ucontext; 317 struct ib_ucontext *ucontext;
318 struct file *filp; 318 struct file *filp;
319 struct ib_rdmacg_object cg_obj;
319 int ret; 320 int ret;
320 321
321 if (out_len < sizeof resp) 322 if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
335 (unsigned long) cmd.response + sizeof resp, 336 (unsigned long) cmd.response + sizeof resp,
336 in_len - sizeof cmd, out_len - sizeof resp); 337 in_len - sizeof cmd, out_len - sizeof resp);
337 338
339 ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
340 if (ret)
341 goto err;
342
338 ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); 343 ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
339 if (IS_ERR(ucontext)) { 344 if (IS_ERR(ucontext)) {
340 ret = PTR_ERR(ucontext); 345 ret = PTR_ERR(ucontext);
341 goto err; 346 goto err_alloc;
342 } 347 }
343 348
344 ucontext->device = ib_dev; 349 ucontext->device = ib_dev;
350 ucontext->cg_obj = cg_obj;
345 INIT_LIST_HEAD(&ucontext->pd_list); 351 INIT_LIST_HEAD(&ucontext->pd_list);
346 INIT_LIST_HEAD(&ucontext->mr_list); 352 INIT_LIST_HEAD(&ucontext->mr_list);
347 INIT_LIST_HEAD(&ucontext->mw_list); 353 INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ err_free:
407 put_pid(ucontext->tgid); 413 put_pid(ucontext->tgid);
408 ib_dev->dealloc_ucontext(ucontext); 414 ib_dev->dealloc_ucontext(ucontext);
409 415
416err_alloc:
417 ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
418
410err: 419err:
411 mutex_unlock(&file->mutex); 420 mutex_unlock(&file->mutex);
412 return ret; 421 return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
561 return -ENOMEM; 570 return -ENOMEM;
562 571
563 init_uobj(uobj, 0, file->ucontext, &pd_lock_class); 572 init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
573 ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
574 RDMACG_RESOURCE_HCA_OBJECT);
575 if (ret) {
576 kfree(uobj);
577 return ret;
578 }
579
564 down_write(&uobj->mutex); 580 down_write(&uobj->mutex);
565 581
566 pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); 582 pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ err_idr:
605 ib_dealloc_pd(pd); 621 ib_dealloc_pd(pd);
606 622
607err: 623err:
624 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
608 put_uobj_write(uobj); 625 put_uobj_write(uobj);
609 return ret; 626 return ret;
610} 627}
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
637 if (ret) 654 if (ret)
638 goto err_put; 655 goto err_put;
639 656
657 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
658
640 uobj->live = 0; 659 uobj->live = 0;
641 put_uobj_write(uobj); 660 put_uobj_write(uobj);
642 661
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
1006 goto err_put; 1025 goto err_put;
1007 } 1026 }
1008 } 1027 }
1028 ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
1029 RDMACG_RESOURCE_HCA_OBJECT);
1030 if (ret)
1031 goto err_charge;
1009 1032
1010 mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, 1033 mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
1011 cmd.access_flags, &udata); 1034 cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ err_unreg:
1054 ib_dereg_mr(mr); 1077 ib_dereg_mr(mr);
1055 1078
1056err_put: 1079err_put:
1080 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
1081
1082err_charge:
1057 put_pd_read(pd); 1083 put_pd_read(pd);
1058 1084
1059err_free: 1085err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
1178 if (ret) 1204 if (ret)
1179 return ret; 1205 return ret;
1180 1206
1207 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
1208
1181 idr_remove_uobj(&ib_uverbs_mr_idr, uobj); 1209 idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
1182 1210
1183 mutex_lock(&file->mutex); 1211 mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
1226 in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), 1254 in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
1227 out_len - sizeof(resp)); 1255 out_len - sizeof(resp));
1228 1256
1257 ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
1258 RDMACG_RESOURCE_HCA_OBJECT);
1259 if (ret)
1260 goto err_charge;
1261
1229 mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); 1262 mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
1230 if (IS_ERR(mw)) { 1263 if (IS_ERR(mw)) {
1231 ret = PTR_ERR(mw); 1264 ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ err_unalloc:
1271 uverbs_dealloc_mw(mw); 1304 uverbs_dealloc_mw(mw);
1272 1305
1273err_put: 1306err_put:
1307 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
1308
1309err_charge:
1274 put_pd_read(pd); 1310 put_pd_read(pd);
1275 1311
1276err_free: 1312err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
1306 if (ret) 1342 if (ret)
1307 return ret; 1343 return ret;
1308 1344
1345 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
1346
1309 idr_remove_uobj(&ib_uverbs_mw_idr, uobj); 1347 idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
1310 1348
1311 mutex_lock(&file->mutex); 1349 mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
1405 if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) 1443 if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
1406 attr.flags = cmd->flags; 1444 attr.flags = cmd->flags;
1407 1445
1446 ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
1447 RDMACG_RESOURCE_HCA_OBJECT);
1448 if (ret)
1449 goto err_charge;
1450
1408 cq = ib_dev->create_cq(ib_dev, &attr, 1451 cq = ib_dev->create_cq(ib_dev, &attr,
1409 file->ucontext, uhw); 1452 file->ucontext, uhw);
1410 if (IS_ERR(cq)) { 1453 if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ err_free:
1452 ib_destroy_cq(cq); 1495 ib_destroy_cq(cq);
1453 1496
1454err_file: 1497err_file:
1498 ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
1499 RDMACG_RESOURCE_HCA_OBJECT);
1500
1501err_charge:
1455 if (ev_file) 1502 if (ev_file)
1456 ib_uverbs_release_ucq(file, ev_file, obj); 1503 ib_uverbs_release_ucq(file, ev_file, obj);
1457 1504
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
1732 if (ret) 1779 if (ret)
1733 return ret; 1780 return ret;
1734 1781
1782 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
1783
1735 idr_remove_uobj(&ib_uverbs_cq_idr, uobj); 1784 idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
1736 1785
1737 mutex_lock(&file->mutex); 1786 mutex_lock(&file->mutex);
@@ -1905,6 +1954,11 @@ static int create_qp(struct ib_uverbs_file *file,
1905 goto err_put; 1954 goto err_put;
1906 } 1955 }
1907 1956
1957 ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
1958 RDMACG_RESOURCE_HCA_OBJECT);
1959 if (ret)
1960 goto err_put;
1961
1908 if (cmd->qp_type == IB_QPT_XRC_TGT) 1962 if (cmd->qp_type == IB_QPT_XRC_TGT)
1909 qp = ib_create_qp(pd, &attr); 1963 qp = ib_create_qp(pd, &attr);
1910 else 1964 else
@@ -1912,7 +1966,7 @@ static int create_qp(struct ib_uverbs_file *file,
1912 1966
1913 if (IS_ERR(qp)) { 1967 if (IS_ERR(qp)) {
1914 ret = PTR_ERR(qp); 1968 ret = PTR_ERR(qp);
1915 goto err_put; 1969 goto err_create;
1916 } 1970 }
1917 1971
1918 if (cmd->qp_type != IB_QPT_XRC_TGT) { 1972 if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1993,6 +2047,10 @@ err_cb:
1993err_destroy: 2047err_destroy:
1994 ib_destroy_qp(qp); 2048 ib_destroy_qp(qp);
1995 2049
2050err_create:
2051 ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
2052 RDMACG_RESOURCE_HCA_OBJECT);
2053
1996err_put: 2054err_put:
1997 if (xrcd) 2055 if (xrcd)
1998 put_xrcd_read(xrcd_uobj); 2056 put_xrcd_read(xrcd_uobj);
@@ -2519,6 +2577,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
2519 if (ret) 2577 if (ret)
2520 return ret; 2578 return ret;
2521 2579
2580 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
2581
2522 if (obj->uxrcd) 2582 if (obj->uxrcd)
2523 atomic_dec(&obj->uxrcd->refcnt); 2583 atomic_dec(&obj->uxrcd->refcnt);
2524 2584
@@ -2970,11 +3030,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
2970 memset(&attr.dmac, 0, sizeof(attr.dmac)); 3030 memset(&attr.dmac, 0, sizeof(attr.dmac));
2971 memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); 3031 memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
2972 3032
3033 ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
3034 RDMACG_RESOURCE_HCA_OBJECT);
3035 if (ret)
3036 goto err_charge;
3037
2973 ah = pd->device->create_ah(pd, &attr, &udata); 3038 ah = pd->device->create_ah(pd, &attr, &udata);
2974 3039
2975 if (IS_ERR(ah)) { 3040 if (IS_ERR(ah)) {
2976 ret = PTR_ERR(ah); 3041 ret = PTR_ERR(ah);
2977 goto err_put; 3042 goto err_create;
2978 } 3043 }
2979 3044
2980 ah->device = pd->device; 3045 ah->device = pd->device;
@@ -3013,7 +3078,10 @@ err_copy:
3013err_destroy: 3078err_destroy:
3014 ib_destroy_ah(ah); 3079 ib_destroy_ah(ah);
3015 3080
3016err_put: 3081err_create:
3082 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
3083
3084err_charge:
3017 put_pd_read(pd); 3085 put_pd_read(pd);
3018 3086
3019err: 3087err:
@@ -3047,6 +3115,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
3047 if (ret) 3115 if (ret)
3048 return ret; 3116 return ret;
3049 3117
3118 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
3119
3050 idr_remove_uobj(&ib_uverbs_ah_idr, uobj); 3120 idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
3051 3121
3052 mutex_lock(&file->mutex); 3122 mutex_lock(&file->mutex);
@@ -3861,10 +3931,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
3861 err = -EINVAL; 3931 err = -EINVAL;
3862 goto err_free; 3932 goto err_free;
3863 } 3933 }
3934
3935 err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
3936 RDMACG_RESOURCE_HCA_OBJECT);
3937 if (err)
3938 goto err_free;
3939
3864 flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); 3940 flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
3865 if (IS_ERR(flow_id)) { 3941 if (IS_ERR(flow_id)) {
3866 err = PTR_ERR(flow_id); 3942 err = PTR_ERR(flow_id);
3867 goto err_free; 3943 goto err_create;
3868 } 3944 }
3869 flow_id->uobject = uobj; 3945 flow_id->uobject = uobj;
3870 uobj->object = flow_id; 3946 uobj->object = flow_id;
@@ -3897,6 +3973,8 @@ err_copy:
3897 idr_remove_uobj(&ib_uverbs_rule_idr, uobj); 3973 idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
3898destroy_flow: 3974destroy_flow:
3899 ib_destroy_flow(flow_id); 3975 ib_destroy_flow(flow_id);
3976err_create:
3977 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
3900err_free: 3978err_free:
3901 kfree(flow_attr); 3979 kfree(flow_attr);
3902err_put: 3980err_put:
@@ -3936,8 +4014,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
3936 flow_id = uobj->object; 4014 flow_id = uobj->object;
3937 4015
3938 ret = ib_destroy_flow(flow_id); 4016 ret = ib_destroy_flow(flow_id);
3939 if (!ret) 4017 if (!ret) {
4018 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
4019 RDMACG_RESOURCE_HCA_OBJECT);
3940 uobj->live = 0; 4020 uobj->live = 0;
4021 }
3941 4022
3942 put_uobj_write(uobj); 4023 put_uobj_write(uobj);
3943 4024
@@ -4005,6 +4086,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
4005 obj->uevent.events_reported = 0; 4086 obj->uevent.events_reported = 0;
4006 INIT_LIST_HEAD(&obj->uevent.event_list); 4087 INIT_LIST_HEAD(&obj->uevent.event_list);
4007 4088
4089 ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
4090 RDMACG_RESOURCE_HCA_OBJECT);
4091 if (ret)
4092 goto err_put_cq;
4093
4008 srq = pd->device->create_srq(pd, &attr, udata); 4094 srq = pd->device->create_srq(pd, &attr, udata);
4009 if (IS_ERR(srq)) { 4095 if (IS_ERR(srq)) {
4010 ret = PTR_ERR(srq); 4096 ret = PTR_ERR(srq);
@@ -4069,6 +4155,8 @@ err_destroy:
4069 ib_destroy_srq(srq); 4155 ib_destroy_srq(srq);
4070 4156
4071err_put: 4157err_put:
4158 ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
4159 RDMACG_RESOURCE_HCA_OBJECT);
4072 put_pd_read(pd); 4160 put_pd_read(pd);
4073 4161
4074err_put_cq: 4162err_put_cq:
@@ -4255,6 +4343,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
4255 if (ret) 4343 if (ret)
4256 return ret; 4344 return ret;
4257 4345
4346 ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
4347
4258 if (srq_type == IB_SRQT_XRC) { 4348 if (srq_type == IB_SRQT_XRC) {
4259 us = container_of(obj, struct ib_usrq_object, uevent); 4349 us = container_of(obj, struct ib_usrq_object, uevent);
4260 atomic_dec(&us->uxrcd->refcnt); 4350 atomic_dec(&us->uxrcd->refcnt);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e3fb4b1af1ad..35c788a32e26 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
51#include <rdma/ib.h> 51#include <rdma/ib.h>
52 52
53#include "uverbs.h" 53#include "uverbs.h"
54#include "core_priv.h"
54 55
55MODULE_AUTHOR("Roland Dreier"); 56MODULE_AUTHOR("Roland Dreier");
56MODULE_DESCRIPTION("InfiniBand userspace verbs access"); 57MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
237 238
238 idr_remove_uobj(&ib_uverbs_ah_idr, uobj); 239 idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
239 ib_destroy_ah(ah); 240 ib_destroy_ah(ah);
241 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
242 RDMACG_RESOURCE_HCA_OBJECT);
240 kfree(uobj); 243 kfree(uobj);
241 } 244 }
242 245
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
246 249
247 idr_remove_uobj(&ib_uverbs_mw_idr, uobj); 250 idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
248 uverbs_dealloc_mw(mw); 251 uverbs_dealloc_mw(mw);
252 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
253 RDMACG_RESOURCE_HCA_OBJECT);
249 kfree(uobj); 254 kfree(uobj);
250 } 255 }
251 256
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
254 259
255 idr_remove_uobj(&ib_uverbs_rule_idr, uobj); 260 idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
256 ib_destroy_flow(flow_id); 261 ib_destroy_flow(flow_id);
262 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
263 RDMACG_RESOURCE_HCA_OBJECT);
257 kfree(uobj); 264 kfree(uobj);
258 } 265 }
259 266
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
266 if (qp == qp->real_qp) 273 if (qp == qp->real_qp)
267 ib_uverbs_detach_umcast(qp, uqp); 274 ib_uverbs_detach_umcast(qp, uqp);
268 ib_destroy_qp(qp); 275 ib_destroy_qp(qp);
276 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
277 RDMACG_RESOURCE_HCA_OBJECT);
269 ib_uverbs_release_uevent(file, &uqp->uevent); 278 ib_uverbs_release_uevent(file, &uqp->uevent);
270 kfree(uqp); 279 kfree(uqp);
271 } 280 }
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
298 307
299 idr_remove_uobj(&ib_uverbs_srq_idr, uobj); 308 idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
300 ib_destroy_srq(srq); 309 ib_destroy_srq(srq);
310 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
311 RDMACG_RESOURCE_HCA_OBJECT);
301 ib_uverbs_release_uevent(file, uevent); 312 ib_uverbs_release_uevent(file, uevent);
302 kfree(uevent); 313 kfree(uevent);
303 } 314 }
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
310 321
311 idr_remove_uobj(&ib_uverbs_cq_idr, uobj); 322 idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
312 ib_destroy_cq(cq); 323 ib_destroy_cq(cq);
324 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
325 RDMACG_RESOURCE_HCA_OBJECT);
313 ib_uverbs_release_ucq(file, ev_file, ucq); 326 ib_uverbs_release_ucq(file, ev_file, ucq);
314 kfree(ucq); 327 kfree(ucq);
315 } 328 }
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
319 332
320 idr_remove_uobj(&ib_uverbs_mr_idr, uobj); 333 idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
321 ib_dereg_mr(mr); 334 ib_dereg_mr(mr);
335 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
336 RDMACG_RESOURCE_HCA_OBJECT);
322 kfree(uobj); 337 kfree(uobj);
323 } 338 }
324 339
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
339 354
340 idr_remove_uobj(&ib_uverbs_pd_idr, uobj); 355 idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
341 ib_dealloc_pd(pd); 356 ib_dealloc_pd(pd);
357 ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
358 RDMACG_RESOURCE_HCA_OBJECT);
342 kfree(uobj); 359 kfree(uobj);
343 } 360 }
344 361
345 put_pid(context->tgid); 362 put_pid(context->tgid);
346 363
364 ib_rdmacg_uncharge(&context->cg_obj, context->device,
365 RDMACG_RESOURCE_HCA_HANDLE);
366
347 return context->device->dealloc_ucontext(context); 367 return context->device->dealloc_ucontext(context);
348} 368}
349 369
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 439b946c4808..db5900aaa55a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -478,7 +478,7 @@ static void kernfs_drain(struct kernfs_node *kn)
478 rwsem_release(&kn->dep_map, 1, _RET_IP_); 478 rwsem_release(&kn->dep_map, 1, _RET_IP_);
479 } 479 }
480 480
481 kernfs_unmap_bin_file(kn); 481 kernfs_drain_open_files(kn);
482 482
483 mutex_lock(&kernfs_mutex); 483 mutex_lock(&kernfs_mutex);
484} 484}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 4f0535890b30..35043a8c4529 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -515,7 +515,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
515 goto out_put; 515 goto out_put;
516 516
517 rc = 0; 517 rc = 0;
518 of->mmapped = 1; 518 of->mmapped = true;
519 of->vm_ops = vma->vm_ops; 519 of->vm_ops = vma->vm_ops;
520 vma->vm_ops = &kernfs_vm_ops; 520 vma->vm_ops = &kernfs_vm_ops;
521out_put: 521out_put:
@@ -707,7 +707,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
707 if (error) 707 if (error)
708 goto err_free; 708 goto err_free;
709 709
710 ((struct seq_file *)file->private_data)->private = of; 710 of->seq_file = file->private_data;
711 of->seq_file->private = of;
711 712
712 /* seq_file clears PWRITE unconditionally, restore it if WRITE */ 713 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
713 if (file->f_mode & FMODE_WRITE) 714 if (file->f_mode & FMODE_WRITE)
@@ -716,13 +717,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
716 /* make sure we have open node struct */ 717 /* make sure we have open node struct */
717 error = kernfs_get_open_node(kn, of); 718 error = kernfs_get_open_node(kn, of);
718 if (error) 719 if (error)
719 goto err_close; 720 goto err_seq_release;
721
722 if (ops->open) {
723 /* nobody has access to @of yet, skip @of->mutex */
724 error = ops->open(of);
725 if (error)
726 goto err_put_node;
727 }
720 728
721 /* open succeeded, put active references */ 729 /* open succeeded, put active references */
722 kernfs_put_active(kn); 730 kernfs_put_active(kn);
723 return 0; 731 return 0;
724 732
725err_close: 733err_put_node:
734 kernfs_put_open_node(kn, of);
735err_seq_release:
726 seq_release(inode, file); 736 seq_release(inode, file);
727err_free: 737err_free:
728 kfree(of->prealloc_buf); 738 kfree(of->prealloc_buf);
@@ -732,11 +742,41 @@ err_out:
732 return error; 742 return error;
733} 743}
734 744
745/* used from release/drain to ensure that ->release() is called exactly once */
746static void kernfs_release_file(struct kernfs_node *kn,
747 struct kernfs_open_file *of)
748{
749 /*
750 * @of is guaranteed to have no other file operations in flight and
751 * we just want to synchronize release and drain paths.
752 * @kernfs_open_file_mutex is enough. @of->mutex can't be used
753 * here because drain path may be called from places which can
754 * cause circular dependency.
755 */
756 lockdep_assert_held(&kernfs_open_file_mutex);
757
758 if (!of->released) {
759 /*
760 * A file is never detached without being released and we
761 * need to be able to release files which are deactivated
762 * and being drained. Don't use kernfs_ops().
763 */
764 kn->attr.ops->release(of);
765 of->released = true;
766 }
767}
768
735static int kernfs_fop_release(struct inode *inode, struct file *filp) 769static int kernfs_fop_release(struct inode *inode, struct file *filp)
736{ 770{
737 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; 771 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
738 struct kernfs_open_file *of = kernfs_of(filp); 772 struct kernfs_open_file *of = kernfs_of(filp);
739 773
774 if (kn->flags & KERNFS_HAS_RELEASE) {
775 mutex_lock(&kernfs_open_file_mutex);
776 kernfs_release_file(kn, of);
777 mutex_unlock(&kernfs_open_file_mutex);
778 }
779
740 kernfs_put_open_node(kn, of); 780 kernfs_put_open_node(kn, of);
741 seq_release(inode, filp); 781 seq_release(inode, filp);
742 kfree(of->prealloc_buf); 782 kfree(of->prealloc_buf);
@@ -745,12 +785,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
745 return 0; 785 return 0;
746} 786}
747 787
748void kernfs_unmap_bin_file(struct kernfs_node *kn) 788void kernfs_drain_open_files(struct kernfs_node *kn)
749{ 789{
750 struct kernfs_open_node *on; 790 struct kernfs_open_node *on;
751 struct kernfs_open_file *of; 791 struct kernfs_open_file *of;
752 792
753 if (!(kn->flags & KERNFS_HAS_MMAP)) 793 if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
754 return; 794 return;
755 795
756 spin_lock_irq(&kernfs_open_node_lock); 796 spin_lock_irq(&kernfs_open_node_lock);
@@ -762,10 +802,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
762 return; 802 return;
763 803
764 mutex_lock(&kernfs_open_file_mutex); 804 mutex_lock(&kernfs_open_file_mutex);
805
765 list_for_each_entry(of, &on->files, list) { 806 list_for_each_entry(of, &on->files, list) {
766 struct inode *inode = file_inode(of->file); 807 struct inode *inode = file_inode(of->file);
767 unmap_mapping_range(inode->i_mapping, 0, 0, 1); 808
809 if (kn->flags & KERNFS_HAS_MMAP)
810 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
811
812 kernfs_release_file(kn, of);
768 } 813 }
814
769 mutex_unlock(&kernfs_open_file_mutex); 815 mutex_unlock(&kernfs_open_file_mutex);
770 816
771 kernfs_put_open_node(kn, NULL); 817 kernfs_put_open_node(kn, NULL);
@@ -964,6 +1010,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
964 kn->flags |= KERNFS_HAS_SEQ_SHOW; 1010 kn->flags |= KERNFS_HAS_SEQ_SHOW;
965 if (ops->mmap) 1011 if (ops->mmap)
966 kn->flags |= KERNFS_HAS_MMAP; 1012 kn->flags |= KERNFS_HAS_MMAP;
1013 if (ops->release)
1014 kn->flags |= KERNFS_HAS_RELEASE;
967 1015
968 rc = kernfs_add_one(kn); 1016 rc = kernfs_add_one(kn);
969 if (rc) { 1017 if (rc) {
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index bfd551bbf231..3100987cf8ba 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
104 */ 104 */
105extern const struct file_operations kernfs_file_fops; 105extern const struct file_operations kernfs_file_fops;
106 106
107void kernfs_unmap_bin_file(struct kernfs_node *kn); 107void kernfs_drain_open_files(struct kernfs_node *kn);
108 108
109/* 109/*
110 * symlink.c 110 * symlink.c
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 861b4677fc5b..3c02404cfce9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
148 * set for a task. 148 * set for a task.
149 */ 149 */
150struct css_set { 150struct css_set {
151 /* Reference count */
152 atomic_t refcount;
153
154 /* 151 /*
155 * List running through all cgroup groups in the same hash 152 * Set of subsystem states, one for each subsystem. This array is
156 * slot. Protected by css_set_lock 153 * immutable after creation apart from the init_css_set during
154 * subsystem registration (at boot time).
157 */ 155 */
158 struct hlist_node hlist; 156 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
157
158 /* reference count */
159 atomic_t refcount;
160
161 /* the default cgroup associated with this css_set */
162 struct cgroup *dfl_cgrp;
159 163
160 /* 164 /*
161 * Lists running through all tasks using this cgroup group. 165 * Lists running through all tasks using this cgroup group.
@@ -167,21 +171,29 @@ struct css_set {
167 struct list_head tasks; 171 struct list_head tasks;
168 struct list_head mg_tasks; 172 struct list_head mg_tasks;
169 173
174 /* all css_task_iters currently walking this cset */
175 struct list_head task_iters;
176
170 /* 177 /*
171 * List of cgrp_cset_links pointing at cgroups referenced from this 178 * On the default hierarhcy, ->subsys[ssid] may point to a css
172 * css_set. Protected by css_set_lock. 179 * attached to an ancestor instead of the cgroup this css_set is
180 * associated with. The following node is anchored at
181 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
182 * iterate through all css's attached to a given cgroup.
173 */ 183 */
174 struct list_head cgrp_links; 184 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
175 185
176 /* the default cgroup associated with this css_set */ 186 /*
177 struct cgroup *dfl_cgrp; 187 * List running through all cgroup groups in the same hash
188 * slot. Protected by css_set_lock
189 */
190 struct hlist_node hlist;
178 191
179 /* 192 /*
180 * Set of subsystem states, one for each subsystem. This array is 193 * List of cgrp_cset_links pointing at cgroups referenced from this
181 * immutable after creation apart from the init_css_set during 194 * css_set. Protected by css_set_lock.
182 * subsystem registration (at boot time).
183 */ 195 */
184 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 196 struct list_head cgrp_links;
185 197
186 /* 198 /*
187 * List of csets participating in the on-going migration either as 199 * List of csets participating in the on-going migration either as
@@ -201,18 +213,6 @@ struct css_set {
201 struct cgroup *mg_dst_cgrp; 213 struct cgroup *mg_dst_cgrp;
202 struct css_set *mg_dst_cset; 214 struct css_set *mg_dst_cset;
203 215
204 /*
205 * On the default hierarhcy, ->subsys[ssid] may point to a css
206 * attached to an ancestor instead of the cgroup this css_set is
207 * associated with. The following node is anchored at
208 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
209 * iterate through all css's attached to a given cgroup.
210 */
211 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
212
213 /* all css_task_iters currently walking this cset */
214 struct list_head task_iters;
215
216 /* dead and being drained, ignore for migration */ 216 /* dead and being drained, ignore for migration */
217 bool dead; 217 bool dead;
218 218
@@ -388,6 +388,9 @@ struct cftype {
388 struct list_head node; /* anchored at ss->cfts */ 388 struct list_head node; /* anchored at ss->cfts */
389 struct kernfs_ops *kf_ops; 389 struct kernfs_ops *kf_ops;
390 390
391 int (*open)(struct kernfs_open_file *of);
392 void (*release)(struct kernfs_open_file *of);
393
391 /* 394 /*
392 * read_u64() is a shortcut for the common case of returning a 395 * read_u64() is a shortcut for the common case of returning a
393 * single integer. Use it in place of read() 396 * single integer. Use it in place of read()
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c83c23f0577b..f6b43fbb141c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
266 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset 266 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
267 * @leader: the loop cursor 267 * @leader: the loop cursor
268 * @dst_css: the destination css 268 * @dst_css: the destination css
269 * @tset: takset to iterate 269 * @tset: taskset to iterate
270 * 270 *
271 * Iterate threadgroup leaders of @tset. For single-task migrations, @tset 271 * Iterate threadgroup leaders of @tset. For single-task migrations, @tset
272 * may not contain any. 272 * may not contain any.
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 000000000000..e94290b29e99
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
1/*
2 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
3 *
4 * This file is subject to the terms and conditions of version 2 of the GNU
5 * General Public License. See the file COPYING in the main directory of the
6 * Linux distribution for more details.
7 */
8
9#ifndef _CGROUP_RDMA_H
10#define _CGROUP_RDMA_H
11
12#include <linux/cgroup.h>
13
14enum rdmacg_resource_type {
15 RDMACG_RESOURCE_HCA_HANDLE,
16 RDMACG_RESOURCE_HCA_OBJECT,
17 RDMACG_RESOURCE_MAX,
18};
19
20#ifdef CONFIG_CGROUP_RDMA
21
22struct rdma_cgroup {
23 struct cgroup_subsys_state css;
24
25 /*
26 * head to keep track of all resource pools
 27	 * that belong to this cgroup.
28 */
29 struct list_head rpools;
30};
31
32struct rdmacg_device {
33 struct list_head dev_node;
34 struct list_head rpools;
35 char *name;
36};
37
38/*
39 * APIs for RDMA/IB stack to publish when a device wants to
40 * participate in resource accounting
41 */
42int rdmacg_register_device(struct rdmacg_device *device);
43void rdmacg_unregister_device(struct rdmacg_device *device);
44
45/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
46int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
47 struct rdmacg_device *device,
48 enum rdmacg_resource_type index);
49void rdmacg_uncharge(struct rdma_cgroup *cg,
50 struct rdmacg_device *device,
51 enum rdmacg_resource_type index);
52#endif /* CONFIG_CGROUP_RDMA */
53#endif /* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..d0e597c44585 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
56SUBSYS(pids) 56SUBSYS(pids)
57#endif 57#endif
58 58
59#if IS_ENABLED(CONFIG_CGROUP_RDMA)
60SUBSYS(rdma)
61#endif
62
59/* 63/*
60 * The following subsystems are not supported on the default hierarchy. 64 * The following subsystems are not supported on the default hierarchy.
61 */ 65 */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 7056238fd9f5..a9b11b8d06f2 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
46 KERNFS_SUICIDAL = 0x0400, 46 KERNFS_SUICIDAL = 0x0400,
47 KERNFS_SUICIDED = 0x0800, 47 KERNFS_SUICIDED = 0x0800,
48 KERNFS_EMPTY_DIR = 0x1000, 48 KERNFS_EMPTY_DIR = 0x1000,
49 KERNFS_HAS_RELEASE = 0x2000,
49}; 50};
50 51
51/* @flags for kernfs_create_root() */ 52/* @flags for kernfs_create_root() */
@@ -175,6 +176,7 @@ struct kernfs_open_file {
175 /* published fields */ 176 /* published fields */
176 struct kernfs_node *kn; 177 struct kernfs_node *kn;
177 struct file *file; 178 struct file *file;
179 struct seq_file *seq_file;
178 void *priv; 180 void *priv;
179 181
180 /* private fields, do not use outside kernfs proper */ 182 /* private fields, do not use outside kernfs proper */
@@ -185,12 +187,20 @@ struct kernfs_open_file {
185 char *prealloc_buf; 187 char *prealloc_buf;
186 188
187 size_t atomic_write_len; 189 size_t atomic_write_len;
188 bool mmapped; 190 bool mmapped:1;
191 bool released:1;
189 const struct vm_operations_struct *vm_ops; 192 const struct vm_operations_struct *vm_ops;
190}; 193};
191 194
192struct kernfs_ops { 195struct kernfs_ops {
193 /* 196 /*
197 * Optional open/release methods. Both are called with
198 * @of->seq_file populated.
199 */
200 int (*open)(struct kernfs_open_file *of);
201 void (*release)(struct kernfs_open_file *of);
202
203 /*
194 * Read is handled by either seq_file or raw_read(). 204 * Read is handled by either seq_file or raw_read().
195 * 205 *
196 * If seq_show() is present, seq_file path is active. Other seq 206 * If seq_show() is present, seq_file path is active. Other seq
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d84849c5dc05..0f1813c13687 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -60,6 +60,7 @@
60#include <linux/atomic.h> 60#include <linux/atomic.h>
61#include <linux/mmu_notifier.h> 61#include <linux/mmu_notifier.h>
62#include <linux/uaccess.h> 62#include <linux/uaccess.h>
63#include <linux/cgroup_rdma.h>
63 64
64extern struct workqueue_struct *ib_wq; 65extern struct workqueue_struct *ib_wq;
65extern struct workqueue_struct *ib_comp_wq; 66extern struct workqueue_struct *ib_comp_wq;
@@ -1356,6 +1357,12 @@ struct ib_fmr_attr {
1356 1357
1357struct ib_umem; 1358struct ib_umem;
1358 1359
1360struct ib_rdmacg_object {
1361#ifdef CONFIG_CGROUP_RDMA
1362 struct rdma_cgroup *cg; /* owner rdma cgroup */
1363#endif
1364};
1365
1359struct ib_ucontext { 1366struct ib_ucontext {
1360 struct ib_device *device; 1367 struct ib_device *device;
1361 struct list_head pd_list; 1368 struct list_head pd_list;
@@ -1388,6 +1395,8 @@ struct ib_ucontext {
1388 struct list_head no_private_counters; 1395 struct list_head no_private_counters;
1389 int odp_mrs_count; 1396 int odp_mrs_count;
1390#endif 1397#endif
1398
1399 struct ib_rdmacg_object cg_obj;
1391}; 1400};
1392 1401
1393struct ib_uobject { 1402struct ib_uobject {
@@ -1395,6 +1404,7 @@ struct ib_uobject {
1395 struct ib_ucontext *context; /* associated user context */ 1404 struct ib_ucontext *context; /* associated user context */
1396 void *object; /* containing object */ 1405 void *object; /* containing object */
1397 struct list_head list; /* link to context's list */ 1406 struct list_head list; /* link to context's list */
1407 struct ib_rdmacg_object cg_obj; /* rdmacg object */
1398 int id; /* index into kernel idr */ 1408 int id; /* index into kernel idr */
1399 struct kref ref; 1409 struct kref ref;
1400 struct rw_semaphore mutex; /* protects .live */ 1410 struct rw_semaphore mutex; /* protects .live */
@@ -2128,6 +2138,10 @@ struct ib_device {
2128 struct attribute_group *hw_stats_ag; 2138 struct attribute_group *hw_stats_ag;
2129 struct rdma_hw_stats *hw_stats; 2139 struct rdma_hw_stats *hw_stats;
2130 2140
2141#ifdef CONFIG_CGROUP_RDMA
2142 struct rdmacg_device cg_device;
2143#endif
2144
2131 /** 2145 /**
2132 * The following mandatory functions are used only at device 2146 * The following mandatory functions are used only at device
2133 * registration. Keep functions such as these at the end of this 2147 * registration. Keep functions such as these at the end of this
diff --git a/init/Kconfig b/init/Kconfig
index 8c39615165b7..a92f27da4a27 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1078,6 +1078,16 @@ config CGROUP_PIDS
1078 since the PIDs limit only affects a process's ability to fork, not to 1078 since the PIDs limit only affects a process's ability to fork, not to
1079 attach to a cgroup. 1079 attach to a cgroup.
1080 1080
1081config CGROUP_RDMA
1082 bool "RDMA controller"
1083 help
1084	  Provides enforcement of RDMA resources defined by the IB stack.
1085	  It is fairly easy for consumers to exhaust RDMA resources, which
1086	  can result in resource unavailability to other consumers.
1087 RDMA controller is designed to stop this from happening.
1088 Attaching processes with active RDMA resources to the cgroup
1089	  hierarchy is allowed even if it can cross the hierarchy's limit.
1090
1081config CGROUP_FREEZER 1091config CGROUP_FREEZER
1082 bool "Freezer controller" 1092 bool "Freezer controller"
1083 help 1093 help
diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f769c6..b302b4731d16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
64obj-$(CONFIG_KEXEC_FILE) += kexec_file.o 64obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
65obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 65obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
66obj-$(CONFIG_COMPAT) += compat.o 66obj-$(CONFIG_COMPAT) += compat.o
67obj-$(CONFIG_CGROUPS) += cgroup.o 67obj-$(CONFIG_CGROUPS) += cgroup/
68obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
69obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
70obj-$(CONFIG_CPUSETS) += cpuset.o
71obj-$(CONFIG_UTS_NS) += utsname.o 68obj-$(CONFIG_UTS_NS) += utsname.o
72obj-$(CONFIG_USER_NS) += user_namespace.o 69obj-$(CONFIG_USER_NS) += user_namespace.o
73obj-$(CONFIG_PID_NS) += pid_namespace.o 70obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 000000000000..387348a40c64
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
1obj-y := cgroup.o namespace.o cgroup-v1.o
2
3obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
4obj-$(CONFIG_CGROUP_PIDS) += pids.o
5obj-$(CONFIG_CGROUP_RDMA) += rdma.o
6obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..9203bfb05603
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
1#ifndef __CGROUP_INTERNAL_H
2#define __CGROUP_INTERNAL_H
3
4#include <linux/cgroup.h>
5#include <linux/kernfs.h>
6#include <linux/workqueue.h>
7#include <linux/list.h>
8
9/*
10 * A cgroup can be associated with multiple css_sets as different tasks may
11 * belong to different cgroups on different hierarchies. In the other
12 * direction, a css_set is naturally associated with multiple cgroups.
13 * This M:N relationship is represented by the following link structure
14 * which exists for each association and allows traversing the associations
15 * from both sides.
16 */
17struct cgrp_cset_link {
18 /* the cgroup and css_set this link associates */
19 struct cgroup *cgrp;
20 struct css_set *cset;
21
22 /* list of cgrp_cset_links anchored at cgrp->cset_links */
23 struct list_head cset_link;
24
25 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
26 struct list_head cgrp_link;
27};
28
29/* used to track tasks and csets during migration */
30struct cgroup_taskset {
31 /* the src and dst cset list running through cset->mg_node */
32 struct list_head src_csets;
33 struct list_head dst_csets;
34
35 /* the subsys currently being processed */
36 int ssid;
37
38 /*
39 * Fields for cgroup_taskset_*() iteration.
40 *
41 * Before migration is committed, the target migration tasks are on
42 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
43 * the csets on ->dst_csets. ->csets point to either ->src_csets
44 * or ->dst_csets depending on whether migration is committed.
45 *
46 * ->cur_csets and ->cur_task point to the current task position
47 * during iteration.
48 */
49 struct list_head *csets;
50 struct css_set *cur_cset;
51 struct task_struct *cur_task;
52};
53
54/* migration context also tracks preloading */
55struct cgroup_mgctx {
56 /*
57 * Preloaded source and destination csets. Used to guarantee
58 * atomic success or failure on actual migration.
59 */
60 struct list_head preloaded_src_csets;
61 struct list_head preloaded_dst_csets;
62
63 /* tasks and csets to migrate */
64 struct cgroup_taskset tset;
65
66 /* subsystems affected by migration */
67 u16 ss_mask;
68};
69
70#define CGROUP_TASKSET_INIT(tset) \
71{ \
72 .src_csets = LIST_HEAD_INIT(tset.src_csets), \
73 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
74 .csets = &tset.src_csets, \
75}
76
77#define CGROUP_MGCTX_INIT(name) \
78{ \
79 LIST_HEAD_INIT(name.preloaded_src_csets), \
80 LIST_HEAD_INIT(name.preloaded_dst_csets), \
81 CGROUP_TASKSET_INIT(name.tset), \
82}
83
84#define DEFINE_CGROUP_MGCTX(name) \
85 struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
86
87struct cgroup_sb_opts {
88 u16 subsys_mask;
89 unsigned int flags;
90 char *release_agent;
91 bool cpuset_clone_children;
92 char *name;
93 /* User explicitly requested empty subsystem */
94 bool none;
95};
96
97extern struct mutex cgroup_mutex;
98extern spinlock_t css_set_lock;
99extern struct cgroup_subsys *cgroup_subsys[];
100extern struct list_head cgroup_roots;
101extern struct file_system_type cgroup_fs_type;
102
103/* iterate across the hierarchies */
104#define for_each_root(root) \
105 list_for_each_entry((root), &cgroup_roots, root_list)
106
107/**
108 * for_each_subsys - iterate all enabled cgroup subsystems
109 * @ss: the iteration cursor
110 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
111 */
112#define for_each_subsys(ss, ssid) \
113 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
114 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
115
116static inline bool cgroup_is_dead(const struct cgroup *cgrp)
117{
118 return !(cgrp->self.flags & CSS_ONLINE);
119}
120
121static inline bool notify_on_release(const struct cgroup *cgrp)
122{
123 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
124}
125
126void put_css_set_locked(struct css_set *cset);
127
128static inline void put_css_set(struct css_set *cset)
129{
130 unsigned long flags;
131
132 /*
133 * Ensure that the refcount doesn't hit zero while any readers
134 * can see it. Similar to atomic_dec_and_lock(), but for an
135 * rwlock
136 */
137 if (atomic_add_unless(&cset->refcount, -1, 1))
138 return;
139
140 spin_lock_irqsave(&css_set_lock, flags);
141 put_css_set_locked(cset);
142 spin_unlock_irqrestore(&css_set_lock, flags);
143}
144
145/*
146 * refcounted get/put for css_set objects
147 */
148static inline void get_css_set(struct css_set *cset)
149{
150 atomic_inc(&cset->refcount);
151}
152
153bool cgroup_ssid_enabled(int ssid);
154bool cgroup_on_dfl(const struct cgroup *cgrp);
155
156struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
157struct cgroup *task_cgroup_from_root(struct task_struct *task,
158 struct cgroup_root *root);
159struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
160void cgroup_kn_unlock(struct kernfs_node *kn);
161int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
162 struct cgroup_namespace *ns);
163
164void cgroup_free_root(struct cgroup_root *root);
165void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
166int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
167int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
168struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
169 struct cgroup_root *root, unsigned long magic,
170 struct cgroup_namespace *ns);
171
172bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
173void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
174void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
175 struct cgroup_mgctx *mgctx);
176int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
177int cgroup_migrate(struct task_struct *leader, bool threadgroup,
178 struct cgroup_mgctx *mgctx);
179
180int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
181 bool threadgroup);
182ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
183 size_t nbytes, loff_t off, bool threadgroup);
184ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
185 loff_t off);
186
187void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
188
189int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
190int cgroup_rmdir(struct kernfs_node *kn);
191int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
192 struct kernfs_root *kf_root);
193
194/*
195 * namespace.c
196 */
197extern const struct proc_ns_operations cgroupns_operations;
198
199/*
200 * cgroup-v1.c
201 */
202extern struct cftype cgroup1_base_files[];
203extern const struct file_operations proc_cgroupstats_operations;
204extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
205
206bool cgroup1_ssid_disabled(int ssid);
207void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
208void cgroup1_release_agent(struct work_struct *work);
209void cgroup1_check_for_release(struct cgroup *cgrp);
210struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
211 void *data, unsigned long magic,
212 struct cgroup_namespace *ns);
213
214#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..fc34bcf2329f
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1395 @@
1#include "cgroup-internal.h"
2
3#include <linux/ctype.h>
4#include <linux/kmod.h>
5#include <linux/sort.h>
6#include <linux/delay.h>
7#include <linux/mm.h>
8#include <linux/slab.h>
9#include <linux/vmalloc.h>
10#include <linux/delayacct.h>
11#include <linux/pid_namespace.h>
12#include <linux/cgroupstats.h>
13
14#include <trace/events/cgroup.h>
15
16/*
17 * pidlists linger the following amount before being destroyed. The goal
18 * is avoiding frequent destruction in the middle of consecutive read calls
19 * Expiring in the middle is a performance problem not a correctness one.
20 * 1 sec should be enough.
21 */
22#define CGROUP_PIDLIST_DESTROY_DELAY HZ
23
24/* Controllers blocked by the commandline in v1 */
25static u16 cgroup_no_v1_mask;
26
27/*
28 * pidlist destructions need to be flushed on cgroup destruction. Use a
29 * separate workqueue as flush domain.
30 */
31static struct workqueue_struct *cgroup_pidlist_destroy_wq;
32
33/*
34 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
35 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
36 */
37static DEFINE_SPINLOCK(release_agent_path_lock);
38
39bool cgroup1_ssid_disabled(int ssid)
40{
41 return cgroup_no_v1_mask & (1 << ssid);
42}
43
/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 *
 * Walks every mounted hierarchy except the default (v2) one and moves
 * @tsk into the cgroup @from currently occupies on each.  Returns 0 on
 * success or the first error returned by cgroup_attach_task().
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	/* block threadgroup changes for the duration of the migration */
	percpu_down_write(&cgroup_threadgroup_rwsem);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		/* the default hierarchy is not touched by this interface */
		if (root == &cgrp_dfl_root)
			continue;

		/* css_set_lock stabilizes @from's css_set during the lookup */
		spin_lock_irq(&css_set_lock);
		from_cgrp = task_cgroup_from_root(from, root);
		spin_unlock_irq(&css_set_lock);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
76
/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either visible in the source cgroup after the
 * parent's migration is complete or put into the target cgroup.  No task
 * can slip out of migration through forking.
 *
 * Return: 0 on success, -EINVAL if @to is on the default hierarchy,
 * -EBUSY if @to can't accept migrations, or an error from the migration
 * machinery.
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	/* this is a v1-only interface */
	if (cgroup_on_dfl(to))
		return -EINVAL;

	if (!cgroup_may_migrate_to(to))
		return -EBUSY;

	mutex_lock(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &mgctx);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		/* pin the task so it outlives the iterator */
		css_task_iter_start(&from->self, &it);
		task = css_task_iter_next(&it);
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, &mgctx);
			if (!ret)
				trace_cgroup_transfer_tasks(to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
140
/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,	/* "cgroup.procs": tgids */
	CGROUP_FILE_TASKS,	/* "tasks": individual pids */
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids (pids or tgids depending on key.type) */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction (see CGROUP_PIDLIST_DESTROY_DELAY) */
	struct delayed_work destroy_dwork;
};
180
181/*
182 * The following two functions "fix" the issue where there are more pids
183 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
184 * TODO: replace with a kernel-wide solution to this problem
185 */
186#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
187static void *pidlist_allocate(int count)
188{
189 if (PIDLIST_TOO_LARGE(count))
190 return vmalloc(count * sizeof(pid_t));
191 else
192 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
193}
194
195static void pidlist_free(void *p)
196{
197 kvfree(p);
198}
199
/*
 * Used to destroy all pidlists lingering waiting for destroy timer. None
 * should be left afterwards.
 */
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	/* expedite every pending destroy_dwork to run immediately */
	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	/* wait for them to finish; the list must then be empty */
	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}
216
/* delayed-work callback: free a lingering pidlist unless it got requeued */
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	/* free outside the mutex; kfree(NULL) is a no-op when requeued */
	kfree(tofree);
}
240
/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int read, write;

	/* zero- and one-element lists are trivially duplicate-free */
	if (length == 0 || length == 1)
		return length;

	/*
	 * Element 0 always stays in place.  @write is the compacted
	 * position for the next unique element; copy down every element
	 * that differs from its predecessor.
	 */
	write = 1;
	for (read = 1; read < length; read++) {
		if (list[read] != list[read - 1])
			list[write++] = list[read];
	}
	return write;
}
270
/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco. As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer. As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 */
/* sort() comparator for pid_t arrays */
static int cmppid(const void *a, const void *b)
{
	pid_t x = *(const pid_t *)a;
	pid_t y = *(const pid_t *)b;

	/*
	 * Compare without subtracting: x - y can overflow for extreme
	 * values of a signed pid_t.  The sort only needs the sign.
	 */
	return (x > y) - (x < y);
}
284
/*
 * Find the pidlist for @type and the caller's pid namespace on @cgrp, or
 * NULL if none exists.  Caller must hold cgrp->pidlist_mutex.
 */
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}
299
/*
 * Find the pidlist for our purpose (given procs vs tasks), creating a new
 * one if necessary.  Returns NULL only on allocation failure.  Caller must
 * hold cgrp->pidlist_mutex throughout; the returned pidlist stays valid
 * while the mutex is held, as delayed destruction can only be queued under
 * the same mutex.
 */
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

	/* entry not found; create a new one */
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l)
		return l;

	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
	l->key.type = type;
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}
330
/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup. The returned number can be
 * higher than the actual number of tasks due to css_set references from
 * namespace roots and temporary usages.
 */
static int cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	/* sum the refcounts of every css_set linked to @cgrp */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += atomic_read(&link->cset->refcount);
	spin_unlock_irq(&css_set_lock);
	return count;
}
350
/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids.
 * On success, publishes the sorted (and, for procs, deduplicated) array
 * on the matching pidlist and stores it in *@lp.  Returns 0 or -ENOMEM.
 * Caller must hold cgrp->pidlist_mutex.
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough. This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	css_task_iter_start(&cgrp->self, &it);
	while ((tsk = css_task_iter_next(&it))) {
		/* stop early if more tasks showed up than we allocated for */
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);

	/* find or create the pidlist to publish the array on */
	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}
409
/*
 * seq_file methods for the tasks/procs files. The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 */

static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	/*
	 * Held until cgroup_pidlist_stop().  This also covers the error
	 * return below: seq_file calls ->stop() even after a failed
	 * ->start(), which then performs the unlock.
	 */
	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open. If the matching pidlist is around, we can use that.
	 * Look for it. Note that @of->priv can't be used directly. It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween. Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		/* binary search: entry == pid, else the first entry > pid */
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}
476
/*
 * seq_stop callback: schedule delayed destruction of the pidlist so that
 * consecutive read()s can reuse it, then drop the pidlist_mutex acquired
 * in cgroup_pidlist_start().  When start() failed, @of->priv is NULL and
 * only the unlock happens.
 */
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;

	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
487
488static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
489{
490 struct kernfs_open_file *of = s->private;
491 struct cgroup_pidlist *l = of->priv;
492 pid_t *p = v;
493 pid_t *end = l->list + l->length;
494 /*
495 * Advance to the next pid in the array. If this goes off the
496 * end, we're done
497 */
498 p++;
499 if (p >= end) {
500 return NULL;
501 } else {
502 *pos = *p;
503 return p;
504 }
505}
506
/* seq_show callback: emit one pid (or tgid) per line */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);

	return 0;
}
513
/* write handler for "tasks": move a single task (not its threadgroup) */
static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, false);
}
519
/* write handler for "release_agent": replace the root's agent path */
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	/* the stored buffer must be able to hold any PATH_MAX string */
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;
	/* serialize against readers which may not hold cgroup_mutex */
	spin_lock(&release_agent_path_lock);
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}
537
/* seq_show handler for "release_agent": print the root's agent path */
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	/* reading the path requires either cgroup_mutex or this spinlock */
	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}
548
/* "cgroup.sane_behavior" always reads 0 on v1; kept for compatibility */
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}
554
/* read_u64 handler: report whether notify_on_release is set */
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}
560
561static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
562 struct cftype *cft, u64 val)
563{
564 if (val)
565 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
566 else
567 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
568 return 0;
569}
570
/* read_u64 handler: report the CGRP_CPUSET_CLONE_CHILDREN flag */
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
576
577static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
578 struct cftype *cft, u64 val)
579{
580 if (val)
581 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
582 else
583 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
584 return 0;
585}
586
/* cgroup core interface files for the legacy hierarchies */
struct cftype cgroup1_base_files[] = {
	{
		/* pidlist-backed listing of member tgids */
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		/* always reads 0; see cgroup_sane_behavior_show() */
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		/* pidlist-backed listing of member pids */
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup_tasks_write,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};
631
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);

	/* one line per subsystem: name, hierarchy id, cgroup count, enabled */
	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->legacy_name, ss->root->hierarchy_id,
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));

	mutex_unlock(&cgroup_mutex);
	return 0;
}
655
/*
 * Single-record seq_file backed by proc_cgroupstats_show().  Presumably
 * registered on a procfs entry elsewhere — registration isn't visible here.
 */
static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
667
/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 *
 * Return: 0 on success, -EINVAL if @dentry isn't a cgroupfs directory,
 * -ENOENT if the cgroup is already dead.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	mutex_lock(&cgroup_mutex);

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe. Let's do the RCU dancing.
	 */
	rcu_read_lock();
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (!cgrp || cgroup_is_dead(cgrp)) {
		rcu_read_unlock();
		mutex_unlock(&cgroup_mutex);
		return -ENOENT;
	}
	rcu_read_unlock();

	/* bucket every member task by its current state */
	css_task_iter_start(&cgrp->self, &it);
	while ((tsk = css_task_iter_next(&it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			/* remaining states: count only tasks waiting on I/O */
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	mutex_unlock(&cgroup_mutex);
	return 0;
}
731
732void cgroup1_check_for_release(struct cgroup *cgrp)
733{
734 if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
735 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
736 schedule_work(&cgrp->release_agent_work);
737}
738
/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence. Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d. The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task. We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
void cgroup1_release_agent(struct work_struct *work)
{
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf = NULL, *agentbuf = NULL;
	char *argv[3], *envp[3];
	int ret;

	mutex_lock(&cgroup_mutex);

	/* snapshot the agent path under cgroup_mutex */
	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out;

	spin_lock_irq(&css_set_lock);
	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	if (ret < 0 || ret >= PATH_MAX)
		goto out;

	argv[0] = agentbuf;
	argv[1] = pathbuf;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	/* drop cgroup_mutex before spawning the usermode helper */
	mutex_unlock(&cgroup_mutex);
	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
	goto out_free;
out:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(agentbuf);
	kfree(pathbuf);
}
801
/*
 * cgroup_rename - Only allow simple rename of directories in place.
 * Returns -ENOTDIR for non-directories and -EIO when the rename would
 * move the cgroup to a different parent.
 */
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			  const char *new_name_str)
{
	struct cgroup *cgrp = kn->priv;
	int ret;

	if (kernfs_type(kn) != KERNFS_DIR)
		return -ENOTDIR;
	if (kn->parent != new_parent)
		return -EIO;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref. kernfs_rename() doesn't require active_ref
	 * protection. Break them before grabbing cgroup_mutex.
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);
	if (!ret)
		trace_cgroup_rename(cgrp);

	mutex_unlock(&cgroup_mutex);

	/* restore active_ref protection in reverse order */
	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
	return ret;
}
836
/* ->show_options callback: emit the hierarchy's v1 mount options */
static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_subsys *ss;
	int ssid;

	/* one option per bound subsystem, by its legacy name */
	for_each_subsys(ss, ssid)
		if (root->subsys_mask & (1 << ssid))
			seq_show_option(seq, ss->legacy_name, NULL);
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");

	/* release_agent_path readers need this spinlock or cgroup_mutex */
	spin_lock(&release_agent_path_lock);
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	spin_unlock(&release_agent_path_lock);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	return 0;
}
863
/*
 * Parse v1 mount options in @data into @opts.  On any return, the caller
 * owns (and must free) opts->release_agent and opts->name.  Returns 0 on
 * success or a negative errno on invalid/unknown options.
 */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
	u16 mask = U16_MAX;
	struct cgroup_subsys *ss;
	int nr_opts = 0;
	int i;

#ifdef CONFIG_CPUSETS
	/* every subsystem except cpuset, for the noprefix check below */
	mask = ~((u16)1 << cpuset_cgrp_id);
#endif

	memset(opts, 0, sizeof(*opts));

	while ((token = strsep(&o, ",")) != NULL) {
		/* NOTE(review): nr_opts is counted but never read in this
		 * function — confirm whether it can be dropped */
		nr_opts++;

		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "none")) {
			/* Explicitly have no subsystems */
			opts->none = true;
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
		if (!strcmp(token, "noprefix")) {
			opts->flags |= CGRP_ROOT_NOPREFIX;
			continue;
		}
		if (!strcmp(token, "clone_children")) {
			opts->cpuset_clone_children = true;
			continue;
		}
		if (!strcmp(token, "xattr")) {
			opts->flags |= CGRP_ROOT_XATTR;
			continue;
		}
		if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent =
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
			const char *name = token + 5;
			/* Can't specify an empty name */
			if (!strlen(name))
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
					      MAX_CGROUP_ROOT_NAMELEN - 1,
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;

			continue;
		}

		/* otherwise the token must name an enabled v1 subsystem */
		for_each_subsys(ss, i) {
			if (strcmp(token, ss->legacy_name))
				continue;
			if (!cgroup_ssid_enabled(i))
				continue;
			if (cgroup1_ssid_disabled(i))
				continue;

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
			opts->subsys_mask |= (1 << i);
			one_ss = true;

			break;
		}
		/* loop ran to completion without a break: unknown option */
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

	/*
	 * If the 'all' option was specified select all the subsystems,
	 * otherwise if 'none', 'name=' and a subsystem name options were
	 * not specified, let's default to 'all'
	 */
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
				opts->subsys_mask |= (1 << i);

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!opts->subsys_mask && !opts->name)
		return -EINVAL;

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
		return -EINVAL;

	/* Can't specify "none" and some subsystems */
	if (opts->subsys_mask && opts->none)
		return -EINVAL;

	return 0;
}
995
/*
 * ->remount_fs callback.  Allows changing the bound subsystems and the
 * release agent on an unpopulated v1 hierarchy; flags and name must stay
 * fixed.  cgroup_lock_and_drain_offline() acquires cgroup_mutex, which is
 * released at out_unlock.
 */
static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	int ret = 0;
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_sb_opts opts;
	u16 added_mask, removed_mask;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);

	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;

	/* Don't allow flags or name to change at remount */
	if ((opts.flags ^ root->flags) ||
	    (opts.name && strcmp(opts.name, root->name))) {
		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
		       opts.flags, opts.name ?: "", root->flags, root->name);
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask);
	if (ret)
		goto out_unlock;

	/* returning removed controllers to the default root must not fail */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));

	if (opts.release_agent) {
		spin_lock(&release_agent_path_lock);
		strcpy(root->release_agent_path, opts.release_agent);
		spin_unlock(&release_agent_path_lock);
	}

	trace_cgroup_remount(root);

 out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
1052
/* kernfs syscall callbacks used by the v1 hierarchies */
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
	.rename = cgroup1_rename,
	.show_options = cgroup1_show_options,
	.remount_fs = cgroup1_remount,
	.mkdir = cgroup_mkdir,
	.rmdir = cgroup_rmdir,
	.show_path = cgroup_show_path,
};
1061
/*
 * cgroup1_mount - mount a cgroup v1 hierarchy
 * @fs_type: filesystem type being mounted
 * @flags: mount flags
 * @data: mount option string
 * @magic: superblock magic for the mount
 * @ns: cgroup namespace of the mounting task
 *
 * Reuses an existing hierarchy root whose name / subsystem set matches the
 * requested options, or creates a new root.  Returns the mounted dentry or
 * an ERR_PTR; may return via restart_syscall() while draining dying roots.
 */
struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			     void *data, unsigned long magic,
			     struct cgroup_namespace *ns)
{
	struct super_block *pinned_sb = NULL;
	struct cgroup_sb_opts opts;
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	struct dentry *dentry;
	int i, ret;

	/* takes cgroup_mutex and waits for offlining csses to drain */
	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		/* a requested subsystem's old root is still dying - retry */
		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

	/* look for an existing root that can be reused for these options */
	for_each_root(root) {
		bool name_match = false;

		/* the default hierarchy is never reused for v1 mounts */
		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		/*
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
		 */
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ns != &init_cgroup_ns) {
		ret = -EPERM;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);

	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
				 CGROUP_SUPER_MAGIC, ns);

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb)
		deactivate_super(pinned_sb);

	return dentry;
}
1209
1210static int __init cgroup1_wq_init(void)
1211{
1212 /*
1213 * Used to destroy pidlists and separate to serve as flush domain.
1214 * Cap @max_active to 1 too.
1215 */
1216 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
1217 0, 1);
1218 BUG_ON(!cgroup_pidlist_destroy_wq);
1219 return 0;
1220}
1221core_initcall(cgroup1_wq_init);
1222
1223static int __init cgroup_no_v1(char *str)
1224{
1225 struct cgroup_subsys *ss;
1226 char *token;
1227 int i;
1228
1229 while ((token = strsep(&str, ",")) != NULL) {
1230 if (!*token)
1231 continue;
1232
1233 if (!strcmp(token, "all")) {
1234 cgroup_no_v1_mask = U16_MAX;
1235 break;
1236 }
1237
1238 for_each_subsys(ss, i) {
1239 if (strcmp(token, ss->name) &&
1240 strcmp(token, ss->legacy_name))
1241 continue;
1242
1243 cgroup_no_v1_mask |= 1 << i;
1244 }
1245 }
1246 return 1;
1247}
1248__setup("cgroup_no_v1=", cgroup_no_v1);
1249
1250
1251#ifdef CONFIG_CGROUP_DEBUG
1252static struct cgroup_subsys_state *
1253debug_css_alloc(struct cgroup_subsys_state *parent_css)
1254{
1255 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
1256
1257 if (!css)
1258 return ERR_PTR(-ENOMEM);
1259
1260 return css;
1261}
1262
/* free a css allocated by debug_css_alloc() */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
1267
/* "taskcount" debug file: number of tasks in this cgroup */
static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return cgroup_task_count(css->cgroup);
}
1273
/*
 * "current_css_set" debug file: the kernel address of the reading task's
 * css_set, returned as a number.  Exposes a kernel pointer, which is why
 * this lives behind CONFIG_CGROUP_DEBUG.
 */
static u64 current_css_set_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}
1279
1280static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
1281 struct cftype *cft)
1282{
1283 u64 count;
1284
1285 rcu_read_lock();
1286 count = atomic_read(&task_css_set(current)->refcount);
1287 rcu_read_unlock();
1288 return count;
1289}
1290
/*
 * "current_css_set_cg_links" debug file: for the reading task's css_set,
 * print one line per linked cgroup with its hierarchy id and name.
 */
static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;
	char *name_buf;

	/* allocate before taking any locks; kmalloc may sleep */
	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name_buf)
		return -ENOMEM;

	/* css_set_lock stabilizes ->cgrp_links; RCU protects ->cgroups */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	cset = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		cgroup_name(c, name_buf, NAME_MAX + 1);
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name_buf);
	}
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);
	kfree(name_buf);
	return 0;
}
1316
1317#define MAX_TASKS_SHOWN_PER_CSS 25
1318static int cgroup_css_links_read(struct seq_file *seq, void *v)
1319{
1320 struct cgroup_subsys_state *css = seq_css(seq);
1321 struct cgrp_cset_link *link;
1322
1323 spin_lock_irq(&css_set_lock);
1324 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
1325 struct css_set *cset = link->cset;
1326 struct task_struct *task;
1327 int count = 0;
1328
1329 seq_printf(seq, "css_set %p\n", cset);
1330
1331 list_for_each_entry(task, &cset->tasks, cg_list) {
1332 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1333 goto overflow;
1334 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1335 }
1336
1337 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
1338 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1339 goto overflow;
1340 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1341 }
1342 continue;
1343 overflow:
1344 seq_puts(seq, " ...\n");
1345 }
1346 spin_unlock_irq(&css_set_lock);
1347 return 0;
1348}
1349
1350static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
1351{
1352 return (!cgroup_is_populated(css->cgroup) &&
1353 !css_has_online_children(&css->cgroup->self));
1354}
1355
/* files exposed by the debug controller, one per read handler above */
static struct cftype debug_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.seq_show = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }	/* terminate */
};
1389
/*
 * The "debug" controller: exposes internal cgroup state through the files
 * above.  Registered only under CONFIG_CGROUP_DEBUG; uses legacy_cftypes,
 * so the files appear on v1 hierarchies only.
 */
struct cgroup_subsys debug_cgrp_subsys = {
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.legacy_cftypes = debug_files,
};
1395#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c
index 53bbca7c4859..e8f87bf9840c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,15 +28,13 @@
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30 30
31#include <linux/cgroup.h> 31#include "cgroup-internal.h"
32
32#include <linux/cred.h> 33#include <linux/cred.h>
33#include <linux/ctype.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/init_task.h> 35#include <linux/init_task.h>
36#include <linux/kernel.h> 36#include <linux/kernel.h>
37#include <linux/list.h>
38#include <linux/magic.h> 37#include <linux/magic.h>
39#include <linux/mm.h>
40#include <linux/mutex.h> 38#include <linux/mutex.h>
41#include <linux/mount.h> 39#include <linux/mount.h>
42#include <linux/pagemap.h> 40#include <linux/pagemap.h>
@@ -47,16 +45,9 @@
47#include <linux/spinlock.h> 45#include <linux/spinlock.h>
48#include <linux/percpu-rwsem.h> 46#include <linux/percpu-rwsem.h>
49#include <linux/string.h> 47#include <linux/string.h>
50#include <linux/sort.h>
51#include <linux/kmod.h>
52#include <linux/delayacct.h>
53#include <linux/cgroupstats.h>
54#include <linux/hashtable.h> 48#include <linux/hashtable.h>
55#include <linux/pid_namespace.h>
56#include <linux/idr.h> 49#include <linux/idr.h>
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/kthread.h> 50#include <linux/kthread.h>
59#include <linux/delay.h>
60#include <linux/atomic.h> 51#include <linux/atomic.h>
61#include <linux/cpuset.h> 52#include <linux/cpuset.h>
62#include <linux/proc_ns.h> 53#include <linux/proc_ns.h>
@@ -67,14 +58,6 @@
67#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
68#include <trace/events/cgroup.h> 59#include <trace/events/cgroup.h>
69 60
70/*
71 * pidlists linger the following amount before being destroyed. The goal
72 * is avoiding frequent destruction in the middle of consecutive read calls
73 * Expiring in the middle is a performance problem not a correctness one.
74 * 1 sec should be enough.
75 */
76#define CGROUP_PIDLIST_DESTROY_DELAY HZ
77
78#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ 61#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
79 MAX_CFTYPE_NAME + 2) 62 MAX_CFTYPE_NAME + 2)
80 63
@@ -88,14 +71,12 @@
88 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 71 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
89 * cgroup.h can use them for lockdep annotations. 72 * cgroup.h can use them for lockdep annotations.
90 */ 73 */
91#ifdef CONFIG_PROVE_RCU
92DEFINE_MUTEX(cgroup_mutex); 74DEFINE_MUTEX(cgroup_mutex);
93DEFINE_SPINLOCK(css_set_lock); 75DEFINE_SPINLOCK(css_set_lock);
76
77#ifdef CONFIG_PROVE_RCU
94EXPORT_SYMBOL_GPL(cgroup_mutex); 78EXPORT_SYMBOL_GPL(cgroup_mutex);
95EXPORT_SYMBOL_GPL(css_set_lock); 79EXPORT_SYMBOL_GPL(css_set_lock);
96#else
97static DEFINE_MUTEX(cgroup_mutex);
98static DEFINE_SPINLOCK(css_set_lock);
99#endif 80#endif
100 81
101/* 82/*
@@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
110 */ 91 */
111static DEFINE_SPINLOCK(cgroup_file_kn_lock); 92static DEFINE_SPINLOCK(cgroup_file_kn_lock);
112 93
113/*
114 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
115 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
116 */
117static DEFINE_SPINLOCK(release_agent_path_lock);
118
119struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 94struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
120 95
121#define cgroup_assert_mutex_or_rcu_locked() \ 96#define cgroup_assert_mutex_or_rcu_locked() \
@@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
131 */ 106 */
132static struct workqueue_struct *cgroup_destroy_wq; 107static struct workqueue_struct *cgroup_destroy_wq;
133 108
134/*
135 * pidlist destructions need to be flushed on cgroup destruction. Use a
136 * separate workqueue as flush domain.
137 */
138static struct workqueue_struct *cgroup_pidlist_destroy_wq;
139
140/* generate an array of cgroup subsystem pointers */ 109/* generate an array of cgroup subsystem pointers */
141#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, 110#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
142static struct cgroup_subsys *cgroup_subsys[] = { 111struct cgroup_subsys *cgroup_subsys[] = {
143#include <linux/cgroup_subsys.h> 112#include <linux/cgroup_subsys.h>
144}; 113};
145#undef SUBSYS 114#undef SUBSYS
@@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
186 */ 155 */
187static bool cgrp_dfl_visible; 156static bool cgrp_dfl_visible;
188 157
189/* Controllers blocked by the commandline in v1 */
190static u16 cgroup_no_v1_mask;
191
192/* some controllers are not supported in the default hierarchy */ 158/* some controllers are not supported in the default hierarchy */
193static u16 cgrp_dfl_inhibit_ss_mask; 159static u16 cgrp_dfl_inhibit_ss_mask;
194 160
195/* some controllers are implicitly enabled on the default hierarchy */ 161/* some controllers are implicitly enabled on the default hierarchy */
196static unsigned long cgrp_dfl_implicit_ss_mask; 162static u16 cgrp_dfl_implicit_ss_mask;
197 163
198/* The list of hierarchy roots */ 164/* The list of hierarchy roots */
199 165LIST_HEAD(cgroup_roots);
200static LIST_HEAD(cgroup_roots);
201static int cgroup_root_count; 166static int cgroup_root_count;
202 167
203/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ 168/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
213static u64 css_serial_nr_next = 1; 178static u64 css_serial_nr_next = 1;
214 179
215/* 180/*
216 * These bitmask flags indicate whether tasks in the fork and exit paths have 181 * These bitmasks identify subsystems with specific features to avoid
217 * fork/exit handlers to call. This avoids us having to do extra work in the 182 * having to do iterative checks repeatedly.
218 * fork/exit path to check which subsystems have fork/exit callbacks.
219 */ 183 */
220static u16 have_fork_callback __read_mostly; 184static u16 have_fork_callback __read_mostly;
221static u16 have_exit_callback __read_mostly; 185static u16 have_exit_callback __read_mostly;
222static u16 have_free_callback __read_mostly; 186static u16 have_free_callback __read_mostly;
187static u16 have_canfork_callback __read_mostly;
223 188
224/* cgroup namespace for init task */ 189/* cgroup namespace for init task */
225struct cgroup_namespace init_cgroup_ns = { 190struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = {
230 .root_cset = &init_css_set, 195 .root_cset = &init_css_set,
231}; 196};
232 197
233/* Ditto for the can_fork callback. */
234static u16 have_canfork_callback __read_mostly;
235
236static struct file_system_type cgroup2_fs_type; 198static struct file_system_type cgroup2_fs_type;
237static struct cftype cgroup_dfl_base_files[]; 199static struct cftype cgroup_base_files[];
238static struct cftype cgroup_legacy_base_files[];
239 200
240static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
241static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
242static int cgroup_apply_control(struct cgroup *cgrp); 201static int cgroup_apply_control(struct cgroup *cgrp);
243static void cgroup_finalize_control(struct cgroup *cgrp, int ret); 202static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
244static void css_task_iter_advance(struct css_task_iter *it); 203static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
259 * is fine for individual subsystems but unsuitable for cgroup core. This 218 * is fine for individual subsystems but unsuitable for cgroup core. This
260 * is slower static_key_enabled() based test indexed by @ssid. 219 * is slower static_key_enabled() based test indexed by @ssid.
261 */ 220 */
262static bool cgroup_ssid_enabled(int ssid) 221bool cgroup_ssid_enabled(int ssid)
263{ 222{
264 if (CGROUP_SUBSYS_COUNT == 0) 223 if (CGROUP_SUBSYS_COUNT == 0)
265 return false; 224 return false;
@@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid)
267 return static_key_enabled(cgroup_subsys_enabled_key[ssid]); 226 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
268} 227}
269 228
270static bool cgroup_ssid_no_v1(int ssid)
271{
272 return cgroup_no_v1_mask & (1 << ssid);
273}
274
275/** 229/**
276 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy 230 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
277 * @cgrp: the cgroup of interest 231 * @cgrp: the cgroup of interest
@@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid)
325 * 279 *
326 * - debug: disallowed on the default hierarchy. 280 * - debug: disallowed on the default hierarchy.
327 */ 281 */
328static bool cgroup_on_dfl(const struct cgroup *cgrp) 282bool cgroup_on_dfl(const struct cgroup *cgrp)
329{ 283{
330 return cgrp->root == &cgrp_dfl_root; 284 return cgrp->root == &cgrp_dfl_root;
331} 285}
@@ -481,12 +435,6 @@ out_unlock:
481 return css; 435 return css;
482} 436}
483 437
484/* convenient tests for these bits */
485static inline bool cgroup_is_dead(const struct cgroup *cgrp)
486{
487 return !(cgrp->self.flags & CSS_ONLINE);
488}
489
490static void cgroup_get(struct cgroup *cgrp) 438static void cgroup_get(struct cgroup *cgrp)
491{ 439{
492 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 440 WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
518} 466}
519EXPORT_SYMBOL_GPL(of_css); 467EXPORT_SYMBOL_GPL(of_css);
520 468
521static int notify_on_release(const struct cgroup *cgrp)
522{
523 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
524}
525
526/** 469/**
527 * for_each_css - iterate all css's of a cgroup 470 * for_each_css - iterate all css's of a cgroup
528 * @css: the iteration cursor 471 * @css: the iteration cursor
@@ -553,15 +496,6 @@ static int notify_on_release(const struct cgroup *cgrp)
553 else 496 else
554 497
555/** 498/**
556 * for_each_subsys - iterate all enabled cgroup subsystems
557 * @ss: the iteration cursor
558 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
559 */
560#define for_each_subsys(ss, ssid) \
561 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
562 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
563
564/**
565 * do_each_subsys_mask - filter for_each_subsys with a bitmask 499 * do_each_subsys_mask - filter for_each_subsys with a bitmask
566 * @ss: the iteration cursor 500 * @ss: the iteration cursor
567 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 501 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp)
585 } \ 519 } \
586} while (false) 520} while (false)
587 521
588/* iterate across the hierarchies */
589#define for_each_root(root) \
590 list_for_each_entry((root), &cgroup_roots, root_list)
591
592/* iterate over child cgrps, lock should be held throughout iteration */ 522/* iterate over child cgrps, lock should be held throughout iteration */
593#define cgroup_for_each_live_child(child, cgrp) \ 523#define cgroup_for_each_live_child(child, cgrp) \
594 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ 524 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp)
615 ; \ 545 ; \
616 else 546 else
617 547
618static void cgroup_release_agent(struct work_struct *work);
619static void check_for_release(struct cgroup *cgrp);
620
621/*
622 * A cgroup can be associated with multiple css_sets as different tasks may
623 * belong to different cgroups on different hierarchies. In the other
624 * direction, a css_set is naturally associated with multiple cgroups.
625 * This M:N relationship is represented by the following link structure
626 * which exists for each association and allows traversing the associations
627 * from both sides.
628 */
629struct cgrp_cset_link {
630 /* the cgroup and css_set this link associates */
631 struct cgroup *cgrp;
632 struct css_set *cset;
633
634 /* list of cgrp_cset_links anchored at cgrp->cset_links */
635 struct list_head cset_link;
636
637 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
638 struct list_head cgrp_link;
639};
640
641/* 548/*
642 * The default css_set - used by init and its children prior to any 549 * The default css_set - used by init and its children prior to any
643 * hierarchies being mounted. It contains a pointer to the root state 550 * hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +554,12 @@ struct cgrp_cset_link {
647 */ 554 */
648struct css_set init_css_set = { 555struct css_set init_css_set = {
649 .refcount = ATOMIC_INIT(1), 556 .refcount = ATOMIC_INIT(1),
650 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
651 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 557 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
652 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 558 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
559 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
560 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
653 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 561 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
654 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 562 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
655 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
656}; 563};
657 564
658static int css_set_count = 1; /* 1 for init_css_set */ 565static int css_set_count = 1; /* 1 for init_css_set */
@@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
699 if (!trigger) 606 if (!trigger)
700 break; 607 break;
701 608
702 check_for_release(cgrp); 609 cgroup1_check_for_release(cgrp);
703 cgroup_file_notify(&cgrp->events_file); 610 cgroup_file_notify(&cgrp->events_file);
704 611
705 cgrp = cgroup_parent(cgrp); 612 cgrp = cgroup_parent(cgrp);
@@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
808 return key; 715 return key;
809} 716}
810 717
811static void put_css_set_locked(struct css_set *cset) 718void put_css_set_locked(struct css_set *cset)
812{ 719{
813 struct cgrp_cset_link *link, *tmp_link; 720 struct cgrp_cset_link *link, *tmp_link;
814 struct cgroup_subsys *ss; 721 struct cgroup_subsys *ss;
@@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset)
838 kfree_rcu(cset, rcu_head); 745 kfree_rcu(cset, rcu_head);
839} 746}
840 747
841static void put_css_set(struct css_set *cset)
842{
843 unsigned long flags;
844
845 /*
846 * Ensure that the refcount doesn't hit zero while any readers
847 * can see it. Similar to atomic_dec_and_lock(), but for an
848 * rwlock
849 */
850 if (atomic_add_unless(&cset->refcount, -1, 1))
851 return;
852
853 spin_lock_irqsave(&css_set_lock, flags);
854 put_css_set_locked(cset);
855 spin_unlock_irqrestore(&css_set_lock, flags);
856}
857
858/*
859 * refcounted get/put for css_set objects
860 */
861static inline void get_css_set(struct css_set *cset)
862{
863 atomic_inc(&cset->refcount);
864}
865
866/** 748/**
867 * compare_css_sets - helper function for find_existing_css_set(). 749 * compare_css_sets - helper function for find_existing_css_set().
868 * @cset: candidate css_set being tested 750 * @cset: candidate css_set being tested
@@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1095 } 977 }
1096 978
1097 atomic_set(&cset->refcount, 1); 979 atomic_set(&cset->refcount, 1);
1098 INIT_LIST_HEAD(&cset->cgrp_links);
1099 INIT_LIST_HEAD(&cset->tasks); 980 INIT_LIST_HEAD(&cset->tasks);
1100 INIT_LIST_HEAD(&cset->mg_tasks); 981 INIT_LIST_HEAD(&cset->mg_tasks);
1101 INIT_LIST_HEAD(&cset->mg_preload_node);
1102 INIT_LIST_HEAD(&cset->mg_node);
1103 INIT_LIST_HEAD(&cset->task_iters); 982 INIT_LIST_HEAD(&cset->task_iters);
1104 INIT_HLIST_NODE(&cset->hlist); 983 INIT_HLIST_NODE(&cset->hlist);
984 INIT_LIST_HEAD(&cset->cgrp_links);
985 INIT_LIST_HEAD(&cset->mg_preload_node);
986 INIT_LIST_HEAD(&cset->mg_node);
1105 987
1106 /* Copy the set of subsystem state objects generated in 988 /* Copy the set of subsystem state objects generated in
1107 * find_existing_css_set() */ 989 * find_existing_css_set() */
@@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1138 return cset; 1020 return cset;
1139} 1021}
1140 1022
1141static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) 1023struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1142{ 1024{
1143 struct cgroup *root_cgrp = kf_root->kn->priv; 1025 struct cgroup *root_cgrp = kf_root->kn->priv;
1144 1026
@@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
1166 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); 1048 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1167} 1049}
1168 1050
1169static void cgroup_free_root(struct cgroup_root *root) 1051void cgroup_free_root(struct cgroup_root *root)
1170{ 1052{
1171 if (root) { 1053 if (root) {
1172 idr_destroy(&root->cgroup_idr); 1054 idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1283 * Return the cgroup for "task" from the given hierarchy. Must be 1165 * Return the cgroup for "task" from the given hierarchy. Must be
1284 * called with cgroup_mutex and css_set_lock held. 1166 * called with cgroup_mutex and css_set_lock held.
1285 */ 1167 */
1286static struct cgroup *task_cgroup_from_root(struct task_struct *task, 1168struct cgroup *task_cgroup_from_root(struct task_struct *task,
1287 struct cgroup_root *root) 1169 struct cgroup_root *root)
1288{ 1170{
1289 /* 1171 /*
1290 * No need to lock the task - since we hold cgroup_mutex the 1172 * No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1321 */ 1203 */
1322 1204
1323static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 1205static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1324static const struct file_operations proc_cgroupstats_operations;
1325 1206
1326static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 1207static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1327 char *buf) 1208 char *buf)
@@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1415 * inaccessible any time. If the caller intends to continue to access the 1296 * inaccessible any time. If the caller intends to continue to access the
1416 * cgroup, it should pin it before invoking this function. 1297 * cgroup, it should pin it before invoking this function.
1417 */ 1298 */
1418static void cgroup_kn_unlock(struct kernfs_node *kn) 1299void cgroup_kn_unlock(struct kernfs_node *kn)
1419{ 1300{
1420 struct cgroup *cgrp; 1301 struct cgroup *cgrp;
1421 1302
@@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
1447 * locking under kernfs active protection and allows all kernfs operations 1328 * locking under kernfs active protection and allows all kernfs operations
1448 * including self-removal. 1329 * including self-removal.
1449 */ 1330 */
1450static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, 1331struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1451 bool drain_offline)
1452{ 1332{
1453 struct cgroup *cgrp; 1333 struct cgroup *cgrp;
1454 1334
@@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
1532 1412
1533 if (!css->ss) { 1413 if (!css->ss) {
1534 if (cgroup_on_dfl(cgrp)) 1414 if (cgroup_on_dfl(cgrp))
1535 cfts = cgroup_dfl_base_files; 1415 cfts = cgroup_base_files;
1536 else 1416 else
1537 cfts = cgroup_legacy_base_files; 1417 cfts = cgroup1_base_files;
1538 1418
1539 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); 1419 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1540 } 1420 }
@@ -1559,7 +1439,7 @@ err:
1559 return ret; 1439 return ret;
1560} 1440}
1561 1441
1562static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) 1442int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1563{ 1443{
1564 struct cgroup *dcgrp = &dst_root->cgrp; 1444 struct cgroup *dcgrp = &dst_root->cgrp;
1565 struct cgroup_subsys *ss; 1445 struct cgroup_subsys *ss;
@@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1629 return 0; 1509 return 0;
1630} 1510}
1631 1511
1632static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, 1512int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1633 struct kernfs_root *kf_root) 1513 struct kernfs_root *kf_root)
1634{ 1514{
1635 int len = 0; 1515 int len = 0;
1636 char *buf = NULL; 1516 char *buf = NULL;
@@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1656 return len; 1536 return len;
1657} 1537}
1658 1538
1659static int cgroup_show_options(struct seq_file *seq,
1660 struct kernfs_root *kf_root)
1661{
1662 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1663 struct cgroup_subsys *ss;
1664 int ssid;
1665
1666 if (root != &cgrp_dfl_root)
1667 for_each_subsys(ss, ssid)
1668 if (root->subsys_mask & (1 << ssid))
1669 seq_show_option(seq, ss->legacy_name, NULL);
1670 if (root->flags & CGRP_ROOT_NOPREFIX)
1671 seq_puts(seq, ",noprefix");
1672 if (root->flags & CGRP_ROOT_XATTR)
1673 seq_puts(seq, ",xattr");
1674
1675 spin_lock(&release_agent_path_lock);
1676 if (strlen(root->release_agent_path))
1677 seq_show_option(seq, "release_agent",
1678 root->release_agent_path);
1679 spin_unlock(&release_agent_path_lock);
1680
1681 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1682 seq_puts(seq, ",clone_children");
1683 if (strlen(root->name))
1684 seq_show_option(seq, "name", root->name);
1685 return 0;
1686}
1687
1688struct cgroup_sb_opts {
1689 u16 subsys_mask;
1690 unsigned int flags;
1691 char *release_agent;
1692 bool cpuset_clone_children;
1693 char *name;
1694 /* User explicitly requested empty subsystem */
1695 bool none;
1696};
1697
1698static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1699{
1700 char *token, *o = data;
1701 bool all_ss = false, one_ss = false;
1702 u16 mask = U16_MAX;
1703 struct cgroup_subsys *ss;
1704 int nr_opts = 0;
1705 int i;
1706
1707#ifdef CONFIG_CPUSETS
1708 mask = ~((u16)1 << cpuset_cgrp_id);
1709#endif
1710
1711 memset(opts, 0, sizeof(*opts));
1712
1713 while ((token = strsep(&o, ",")) != NULL) {
1714 nr_opts++;
1715
1716 if (!*token)
1717 return -EINVAL;
1718 if (!strcmp(token, "none")) {
1719 /* Explicitly have no subsystems */
1720 opts->none = true;
1721 continue;
1722 }
1723 if (!strcmp(token, "all")) {
1724 /* Mutually exclusive option 'all' + subsystem name */
1725 if (one_ss)
1726 return -EINVAL;
1727 all_ss = true;
1728 continue;
1729 }
1730 if (!strcmp(token, "noprefix")) {
1731 opts->flags |= CGRP_ROOT_NOPREFIX;
1732 continue;
1733 }
1734 if (!strcmp(token, "clone_children")) {
1735 opts->cpuset_clone_children = true;
1736 continue;
1737 }
1738 if (!strcmp(token, "xattr")) {
1739 opts->flags |= CGRP_ROOT_XATTR;
1740 continue;
1741 }
1742 if (!strncmp(token, "release_agent=", 14)) {
1743 /* Specifying two release agents is forbidden */
1744 if (opts->release_agent)
1745 return -EINVAL;
1746 opts->release_agent =
1747 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1748 if (!opts->release_agent)
1749 return -ENOMEM;
1750 continue;
1751 }
1752 if (!strncmp(token, "name=", 5)) {
1753 const char *name = token + 5;
1754 /* Can't specify an empty name */
1755 if (!strlen(name))
1756 return -EINVAL;
1757 /* Must match [\w.-]+ */
1758 for (i = 0; i < strlen(name); i++) {
1759 char c = name[i];
1760 if (isalnum(c))
1761 continue;
1762 if ((c == '.') || (c == '-') || (c == '_'))
1763 continue;
1764 return -EINVAL;
1765 }
1766 /* Specifying two names is forbidden */
1767 if (opts->name)
1768 return -EINVAL;
1769 opts->name = kstrndup(name,
1770 MAX_CGROUP_ROOT_NAMELEN - 1,
1771 GFP_KERNEL);
1772 if (!opts->name)
1773 return -ENOMEM;
1774
1775 continue;
1776 }
1777
1778 for_each_subsys(ss, i) {
1779 if (strcmp(token, ss->legacy_name))
1780 continue;
1781 if (!cgroup_ssid_enabled(i))
1782 continue;
1783 if (cgroup_ssid_no_v1(i))
1784 continue;
1785
1786 /* Mutually exclusive option 'all' + subsystem name */
1787 if (all_ss)
1788 return -EINVAL;
1789 opts->subsys_mask |= (1 << i);
1790 one_ss = true;
1791
1792 break;
1793 }
1794 if (i == CGROUP_SUBSYS_COUNT)
1795 return -ENOENT;
1796 }
1797
1798 /*
1799 * If the 'all' option was specified select all the subsystems,
1800 * otherwise if 'none', 'name=' and a subsystem name options were
1801 * not specified, let's default to 'all'
1802 */
1803 if (all_ss || (!one_ss && !opts->none && !opts->name))
1804 for_each_subsys(ss, i)
1805 if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
1806 opts->subsys_mask |= (1 << i);
1807
1808 /*
1809 * We either have to specify by name or by subsystems. (So all
1810 * empty hierarchies must have a name).
1811 */
1812 if (!opts->subsys_mask && !opts->name)
1813 return -EINVAL;
1814
1815 /*
1816 * Option noprefix was introduced just for backward compatibility
1817 * with the old cpuset, so we allow noprefix only if mounting just
1818 * the cpuset subsystem.
1819 */
1820 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1821 return -EINVAL;
1822
1823 /* Can't specify "none" and some subsystems */
1824 if (opts->subsys_mask && opts->none)
1825 return -EINVAL;
1826
1827 return 0;
1828}
1829
1830static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1539static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1831{ 1540{
1832 int ret = 0; 1541 pr_err("remount is not allowed\n");
1833 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1542 return -EINVAL;
1834 struct cgroup_sb_opts opts;
1835 u16 added_mask, removed_mask;
1836
1837 if (root == &cgrp_dfl_root) {
1838 pr_err("remount is not allowed\n");
1839 return -EINVAL;
1840 }
1841
1842 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1843
1844 /* See what subsystems are wanted */
1845 ret = parse_cgroupfs_options(data, &opts);
1846 if (ret)
1847 goto out_unlock;
1848
1849 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1850 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1851 task_tgid_nr(current), current->comm);
1852
1853 added_mask = opts.subsys_mask & ~root->subsys_mask;
1854 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1855
1856 /* Don't allow flags or name to change at remount */
1857 if ((opts.flags ^ root->flags) ||
1858 (opts.name && strcmp(opts.name, root->name))) {
1859 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1860 opts.flags, opts.name ?: "", root->flags, root->name);
1861 ret = -EINVAL;
1862 goto out_unlock;
1863 }
1864
1865 /* remounting is not allowed for populated hierarchies */
1866 if (!list_empty(&root->cgrp.self.children)) {
1867 ret = -EBUSY;
1868 goto out_unlock;
1869 }
1870
1871 ret = rebind_subsystems(root, added_mask);
1872 if (ret)
1873 goto out_unlock;
1874
1875 WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1876
1877 if (opts.release_agent) {
1878 spin_lock(&release_agent_path_lock);
1879 strcpy(root->release_agent_path, opts.release_agent);
1880 spin_unlock(&release_agent_path_lock);
1881 }
1882
1883 trace_cgroup_remount(root);
1884
1885 out_unlock:
1886 kfree(opts.release_agent);
1887 kfree(opts.name);
1888 mutex_unlock(&cgroup_mutex);
1889 return ret;
1890} 1543}
1891 1544
1892/* 1545/*
@@ -1964,11 +1617,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1964 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1617 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1965 1618
1966 init_waitqueue_head(&cgrp->offline_waitq); 1619 init_waitqueue_head(&cgrp->offline_waitq);
1967 INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); 1620 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1968} 1621}
1969 1622
1970static void init_cgroup_root(struct cgroup_root *root, 1623void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1971 struct cgroup_sb_opts *opts)
1972{ 1624{
1973 struct cgroup *cgrp = &root->cgrp; 1625 struct cgroup *cgrp = &root->cgrp;
1974 1626
@@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root,
1987 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1639 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1988} 1640}
1989 1641
1990static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) 1642int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1991{ 1643{
1992 LIST_HEAD(tmp_links); 1644 LIST_HEAD(tmp_links);
1993 struct cgroup *root_cgrp = &root->cgrp; 1645 struct cgroup *root_cgrp = &root->cgrp;
1646 struct kernfs_syscall_ops *kf_sops;
1994 struct css_set *cset; 1647 struct css_set *cset;
1995 int i, ret; 1648 int i, ret;
1996 1649
@@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
2022 if (ret) 1675 if (ret)
2023 goto cancel_ref; 1676 goto cancel_ref;
2024 1677
2025 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1678 kf_sops = root == &cgrp_dfl_root ?
1679 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
1680
1681 root->kf_root = kernfs_create_root(kf_sops,
2026 KERNFS_ROOT_CREATE_DEACTIVATED, 1682 KERNFS_ROOT_CREATE_DEACTIVATED,
2027 root_cgrp); 1683 root_cgrp);
2028 if (IS_ERR(root->kf_root)) { 1684 if (IS_ERR(root->kf_root)) {
@@ -2080,182 +1736,18 @@ out:
2080 return ret; 1736 return ret;
2081} 1737}
2082 1738
2083static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1739struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
2084 int flags, const char *unused_dev_name, 1740 struct cgroup_root *root, unsigned long magic,
2085 void *data) 1741 struct cgroup_namespace *ns)
2086{ 1742{
2087 bool is_v2 = fs_type == &cgroup2_fs_type;
2088 struct super_block *pinned_sb = NULL;
2089 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2090 struct cgroup_subsys *ss;
2091 struct cgroup_root *root;
2092 struct cgroup_sb_opts opts;
2093 struct dentry *dentry; 1743 struct dentry *dentry;
2094 int ret;
2095 int i;
2096 bool new_sb; 1744 bool new_sb;
2097 1745
2098 get_cgroup_ns(ns); 1746 dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
2099
2100 /* Check if the caller has permission to mount. */
2101 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2102 put_cgroup_ns(ns);
2103 return ERR_PTR(-EPERM);
2104 }
2105
2106 /*
2107 * The first time anyone tries to mount a cgroup, enable the list
2108 * linking each css_set to its tasks and fix up all existing tasks.
2109 */
2110 if (!use_task_css_set_links)
2111 cgroup_enable_task_cg_lists();
2112
2113 if (is_v2) {
2114 if (data) {
2115 pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
2116 put_cgroup_ns(ns);
2117 return ERR_PTR(-EINVAL);
2118 }
2119 cgrp_dfl_visible = true;
2120 root = &cgrp_dfl_root;
2121 cgroup_get(&root->cgrp);
2122 goto out_mount;
2123 }
2124
2125 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
2126
2127 /* First find the desired set of subsystems */
2128 ret = parse_cgroupfs_options(data, &opts);
2129 if (ret)
2130 goto out_unlock;
2131
2132 /*
2133 * Destruction of cgroup root is asynchronous, so subsystems may
2134 * still be dying after the previous unmount. Let's drain the
2135 * dying subsystems. We just need to ensure that the ones
2136 * unmounted previously finish dying and don't care about new ones
2137 * starting. Testing ref liveliness is good enough.
2138 */
2139 for_each_subsys(ss, i) {
2140 if (!(opts.subsys_mask & (1 << i)) ||
2141 ss->root == &cgrp_dfl_root)
2142 continue;
2143
2144 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
2145 mutex_unlock(&cgroup_mutex);
2146 msleep(10);
2147 ret = restart_syscall();
2148 goto out_free;
2149 }
2150 cgroup_put(&ss->root->cgrp);
2151 }
2152
2153 for_each_root(root) {
2154 bool name_match = false;
2155
2156 if (root == &cgrp_dfl_root)
2157 continue;
2158
2159 /*
2160 * If we asked for a name then it must match. Also, if
2161 * name matches but sybsys_mask doesn't, we should fail.
2162 * Remember whether name matched.
2163 */
2164 if (opts.name) {
2165 if (strcmp(opts.name, root->name))
2166 continue;
2167 name_match = true;
2168 }
2169
2170 /*
2171 * If we asked for subsystems (or explicitly for no
2172 * subsystems) then they must match.
2173 */
2174 if ((opts.subsys_mask || opts.none) &&
2175 (opts.subsys_mask != root->subsys_mask)) {
2176 if (!name_match)
2177 continue;
2178 ret = -EBUSY;
2179 goto out_unlock;
2180 }
2181
2182 if (root->flags ^ opts.flags)
2183 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
2184
2185 /*
2186 * We want to reuse @root whose lifetime is governed by its
2187 * ->cgrp. Let's check whether @root is alive and keep it
2188 * that way. As cgroup_kill_sb() can happen anytime, we
2189 * want to block it by pinning the sb so that @root doesn't
2190 * get killed before mount is complete.
2191 *
2192 * With the sb pinned, tryget_live can reliably indicate
2193 * whether @root can be reused. If it's being killed,
2194 * drain it. We can use wait_queue for the wait but this
2195 * path is super cold. Let's just sleep a bit and retry.
2196 */
2197 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
2198 if (IS_ERR(pinned_sb) ||
2199 !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
2200 mutex_unlock(&cgroup_mutex);
2201 if (!IS_ERR_OR_NULL(pinned_sb))
2202 deactivate_super(pinned_sb);
2203 msleep(10);
2204 ret = restart_syscall();
2205 goto out_free;
2206 }
2207
2208 ret = 0;
2209 goto out_unlock;
2210 }
2211 1747
2212 /* 1748 /*
2213 * No such thing, create a new one. name= matching without subsys 1749 * In non-init cgroup namespace, instead of root cgroup's dentry,
2214 * specification is allowed for already existing hierarchies but we 1750 * we return the dentry corresponding to the cgroupns->root_cgrp.
2215 * can't create new one without subsys specification.
2216 */
2217 if (!opts.subsys_mask && !opts.none) {
2218 ret = -EINVAL;
2219 goto out_unlock;
2220 }
2221
2222 /* Hierarchies may only be created in the initial cgroup namespace. */
2223 if (ns != &init_cgroup_ns) {
2224 ret = -EPERM;
2225 goto out_unlock;
2226 }
2227
2228 root = kzalloc(sizeof(*root), GFP_KERNEL);
2229 if (!root) {
2230 ret = -ENOMEM;
2231 goto out_unlock;
2232 }
2233
2234 init_cgroup_root(root, &opts);
2235
2236 ret = cgroup_setup_root(root, opts.subsys_mask);
2237 if (ret)
2238 cgroup_free_root(root);
2239
2240out_unlock:
2241 mutex_unlock(&cgroup_mutex);
2242out_free:
2243 kfree(opts.release_agent);
2244 kfree(opts.name);
2245
2246 if (ret) {
2247 put_cgroup_ns(ns);
2248 return ERR_PTR(ret);
2249 }
2250out_mount:
2251 dentry = kernfs_mount(fs_type, flags, root->kf_root,
2252 is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
2253 &new_sb);
2254
2255 /*
2256 * In non-init cgroup namespace, instead of root cgroup's
2257 * dentry, we return the dentry corresponding to the
2258 * cgroupns->root_cgrp.
2259 */ 1751 */
2260 if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { 1752 if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
2261 struct dentry *nsdentry; 1753 struct dentry *nsdentry;
@@ -2277,13 +1769,45 @@ out_mount:
2277 if (IS_ERR(dentry) || !new_sb) 1769 if (IS_ERR(dentry) || !new_sb)
2278 cgroup_put(&root->cgrp); 1770 cgroup_put(&root->cgrp);
2279 1771
1772 return dentry;
1773}
1774
1775static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1776 int flags, const char *unused_dev_name,
1777 void *data)
1778{
1779 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1780 struct dentry *dentry;
1781
1782 get_cgroup_ns(ns);
1783
1784 /* Check if the caller has permission to mount. */
1785 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
1786 put_cgroup_ns(ns);
1787 return ERR_PTR(-EPERM);
1788 }
1789
2280 /* 1790 /*
2281 * If @pinned_sb, we're reusing an existing root and holding an 1791 * The first time anyone tries to mount a cgroup, enable the list
2282 * extra ref on its sb. Mount is complete. Put the extra ref. 1792 * linking each css_set to its tasks and fix up all existing tasks.
2283 */ 1793 */
2284 if (pinned_sb) { 1794 if (!use_task_css_set_links)
2285 WARN_ON(new_sb); 1795 cgroup_enable_task_cg_lists();
2286 deactivate_super(pinned_sb); 1796
1797 if (fs_type == &cgroup2_fs_type) {
1798 if (data) {
1799 pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
1800 put_cgroup_ns(ns);
1801 return ERR_PTR(-EINVAL);
1802 }
1803 cgrp_dfl_visible = true;
1804 cgroup_get(&cgrp_dfl_root.cgrp);
1805
1806 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1807 CGROUP2_SUPER_MAGIC, ns);
1808 } else {
1809 dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
1810 CGROUP_SUPER_MAGIC, ns);
2287 } 1811 }
2288 1812
2289 put_cgroup_ns(ns); 1813 put_cgroup_ns(ns);
@@ -2311,7 +1835,7 @@ static void cgroup_kill_sb(struct super_block *sb)
2311 kernfs_kill_sb(sb); 1835 kernfs_kill_sb(sb);
2312} 1836}
2313 1837
2314static struct file_system_type cgroup_fs_type = { 1838struct file_system_type cgroup_fs_type = {
2315 .name = "cgroup", 1839 .name = "cgroup",
2316 .mount = cgroup_mount, 1840 .mount = cgroup_mount,
2317 .kill_sb = cgroup_kill_sb, 1841 .kill_sb = cgroup_kill_sb,
@@ -2325,8 +1849,8 @@ static struct file_system_type cgroup2_fs_type = {
2325 .fs_flags = FS_USERNS_MOUNT, 1849 .fs_flags = FS_USERNS_MOUNT,
2326}; 1850};
2327 1851
2328static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, 1852int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2329 struct cgroup_namespace *ns) 1853 struct cgroup_namespace *ns)
2330{ 1854{
2331 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); 1855 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2332 1856
@@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2389} 1913}
2390EXPORT_SYMBOL_GPL(task_cgroup_path); 1914EXPORT_SYMBOL_GPL(task_cgroup_path);
2391 1915
2392/* used to track tasks and other necessary states during migration */
2393struct cgroup_taskset {
2394 /* the src and dst cset list running through cset->mg_node */
2395 struct list_head src_csets;
2396 struct list_head dst_csets;
2397
2398 /* the subsys currently being processed */
2399 int ssid;
2400
2401 /*
2402 * Fields for cgroup_taskset_*() iteration.
2403 *
2404 * Before migration is committed, the target migration tasks are on
2405 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
2406 * the csets on ->dst_csets. ->csets point to either ->src_csets
2407 * or ->dst_csets depending on whether migration is committed.
2408 *
2409 * ->cur_csets and ->cur_task point to the current task position
2410 * during iteration.
2411 */
2412 struct list_head *csets;
2413 struct css_set *cur_cset;
2414 struct task_struct *cur_task;
2415};
2416
2417#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
2418 .src_csets = LIST_HEAD_INIT(tset.src_csets), \
2419 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
2420 .csets = &tset.src_csets, \
2421}
2422
2423/** 1916/**
2424 * cgroup_taskset_add - try to add a migration target task to a taskset 1917 * cgroup_migrate_add_task - add a migration target task to a migration context
2425 * @task: target task 1918 * @task: target task
2426 * @tset: target taskset 1919 * @mgctx: target migration context
2427 * 1920 *
2428 * Add @task, which is a migration target, to @tset. This function becomes 1921 * Add @task, which is a migration target, to @mgctx->tset. This function
2429 * noop if @task doesn't need to be migrated. @task's css_set should have 1922 * becomes noop if @task doesn't need to be migrated. @task's css_set
2430 * been added as a migration source and @task->cg_list will be moved from 1923 * should have been added as a migration source and @task->cg_list will be
2431 * the css_set's tasks list to mg_tasks one. 1924 * moved from the css_set's tasks list to mg_tasks one.
2432 */ 1925 */
2433static void cgroup_taskset_add(struct task_struct *task, 1926static void cgroup_migrate_add_task(struct task_struct *task,
2434 struct cgroup_taskset *tset) 1927 struct cgroup_mgctx *mgctx)
2435{ 1928{
2436 struct css_set *cset; 1929 struct css_set *cset;
2437 1930
@@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task,
2451 1944
2452 list_move_tail(&task->cg_list, &cset->mg_tasks); 1945 list_move_tail(&task->cg_list, &cset->mg_tasks);
2453 if (list_empty(&cset->mg_node)) 1946 if (list_empty(&cset->mg_node))
2454 list_add_tail(&cset->mg_node, &tset->src_csets); 1947 list_add_tail(&cset->mg_node,
1948 &mgctx->tset.src_csets);
2455 if (list_empty(&cset->mg_dst_cset->mg_node)) 1949 if (list_empty(&cset->mg_dst_cset->mg_node))
2456 list_move_tail(&cset->mg_dst_cset->mg_node, 1950 list_add_tail(&cset->mg_dst_cset->mg_node,
2457 &tset->dst_csets); 1951 &mgctx->tset.dst_csets);
2458} 1952}
2459 1953
2460/** 1954/**
@@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2521 2015
2522/** 2016/**
2523 * cgroup_taskset_migrate - migrate a taskset 2017 * cgroup_taskset_migrate - migrate a taskset
2524 * @tset: taget taskset 2018 * @mgctx: migration context
2525 * @root: cgroup root the migration is taking place on
2526 * 2019 *
2527 * Migrate tasks in @tset as setup by migration preparation functions. 2020 * Migrate tasks in @mgctx as setup by migration preparation functions.
2528 * This function fails iff one of the ->can_attach callbacks fails and 2021 * This function fails iff one of the ->can_attach callbacks fails and
2529 * guarantees that either all or none of the tasks in @tset are migrated. 2022 * guarantees that either all or none of the tasks in @mgctx are migrated.
2530 * @tset is consumed regardless of success. 2023 * @mgctx is consumed regardless of success.
2531 */ 2024 */
2532static int cgroup_taskset_migrate(struct cgroup_taskset *tset, 2025static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2533 struct cgroup_root *root)
2534{ 2026{
2027 struct cgroup_taskset *tset = &mgctx->tset;
2535 struct cgroup_subsys *ss; 2028 struct cgroup_subsys *ss;
2536 struct task_struct *task, *tmp_task; 2029 struct task_struct *task, *tmp_task;
2537 struct css_set *cset, *tmp_cset; 2030 struct css_set *cset, *tmp_cset;
@@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2542 return 0; 2035 return 0;
2543 2036
2544 /* check that we can legitimately attach to the cgroup */ 2037 /* check that we can legitimately attach to the cgroup */
2545 do_each_subsys_mask(ss, ssid, root->subsys_mask) { 2038 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2546 if (ss->can_attach) { 2039 if (ss->can_attach) {
2547 tset->ssid = ssid; 2040 tset->ssid = ssid;
2548 ret = ss->can_attach(tset); 2041 ret = ss->can_attach(tset);
@@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2578 */ 2071 */
2579 tset->csets = &tset->dst_csets; 2072 tset->csets = &tset->dst_csets;
2580 2073
2581 do_each_subsys_mask(ss, ssid, root->subsys_mask) { 2074 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2582 if (ss->attach) { 2075 if (ss->attach) {
2583 tset->ssid = ssid; 2076 tset->ssid = ssid;
2584 ss->attach(tset); 2077 ss->attach(tset);
@@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2589 goto out_release_tset; 2082 goto out_release_tset;
2590 2083
2591out_cancel_attach: 2084out_cancel_attach:
2592 do_each_subsys_mask(ss, ssid, root->subsys_mask) { 2085 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2593 if (ssid == failed_ssid) 2086 if (ssid == failed_ssid)
2594 break; 2087 break;
2595 if (ss->cancel_attach) { 2088 if (ss->cancel_attach) {
@@ -2616,7 +2109,7 @@ out_release_tset:
2616 * zero for migration destination cgroups with tasks so that child cgroups 2109 * zero for migration destination cgroups with tasks so that child cgroups
2617 * don't compete against tasks. 2110 * don't compete against tasks.
2618 */ 2111 */
2619static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) 2112bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
2620{ 2113{
2621 return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || 2114 return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
2622 !dst_cgrp->subtree_control; 2115 !dst_cgrp->subtree_control;
@@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
2624 2117
2625/** 2118/**
2626 * cgroup_migrate_finish - cleanup after attach 2119 * cgroup_migrate_finish - cleanup after attach
2627 * @preloaded_csets: list of preloaded css_sets 2120 * @mgctx: migration context
2628 * 2121 *
2629 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See 2122 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
2630 * those functions for details. 2123 * those functions for details.
2631 */ 2124 */
2632static void cgroup_migrate_finish(struct list_head *preloaded_csets) 2125void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2633{ 2126{
2127 LIST_HEAD(preloaded);
2634 struct css_set *cset, *tmp_cset; 2128 struct css_set *cset, *tmp_cset;
2635 2129
2636 lockdep_assert_held(&cgroup_mutex); 2130 lockdep_assert_held(&cgroup_mutex);
2637 2131
2638 spin_lock_irq(&css_set_lock); 2132 spin_lock_irq(&css_set_lock);
2639 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 2133
2134 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2135 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2136
2137 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2640 cset->mg_src_cgrp = NULL; 2138 cset->mg_src_cgrp = NULL;
2641 cset->mg_dst_cgrp = NULL; 2139 cset->mg_dst_cgrp = NULL;
2642 cset->mg_dst_cset = NULL; 2140 cset->mg_dst_cset = NULL;
2643 list_del_init(&cset->mg_preload_node); 2141 list_del_init(&cset->mg_preload_node);
2644 put_css_set_locked(cset); 2142 put_css_set_locked(cset);
2645 } 2143 }
2144
2646 spin_unlock_irq(&css_set_lock); 2145 spin_unlock_irq(&css_set_lock);
2647} 2146}
2648 2147
@@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2650 * cgroup_migrate_add_src - add a migration source css_set 2149 * cgroup_migrate_add_src - add a migration source css_set
2651 * @src_cset: the source css_set to add 2150 * @src_cset: the source css_set to add
2652 * @dst_cgrp: the destination cgroup 2151 * @dst_cgrp: the destination cgroup
2653 * @preloaded_csets: list of preloaded css_sets 2152 * @mgctx: migration context
2654 * 2153 *
2655 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin 2154 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
2656 * @src_cset and add it to @preloaded_csets, which should later be cleaned 2155 * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
2657 * up by cgroup_migrate_finish(). 2156 * up by cgroup_migrate_finish().
2658 * 2157 *
2659 * This function may be called without holding cgroup_threadgroup_rwsem 2158 * This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2662 * into play and the preloaded css_sets are guaranteed to cover all 2161 * into play and the preloaded css_sets are guaranteed to cover all
2663 * migrations. 2162 * migrations.
2664 */ 2163 */
2665static void cgroup_migrate_add_src(struct css_set *src_cset, 2164void cgroup_migrate_add_src(struct css_set *src_cset,
2666 struct cgroup *dst_cgrp, 2165 struct cgroup *dst_cgrp,
2667 struct list_head *preloaded_csets) 2166 struct cgroup_mgctx *mgctx)
2668{ 2167{
2669 struct cgroup *src_cgrp; 2168 struct cgroup *src_cgrp;
2670 2169
@@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
2692 src_cset->mg_src_cgrp = src_cgrp; 2191 src_cset->mg_src_cgrp = src_cgrp;
2693 src_cset->mg_dst_cgrp = dst_cgrp; 2192 src_cset->mg_dst_cgrp = dst_cgrp;
2694 get_css_set(src_cset); 2193 get_css_set(src_cset);
2695 list_add(&src_cset->mg_preload_node, preloaded_csets); 2194 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2696} 2195}
2697 2196
2698/** 2197/**
2699 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2198 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2700 * @preloaded_csets: list of preloaded source css_sets 2199 * @mgctx: migration context
2701 * 2200 *
2702 * Tasks are about to be moved and all the source css_sets have been 2201 * Tasks are about to be moved and all the source css_sets have been
2703 * preloaded to @preloaded_csets. This function looks up and pins all 2202 * preloaded to @mgctx->preloaded_src_csets. This function looks up and
2704 * destination css_sets, links each to its source, and append them to 2203 * pins all destination css_sets, links each to its source, and append them
2705 * @preloaded_csets. 2204 * to @mgctx->preloaded_dst_csets.
2706 * 2205 *
2707 * This function must be called after cgroup_migrate_add_src() has been 2206 * This function must be called after cgroup_migrate_add_src() has been
2708 * called on each migration source css_set. After migration is performed 2207 * called on each migration source css_set. After migration is performed
2709 * using cgroup_migrate(), cgroup_migrate_finish() must be called on 2208 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2710 * @preloaded_csets. 2209 * @mgctx.
2711 */ 2210 */
2712static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) 2211int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2713{ 2212{
2714 LIST_HEAD(csets);
2715 struct css_set *src_cset, *tmp_cset; 2213 struct css_set *src_cset, *tmp_cset;
2716 2214
2717 lockdep_assert_held(&cgroup_mutex); 2215 lockdep_assert_held(&cgroup_mutex);
2718 2216
2719 /* look up the dst cset for each src cset and link it to src */ 2217 /* look up the dst cset for each src cset and link it to src */
2720 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { 2218 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2219 mg_preload_node) {
2721 struct css_set *dst_cset; 2220 struct css_set *dst_cset;
2221 struct cgroup_subsys *ss;
2222 int ssid;
2722 2223
2723 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); 2224 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2724 if (!dst_cset) 2225 if (!dst_cset)
@@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
2743 src_cset->mg_dst_cset = dst_cset; 2244 src_cset->mg_dst_cset = dst_cset;
2744 2245
2745 if (list_empty(&dst_cset->mg_preload_node)) 2246 if (list_empty(&dst_cset->mg_preload_node))
2746 list_add(&dst_cset->mg_preload_node, &csets); 2247 list_add_tail(&dst_cset->mg_preload_node,
2248 &mgctx->preloaded_dst_csets);
2747 else 2249 else
2748 put_css_set(dst_cset); 2250 put_css_set(dst_cset);
2251
2252 for_each_subsys(ss, ssid)
2253 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2254 mgctx->ss_mask |= 1 << ssid;
2749 } 2255 }
2750 2256
2751 list_splice_tail(&csets, preloaded_csets);
2752 return 0; 2257 return 0;
2753err: 2258err:
2754 cgroup_migrate_finish(&csets); 2259 cgroup_migrate_finish(mgctx);
2755 return -ENOMEM; 2260 return -ENOMEM;
2756} 2261}
2757 2262
@@ -2759,7 +2264,7 @@ err:
2759 * cgroup_migrate - migrate a process or task to a cgroup 2264 * cgroup_migrate - migrate a process or task to a cgroup
2760 * @leader: the leader of the process or the task to migrate 2265 * @leader: the leader of the process or the task to migrate
2761 * @threadgroup: whether @leader points to the whole process or a single task 2266 * @threadgroup: whether @leader points to the whole process or a single task
2762 * @root: cgroup root migration is taking place on 2267 * @mgctx: migration context
2763 * 2268 *
2764 * Migrate a process or task denoted by @leader. If migrating a process, 2269 * Migrate a process or task denoted by @leader. If migrating a process,
2765 * the caller must be holding cgroup_threadgroup_rwsem. The caller is also 2270 * the caller must be holding cgroup_threadgroup_rwsem. The caller is also
@@ -2773,10 +2278,9 @@ err:
2773 * decided for all targets by invoking group_migrate_prepare_dst() before 2278 * decided for all targets by invoking group_migrate_prepare_dst() before
2774 * actually starting migrating. 2279 * actually starting migrating.
2775 */ 2280 */
2776static int cgroup_migrate(struct task_struct *leader, bool threadgroup, 2281int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2777 struct cgroup_root *root) 2282 struct cgroup_mgctx *mgctx)
2778{ 2283{
2779 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2780 struct task_struct *task; 2284 struct task_struct *task;
2781 2285
2782 /* 2286 /*
@@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2788 rcu_read_lock(); 2292 rcu_read_lock();
2789 task = leader; 2293 task = leader;
2790 do { 2294 do {
2791 cgroup_taskset_add(task, &tset); 2295 cgroup_migrate_add_task(task, mgctx);
2792 if (!threadgroup) 2296 if (!threadgroup)
2793 break; 2297 break;
2794 } while_each_thread(leader, task); 2298 } while_each_thread(leader, task);
2795 rcu_read_unlock(); 2299 rcu_read_unlock();
2796 spin_unlock_irq(&css_set_lock); 2300 spin_unlock_irq(&css_set_lock);
2797 2301
2798 return cgroup_taskset_migrate(&tset, root); 2302 return cgroup_migrate_execute(mgctx);
2799} 2303}
2800 2304
2801/** 2305/**
@@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2806 * 2310 *
2807 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. 2311 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2808 */ 2312 */
2809static int cgroup_attach_task(struct cgroup *dst_cgrp, 2313int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2810 struct task_struct *leader, bool threadgroup) 2314 bool threadgroup)
2811{ 2315{
2812 LIST_HEAD(preloaded_csets); 2316 DEFINE_CGROUP_MGCTX(mgctx);
2813 struct task_struct *task; 2317 struct task_struct *task;
2814 int ret; 2318 int ret;
2815 2319
@@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2821 rcu_read_lock(); 2325 rcu_read_lock();
2822 task = leader; 2326 task = leader;
2823 do { 2327 do {
2824 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, 2328 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2825 &preloaded_csets);
2826 if (!threadgroup) 2329 if (!threadgroup)
2827 break; 2330 break;
2828 } while_each_thread(leader, task); 2331 } while_each_thread(leader, task);
@@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2830 spin_unlock_irq(&css_set_lock); 2333 spin_unlock_irq(&css_set_lock);
2831 2334
2832 /* prepare dst csets and commit */ 2335 /* prepare dst csets and commit */
2833 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 2336 ret = cgroup_migrate_prepare_dst(&mgctx);
2834 if (!ret) 2337 if (!ret)
2835 ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); 2338 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2836 2339
2837 cgroup_migrate_finish(&preloaded_csets); 2340 cgroup_migrate_finish(&mgctx);
2838 2341
2839 if (!ret) 2342 if (!ret)
2840 trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); 2343 trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2846 struct cgroup *dst_cgrp, 2349 struct cgroup *dst_cgrp,
2847 struct kernfs_open_file *of) 2350 struct kernfs_open_file *of)
2848{ 2351{
2849 const struct cred *cred = current_cred();
2850 const struct cred *tcred = get_task_cred(task);
2851 int ret = 0; 2352 int ret = 0;
2852 2353
2853 /* 2354 if (cgroup_on_dfl(dst_cgrp)) {
2854 * even if we're attaching all tasks in the thread group, we only
2855 * need to check permissions on one of them.
2856 */
2857 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2858 !uid_eq(cred->euid, tcred->uid) &&
2859 !uid_eq(cred->euid, tcred->suid))
2860 ret = -EACCES;
2861
2862 if (!ret && cgroup_on_dfl(dst_cgrp)) {
2863 struct super_block *sb = of->file->f_path.dentry->d_sb; 2355 struct super_block *sb = of->file->f_path.dentry->d_sb;
2864 struct cgroup *cgrp; 2356 struct cgroup *cgrp;
2865 struct inode *inode; 2357 struct inode *inode;
@@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2877 ret = inode_permission(inode, MAY_WRITE); 2369 ret = inode_permission(inode, MAY_WRITE);
2878 iput(inode); 2370 iput(inode);
2879 } 2371 }
2372 } else {
2373 const struct cred *cred = current_cred();
2374 const struct cred *tcred = get_task_cred(task);
2375
2376 /*
2377 * even if we're attaching all tasks in the thread group,
2378 * we only need to check permissions on one of them.
2379 */
2380 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2381 !uid_eq(cred->euid, tcred->uid) &&
2382 !uid_eq(cred->euid, tcred->suid))
2383 ret = -EACCES;
2384 put_cred(tcred);
2880 } 2385 }
2881 2386
2882 put_cred(tcred);
2883 return ret; 2387 return ret;
2884} 2388}
2885 2389
@@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2888 * function to attach either it or all tasks in its threadgroup. Will lock 2392 * function to attach either it or all tasks in its threadgroup. Will lock
2889 * cgroup_mutex and threadgroup. 2393 * cgroup_mutex and threadgroup.
2890 */ 2394 */
2891static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, 2395ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2892 size_t nbytes, loff_t off, bool threadgroup) 2396 size_t nbytes, loff_t off, bool threadgroup)
2893{ 2397{
2894 struct task_struct *tsk; 2398 struct task_struct *tsk;
2895 struct cgroup_subsys *ss; 2399 struct cgroup_subsys *ss;
@@ -2950,86 +2454,12 @@ out_unlock_threadgroup:
2950 return ret ?: nbytes; 2454 return ret ?: nbytes;
2951} 2455}
2952 2456
2953/** 2457ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
2954 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 2458 loff_t off)
2955 * @from: attach to all cgroups of a given task
2956 * @tsk: the task to be attached
2957 */
2958int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2959{
2960 struct cgroup_root *root;
2961 int retval = 0;
2962
2963 mutex_lock(&cgroup_mutex);
2964 percpu_down_write(&cgroup_threadgroup_rwsem);
2965 for_each_root(root) {
2966 struct cgroup *from_cgrp;
2967
2968 if (root == &cgrp_dfl_root)
2969 continue;
2970
2971 spin_lock_irq(&css_set_lock);
2972 from_cgrp = task_cgroup_from_root(from, root);
2973 spin_unlock_irq(&css_set_lock);
2974
2975 retval = cgroup_attach_task(from_cgrp, tsk, false);
2976 if (retval)
2977 break;
2978 }
2979 percpu_up_write(&cgroup_threadgroup_rwsem);
2980 mutex_unlock(&cgroup_mutex);
2981
2982 return retval;
2983}
2984EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2985
2986static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2987 char *buf, size_t nbytes, loff_t off)
2988{
2989 return __cgroup_procs_write(of, buf, nbytes, off, false);
2990}
2991
2992static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2993 char *buf, size_t nbytes, loff_t off)
2994{ 2459{
2995 return __cgroup_procs_write(of, buf, nbytes, off, true); 2460 return __cgroup_procs_write(of, buf, nbytes, off, true);
2996} 2461}
2997 2462
2998static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2999 char *buf, size_t nbytes, loff_t off)
3000{
3001 struct cgroup *cgrp;
3002
3003 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
3004
3005 cgrp = cgroup_kn_lock_live(of->kn, false);
3006 if (!cgrp)
3007 return -ENODEV;
3008 spin_lock(&release_agent_path_lock);
3009 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
3010 sizeof(cgrp->root->release_agent_path));
3011 spin_unlock(&release_agent_path_lock);
3012 cgroup_kn_unlock(of->kn);
3013 return nbytes;
3014}
3015
3016static int cgroup_release_agent_show(struct seq_file *seq, void *v)
3017{
3018 struct cgroup *cgrp = seq_css(seq)->cgroup;
3019
3020 spin_lock(&release_agent_path_lock);
3021 seq_puts(seq, cgrp->root->release_agent_path);
3022 spin_unlock(&release_agent_path_lock);
3023 seq_putc(seq, '\n');
3024 return 0;
3025}
3026
3027static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
3028{
3029 seq_puts(seq, "0\n");
3030 return 0;
3031}
3032
3033static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) 2463static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
3034{ 2464{
3035 struct cgroup_subsys *ss; 2465 struct cgroup_subsys *ss;
@@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
3075 */ 2505 */
3076static int cgroup_update_dfl_csses(struct cgroup *cgrp) 2506static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3077{ 2507{
3078 LIST_HEAD(preloaded_csets); 2508 DEFINE_CGROUP_MGCTX(mgctx);
3079 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
3080 struct cgroup_subsys_state *d_css; 2509 struct cgroup_subsys_state *d_css;
3081 struct cgroup *dsct; 2510 struct cgroup *dsct;
3082 struct css_set *src_cset; 2511 struct css_set *src_cset;
@@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3092 struct cgrp_cset_link *link; 2521 struct cgrp_cset_link *link;
3093 2522
3094 list_for_each_entry(link, &dsct->cset_links, cset_link) 2523 list_for_each_entry(link, &dsct->cset_links, cset_link)
3095 cgroup_migrate_add_src(link->cset, dsct, 2524 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
3096 &preloaded_csets);
3097 } 2525 }
3098 spin_unlock_irq(&css_set_lock); 2526 spin_unlock_irq(&css_set_lock);
3099 2527
3100 /* NULL dst indicates self on default hierarchy */ 2528 /* NULL dst indicates self on default hierarchy */
3101 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 2529 ret = cgroup_migrate_prepare_dst(&mgctx);
3102 if (ret) 2530 if (ret)
3103 goto out_finish; 2531 goto out_finish;
3104 2532
3105 spin_lock_irq(&css_set_lock); 2533 spin_lock_irq(&css_set_lock);
3106 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { 2534 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
3107 struct task_struct *task, *ntask; 2535 struct task_struct *task, *ntask;
3108 2536
3109 /* src_csets precede dst_csets, break on the first dst_cset */
3110 if (!src_cset->mg_src_cgrp)
3111 break;
3112
3113 /* all tasks in src_csets need to be migrated */ 2537 /* all tasks in src_csets need to be migrated */
3114 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) 2538 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3115 cgroup_taskset_add(task, &tset); 2539 cgroup_migrate_add_task(task, &mgctx);
3116 } 2540 }
3117 spin_unlock_irq(&css_set_lock); 2541 spin_unlock_irq(&css_set_lock);
3118 2542
3119 ret = cgroup_taskset_migrate(&tset, cgrp->root); 2543 ret = cgroup_migrate_execute(&mgctx);
3120out_finish: 2544out_finish:
3121 cgroup_migrate_finish(&preloaded_csets); 2545 cgroup_migrate_finish(&mgctx);
3122 percpu_up_write(&cgroup_threadgroup_rwsem); 2546 percpu_up_write(&cgroup_threadgroup_rwsem);
3123 return ret; 2547 return ret;
3124} 2548}
@@ -3131,7 +2555,7 @@ out_finish:
3131 * controller while the previous css is still around. This function grabs 2555 * controller while the previous css is still around. This function grabs
3132 * cgroup_mutex and drains the previous css instances of @cgrp's subtree. 2556 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
3133 */ 2557 */
3134static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) 2558void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
3135 __acquires(&cgroup_mutex) 2559 __acquires(&cgroup_mutex)
3136{ 2560{
3137 struct cgroup *dsct; 2561 struct cgroup *dsct;
@@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
3503 return 0; 2927 return 0;
3504} 2928}
3505 2929
2930static int cgroup_file_open(struct kernfs_open_file *of)
2931{
2932 struct cftype *cft = of->kn->priv;
2933
2934 if (cft->open)
2935 return cft->open(of);
2936 return 0;
2937}
2938
2939static void cgroup_file_release(struct kernfs_open_file *of)
2940{
2941 struct cftype *cft = of->kn->priv;
2942
2943 if (cft->release)
2944 cft->release(of);
2945}
2946
3506static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2947static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3507 size_t nbytes, loff_t off) 2948 size_t nbytes, loff_t off)
3508{ 2949{
@@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3553 2994
3554static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2995static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3555{ 2996{
3556 seq_cft(seq)->seq_stop(seq, v); 2997 if (seq_cft(seq)->seq_stop)
2998 seq_cft(seq)->seq_stop(seq, v);
3557} 2999}
3558 3000
3559static int cgroup_seqfile_show(struct seq_file *m, void *arg) 3001static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3575 3017
3576static struct kernfs_ops cgroup_kf_single_ops = { 3018static struct kernfs_ops cgroup_kf_single_ops = {
3577 .atomic_write_len = PAGE_SIZE, 3019 .atomic_write_len = PAGE_SIZE,
3020 .open = cgroup_file_open,
3021 .release = cgroup_file_release,
3578 .write = cgroup_file_write, 3022 .write = cgroup_file_write,
3579 .seq_show = cgroup_seqfile_show, 3023 .seq_show = cgroup_seqfile_show,
3580}; 3024};
3581 3025
3582static struct kernfs_ops cgroup_kf_ops = { 3026static struct kernfs_ops cgroup_kf_ops = {
3583 .atomic_write_len = PAGE_SIZE, 3027 .atomic_write_len = PAGE_SIZE,
3028 .open = cgroup_file_open,
3029 .release = cgroup_file_release,
3584 .write = cgroup_file_write, 3030 .write = cgroup_file_write,
3585 .seq_start = cgroup_seqfile_start, 3031 .seq_start = cgroup_seqfile_start,
3586 .seq_next = cgroup_seqfile_next, 3032 .seq_next = cgroup_seqfile_next,
@@ -3588,48 +3034,6 @@ static struct kernfs_ops cgroup_kf_ops = {
3588 .seq_show = cgroup_seqfile_show, 3034 .seq_show = cgroup_seqfile_show,
3589}; 3035};
3590 3036
3591/*
3592 * cgroup_rename - Only allow simple rename of directories in place.
3593 */
3594static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
3595 const char *new_name_str)
3596{
3597 struct cgroup *cgrp = kn->priv;
3598 int ret;
3599
3600 if (kernfs_type(kn) != KERNFS_DIR)
3601 return -ENOTDIR;
3602 if (kn->parent != new_parent)
3603 return -EIO;
3604
3605 /*
3606 * This isn't a proper migration and its usefulness is very
3607 * limited. Disallow on the default hierarchy.
3608 */
3609 if (cgroup_on_dfl(cgrp))
3610 return -EPERM;
3611
3612 /*
3613 * We're gonna grab cgroup_mutex which nests outside kernfs
3614 * active_ref. kernfs_rename() doesn't require active_ref
3615 * protection. Break them before grabbing cgroup_mutex.
3616 */
3617 kernfs_break_active_protection(new_parent);
3618 kernfs_break_active_protection(kn);
3619
3620 mutex_lock(&cgroup_mutex);
3621
3622 ret = kernfs_rename(kn, new_parent, new_name_str);
3623 if (!ret)
3624 trace_cgroup_rename(cgrp);
3625
3626 mutex_unlock(&cgroup_mutex);
3627
3628 kernfs_unbreak_active_protection(kn);
3629 kernfs_unbreak_active_protection(new_parent);
3630 return ret;
3631}
3632
3633/* set uid and gid of cgroup dirs and files to that of the creator */ 3037/* set uid and gid of cgroup dirs and files to that of the creator */
3634static int cgroup_kn_set_ugid(struct kernfs_node *kn) 3038static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3635{ 3039{
@@ -3926,26 +3330,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
3926} 3330}
3927 3331
3928/** 3332/**
3929 * cgroup_task_count - count the number of tasks in a cgroup.
3930 * @cgrp: the cgroup in question
3931 *
3932 * Return the number of tasks in the cgroup. The returned number can be
3933 * higher than the actual number of tasks due to css_set references from
3934 * namespace roots and temporary usages.
3935 */
3936static int cgroup_task_count(const struct cgroup *cgrp)
3937{
3938 int count = 0;
3939 struct cgrp_cset_link *link;
3940
3941 spin_lock_irq(&css_set_lock);
3942 list_for_each_entry(link, &cgrp->cset_links, cset_link)
3943 count += atomic_read(&link->cset->refcount);
3944 spin_unlock_irq(&css_set_lock);
3945 return count;
3946}
3947
3948/**
3949 * css_next_child - find the next child of a given css 3333 * css_next_child - find the next child of a given css
3950 * @pos: the current position (%NULL to initiate traversal) 3334 * @pos: the current position (%NULL to initiate traversal)
3951 * @parent: css whose children to walk 3335 * @parent: css whose children to walk
@@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it)
4343 put_task_struct(it->cur_task); 3727 put_task_struct(it->cur_task);
4344} 3728}
4345 3729
4346/** 3730static void cgroup_procs_release(struct kernfs_open_file *of)
4347 * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
4348 * @to: cgroup to which the tasks will be moved
4349 * @from: cgroup in which the tasks currently reside
4350 *
4351 * Locking rules between cgroup_post_fork() and the migration path
4352 * guarantee that, if a task is forking while being migrated, the new child
4353 * is guaranteed to be either visible in the source cgroup after the
4354 * parent's migration is complete or put into the target cgroup. No task
4355 * can slip out of migration through forking.
4356 */
4357int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4358{
4359 LIST_HEAD(preloaded_csets);
4360 struct cgrp_cset_link *link;
4361 struct css_task_iter it;
4362 struct task_struct *task;
4363 int ret;
4364
4365 if (!cgroup_may_migrate_to(to))
4366 return -EBUSY;
4367
4368 mutex_lock(&cgroup_mutex);
4369
4370 percpu_down_write(&cgroup_threadgroup_rwsem);
4371
4372 /* all tasks in @from are being moved, all csets are source */
4373 spin_lock_irq(&css_set_lock);
4374 list_for_each_entry(link, &from->cset_links, cset_link)
4375 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
4376 spin_unlock_irq(&css_set_lock);
4377
4378 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
4379 if (ret)
4380 goto out_err;
4381
4382 /*
4383 * Migrate tasks one-by-one until @from is empty. This fails iff
4384 * ->can_attach() fails.
4385 */
4386 do {
4387 css_task_iter_start(&from->self, &it);
4388 task = css_task_iter_next(&it);
4389 if (task)
4390 get_task_struct(task);
4391 css_task_iter_end(&it);
4392
4393 if (task) {
4394 ret = cgroup_migrate(task, false, to->root);
4395 if (!ret)
4396 trace_cgroup_transfer_tasks(to, task, false);
4397 put_task_struct(task);
4398 }
4399 } while (task && !ret);
4400out_err:
4401 cgroup_migrate_finish(&preloaded_csets);
4402 percpu_up_write(&cgroup_threadgroup_rwsem);
4403 mutex_unlock(&cgroup_mutex);
4404 return ret;
4405}
4406
4407/*
4408 * Stuff for reading the 'tasks'/'procs' files.
4409 *
4410 * Reading this file can return large amounts of data if a cgroup has
4411 * *lots* of attached tasks. So it may need several calls to read(),
4412 * but we cannot guarantee that the information we produce is correct
4413 * unless we produce it entirely atomically.
4414 *
4415 */
4416
4417/* which pidlist file are we talking about? */
4418enum cgroup_filetype {
4419 CGROUP_FILE_PROCS,
4420 CGROUP_FILE_TASKS,
4421};
4422
4423/*
4424 * A pidlist is a list of pids that virtually represents the contents of one
4425 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
4426 * a pair (one each for procs, tasks) for each pid namespace that's relevant
4427 * to the cgroup.
4428 */
4429struct cgroup_pidlist {
4430 /*
4431 * used to find which pidlist is wanted. doesn't change as long as
4432 * this particular list stays in the list.
4433 */
4434 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
4435 /* array of xids */
4436 pid_t *list;
4437 /* how many elements the above list has */
4438 int length;
4439 /* each of these stored in a list by its cgroup */
4440 struct list_head links;
4441 /* pointer to the cgroup we belong to, for list removal purposes */
4442 struct cgroup *owner;
4443 /* for delayed destruction */
4444 struct delayed_work destroy_dwork;
4445};
4446
4447/*
4448 * The following two functions "fix" the issue where there are more pids
4449 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
4450 * TODO: replace with a kernel-wide solution to this problem
4451 */
4452#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
4453static void *pidlist_allocate(int count)
4454{
4455 if (PIDLIST_TOO_LARGE(count))
4456 return vmalloc(count * sizeof(pid_t));
4457 else
4458 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
4459}
4460
4461static void pidlist_free(void *p)
4462{
4463 kvfree(p);
4464}
4465
4466/*
4467 * Used to destroy all pidlists lingering waiting for destroy timer. None
4468 * should be left afterwards.
4469 */
4470static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
4471{
4472 struct cgroup_pidlist *l, *tmp_l;
4473
4474 mutex_lock(&cgrp->pidlist_mutex);
4475 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
4476 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
4477 mutex_unlock(&cgrp->pidlist_mutex);
4478
4479 flush_workqueue(cgroup_pidlist_destroy_wq);
4480 BUG_ON(!list_empty(&cgrp->pidlists));
4481}
4482
4483static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
4484{
4485 struct delayed_work *dwork = to_delayed_work(work);
4486 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
4487 destroy_dwork);
4488 struct cgroup_pidlist *tofree = NULL;
4489
4490 mutex_lock(&l->owner->pidlist_mutex);
4491
4492 /*
4493 * Destroy iff we didn't get queued again. The state won't change
4494 * as destroy_dwork can only be queued while locked.
4495 */
4496 if (!delayed_work_pending(dwork)) {
4497 list_del(&l->links);
4498 pidlist_free(l->list);
4499 put_pid_ns(l->key.ns);
4500 tofree = l;
4501 }
4502
4503 mutex_unlock(&l->owner->pidlist_mutex);
4504 kfree(tofree);
4505}
4506
4507/*
4508 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
4509 * Returns the number of unique elements.
4510 */
4511static int pidlist_uniq(pid_t *list, int length)
4512{
4513 int src, dest = 1;
4514
4515 /*
4516 * we presume the 0th element is unique, so i starts at 1. trivial
4517 * edge cases first; no work needs to be done for either
4518 */
4519 if (length == 0 || length == 1)
4520 return length;
4521 /* src and dest walk down the list; dest counts unique elements */
4522 for (src = 1; src < length; src++) {
4523 /* find next unique element */
4524 while (list[src] == list[src-1]) {
4525 src++;
4526 if (src == length)
4527 goto after;
4528 }
4529 /* dest always points to where the next unique element goes */
4530 list[dest] = list[src];
4531 dest++;
4532 }
4533after:
4534 return dest;
4535}
4536
4537/*
4538 * The two pid files - task and cgroup.procs - guaranteed that the result
4539 * is sorted, which forced this whole pidlist fiasco. As pid order is
4540 * different per namespace, each namespace needs differently sorted list,
4541 * making it impossible to use, for example, single rbtree of member tasks
4542 * sorted by task pointer. As pidlists can be fairly large, allocating one
4543 * per open file is dangerous, so cgroup had to implement shared pool of
4544 * pidlists keyed by cgroup and namespace.
4545 *
4546 * All this extra complexity was caused by the original implementation
4547 * committing to an entirely unnecessary property. In the long term, we
4548 * want to do away with it. Explicitly scramble sort order if on the
4549 * default hierarchy so that no such expectation exists in the new
4550 * interface.
4551 *
4552 * Scrambling is done by swapping every two consecutive bits, which is
4553 * non-identity one-to-one mapping which disturbs sort order sufficiently.
4554 */
4555static pid_t pid_fry(pid_t pid)
4556{ 3731{
4557 unsigned a = pid & 0x55555555; 3732 if (of->priv) {
4558 unsigned b = pid & 0xAAAAAAAA; 3733 css_task_iter_end(of->priv);
4559 3734 kfree(of->priv);
4560 return (a << 1) | (b >> 1);
4561}
4562
4563static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
4564{
4565 if (cgroup_on_dfl(cgrp))
4566 return pid_fry(pid);
4567 else
4568 return pid;
4569}
4570
4571static int cmppid(const void *a, const void *b)
4572{
4573 return *(pid_t *)a - *(pid_t *)b;
4574}
4575
4576static int fried_cmppid(const void *a, const void *b)
4577{
4578 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
4579}
4580
4581static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
4582 enum cgroup_filetype type)
4583{
4584 struct cgroup_pidlist *l;
4585 /* don't need task_nsproxy() if we're looking at ourself */
4586 struct pid_namespace *ns = task_active_pid_ns(current);
4587
4588 lockdep_assert_held(&cgrp->pidlist_mutex);
4589
4590 list_for_each_entry(l, &cgrp->pidlists, links)
4591 if (l->key.type == type && l->key.ns == ns)
4592 return l;
4593 return NULL;
4594}
4595
4596/*
4597 * find the appropriate pidlist for our purpose (given procs vs tasks)
4598 * returns with the lock on that pidlist already held, and takes care
4599 * of the use count, or returns NULL with no locks held if we're out of
4600 * memory.
4601 */
4602static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
4603 enum cgroup_filetype type)
4604{
4605 struct cgroup_pidlist *l;
4606
4607 lockdep_assert_held(&cgrp->pidlist_mutex);
4608
4609 l = cgroup_pidlist_find(cgrp, type);
4610 if (l)
4611 return l;
4612
4613 /* entry not found; create a new one */
4614 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
4615 if (!l)
4616 return l;
4617
4618 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
4619 l->key.type = type;
4620 /* don't need task_nsproxy() if we're looking at ourself */
4621 l->key.ns = get_pid_ns(task_active_pid_ns(current));
4622 l->owner = cgrp;
4623 list_add(&l->links, &cgrp->pidlists);
4624 return l;
4625}
4626
4627/*
4628 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
4629 */
4630static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
4631 struct cgroup_pidlist **lp)
4632{
4633 pid_t *array;
4634 int length;
4635 int pid, n = 0; /* used for populating the array */
4636 struct css_task_iter it;
4637 struct task_struct *tsk;
4638 struct cgroup_pidlist *l;
4639
4640 lockdep_assert_held(&cgrp->pidlist_mutex);
4641
4642 /*
4643 * If cgroup gets more users after we read count, we won't have
4644 * enough space - tough. This race is indistinguishable to the
4645 * caller from the case that the additional cgroup users didn't
4646 * show up until sometime later on.
4647 */
4648 length = cgroup_task_count(cgrp);
4649 array = pidlist_allocate(length);
4650 if (!array)
4651 return -ENOMEM;
4652 /* now, populate the array */
4653 css_task_iter_start(&cgrp->self, &it);
4654 while ((tsk = css_task_iter_next(&it))) {
4655 if (unlikely(n == length))
4656 break;
4657 /* get tgid or pid for procs or tasks file respectively */
4658 if (type == CGROUP_FILE_PROCS)
4659 pid = task_tgid_vnr(tsk);
4660 else
4661 pid = task_pid_vnr(tsk);
4662 if (pid > 0) /* make sure to only use valid results */
4663 array[n++] = pid;
4664 }
4665 css_task_iter_end(&it);
4666 length = n;
4667 /* now sort & (if procs) strip out duplicates */
4668 if (cgroup_on_dfl(cgrp))
4669 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
4670 else
4671 sort(array, length, sizeof(pid_t), cmppid, NULL);
4672 if (type == CGROUP_FILE_PROCS)
4673 length = pidlist_uniq(array, length);
4674
4675 l = cgroup_pidlist_find_create(cgrp, type);
4676 if (!l) {
4677 pidlist_free(array);
4678 return -ENOMEM;
4679 } 3735 }
4680
4681 /* store array, freeing old if necessary */
4682 pidlist_free(l->list);
4683 l->list = array;
4684 l->length = length;
4685 *lp = l;
4686 return 0;
4687} 3736}
4688 3737
4689/** 3738static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4690 * cgroupstats_build - build and fill cgroupstats
4691 * @stats: cgroupstats to fill information into
4692 * @dentry: A dentry entry belonging to the cgroup for which stats have
4693 * been requested.
4694 *
4695 * Build and fill cgroupstats so that taskstats can export it to user
4696 * space.
4697 */
4698int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
4699{ 3739{
4700 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 3740 struct kernfs_open_file *of = s->private;
4701 struct cgroup *cgrp; 3741 struct css_task_iter *it = of->priv;
4702 struct css_task_iter it; 3742 struct task_struct *task;
4703 struct task_struct *tsk;
4704
4705 /* it should be kernfs_node belonging to cgroupfs and is a directory */
4706 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4707 kernfs_type(kn) != KERNFS_DIR)
4708 return -EINVAL;
4709
4710 mutex_lock(&cgroup_mutex);
4711
4712 /*
4713 * We aren't being called from kernfs and there's no guarantee on
4714 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
4715 * @kn->priv is RCU safe. Let's do the RCU dancing.
4716 */
4717 rcu_read_lock();
4718 cgrp = rcu_dereference(kn->priv);
4719 if (!cgrp || cgroup_is_dead(cgrp)) {
4720 rcu_read_unlock();
4721 mutex_unlock(&cgroup_mutex);
4722 return -ENOENT;
4723 }
4724 rcu_read_unlock();
4725 3743
4726 css_task_iter_start(&cgrp->self, &it); 3744 do {
4727 while ((tsk = css_task_iter_next(&it))) { 3745 task = css_task_iter_next(it);
4728 switch (tsk->state) { 3746 } while (task && !thread_group_leader(task));
4729 case TASK_RUNNING:
4730 stats->nr_running++;
4731 break;
4732 case TASK_INTERRUPTIBLE:
4733 stats->nr_sleeping++;
4734 break;
4735 case TASK_UNINTERRUPTIBLE:
4736 stats->nr_uninterruptible++;
4737 break;
4738 case TASK_STOPPED:
4739 stats->nr_stopped++;
4740 break;
4741 default:
4742 if (delayacct_is_task_waiting_on_io(tsk))
4743 stats->nr_io_wait++;
4744 break;
4745 }
4746 }
4747 css_task_iter_end(&it);
4748 3747
4749 mutex_unlock(&cgroup_mutex); 3748 return task;
4750 return 0;
4751} 3749}
4752 3750
4753 3751static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4754/*
4755 * seq_file methods for the tasks/procs files. The seq_file position is the
4756 * next pid to display; the seq_file iterator is a pointer to the pid
4757 * in the cgroup->l->list array.
4758 */
4759
4760static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
4761{ 3752{
4762 /*
4763 * Initially we receive a position value that corresponds to
4764 * one more than the last pid shown (or 0 on the first call or
4765 * after a seek to the start). Use a binary-search to find the
4766 * next pid to display, if any
4767 */
4768 struct kernfs_open_file *of = s->private; 3753 struct kernfs_open_file *of = s->private;
4769 struct cgroup *cgrp = seq_css(s)->cgroup; 3754 struct cgroup *cgrp = seq_css(s)->cgroup;
4770 struct cgroup_pidlist *l; 3755 struct css_task_iter *it = of->priv;
4771 enum cgroup_filetype type = seq_cft(s)->private;
4772 int index = 0, pid = *pos;
4773 int *iter, ret;
4774
4775 mutex_lock(&cgrp->pidlist_mutex);
4776 3756
4777 /* 3757 /*
4778 * !NULL @of->priv indicates that this isn't the first start() 3758 * When a seq_file is seeked, it's always traversed sequentially
4779 * after open. If the matching pidlist is around, we can use that. 3759 * from position 0, so we can simply keep iterating on !0 *pos.
4780 * Look for it. Note that @of->priv can't be used directly. It
4781 * could already have been destroyed.
4782 */ 3760 */
4783 if (of->priv) 3761 if (!it) {
4784 of->priv = cgroup_pidlist_find(cgrp, type); 3762 if (WARN_ON_ONCE((*pos)++))
4785 3763 return ERR_PTR(-EINVAL);
4786 /*
4787 * Either this is the first start() after open or the matching
4788 * pidlist has been destroyed inbetween. Create a new one.
4789 */
4790 if (!of->priv) {
4791 ret = pidlist_array_load(cgrp, type,
4792 (struct cgroup_pidlist **)&of->priv);
4793 if (ret)
4794 return ERR_PTR(ret);
4795 }
4796 l = of->priv;
4797
4798 if (pid) {
4799 int end = l->length;
4800
4801 while (index < end) {
4802 int mid = (index + end) / 2;
4803 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
4804 index = mid;
4805 break;
4806 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
4807 index = mid + 1;
4808 else
4809 end = mid;
4810 }
4811 }
4812 /* If we're off the end of the array, we're done */
4813 if (index >= l->length)
4814 return NULL;
4815 /* Update the abstract position to be the actual pid that we found */
4816 iter = l->list + index;
4817 *pos = cgroup_pid_fry(cgrp, *iter);
4818 return iter;
4819}
4820
4821static void cgroup_pidlist_stop(struct seq_file *s, void *v)
4822{
4823 struct kernfs_open_file *of = s->private;
4824 struct cgroup_pidlist *l = of->priv;
4825
4826 if (l)
4827 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
4828 CGROUP_PIDLIST_DESTROY_DELAY);
4829 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
4830}
4831 3764
4832static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3765 it = kzalloc(sizeof(*it), GFP_KERNEL);
4833{ 3766 if (!it)
4834 struct kernfs_open_file *of = s->private; 3767 return ERR_PTR(-ENOMEM);
4835 struct cgroup_pidlist *l = of->priv; 3768 of->priv = it;
4836 pid_t *p = v; 3769 css_task_iter_start(&cgrp->self, it);
4837 pid_t *end = l->list + l->length; 3770 } else if (!(*pos)++) {
4838 /* 3771 css_task_iter_end(it);
4839 * Advance to the next pid in the array. If this goes off the 3772 css_task_iter_start(&cgrp->self, it);
4840 * end, we're done
4841 */
4842 p++;
4843 if (p >= end) {
4844 return NULL;
4845 } else {
4846 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
4847 return p;
4848 } 3773 }
4849}
4850
4851static int cgroup_pidlist_show(struct seq_file *s, void *v)
4852{
4853 seq_printf(s, "%d\n", *(int *)v);
4854 3774
4855 return 0; 3775 return cgroup_procs_next(s, NULL, NULL);
4856} 3776}
4857 3777
4858static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3778static int cgroup_procs_show(struct seq_file *s, void *v)
4859 struct cftype *cft)
4860{ 3779{
4861 return notify_on_release(css->cgroup); 3780 seq_printf(s, "%d\n", task_tgid_vnr(v));
4862}
4863
4864static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4865 struct cftype *cft, u64 val)
4866{
4867 if (val)
4868 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4869 else
4870 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4871 return 0;
4872}
4873
4874static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4875 struct cftype *cft)
4876{
4877 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4878}
4879
4880static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4881 struct cftype *cft, u64 val)
4882{
4883 if (val)
4884 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4885 else
4886 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4887 return 0; 3781 return 0;
4888} 3782}
4889 3783
4890/* cgroup core interface files for the default hierarchy */ 3784/* cgroup core interface files for the default hierarchy */
4891static struct cftype cgroup_dfl_base_files[] = { 3785static struct cftype cgroup_base_files[] = {
4892 { 3786 {
4893 .name = "cgroup.procs", 3787 .name = "cgroup.procs",
4894 .file_offset = offsetof(struct cgroup, procs_file), 3788 .file_offset = offsetof(struct cgroup, procs_file),
4895 .seq_start = cgroup_pidlist_start, 3789 .release = cgroup_procs_release,
4896 .seq_next = cgroup_pidlist_next, 3790 .seq_start = cgroup_procs_start,
4897 .seq_stop = cgroup_pidlist_stop, 3791 .seq_next = cgroup_procs_next,
4898 .seq_show = cgroup_pidlist_show, 3792 .seq_show = cgroup_procs_show,
4899 .private = CGROUP_FILE_PROCS,
4900 .write = cgroup_procs_write, 3793 .write = cgroup_procs_write,
4901 }, 3794 },
4902 { 3795 {
@@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = {
4917 { } /* terminate */ 3810 { } /* terminate */
4918}; 3811};
4919 3812
4920/* cgroup core interface files for the legacy hierarchies */
4921static struct cftype cgroup_legacy_base_files[] = {
4922 {
4923 .name = "cgroup.procs",
4924 .seq_start = cgroup_pidlist_start,
4925 .seq_next = cgroup_pidlist_next,
4926 .seq_stop = cgroup_pidlist_stop,
4927 .seq_show = cgroup_pidlist_show,
4928 .private = CGROUP_FILE_PROCS,
4929 .write = cgroup_procs_write,
4930 },
4931 {
4932 .name = "cgroup.clone_children",
4933 .read_u64 = cgroup_clone_children_read,
4934 .write_u64 = cgroup_clone_children_write,
4935 },
4936 {
4937 .name = "cgroup.sane_behavior",
4938 .flags = CFTYPE_ONLY_ON_ROOT,
4939 .seq_show = cgroup_sane_behavior_show,
4940 },
4941 {
4942 .name = "tasks",
4943 .seq_start = cgroup_pidlist_start,
4944 .seq_next = cgroup_pidlist_next,
4945 .seq_stop = cgroup_pidlist_stop,
4946 .seq_show = cgroup_pidlist_show,
4947 .private = CGROUP_FILE_TASKS,
4948 .write = cgroup_tasks_write,
4949 },
4950 {
4951 .name = "notify_on_release",
4952 .read_u64 = cgroup_read_notify_on_release,
4953 .write_u64 = cgroup_write_notify_on_release,
4954 },
4955 {
4956 .name = "release_agent",
4957 .flags = CFTYPE_ONLY_ON_ROOT,
4958 .seq_show = cgroup_release_agent_show,
4959 .write = cgroup_release_agent_write,
4960 .max_write_len = PATH_MAX - 1,
4961 },
4962 { } /* terminate */
4963};
4964
4965/* 3813/*
4966 * css destruction is four-stage process. 3814 * css destruction is four-stage process.
4967 * 3815 *
@@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work)
5007 } else { 3855 } else {
5008 /* cgroup free path */ 3856 /* cgroup free path */
5009 atomic_dec(&cgrp->root->nr_cgrps); 3857 atomic_dec(&cgrp->root->nr_cgrps);
5010 cgroup_pidlist_destroy_all(cgrp); 3858 cgroup1_pidlist_destroy_all(cgrp);
5011 cancel_work_sync(&cgrp->release_agent_work); 3859 cancel_work_sync(&cgrp->release_agent_work);
5012 3860
5013 if (cgroup_parent(cgrp)) { 3861 if (cgroup_parent(cgrp)) {
@@ -5302,8 +4150,7 @@ out_free_cgrp:
5302 return ERR_PTR(ret); 4150 return ERR_PTR(ret);
5303} 4151}
5304 4152
5305static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 4153int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5306 umode_t mode)
5307{ 4154{
5308 struct cgroup *parent, *cgrp; 4155 struct cgroup *parent, *cgrp;
5309 struct kernfs_node *kn; 4156 struct kernfs_node *kn;
@@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5507 */ 4354 */
5508 kernfs_remove(cgrp->kn); 4355 kernfs_remove(cgrp->kn);
5509 4356
5510 check_for_release(cgroup_parent(cgrp)); 4357 cgroup1_check_for_release(cgroup_parent(cgrp));
5511 4358
5512 /* put the base reference */ 4359 /* put the base reference */
5513 percpu_ref_kill(&cgrp->self.refcnt); 4360 percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5515 return 0; 4362 return 0;
5516}; 4363};
5517 4364
5518static int cgroup_rmdir(struct kernfs_node *kn) 4365int cgroup_rmdir(struct kernfs_node *kn)
5519{ 4366{
5520 struct cgroup *cgrp; 4367 struct cgroup *cgrp;
5521 int ret = 0; 4368 int ret = 0;
@@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
5535 4382
5536static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4383static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5537 .remount_fs = cgroup_remount, 4384 .remount_fs = cgroup_remount,
5538 .show_options = cgroup_show_options,
5539 .mkdir = cgroup_mkdir, 4385 .mkdir = cgroup_mkdir,
5540 .rmdir = cgroup_rmdir, 4386 .rmdir = cgroup_rmdir,
5541 .rename = cgroup_rename,
5542 .show_path = cgroup_show_path, 4387 .show_path = cgroup_show_path,
5543}; 4388};
5544 4389
@@ -5646,8 +4491,8 @@ int __init cgroup_init(void)
5646 4491
5647 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); 4492 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5648 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); 4493 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5649 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 4494 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5650 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); 4495 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5651 4496
5652 /* 4497 /*
5653 * The latency of the synchronize_sched() is too high for cgroups, 4498 * The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4542,7 @@ int __init cgroup_init(void)
5697 continue; 4542 continue;
5698 } 4543 }
5699 4544
5700 if (cgroup_ssid_no_v1(ssid)) 4545 if (cgroup1_ssid_disabled(ssid))
5701 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", 4546 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5702 ss->name); 4547 ss->name);
5703 4548
@@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void)
5744 */ 4589 */
5745 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4590 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5746 BUG_ON(!cgroup_destroy_wq); 4591 BUG_ON(!cgroup_destroy_wq);
5747
5748 /*
5749 * Used to destroy pidlists and separate to serve as flush domain.
5750 * Cap @max_active to 1 too.
5751 */
5752 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
5753 0, 1);
5754 BUG_ON(!cgroup_pidlist_destroy_wq);
5755
5756 return 0; 4592 return 0;
5757} 4593}
5758core_initcall(cgroup_wq_init); 4594core_initcall(cgroup_wq_init);
@@ -5835,42 +4671,6 @@ out:
5835 return retval; 4671 return retval;
5836} 4672}
5837 4673
5838/* Display information about each subsystem and each hierarchy */
5839static int proc_cgroupstats_show(struct seq_file *m, void *v)
5840{
5841 struct cgroup_subsys *ss;
5842 int i;
5843
5844 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
5845 /*
5846 * ideally we don't want subsystems moving around while we do this.
5847 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
5848 * subsys/hierarchy state.
5849 */
5850 mutex_lock(&cgroup_mutex);
5851
5852 for_each_subsys(ss, i)
5853 seq_printf(m, "%s\t%d\t%d\t%d\n",
5854 ss->legacy_name, ss->root->hierarchy_id,
5855 atomic_read(&ss->root->nr_cgrps),
5856 cgroup_ssid_enabled(i));
5857
5858 mutex_unlock(&cgroup_mutex);
5859 return 0;
5860}
5861
5862static int cgroupstats_open(struct inode *inode, struct file *file)
5863{
5864 return single_open(file, proc_cgroupstats_show, NULL);
5865}
5866
5867static const struct file_operations proc_cgroupstats_operations = {
5868 .open = cgroupstats_open,
5869 .read = seq_read,
5870 .llseek = seq_lseek,
5871 .release = single_release,
5872};
5873
5874/** 4674/**
5875 * cgroup_fork - initialize cgroup related fields during copy_process() 4675 * cgroup_fork - initialize cgroup related fields during copy_process()
5876 * @child: pointer to task_struct of forking parent process. 4676 * @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task)
6050 put_css_set(cset); 4850 put_css_set(cset);
6051} 4851}
6052 4852
6053static void check_for_release(struct cgroup *cgrp)
6054{
6055 if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
6056 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
6057 schedule_work(&cgrp->release_agent_work);
6058}
6059
6060/*
6061 * Notify userspace when a cgroup is released, by running the
6062 * configured release agent with the name of the cgroup (path
6063 * relative to the root of cgroup file system) as the argument.
6064 *
6065 * Most likely, this user command will try to rmdir this cgroup.
6066 *
6067 * This races with the possibility that some other task will be
6068 * attached to this cgroup before it is removed, or that some other
6069 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
6070 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
6071 * unused, and this cgroup will be reprieved from its death sentence,
6072 * to continue to serve a useful existence. Next time it's released,
6073 * we will get notified again, if it still has 'notify_on_release' set.
6074 *
6075 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
6076 * means only wait until the task is successfully execve()'d. The
6077 * separate release agent task is forked by call_usermodehelper(),
6078 * then control in this thread returns here, without waiting for the
6079 * release agent task. We don't bother to wait because the caller of
6080 * this routine has no use for the exit status of the release agent
6081 * task, so no sense holding our caller up for that.
6082 */
6083static void cgroup_release_agent(struct work_struct *work)
6084{
6085 struct cgroup *cgrp =
6086 container_of(work, struct cgroup, release_agent_work);
6087 char *pathbuf = NULL, *agentbuf = NULL;
6088 char *argv[3], *envp[3];
6089 int ret;
6090
6091 mutex_lock(&cgroup_mutex);
6092
6093 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
6094 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
6095 if (!pathbuf || !agentbuf)
6096 goto out;
6097
6098 spin_lock_irq(&css_set_lock);
6099 ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
6100 spin_unlock_irq(&css_set_lock);
6101 if (ret < 0 || ret >= PATH_MAX)
6102 goto out;
6103
6104 argv[0] = agentbuf;
6105 argv[1] = pathbuf;
6106 argv[2] = NULL;
6107
6108 /* minimal command environment */
6109 envp[0] = "HOME=/";
6110 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
6111 envp[2] = NULL;
6112
6113 mutex_unlock(&cgroup_mutex);
6114 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
6115 goto out_free;
6116out:
6117 mutex_unlock(&cgroup_mutex);
6118out_free:
6119 kfree(agentbuf);
6120 kfree(pathbuf);
6121}
6122
6123static int __init cgroup_disable(char *str) 4853static int __init cgroup_disable(char *str)
6124{ 4854{
6125 struct cgroup_subsys *ss; 4855 struct cgroup_subsys *ss;
@@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str)
6141} 4871}
6142__setup("cgroup_disable=", cgroup_disable); 4872__setup("cgroup_disable=", cgroup_disable);
6143 4873
6144static int __init cgroup_no_v1(char *str)
6145{
6146 struct cgroup_subsys *ss;
6147 char *token;
6148 int i;
6149
6150 while ((token = strsep(&str, ",")) != NULL) {
6151 if (!*token)
6152 continue;
6153
6154 if (!strcmp(token, "all")) {
6155 cgroup_no_v1_mask = U16_MAX;
6156 break;
6157 }
6158
6159 for_each_subsys(ss, i) {
6160 if (strcmp(token, ss->name) &&
6161 strcmp(token, ss->legacy_name))
6162 continue;
6163
6164 cgroup_no_v1_mask |= 1 << i;
6165 }
6166 }
6167 return 1;
6168}
6169__setup("cgroup_no_v1=", cgroup_no_v1);
6170
6171/** 4874/**
6172 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 4875 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
6173 * @dentry: directory dentry of interest 4876 * @dentry: directory dentry of interest
@@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6197 * have been or be removed at any point. @kn->priv is RCU 4900 * have been or be removed at any point. @kn->priv is RCU
6198 * protected for this access. See css_release_work_fn() for details. 4901 * protected for this access. See css_release_work_fn() for details.
6199 */ 4902 */
6200 cgrp = rcu_dereference(kn->priv); 4903 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6201 if (cgrp) 4904 if (cgrp)
6202 css = cgroup_css(cgrp, ss); 4905 css = cgroup_css(cgrp, ss);
6203 4906
@@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
6349 5052
6350#endif /* CONFIG_SOCK_CGROUP_DATA */ 5053#endif /* CONFIG_SOCK_CGROUP_DATA */
6351 5054
6352/* cgroup namespaces */
6353
6354static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
6355{
6356 return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
6357}
6358
6359static void dec_cgroup_namespaces(struct ucounts *ucounts)
6360{
6361 dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
6362}
6363
6364static struct cgroup_namespace *alloc_cgroup_ns(void)
6365{
6366 struct cgroup_namespace *new_ns;
6367 int ret;
6368
6369 new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
6370 if (!new_ns)
6371 return ERR_PTR(-ENOMEM);
6372 ret = ns_alloc_inum(&new_ns->ns);
6373 if (ret) {
6374 kfree(new_ns);
6375 return ERR_PTR(ret);
6376 }
6377 atomic_set(&new_ns->count, 1);
6378 new_ns->ns.ops = &cgroupns_operations;
6379 return new_ns;
6380}
6381
6382void free_cgroup_ns(struct cgroup_namespace *ns)
6383{
6384 put_css_set(ns->root_cset);
6385 dec_cgroup_namespaces(ns->ucounts);
6386 put_user_ns(ns->user_ns);
6387 ns_free_inum(&ns->ns);
6388 kfree(ns);
6389}
6390EXPORT_SYMBOL(free_cgroup_ns);
6391
6392struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6393 struct user_namespace *user_ns,
6394 struct cgroup_namespace *old_ns)
6395{
6396 struct cgroup_namespace *new_ns;
6397 struct ucounts *ucounts;
6398 struct css_set *cset;
6399
6400 BUG_ON(!old_ns);
6401
6402 if (!(flags & CLONE_NEWCGROUP)) {
6403 get_cgroup_ns(old_ns);
6404 return old_ns;
6405 }
6406
6407 /* Allow only sysadmin to create cgroup namespace. */
6408 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
6409 return ERR_PTR(-EPERM);
6410
6411 ucounts = inc_cgroup_namespaces(user_ns);
6412 if (!ucounts)
6413 return ERR_PTR(-ENOSPC);
6414
6415 /* It is not safe to take cgroup_mutex here */
6416 spin_lock_irq(&css_set_lock);
6417 cset = task_css_set(current);
6418 get_css_set(cset);
6419 spin_unlock_irq(&css_set_lock);
6420
6421 new_ns = alloc_cgroup_ns();
6422 if (IS_ERR(new_ns)) {
6423 put_css_set(cset);
6424 dec_cgroup_namespaces(ucounts);
6425 return new_ns;
6426 }
6427
6428 new_ns->user_ns = get_user_ns(user_ns);
6429 new_ns->ucounts = ucounts;
6430 new_ns->root_cset = cset;
6431
6432 return new_ns;
6433}
6434
6435static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
6436{
6437 return container_of(ns, struct cgroup_namespace, ns);
6438}
6439
6440static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
6441{
6442 struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
6443
6444 if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
6445 !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
6446 return -EPERM;
6447
6448 /* Don't need to do anything if we are attaching to our own cgroupns. */
6449 if (cgroup_ns == nsproxy->cgroup_ns)
6450 return 0;
6451
6452 get_cgroup_ns(cgroup_ns);
6453 put_cgroup_ns(nsproxy->cgroup_ns);
6454 nsproxy->cgroup_ns = cgroup_ns;
6455
6456 return 0;
6457}
6458
6459static struct ns_common *cgroupns_get(struct task_struct *task)
6460{
6461 struct cgroup_namespace *ns = NULL;
6462 struct nsproxy *nsproxy;
6463
6464 task_lock(task);
6465 nsproxy = task->nsproxy;
6466 if (nsproxy) {
6467 ns = nsproxy->cgroup_ns;
6468 get_cgroup_ns(ns);
6469 }
6470 task_unlock(task);
6471
6472 return ns ? &ns->ns : NULL;
6473}
6474
6475static void cgroupns_put(struct ns_common *ns)
6476{
6477 put_cgroup_ns(to_cg_ns(ns));
6478}
6479
6480static struct user_namespace *cgroupns_owner(struct ns_common *ns)
6481{
6482 return to_cg_ns(ns)->user_ns;
6483}
6484
6485const struct proc_ns_operations cgroupns_operations = {
6486 .name = "cgroup",
6487 .type = CLONE_NEWCGROUP,
6488 .get = cgroupns_get,
6489 .put = cgroupns_put,
6490 .install = cgroupns_install,
6491 .owner = cgroupns_owner,
6492};
6493
6494static __init int cgroup_namespaces_init(void)
6495{
6496 return 0;
6497}
6498subsys_initcall(cgroup_namespaces_init);
6499
6500#ifdef CONFIG_CGROUP_BPF 5055#ifdef CONFIG_CGROUP_BPF
6501int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, 5056int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
6502 enum bpf_attach_type type, bool overridable) 5057 enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
6510 return ret; 5065 return ret;
6511} 5066}
6512#endif /* CONFIG_CGROUP_BPF */ 5067#endif /* CONFIG_CGROUP_BPF */
6513
6514#ifdef CONFIG_CGROUP_DEBUG
6515static struct cgroup_subsys_state *
6516debug_css_alloc(struct cgroup_subsys_state *parent_css)
6517{
6518 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
6519
6520 if (!css)
6521 return ERR_PTR(-ENOMEM);
6522
6523 return css;
6524}
6525
6526static void debug_css_free(struct cgroup_subsys_state *css)
6527{
6528 kfree(css);
6529}
6530
6531static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
6532 struct cftype *cft)
6533{
6534 return cgroup_task_count(css->cgroup);
6535}
6536
6537static u64 current_css_set_read(struct cgroup_subsys_state *css,
6538 struct cftype *cft)
6539{
6540 return (u64)(unsigned long)current->cgroups;
6541}
6542
6543static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
6544 struct cftype *cft)
6545{
6546 u64 count;
6547
6548 rcu_read_lock();
6549 count = atomic_read(&task_css_set(current)->refcount);
6550 rcu_read_unlock();
6551 return count;
6552}
6553
6554static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6555{
6556 struct cgrp_cset_link *link;
6557 struct css_set *cset;
6558 char *name_buf;
6559
6560 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
6561 if (!name_buf)
6562 return -ENOMEM;
6563
6564 spin_lock_irq(&css_set_lock);
6565 rcu_read_lock();
6566 cset = rcu_dereference(current->cgroups);
6567 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
6568 struct cgroup *c = link->cgrp;
6569
6570 cgroup_name(c, name_buf, NAME_MAX + 1);
6571 seq_printf(seq, "Root %d group %s\n",
6572 c->root->hierarchy_id, name_buf);
6573 }
6574 rcu_read_unlock();
6575 spin_unlock_irq(&css_set_lock);
6576 kfree(name_buf);
6577 return 0;
6578}
6579
6580#define MAX_TASKS_SHOWN_PER_CSS 25
6581static int cgroup_css_links_read(struct seq_file *seq, void *v)
6582{
6583 struct cgroup_subsys_state *css = seq_css(seq);
6584 struct cgrp_cset_link *link;
6585
6586 spin_lock_irq(&css_set_lock);
6587 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
6588 struct css_set *cset = link->cset;
6589 struct task_struct *task;
6590 int count = 0;
6591
6592 seq_printf(seq, "css_set %p\n", cset);
6593
6594 list_for_each_entry(task, &cset->tasks, cg_list) {
6595 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
6596 goto overflow;
6597 seq_printf(seq, " task %d\n", task_pid_vnr(task));
6598 }
6599
6600 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
6601 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
6602 goto overflow;
6603 seq_printf(seq, " task %d\n", task_pid_vnr(task));
6604 }
6605 continue;
6606 overflow:
6607 seq_puts(seq, " ...\n");
6608 }
6609 spin_unlock_irq(&css_set_lock);
6610 return 0;
6611}
6612
6613static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
6614{
6615 return (!cgroup_is_populated(css->cgroup) &&
6616 !css_has_online_children(&css->cgroup->self));
6617}
6618
6619static struct cftype debug_files[] = {
6620 {
6621 .name = "taskcount",
6622 .read_u64 = debug_taskcount_read,
6623 },
6624
6625 {
6626 .name = "current_css_set",
6627 .read_u64 = current_css_set_read,
6628 },
6629
6630 {
6631 .name = "current_css_set_refcount",
6632 .read_u64 = current_css_set_refcount_read,
6633 },
6634
6635 {
6636 .name = "current_css_set_cg_links",
6637 .seq_show = current_css_set_cg_links_read,
6638 },
6639
6640 {
6641 .name = "cgroup_css_links",
6642 .seq_show = cgroup_css_links_read,
6643 },
6644
6645 {
6646 .name = "releasable",
6647 .read_u64 = releasable_read,
6648 },
6649
6650 { } /* terminate */
6651};
6652
6653struct cgroup_subsys debug_cgrp_subsys = {
6654 .css_alloc = debug_css_alloc,
6655 .css_free = debug_css_free,
6656 .legacy_cftypes = debug_files,
6657};
6658#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c
index b3088886cd37..b3088886cd37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cgroup/cpuset.c
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup/freezer.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 000000000000..cff7ea62c38f
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
1#include "cgroup-internal.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/nsproxy.h>
6#include <linux/proc_ns.h>
7
8
9/* cgroup namespaces */
10
11static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
12{
13 return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
14}
15
16static void dec_cgroup_namespaces(struct ucounts *ucounts)
17{
18 dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
19}
20
21static struct cgroup_namespace *alloc_cgroup_ns(void)
22{
23 struct cgroup_namespace *new_ns;
24 int ret;
25
26 new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
27 if (!new_ns)
28 return ERR_PTR(-ENOMEM);
29 ret = ns_alloc_inum(&new_ns->ns);
30 if (ret) {
31 kfree(new_ns);
32 return ERR_PTR(ret);
33 }
34 atomic_set(&new_ns->count, 1);
35 new_ns->ns.ops = &cgroupns_operations;
36 return new_ns;
37}
38
39void free_cgroup_ns(struct cgroup_namespace *ns)
40{
41 put_css_set(ns->root_cset);
42 dec_cgroup_namespaces(ns->ucounts);
43 put_user_ns(ns->user_ns);
44 ns_free_inum(&ns->ns);
45 kfree(ns);
46}
47EXPORT_SYMBOL(free_cgroup_ns);
48
49struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
50 struct user_namespace *user_ns,
51 struct cgroup_namespace *old_ns)
52{
53 struct cgroup_namespace *new_ns;
54 struct ucounts *ucounts;
55 struct css_set *cset;
56
57 BUG_ON(!old_ns);
58
59 if (!(flags & CLONE_NEWCGROUP)) {
60 get_cgroup_ns(old_ns);
61 return old_ns;
62 }
63
64 /* Allow only sysadmin to create cgroup namespace. */
65 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
66 return ERR_PTR(-EPERM);
67
68 ucounts = inc_cgroup_namespaces(user_ns);
69 if (!ucounts)
70 return ERR_PTR(-ENOSPC);
71
72 /* It is not safe to take cgroup_mutex here */
73 spin_lock_irq(&css_set_lock);
74 cset = task_css_set(current);
75 get_css_set(cset);
76 spin_unlock_irq(&css_set_lock);
77
78 new_ns = alloc_cgroup_ns();
79 if (IS_ERR(new_ns)) {
80 put_css_set(cset);
81 dec_cgroup_namespaces(ucounts);
82 return new_ns;
83 }
84
85 new_ns->user_ns = get_user_ns(user_ns);
86 new_ns->ucounts = ucounts;
87 new_ns->root_cset = cset;
88
89 return new_ns;
90}
91
92static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
93{
94 return container_of(ns, struct cgroup_namespace, ns);
95}
96
97static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
98{
99 struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
100
101 if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
102 !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
103 return -EPERM;
104
105 /* Don't need to do anything if we are attaching to our own cgroupns. */
106 if (cgroup_ns == nsproxy->cgroup_ns)
107 return 0;
108
109 get_cgroup_ns(cgroup_ns);
110 put_cgroup_ns(nsproxy->cgroup_ns);
111 nsproxy->cgroup_ns = cgroup_ns;
112
113 return 0;
114}
115
116static struct ns_common *cgroupns_get(struct task_struct *task)
117{
118 struct cgroup_namespace *ns = NULL;
119 struct nsproxy *nsproxy;
120
121 task_lock(task);
122 nsproxy = task->nsproxy;
123 if (nsproxy) {
124 ns = nsproxy->cgroup_ns;
125 get_cgroup_ns(ns);
126 }
127 task_unlock(task);
128
129 return ns ? &ns->ns : NULL;
130}
131
132static void cgroupns_put(struct ns_common *ns)
133{
134 put_cgroup_ns(to_cg_ns(ns));
135}
136
137static struct user_namespace *cgroupns_owner(struct ns_common *ns)
138{
139 return to_cg_ns(ns)->user_ns;
140}
141
142const struct proc_ns_operations cgroupns_operations = {
143 .name = "cgroup",
144 .type = CLONE_NEWCGROUP,
145 .get = cgroupns_get,
146 .put = cgroupns_put,
147 .install = cgroupns_install,
148 .owner = cgroupns_owner,
149};
150
151static __init int cgroup_namespaces_init(void)
152{
153 return 0;
154}
155subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c
index 2bd673783f1a..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup/pids.c
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..defad3c5e7dc
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
1/*
2 * RDMA resource limiting controller for cgroups.
3 *
4 * Used to allow a cgroup hierarchy to stop processes from consuming
5 * additional RDMA resources after a certain limit is reached.
6 *
7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
8 *
9 * This file is subject to the terms and conditions of version 2 of the GNU
10 * General Public License. See the file COPYING in the main directory of the
11 * Linux distribution for more details.
12 */
13
14#include <linux/bitops.h>
15#include <linux/slab.h>
16#include <linux/seq_file.h>
17#include <linux/cgroup.h>
18#include <linux/parser.h>
19#include <linux/cgroup_rdma.h>
20
21#define RDMACG_MAX_STR "max"
22
23/*
24 * Protects list of resource pools maintained on per cgroup basis
25 * and rdma device list.
26 */
27static DEFINE_MUTEX(rdmacg_mutex);
28static LIST_HEAD(rdmacg_devices);
29
30enum rdmacg_file_type {
31 RDMACG_RESOURCE_TYPE_MAX,
32 RDMACG_RESOURCE_TYPE_STAT,
33};
34
35/*
36 * resource table definition as to be seen by the user.
37 * Need to add entries to it when more resources are
38 * added/defined at IB verb/core layer.
39 */
40static char const *rdmacg_resource_names[] = {
41 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
42 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
43};
44
45/* resource tracker for each resource of rdma cgroup */
46struct rdmacg_resource {
47 int max;
48 int usage;
49};
50
51/*
52 * resource pool object which represents per cgroup, per device
53 * resources. There are multiple instances of this object per cgroup,
54 * therefore it cannot be embedded within rdma_cgroup structure. It
55 * is maintained as list.
56 */
57struct rdmacg_resource_pool {
58 struct rdmacg_device *device;
59 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
60
61 struct list_head cg_node;
62 struct list_head dev_node;
63
64 /* count active user tasks of this pool */
65 u64 usage_sum;
66 /* total number counts which are set to max */
67 int num_max_cnt;
68};
69
70static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
71{
72 return container_of(css, struct rdma_cgroup, css);
73}
74
75static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
76{
77 return css_rdmacg(cg->css.parent);
78}
79
80static inline struct rdma_cgroup *get_current_rdmacg(void)
81{
82 return css_rdmacg(task_get_css(current, rdma_cgrp_id));
83}
84
85static void set_resource_limit(struct rdmacg_resource_pool *rpool,
86 int index, int new_max)
87{
88 if (new_max == S32_MAX) {
89 if (rpool->resources[index].max != S32_MAX)
90 rpool->num_max_cnt++;
91 } else {
92 if (rpool->resources[index].max == S32_MAX)
93 rpool->num_max_cnt--;
94 }
95 rpool->resources[index].max = new_max;
96}
97
98static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
99{
100 int i;
101
102 for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
103 set_resource_limit(rpool, i, S32_MAX);
104}
105
106static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
107{
108 lockdep_assert_held(&rdmacg_mutex);
109
110 list_del(&rpool->cg_node);
111 list_del(&rpool->dev_node);
112 kfree(rpool);
113}
114
115static struct rdmacg_resource_pool *
116find_cg_rpool_locked(struct rdma_cgroup *cg,
117 struct rdmacg_device *device)
118
119{
120 struct rdmacg_resource_pool *pool;
121
122 lockdep_assert_held(&rdmacg_mutex);
123
124 list_for_each_entry(pool, &cg->rpools, cg_node)
125 if (pool->device == device)
126 return pool;
127
128 return NULL;
129}
130
131static struct rdmacg_resource_pool *
132get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
133{
134 struct rdmacg_resource_pool *rpool;
135
136 rpool = find_cg_rpool_locked(cg, device);
137 if (rpool)
138 return rpool;
139
140 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
141 if (!rpool)
142 return ERR_PTR(-ENOMEM);
143
144 rpool->device = device;
145 set_all_resource_max_limit(rpool);
146
147 INIT_LIST_HEAD(&rpool->cg_node);
148 INIT_LIST_HEAD(&rpool->dev_node);
149 list_add_tail(&rpool->cg_node, &cg->rpools);
150 list_add_tail(&rpool->dev_node, &device->rpools);
151 return rpool;
152}
153
154/**
155 * uncharge_cg_locked - uncharge resource for rdma cgroup
156 * @cg: pointer to cg to uncharge and all parents in hierarchy
157 * @device: pointer to rdmacg device
158 * @index: index of the resource to uncharge in cg (resource pool)
159 *
160 * It also frees the resource pool which was created as part of
161 * charging operation when there are no resources attached to
162 * resource pool.
163 */
164static void
165uncharge_cg_locked(struct rdma_cgroup *cg,
166 struct rdmacg_device *device,
167 enum rdmacg_resource_type index)
168{
169 struct rdmacg_resource_pool *rpool;
170
171 rpool = find_cg_rpool_locked(cg, device);
172
173 /*
174 * rpool cannot be null at this stage. Let kernel operate in case
175 * if there a bug in IB stack or rdma controller, instead of crashing
176 * the system.
177 */
178 if (unlikely(!rpool)) {
179 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
180 return;
181 }
182
183 rpool->resources[index].usage--;
184
185 /*
186 * A negative count (or overflow) is invalid,
187 * it indicates a bug in the rdma controller.
188 */
189 WARN_ON_ONCE(rpool->resources[index].usage < 0);
190 rpool->usage_sum--;
191 if (rpool->usage_sum == 0 &&
192 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
193 /*
194 * No user of the rpool and all entries are set to max, so
195 * safe to delete this rpool.
196 */
197 free_cg_rpool_locked(rpool);
198 }
199}
200
201/**
202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
203 * @device: pointer to rdmacg device
204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
205 * stop uncharging
206 * @index: index of the resource to uncharge in cg in given resource pool
207 */
208static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
209 struct rdmacg_device *device,
210 struct rdma_cgroup *stop_cg,
211 enum rdmacg_resource_type index)
212{
213 struct rdma_cgroup *p;
214
215 mutex_lock(&rdmacg_mutex);
216
217 for (p = cg; p != stop_cg; p = parent_rdmacg(p))
218 uncharge_cg_locked(p, device, index);
219
220 mutex_unlock(&rdmacg_mutex);
221
222 css_put(&cg->css);
223}
224
225/**
226 * rdmacg_uncharge - hierarchically uncharge rdma resource count
227 * @device: pointer to rdmacg device
228 * @index: index of the resource to uncharge in cgroup in given resource pool
229 */
230void rdmacg_uncharge(struct rdma_cgroup *cg,
231 struct rdmacg_device *device,
232 enum rdmacg_resource_type index)
233{
234 if (index >= RDMACG_RESOURCE_MAX)
235 return;
236
237 rdmacg_uncharge_hierarchy(cg, device, NULL, index);
238}
239EXPORT_SYMBOL(rdmacg_uncharge);
240
241/**
242 * rdmacg_try_charge - hierarchically try to charge the rdma resource
243 * @rdmacg: pointer to rdma cgroup which will own this resource
244 * @device: pointer to rdmacg device
245 * @index: index of the resource to charge in cgroup (resource pool)
246 *
247 * This function follows charging resource in hierarchical way.
248 * It will fail if the charge would cause the new value to exceed the
249 * hierarchical limit.
250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
251 * Returns pointer to rdmacg for this resource when charging is successful.
252 *
253 * Charger needs to account resources on two criteria.
254 * (a) per cgroup & (b) per device resource usage.
255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
256 * the configured limits. Per device provides granular configuration
257 * in multi device usage. It allocates resource pool in the hierarchy
258 * for each parent it come across for first resource. Later on resource
259 * pool will be available. Therefore it will be much faster thereon
260 * to charge/uncharge.
261 */
262int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
263 struct rdmacg_device *device,
264 enum rdmacg_resource_type index)
265{
266 struct rdma_cgroup *cg, *p;
267 struct rdmacg_resource_pool *rpool;
268 s64 new;
269 int ret = 0;
270
271 if (index >= RDMACG_RESOURCE_MAX)
272 return -EINVAL;
273
274 /*
275 * hold on to css, as cgroup can be removed but resource
276 * accounting happens on css.
277 */
278 cg = get_current_rdmacg();
279
280 mutex_lock(&rdmacg_mutex);
281 for (p = cg; p; p = parent_rdmacg(p)) {
282 rpool = get_cg_rpool_locked(p, device);
283 if (IS_ERR(rpool)) {
284 ret = PTR_ERR(rpool);
285 goto err;
286 } else {
287 new = rpool->resources[index].usage + 1;
288 if (new > rpool->resources[index].max) {
289 ret = -EAGAIN;
290 goto err;
291 } else {
292 rpool->resources[index].usage = new;
293 rpool->usage_sum++;
294 }
295 }
296 }
297 mutex_unlock(&rdmacg_mutex);
298
299 *rdmacg = cg;
300 return 0;
301
302err:
303 mutex_unlock(&rdmacg_mutex);
304 rdmacg_uncharge_hierarchy(cg, device, p, index);
305 return ret;
306}
307EXPORT_SYMBOL(rdmacg_try_charge);
308
/**
 * rdmacg_register_device - register rdmacg device to rdma controller.
 * @device: pointer to rdmacg device whose resources need to be accounted.
 *
 * If IB stack wish a device to participate in rdma cgroup resource
 * tracking, it must invoke this API to register with rdma cgroup before
 * any user space application can start using the RDMA resources.
 *
 * Always returns 0; there is currently no failure path (the int return
 * type is kept for API compatibility with callers checking the result).
 */
int rdmacg_register_device(struct rdmacg_device *device)
{
	INIT_LIST_HEAD(&device->dev_node);
	INIT_LIST_HEAD(&device->rpools);

	/* rdmacg_mutex serializes against charge paths and file handlers */
	mutex_lock(&rdmacg_mutex);
	list_add_tail(&device->dev_node, &rdmacg_devices);
	mutex_unlock(&rdmacg_mutex);
	return 0;
}
EXPORT_SYMBOL(rdmacg_register_device);
330
331/**
332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
333 * @device: pointer to rdmacg device which was previously registered with rdma
334 * controller using rdmacg_register_device().
335 *
336 * IB stack must invoke this after all the resources of the IB device
337 * are destroyed and after ensuring that no more resources will be created
338 * when this API is invoked.
339 */
340void rdmacg_unregister_device(struct rdmacg_device *device)
341{
342 struct rdmacg_resource_pool *rpool, *tmp;
343
344 /*
345 * Synchronize with any active resource settings,
346 * usage query happening via configfs.
347 */
348 mutex_lock(&rdmacg_mutex);
349 list_del_init(&device->dev_node);
350
351 /*
352 * Now that this device is off the cgroup list, its safe to free
353 * all the rpool resources.
354 */
355 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
356 free_cg_rpool_locked(rpool);
357
358 mutex_unlock(&rdmacg_mutex);
359}
360EXPORT_SYMBOL(rdmacg_unregister_device);
361
362static int parse_resource(char *c, int *intval)
363{
364 substring_t argstr;
365 const char **table = &rdmacg_resource_names[0];
366 char *name, *value = c;
367 size_t len;
368 int ret, i = 0;
369
370 name = strsep(&value, "=");
371 if (!name || !value)
372 return -EINVAL;
373
374 len = strlen(value);
375
376 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
377 if (strcmp(table[i], name))
378 continue;
379
380 argstr.from = value;
381 argstr.to = value + len;
382
383 ret = match_int(&argstr, intval);
384 if (ret >= 0) {
385 if (*intval < 0)
386 break;
387 return i;
388 }
389 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390 *intval = S32_MAX;
391 return i;
392 }
393 break;
394 }
395 return -EINVAL;
396}
397
398static int rdmacg_parse_limits(char *options,
399 int *new_limits, unsigned long *enables)
400{
401 char *c;
402 int err = -EINVAL;
403
404 /* parse resource options */
405 while ((c = strsep(&options, " ")) != NULL) {
406 int index, intval;
407
408 index = parse_resource(c, &intval);
409 if (index < 0)
410 goto err;
411
412 new_limits[index] = intval;
413 *enables |= BIT(index);
414 }
415 return 0;
416
417err:
418 return err;
419}
420
421static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
422{
423 struct rdmacg_device *device;
424
425 lockdep_assert_held(&rdmacg_mutex);
426
427 list_for_each_entry(device, &rdmacg_devices, dev_node)
428 if (!strcmp(name, device->name))
429 return device;
430
431 return NULL;
432}
433
/*
 * Write handler for the "rdma.max" cgroup file.  Input format:
 *   "<device_name> <resource>=<value|max> [<resource>=<value|max> ...]"
 * Parses the limits, then applies them to this cgroup's resource pool
 * for the named device.  Note the fall-through cleanup labels at the
 * bottom: dev_err -> parse_err -> err release the lock and the limits
 * buffer in reverse order of acquisition.
 */
static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	const char *dev_name;
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	char *options = strstrip(buf);
	int *new_limits;
	unsigned long enables = 0;
	int i = 0, ret = 0;

	/* extract the device name first */
	dev_name = strsep(&options, " ");
	if (!dev_name) {
		ret = -EINVAL;
		goto err;
	}

	/* one slot per resource type; parsed limits land here */
	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits) {
		ret = -ENOMEM;
		goto err;
	}

	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto parse_err;

	/* acquire lock to synchronize with hot plug devices */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto dev_err;
	}

	/* creates the pool on first use for this (cgroup, device) pair */
	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto dev_err;
	}

	/* now set the new limits of the rpool */
	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, i, new_limits[i]);

	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}

dev_err:
	mutex_unlock(&rdmacg_mutex);

parse_err:
	kfree(new_limits);

err:
	/* kernfs convention: report full consumption on success */
	return ret ?: nbytes;
}
500
501static void print_rpool_values(struct seq_file *sf,
502 struct rdmacg_resource_pool *rpool)
503{
504 enum rdmacg_file_type sf_type;
505 int i;
506 u32 value;
507
508 sf_type = seq_cft(sf)->private;
509
510 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
511 seq_puts(sf, rdmacg_resource_names[i]);
512 seq_putc(sf, '=');
513 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
514 if (rpool)
515 value = rpool->resources[i].max;
516 else
517 value = S32_MAX;
518 } else {
519 if (rpool)
520 value = rpool->resources[i].usage;
521 else
522 value = 0;
523 }
524
525 if (value == S32_MAX)
526 seq_puts(sf, RDMACG_MAX_STR);
527 else
528 seq_printf(sf, "%d", value);
529 seq_putc(sf, ' ');
530 }
531}
532
533static int rdmacg_resource_read(struct seq_file *sf, void *v)
534{
535 struct rdmacg_device *device;
536 struct rdmacg_resource_pool *rpool;
537 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
538
539 mutex_lock(&rdmacg_mutex);
540
541 list_for_each_entry(device, &rdmacg_devices, dev_node) {
542 seq_printf(sf, "%s ", device->name);
543
544 rpool = find_cg_rpool_locked(cg, device);
545 print_rpool_values(sf, rpool);
546
547 seq_putc(sf, '\n');
548 }
549
550 mutex_unlock(&rdmacg_mutex);
551 return 0;
552}
553
/* cgroup interface files for the rdma controller */
static struct cftype rdmacg_files[] = {
	{
		/* writable per-device limits, "max" means unlimited */
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		/* read-only current usage per device */
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
570
571static struct cgroup_subsys_state *
572rdmacg_css_alloc(struct cgroup_subsys_state *parent)
573{
574 struct rdma_cgroup *cg;
575
576 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
577 if (!cg)
578 return ERR_PTR(-ENOMEM);
579
580 INIT_LIST_HEAD(&cg->rpools);
581 return &cg->css;
582}
583
/* css_free callback: release the memory from rdmacg_css_alloc(). */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
590
591/**
592 * rdmacg_css_offline - cgroup css_offline callback
593 * @css: css of interest
594 *
595 * This function is called when @css is about to go away and responsible
596 * for shooting down all rdmacg associated with @css. As part of that it
597 * marks all the resource pool entries to max value, so that when resources are
598 * uncharged, associated resource pool can be freed as well.
599 */
600static void rdmacg_css_offline(struct cgroup_subsys_state *css)
601{
602 struct rdma_cgroup *cg = css_rdmacg(css);
603 struct rdmacg_resource_pool *rpool;
604
605 mutex_lock(&rdmacg_mutex);
606
607 list_for_each_entry(rpool, &cg->rpools, cg_node)
608 set_all_resource_max_limit(rpool);
609
610 mutex_unlock(&rdmacg_mutex);
611}
612
/*
 * rdma controller descriptor.  The same cftype table is registered for
 * both the legacy (v1) and default (v2) hierarchies.
 */
struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2eb3542e829..5b4e0b98f4eb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10959,5 +10959,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
10959 .css_alloc = perf_cgroup_css_alloc, 10959 .css_alloc = perf_cgroup_css_alloc,
10960 .css_free = perf_cgroup_css_free, 10960 .css_free = perf_cgroup_css_free,
10961 .attach = perf_cgroup_attach, 10961 .attach = perf_cgroup_attach,
10962 /*
10963 * Implicitly enable on dfl hierarchy so that perf events can
10964 * always be filtered by cgroup2 path as long as perf_event
10965 * controller is not mounted on a legacy hierarchy.
10966 */
10967 .implicit_on_dfl = true,
10962}; 10968};
10963#endif /* CONFIG_CGROUP_PERF */ 10969#endif /* CONFIG_CGROUP_PERF */
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 8fdee24725a7..eafbf11442b2 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
12{ 12{
13 FILE *fp; 13 FILE *fp;
14 char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; 14 char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
15 char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
15 char *token, *saved_ptr = NULL; 16 char *token, *saved_ptr = NULL;
16 int found = 0;
17 17
18 fp = fopen("/proc/mounts", "r"); 18 fp = fopen("/proc/mounts", "r");
19 if (!fp) 19 if (!fp)
@@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
24 * and inspect every cgroupfs mount point to find one that has 24 * and inspect every cgroupfs mount point to find one that has
25 * perf_event subsystem 25 * perf_event subsystem
26 */ 26 */
27 path_v1[0] = '\0';
28 path_v2[0] = '\0';
29
27 while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" 30 while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
28 STR(PATH_MAX)"s %*d %*d\n", 31 STR(PATH_MAX)"s %*d %*d\n",
29 mountpoint, type, tokens) == 3) { 32 mountpoint, type, tokens) == 3) {
30 33
31 if (!strcmp(type, "cgroup")) { 34 if (!path_v1[0] && !strcmp(type, "cgroup")) {
32 35
33 token = strtok_r(tokens, ",", &saved_ptr); 36 token = strtok_r(tokens, ",", &saved_ptr);
34 37
35 while (token != NULL) { 38 while (token != NULL) {
36 if (!strcmp(token, "perf_event")) { 39 if (!strcmp(token, "perf_event")) {
37 found = 1; 40 strcpy(path_v1, mountpoint);
38 break; 41 break;
39 } 42 }
40 token = strtok_r(NULL, ",", &saved_ptr); 43 token = strtok_r(NULL, ",", &saved_ptr);
41 } 44 }
42 } 45 }
43 if (found) 46
47 if (!path_v2[0] && !strcmp(type, "cgroup2"))
48 strcpy(path_v2, mountpoint);
49
50 if (path_v1[0] && path_v2[0])
44 break; 51 break;
45 } 52 }
46 fclose(fp); 53 fclose(fp);
47 if (!found) 54
55 if (path_v1[0])
56 path = path_v1;
57 else if (path_v2[0])
58 path = path_v2;
59 else
48 return -1; 60 return -1;
49 61
50 if (strlen(mountpoint) < maxlen) { 62 if (strlen(path) < maxlen) {
51 strcpy(buf, mountpoint); 63 strcpy(buf, path);
52 return 0; 64 return 0;
53 } 65 }
54 return -1; 66 return -1;