author     Linus Torvalds <torvalds@linux-foundation.org>    2014-08-04 13:11:28 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2014-08-04 13:11:28 -0400
commit     47dfe4037e37b2843055ea3feccf1c335ea23a9c (patch)
tree       818e8da41b62e6e801d88feaccfb45ad60ed5968
parent     f2a84170ede80e4b80f636e3700ef4d4d5dc7d33 (diff)
parent     a13812683f1118ee4deed88d8d9bc2c268358b2e (diff)
Merge branch 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup changes from Tejun Heo:
 "Mostly changes to get the v2 interface ready. The core features are
  mostly ready now and I think it's reasonable to expect to drop the
  devel mask in one or two devel cycles at least for a subset of
  controllers.

   - cgroup added a controller dependency mechanism so that block cgroup
     can depend on memory cgroup. This will be used to finally support
     IO provisioning on the writeback traffic, which is currently being
     implemented.

   - The v2 interface now uses a separate table so that the interface
     files for the new interface are explicitly declared in one place.
     Each controller will explicitly review and add the files for the
     new interface.

   - cpuset is getting ready for the hierarchical behavior which is in
     the similar style with other controllers so that an ancestor's
     configuration change doesn't change the descendants' configurations
     irreversibly and processes aren't silently migrated when a CPU or
     node goes down.

  All the changes are to the new interface and no behavior changed for
  the multiple hierarchies"

* 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (29 commits)
  cpuset: fix the WARN_ON() in update_nodemasks_hier()
  cgroup: initialize cgrp_dfl_root_inhibit_ss_mask from !->dfl_files test
  cgroup: make CFTYPE_ONLY_ON_DFL and CFTYPE_NO_ internal to cgroup core
  cgroup: distinguish the default and legacy hierarchies when handling cftypes
  cgroup: replace cgroup_add_cftypes() with cgroup_add_legacy_cftypes()
  cgroup: rename cgroup_subsys->base_cftypes to ->legacy_cftypes
  cgroup: split cgroup_base_files[] into cgroup_{dfl|legacy}_base_files[]
  cpuset: export effective masks to userspace
  cpuset: allow writing offlined masks to cpuset.cpus/mems
  cpuset: enable onlined cpu/node in effective masks
  cpuset: refactor cpuset_hotplug_update_tasks()
  cpuset: make cs->{cpus, mems}_allowed as user-configured masks
  cpuset: apply cs->effective_{cpus,mems}
  cpuset: initialize top_cpuset's configured masks at mount
  cpuset: use effective cpumask to build sched domains
  cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
  cpuset: update cs->effective_{cpus, mems} when config changes
  cpuset: update cpuset->effective_{cpus,mems} at hotplug
  cpuset: add cs->effective_cpus and cs->effective_mems
  cgroup: clean up sane_behavior handling
  ...
-rw-r--r--  Documentation/cgroups/cgroups.txt              14
-rw-r--r--  Documentation/cgroups/unified-hierarchy.txt    35
-rw-r--r--  block/blk-cgroup.c                             13
-rw-r--r--  block/blk-throttle.c                            6
-rw-r--r--  include/linux/cgroup.h                        165
-rw-r--r--  kernel/cgroup.c                               453
-rw-r--r--  kernel/cgroup_freezer.c                         2
-rw-r--r--  kernel/cpuset.c                               500
-rw-r--r--  kernel/sched/core.c                             2
-rw-r--r--  kernel/sched/cpuacct.c                          2
-rw-r--r--  mm/hugetlb_cgroup.c                             5
-rw-r--r--  mm/memcontrol.c                                37
-rw-r--r--  net/core/netclassid_cgroup.c                    2
-rw-r--r--  net/core/netprio_cgroup.c                       2
-rw-r--r--  net/ipv4/tcp_memcontrol.c                       2
-rw-r--r--  security/device_cgroup.c                        2
16 files changed, 806 insertions, 436 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 821de56d1580..10c949b293e4 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -599,6 +599,20 @@ fork. If this method returns 0 (success) then this should remain valid
 while the caller holds cgroup_mutex and it is ensured that either
 attach() or cancel_attach() will be called in future.
 
+void css_reset(struct cgroup_subsys_state *css)
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state. This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it. cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
 void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
 
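To make the new callback concrete, here is a hedged, kernel-style sketch, not
taken from this patch, of how a hypothetical "foo" controller with a single
per-css limit might wire up ->css_reset(). The struct, field, and helper names
are invented for illustration; only the ->css_reset() hook itself comes from
this series.

        /* Hypothetical controller -- names are illustrative only. */
        struct foo_css {
                struct cgroup_subsys_state css;
                u64 bps_limit;          /* 0 means "no limit", the initial state */
        };

        static struct foo_css *css_to_foo(struct cgroup_subsys_state *css)
        {
                return container_of(css, struct foo_css, css);
        }

        /* Called with cgroup_mutex held when cgroup core hides this css. */
        static void foo_css_reset(struct cgroup_subsys_state *css)
        {
                css_to_foo(css)->bps_limit = 0; /* back to the neutral state */
        }

        struct cgroup_subsys foo_cgrp_subsys = {
                /* ...alloc/free and other callbacks elided... */
                .css_reset      = foo_css_reset,
        };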
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 324b182e6000..4f4563277864 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -94,12 +94,35 @@ change soon.
 
  mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
 
-All controllers which are not bound to other hierarchies are
-automatically bound to unified hierarchy and show up at the root of
-it. Controllers which are enabled only in the root of unified
-hierarchy can be bound to other hierarchies at any time. This allows
-mixing unified hierarchy with the traditional multiple hierarchies in
-a fully backward compatible way.
+All controllers which support the unified hierarchy and are not bound
+to other hierarchies are automatically bound to unified hierarchy and
+show up at the root of it. Controllers which are enabled only in the
+root of unified hierarchy can be bound to other hierarchies. This
+allows mixing unified hierarchy with the traditional multiple
+hierarchies in a fully backward compatible way.
+
+For development purposes, the following boot parameter makes all
+controllers to appear on the unified hierarchy whether supported or
+not.
+
+ cgroup__DEVEL__legacy_files_on_dfl
+
+A controller can be moved across hierarchies only after the controller
+is no longer referenced in its current hierarchy. Because per-cgroup
+controller states are destroyed asynchronously and controllers may
+have lingering references, a controller may not show up immediately on
+the unified hierarchy after the final umount of the previous
+hierarchy. Similarly, a controller should be fully disabled to be
+moved out of the unified hierarchy and it may take some time for the
+disabled controller to become available for other hierarchies;
+furthermore, due to dependencies among controllers, other controllers
+may need to be disabled too.
+
+While useful for development and manual configurations, dynamically
+moving controllers between the unified and other hierarchies is
+strongly discouraged for production use. It is recommended to decide
+the hierarchies and controller associations before starting using the
+controllers.
 
 
 2-2. cgroup.subtree_control
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 28d227c5ca77..e17da947f6bd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -928,7 +928,15 @@ struct cgroup_subsys blkio_cgrp_subsys = {
928 .css_offline = blkcg_css_offline, 928 .css_offline = blkcg_css_offline,
929 .css_free = blkcg_css_free, 929 .css_free = blkcg_css_free,
930 .can_attach = blkcg_can_attach, 930 .can_attach = blkcg_can_attach,
931 .base_cftypes = blkcg_files, 931 .legacy_cftypes = blkcg_files,
932#ifdef CONFIG_MEMCG
933 /*
934 * This ensures that, if available, memcg is automatically enabled
935 * together on the default hierarchy so that the owner cgroup can
936 * be retrieved from writeback pages.
937 */
938 .depends_on = 1 << memory_cgrp_id,
939#endif
932}; 940};
933EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); 941EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
934 942
@@ -1120,7 +1128,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
1120 1128
1121 /* everything is in place, add intf files for the new policy */ 1129 /* everything is in place, add intf files for the new policy */
1122 if (pol->cftypes) 1130 if (pol->cftypes)
1123 WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); 1131 WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
1132 pol->cftypes));
1124 ret = 0; 1133 ret = 0;
1125out_unlock: 1134out_unlock:
1126 mutex_unlock(&blkcg_pol_mutex); 1135 mutex_unlock(&blkcg_pol_mutex);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3fdb21a390c1..9273d0969ebd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -412,13 +412,13 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
412 int rw; 412 int rw;
413 413
414 /* 414 /*
415 * If sane_hierarchy is enabled, we switch to properly hierarchical 415 * If on the default hierarchy, we switch to properly hierarchical
416 * behavior where limits on a given throtl_grp are applied to the 416 * behavior where limits on a given throtl_grp are applied to the
417 * whole subtree rather than just the group itself. e.g. If 16M 417 * whole subtree rather than just the group itself. e.g. If 16M
418 * read_bps limit is set on the root group, the whole system can't 418 * read_bps limit is set on the root group, the whole system can't
419 * exceed 16M for the device. 419 * exceed 16M for the device.
420 * 420 *
421 * If sane_hierarchy is not enabled, the broken flat hierarchy 421 * If not on the default hierarchy, the broken flat hierarchy
422 * behavior is retained where all throtl_grps are treated as if 422 * behavior is retained where all throtl_grps are treated as if
423 * they're all separate root groups right below throtl_data. 423 * they're all separate root groups right below throtl_data.
424 * Limits of a group don't interact with limits of other groups 424 * Limits of a group don't interact with limits of other groups
@@ -426,7 +426,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
426 */ 426 */
427 parent_sq = &td->service_queue; 427 parent_sq = &td->service_queue;
428 428
429 if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) 429 if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
430 parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 430 parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
431 431
432 throtl_service_queue_init(&tg->service_queue, parent_sq); 432 throtl_service_queue_init(&tg->service_queue, parent_sq);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8a111dd42d7a..b5223c570eba 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -203,7 +203,15 @@ struct cgroup {
203 struct kernfs_node *kn; /* cgroup kernfs entry */ 203 struct kernfs_node *kn; /* cgroup kernfs entry */
204 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ 204 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
205 205
206 /* the bitmask of subsystems enabled on the child cgroups */ 206 /*
207 * The bitmask of subsystems enabled on the child cgroups.
208 * ->subtree_control is the one configured through
209 * "cgroup.subtree_control" while ->child_subsys_mask is the
210 * effective one which may have more subsystems enabled.
211 * Controller knobs are made available iff it's enabled in
212 * ->subtree_control.
213 */
214 unsigned int subtree_control;
207 unsigned int child_subsys_mask; 215 unsigned int child_subsys_mask;
208 216
209 /* Private pointers for each registered subsystem */ 217 /* Private pointers for each registered subsystem */
@@ -248,73 +256,9 @@ struct cgroup {
248 256
249/* cgroup_root->flags */ 257/* cgroup_root->flags */
250enum { 258enum {
251 /* 259 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
252 * Unfortunately, cgroup core and various controllers are riddled
253 * with idiosyncrasies and pointless options. The following flag,
254 * when set, will force sane behavior - some options are forced on,
255 * others are disallowed, and some controllers will change their
256 * hierarchical or other behaviors.
257 *
258 * The set of behaviors affected by this flag are still being
259 * determined and developed and the mount option for this flag is
260 * prefixed with __DEVEL__. The prefix will be dropped once we
261 * reach the point where all behaviors are compatible with the
262 * planned unified hierarchy, which will automatically turn on this
263 * flag.
264 *
265 * The followings are the behaviors currently affected this flag.
266 *
267 * - Mount options "noprefix", "xattr", "clone_children",
268 * "release_agent" and "name" are disallowed.
269 *
270 * - When mounting an existing superblock, mount options should
271 * match.
272 *
273 * - Remount is disallowed.
274 *
275 * - rename(2) is disallowed.
276 *
277 * - "tasks" is removed. Everything should be at process
278 * granularity. Use "cgroup.procs" instead.
279 *
280 * - "cgroup.procs" is not sorted. pids will be unique unless they
281 * got recycled inbetween reads.
282 *
283 * - "release_agent" and "notify_on_release" are removed.
284 * Replacement notification mechanism will be implemented.
285 *
286 * - "cgroup.clone_children" is removed.
287 *
288 * - "cgroup.subtree_populated" is available. Its value is 0 if
289 * the cgroup and its descendants contain no task; otherwise, 1.
290 * The file also generates kernfs notification which can be
291 * monitored through poll and [di]notify when the value of the
292 * file changes.
293 *
294 * - If mount is requested with sane_behavior but without any
295 * subsystem, the default unified hierarchy is mounted.
296 *
297 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
298 * and take masks of ancestors with non-empty cpus/mems, instead of
299 * being moved to an ancestor.
300 *
301 * - cpuset: a task can be moved into an empty cpuset, and again it
302 * takes masks of ancestors.
303 *
304 * - memcg: use_hierarchy is on by default and the cgroup file for
305 * the flag is not created.
306 *
307 * - blkcg: blk-throttle becomes properly hierarchical.
308 *
309 * - debug: disallowed on the default hierarchy.
310 */
311 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
312
313 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 260 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
314 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 261 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
315
316 /* mount options live below bit 16 */
317 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
318}; 262};
319 263
320/* 264/*
@@ -440,9 +384,11 @@ struct css_set {
440enum { 384enum {
441 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 385 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
442 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 386 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
443 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
444 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 387 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
445 CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */ 388
389 /* internal flags, do not use outside cgroup core proper */
390 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
391 __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */
446}; 392};
447 393
448#define MAX_CFTYPE_NAME 64 394#define MAX_CFTYPE_NAME 64
@@ -526,20 +472,64 @@ struct cftype {
526extern struct cgroup_root cgrp_dfl_root; 472extern struct cgroup_root cgrp_dfl_root;
527extern struct css_set init_css_set; 473extern struct css_set init_css_set;
528 474
475/**
476 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
477 * @cgrp: the cgroup of interest
478 *
479 * The default hierarchy is the v2 interface of cgroup and this function
480 * can be used to test whether a cgroup is on the default hierarchy for
481 * cases where a subsystem should behave differnetly depending on the
482 * interface version.
483 *
484 * The set of behaviors which change on the default hierarchy are still
485 * being determined and the mount option is prefixed with __DEVEL__.
486 *
487 * List of changed behaviors:
488 *
489 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
490 * and "name" are disallowed.
491 *
492 * - When mounting an existing superblock, mount options should match.
493 *
494 * - Remount is disallowed.
495 *
496 * - rename(2) is disallowed.
497 *
498 * - "tasks" is removed. Everything should be at process granularity. Use
499 * "cgroup.procs" instead.
500 *
501 * - "cgroup.procs" is not sorted. pids will be unique unless they got
502 * recycled inbetween reads.
503 *
504 * - "release_agent" and "notify_on_release" are removed. Replacement
505 * notification mechanism will be implemented.
506 *
507 * - "cgroup.clone_children" is removed.
508 *
509 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
510 * and its descendants contain no task; otherwise, 1. The file also
511 * generates kernfs notification which can be monitored through poll and
512 * [di]notify when the value of the file changes.
513 *
514 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
515 * take masks of ancestors with non-empty cpus/mems, instead of being
516 * moved to an ancestor.
517 *
518 * - cpuset: a task can be moved into an empty cpuset, and again it takes
519 * masks of ancestors.
520 *
521 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
522 * is not created.
523 *
524 * - blkcg: blk-throttle becomes properly hierarchical.
525 *
526 * - debug: disallowed on the default hierarchy.
527 */
529static inline bool cgroup_on_dfl(const struct cgroup *cgrp) 528static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
530{ 529{
531 return cgrp->root == &cgrp_dfl_root; 530 return cgrp->root == &cgrp_dfl_root;
532} 531}
533 532
534/*
535 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
536 * function can be called as long as @cgrp is accessible.
537 */
538static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
539{
540 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
541}
542
543/* no synchronization, the result can only be used as a hint */ 533/* no synchronization, the result can only be used as a hint */
544static inline bool cgroup_has_tasks(struct cgroup *cgrp) 534static inline bool cgroup_has_tasks(struct cgroup *cgrp)
545{ 535{
@@ -602,7 +592,8 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
602 592
603char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); 593char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
604 594
605int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 595int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
596int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
606int cgroup_rm_cftypes(struct cftype *cfts); 597int cgroup_rm_cftypes(struct cftype *cfts);
607 598
608bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 599bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
@@ -634,6 +625,7 @@ struct cgroup_subsys {
634 int (*css_online)(struct cgroup_subsys_state *css); 625 int (*css_online)(struct cgroup_subsys_state *css);
635 void (*css_offline)(struct cgroup_subsys_state *css); 626 void (*css_offline)(struct cgroup_subsys_state *css);
636 void (*css_free)(struct cgroup_subsys_state *css); 627 void (*css_free)(struct cgroup_subsys_state *css);
628 void (*css_reset)(struct cgroup_subsys_state *css);
637 629
638 int (*can_attach)(struct cgroup_subsys_state *css, 630 int (*can_attach)(struct cgroup_subsys_state *css,
639 struct cgroup_taskset *tset); 631 struct cgroup_taskset *tset);
@@ -682,8 +674,21 @@ struct cgroup_subsys {
682 */ 674 */
683 struct list_head cfts; 675 struct list_head cfts;
684 676
685 /* base cftypes, automatically registered with subsys itself */ 677 /*
686 struct cftype *base_cftypes; 678 * Base cftypes which are automatically registered. The two can
679 * point to the same array.
680 */
681 struct cftype *dfl_cftypes; /* for the default hierarchy */
682 struct cftype *legacy_cftypes; /* for the legacy hierarchies */
683
684 /*
685 * A subsystem may depend on other subsystems. When such subsystem
686 * is enabled on a cgroup, the depended-upon subsystems are enabled
687 * together if available. Subsystems enabled due to dependency are
688 * not visible to userland until explicitly enabled. The following
689 * specifies the mask of subsystems that this one depends on.
690 */
691 unsigned int depends_on;
687}; 692};
688 693
689#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; 694#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
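As a rough sketch of how a controller might fill in the new fields (the "bar"
names below are invented; only the field names ->dfl_cftypes, ->legacy_cftypes
and ->depends_on come from the patch), separate cftype arrays can be provided
for the two interface flavours and a dependency declared on memcg, mirroring
what blkio does above:

        static struct cftype bar_dfl_files[] = {
                { .name = "bar.max" },          /* default hierarchy only; handlers elided */
                { }                             /* terminate */
        };

        static struct cftype bar_legacy_files[] = {
                { .name = "limit_in_bytes" },   /* legacy hierarchies only; handlers elided */
                { }                             /* terminate */
        };

        struct cgroup_subsys bar_cgrp_subsys = {
                .dfl_cftypes    = bar_dfl_files,
                .legacy_cftypes = bar_legacy_files,
        #ifdef CONFIG_MEMCG
                /* enable memcg together when bar is enabled on the default hierarchy */
                .depends_on     = 1 << memory_cgrp_id,
        #endif
        };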
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aad41f06901b..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
149 */ 149 */
150static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
151 151
152/*
153 * Set by the boot param of the same name and makes subsystems with NULL
154 * ->dfl_files to use ->legacy_files on the default hierarchy.
155 */
156static bool cgroup_legacy_files_on_dfl;
157
152/* some controllers are not supported in the default hierarchy */ 158/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 159static unsigned int cgrp_dfl_root_inhibit_ss_mask;
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158 160
159/* The list of hierarchy roots */ 161/* The list of hierarchy roots */
160 162
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
180 */ 182 */
181static int need_forkexit_callback __read_mostly; 183static int need_forkexit_callback __read_mostly;
182 184
183static struct cftype cgroup_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[];
184 187
185static void cgroup_put(struct cgroup *cgrp); 188static void cgroup_put(struct cgroup *cgrp);
186static int rebind_subsystems(struct cgroup_root *dst_root, 189static int rebind_subsystems(struct cgroup_root *dst_root,
187 unsigned int ss_mask); 190 unsigned int ss_mask);
188static int cgroup_destroy_locked(struct cgroup *cgrp); 191static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); 192static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
193 bool visible);
190static void css_release(struct percpu_ref *ref); 194static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css); 195static void kill_css(struct cgroup_subsys_state *css);
192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
1037} 1041}
1038 1042
1039/** 1043/**
1044 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1045 * @cgrp: the target cgroup
1046 *
1047 * On the default hierarchy, a subsystem may request other subsystems to be
1048 * enabled together through its ->depends_on mask. In such cases, more
1049 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1050 *
1051 * This function determines which subsystems need to be enabled given the
1052 * current @cgrp->subtree_control and records it in
1053 * @cgrp->child_subsys_mask. The resulting mask is always a superset of
1054 * @cgrp->subtree_control and follows the usual hierarchy rules.
1055 */
1056static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1057{
1058 struct cgroup *parent = cgroup_parent(cgrp);
1059 unsigned int cur_ss_mask = cgrp->subtree_control;
1060 struct cgroup_subsys *ss;
1061 int ssid;
1062
1063 lockdep_assert_held(&cgroup_mutex);
1064
1065 if (!cgroup_on_dfl(cgrp)) {
1066 cgrp->child_subsys_mask = cur_ss_mask;
1067 return;
1068 }
1069
1070 while (true) {
1071 unsigned int new_ss_mask = cur_ss_mask;
1072
1073 for_each_subsys(ss, ssid)
1074 if (cur_ss_mask & (1 << ssid))
1075 new_ss_mask |= ss->depends_on;
1076
1077 /*
1078 * Mask out subsystems which aren't available. This can
1079 * happen only if some depended-upon subsystems were bound
1080 * to non-default hierarchies.
1081 */
1082 if (parent)
1083 new_ss_mask &= parent->child_subsys_mask;
1084 else
1085 new_ss_mask &= cgrp->root->subsys_mask;
1086
1087 if (new_ss_mask == cur_ss_mask)
1088 break;
1089 cur_ss_mask = new_ss_mask;
1090 }
1091
1092 cgrp->child_subsys_mask = cur_ss_mask;
1093}
1094
1095/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods 1096 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced 1097 * @kn: the kernfs_node being serviced
1042 * 1098 *
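The loop in cgroup_refresh_child_subsys_mask() can be followed with a small
standalone sketch (userspace C with made-up subsystem bits, purely
illustrative): starting from what was written to "cgroup.subtree_control",
->depends_on of every enabled subsystem is OR'd in until the mask stops
growing, and unavailable subsystems are masked out along the way.

        #include <stdio.h>

        enum { SS_IO = 1 << 0, SS_MEMORY = 1 << 1, SS_PIDS = 1 << 2, NR_SS = 3 };

        /* illustrative dependency table: io pulls in memory, as blkio does above */
        static const unsigned int depends_on[NR_SS] = { SS_MEMORY, 0, 0 };

        static unsigned int refresh_child_subsys_mask(unsigned int subtree_control,
                                                      unsigned int available)
        {
                unsigned int cur = subtree_control;

                for (;;) {
                        unsigned int next = cur;

                        for (int ssid = 0; ssid < NR_SS; ssid++)
                                if (cur & (1 << ssid))
                                        next |= depends_on[ssid];
                        next &= available;      /* drop subsystems bound elsewhere */
                        if (next == cur)
                                return cur;     /* fixed point reached */
                        cur = next;
                }
        }

        int main(void)
        {
                /* user enabled only io; memory and pids are available here */
                printf("child_subsys_mask = %#x\n",
                       refresh_child_subsys_mask(SS_IO, SS_IO | SS_MEMORY | SS_PIDS));
                return 0;
        }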
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1208 up_write(&css_set_rwsem); 1264 up_write(&css_set_rwsem);
1209 1265
1210 src_root->subsys_mask &= ~(1 << ssid); 1266 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1267 src_root->cgrp.subtree_control &= ~(1 << ssid);
1268 cgroup_refresh_child_subsys_mask(&src_root->cgrp);
1212 1269
1213 /* default hierarchy doesn't enable controllers by default */ 1270 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid; 1271 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root) 1272 if (dst_root != &cgrp_dfl_root) {
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1273 dst_root->cgrp.subtree_control |= 1 << ssid;
1274 cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
1275 }
1217 1276
1218 if (ss->bind) 1277 if (ss->bind)
1219 ss->bind(css); 1278 ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
1233 for_each_subsys(ss, ssid) 1292 for_each_subsys(ss, ssid)
1234 if (root->subsys_mask & (1 << ssid)) 1293 if (root->subsys_mask & (1 << ssid))
1235 seq_printf(seq, ",%s", ss->name); 1294 seq_printf(seq, ",%s", ss->name);
1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1237 seq_puts(seq, ",sane_behavior");
1238 if (root->flags & CGRP_ROOT_NOPREFIX) 1295 if (root->flags & CGRP_ROOT_NOPREFIX)
1239 seq_puts(seq, ",noprefix"); 1296 seq_puts(seq, ",noprefix");
1240 if (root->flags & CGRP_ROOT_XATTR) 1297 if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1268 bool all_ss = false, one_ss = false; 1325 bool all_ss = false, one_ss = false;
1269 unsigned int mask = -1U; 1326 unsigned int mask = -1U;
1270 struct cgroup_subsys *ss; 1327 struct cgroup_subsys *ss;
1328 int nr_opts = 0;
1271 int i; 1329 int i;
1272 1330
1273#ifdef CONFIG_CPUSETS 1331#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1277 memset(opts, 0, sizeof(*opts)); 1335 memset(opts, 0, sizeof(*opts));
1278 1336
1279 while ((token = strsep(&o, ",")) != NULL) { 1337 while ((token = strsep(&o, ",")) != NULL) {
1338 nr_opts++;
1339
1280 if (!*token) 1340 if (!*token)
1281 return -EINVAL; 1341 return -EINVAL;
1282 if (!strcmp(token, "none")) { 1342 if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1361 return -ENOENT; 1421 return -ENOENT;
1362 } 1422 }
1363 1423
1364 /* Consistency checks */
1365
1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1424 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1425 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1368 1426 if (nr_opts != 1) {
1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1427 pr_err("sane_behavior: no other mount options allowed\n");
1370 opts->cpuset_clone_children || opts->release_agent ||
1371 opts->name) {
1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1373 return -EINVAL; 1428 return -EINVAL;
1374 } 1429 }
1375 } else { 1430 return 0;
1376 /*
1377 * If the 'all' option was specified select all the
1378 * subsystems, otherwise if 'none', 'name=' and a subsystem
1379 * name options were not specified, let's default to 'all'
1380 */
1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1382 for_each_subsys(ss, i)
1383 if (!ss->disabled)
1384 opts->subsys_mask |= (1 << i);
1385
1386 /*
1387 * We either have to specify by name or by subsystems. (So
1388 * all empty hierarchies must have a name).
1389 */
1390 if (!opts->subsys_mask && !opts->name)
1391 return -EINVAL;
1392 } 1431 }
1393 1432
1394 /* 1433 /*
1434 * If the 'all' option was specified select all the subsystems,
1435 * otherwise if 'none', 'name=' and a subsystem name options were
1436 * not specified, let's default to 'all'
1437 */
1438 if (all_ss || (!one_ss && !opts->none && !opts->name))
1439 for_each_subsys(ss, i)
1440 if (!ss->disabled)
1441 opts->subsys_mask |= (1 << i);
1442
1443 /*
1444 * We either have to specify by name or by subsystems. (So all
1445 * empty hierarchies must have a name).
1446 */
1447 if (!opts->subsys_mask && !opts->name)
1448 return -EINVAL;
1449
1450 /*
1395 * Option noprefix was introduced just for backward compatibility 1451 * Option noprefix was introduced just for backward compatibility
1396 * with the old cpuset, so we allow noprefix only if mounting just 1452 * with the old cpuset, so we allow noprefix only if mounting just
1397 * the cpuset subsystem. 1453 * the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1399 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1455 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1400 return -EINVAL; 1456 return -EINVAL;
1401 1457
1402
1403 /* Can't specify "none" and some subsystems */ 1458 /* Can't specify "none" and some subsystems */
1404 if (opts->subsys_mask && opts->none) 1459 if (opts->subsys_mask && opts->none)
1405 return -EINVAL; 1460 return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1414 struct cgroup_sb_opts opts; 1469 struct cgroup_sb_opts opts;
1415 unsigned int added_mask, removed_mask; 1470 unsigned int added_mask, removed_mask;
1416 1471
1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1472 if (root == &cgrp_dfl_root) {
1418 pr_err("sane_behavior: remount is not allowed\n"); 1473 pr_err("remount is not allowed\n");
1419 return -EINVAL; 1474 return -EINVAL;
1420 } 1475 }
1421 1476
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1434 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1489 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1435 1490
1436 /* Don't allow flags or name to change at remount */ 1491 /* Don't allow flags or name to change at remount */
1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1492 if ((opts.flags ^ root->flags) ||
1438 (opts.name && strcmp(opts.name, root->name))) { 1493 (opts.name && strcmp(opts.name, root->name))) {
1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", 1494 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1495 opts.flags, opts.name ?: "", root->flags, root->name);
1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1442 ret = -EINVAL; 1496 ret = -EINVAL;
1443 goto out_unlock; 1497 goto out_unlock;
1444 } 1498 }
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1563{ 1617{
1564 LIST_HEAD(tmp_links); 1618 LIST_HEAD(tmp_links);
1565 struct cgroup *root_cgrp = &root->cgrp; 1619 struct cgroup *root_cgrp = &root->cgrp;
1620 struct cftype *base_files;
1566 struct css_set *cset; 1621 struct css_set *cset;
1567 int i, ret; 1622 int i, ret;
1568 1623
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1600 } 1655 }
1601 root_cgrp->kn = root->kf_root->kn; 1656 root_cgrp->kn = root->kf_root->kn;
1602 1657
1603 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1658 if (root == &cgrp_dfl_root)
1659 base_files = cgroup_dfl_base_files;
1660 else
1661 base_files = cgroup_legacy_base_files;
1662
1663 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1604 if (ret) 1664 if (ret)
1605 goto destroy_root; 1665 goto destroy_root;
1606 1666
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1672 goto out_unlock; 1732 goto out_unlock;
1673 1733
1674 /* look for a matching existing root */ 1734 /* look for a matching existing root */
1675 if (!opts.subsys_mask && !opts.none && !opts.name) { 1735 if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
1676 cgrp_dfl_root_visible = true; 1736 cgrp_dfl_root_visible = true;
1677 root = &cgrp_dfl_root; 1737 root = &cgrp_dfl_root;
1678 cgroup_get(&root->cgrp); 1738 cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1730 goto out_unlock; 1790 goto out_unlock;
1731 } 1791 }
1732 1792
1733 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1793 if (root->flags ^ opts.flags)
1734 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1794 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1735 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1736 ret = -EINVAL;
1737 goto out_unlock;
1738 } else {
1739 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1740 }
1741 }
1742 1795
1743 /* 1796 /*
1744 * We want to reuse @root whose lifetime is governed by its 1797 * We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2457 2510
2458static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2511static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2459{ 2512{
2460 struct cgroup *cgrp = seq_css(seq)->cgroup; 2513 seq_puts(seq, "0\n");
2461
2462 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2463 return 0; 2514 return 0;
2464} 2515}
2465 2516
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
2496{ 2547{
2497 struct cgroup *cgrp = seq_css(seq)->cgroup; 2548 struct cgroup *cgrp = seq_css(seq)->cgroup;
2498 2549
2499 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); 2550 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
2500 return 0; 2551 return 0;
2501} 2552}
2502 2553
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2505{ 2556{
2506 struct cgroup *cgrp = seq_css(seq)->cgroup; 2557 struct cgroup *cgrp = seq_css(seq)->cgroup;
2507 2558
2508 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); 2559 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2509 return 0; 2560 return 0;
2510} 2561}
2511 2562
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2611 loff_t off) 2662 loff_t off)
2612{ 2663{
2613 unsigned int enable = 0, disable = 0; 2664 unsigned int enable = 0, disable = 0;
2665 unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
2614 struct cgroup *cgrp, *child; 2666 struct cgroup *cgrp, *child;
2615 struct cgroup_subsys *ss; 2667 struct cgroup_subsys *ss;
2616 char *tok; 2668 char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2650 2702
2651 for_each_subsys(ss, ssid) { 2703 for_each_subsys(ss, ssid) {
2652 if (enable & (1 << ssid)) { 2704 if (enable & (1 << ssid)) {
2653 if (cgrp->child_subsys_mask & (1 << ssid)) { 2705 if (cgrp->subtree_control & (1 << ssid)) {
2654 enable &= ~(1 << ssid); 2706 enable &= ~(1 << ssid);
2655 continue; 2707 continue;
2656 } 2708 }
2657 2709
2710 /* unavailable or not enabled on the parent? */
2711 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2712 (cgroup_parent(cgrp) &&
2713 !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
2714 ret = -ENOENT;
2715 goto out_unlock;
2716 }
2717
2718 /*
2719 * @ss is already enabled through dependency and
2720 * we'll just make it visible. Skip draining.
2721 */
2722 if (cgrp->child_subsys_mask & (1 << ssid))
2723 continue;
2724
2658 /* 2725 /*
2659 * Because css offlining is asynchronous, userland 2726 * Because css offlining is asynchronous, userland
2660 * might try to re-enable the same controller while 2727 * might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2677 2744
2678 return restart_syscall(); 2745 return restart_syscall();
2679 } 2746 }
2680
2681 /* unavailable or not enabled on the parent? */
2682 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2683 (cgroup_parent(cgrp) &&
2684 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2685 ret = -ENOENT;
2686 goto out_unlock;
2687 }
2688 } else if (disable & (1 << ssid)) { 2747 } else if (disable & (1 << ssid)) {
2689 if (!(cgrp->child_subsys_mask & (1 << ssid))) { 2748 if (!(cgrp->subtree_control & (1 << ssid))) {
2690 disable &= ~(1 << ssid); 2749 disable &= ~(1 << ssid);
2691 continue; 2750 continue;
2692 } 2751 }
2693 2752
2694 /* a child has it enabled? */ 2753 /* a child has it enabled? */
2695 cgroup_for_each_live_child(child, cgrp) { 2754 cgroup_for_each_live_child(child, cgrp) {
2696 if (child->child_subsys_mask & (1 << ssid)) { 2755 if (child->subtree_control & (1 << ssid)) {
2697 ret = -EBUSY; 2756 ret = -EBUSY;
2698 goto out_unlock; 2757 goto out_unlock;
2699 } 2758 }
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2707 } 2766 }
2708 2767
2709 /* 2768 /*
2710 * Except for the root, child_subsys_mask must be zero for a cgroup 2769 * Except for the root, subtree_control must be zero for a cgroup
2711 * with tasks so that child cgroups don't compete against tasks. 2770 * with tasks so that child cgroups don't compete against tasks.
2712 */ 2771 */
2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { 2772 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2716 } 2775 }
2717 2776
2718 /* 2777 /*
2719 * Create csses for enables and update child_subsys_mask. This 2778 * Update subsys masks and calculate what needs to be done. More
2720 * changes cgroup_e_css() results which in turn makes the 2779 * subsystems than specified may need to be enabled or disabled
2721 * subsequent cgroup_update_dfl_csses() associate all tasks in the 2780 * depending on subsystem dependencies.
2722 * subtree to the updated csses. 2781 */
2782 cgrp->subtree_control |= enable;
2783 cgrp->subtree_control &= ~disable;
2784
2785 old_ctrl = cgrp->child_subsys_mask;
2786 cgroup_refresh_child_subsys_mask(cgrp);
2787 new_ctrl = cgrp->child_subsys_mask;
2788
2789 css_enable = ~old_ctrl & new_ctrl;
2790 css_disable = old_ctrl & ~new_ctrl;
2791 enable |= css_enable;
2792 disable |= css_disable;
2793
2794 /*
2795 * Create new csses or make the existing ones visible. A css is
2796 * created invisible if it's being implicitly enabled through
2797 * dependency. An invisible css is made visible when the userland
2798 * explicitly enables it.
2723 */ 2799 */
2724 for_each_subsys(ss, ssid) { 2800 for_each_subsys(ss, ssid) {
2725 if (!(enable & (1 << ssid))) 2801 if (!(enable & (1 << ssid)))
2726 continue; 2802 continue;
2727 2803
2728 cgroup_for_each_live_child(child, cgrp) { 2804 cgroup_for_each_live_child(child, cgrp) {
2729 ret = create_css(child, ss); 2805 if (css_enable & (1 << ssid))
2806 ret = create_css(child, ss,
2807 cgrp->subtree_control & (1 << ssid));
2808 else
2809 ret = cgroup_populate_dir(child, 1 << ssid);
2730 if (ret) 2810 if (ret)
2731 goto err_undo_css; 2811 goto err_undo_css;
2732 } 2812 }
2733 } 2813 }
2734 2814
2735 cgrp->child_subsys_mask |= enable; 2815 /*
2736 cgrp->child_subsys_mask &= ~disable; 2816 * At this point, cgroup_e_css() results reflect the new csses
2737 2817 * making the following cgroup_update_dfl_csses() properly update
2818 * css associations of all tasks in the subtree.
2819 */
2738 ret = cgroup_update_dfl_csses(cgrp); 2820 ret = cgroup_update_dfl_csses(cgrp);
2739 if (ret) 2821 if (ret)
2740 goto err_undo_css; 2822 goto err_undo_css;
2741 2823
2742 /* all tasks are now migrated away from the old csses, kill them */ 2824 /*
2825 * All tasks are migrated out of disabled csses. Kill or hide
2826 * them. A css is hidden when the userland requests it to be
2827 * disabled while other subsystems are still depending on it. The
2828 * css must not actively control resources and be in the vanilla
2829 * state if it's made visible again later. Controllers which may
2830 * be depended upon should provide ->css_reset() for this purpose.
2831 */
2743 for_each_subsys(ss, ssid) { 2832 for_each_subsys(ss, ssid) {
2744 if (!(disable & (1 << ssid))) 2833 if (!(disable & (1 << ssid)))
2745 continue; 2834 continue;
2746 2835
2747 cgroup_for_each_live_child(child, cgrp) 2836 cgroup_for_each_live_child(child, cgrp) {
2748 kill_css(cgroup_css(child, ss)); 2837 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2838
2839 if (css_disable & (1 << ssid)) {
2840 kill_css(css);
2841 } else {
2842 cgroup_clear_dir(child, 1 << ssid);
2843 if (ss->css_reset)
2844 ss->css_reset(css);
2845 }
2846 }
2749 } 2847 }
2750 2848
2751 kernfs_activate(cgrp->kn); 2849 kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
2755 return ret ?: nbytes; 2853 return ret ?: nbytes;
2756 2854
2757err_undo_css: 2855err_undo_css:
2758 cgrp->child_subsys_mask &= ~enable; 2856 cgrp->subtree_control &= ~enable;
2759 cgrp->child_subsys_mask |= disable; 2857 cgrp->subtree_control |= disable;
2858 cgroup_refresh_child_subsys_mask(cgrp);
2760 2859
2761 for_each_subsys(ss, ssid) { 2860 for_each_subsys(ss, ssid) {
2762 if (!(enable & (1 << ssid))) 2861 if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
2764 2863
2765 cgroup_for_each_live_child(child, cgrp) { 2864 cgroup_for_each_live_child(child, cgrp) {
2766 struct cgroup_subsys_state *css = cgroup_css(child, ss); 2865 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2767 if (css) 2866
2867 if (!css)
2868 continue;
2869
2870 if (css_enable & (1 << ssid))
2768 kill_css(css); 2871 kill_css(css);
2872 else
2873 cgroup_clear_dir(child, 1 << ssid);
2769 } 2874 }
2770 } 2875 }
2771 goto out_unlock; 2876 goto out_unlock;
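The css_enable/css_disable bookkeeping in cgroup_subtree_control_write() above
lends itself to a short worked example; the bit values below are assumed for
illustration, not taken from the kernel headers.

        #include <stdio.h>

        int main(void)
        {
                enum { SS_IO = 1 << 0, SS_MEMORY = 1 << 1 };    /* assumed bit positions */

                /* nothing was enabled; userland writes "+io" and io depends on memory */
                unsigned int old_ctrl = 0;                      /* previous child_subsys_mask */
                unsigned int new_ctrl = SS_IO | SS_MEMORY;      /* after the dependency refresh */

                unsigned int css_enable  = ~old_ctrl & new_ctrl;        /* csses to create */
                unsigned int css_disable = old_ctrl & ~new_ctrl;        /* csses to kill or hide */

                /* memory is in css_enable but not in subtree_control, so its css is
                 * created invisible: no interface files until it is enabled explicitly. */
                printf("create %#x, remove %#x\n", css_enable, css_disable);
                return 0;
        }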
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2878 2983
2879 /* 2984 /*
2880 * This isn't a proper migration and its usefulness is very 2985 * This isn't a proper migration and its usefulness is very
2881 * limited. Disallow if sane_behavior. 2986 * limited. Disallow on the default hierarchy.
2882 */ 2987 */
2883 if (cgroup_sane_behavior(cgrp)) 2988 if (cgroup_on_dfl(cgrp))
2884 return -EPERM; 2989 return -EPERM;
2885 2990
2886 /* 2991 /*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2964 3069
2965 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3070 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2966 /* does cft->flags tell us to skip this file on @cgrp? */ 3071 /* does cft->flags tell us to skip this file on @cgrp? */
2967 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3072 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2968 continue; 3073 continue;
2969 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 3074 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
2970 continue; 3075 continue;
2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) 3076 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2972 continue; 3077 continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
3024 kfree(cft->kf_ops); 3129 kfree(cft->kf_ops);
3025 cft->kf_ops = NULL; 3130 cft->kf_ops = NULL;
3026 cft->ss = NULL; 3131 cft->ss = NULL;
3132
3133 /* revert flags set by cgroup core while adding @cfts */
3134 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3027 } 3135 }
3028} 3136}
3029 3137
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
3109 * function currently returns 0 as long as @cfts registration is successful 3217 * function currently returns 0 as long as @cfts registration is successful
3110 * even if some file creation attempts on existing cgroups fail. 3218 * even if some file creation attempts on existing cgroups fail.
3111 */ 3219 */
3112int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3220static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3113{ 3221{
3114 int ret; 3222 int ret;
3115 3223
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3135} 3243}
3136 3244
3137/** 3245/**
3246 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3247 * @ss: target cgroup subsystem
3248 * @cfts: zero-length name terminated array of cftypes
3249 *
3250 * Similar to cgroup_add_cftypes() but the added files are only used for
3251 * the default hierarchy.
3252 */
3253int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3254{
3255 struct cftype *cft;
3256
3257 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3258 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3259 return cgroup_add_cftypes(ss, cfts);
3260}
3261
3262/**
3263 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3264 * @ss: target cgroup subsystem
3265 * @cfts: zero-length name terminated array of cftypes
3266 *
3267 * Similar to cgroup_add_cftypes() but the added files are only used for
3268 * the legacy hierarchies.
3269 */
3270int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3271{
3272 struct cftype *cft;
3273
3274 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3275 cft->flags |= __CFTYPE_NOT_ON_DFL;
3276 return cgroup_add_cftypes(ss, cfts);
3277}
3278
3279/**
3138 * cgroup_task_count - count the number of tasks in a cgroup. 3280 * cgroup_task_count - count the number of tasks in a cgroup.
3139 * @cgrp: the cgroup in question 3281 * @cgrp: the cgroup in question
3140 * 3282 *
@@ -3699,8 +3841,9 @@ after:
3699 * 3841 *
3700 * All this extra complexity was caused by the original implementation 3842 * All this extra complexity was caused by the original implementation
3701 * committing to an entirely unnecessary property. In the long term, we 3843 * committing to an entirely unnecessary property. In the long term, we
3702 * want to do away with it. Explicitly scramble sort order if 3844 * want to do away with it. Explicitly scramble sort order if on the
3703 * sane_behavior so that no such expectation exists in the new interface. 3845 * default hierarchy so that no such expectation exists in the new
3846 * interface.
3704 * 3847 *
3705 * Scrambling is done by swapping every two consecutive bits, which is 3848 * Scrambling is done by swapping every two consecutive bits, which is
3706 * non-identity one-to-one mapping which disturbs sort order sufficiently. 3849 * non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)
3715 3858
3716static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3859static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3717{ 3860{
3718 if (cgroup_sane_behavior(cgrp)) 3861 if (cgroup_on_dfl(cgrp))
3719 return pid_fry(pid); 3862 return pid_fry(pid);
3720 else 3863 else
3721 return pid; 3864 return pid;
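The scrambling the comment above describes, swapping every two consecutive
bits, can be sketched as follows; this is an assumed implementation written
for illustration, not the pid_fry() body from the patch.

        #include <stdio.h>
        #include <sys/types.h>

        /* Swap each even/odd bit pair: a one-to-one mapping, so pid uniqueness is
         * preserved while numeric sort order is thoroughly disturbed. */
        static pid_t pid_fry(pid_t pid)
        {
                unsigned int even_bits = pid & 0x55555555;      /* bits 0, 2, 4, ... */
                unsigned int odd_bits  = pid & 0xaaaaaaaa;      /* bits 1, 3, 5, ... */

                return (even_bits << 1) | (odd_bits >> 1);
        }

        int main(void)
        {
                for (pid_t pid = 1; pid <= 8; pid++)
                        printf("%ld -> %ld\n", (long)pid, (long)pid_fry(pid));
                return 0;
        }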
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3818 css_task_iter_end(&it); 3961 css_task_iter_end(&it);
3819 length = n; 3962 length = n;
3820 /* now sort & (if procs) strip out duplicates */ 3963 /* now sort & (if procs) strip out duplicates */
3821 if (cgroup_sane_behavior(cgrp)) 3964 if (cgroup_on_dfl(cgrp))
3822 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3965 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3823 else 3966 else
3824 sort(array, length, sizeof(pid_t), cmppid, NULL); 3967 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4040 return 0; 4183 return 0;
4041} 4184}
4042 4185
4043static struct cftype cgroup_base_files[] = { 4186/* cgroup core interface files for the default hierarchy */
4187static struct cftype cgroup_dfl_base_files[] = {
4044 { 4188 {
4045 .name = "cgroup.procs", 4189 .name = "cgroup.procs",
4046 .seq_start = cgroup_pidlist_start, 4190 .seq_start = cgroup_pidlist_start,
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {
4052 .mode = S_IRUGO | S_IWUSR, 4196 .mode = S_IRUGO | S_IWUSR,
4053 }, 4197 },
4054 { 4198 {
4055 .name = "cgroup.clone_children",
4056 .flags = CFTYPE_INSANE,
4057 .read_u64 = cgroup_clone_children_read,
4058 .write_u64 = cgroup_clone_children_write,
4059 },
4060 {
4061 .name = "cgroup.sane_behavior",
4062 .flags = CFTYPE_ONLY_ON_ROOT,
4063 .seq_show = cgroup_sane_behavior_show,
4064 },
4065 {
4066 .name = "cgroup.controllers", 4199 .name = "cgroup.controllers",
4067 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, 4200 .flags = CFTYPE_ONLY_ON_ROOT,
4068 .seq_show = cgroup_root_controllers_show, 4201 .seq_show = cgroup_root_controllers_show,
4069 }, 4202 },
4070 { 4203 {
4071 .name = "cgroup.controllers", 4204 .name = "cgroup.controllers",
4072 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4205 .flags = CFTYPE_NOT_ON_ROOT,
4073 .seq_show = cgroup_controllers_show, 4206 .seq_show = cgroup_controllers_show,
4074 }, 4207 },
4075 { 4208 {
4076 .name = "cgroup.subtree_control", 4209 .name = "cgroup.subtree_control",
4077 .flags = CFTYPE_ONLY_ON_DFL,
4078 .seq_show = cgroup_subtree_control_show, 4210 .seq_show = cgroup_subtree_control_show,
4079 .write = cgroup_subtree_control_write, 4211 .write = cgroup_subtree_control_write,
4080 }, 4212 },
4081 { 4213 {
4082 .name = "cgroup.populated", 4214 .name = "cgroup.populated",
4083 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4215 .flags = CFTYPE_NOT_ON_ROOT,
4084 .seq_show = cgroup_populated_show, 4216 .seq_show = cgroup_populated_show,
4085 }, 4217 },
4218 { } /* terminate */
4219};
4086 4220
4087 /* 4221/* cgroup core interface files for the legacy hierarchies */
4088 * Historical crazy stuff. These don't have "cgroup." prefix and 4222static struct cftype cgroup_legacy_base_files[] = {
4089 * don't exist if sane_behavior. If you're depending on these, be 4223 {
4090 * prepared to be burned. 4224 .name = "cgroup.procs",
4091 */ 4225 .seq_start = cgroup_pidlist_start,
4226 .seq_next = cgroup_pidlist_next,
4227 .seq_stop = cgroup_pidlist_stop,
4228 .seq_show = cgroup_pidlist_show,
4229 .private = CGROUP_FILE_PROCS,
4230 .write = cgroup_procs_write,
4231 .mode = S_IRUGO | S_IWUSR,
4232 },
4233 {
4234 .name = "cgroup.clone_children",
4235 .read_u64 = cgroup_clone_children_read,
4236 .write_u64 = cgroup_clone_children_write,
4237 },
4238 {
4239 .name = "cgroup.sane_behavior",
4240 .flags = CFTYPE_ONLY_ON_ROOT,
4241 .seq_show = cgroup_sane_behavior_show,
4242 },
4092 { 4243 {
4093 .name = "tasks", 4244 .name = "tasks",
4094 .flags = CFTYPE_INSANE, /* use "procs" instead */
4095 .seq_start = cgroup_pidlist_start, 4245 .seq_start = cgroup_pidlist_start,
4096 .seq_next = cgroup_pidlist_next, 4246 .seq_next = cgroup_pidlist_next,
4097 .seq_stop = cgroup_pidlist_stop, 4247 .seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {
4102 }, 4252 },
4103 { 4253 {
4104 .name = "notify_on_release", 4254 .name = "notify_on_release",
4105 .flags = CFTYPE_INSANE,
4106 .read_u64 = cgroup_read_notify_on_release, 4255 .read_u64 = cgroup_read_notify_on_release,
4107 .write_u64 = cgroup_write_notify_on_release, 4256 .write_u64 = cgroup_write_notify_on_release,
4108 }, 4257 },
4109 { 4258 {
4110 .name = "release_agent", 4259 .name = "release_agent",
4111 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4260 .flags = CFTYPE_ONLY_ON_ROOT,
4112 .seq_show = cgroup_release_agent_show, 4261 .seq_show = cgroup_release_agent_show,
4113 .write = cgroup_release_agent_write, 4262 .write = cgroup_release_agent_write,
4114 .max_write_len = PATH_MAX - 1, 4263 .max_write_len = PATH_MAX - 1,
@@ -4316,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)
4316 * create_css - create a cgroup_subsys_state 4465 * create_css - create a cgroup_subsys_state
4317 * @cgrp: the cgroup new css will be associated with 4466 * @cgrp: the cgroup new css will be associated with
4318 * @ss: the subsys of new css 4467 * @ss: the subsys of new css
4468 * @visible: whether to create control knobs for the new css or not
4319 * 4469 *
4320 * Create a new css associated with @cgrp - @ss pair. On success, the new 4470 * Create a new css associated with @cgrp - @ss pair. On success, the new
4321 * css is online and installed in @cgrp with all interface files created. 4471 * css is online and installed in @cgrp with all interface files created if
4322 * Returns 0 on success, -errno on failure. 4472 * @visible. Returns 0 on success, -errno on failure.
4323 */ 4473 */
4324static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4474static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4475 bool visible)
4325{ 4476{
4326 struct cgroup *parent = cgroup_parent(cgrp); 4477 struct cgroup *parent = cgroup_parent(cgrp);
4327 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); 4478 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4345,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4345 goto err_free_percpu_ref; 4496 goto err_free_percpu_ref;
4346 css->id = err; 4497 css->id = err;
4347 4498
4348 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4499 if (visible) {
4349 if (err) 4500 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4350 goto err_free_id; 4501 if (err)
4502 goto err_free_id;
4503 }
4351 4504
4352 /* @css is ready to be brought online now, make it visible */ 4505 /* @css is ready to be brought online now, make it visible */
4353 list_add_tail_rcu(&css->sibling, &parent_css->children); 4506 list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4387,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4387 struct cgroup_root *root; 4540 struct cgroup_root *root;
4388 struct cgroup_subsys *ss; 4541 struct cgroup_subsys *ss;
4389 struct kernfs_node *kn; 4542 struct kernfs_node *kn;
4543 struct cftype *base_files;
4390 int ssid, ret; 4544 int ssid, ret;
4391 4545
4392 parent = cgroup_kn_lock_live(parent_kn); 4546 parent = cgroup_kn_lock_live(parent_kn);
@@ -4457,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4457 if (ret) 4611 if (ret)
4458 goto out_destroy; 4612 goto out_destroy;
4459 4613
4460 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4614 if (cgroup_on_dfl(cgrp))
4615 base_files = cgroup_dfl_base_files;
4616 else
4617 base_files = cgroup_legacy_base_files;
4618
4619 ret = cgroup_addrm_files(cgrp, base_files, true);
4461 if (ret) 4620 if (ret)
4462 goto out_destroy; 4621 goto out_destroy;
4463 4622
4464 /* let's create and online css's */ 4623 /* let's create and online css's */
4465 for_each_subsys(ss, ssid) { 4624 for_each_subsys(ss, ssid) {
4466 if (parent->child_subsys_mask & (1 << ssid)) { 4625 if (parent->child_subsys_mask & (1 << ssid)) {
4467 ret = create_css(cgrp, ss); 4626 ret = create_css(cgrp, ss,
4627 parent->subtree_control & (1 << ssid));
4468 if (ret) 4628 if (ret)
4469 goto out_destroy; 4629 goto out_destroy;
4470 } 4630 }
@@ -4472,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4472 4632
4473 /* 4633 /*
4474 * On the default hierarchy, a child doesn't automatically inherit 4634 * On the default hierarchy, a child doesn't automatically inherit
4475 * child_subsys_mask from the parent. Each is configured manually. 4635 * subtree_control from the parent. Each is configured manually.
4476 */ 4636 */
4477 if (!cgroup_on_dfl(cgrp)) 4637 if (!cgroup_on_dfl(cgrp)) {
4478 cgrp->child_subsys_mask = parent->child_subsys_mask; 4638 cgrp->subtree_control = parent->subtree_control;
4639 cgroup_refresh_child_subsys_mask(cgrp);
4640 }
4479 4641
4480 kernfs_activate(kn); 4642 kernfs_activate(kn);
4481 4643
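The mkdir path above now passes parent->subtree_control (what the user explicitly enabled) rather than child_subsys_mask (which also contains controllers pulled in as dependencies) as the @visible argument, so a dependency-only css is created without its interface files. As a rough illustration of how the two masks relate, here is a minimal userspace sketch; the depends_on[] table, the bit layout and expand_subtree_control() are invented for the example and are not the kernel API.

#include <stdio.h>

#define SS_CPUSET (1u << 0)
#define SS_MEMORY (1u << 1)
#define SS_IO     (1u << 2)
#define NR_SS     3

/* hypothetical dependency table: "io" pulls in "memory" */
static const unsigned int depends_on[NR_SS] = {
	[0] = 0,		/* cpuset */
	[1] = 0,		/* memory */
	[2] = SS_MEMORY,	/* io     */
};

/* expand the user-visible mask until no new dependencies appear */
static unsigned int expand_subtree_control(unsigned int subtree_control)
{
	unsigned int cur = subtree_control, next;

	for (;;) {
		next = cur;
		for (int i = 0; i < NR_SS; i++)
			if (cur & (1u << i))
				next |= depends_on[i];
		if (next == cur)
			return cur;
		cur = next;
	}
}

int main(void)
{
	unsigned int visible = SS_IO;	/* user enabled io only */
	unsigned int full = expand_subtree_control(visible);

	/* "memory" ends up with a css but no knobs: set in full, not in visible */
	printf("visible=%#x full=%#x hidden=%#x\n",
	       visible, full, full & ~visible);
	return 0;
}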
@@ -4738,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4738 */ 4900 */
4739int __init cgroup_init_early(void) 4901int __init cgroup_init_early(void)
4740{ 4902{
4741 static struct cgroup_sb_opts __initdata opts = 4903 static struct cgroup_sb_opts __initdata opts;
4742 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4743 struct cgroup_subsys *ss; 4904 struct cgroup_subsys *ss;
4744 int i; 4905 int i;
4745 4906
@@ -4777,7 +4938,8 @@ int __init cgroup_init(void)
4777 unsigned long key; 4938 unsigned long key;
4778 int ssid, err; 4939 int ssid, err;
4779 4940
4780 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4941 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
4942 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
4781 4943
4782 mutex_lock(&cgroup_mutex); 4944 mutex_lock(&cgroup_mutex);
4783 4945
@@ -4809,9 +4971,22 @@ int __init cgroup_init(void)
4809 * disabled flag and cftype registration needs kmalloc, 4971 * disabled flag and cftype registration needs kmalloc,
4810 * both of which aren't available during early_init. 4972 * both of which aren't available during early_init.
4811 */ 4973 */
4812 if (!ss->disabled) { 4974 if (ss->disabled)
4813 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4975 continue;
4814 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4976
4977 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4978
4979 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
4980 ss->dfl_cftypes = ss->legacy_cftypes;
4981
4982 if (!ss->dfl_cftypes)
4983 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
4984
4985 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4986 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4987 } else {
4988 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
4989 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
4815 } 4990 }
4816 } 4991 }
4817 4992
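The init loop above decides, per controller, which cftype tables are registered where. A reduced userspace model of that decision (the struct, the helpers and the table names are stand-ins, not the real cgroup interfaces) might look like:

#include <stdbool.h>
#include <stdio.h>

struct subsys {
	const char *name;
	bool disabled;
	const char *dfl_cftypes;	/* files for the default hierarchy  */
	const char *legacy_cftypes;	/* files for the legacy hierarchies */
};

static bool legacy_files_on_dfl;	/* models the __DEVEL__ boot option   */
static unsigned int dfl_inhibit_mask;	/* controllers hidden on the dfl root */

static void init_one(struct subsys *ss, int id)
{
	if (ss->disabled)
		return;

	/* boot-time escape hatch: reuse the legacy files on the default hierarchy */
	if (legacy_files_on_dfl && !ss->dfl_cftypes)
		ss->dfl_cftypes = ss->legacy_cftypes;

	/* no default-hierarchy files at all: don't show the controller there */
	if (!ss->dfl_cftypes)
		dfl_inhibit_mask |= 1u << id;

	if (ss->dfl_cftypes == ss->legacy_cftypes)
		printf("%s: one table shared by both hierarchies\n", ss->name);
	else
		printf("%s: dfl=%s legacy=%s\n", ss->name,
		       ss->dfl_cftypes ? ss->dfl_cftypes : "(none)",
		       ss->legacy_cftypes ? ss->legacy_cftypes : "(none)");
}

int main(void)
{
	const char *debug_files = "debug_files";
	struct subsys ss[] = {
		{ "cpuset", false, NULL, "cpuset_files" },
		{ "memory", false, "memory_dfl_files", "memory_legacy_files" },
		{ "debug",  false, debug_files, debug_files },
	};

	for (int i = 0; i < 3; i++)
		init_one(&ss[i], i);
	printf("inhibited on the default hierarchy: %#x\n", dfl_inhibit_mask);
	return 0;
}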
@@ -5207,6 +5382,14 @@ static int __init cgroup_disable(char *str)
5207} 5382}
5208__setup("cgroup_disable=", cgroup_disable); 5383__setup("cgroup_disable=", cgroup_disable);
5209 5384
5385static int __init cgroup_set_legacy_files_on_dfl(char *str)
5386{
5387 printk("cgroup: using legacy files on the default hierarchy\n");
5388 cgroup_legacy_files_on_dfl = true;
5389 return 0;
5390}
5391__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5392
5210/** 5393/**
5211 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5394 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5212 * @dentry: directory dentry of interest 5395 * @dentry: directory dentry of interest
@@ -5401,6 +5584,6 @@ static struct cftype debug_files[] = {
5401struct cgroup_subsys debug_cgrp_subsys = { 5584struct cgroup_subsys debug_cgrp_subsys = {
5402 .css_alloc = debug_css_alloc, 5585 .css_alloc = debug_css_alloc,
5403 .css_free = debug_css_free, 5586 .css_free = debug_css_free,
5404 .base_cftypes = debug_files, 5587 .legacy_cftypes = debug_files,
5405}; 5588};
5406#endif /* CONFIG_CGROUP_DEBUG */ 5589#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
480 .css_free = freezer_css_free, 480 .css_free = freezer_css_free,
481 .attach = freezer_attach, 481 .attach = freezer_attach,
482 .fork = freezer_fork, 482 .fork = freezer_fork,
483 .base_cftypes = files, 483 .legacy_cftypes = files,
484}; 484};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
76 struct cgroup_subsys_state css; 76 struct cgroup_subsys_state css;
77 77
78 unsigned long flags; /* "unsigned long" so bitops work */ 78 unsigned long flags; /* "unsigned long" so bitops work */
79 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 79
80 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 80 /*
81 * On default hierarchy:
82 *
83 * The user-configured masks can only be changed by writing to
84 * cpuset.cpus and cpuset.mems, and won't be limited by the
85 * parent masks.
86 *
 87 * The effective masks are the real masks that apply to the tasks
88 * in the cpuset. They may be changed if the configured masks are
89 * changed or hotplug happens.
90 *
91 * effective_mask == configured_mask & parent's effective_mask,
92 * and if it ends up empty, it will inherit the parent's mask.
93 *
94 *
 95 * On legacy hierarchy:
96 *
 97 * The user-configured masks are always the same as the effective masks.
98 */
99
 100 /* user-configured CPUs and Memory Nodes allowed to tasks */
101 cpumask_var_t cpus_allowed;
102 nodemask_t mems_allowed;
103
 104 /* effective CPUs and Memory Nodes allowed to tasks */
105 cpumask_var_t effective_cpus;
106 nodemask_t effective_mems;
81 107
82 /* 108 /*
83 * This is old Memory Nodes tasks took on. 109 * This is old Memory Nodes tasks took on.
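The comment block above is the heart of the change: on the default hierarchy a cpuset keeps what the user wrote (cpus_allowed/mems_allowed) separate from what is actually in effect (effective_cpus/effective_mems). The rule reduces to a couple of bitwise operations; a standalone sketch with a single word standing in for a cpumask:

#include <stdio.h>

static unsigned long effective(unsigned long configured,
			       unsigned long parent_effective)
{
	unsigned long eff = configured & parent_effective;

	/* if the intersection is empty, fall back to the parent's mask */
	return eff ? eff : parent_effective;
}

int main(void)
{
	unsigned long parent_eff = 0x0f;	/* CPUs 0-3 effective      */
	unsigned long configured = 0x30;	/* user asked for CPUs 4-5 */

	printf("effective = %#lx\n", effective(configured, parent_eff));
	return 0;	/* prints 0xf: the child inherits the parent's mask */
}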
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
307 */ 333 */
308static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
309{ 335{
310 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 336 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
311 cs = parent_cs(cs); 337 cs = parent_cs(cs);
312 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 338 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
313} 339}
314 340
315/* 341/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
325 */ 351 */
326static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
327{ 353{
328 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 354 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
329 cs = parent_cs(cs); 355 cs = parent_cs(cs);
330 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); 356 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
331} 357}
332 358
333/* 359/*
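guarantee_online_cpus() and guarantee_online_mems() now walk up the effective masks instead of the configured ones. The walk itself is simple; a userspace model (struct cs and the single-word masks are simplifications, not the kernel types):

#include <stdio.h>

struct cs {
	struct cs *parent;
	unsigned long effective_cpus;
};

static unsigned long guarantee_online(const struct cs *c, unsigned long online)
{
	/* the top cpuset always intersects the online mask, so this terminates */
	while (!(c->effective_cpus & online))
		c = c->parent;
	return c->effective_cpus & online;
}

int main(void)
{
	struct cs top   = { NULL, 0xff };
	struct cs mid   = { &top, 0x0c };
	struct cs child = { &mid, 0x08 };

	/* CPU 3 went offline: the child falls back to its parent's mask */
	printf("%#lx\n", guarantee_online(&child, 0xf7));	/* -> 0x4 */
	return 0;
}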
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
376 if (!trial) 402 if (!trial)
377 return NULL; 403 return NULL;
378 404
379 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { 405 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
380 kfree(trial); 406 goto free_cs;
381 return NULL; 407 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
382 } 408 goto free_cpus;
383 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
384 409
410 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
411 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
385 return trial; 412 return trial;
413
414free_cpus:
415 free_cpumask_var(trial->cpus_allowed);
416free_cs:
417 kfree(trial);
418 return NULL;
386} 419}
387 420
388/** 421/**
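The reshuffled error handling in alloc_trial_cpuset() is the usual unwind-with-gotos pattern for paired allocations; stripped of the cpumask details it is just:

#include <stdlib.h>

struct pair {
	int *a;
	int *b;
};

static struct pair *pair_alloc(void)
{
	struct pair *p = malloc(sizeof(*p));

	if (!p)
		return NULL;
	p->a = malloc(sizeof(*p->a));
	if (!p->a)
		goto free_p;
	p->b = malloc(sizeof(*p->b));
	if (!p->b)
		goto free_a;
	return p;

free_a:
	free(p->a);
free_p:
	free(p);
	return NULL;
}

int main(void)
{
	struct pair *p = pair_alloc();

	if (p) {
		free(p->b);
		free(p->a);
		free(p);
	}
	return 0;
}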
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
391 */ 424 */
392static void free_trial_cpuset(struct cpuset *trial) 425static void free_trial_cpuset(struct cpuset *trial)
393{ 426{
427 free_cpumask_var(trial->effective_cpus);
394 free_cpumask_var(trial->cpus_allowed); 428 free_cpumask_var(trial->cpus_allowed);
395 kfree(trial); 429 kfree(trial);
396} 430}
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
436 470
437 par = parent_cs(cur); 471 par = parent_cs(cur);
438 472
 439 /* We must be a subset of our parent cpuset */ 473 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
440 ret = -EACCES; 474 ret = -EACCES;
441 if (!is_cpuset_subset(trial, par)) 475 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
442 goto out; 476 goto out;
443 477
444 /* 478 /*
@@ -480,11 +514,11 @@ out:
480#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
481/* 515/*
482 * Helper routine for generate_sched_domains(). 516 * Helper routine for generate_sched_domains().
483 * Do cpusets a, b have overlapping cpus_allowed masks? 517 * Do cpusets a, b have overlapping effective cpus_allowed masks?
484 */ 518 */
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 519static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 520{
487 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); 521 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
488} 522}
489 523
490static void 524static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
601 *dattr = SD_ATTR_INIT; 635 *dattr = SD_ATTR_INIT;
602 update_domain_attr_tree(dattr, &top_cpuset); 636 update_domain_attr_tree(dattr, &top_cpuset);
603 } 637 }
604 cpumask_copy(doms[0], top_cpuset.cpus_allowed); 638 cpumask_copy(doms[0], top_cpuset.effective_cpus);
605 639
606 goto done; 640 goto done;
607 } 641 }
@@ -705,7 +739,7 @@ restart:
705 struct cpuset *b = csa[j]; 739 struct cpuset *b = csa[j];
706 740
707 if (apn == b->pn) { 741 if (apn == b->pn) {
708 cpumask_or(dp, dp, b->cpus_allowed); 742 cpumask_or(dp, dp, b->effective_cpus);
709 if (dattr) 743 if (dattr)
710 update_domain_attr_tree(dattr + nslot, b); 744 update_domain_attr_tree(dattr + nslot, b);
711 745
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
757 * passing doms with offlined cpu to partition_sched_domains(). 791 * passing doms with offlined cpu to partition_sched_domains().
758 * Anyways, hotplug work item will rebuild sched domains. 792 * Anyways, hotplug work item will rebuild sched domains.
759 */ 793 */
760 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) 794 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
761 goto out; 795 goto out;
762 796
763 /* Generate domain masks and attrs */ 797 /* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
781 mutex_unlock(&cpuset_mutex); 815 mutex_unlock(&cpuset_mutex);
782} 816}
783 817
784/*
785 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
786 * @cs: the cpuset in interest
787 *
788 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
789 * with non-empty cpus. We use effective cpumask whenever:
790 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
791 * if the cpuset they reside in has no cpus)
792 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
793 *
794 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
795 * exception. See comments there.
796 */
797static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
798{
799 while (cpumask_empty(cs->cpus_allowed))
800 cs = parent_cs(cs);
801 return cs;
802}
803
804/*
805 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
806 * @cs: the cpuset in interest
807 *
808 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
809 * with non-empty memss. We use effective nodemask whenever:
810 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
811 * if the cpuset they reside in has no mems)
812 * - we want to retrieve task_cs(tsk)'s mems_allowed.
813 *
814 * Called with cpuset_mutex held.
815 */
816static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
817{
818 while (nodes_empty(cs->mems_allowed))
819 cs = parent_cs(cs);
820 return cs;
821}
822
823/** 818/**
824 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 819 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
825 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 820 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
830 */ 825 */
831static void update_tasks_cpumask(struct cpuset *cs) 826static void update_tasks_cpumask(struct cpuset *cs)
832{ 827{
833 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
834 struct css_task_iter it; 828 struct css_task_iter it;
835 struct task_struct *task; 829 struct task_struct *task;
836 830
837 css_task_iter_start(&cs->css, &it); 831 css_task_iter_start(&cs->css, &it);
838 while ((task = css_task_iter_next(&it))) 832 while ((task = css_task_iter_next(&it)))
839 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); 833 set_cpus_allowed_ptr(task, cs->effective_cpus);
840 css_task_iter_end(&it); 834 css_task_iter_end(&it);
841} 835}
842 836
843/* 837/*
844 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 838 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
845 * @root_cs: the root cpuset of the hierarchy 839 * @cs: the cpuset to consider
846 * @update_root: update root cpuset or not? 840 * @new_cpus: temp variable for calculating new effective_cpus
841 *
 842 * When the configured cpumask is changed, the effective cpumasks of this cpuset
843 * and all its descendants need to be updated.
847 * 844 *
 848 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 845 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
849 * which take on cpumask of @root_cs.
850 * 846 *
851 * Called with cpuset_mutex held 847 * Called with cpuset_mutex held
852 */ 848 */
853static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) 849static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
854{ 850{
855 struct cpuset *cp; 851 struct cpuset *cp;
856 struct cgroup_subsys_state *pos_css; 852 struct cgroup_subsys_state *pos_css;
853 bool need_rebuild_sched_domains = false;
857 854
858 rcu_read_lock(); 855 rcu_read_lock();
859 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 856 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
860 if (cp == root_cs) { 857 struct cpuset *parent = parent_cs(cp);
861 if (!update_root) 858
862 continue; 859 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
863 } else { 860
864 /* skip the whole subtree if @cp have some CPU */ 861 /*
865 if (!cpumask_empty(cp->cpus_allowed)) { 862 * If it becomes empty, inherit the effective mask of the
866 pos_css = css_rightmost_descendant(pos_css); 863 * parent, which is guaranteed to have some CPUs.
867 continue; 864 */
868 } 865 if (cpumask_empty(new_cpus))
866 cpumask_copy(new_cpus, parent->effective_cpus);
867
868 /* Skip the whole subtree if the cpumask remains the same. */
869 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
870 pos_css = css_rightmost_descendant(pos_css);
871 continue;
869 } 872 }
873
870 if (!css_tryget_online(&cp->css)) 874 if (!css_tryget_online(&cp->css))
871 continue; 875 continue;
872 rcu_read_unlock(); 876 rcu_read_unlock();
873 877
878 mutex_lock(&callback_mutex);
879 cpumask_copy(cp->effective_cpus, new_cpus);
880 mutex_unlock(&callback_mutex);
881
882 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
883 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
884
874 update_tasks_cpumask(cp); 885 update_tasks_cpumask(cp);
875 886
887 /*
888 * If the effective cpumask of any non-empty cpuset is changed,
889 * we need to rebuild sched domains.
890 */
891 if (!cpumask_empty(cp->cpus_allowed) &&
892 is_sched_load_balance(cp))
893 need_rebuild_sched_domains = true;
894
876 rcu_read_lock(); 895 rcu_read_lock();
877 css_put(&cp->css); 896 css_put(&cp->css);
878 } 897 }
879 rcu_read_unlock(); 898 rcu_read_unlock();
899
900 if (need_rebuild_sched_domains)
901 rebuild_sched_domains_locked();
880} 902}
881 903
882/** 904/**
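update_cpumasks_hier() (and its nodemask twin further down) replaces the old "find the nearest non-empty ancestor at use time" helpers with an eager top-down recomputation. A recursive userspace model of the walk, with single-word masks and an early return standing in for css_rightmost_descendant():

#include <stdio.h>

#define MAX_CHILDREN 4

struct cs {
	const char *name;
	unsigned long configured;
	unsigned long effective;
	struct cs *child[MAX_CHILDREN];
};

static void propagate(struct cs *c, unsigned long parent_effective)
{
	unsigned long eff = c->configured & parent_effective;

	if (!eff)
		eff = parent_effective;		/* empty: inherit the parent */

	if (eff == c->effective)
		return;				/* nothing below can change either */

	c->effective = eff;
	printf("%s -> %#lx\n", c->name, c->effective);

	for (int i = 0; i < MAX_CHILDREN && c->child[i]; i++)
		propagate(c->child[i], c->effective);
}

int main(void)
{
	struct cs b = { "B", 0x0c, 0x0c, { NULL } };
	struct cs a = { "A", 0x0f, 0x0f, { &b, NULL } };

	/* the parent's effective mask shrinks to CPUs 0-1 */
	propagate(&a, 0x03);
	return 0;
}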
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
889 const char *buf) 911 const char *buf)
890{ 912{
891 int retval; 913 int retval;
892 int is_load_balanced;
893 914
894 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 915 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
895 if (cs == &top_cpuset) 916 if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
908 if (retval < 0) 929 if (retval < 0)
909 return retval; 930 return retval;
910 931
911 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 932 if (!cpumask_subset(trialcs->cpus_allowed,
933 top_cpuset.cpus_allowed))
912 return -EINVAL; 934 return -EINVAL;
913 } 935 }
914 936
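Note the validation change just above: the requested cpumask is now checked against top_cpuset.cpus_allowed rather than cpu_active_mask, so CPUs that are currently offline may be configured and simply take effect when they come back. The same relaxation is applied to mems further down. Reduced to bitmask arithmetic:

#include <stdbool.h>
#include <stdio.h>

static bool request_ok(unsigned long requested, unsigned long top_allowed)
{
	return (requested & ~top_allowed) == 0;	/* must be a subset */
}

int main(void)
{
	unsigned long top_allowed = 0xff;	/* CPUs 0-7 exist          */
	unsigned long online      = 0x0f;	/* but only 0-3 are online */
	unsigned long req         = 0xf0;	/* ask for the offline CPUs */

	/* accepted now; it takes effect when the CPUs come back online */
	printf("ok=%d (online mask %#lx not consulted)\n",
	       request_ok(req, top_allowed), online);
	return 0;
}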
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 if (retval < 0) 942 if (retval < 0)
921 return retval; 943 return retval;
922 944
923 is_load_balanced = is_sched_load_balance(trialcs);
924
925 mutex_lock(&callback_mutex); 945 mutex_lock(&callback_mutex);
926 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 946 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
927 mutex_unlock(&callback_mutex); 947 mutex_unlock(&callback_mutex);
928 948
929 update_tasks_cpumask_hier(cs, true); 949 /* use trialcs->cpus_allowed as a temp variable */
930 950 update_cpumasks_hier(cs, trialcs->cpus_allowed);
931 if (is_load_balanced)
932 rebuild_sched_domains_locked();
933 return 0; 951 return 0;
934} 952}
935 953
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
951 const nodemask_t *to) 969 const nodemask_t *to)
952{ 970{
953 struct task_struct *tsk = current; 971 struct task_struct *tsk = current;
954 struct cpuset *mems_cs;
955 972
956 tsk->mems_allowed = *to; 973 tsk->mems_allowed = *to;
957 974
958 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 975 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
959 976
960 rcu_read_lock(); 977 rcu_read_lock();
961 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 978 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
962 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
963 rcu_read_unlock(); 979 rcu_read_unlock();
964} 980}
965 981
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
1028static void update_tasks_nodemask(struct cpuset *cs) 1044static void update_tasks_nodemask(struct cpuset *cs)
1029{ 1045{
1030 static nodemask_t newmems; /* protected by cpuset_mutex */ 1046 static nodemask_t newmems; /* protected by cpuset_mutex */
1031 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1032 struct css_task_iter it; 1047 struct css_task_iter it;
1033 struct task_struct *task; 1048 struct task_struct *task;
1034 1049
1035 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1036 1051
1037 guarantee_online_mems(mems_cs, &newmems); 1052 guarantee_online_mems(cs, &newmems);
1038 1053
1039 /* 1054 /*
1040 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1055 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
1077} 1092}
1078 1093
1079/* 1094/*
1080 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1095 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1081 * @cs: the root cpuset of the hierarchy 1096 * @cs: the cpuset to consider
1082 * @update_root: update the root cpuset or not? 1097 * @new_mems: a temp variable for calculating new effective_mems
1083 * 1098 *
 1084 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1099 * When the configured nodemask is changed, the effective nodemasks of this cpuset
1085 * which take on nodemask of @root_cs. 1100 * and all its descendants need to be updated.
1101 *
 1102 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
1086 * 1103 *
1087 * Called with cpuset_mutex held 1104 * Called with cpuset_mutex held
1088 */ 1105 */
1089static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) 1106static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1090{ 1107{
1091 struct cpuset *cp; 1108 struct cpuset *cp;
1092 struct cgroup_subsys_state *pos_css; 1109 struct cgroup_subsys_state *pos_css;
1093 1110
1094 rcu_read_lock(); 1111 rcu_read_lock();
1095 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 1112 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1096 if (cp == root_cs) { 1113 struct cpuset *parent = parent_cs(cp);
1097 if (!update_root) 1114
1098 continue; 1115 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1099 } else { 1116
1100 /* skip the whole subtree if @cp have some CPU */ 1117 /*
1101 if (!nodes_empty(cp->mems_allowed)) { 1118 * If it becomes empty, inherit the effective mask of the
1102 pos_css = css_rightmost_descendant(pos_css); 1119 * parent, which is guaranteed to have some MEMs.
1103 continue; 1120 */
1104 } 1121 if (nodes_empty(*new_mems))
1122 *new_mems = parent->effective_mems;
1123
1124 /* Skip the whole subtree if the nodemask remains the same. */
1125 if (nodes_equal(*new_mems, cp->effective_mems)) {
1126 pos_css = css_rightmost_descendant(pos_css);
1127 continue;
1105 } 1128 }
1129
1106 if (!css_tryget_online(&cp->css)) 1130 if (!css_tryget_online(&cp->css))
1107 continue; 1131 continue;
1108 rcu_read_unlock(); 1132 rcu_read_unlock();
1109 1133
1134 mutex_lock(&callback_mutex);
1135 cp->effective_mems = *new_mems;
1136 mutex_unlock(&callback_mutex);
1137
1138 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1139 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1140
1110 update_tasks_nodemask(cp); 1141 update_tasks_nodemask(cp);
1111 1142
1112 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1156 goto done; 1187 goto done;
1157 1188
1158 if (!nodes_subset(trialcs->mems_allowed, 1189 if (!nodes_subset(trialcs->mems_allowed,
1159 node_states[N_MEMORY])) { 1190 top_cpuset.mems_allowed)) {
1160 retval = -EINVAL; 1191 retval = -EINVAL;
1161 goto done; 1192 goto done;
1162 } 1193 }
1163 } 1194 }
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1174 cs->mems_allowed = trialcs->mems_allowed; 1205 cs->mems_allowed = trialcs->mems_allowed;
1175 mutex_unlock(&callback_mutex); 1206 mutex_unlock(&callback_mutex);
1176 1207
1177 update_tasks_nodemask_hier(cs, true); 1208 /* use trialcs->mems_allowed as a temp variable */
1209 update_nodemasks_hier(cs, &cs->mems_allowed);
1178done: 1210done:
1179 return retval; 1211 return retval;
1180} 1212}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1389 1421
1390 mutex_lock(&cpuset_mutex); 1422 mutex_lock(&cpuset_mutex);
1391 1423
1392 /* 1424 /* allow moving tasks into an empty cpuset if on default hierarchy */
1393 * We allow to move tasks into an empty cpuset if sane_behavior
1394 * flag is set.
1395 */
1396 ret = -ENOSPC; 1425 ret = -ENOSPC;
1397 if (!cgroup_sane_behavior(css->cgroup) && 1426 if (!cgroup_on_dfl(css->cgroup) &&
1398 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1427 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1399 goto out_unlock; 1428 goto out_unlock;
1400 1429
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1452 struct task_struct *leader = cgroup_taskset_first(tset); 1481 struct task_struct *leader = cgroup_taskset_first(tset);
1453 struct cpuset *cs = css_cs(css); 1482 struct cpuset *cs = css_cs(css);
1454 struct cpuset *oldcs = cpuset_attach_old_cs; 1483 struct cpuset *oldcs = cpuset_attach_old_cs;
1455 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1456 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1457 1484
1458 mutex_lock(&cpuset_mutex); 1485 mutex_lock(&cpuset_mutex);
1459 1486
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1461 if (cs == &top_cpuset) 1488 if (cs == &top_cpuset)
1462 cpumask_copy(cpus_attach, cpu_possible_mask); 1489 cpumask_copy(cpus_attach, cpu_possible_mask);
1463 else 1490 else
1464 guarantee_online_cpus(cpus_cs, cpus_attach); 1491 guarantee_online_cpus(cs, cpus_attach);
1465 1492
1466 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1493 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1467 1494
1468 cgroup_taskset_for_each(task, tset) { 1495 cgroup_taskset_for_each(task, tset) {
1469 /* 1496 /*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1480 * Change mm, possibly for multiple threads in a threadgroup. This is 1507 * Change mm, possibly for multiple threads in a threadgroup. This is
1481 * expensive and may sleep. 1508 * expensive and may sleep.
1482 */ 1509 */
1483 cpuset_attach_nodemask_to = cs->mems_allowed; 1510 cpuset_attach_nodemask_to = cs->effective_mems;
1484 mm = get_task_mm(leader); 1511 mm = get_task_mm(leader);
1485 if (mm) { 1512 if (mm) {
1486 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1487
1488 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1513 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1489 1514
1490 /* 1515 /*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1495 * mm from. 1520 * mm from.
1496 */ 1521 */
1497 if (is_memory_migrate(cs)) { 1522 if (is_memory_migrate(cs)) {
1498 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, 1523 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1499 &cpuset_attach_nodemask_to); 1524 &cpuset_attach_nodemask_to);
1500 } 1525 }
1501 mmput(mm); 1526 mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
1516 FILE_MEMORY_MIGRATE, 1541 FILE_MEMORY_MIGRATE,
1517 FILE_CPULIST, 1542 FILE_CPULIST,
1518 FILE_MEMLIST, 1543 FILE_MEMLIST,
1544 FILE_EFFECTIVE_CPULIST,
1545 FILE_EFFECTIVE_MEMLIST,
1519 FILE_CPU_EXCLUSIVE, 1546 FILE_CPU_EXCLUSIVE,
1520 FILE_MEM_EXCLUSIVE, 1547 FILE_MEM_EXCLUSIVE,
1521 FILE_MEM_HARDWALL, 1548 FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1694 case FILE_MEMLIST: 1721 case FILE_MEMLIST:
1695 s += nodelist_scnprintf(s, count, cs->mems_allowed); 1722 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1696 break; 1723 break;
1724 case FILE_EFFECTIVE_CPULIST:
1725 s += cpulist_scnprintf(s, count, cs->effective_cpus);
1726 break;
1727 case FILE_EFFECTIVE_MEMLIST:
1728 s += nodelist_scnprintf(s, count, cs->effective_mems);
1729 break;
1697 default: 1730 default:
1698 ret = -EINVAL; 1731 ret = -EINVAL;
1699 goto out_unlock; 1732 goto out_unlock;
@@ -1779,6 +1812,18 @@ static struct cftype files[] = {
1779 }, 1812 },
1780 1813
1781 { 1814 {
1815 .name = "effective_cpus",
1816 .seq_show = cpuset_common_seq_show,
1817 .private = FILE_EFFECTIVE_CPULIST,
1818 },
1819
1820 {
1821 .name = "effective_mems",
1822 .seq_show = cpuset_common_seq_show,
1823 .private = FILE_EFFECTIVE_MEMLIST,
1824 },
1825
1826 {
1782 .name = "cpu_exclusive", 1827 .name = "cpu_exclusive",
1783 .read_u64 = cpuset_read_u64, 1828 .read_u64 = cpuset_read_u64,
1784 .write_u64 = cpuset_write_u64, 1829 .write_u64 = cpuset_write_u64,
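The two new entries are read-only views of the effective masks. From userspace they read like any other cgroup file; a small C sketch (the mount point and group name are assumptions, and on the legacy hierarchy the files carry the usual "cpuset." prefix):

#include <stdio.h>

static void show(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	show("/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus");
	show("/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_mems");
	return 0;
}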
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1869 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1914 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1870 if (!cs) 1915 if (!cs)
1871 return ERR_PTR(-ENOMEM); 1916 return ERR_PTR(-ENOMEM);
1872 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1917 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1873 kfree(cs); 1918 goto free_cs;
1874 return ERR_PTR(-ENOMEM); 1919 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1875 } 1920 goto free_cpus;
1876 1921
1877 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1922 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1878 cpumask_clear(cs->cpus_allowed); 1923 cpumask_clear(cs->cpus_allowed);
1879 nodes_clear(cs->mems_allowed); 1924 nodes_clear(cs->mems_allowed);
1925 cpumask_clear(cs->effective_cpus);
1926 nodes_clear(cs->effective_mems);
1880 fmeter_init(&cs->fmeter); 1927 fmeter_init(&cs->fmeter);
1881 cs->relax_domain_level = -1; 1928 cs->relax_domain_level = -1;
1882 1929
1883 return &cs->css; 1930 return &cs->css;
1931
1932free_cpus:
1933 free_cpumask_var(cs->cpus_allowed);
1934free_cs:
1935 kfree(cs);
1936 return ERR_PTR(-ENOMEM);
1884} 1937}
1885 1938
1886static int cpuset_css_online(struct cgroup_subsys_state *css) 1939static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1903 1956
1904 cpuset_inc(); 1957 cpuset_inc();
1905 1958
1959 mutex_lock(&callback_mutex);
1960 if (cgroup_on_dfl(cs->css.cgroup)) {
1961 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1962 cs->effective_mems = parent->effective_mems;
1963 }
1964 mutex_unlock(&callback_mutex);
1965
1906 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1966 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1907 goto out_unlock; 1967 goto out_unlock;
1908 1968
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
1962{ 2022{
1963 struct cpuset *cs = css_cs(css); 2023 struct cpuset *cs = css_cs(css);
1964 2024
2025 free_cpumask_var(cs->effective_cpus);
1965 free_cpumask_var(cs->cpus_allowed); 2026 free_cpumask_var(cs->cpus_allowed);
1966 kfree(cs); 2027 kfree(cs);
1967} 2028}
1968 2029
2030static void cpuset_bind(struct cgroup_subsys_state *root_css)
2031{
2032 mutex_lock(&cpuset_mutex);
2033 mutex_lock(&callback_mutex);
2034
2035 if (cgroup_on_dfl(root_css->cgroup)) {
2036 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2037 top_cpuset.mems_allowed = node_possible_map;
2038 } else {
2039 cpumask_copy(top_cpuset.cpus_allowed,
2040 top_cpuset.effective_cpus);
2041 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2042 }
2043
2044 mutex_unlock(&callback_mutex);
2045 mutex_unlock(&cpuset_mutex);
2046}
2047
1969struct cgroup_subsys cpuset_cgrp_subsys = { 2048struct cgroup_subsys cpuset_cgrp_subsys = {
1970 .css_alloc = cpuset_css_alloc, 2049 .css_alloc = cpuset_css_alloc,
1971 .css_online = cpuset_css_online, 2050 .css_online = cpuset_css_online,
1972 .css_offline = cpuset_css_offline, 2051 .css_offline = cpuset_css_offline,
1973 .css_free = cpuset_css_free, 2052 .css_free = cpuset_css_free,
1974 .can_attach = cpuset_can_attach, 2053 .can_attach = cpuset_can_attach,
1975 .cancel_attach = cpuset_cancel_attach, 2054 .cancel_attach = cpuset_cancel_attach,
1976 .attach = cpuset_attach, 2055 .attach = cpuset_attach,
1977 .base_cftypes = files, 2056 .bind = cpuset_bind,
1978 .early_init = 1, 2057 .legacy_cftypes = files,
2058 .early_init = 1,
1979}; 2059};
1980 2060
1981/** 2061/**
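The new ->bind() callback keeps top_cpuset's configured masks consistent with whichever hierarchy type cpuset is being (re)bound to. A toy model of the branch, with a single word standing in for the masks:

#include <stdbool.h>
#include <stdio.h>

struct top {
	unsigned long cpus_allowed;
	unsigned long effective_cpus;
};

static void bind_top(struct top *t, unsigned long possible, bool on_dfl)
{
	if (on_dfl)
		t->cpus_allowed = possible;		/* configured = everything   */
	else
		t->cpus_allowed = t->effective_cpus;	/* configured tracks online  */
}

int main(void)
{
	struct top t = { 0x0f, 0x07 };			/* CPU 3 currently offline   */

	bind_top(&t, 0xff, true);
	printf("dfl bind:    allowed=%#lx\n", t.cpus_allowed);
	bind_top(&t, 0xff, false);
	printf("legacy bind: allowed=%#lx\n", t.cpus_allowed);
	return 0;
}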
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)
1990 2070
1991 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 2071 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1992 BUG(); 2072 BUG();
2073 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2074 BUG();
1993 2075
1994 cpumask_setall(top_cpuset.cpus_allowed); 2076 cpumask_setall(top_cpuset.cpus_allowed);
1995 nodes_setall(top_cpuset.mems_allowed); 2077 nodes_setall(top_cpuset.mems_allowed);
2078 cpumask_setall(top_cpuset.effective_cpus);
2079 nodes_setall(top_cpuset.effective_mems);
1996 2080
1997 fmeter_init(&top_cpuset.fmeter); 2081 fmeter_init(&top_cpuset.fmeter);
1998 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 2082 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2035 } 2119 }
2036} 2120}
2037 2121
2122static void
2123hotplug_update_tasks_legacy(struct cpuset *cs,
2124 struct cpumask *new_cpus, nodemask_t *new_mems,
2125 bool cpus_updated, bool mems_updated)
2126{
2127 bool is_empty;
2128
2129 mutex_lock(&callback_mutex);
2130 cpumask_copy(cs->cpus_allowed, new_cpus);
2131 cpumask_copy(cs->effective_cpus, new_cpus);
2132 cs->mems_allowed = *new_mems;
2133 cs->effective_mems = *new_mems;
2134 mutex_unlock(&callback_mutex);
2135
2136 /*
2137 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
 2138 * as the tasks will be migrated to an ancestor.
2139 */
2140 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2141 update_tasks_cpumask(cs);
2142 if (mems_updated && !nodes_empty(cs->mems_allowed))
2143 update_tasks_nodemask(cs);
2144
2145 is_empty = cpumask_empty(cs->cpus_allowed) ||
2146 nodes_empty(cs->mems_allowed);
2147
2148 mutex_unlock(&cpuset_mutex);
2149
2150 /*
 2151 * Move tasks to the nearest ancestor with execution resources.
 2152 * This is a full cgroup operation which will also call back into
2153 * cpuset. Should be done outside any lock.
2154 */
2155 if (is_empty)
2156 remove_tasks_in_empty_cpuset(cs);
2157
2158 mutex_lock(&cpuset_mutex);
2159}
2160
2161static void
2162hotplug_update_tasks(struct cpuset *cs,
2163 struct cpumask *new_cpus, nodemask_t *new_mems,
2164 bool cpus_updated, bool mems_updated)
2165{
2166 if (cpumask_empty(new_cpus))
2167 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2168 if (nodes_empty(*new_mems))
2169 *new_mems = parent_cs(cs)->effective_mems;
2170
2171 mutex_lock(&callback_mutex);
2172 cpumask_copy(cs->effective_cpus, new_cpus);
2173 cs->effective_mems = *new_mems;
2174 mutex_unlock(&callback_mutex);
2175
2176 if (cpus_updated)
2177 update_tasks_cpumask(cs);
2178 if (mems_updated)
2179 update_tasks_nodemask(cs);
2180}
2181
2038/** 2182/**
2039 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 2183 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2040 * @cs: cpuset in interest 2184 * @cs: cpuset in interest
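The two helpers above split the hotplug handling: on the default hierarchy only the effective masks follow the hardware, while the legacy path keeps rewriting the configured masks and may leave the cpuset empty (at which point its tasks are moved up). A compressed userspace model of the difference, using single-word masks:

#include <stdbool.h>
#include <stdio.h>

struct cs {
	unsigned long cpus_allowed;	/* configured */
	unsigned long effective_cpus;	/* in effect  */
};

static void hotplug_update(struct cs *c, unsigned long parent_effective,
			   bool on_dfl)
{
	unsigned long new_cpus = c->cpus_allowed & parent_effective;

	if (on_dfl) {
		if (!new_cpus)
			new_cpus = parent_effective;	/* never leave it empty */
		c->effective_cpus = new_cpus;
	} else {
		c->cpus_allowed = new_cpus;		/* legacy: config follows hotplug */
		c->effective_cpus = new_cpus;
		if (!new_cpus)
			printf("legacy: cpuset is now empty, tasks would be moved up\n");
	}
}

int main(void)
{
	struct cs a = { 0x0c, 0x0c };
	struct cs b = { 0x0c, 0x0c };

	hotplug_update(&a, 0x03, true);		/* CPUs 2-3 went away, dfl    */
	hotplug_update(&b, 0x03, false);	/* same event, legacy         */
	printf("dfl:    allowed=%#lx effective=%#lx\n", a.cpus_allowed, a.effective_cpus);
	printf("legacy: allowed=%#lx effective=%#lx\n", b.cpus_allowed, b.effective_cpus);
	return 0;
}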
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2045 */ 2189 */
2046static void cpuset_hotplug_update_tasks(struct cpuset *cs) 2190static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2047{ 2191{
2048 static cpumask_t off_cpus; 2192 static cpumask_t new_cpus;
2049 static nodemask_t off_mems; 2193 static nodemask_t new_mems;
2050 bool is_empty; 2194 bool cpus_updated;
2051 bool sane = cgroup_sane_behavior(cs->css.cgroup); 2195 bool mems_updated;
2052
2053retry: 2196retry:
2054 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 2197 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2055 2198
@@ -2064,51 +2207,20 @@ retry:
2064 goto retry; 2207 goto retry;
2065 } 2208 }
2066 2209
2067 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2210 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2068 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2211 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2069
2070 mutex_lock(&callback_mutex);
2071 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2072 mutex_unlock(&callback_mutex);
2073
2074 /*
2075 * If sane_behavior flag is set, we need to update tasks' cpumask
2076 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2077 * call update_tasks_cpumask() if the cpuset becomes empty, as
2078 * the tasks in it will be migrated to an ancestor.
2079 */
2080 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2081 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2082 update_tasks_cpumask(cs);
2083 2212
2084 mutex_lock(&callback_mutex); 2213 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2085 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2214 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2086 mutex_unlock(&callback_mutex);
2087
2088 /*
2089 * If sane_behavior flag is set, we need to update tasks' nodemask
2090 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2091 * call update_tasks_nodemask() if the cpuset becomes empty, as
2092 * the tasks in it will be migratd to an ancestor.
2093 */
2094 if ((sane && nodes_empty(cs->mems_allowed)) ||
2095 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2096 update_tasks_nodemask(cs);
2097 2215
2098 is_empty = cpumask_empty(cs->cpus_allowed) || 2216 if (cgroup_on_dfl(cs->css.cgroup))
2099 nodes_empty(cs->mems_allowed); 2217 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2218 cpus_updated, mems_updated);
2219 else
2220 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2221 cpus_updated, mems_updated);
2100 2222
2101 mutex_unlock(&cpuset_mutex); 2223 mutex_unlock(&cpuset_mutex);
2102
2103 /*
2104 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2105 *
2106 * Otherwise move tasks to the nearest ancestor with execution
2107 * resources. This is full cgroup operation which will
2108 * also call back into cpuset. Should be done outside any lock.
2109 */
2110 if (!sane && is_empty)
2111 remove_tasks_in_empty_cpuset(cs);
2112} 2224}
2113 2225
2114/** 2226/**
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 static cpumask_t new_cpus; 2244 static cpumask_t new_cpus;
2133 static nodemask_t new_mems; 2245 static nodemask_t new_mems;
2134 bool cpus_updated, mems_updated; 2246 bool cpus_updated, mems_updated;
2247 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
2135 2248
2136 mutex_lock(&cpuset_mutex); 2249 mutex_lock(&cpuset_mutex);
2137 2250
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2139 cpumask_copy(&new_cpus, cpu_active_mask); 2252 cpumask_copy(&new_cpus, cpu_active_mask);
2140 new_mems = node_states[N_MEMORY]; 2253 new_mems = node_states[N_MEMORY];
2141 2254
2142 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2255 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2143 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2256 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2144 2257
2145 /* synchronize cpus_allowed to cpu_active_mask */ 2258 /* synchronize cpus_allowed to cpu_active_mask */
2146 if (cpus_updated) { 2259 if (cpus_updated) {
2147 mutex_lock(&callback_mutex); 2260 mutex_lock(&callback_mutex);
2148 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2261 if (!on_dfl)
2262 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2263 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2149 mutex_unlock(&callback_mutex); 2264 mutex_unlock(&callback_mutex);
2150 /* we don't mess with cpumasks of tasks in top_cpuset */ 2265 /* we don't mess with cpumasks of tasks in top_cpuset */
2151 } 2266 }
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2153 /* synchronize mems_allowed to N_MEMORY */ 2268 /* synchronize mems_allowed to N_MEMORY */
2154 if (mems_updated) { 2269 if (mems_updated) {
2155 mutex_lock(&callback_mutex); 2270 mutex_lock(&callback_mutex);
2156 top_cpuset.mems_allowed = new_mems; 2271 if (!on_dfl)
2272 top_cpuset.mems_allowed = new_mems;
2273 top_cpuset.effective_mems = new_mems;
2157 mutex_unlock(&callback_mutex); 2274 mutex_unlock(&callback_mutex);
2158 update_tasks_nodemask(&top_cpuset); 2275 update_tasks_nodemask(&top_cpuset);
2159 } 2276 }
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
2228 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2345 top_cpuset.mems_allowed = node_states[N_MEMORY];
2229 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 2346 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2230 2347
2348 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2349 top_cpuset.effective_mems = node_states[N_MEMORY];
2350
2231 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2351 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2232} 2352}
2233 2353
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)
2244 2364
2245void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2365void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2246{ 2366{
2247 struct cpuset *cpus_cs;
2248
2249 mutex_lock(&callback_mutex); 2367 mutex_lock(&callback_mutex);
2250 rcu_read_lock(); 2368 rcu_read_lock();
2251 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2369 guarantee_online_cpus(task_cs(tsk), pmask);
2252 guarantee_online_cpus(cpus_cs, pmask);
2253 rcu_read_unlock(); 2370 rcu_read_unlock();
2254 mutex_unlock(&callback_mutex); 2371 mutex_unlock(&callback_mutex);
2255} 2372}
2256 2373
2257void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2374void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2258{ 2375{
2259 struct cpuset *cpus_cs;
2260
2261 rcu_read_lock(); 2376 rcu_read_lock();
2262 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2377 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2263 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2264 rcu_read_unlock(); 2378 rcu_read_unlock();
2265 2379
2266 /* 2380 /*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)
2299 2413
2300nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2414nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2301{ 2415{
2302 struct cpuset *mems_cs;
2303 nodemask_t mask; 2416 nodemask_t mask;
2304 2417
2305 mutex_lock(&callback_mutex); 2418 mutex_lock(&callback_mutex);
2306 rcu_read_lock(); 2419 rcu_read_lock();
2307 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2420 guarantee_online_mems(task_cs(tsk), &mask);
2308 guarantee_online_mems(mems_cs, &mask);
2309 rcu_read_unlock(); 2421 rcu_read_unlock();
2310 mutex_unlock(&callback_mutex); 2422 mutex_unlock(&callback_mutex);
2311 2423
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc1638b33449..126f7e3f04e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8083,7 +8083,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8083 .can_attach = cpu_cgroup_can_attach, 8083 .can_attach = cpu_cgroup_can_attach,
8084 .attach = cpu_cgroup_attach, 8084 .attach = cpu_cgroup_attach,
8085 .exit = cpu_cgroup_exit, 8085 .exit = cpu_cgroup_exit,
8086 .base_cftypes = cpu_files, 8086 .legacy_cftypes = cpu_files,
8087 .early_init = 1, 8087 .early_init = 1,
8088}; 8088};
8089 8089
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
278struct cgroup_subsys cpuacct_cgrp_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
280 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
281 .base_cftypes = files, 281 .legacy_cftypes = files,
282 .early_init = 1, 282 .early_init = 1,
283}; 283};
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 493f758445e7..9aae6f47433f 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -358,9 +358,8 @@ static void __init __hugetlb_cgroup_file_init(int idx)
358 cft = &h->cgroup_files[4]; 358 cft = &h->cgroup_files[4];
359 memset(cft, 0, sizeof(*cft)); 359 memset(cft, 0, sizeof(*cft));
360 360
361 WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files)); 361 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
362 362 h->cgroup_files));
363 return;
364} 363}
365 364
366void __init hugetlb_cgroup_file_init(void) 365void __init hugetlb_cgroup_file_init(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1f14a430c656..f009a14918d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6007,7 +6007,6 @@ static struct cftype mem_cgroup_files[] = {
6007 }, 6007 },
6008 { 6008 {
6009 .name = "use_hierarchy", 6009 .name = "use_hierarchy",
6010 .flags = CFTYPE_INSANE,
6011 .write_u64 = mem_cgroup_hierarchy_write, 6010 .write_u64 = mem_cgroup_hierarchy_write,
6012 .read_u64 = mem_cgroup_hierarchy_read, 6011 .read_u64 = mem_cgroup_hierarchy_read,
6013 }, 6012 },
@@ -6411,6 +6410,29 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6411 __mem_cgroup_free(memcg); 6410 __mem_cgroup_free(memcg);
6412} 6411}
6413 6412
6413/**
6414 * mem_cgroup_css_reset - reset the states of a mem_cgroup
6415 * @css: the target css
6416 *
6417 * Reset the states of the mem_cgroup associated with @css. This is
6418 * invoked when the userland requests disabling on the default hierarchy
6419 * but the memcg is pinned through dependency. The memcg should stop
6420 * applying policies and should revert to the vanilla state as it may be
6421 * made visible again.
6422 *
6423 * The current implementation only resets the essential configurations.
6424 * This needs to be expanded to cover all the visible parts.
6425 */
6426static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
6427{
6428 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6429
6430 mem_cgroup_resize_limit(memcg, ULLONG_MAX);
6431 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
6432 memcg_update_kmem_limit(memcg, ULLONG_MAX);
6433 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
6434}
6435
6414#ifdef CONFIG_MMU 6436#ifdef CONFIG_MMU
6415/* Handlers for move charge at task migration. */ 6437/* Handlers for move charge at task migration. */
6416#define PRECHARGE_COUNT_AT_ONCE 256 6438#define PRECHARGE_COUNT_AT_ONCE 256
@@ -7005,16 +7027,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7005 7027
7006/* 7028/*
7007 * Cgroup retains root cgroups across [un]mount cycles making it necessary 7029 * Cgroup retains root cgroups across [un]mount cycles making it necessary
7008 * to verify sane_behavior flag on each mount attempt. 7030 * to verify whether we're attached to the default hierarchy on each mount
7031 * attempt.
7009 */ 7032 */
7010static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 7033static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7011{ 7034{
7012 /* 7035 /*
7013 * use_hierarchy is forced with sane_behavior. cgroup core 7036 * use_hierarchy is forced on the default hierarchy. cgroup core
7014 * guarantees that @root doesn't have any children, so turning it 7037 * guarantees that @root doesn't have any children, so turning it
7015 * on for the root memcg is enough. 7038 * on for the root memcg is enough.
7016 */ 7039 */
7017 if (cgroup_sane_behavior(root_css->cgroup)) 7040 if (cgroup_on_dfl(root_css->cgroup))
7018 mem_cgroup_from_css(root_css)->use_hierarchy = true; 7041 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7019} 7042}
7020 7043
@@ -7023,11 +7046,12 @@ struct cgroup_subsys memory_cgrp_subsys = {
7023 .css_online = mem_cgroup_css_online, 7046 .css_online = mem_cgroup_css_online,
7024 .css_offline = mem_cgroup_css_offline, 7047 .css_offline = mem_cgroup_css_offline,
7025 .css_free = mem_cgroup_css_free, 7048 .css_free = mem_cgroup_css_free,
7049 .css_reset = mem_cgroup_css_reset,
7026 .can_attach = mem_cgroup_can_attach, 7050 .can_attach = mem_cgroup_can_attach,
7027 .cancel_attach = mem_cgroup_cancel_attach, 7051 .cancel_attach = mem_cgroup_cancel_attach,
7028 .attach = mem_cgroup_move_task, 7052 .attach = mem_cgroup_move_task,
7029 .bind = mem_cgroup_bind, 7053 .bind = mem_cgroup_bind,
7030 .base_cftypes = mem_cgroup_files, 7054 .legacy_cftypes = mem_cgroup_files,
7031 .early_init = 0, 7055 .early_init = 0,
7032}; 7056};
7033 7057
@@ -7044,7 +7068,8 @@ __setup("swapaccount=", enable_swap_account);
7044 7068
7045static void __init memsw_file_init(void) 7069static void __init memsw_file_init(void)
7046{ 7070{
7047 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); 7071 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
7072 memsw_cgroup_files));
7048} 7073}
7049 7074
7050static void __init enable_swap_cgroup(void) 7075static void __init enable_swap_cgroup(void)
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 30d903b19c62..1f2a126f4ffa 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -107,5 +107,5 @@ struct cgroup_subsys net_cls_cgrp_subsys = {
107 .css_online = cgrp_css_online, 107 .css_online = cgrp_css_online,
108 .css_free = cgrp_css_free, 108 .css_free = cgrp_css_free,
109 .attach = cgrp_attach, 109 .attach = cgrp_attach,
110 .base_cftypes = ss_files, 110 .legacy_cftypes = ss_files,
111}; 111};
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2f385b9bccc0..cbd0a199bf52 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -249,7 +249,7 @@ struct cgroup_subsys net_prio_cgrp_subsys = {
249 .css_online = cgrp_css_online, 249 .css_online = cgrp_css_online,
250 .css_free = cgrp_css_free, 250 .css_free = cgrp_css_free,
251 .attach = net_prio_attach, 251 .attach = net_prio_attach,
252 .base_cftypes = ss_files, 252 .legacy_cftypes = ss_files,
253}; 253};
254 254
255static int netprio_device_event(struct notifier_block *unused, 255static int netprio_device_event(struct notifier_block *unused,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7a2ec3ac584..3af522622fad 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -222,7 +222,7 @@ static struct cftype tcp_files[] = {
222 222
223static int __init tcp_memcontrol_init(void) 223static int __init tcp_memcontrol_init(void)
224{ 224{
225 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); 225 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
226 return 0; 226 return 0;
227} 227}
228__initcall(tcp_memcontrol_init); 228__initcall(tcp_memcontrol_init);
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d9d69e6930ed..188c1d26393b 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -796,7 +796,7 @@ struct cgroup_subsys devices_cgrp_subsys = {
796 .css_free = devcgroup_css_free, 796 .css_free = devcgroup_css_free,
797 .css_online = devcgroup_online, 797 .css_online = devcgroup_online,
798 .css_offline = devcgroup_offline, 798 .css_offline = devcgroup_offline,
799 .base_cftypes = dev_cgroup_files, 799 .legacy_cftypes = dev_cgroup_files,
800}; 800};
801 801
802/** 802/**